# Example notebook

This notebook loads the bio.tools tools, keeps one entry per unique tool name (ignoring case), and retrieves the Europe PMC articles that mention each tool.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from bh24_literature_mining.utils import load_biotools_pub
from bh24_literature_mining.biotools import Tool_entry

tools = load_biotools_pub("../biotoolspub/biotoolspub.tsv")

# Case-insensitive de-duplication: remember every lower-cased name already
# seen and keep only the first Tool_entry built for each of them.
tools_lower = set()
unique_biotools = []

# Walk the two needed columns in parallel, one (name, biotoolsID) pair per row.
for name, biotools_id in zip(tools["name"], tools["biotoolsID"]):
    key = name.lower()
    if key in tools_lower:
        continue  # a tool with this name (ignoring case) was already kept
    tools_lower.add(key)
    unique_biotools.append(Tool_entry(biotool_id=biotools_id, name=name))


print(len(unique_biotools))

15005


## Example of how to use `europepmc_api`

In [3]:
from pathlib import Path
from bh24_literature_mining.europepmc_api import EuropePMCClient, write_tool_mentions_to_file
from bh24_literature_mining.europepmc_api import identify_tool_mentions_in_sentences
import pandas as pd

# Accumulates the records returned by identify_tool_mentions_in_sentences
# across all processed tools.
results_list = []
client = EuropePMCClient()

# Only the first 100 unique tools are processed.
for tool in unique_biotools[:100]:
    # Ask Europe PMC for the articles that mention the tool by name.
    mentioning_articles = client.search_mentions(tool.name)
    if len(mentioning_articles) == 0:
        continue

    # Only the first returned article is inspected.
    top_article = mentioning_articles[0]
    paragraphs = client.get_relevant_paragraphs(top_article.pmcid, tool.name)
    if len(paragraphs) == 0:
        print("No relevant paragraphs found", tool.name)
        continue

    # Return value is discarded; presumably this populates the topics on the
    # tool object before it is passed on below -- TODO confirm.
    tool.get_topics()

    results_list.extend(
        identify_tool_mentions_in_sentences(top_article.pmcid, tool, paragraphs)
    )


No relevant paragraphs found iPHoP
No relevant paragraphs found SLiM
No relevant paragraphs found BCFtools
No relevant paragraphs found AluMine
No relevant paragraphs found ARTEM
No relevant paragraphs found DataDiscovery
No relevant paragraphs found ADAPTIVE
No relevant paragraphs found Marine Metagenomics Portal
No relevant paragraphs found mi-CNN
No relevant paragraphs found SpaceM
No relevant paragraphs found SPUTNIK
No relevant paragraphs found PatchDock
No relevant paragraphs found Reactome
No relevant paragraphs found MAGMA
No relevant paragraphs found DISGENET
No relevant paragraphs found Phobius (EBI)
No relevant paragraphs found hmrbase
No relevant paragraphs found CDpred
No relevant paragraphs found CancerLivER
No relevant paragraphs found Stitch
No relevant paragraphs found JCat
No relevant paragraphs found Expasy
No relevant paragraphs found ENDscript 2.0
No relevant paragraphs found Smoother
No relevant paragraphs found iVar


In [4]:
# Preview only the first few records; displaying the whole list floods the
# notebook output and bloats the saved file.
results_list[:3]

[['PMC11286849',
  'Using this technique, we identified 445 proteins with high confidence from trace amounts of highly pure spore preparations, including 52 of the 79 proteins (approximately 70%) previously demonstrated to be localized in spores in the SubtiWiki database and detected through direct protein analysis.',
  [(233, 242, 'SubtiWiki', 'subtiwiki')],
  'Molecular interactions, pathways and networks, Gene expression, Model organisms, Endocrinology and metabolism, Phylogenetics'],
 ['PMC11286849',
  'The 445 identified proteins were searched against the SubtiWiki and UniProt B.',
  [(54, 63, 'SubtiWiki', 'subtiwiki')],
  'Molecular interactions, pathways and networks, Gene expression, Model organisms, Endocrinology and metabolism, Phylogenetics'],
 ['PMC11286849',
  'The identified proteins also included 52 (70%) of the 79 proteins previously annotated as "sporulation proteins" in the SubtiWiki database and found in direct protein analysis (marked in yellow in the Supplementary 

In [5]:

# Each entry of results_list becomes one row; explode() then repeats that row
# once per element of its NER_Tags list, giving one row per tagged mention.
result_df = pd.DataFrame(
    results_list, columns=["PMCID", "Sentence", "NER_Tags", "Topics"]
).explode("NER_Tags")

# Relative path (resolved against the kernel's working directory) instead of
# a user-specific absolute path, so the notebook runs on any machine.
OUTPUT_CSV = Path("tool_mentions.csv")
result_df.to_csv(OUTPUT_CSV, index=False)



In [6]:
# Number of records in results_list, i.e. counted before NER_Tags is exploded
# into one DataFrame row per tag.
len(results_list)

2026

In [7]:
# Fixed random_state so the same example row is shown on every
# Restart & Run All instead of a different one each time.
result_df.sample(1, random_state=42)

Unnamed: 0,PMCID,Sentence,NER_Tags,Topics
1476,PMC11031912,"Next, we calculated the differences in AUC bet...","(55, 70, GenoMycAnalyzer, genomycanalyzer)","Whole genome sequencing, Public health and epi..."
