In [None]:
import json
import pandas as pd

from bh24_literature_mining.utils import load_biotools_from_zip
from bh24_literature_mining.europepmc_api import Article

Load co-citation data

In [None]:
# Read the co-citation records from the JSON file in the working directory.
# Later cells iterate `tools_cites` and read each record's "articles" list,
# whose items expose an `.id` attribute.
tools_cites = Article.read_cites_from_json("./biotools_cites.json")

# Sanity check: report how many records were loaded.
print(f"Loaded {len(tools_cites)} tools.")

Loaded 9453 tools.


Get all publication IDs

In [None]:
# Every article ID across all co-citation records, in encounter order.
# Duplicates are kept here on purpose: the same article can appear under
# several tools.
all_repeated_ids = [
    article.id for tool in tools_cites for article in tool["articles"]
]

# Unique article IDs. dict.fromkeys() de-duplicates while preserving
# first-seen order, so the result is identical on every kernel restart
# (list(set(...)) yields an order that varies with string-hash randomisation).
all_ids = list(dict.fromkeys(all_repeated_ids))

print(f"Total number of articles: {len(all_ids)}")

Total number of articles: 366828


Load biotools dump

In [None]:
# Load the bio.tools entries from the zipped dump in the working directory.
# NOTE(review): presumably "biotools.json" is the member file read from inside
# the archive — confirm against load_biotools_from_zip.
biotools = load_biotools_from_zip("./biotoolsdump.zip", "biotools.json")

# Sanity check: report how many entries were loaded.
print(f"Loaded {len(biotools)} bio.tools entries")

Loaded 30239 bio.tools entries


Match cited articles with bio.tools entries

In [None]:
# Lookup from publication identifier (PMID or PMCID) to the biotoolsID of the
# bio.tools entry that lists it.
# NOTE: when several entries list the same publication, the entry that comes
# last in the dump wins; earlier ones are overwritten.
publication_to_tool = {}

for tool in biotools:
    # `or []` covers both a missing "publication" key and one that is present
    # but null, which would otherwise raise TypeError when iterated.
    for article in tool.get("publication") or []:
        for pub_id in (article.get("pmid"), article.get("pmcid")):
            # Skip identifiers that are absent or empty.
            if pub_id:
                publication_to_tool[pub_id] = tool["biotoolsID"]


# One slot per entry of all_ids (same order); "" marks an article that no
# bio.tools entry lists.
articles_biotoolsID = [publication_to_tool.get(key, "") for key in all_ids]

# Count the non-empty slots without building a temporary list and without
# shadowing the built-in `id`.
total_matches = sum(1 for biotools_id in articles_biotoolsID if biotools_id != "")

print(f"Matched {total_matches} publication IDs with a bio.tools entry")

Matched 9544 publication IDs with a bio.tools entry


Merge results with original co-citation data

In [None]:
# all_ids and articles_biotoolsID are index-aligned, so zipping them gives the
# article-ID -> biotoolsID lookup directly ("" where nothing matched).
id_to_biotools = dict(zip(all_ids, articles_biotoolsID))

# Annotate each co-citation record in place: under the "biotools" key, store
# the looked-up value for each of its articles, in article order. Articles
# whose ID is not a key of the lookup are skipped.
for tool in tools_cites:
    biotools_ids = [
        id_to_biotools[entry_id]
        for entry_id in (entry.id for entry in tool["articles"])
        if entry_id in id_to_biotools
    ]
    tool["biotools"] = biotools_ids