# Co-mentioning  

* Remove preprints from articles
* Create co-mentioning matrix
* Check whether the mentioning articles are themselves about a tool in bio.tools


The mentions file should contain a list in which each element holds a tool name, a PMID, and the list of articles mentioning the tool.

In [1]:
# Enable IPython's autoreload extension so that edits to imported project
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from bh24_literature_mining.utils import read_cites_from_json

# Location of the mentions file (JSON), relative to this notebook.
path = '../var/biotools_cites.json' # REPLACE with the path to the mentions file

# Parse the mentions file into `tools`. Later cells read each element's
# 'name' and 'articles' entries, and each article's `id` and `pubType`.
tools = read_cites_from_json(path)

# Sanity check: report how many tools were loaded.
print(f'Loaded {len(tools)} tools.')

Loaded 9453 tools.


### Remove preprints from articles in every tool

A published article can also appear in the list as a preprint, because the preprint and the final publication have different DOIs and other IDs.

In [3]:
# Drop preprint entries from every tool's article list.
# The number of tools never changes here, so the summary reports how many
# article mentions were removed rather than the (constant) tool count.
articles_before = sum(len(tool['articles']) for tool in tools)

for tool in tools:
    tool['articles'] = [article for article in tool['articles'] if article.pubType != 'preprint']

articles_after = sum(len(tool['articles']) for tool in tools)

print(
    f'Filtered out {articles_before - articles_after} preprint mentions. '
    f'{articles_after} article mentions remain across {len(tools)} tools.'
)

Filtered out preprints. Now 9453 tools.


### Collect unique article IDs

In [4]:
# De-duplicate article IDs across all tools.
# dict.fromkeys keeps the first-seen order, so the ID order (and therefore the
# column order of any matrix built from this list) is the same on every run.
# Going through a set would give an iteration order that, for string IDs, can
# change between kernel sessions because of hash randomization.
publication_ids = list(dict.fromkeys(article.id for tool in tools for article in tool['articles']))

print(f'Found {len(publication_ids)} unique publication IDs.')

Found 366828 unique publication IDs.


### Create binary matrix for tools vs. articles

In [7]:
import pandas as pd

tools_short = tools[:20]  # Replace with the list of tools you want to process

# Total number of tools to process
total_tools = len(tools_short)

# Initialize the matrix: one row per tool, one boolean per publication ID
matrix = []

for i, tool in enumerate(tools_short):
    # Build the tool's set of article IDs once. The membership test below then
    # costs O(1) per publication ID instead of rebuilding and scanning a list
    # of the tool's articles for every single publication ID.
    mentioned_ids = {article.id for article in tool["articles"]}

    # True where the publication mentions this tool
    row = [article_id in mentioned_ids for article_id in publication_ids]
    matrix.append(row)

    # Print progress every 10 tools, or adjust the frequency as needed
    if (i + 1) % 10 == 0 or (i + 1) == total_tools:
        print(
            f"Progress: {i + 1}/{total_tools} tools processed ({(i + 1) / total_tools * 100:.2f}%)"
        )

# Rows are labelled with tool names, columns with publication IDs.
# Tool names are not guaranteed to be unique, so the index may contain repeats.
comentions_df = pd.DataFrame(
    matrix,
    index=[tool["name"] for tool in tools_short],
    columns=publication_ids,
)

Progress: 10/20 tools processed (50.00%)
Progress: 20/20 tools processed (100.00%)


In [None]:
# Rows whose tool name occurs more than once in the index.
# A boolean mask selects each such row exactly once; label-based selection
# with repeated labels on a non-unique index would return every matching row
# once per repeated label.
has_repeated_name = comentions_df.index.duplicated(keep=False)
repeated_rows = comentions_df[has_repeated_name]

# Names for which all rows are identical, i.e. exact duplicates.
# Names whose rows differ are left untouched.
names_with_identical_rows = [
    name
    for name, rows in repeated_rows.groupby(level=0)
    if (rows.to_numpy() == rows.to_numpy()[0]).all()
]

# Remove exact duplicates by dropping all but the first instance.
# Masking (instead of drop + concat) keeps the surviving rows in their
# original order and leaves the frame unchanged when nothing is duplicated.
is_redundant = (
    comentions_df.index.isin(names_with_identical_rows)
    & comentions_df.index.duplicated(keep="first")
)
comentions_matrix = comentions_df[~is_redundant]

print("Deduplicated DataFrame:")
print(comentions_matrix.head())


### Check if articles are about a tool in bio.tools

In [None]:
# Load bio.tools data
import json  # none of the cells shown so far import json, so bring it into scope here

biotoolspath = 'path/to/biotools.json' # REPLACE

# JSON is read as UTF-8 explicitly so the result does not depend on the
# platform's default encoding.
with open(biotoolspath, encoding='utf-8') as f:
    biotools = json.load(f)

# Index bio.tools once: publication ID -> bio.tools entries listing that ID.
# Both the PMID and the PMCID of every publication are indexed, so an entry
# that carries both identifiers can be matched through either of them.
# NOTE(review): matching is by plain equality, so the IDs in `publication_ids`
# must use the same type/format as the bio.tools dump - confirm.
biotools_by_publication_id = {}
for biotools_entry in biotools:
    entry_publication_ids = {
        article.get(id_field)
        for article in biotools_entry.get('publication') or []
        for id_field in ('pmid', 'pmcid')
    }
    # Ignore missing/empty identifiers so a publication without IDs never matches.
    entry_publication_ids -= {None, ''}
    for publication_id in entry_publication_ids:
        biotools_by_publication_id.setdefault(publication_id, []).append(biotools_entry)

# One (publication ID, biotoolsID) pair per match, ordered by publication ID
# and then by the entry's position in the bio.tools dump.
publication_ids_in_biotools = [
    (publication_id, biotools_entry['biotoolsID'])
    for publication_id in publication_ids
    for biotools_entry in biotools_by_publication_id.get(publication_id, ())
]

print(f'Matched {len(publication_ids_in_biotools)} publication IDs with a bio.tools tool.')