# Create adjacency matrices of co-mentioned or co-cited tools from text mining results

This notebook gets the lists of tool mentionings (or citations) and creates a pkl file of the adjacency matrix.

In [None]:
import numpy as np
import json
import pandas as pd
import requests
import igraph as ig

from bh24_literature_mining.europepmc_api import EuropePMCClient, Article
from bh24_literature_mining.utils import (
    load_biotools_pub,
    load_biotools_from_zip,
    load_biotools_from_json,
)

### Read the data
E.g. "biotools_cites.json" or "biotools_mentions.json"

In [None]:
tools = Article.read_cites_from_json("biotools_cites.json")

print(f"Loaded {len(tools)} tools.")

Loaded 9453 tools.


### Check number of articles in every tool


In [None]:
all_ids = []
for tool in tools:
    for article in tool["articles"]:
        all_ids.append(article.id)
all_ids = set(all_ids)

print(f"Total number of articles: {len(all_ids)}")

Total number of articles: 366828


### Remove preprints from articles in every tool

For published articles there can also be preprints included since DOI and other IDs are different from the final publication to the preprint.

In [None]:
for tool in tools:
    tool["articles"] = [
        article for article in tool["articles"] if article.pubType != "preprint"
    ]
all_ids = []
for tool in tools:
    for article in tool["articles"]:
        all_ids.append(article.id)
all_ids = set(all_ids)

print(f"Filtered out preprints. Now {len(all_ids)} articles.")

Filtered out preprints. Now 366828 articles.


### Check for duplicates

Check duplicates with respect to tool_name (these can be the same for different tools), and then check duplicates for tool_name + pubmedid combination (these should be unique). 

In [None]:
from collections import Counter

tool_names = [tool["name"] for tool in tools]
duplicates_names = [name for name, count in Counter(tool_names).items() if count > 1]
print("Duplicate tool names:", duplicates_names)

tool_names_pubmedid = [tool["name"] + "_" + str(tool["pubmedid"]) for tool in tools]
duplicates_names_id = [
    name for name, count in Counter(tool_names_pubmedid).items() if count > 1
]
print("Duplicate tool names + pubmedids:", duplicates_names_id)

triplicated_names_id = [
    name for name, count in Counter(tool_names_pubmedid).items() if count > 2
]
print("Triplicated tool names + pubmedids:", triplicated_names_id)

Duplicate tool names: ['ITHANET', 'BCFtools', 'SAMtools', 'REPET', 'Integrated Microbial Genomes (IMG)', 'Reactome', 'Expasy', 'BUSCO', 'BLAST (EBI)', 'E-RNAi', 'Interactive Tree Of Life (iTOL)', 'IMP', 'H++', 'BioMet Toolbox', 'E-SNPs and GO', 'DeepMito', 'cd-hit', 'SNAP', 'Bowtie 2', 'ClinVar', 'HGMD', 'compareMS2', 'MS Amanda', 'REDIportal', 'miEAA', 'ReMap', 'EBI Tools', 'PREP Suite', 'ODNA', 'ImaGene', 'PDB-REDO databank', 'FIPRESCI', 'InterProScan (EBI)', 'FastTree', 'antiSMASH', 'V-pipe', 'HMMER3', 'MAFFT', 'MINT', 'EvolView', 'RNAget', 'shinyHTGQC', 'Planet Microbe', '3SRP', 'JASPAR', 'JASPAR RESTful API', 'CHOPCHOP', 'WEBnma', 'CLAIRE', 'Salmobase', 'MirGeneDB', 'VETA', 'RINspector', 'MetExplore', '3D-BioInfo + BioExcel: Protein Conformational Ensembles Generation', 'COVID19 Outbreak Simulator', 'Watchdog', 'VariantSpark', 'DDGun', 'PomBase', 'PhysiCell', 'Metabolic Atlas', 'PhylDiag', 'MobiDB', 'SEMA', 'IGV', 'LNM', 'Conserved domain database CDD', 'Open Targets Platform', 'A

### Leave unique by tool_name + pubmedid
Leave first copy of copies of tool_name + pubmedid (this is the same tool added twice/three times to biotools)

In [None]:
unique_tools = []
seen = set()
for tool in tools:
    identifier = tool["name"] + "_" + str(tool["pubmedid"])
    if identifier not in seen:
        seen.add(identifier)
        unique_tools.append(tool)

print(f"Filtered out duplicates. Now {len(unique_tools)} tools.")

Filtered out duplicates. Now 9435 tools.


### Get all publication IDs

In [None]:
all_ids = []
for tool in unique_tools:
    for article in tool["articles"]:
        all_ids.append(article.id)
all_ids = set(all_ids)

print(f"Total number of articles: {len(all_ids)}")

Total number of articles: 366828


### Create binary matrix for tools vs. articles

In [None]:
tool_names = [tool["name"] + "_" + str(tool["pubmedid"]) for tool in unique_tools]
matrix = pd.DataFrame(0, index=tool_names, columns=sorted(set(all_ids)))
i = 0
for tool in unique_tools:
    print(f"{i}/{len(unique_tools)}")
    i = i + 1
    for article in tool["articles"]:
        matrix.loc[tool["name"] + "_" + str(tool["pubmedid"]), article.id] = 1

0/9435
1/9435
2/9435
3/9435
4/9435
5/9435
6/9435
7/9435
8/9435
9/9435
10/9435
11/9435
12/9435
13/9435
14/9435
15/9435
16/9435
17/9435
18/9435
19/9435
20/9435
21/9435
22/9435
23/9435
24/9435
25/9435
26/9435
27/9435
28/9435
29/9435
30/9435
31/9435
32/9435
33/9435
34/9435
35/9435
36/9435
37/9435
38/9435
39/9435
40/9435
41/9435
42/9435
43/9435
44/9435
45/9435
46/9435
47/9435
48/9435
49/9435
50/9435
51/9435
52/9435
53/9435
54/9435
55/9435
56/9435
57/9435
58/9435
59/9435
60/9435
61/9435
62/9435
63/9435
64/9435
65/9435
66/9435
67/9435
68/9435
69/9435
70/9435
71/9435
72/9435
73/9435
74/9435
75/9435
76/9435
77/9435
78/9435
79/9435
80/9435
81/9435
82/9435
83/9435
84/9435
85/9435
86/9435
87/9435
88/9435
89/9435
90/9435
91/9435
92/9435
93/9435
94/9435
95/9435
96/9435
97/9435
98/9435
99/9435
100/9435
101/9435
102/9435
103/9435
104/9435
105/9435
106/9435
107/9435
108/9435
109/9435
110/9435
111/9435
112/9435
113/9435
114/9435
115/9435
116/9435
117/9435
118/9435
119/9435
120/9435
121/9435
122/9435
123

In [26]:
matrix

Unnamed: 0,10811817,11125070,11178258,11574056,11597340,11869452,12019020,12171605,12877744,14611664,...,PPR7024,PPR7025,PPR7026,PPR7027,PPR7028,PPR7029,PPR7032,PPR7033,PPR7034,PPR7035
Seurat_34062119,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bakta_34739369,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
gget_38377393,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SIBiLS_32379317,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Variomes_35274687,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MeDeCom_28340624,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
MToolBox_25028726,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NCBI Bookshelf_23203889,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
EXTRACT_26896844,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Filter publications that occur in zero/one tool


In [27]:
matrix_filt = matrix.loc[:, matrix.sum(axis=0) > 1]

### Creating adjacency matrix

This is much faster using sparse matrices. Fill diagonal values with zeros (to avoid self loops in the graph).

In [None]:
# Running as sparse matrices
from scipy.sparse import csr_matrix

# Example sparse arrays
a_sparse = csr_matrix(matrix.values)
b_sparse = csr_matrix(matrix.values.T)

# Dot product for sparse matrices
adj = a_sparse.dot(b_sparse).toarray()

adj_df = pd.DataFrame(adj, index=matrix.index, columns=matrix.index)

# Fill diagonal with zeros
np.fill_diagonal(adj_df.values, 0)

### Save adjacency matrix  
As compressed pickle file       

In [29]:
adj_df.to_pickle("../biotoolspub/adjancency_cites_filt.pkl", compression="gzip")