## Libraries

In [None]:
! pip install -q biopython
! pip install -q pyvis

[K     |████████████████████████████████| 2.3 MB 6.8 MB/s 
[?25h

In [None]:
# Fetching PubMed article metadata
from Bio import Entrez, Medline

# Graph creation and visualisation
from pyvis.network import Network
import networkx as nx 

import time
from tqdm import tqdm
import os 
from operator import itemgetter

# data handling
import pandas as pd
import numpy as np

# plotting
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

plt.style.use('fivethirtyeight')

In [None]:
# Wall-clock timestamp (seconds since the epoch) taken before the pipeline
# runs; subtracted from `endtime` in the last cells to report total runtime.
starttime = time.time()

## Helper Code

In [None]:
def process_pmid_txt(text_file_path):
  """Read a text file containing one PubMed ID per line.

  Parameters
  ----------
  text_file_path : str or path-like
      Path of the text file to read.

  Returns
  -------
  list of str
      The IDs in file order, with surrounding whitespace removed. Blank
      lines (including the empty entry produced by a trailing newline) are
      skipped so that no empty ID ends up in a downstream query.
  """
  # `with` guarantees the file is closed even if reading raises.
  with open(text_file_path, "r") as f:
    stripped_lines = (line.strip() for line in f.read().split('\n'))
    return [pmid for pmid in stripped_lines if pmid]

In [None]:
def fetch_data(pmids):
    """Return the PubMed records associated with the PMID(s).

    Parameters
    ----------
    pmids : str or sequence of str
        The ID(s) to look up; passed unchanged as the `id` argument of
        `Entrez.efetch` (the caller in this notebook passes a
        comma-separated string).

    Returns
    -------
    list
        Every record yielded by `Medline.parse` for the response, or an
        empty list when `pmids` is empty (no request is made in that case).
    """
    # Nothing to look up: avoid sending a request with an empty id list.
    if not pmids:
        return []

    Entrez.email = 'akishirsath@gmail.com'

    # NOTE(review): a single efetch call appears to be capped at 10,000
    # records by NCBI; confirm whether longer ID lists need to be batched.
    handle = Entrez.efetch(db="pubmed",
                           id=pmids,
                           rettype="medline",
                           retmode="text")

    try:
        # Medline.parse is consumed here, while the handle is still open.
        return list(Medline.parse(handle))
    finally:
        # Release the underlying HTTP response even if parsing fails.
        handle.close()

## Fetching the records

In [None]:
# Location of the PMID list (one ID per line, as read by process_pmid_txt).
# NOTE(review): presumably a Google Drive folder mounted in Colab; no mount
# call is visible in this notebook, so confirm the drive is mounted first.
sarscov_file_path = "/content/drive/MyDrive/05-Data/PubMed-Common-Enzymes/pmid-sarscov2-set.txt"

# Read the IDs from disk.
sarscov_pmids = process_pmid_txt(sarscov_file_path)

# Fetch all records in one request; the IDs are sent as a comma-separated string.
sarscov_data = fetch_data(",".join(sarscov_pmids))

In [None]:
# Number of records actually returned by the fetch.
# NOTE(review): compare against len(sarscov_pmids); a round figure such as
# 10000 may indicate the response was truncated; confirm.
len(sarscov_data)

10000

In [None]:
# Maps the leading digit of an EC number (as a string) to the name of that
# enzyme class. Names are listed in class order, so position 1 is class '1'.
enzyme_classes = {
    str(class_number): class_name
    for class_number, class_name in enumerate(
        (
            'Oxidoreductases',
            'Transferases',
            'Hydrolases',
            'Lyases',
            'Isomerases',
            'Ligases',
            'Translocases',
        ),
        start=1,
    )
}

In [None]:
"""

for record in sarscov_data:
  substances = record.get('RN', "NONE")
  if substances != "NONE":
    for molecule in substances:
      if molecule.startswith('EC'):
        # Primary PMID node
        main_node = str(record.get('PMID', "NONE")).strip()
        G.add_node(main_node)
        # Secondary Enzyme node
        G.add_node(molecule)
        G.add_node(molecule, size=15, title='Hydrolases', group=1)
        G.add_edge(main_node, molecule)


"""

'\n\nfor record in sarscov_data:\n  substances = record.get(\'RN\', "NONE")\n  if substances != "NONE":\n    for molecule in substances:\n      if molecule.startswith(\'EC\'):\n        # Primary PMID node\n        main_node = str(record.get(\'PMID\', "NONE")).strip()\n        G.add_node(main_node)\n        # Secondary Enzyme node\n        G.add_node(molecule)\n        G.add_node(molecule, size=15, title=\'Hydrolases\', group=1)\n        G.add_edge(main_node, molecule)\n\n\n'

In [None]:
# Empty undirected graph; filled with article (PMID) and enzyme nodes below.
G = nx.Graph()

In [None]:
# Populate G: one node per article that lists any substance, one node per
# EC-numbered substance, and an edge from each article to every enzyme it lists.
for record in sarscov_data:
  # 'RN' carries the record's substance entries; records without it are skipped.
  article_substances = record.get('RN', [])
  if not article_substances:
    continue

  article_pmid = str(record.get('PMID', "NONE")).strip()

  # Added once per article; the node exists even if none of the article's
  # substances turns out to be an enzyme.
  G.add_node(article_pmid, size=20, group=10)

  for molecule in article_substances:
    tokens = molecule.split(' ')
    # Enzyme entries are expected in the form "EC <number> ...". The length
    # guard keeps an entry that starts with "EC" but has no second token from
    # raising IndexError.
    if not molecule.startswith('EC') or len(tokens) < 2:
      continue

    # The top-level class is the first dot-separated field of the EC number.
    classno = tokens[1].split('.', 1)[0]
    class_name = enzyme_classes.get(classno)
    if class_name is None:
      continue

    G.add_node(molecule, size=15, title=class_name, group=int(classno))
    G.add_edge(article_pmid, molecule)

In [None]:
# Total node count: article (PMID) nodes plus enzyme nodes.
G.number_of_nodes()

6143

In [None]:
# Number of article-to-enzyme links.
G.number_of_edges()

2246

In [None]:
# Save the graph, including node attributes, as GraphML in the working directory.
nx.write_graphml_lxml(G, "covid_19_enzymes.graphml")

In [None]:
# Interactive view: a 720px x 720px pyvis canvas configured for notebook use.
nt = Network(height='720px', width='720px', notebook=True)

# Copy every node and edge (with their attributes) over from the NetworkX graph.
nt.from_nx(G)

# Write the visualisation to an HTML file and display it.
nt.show('covid_19_enzymes.html')

In [None]:
# Wall-clock timestamp taken after the last pipeline step; paired with `starttime`.
endtime = time.time()

In [None]:
# Elapsed wall-clock time in minutes (time.time() differences are in seconds).
total = (endtime - starttime)/60
total

1.741008953253428