# Enzymes Network Graph

## Libraries

In [1]:
! pip install -q biopython
! pip install -q pyvis

In [2]:
# Fetching PubMed article metadata
from Bio import Entrez, Medline

# Graph creation and visualisation
from pyvis.network import Network

import time
from tqdm import tqdm

import pandas as pd

## Helper Code

In [3]:
def fetch_data(pmids):
    """Return the PubMed records associated with the given PMID(s).

    Args:
        pmids: Value forwarded as the ``id`` argument of ``Entrez.efetch``.
            The calls in this notebook pass a comma-separated string of
            PMIDs.

    Returns:
        A list with every record produced by ``Medline.parse`` for the
        response.
    """
    Entrez.email = 'akishirsath@gmail.com'

    handle = Entrez.efetch(db="pubmed",
                           id=pmids,
                           rettype="medline",
                           retmode="text")

    try:
        # Materialise the parsed records before the handle is closed; the
        # parser reads from the handle while it is being iterated.
        return list(Medline.parse(handle))
    finally:
        # Always release the response handle, even if parsing fails.
        handle.close()

In [4]:
def process_pmid_txt(text_file_path):
    """Read PMIDs from a text file that lists one PMID per line.

    Args:
        text_file_path: Path of the text file to read.

    Returns:
        A list of the whitespace-stripped, non-empty lines in file order.
        Blank lines (for example the one produced by a trailing newline)
        are skipped so that no empty PMID ends up in the result.
    """
    # The context manager closes the file even if reading raises.
    with open(text_file_path, "r") as f:
        stripped_lines = (line.strip() for line in f)
        return [pmid for pmid in stripped_lines if pmid]

## Processing PMIDs txt file

In [5]:
# Load the PMID list of each dataset (cancer, COVID-19, Ebola) from its text file.
cancer = process_pmid_txt("/content/pmid-cancer-set.txt")
covid = process_pmid_txt("/content/pmid-covid19-set.txt")
ebola = process_pmid_txt("/content/pmid-Ebola-set.txt")

In [6]:
# Show how many cancer PMIDs were loaded and preview the first ten.
print(len(cancer), cancer[:10])

10000 ['31761807', '28244479', '27741350', '29949179', '26667886', '30713326', '28831912', '28574057', '19491253', '27839715']


In [7]:
# Show how many COVID PMIDs were loaded and preview the first ten.
print(len(covid), covid[:10])

10000 ['33308664', '33522478', '33189872', '33666147', '33139420', '32383182', '33126180', '33322035', '33572857', '33301459']


In [8]:
# Show how many Ebola PMIDs were loaded and preview the first ten.
print(len(ebola), ebola[:10])

10000 ['27959626', '32080199', '26923959', '25910510', '32441897', '25694096', '30777297', '31668200', '25694094', '31567063']


## Fetching the records

In [9]:
# Fetch the records for every Ebola PMID with a single fetch_data call;
# the PMIDs are sent as one comma-separated string.
ebola_data = fetch_data(",".join(ebola))
# Wait one second before the next cell issues its request
# (presumably to respect NCBI's request-rate limits — confirm).
time.sleep(1)

In [10]:
# Fetch the records for every COVID PMID with a single fetch_data call;
# the PMIDs are sent as one comma-separated string.
covid_data = fetch_data(",".join(covid))
# Wait one second before the next cell issues its request
# (presumably to respect NCBI's request-rate limits — confirm).
time.sleep(1)

In [None]:
# Fetch the records for every cancer PMID with a single fetch_data call;
# the PMIDs are sent as one comma-separated string.
cancer_data = fetch_data(",".join(cancer))

## Records to Pandas Dataframe


### Covid

In [None]:
# Number of COVID records returned by fetch_data.
len(covid_data)

In [None]:
# Inspect the first COVID record to see which fields it carries.
covid_data[0]

In [None]:
# MEDLINE field tags extracted from every fetched record; each tag becomes
# one column of the resulting DataFrame.
required_keys = [
    'PMID', 'TI', 'AB', 'AD',
    'FAU', 'DP', 'TA', 'JT',
    'LA', 'MH', 'OAB', 'OT',
    'PL', 'PST', 'PT', 'RN', 'SI',
]

In [None]:
# Flatten every COVID record into one row holding the values of
# required_keys, in that order. Missing fields become the string "NONE".
covid_data_for_df = []

for single_data in tqdm(covid_data):
    temp = []
    for key in required_keys:
        value = single_data.get(key, "NONE")
        if isinstance(value, list):
            # De-duplicate while keeping first-seen order. A set would
            # shuffle the entries (e.g. author order in 'FAU') and give a
            # different ordering from run to run.
            value = ";".join(dict.fromkeys(value))
        temp.append(value)

    covid_data_for_df.append(temp)

In [None]:
# Preview the first flattened COVID row.
covid_data_for_df[0]

In [None]:
# Build the COVID DataFrame: one row per record, one column per entry of required_keys.
covid_dataframe = pd.DataFrame(covid_data_for_df, columns=required_keys)

In [None]:
# Inspect the 'RN' column of the COVID DataFrame.
covid_dataframe['RN']

In [None]:
# Label every row with its dataset; the scalar is broadcast to all rows.
covid_dataframe['Class'] = 'COVID'

In [None]:
# Preview the first three rows of the COVID DataFrame.
covid_dataframe.head(3)

In [None]:
# Save the COVID metadata as CSV; index=False leaves the row index out of the file.
covid_dataframe.to_csv("10k-covid-pubmed-metadata.csv", index=False)

### Cancer

In [None]:
# Number of cancer records returned by fetch_data.
len(cancer_data)

In [None]:
# Inspect the first cancer record to see which fields it carries.
cancer_data[0]

In [None]:
# MEDLINE field tags extracted from every fetched record; each tag becomes
# one column of the resulting DataFrame. Same tag list as in the COVID section.
required_keys = [
    'PMID', 'TI', 'AB', 'AD',
    'FAU', 'DP', 'TA', 'JT',
    'LA', 'MH', 'OAB', 'OT',
    'PL', 'PST', 'PT', 'RN', 'SI',
]

In [None]:
# Flatten every cancer record into one row holding the values of
# required_keys, in that order. Missing fields become the string "NONE".
cancer_data_for_df = []

for single_data in tqdm(cancer_data):
    temp = []
    for key in required_keys:
        value = single_data.get(key, "NONE")
        if isinstance(value, list):
            # De-duplicate while keeping first-seen order. A set would
            # shuffle the entries (e.g. author order in 'FAU') and give a
            # different ordering from run to run.
            value = ";".join(dict.fromkeys(value))
        temp.append(value)

    cancer_data_for_df.append(temp)

In [None]:
# Preview the first flattened cancer row.
cancer_data_for_df[0]

In [None]:
# Build the cancer DataFrame: one row per record, one column per entry of required_keys.
cancer_dataframe = pd.DataFrame(cancer_data_for_df, columns=required_keys)

In [None]:
# Inspect the 'RN' column of the cancer DataFrame.
cancer_dataframe['RN']

In [None]:
# Label every row with its dataset; the scalar is broadcast to all rows.
cancer_dataframe['Class'] = 'CANCER'

In [None]:
# Save the cancer metadata as CSV; index=False leaves the row index out of the file.
cancer_dataframe.to_csv("10k-cancer-pubmed-metadata.csv", index=False)

### Ebola

In [None]:
# Inspect the 'RN' column of the cancer DataFrame.
# NOTE(review): this cell sits under the Ebola heading but reads the cancer
# frame — looks like a leftover from the previous section; confirm.
cancer_dataframe['RN']

In [None]:
# Number of Ebola records returned by fetch_data.
len(ebola_data)

In [None]:
# Inspect the first Ebola record to see which fields it carries.
ebola_data[0]

In [None]:
# MEDLINE field tags extracted from every fetched record; each tag becomes
# one column of the resulting DataFrame. Same tag list as in the sections above.
required_keys = [
    'PMID', 'TI', 'AB', 'AD',
    'FAU', 'DP', 'TA', 'JT',
    'LA', 'MH', 'OAB', 'OT',
    'PL', 'PST', 'PT', 'RN', 'SI',
]

In [None]:
# Flatten every Ebola record into one row holding the values of
# required_keys, in that order. Missing fields become the string "NONE".
ebola_data_for_df = []

for single_data in tqdm(ebola_data):
    temp = []
    for key in required_keys:
        value = single_data.get(key, "NONE")
        if isinstance(value, list):
            # De-duplicate while keeping first-seen order. A set would
            # shuffle the entries (e.g. author order in 'FAU') and give a
            # different ordering from run to run.
            value = ";".join(dict.fromkeys(value))
        temp.append(value)

    ebola_data_for_df.append(temp)

In [None]:
# Preview the first flattened Ebola row.
ebola_data_for_df[0]

In [None]:
# Build the Ebola DataFrame: one row per record, one column per entry of required_keys.
ebola_dataframe = pd.DataFrame(ebola_data_for_df, columns=required_keys)

In [None]:
# Inspect the 'RN' column of the Ebola DataFrame.
ebola_dataframe['RN']

In [None]:
# Label every row with its dataset; the scalar is broadcast to all rows.
ebola_dataframe['Class'] = 'EBOLA'

In [None]:
# Preview the first three rows of the Ebola DataFrame.
ebola_dataframe.head(3)

In [None]:
# Save the Ebola metadata as CSV; index=False leaves the row index out of the file.
ebola_dataframe.to_csv("10k-ebola-pubmed-metadata.csv", index=False)

### Combining dataframes

In [None]:
# Stack the three per-disease frames (cancer, Ebola, COVID) into one.
# ignore_index=True gives the result a fresh 0..n-1 index instead of
# repeating each frame's own row numbers.
frames = [cancer_dataframe, ebola_dataframe, covid_dataframe]
main_dataframe = pd.concat(frames, ignore_index=True)

In [None]:
# Print a summary of the combined DataFrame.
main_dataframe.info()

In [None]:
# Preview the first rows of the combined DataFrame.
main_dataframe.head()

In [None]:
# Save the combined metadata as CSV; index=False leaves the row index out of the file.
main_dataframe.to_csv("10k-combine-pubmed-metadata.csv", index=False)

#### Saving copy to Google Drive

In [None]:
# Copy the four exported CSV files into the enzyme-network folder on Google Drive.
# NOTE(review): assumes Drive is already mounted at /content/drive — no mount
# step is visible in this part of the notebook; confirm before running.
! cp -r /content/10k-combine-pubmed-metadata.csv /content/drive/MyDrive/05-Data/enzyme-network/

! cp -r /content/10k-ebola-pubmed-metadata.csv /content/drive/MyDrive/05-Data/enzyme-network/

! cp -r /content/10k-covid-pubmed-metadata.csv /content/drive/MyDrive/05-Data/enzyme-network/

! cp -r /content/10k-cancer-pubmed-metadata.csv /content/drive/MyDrive/05-Data/enzyme-network/