# Enzymes Network Graph

## Libraries

In [1]:
! pip install -q biopython
! pip install -q pyvis

[?25l[K     |▏                               | 10 kB 17.5 MB/s eta 0:00:01[K     |▎                               | 20 kB 24.1 MB/s eta 0:00:01[K     |▍                               | 30 kB 24.1 MB/s eta 0:00:01[K     |▋                               | 40 kB 10.1 MB/s eta 0:00:01[K     |▊                               | 51 kB 9.5 MB/s eta 0:00:01[K     |▉                               | 61 kB 11.1 MB/s eta 0:00:01[K     |█                               | 71 kB 9.9 MB/s eta 0:00:01[K     |█▏                              | 81 kB 10.3 MB/s eta 0:00:01[K     |█▎                              | 92 kB 11.4 MB/s eta 0:00:01[K     |█▍                              | 102 kB 9.9 MB/s eta 0:00:01[K     |█▋                              | 112 kB 9.9 MB/s eta 0:00:01[K     |█▊                              | 122 kB 9.9 MB/s eta 0:00:01[K     |█▉                              | 133 kB 9.9 MB/s eta 0:00:01[K     |██                              | 143 kB 9.9 MB/s eta 0:00:01

In [2]:
# Fetching PubMed article metadata
from Bio import Entrez, Medline

# Graph creation and visualisation
from pyvis.network import Network

import time
from tqdm import tqdm

import pandas as pd

In [3]:
starttime = time.time()

## Helper Code

In [4]:
def fetch_data(pmids):
    """Returns pubmed record associated with the PMID(s)"""
    
    Entrez.email = 'akishirsath@gmail.com'

    handle = Entrez.efetch(db="pubmed", 
                           id=pmids, 
                           rettype="medline", 
                           retmode="text")

    records = Medline.parse(handle)    
    
    return list(records)

In [5]:
def process_pmid_txt(text_file_path):

  pmids = list()

  f = open(text_file_path, "r")

  for pmid in f.read().split('\n'):
    pmids.append(pmid.strip())  
  
  f.close()
  
  return pmids

## Processing PMIDs txt file

In [6]:
cancer = process_pmid_txt("/content/pmid-cancer-set.txt")
covid = process_pmid_txt("/content/pmid-covid19-set.txt")
ebola = process_pmid_txt("/content/pmid-Ebola-set.txt")

In [7]:
print(len(cancer), cancer[:10])

10000 ['31761807', '28244479', '27741350', '29949179', '26667886', '30713326', '28831912', '28574057', '19491253', '27839715']


In [8]:
print(len(covid), covid[:10])

10000 ['33308664', '33522478', '33189872', '33666147', '33139420', '32383182', '33126180', '33322035', '33572857', '33301459']


In [9]:
print(len(ebola), ebola[:10])

10000 ['27959626', '32080199', '26923959', '25910510', '32441897', '25694096', '30777297', '31668200', '25694094', '31567063']


## Fetching the records

In [10]:
ebola_data = fetch_data(",".join(ebola))
time.sleep(1)

In [11]:
covid_data = fetch_data(",".join(covid))
time.sleep(1)

In [12]:
cancer_data = fetch_data(",".join(cancer))

## Records to Pandas Dataframe


### Covid

In [13]:
len(covid_data)

10000

In [14]:
covid_data[0]

{'AB': 'Accurate and rapid diagnostic tests are critical for achieving control of coronavirus disease 2019 (covid-19), a pandemic illness caused by severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2). Diagnostic tests for covid-19 fall into two main categories: molecular tests that detect viral RNA, and serological tests that detect anti-SARS-CoV-2 immunoglobulins. Reverse transcriptase polymerase chain reaction (RT-PCR), a molecular test, has become the gold standard for diagnosis of covid-19; however, this test has many limitations that include potential false negative results, changes in diagnostic accuracy over the disease course, and precarious availability of test materials. Serological tests have generated substantial interest as an alternative or complement to RT-PCR and other Nucleic acid tests in the diagnosis of acute infection, as some might be cheaper and easier to implement at the point of care. A clear advantage of these tests over RT-PCR is that they can identi

In [15]:
required_keys = ['PMID', 'TI', 'AB', 'AD', 'FAU', 'DP', 'TA', 'JT', 'LA', 
                 'MH', 'OAB', 'OT', 'PL', 'PST', 'PT', 'RN', 'SI']

In [16]:
covid_data_for_df = list()

for single_data in tqdm(covid_data):
  temp = list()
  for key in required_keys:
    value = single_data.get(key, "NONE")
    if isinstance(value, list):
      value = ";".join(set(value))
      temp.append(value)
    else:
      temp.append(value)

  covid_data_for_df.append(temp)

100%|██████████| 10000/10000 [00:00<00:00, 41141.20it/s]


In [17]:
covid_data_for_df[0]

['33308664',
 'Benefits and limitations of serological assays in COVID-19 infection.',
 'Accurate and rapid diagnostic tests are critical for achieving control of coronavirus disease 2019 (covid-19), a pandemic illness caused by severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2). Diagnostic tests for covid-19 fall into two main categories: molecular tests that detect viral RNA, and serological tests that detect anti-SARS-CoV-2 immunoglobulins. Reverse transcriptase polymerase chain reaction (RT-PCR), a molecular test, has become the gold standard for diagnosis of covid-19; however, this test has many limitations that include potential false negative results, changes in diagnostic accuracy over the disease course, and precarious availability of test materials. Serological tests have generated substantial interest as an alternative or complement to RT-PCR and other Nucleic acid tests in the diagnosis of acute infection, as some might be cheaper and easier to implement at the po

In [18]:
covid_dataframe = pd.DataFrame(covid_data_for_df, columns=required_keys)

In [19]:
covid_dataframe['RN']

0                                                   NONE
1                                  0 (COVID-19 Vaccines)
2       0 (Antigens, Viral);0 (Reagent Kits, Diagnostic)
3                                  0 (COVID-19 Vaccines)
4       0 (Antigens, Viral);0 (Reagent Kits, Diagnostic)
                              ...                       
9995                               0 (COVID-19 Vaccines)
9996                                                NONE
9997                                                NONE
9998                               0 (COVID-19 Vaccines)
9999          0 (Antiviral Agents);0 (COVID-19 Vaccines)
Name: RN, Length: 10000, dtype: object

In [20]:
covid_dataframe['Class'] = ['COVID']*len(covid_dataframe)

In [21]:
covid_dataframe.head(3)

Unnamed: 0,PMID,TI,AB,AD,FAU,DP,TA,JT,LA,MH,OAB,OT,PL,PST,PT,RN,SI,Class
0,33308664,Benefits and limitations of serological assays...,Accurate and rapid diagnostic tests are critic...,"New Delhi Tuberculosis Centre, JLN Marg, New D...","Chopra, K K;Hanif, M;Dwivedi, Kaushal Kumar;Si...",2020 Dec,Indian J Tuberc,The Indian journal of tuberculosis,eng,COVID-19/*diagnosis;Sensitivity and Specificit...,NONE,SARS CoV-2;COVID-19;IgG;Serology;IgM,India,ppublish,Review;Journal Article,NONE,NONE,COVID
1,33522478,Postvaccination COVID-19 among Healthcare Work...,Coronavirus disease (COVID-19) symptoms can be...,NONE,"Beni, Sharon Alexsandra;Biber, Asaf;Amit, Shar...",2021 Apr,Emerg Infect Dis,Emerging infectious diseases,eng,*COVID-19 Serological Testing/methods/statisti...,NONE,*COVID-19;*SARS;*Israel;*zoonoses;*vaccination...,United States,ppublish,Journal Article,0 (COVID-19 Vaccines),NONE,COVID
2,33189872,Field evaluation of a rapid antigen test (Panb...,OBJECTIVES: To our knowledge no previous study...,"Instituto Valenciano de Microbiologia, Betera,...","Navarro, David;Ferrer, Josep;Albert, Eliseo;Fe...",2021 Mar,Clin Microbiol Infect,Clinical microbiology and infection : the offi...,eng,COVID-19/*diagnosis;Infant;*Point-of-Care Test...,NONE,SARS-CoV-2;Primary healthcare centre;Early dia...,England,ppublish,Journal Article;Evaluation Study,"0 (Antigens, Viral);0 (Reagent Kits, Diagnostic)",NONE,COVID


In [22]:
covid_dataframe.to_csv("10k-covid-pubmed-metadata.csv", index=False)

### Cancer

In [23]:
len(cancer_data)

10000

In [24]:
cancer_data[0]

{'AB': "This essay focuses on themes in Explaining Cancer: Finding Order in Disorder (2018) by Anya Plutynski, a monograph that has important things to say about both the peculiarities of cancers and our theories about them. Cancer's agents of destruction are human cells that have been recruited and to some extent transformed into pathological organisms or the building blocks of tumors. Cancers both undermine and exploit mechanisms of multicellular organization, and understanding them gives rise to difficult philosophical problems. In addition to sketching Plutynski's discussion of these problems, this essay defends Christopher Boorse's account of disease from Plutynski's criticisms, and it expresses some qualms about her treatment of scientific explanation.",
 'AID': ['S1529879519400112 [pii]', '10.1353/pbm.2019.0046 [doi]'],
 'AU': ['Hausman DM'],
 'CRDT': ['2019/11/26 06:00'],
 'DCOM': '20200529',
 'DP': '2019',
 'EDAT': '2019/11/26 06:00',
 'FAU': ['Hausman, Daniel M'],
 'IP': '4',

In [25]:
required_keys = ['PMID', 'TI', 'AB', 'AD', 'FAU', 'DP', 'TA', 'JT', 'LA', 
                 'MH', 'OAB', 'OT', 'PL', 'PST', 'PT', 'RN', 'SI']

In [26]:
cancer_data_for_df = list()

for single_data in tqdm(cancer_data):
  temp = list()
  for key in required_keys:
    value = single_data.get(key, "NONE")
    if isinstance(value, list):
      value = ";".join(set(value))
      temp.append(value)
    else:
      temp.append(value)

  cancer_data_for_df.append(temp)

100%|██████████| 10000/10000 [00:00<00:00, 44186.99it/s]


In [27]:
cancer_data_for_df[0]

['31761807',
 'What Is Cancer?',
 "This essay focuses on themes in Explaining Cancer: Finding Order in Disorder (2018) by Anya Plutynski, a monograph that has important things to say about both the peculiarities of cancers and our theories about them. Cancer's agents of destruction are human cells that have been recruited and to some extent transformed into pathological organisms or the building blocks of tumors. Cancers both undermine and exploit mechanisms of multicellular organization, and understanding them gives rise to difficult philosophical problems. In addition to sketching Plutynski's discussion of these problems, this essay defends Christopher Boorse's account of disease from Plutynski's criticisms, and it expresses some qualms about her treatment of scientific explanation.",
 'NONE',
 'Hausman, Daniel M',
 '2019',
 'Perspect Biol Med',
 'Perspectives in biology and medicine',
 'eng',
 'Philosophy, Medical;Humans;Neoplasms/*etiology',
 'NONE',
 'NONE',
 'United States',
 'pp

In [28]:
cancer_dataframe = pd.DataFrame(cancer_data_for_df, columns=required_keys)

In [29]:
cancer_dataframe['RN']

0                                      NONE
1                                      NONE
2                                      NONE
3                 0 (Antineoplastic Agents)
4                                      NONE
                       ...                 
9995                                   NONE
9996                                   NONE
9997                                   NONE
9998    0 (Chromates);0R0008Q3JB (Chromium)
9999              0 (Antineoplastic Agents)
Name: RN, Length: 10000, dtype: object

In [30]:
cancer_dataframe['Class'] = ['CANCER']*len(cancer_dataframe)

In [31]:
cancer_dataframe.to_csv("10k-cancer-pubmed-metadata.csv", index=False)

### Ebola

In [32]:
cancer_dataframe['RN']

0                                      NONE
1                                      NONE
2                                      NONE
3                 0 (Antineoplastic Agents)
4                                      NONE
                       ...                 
9995                                   NONE
9996                                   NONE
9997                                   NONE
9998    0 (Chromates);0R0008Q3JB (Chromium)
9999              0 (Antineoplastic Agents)
Name: RN, Length: 10000, dtype: object

In [33]:
len(ebola_data)

10000

In [34]:
ebola_data[0]

{'AB': "For almost 50 years, ebolaviruses and related filoviruses have been repeatedly reemerging across the vast equatorial belt of the African continent to cause epidemics of highly fatal hemorrhagic fever. The 2013-2015 West African epidemic, by far the most geographically extensive, most fatal, and longest lasting epidemic in Ebola's history, presented an enormous international public health challenge, but it also provided insights into Ebola's pathogenesis and natural history, clinical expression, treatment, prevention, and control. Growing understanding of ebolavirus pathogenetic mechanisms and important new clinical observations of the disease course provide fresh clues about prevention and treatment approaches. Although viral cytopathology and immune-mediated cell damage in ebolavirus disease often result in severe compromise of multiple organs, tissue repair and organ function recovery can be expected if patients receive supportive care with fluids and electrolytes; maintenanc

In [35]:
required_keys = ['PMID', 'TI', 'AB', 'AD', 'FAU', 'DP', 'TA', 'JT', 'LA', 
                 'MH', 'OAB', 'OT', 'PL', 'PST', 'PT', 'RN', 'SI']

In [36]:
ebola_data_for_df = list()

for single_data in tqdm(ebola_data):
  temp = list()
  for key in required_keys:
    value = single_data.get(key, "NONE")
    if isinstance(value, list):
      value = ";".join(set(value))
      temp.append(value)
    else:
      temp.append(value)

  ebola_data_for_df.append(temp)

100%|██████████| 10000/10000 [00:00<00:00, 42448.77it/s]


In [37]:
ebola_data_for_df[0]

['27959626',
 'The Pathogenesis of Ebola Virus Disease.',
 "For almost 50 years, ebolaviruses and related filoviruses have been repeatedly reemerging across the vast equatorial belt of the African continent to cause epidemics of highly fatal hemorrhagic fever. The 2013-2015 West African epidemic, by far the most geographically extensive, most fatal, and longest lasting epidemic in Ebola's history, presented an enormous international public health challenge, but it also provided insights into Ebola's pathogenesis and natural history, clinical expression, treatment, prevention, and control. Growing understanding of ebolavirus pathogenetic mechanisms and important new clinical observations of the disease course provide fresh clues about prevention and treatment approaches. Although viral cytopathology and immune-mediated cell damage in ebolavirus disease often result in severe compromise of multiple organs, tissue repair and organ function recovery can be expected if patients receive supp

In [38]:
ebola_dataframe = pd.DataFrame(ebola_data_for_df, columns=required_keys)

In [39]:
ebola_dataframe['RN']

0                                                    NONE
1                                                    NONE
2           0 (Ebola Vaccines);0 (Membrane Glycoproteins)
3                 0 (Ebola Vaccines);0 (Antiviral Agents)
4       OF5P57N2ZX (Alanine);3QKI37EEHE (remdesivir);0...
                              ...                        
9995                                                 NONE
9996                                                 NONE
9997           0 (Antiviral Agents);0 (Interferon Type I)
9998                                                 NONE
9999                                                 NONE
Name: RN, Length: 10000, dtype: object

In [40]:
ebola_dataframe['Class'] = ['EBOLA']*len(ebola_dataframe)

In [41]:
ebola_dataframe.head(3)

Unnamed: 0,PMID,TI,AB,AD,FAU,DP,TA,JT,LA,MH,OAB,OT,PL,PST,PT,RN,SI,Class
0,27959626,The Pathogenesis of Ebola Virus Disease.,"For almost 50 years, ebolaviruses and related ...","Office of the Director, National Institute of ...","Morens, David M;Johnson, Karl M;Baseler, Laura...",2017 Jan 24,Annu Rev Pathol,Annual review of pathology,eng,Disease Outbreaks/*prevention & control;Ebolav...,NONE,filovirus;epidemic;viral hemorrhagic fever;Ebo...,United States,ppublish,Review;Journal Article,NONE,NONE,EBOLA
1,32080199,Ebola virus disease.,Ebola virus disease (EVD) is a severe and freq...,"Global Health Security Department, Infectious ...","Kraft, Colleen S;Kuhn, Jens H;Vega, Marc-Antoi...",2020 Feb 20,Nat Rev Dis Primers,Nature reviews. Disease primers,eng,"Africa, Western/epidemiology;Hemorrhagic Fever...",NONE,NONE,England,epublish,"Research Support, U.S. Gov't, Non-P.H.S.;Journ...",NONE,NONE,EBOLA
2,26923959,Ebola vaccine and treatment.,Filoviruses (Ebola and Marburg viruses) cause ...,Hokkaido University Research Center for Zoonos...,"Takada, Ayato",2015,Uirusu,Uirusu,jpn,Membrane Glycoproteins/chemistry/physiology;*D...,NONE,NONE,Japan,ppublish,Review;Journal Article,0 (Ebola Vaccines);0 (Membrane Glycoproteins),NONE,EBOLA


In [42]:
ebola_dataframe.to_csv("10k-ebola-pubmed-metadata.csv", index=False)

### Combing dataframes

In [43]:
frames = [cancer_dataframe, ebola_dataframe, covid_dataframe]

#concatenate dataframes
main_dataframe = pd.concat(frames)

# reset index
main_dataframe.reset_index(drop=True, inplace=True)

In [44]:
main_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 18 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   PMID    30000 non-null  object
 1   TI      30000 non-null  object
 2   AB      30000 non-null  object
 3   AD      30000 non-null  object
 4   FAU     30000 non-null  object
 5   DP      30000 non-null  object
 6   TA      30000 non-null  object
 7   JT      30000 non-null  object
 8   LA      30000 non-null  object
 9   MH      30000 non-null  object
 10  OAB     30000 non-null  object
 11  OT      30000 non-null  object
 12  PL      30000 non-null  object
 13  PST     30000 non-null  object
 14  PT      30000 non-null  object
 15  RN      30000 non-null  object
 16  SI      30000 non-null  object
 17  Class   30000 non-null  object
dtypes: object(18)
memory usage: 4.1+ MB


In [45]:
main_dataframe.head()

Unnamed: 0,PMID,TI,AB,AD,FAU,DP,TA,JT,LA,MH,OAB,OT,PL,PST,PT,RN,SI,Class
0,31761807,What Is Cancer?,This essay focuses on themes in Explaining Can...,NONE,"Hausman, Daniel M",2019,Perspect Biol Med,Perspectives in biology and medicine,eng,"Philosophy, Medical;Humans;Neoplasms/*etiology",NONE,NONE,United States,ppublish,Journal Article,NONE,NONE,CANCER
1,28244479,Cancer and cure: A critical analysis.,Cancer is one of the most dreaded diseases of ...,"Department of Medical Oncology, Dr. B. Borooah...","Saikia, B J;Roy, P S",2016 Jul-Sep,Indian J Cancer,Indian journal of cancer,eng,Neoplasms/epidemiology/mortality/*pathology/*t...,NONE,NONE,India,ppublish,Review;Journal Article,NONE,NONE,CANCER
2,27741350,Measuring cancer evolution from the genome.,The temporal dynamics of cancer evolution rema...,Cancer Evolutionary Genomics and Modelling Lab...,"Graham, Trevor A;Sottoriva, Andrea",2017 Jan,J Pathol,The Journal of pathology,eng,Neoplasms/*diagnosis/*genetics/pathology;*Gene...,NONE,*selection;*saltation;*punctuated equilibrium;...,England,ppublish,Review;Journal Article,NONE,NONE,CANCER
3,29949179,Tumor microenvironment: recent advances in var...,This is a review regarding different types of ...,"Department of General Surgery, Chun'an First P...","Wang, J-J;Lei, K-F;Han, F",2018 Jun,Eur Rev Med Pharmacol Sci,European review for medical and pharmacologica...,eng,Antineoplastic Agents/therapeutic use;Radiatio...,NONE,NONE,Italy,ppublish,"Review;Journal Article;Research Support, Non-U...",0 (Antineoplastic Agents),NONE,CANCER
4,26667886,Global Cancer Incidence and Mortality Rates an...,There are limited published data on recent can...,American Cancer Society Surveillance and Healt...,"Jemal, Ahmedin;Torre, Lindsey A;Siegel, Rebecc...",2016 Jan,Cancer Epidemiol Biomarkers Prev,"Cancer epidemiology, biomarkers & prevention :...",eng,Prognosis;Global Health/*statistics & numerica...,NONE,NONE,United States,ppublish,Review;Journal Article,NONE,NONE,CANCER


In [46]:
main_dataframe.to_csv("10k-combine-pubmed-metadata.csv", index=False)

#### Saving copy to Google Drive

In [47]:
! cp -r /content/10k-combine-pubmed-metadata.csv /content/drive/MyDrive/05-Data/enzyme-network/

! cp -r /content/10k-ebola-pubmed-metadata.csv /content/drive/MyDrive/05-Data/enzyme-network/

! cp -r /content/10k-covid-pubmed-metadata.csv /content/drive/MyDrive/05-Data/enzyme-network/

! cp -r /content/10k-cancer-pubmed-metadata.csv /content/drive/MyDrive/05-Data/enzyme-network/

In [48]:
!ls /content/drive/MyDrive/05-Data/enzyme-network/

10k-cancer-pubmed-metadata.csv	 10k-covid-pubmed-metadata.csv
10k-combine-pubmed-metadata.csv  10k-ebola-pubmed-metadata.csv


In [49]:
endtime = time.time()

In [50]:
print(f"It took around ~{round(endtime-starttime)} Seconds to run whole notebook.")

It took around ~263 Seconds to run whole notebook.
