In [15]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from pprint import pprint
from tqdm import tqdm
import time
from collections import Counter

In [2]:
path = "/content/drive/MyDrive/05-Data/enzyme-network/10k-combine-pubmed-metadata.csv"

data = pd.read_csv(path)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 18 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   PMID    30000 non-null  int64 
 1   TI      30000 non-null  object
 2   AB      30000 non-null  object
 3   AD      30000 non-null  object
 4   FAU     30000 non-null  object
 5   DP      30000 non-null  object
 6   TA      30000 non-null  object
 7   JT      30000 non-null  object
 8   LA      30000 non-null  object
 9   MH      30000 non-null  object
 10  OAB     30000 non-null  object
 11  OT      29999 non-null  object
 12  PL      30000 non-null  object
 13  PST     30000 non-null  object
 14  PT      30000 non-null  object
 15  RN      30000 non-null  object
 16  SI      30000 non-null  object
 17  Class   30000 non-null  object
dtypes: int64(1), object(17)
memory usage: 4.1+ MB


In [6]:
rn_present_data = data[data['RN']!='NONE']

In [8]:
rn_present_data['RN']

3                                0 (Antineoplastic Agents)
6            0 (Biomarkers, Tumor);0 (Heat-Shock Proteins)
8        0 (Cancer Vaccines);1406-18-4 (Vitamin E);H624...
16       0 (Biomarkers, Tumor);0 (MicroRNAs);0 (MIRN29a...
17                                           0 (Cytokines)
                               ...                        
29993             0 (RNA, Messenger);0 (COVID-19 Vaccines)
29994                   0 (Vaccines);0 (COVID-19 Vaccines)
29995                                0 (COVID-19 Vaccines)
29998                                0 (COVID-19 Vaccines)
29999           0 (Antiviral Agents);0 (COVID-19 Vaccines)
Name: RN, Length: 15405, dtype: object

In [9]:
rn_present_data['Class'].value_counts()

COVID     7531
CANCER    3963
EBOLA     3911
Name: Class, dtype: int64

In [11]:
rn_cancer = rn_present_data[rn_present_data['Class']=='CANCER']
rn_ebola = rn_present_data[rn_present_data['Class']=='EBOLA']
rn_covid = rn_present_data[rn_present_data['Class']=='COVID']

In [12]:
covid_enzymes = list()

for ec_number_molecules in (rn_covid.RN):
  if isinstance(ec_number_molecules, str):
    for molecule in ec_number_molecules.split(';'):
      if molecule.startswith('EC'):
        covid_enzymes.append(molecule)

In [13]:
ebola_enzymes = list()

for ec_number_molecules in (rn_ebola.RN):
  if isinstance(ec_number_molecules, str):
    for molecule in ec_number_molecules.split(';'):
      if molecule.startswith('EC'):
        ebola_enzymes.append(molecule)

In [14]:
cancer_enzymes = list()

for ec_number_molecules in (rn_cancer.RN):
  if isinstance(ec_number_molecules, str):
    for molecule in ec_number_molecules.split(';'):
      if molecule.startswith('EC'):
        cancer_enzymes.append(molecule)

In [16]:
cancer_enzymes_count = Counter(cancer_enzymes, )
ebola_enzymes_count = Counter(ebola_enzymes, )
covid_enzymes_count = Counter(covid_enzymes, )

In [18]:
covid_enzymes_count.most_common()[:10]

[('EC 3.4.17.23 (Angiotensin-Converting Enzyme 2)', 106),
 ('EC 3.4.17.23 (ACE2 protein, human)', 78),
 ('EC 2.7.7.48 (Coronavirus RNA-Dependent RNA Polymerase)', 15),
 ('EC 2.7.7.48 (RNA-Dependent RNA Polymerase)', 15),
 ('EC 2.7.7.48 (NSP12 protein, SARS-CoV-2)', 12),
 ('EC 3.4.21.- (Serine Endopeptidases)', 10),
 ('EC 3.4.21.- (TMPRSS2 protein, human)', 9),
 ('EC 3.4.15.1 (Peptidyl-Dipeptidase A)', 9),
 ('EC 3.1.26.5 (Ribonuclease P)', 8),
 ('EC 3.4.22.28 (Coronavirus 3C Proteases)', 6)]

In [19]:
ebola_enzymes_count.most_common()[:10]

[('EC 2.7.7.48 (RNA-Dependent RNA Polymerase)', 30),
 ('EC 2.3.2.27 (Ubiquitin-Protein Ligases)', 20),
 ('EC 3.4.22.15 (Cathepsin L)', 18),
 ('EC 3.4.22.1 (Cathepsin B)', 17),
 ('EC 3.4.- (Cathepsins)', 16),
 ('EC 2.7.7.6 (DNA-Directed RNA Polymerases)', 16),
 ('EC 1.13.12.- (Luciferases)', 14),
 ('EC 3.6.4.13 (DEAD Box Protein 58)', 12),
 ('EC 3.6.4.13 (DEAD-box RNA Helicases)', 12),
 ('EC 3.6.1.- (DDX58 protein, human)', 11)]

In [20]:
cancer_enzymes_count.most_common()[:10]

[('EC 2.7.7.49 (Telomerase)', 39),
 ('EC 2.7.10.1 (ErbB Receptors)', 24),
 ('EC 2.7.10.1 (Receptor, ErbB-2)', 21),
 ('EC 2.7.11.1 (Protein Serine-Threonine Kinases)', 20),
 ('EC 1.14.99.1 (Cyclooxygenase 2)', 18),
 ('EC 3.4.21.77 (Prostate-Specific Antigen)', 17),
 ('EC 1.14.99.1 (Prostaglandin-Endoperoxide Synthases)', 15),
 ('EC 3.1.3.2 (Phosphoric Monoester Hydrolases)', 14),
 ('EC 1.14.99.1 (PTGS2 protein, human)', 14),
 ('EC 3.6.5.2 (Proto-Oncogene Proteins p21(ras))', 13)]

In [26]:
cancer_enzymes[1].replace('(', '').replace(')', '')

'EC 4.2.1.2 Fumarate Hydratase'

In [28]:
cancer_enzymes_set = set(cancer_enzymes)
ebola_enzymes_set = set(ebola_enzymes)
covid_enzymes_set = set(covid_enzymes)

In [38]:
covid_ebola_common_enzymes = covid_enzymes_set & ebola_enzymes_set
len(covid_ebola_common_enzymes)

35

In [40]:
list(covid_ebola_common_enzymes)[:10]

['EC 2.7.7.48 (NSP12 protein, SARS-CoV-2)',
 'EC 3.4.22.- (Cysteine Endopeptidases)',
 'EC 3.5.4.37 (ADAR protein, human)',
 'EC 2.7.3.2 (Creatine Kinase)',
 'EC 2.6.1.1 (Aspartate Aminotransferases)',
 'EC 3.4.21.64 (Endopeptidase K)',
 'EC 3.4.17.23 (Angiotensin-Converting Enzyme 2)',
 'EC 3.1.- (CRISPR-Associated Protein 9)',
 'EC 2.7.7.49 (RNA-Directed DNA Polymerase)',
 'EC 3.4.- (Serine Proteases)']

In [36]:
len(cancer_enzymes_set & covid_enzymes_set)

32

In [32]:
len(cancer_enzymes_set & ebola_enzymes_set)

95

In [33]:
len(cancer_enzymes_set)

527

In [34]:
len(ebola_enzymes_set)

270