In [1]:
import pandas as pd 
import numpy as np

In [2]:
from difflib import SequenceMatcher

def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

# PubChem

In [2]:
important_columns = {
    'cid',
    'cmpdname',
    'cmpdsynonym',
    'inchikey',
    'meshheadings'
}

In [3]:
pubchem_df = pd.read_csv(
    "data/PubChem.csv",      # relative python path to subdirectory
    sep=',',
    usecols=important_columns,
    dtype=str
)

print(pubchem_df.shape)
pubchem_df.head()

(130173, 5)


Unnamed: 0,cid,cmpdname,cmpdsynonym,inchikey,meshheadings
0,1,Acetylcarnitine,Acetyl-DL-carnitine|acetylcarnitine|DL-O-Acety...,RDHQFKQIGNGIED-UHFFFAOYSA-N,Acetylcarnitine
1,4,1-Aminopropan-2-ol,1-Aminopropan-2-ol|1-AMINO-2-PROPANOL|78-96-6|...,HXKKHQJGJAFBHI-UHFFFAOYSA-N,
2,6,"1-Chloro-2,4-dinitrobenzene","1-chloro-2,4-dinitrobenzene|2,4-Dinitrochlorob...",VYZAHLCBVHPDDF-UHFFFAOYSA-N,Dinitrochlorobenzene
3,7,9-Ethyladenine,9-Ethyladenine|2715-68-6|9-ethyl-9h-purin-6-am...,MUIPLRMGAXZWSQ-UHFFFAOYSA-N,
4,8,"2,3-Dihydroxy-3-methylpentanoic acid","2,3-Dihydroxy-3-methylpentanoic acid|2,3-dihyd...",PDGXJDXVGMHUIR-UHFFFAOYSA-N,


In [4]:
pubchem_df.fillna('', inplace=True)

In [5]:
pubchem_df.sample(5)

Unnamed: 0,cid,cmpdname,cmpdsynonym,inchikey,meshheadings
93537,23681230,Omeprazole sodium hydrate,OMEPRAZOLE SODIUM|Omeprazole sodium hydrate|95...,RRFCKCAQHRITRG-UHFFFAOYSA-N,Omeprazole
71642,9808654,CID 9808654,u-83836e|U83836E|137018-55-4|PNU-83836E|(2R)-2...,KJGXSMZHYBXPIS-ZHXNKHNVSA-N,
115267,101950409,(2S)-2-[[(2S)-2-[[(2S)-2-[[(2S)-2-[[(2S)-2-[[(...,T-241,QTWWWOBSQULXHE-MTVPJWGZSA-N,
19601,95955,Dibenzyl diselenide,"Dibenzyl diselenide|1482-82-2|1,2-Dibenzyldise...",HYAVEDMFTNAZQE-UHFFFAOYSA-N,
125707,136900208,Prianosin A,Prianosin A,XJZVECLAJOTPHE-YRUZYCQGSA-N,


# CHEBI

In [6]:
important_columns_chebi = {
    'Class ID',
    'Preferred Label',
    'Synonyms',
    'Obsolete',
    'Parents',
    'database_cross_reference',
    'http://purl.obolibrary.org/obo/chebi/inchikey',
    'http://www.w3.org/2004/02/skos/core#notation'
}

In [7]:
chebi_df = pd.read_csv(
    "data/CHEBI.csv",      # relative python path to subdirectory
    sep=',',
    usecols=important_columns_chebi,
    dtype=str
)

print(chebi_df.shape)
chebi_df.head()

(156098, 8)


Unnamed: 0,Class ID,Preferred Label,Synonyms,Obsolete,Parents,database_cross_reference,http://purl.obolibrary.org/obo/chebi/inchikey,http://www.w3.org/2004/02/skos/core#notation
0,http://purl.obolibrary.org/obo/CHEBI_101465,"(2S,3S,4R)-4-(hydroxymethyl)-1-(2-methoxy-1-ox...",,False,http://purl.obolibrary.org/obo/CHEBI_38193,LINCS:LSM-12828,NONDGOMIDWLUNU-AOIWGVFYSA-N,CHEBI:101465
1,http://purl.obolibrary.org/obo/CHEBI_159237,Leu-His-Glu,(2S)-2-[[(2S)-2-[[(2S)-2-amino-4-methylpentano...,False,http://purl.obolibrary.org/obo/CHEBI_25676,Chemspider:58808728,KXODZBLFVFSLAI-AVGNSLFASA-N,CHEBI:159237
2,http://purl.obolibrary.org/obo/CHEBI_101448,"2-fluoro-N-[(4S,7R,8S)-8-methoxy-4,7,10-trimet...",,False,http://purl.obolibrary.org/obo/CHEBI_52898|htt...,LINCS:LSM-12811,ULFQVHHILZHREN-ZMPRRUGASA-N,CHEBI:101448
3,http://purl.obolibrary.org/obo/CHEBI_85476,O-hydroxyvaleroyl-L-carnitine,O-hydroxyvaleroyl-L-carnitines|O-hydroxyvalero...,False,http://purl.obolibrary.org/obo/CHEBI_133449,,,CHEBI:85476
4,http://purl.obolibrary.org/obo/CHEBI_87468,benzo scarlet 4BNS,trisodium 4-hydroxy-7-[({5-hydroxy-6-[(3-metho...,False,http://purl.obolibrary.org/obo/CHEBI_38700,,ARFFDLGGWDRGGT-UHFFFAOYSA-K,CHEBI:87468


In [8]:
chebi_df = chebi_df[chebi_df['Obsolete']=='false']
chebi_df.drop(['Obsolete','Class ID'],axis = 1, inplace=True)
print(chebi_df.shape)
chebi_df.head()

(137592, 6)


Unnamed: 0,Preferred Label,Synonyms,Parents,database_cross_reference,http://purl.obolibrary.org/obo/chebi/inchikey,http://www.w3.org/2004/02/skos/core#notation
0,"(2S,3S,4R)-4-(hydroxymethyl)-1-(2-methoxy-1-ox...",,http://purl.obolibrary.org/obo/CHEBI_38193,LINCS:LSM-12828,NONDGOMIDWLUNU-AOIWGVFYSA-N,CHEBI:101465
1,Leu-His-Glu,(2S)-2-[[(2S)-2-[[(2S)-2-amino-4-methylpentano...,http://purl.obolibrary.org/obo/CHEBI_25676,Chemspider:58808728,KXODZBLFVFSLAI-AVGNSLFASA-N,CHEBI:159237
2,"2-fluoro-N-[(4S,7R,8S)-8-methoxy-4,7,10-trimet...",,http://purl.obolibrary.org/obo/CHEBI_52898|htt...,LINCS:LSM-12811,ULFQVHHILZHREN-ZMPRRUGASA-N,CHEBI:101448
3,O-hydroxyvaleroyl-L-carnitine,O-hydroxyvaleroyl-L-carnitines|O-hydroxyvalero...,http://purl.obolibrary.org/obo/CHEBI_133449,,,CHEBI:85476
4,benzo scarlet 4BNS,trisodium 4-hydroxy-7-[({5-hydroxy-6-[(3-metho...,http://purl.obolibrary.org/obo/CHEBI_38700,,ARFFDLGGWDRGGT-UHFFFAOYSA-K,CHEBI:87468


In [9]:
renamed_columns_chebi = {
    'Preferred Label':'ChEBI_label',
    'Synonyms': 'ChEBI_synonyms',
    'Parents':'ChEBI_parents',
    'database_cross_reference':'ChEBI_cross_reference',
    'http://purl.obolibrary.org/obo/chebi/inchikey':'inchikey',
    'http://www.w3.org/2004/02/skos/core#notation':'ChEBI_id'
}

In [10]:
chebi_df = chebi_df.rename(renamed_columns_chebi,axis=1)

# MESH

In [97]:
important_columns_mesh = [
    'Class ID',
    'Preferred Label',
    'Synonyms',
    'Obsolete',
    'Semantic Types'
]

In [98]:
mesh_df = pd.read_csv(
    "data/MESH.csv",      # relative python path to subdirectory
    sep=',',
    usecols=important_columns_mesh,
    dtype=str
)
mesh_df

Unnamed: 0,Class ID,Preferred Label,Synonyms,Obsolete,Semantic Types
0,http://purl.bioontology.org/ontology/MESH/C000...,technetium 99m hydroxyethylene-diphosphonate,99mTc-HDP|99mTc-hydroxyethylene-diphosphonate,false,http://purl.bioontology.org/ontology/STY/T130|...
1,http://purl.bioontology.org/ontology/MESH/C585345,"Tardbp protein, zebrafish","Tardbpl-FL protein, zebrafish|Tardbpl protein,...",false,http://purl.bioontology.org/ontology/STY/T123|...
2,http://purl.bioontology.org/ontology/MESH/C000...,Autographa californica multiple nuclear polyhe...,Trichoplusia ni multiple nucleopolyhedrovirus|...,false,http://purl.bioontology.org/ontology/STY/T005
3,http://purl.bioontology.org/ontology/MESH/C000...,Leeuwenhoekiella blandensis,,false,http://purl.bioontology.org/ontology/STY/T007
4,http://purl.bioontology.org/ontology/MESH/C049340,N-demethylmenogaril,,false,http://purl.bioontology.org/ontology/STY/T109
...,...,...,...,...,...
87269,http://purl.bioontology.org/ontology/MESH/C088692,2-(4-(biscarboxymethyl)amino-3-(carboxymethoxy...,Mag-indo-1,false,http://purl.bioontology.org/ontology/STY/T109|...
87270,http://purl.bioontology.org/ontology/MESH/C481277,"Yrr1 protein, S cerevisiae","Yor162c protein, S cerevisiae",false,http://purl.bioontology.org/ontology/STY/T116
87271,http://purl.bioontology.org/ontology/MESH/C111764,3-epi-actinobolin,3-epiactinobolin,false,http://purl.bioontology.org/ontology/STY/T109
87272,http://purl.bioontology.org/ontology/MESH/C000...,"2-tridecanyl-1,4-naphthoquinone",,false,http://purl.bioontology.org/ontology/STY/T109


In [99]:
mesh_df['Class ID'] = mesh_df['Class ID'].apply(lambda x: x.split('/')[-1])

In [100]:
print(mesh_df.shape)

(87274, 5)


In [101]:
mesh_df = mesh_df[mesh_df['Obsolete']=='false']
mesh_df = mesh_df.drop('Obsolete',axis=1)
print(mesh_df.shape)

(87273, 4)


In [102]:
mesh_df[['ST1','ST2']] = mesh_df['Semantic Types'].str.split('|', 1, expand=True)
mesh_df[['ST2','ST3']] = mesh_df['ST2'].str.split('|', 1, expand=True)
mesh_df

Unnamed: 0,Class ID,Preferred Label,Synonyms,Semantic Types,ST1,ST2,ST3
0,C000624633,technetium 99m hydroxyethylene-diphosphonate,99mTc-HDP|99mTc-hydroxyethylene-diphosphonate,http://purl.bioontology.org/ontology/STY/T130|...,http://purl.bioontology.org/ontology/STY/T130,http://purl.bioontology.org/ontology/STY/T197,
1,C585345,"Tardbp protein, zebrafish","Tardbpl-FL protein, zebrafish|Tardbpl protein,...",http://purl.bioontology.org/ontology/STY/T123|...,http://purl.bioontology.org/ontology/STY/T123,http://purl.bioontology.org/ontology/STY/T116,
2,C000623720,Autographa californica multiple nuclear polyhe...,Trichoplusia ni multiple nucleopolyhedrovirus|...,http://purl.bioontology.org/ontology/STY/T005,http://purl.bioontology.org/ontology/STY/T005,,
3,C000644313,Leeuwenhoekiella blandensis,,http://purl.bioontology.org/ontology/STY/T007,http://purl.bioontology.org/ontology/STY/T007,,
4,C049340,N-demethylmenogaril,,http://purl.bioontology.org/ontology/STY/T109,http://purl.bioontology.org/ontology/STY/T109,,
...,...,...,...,...,...,...,...
87268,C503576,"bis di-N,N'-1-deoxyfructosyl glutathione disul...","bis di-N,N'-1-deoxyfructosyl GSSG|F-GSSG-F",http://purl.bioontology.org/ontology/STY/T116|...,http://purl.bioontology.org/ontology/STY/T116,http://purl.bioontology.org/ontology/STY/T121,
87269,C088692,2-(4-(biscarboxymethyl)amino-3-(carboxymethoxy...,Mag-indo-1,http://purl.bioontology.org/ontology/STY/T109|...,http://purl.bioontology.org/ontology/STY/T109,http://purl.bioontology.org/ontology/STY/T121,
87270,C481277,"Yrr1 protein, S cerevisiae","Yor162c protein, S cerevisiae",http://purl.bioontology.org/ontology/STY/T116,http://purl.bioontology.org/ontology/STY/T116,,
87271,C111764,3-epi-actinobolin,3-epiactinobolin,http://purl.bioontology.org/ontology/STY/T109,http://purl.bioontology.org/ontology/STY/T109,,


In [103]:
mesh_df = mesh_df.drop('Semantic Types',axis=1)

In [104]:
mesh_df.sample(10)

Unnamed: 0,Class ID,Preferred Label,Synonyms,ST1,ST2,ST3
18962,C000615401,Nickel-49,49Ni radioisotope|Ni-49 radioisotope,http://purl.bioontology.org/ontology/STY/T196,,
58825,C504819,"Fmc1 protein, S cerevisiae",,http://purl.bioontology.org/ontology/STY/T123,http://purl.bioontology.org/ontology/STY/T116,
24581,C409038,nucleus-encoded phage-type RNA polymerase,NEP-1 enzyme|NEP-2 enzyme,http://purl.bioontology.org/ontology/STY/T126,http://purl.bioontology.org/ontology/STY/T116,
36420,C479695,"CED-2 protein, C elegans",,http://purl.bioontology.org/ontology/STY/T123,http://purl.bioontology.org/ontology/STY/T116,
1837,C006469,allyl propyl disulfide,,http://purl.bioontology.org/ontology/STY/T109,http://purl.bioontology.org/ontology/STY/T121,
72215,D008828,Microbiological Techniques,Microbiologic Techniques|Microbiologic Technic...,http://purl.bioontology.org/ontology/STY/T059,,
51429,C031315,"2,9-dibromobenzanthrone","7H-Benz(de)anthracen-7-one, 2,9-dibromo-",http://purl.bioontology.org/ontology/STY/T109,,
5188,C531259,C4a-hydroperoxyflavin,,http://purl.bioontology.org/ontology/STY/T109,,
49403,C520224,"Cfd1 protein, S cerevisiae","YIL003W protein, S cerevisiae",http://purl.bioontology.org/ontology/STY/T123,http://purl.bioontology.org/ontology/STY/T116,
21010,C476798,"Crk protein, mouse",v-crk sarcoma virus CT10 oncogene homolog (avi...,http://purl.bioontology.org/ontology/STY/T123,http://purl.bioontology.org/ontology/STY/T116,


## Semantic Types

In [63]:
semantic_type_df = pd.read_csv(
    "data/STY.csv",      # relative python path to subdirectory
    sep=',',
    dtype=str,
    usecols=['Class ID','Preferred Label']
)
semantic_type_df

Unnamed: 0,Class ID,Preferred Label
0,http://purl.bioontology.org/ontology/STY/T057,Occupational Activity
1,http://purl.bioontology.org/ontology/STY/T047,Disease or Syndrome
2,http://purl.bioontology.org/ontology/STY/T167,Substance
3,http://purl.bioontology.org/ontology/STY/T066,Machine Activity
4,http://purl.bioontology.org/ontology/STY/T184,Sign or Symptom
...,...,...
122,http://purl.bioontology.org/ontology/STY/T194,Archaeon
123,http://purl.bioontology.org/ontology/STY/T012,Bird
124,http://purl.bioontology.org/ontology/STY/T087,Amino Acid Sequence
125,http://purl.bioontology.org/ontology/STY/T122,Biomedical or Dental Material


In [107]:
semantic_type_df.rename(columns={'Class ID': 'Semantic_type_url','Preferred Label':'type'}, inplace=True)

In [108]:
mesh_df = pd.merge(mesh_df,semantic_type_df,left_on='ST1',right_on='Semantic_type_url',how='inner')

In [109]:
mesh_df = pd.merge(mesh_df,semantic_type_df,left_on='ST2',right_on='Semantic_type_url',how='outer')

In [110]:
mesh_df = pd.merge(mesh_df,semantic_type_df,left_on='ST3',right_on='Semantic_type_url',how='outer')

In [111]:
print(mesh_df.shape)

(87350, 12)


In [112]:
mesh_df.sample(10)

Unnamed: 0,Class ID,Preferred Label,Synonyms,ST1,ST2,ST3,Semantic_type_url_x,type_x,Semantic_type_url_y,type_y,Semantic_type_url,type
55119,C034427,HB 180,,http://purl.bioontology.org/ontology/STY/T109,http://purl.bioontology.org/ontology/STY/T121,,http://purl.bioontology.org/ontology/STY/T109,Organic Chemical,http://purl.bioontology.org/ontology/STY/T121,Pharmacologic Substance,,
18280,C570570,clauraila A,,http://purl.bioontology.org/ontology/STY/T109,,,http://purl.bioontology.org/ontology/STY/T109,Organic Chemical,,,,
42501,D002534,"Hypoxia, Brain",Hypoxic Encephalopathy|Brain Anoxia|Cerebral H...,http://purl.bioontology.org/ontology/STY/T046,,,http://purl.bioontology.org/ontology/STY/T046,Pathologic Function,,,,
29809,C110304,SR 144190,"3-(1-(2-(4-benzoyl-2-(3,4-difluorophenyl)morph...",http://purl.bioontology.org/ontology/STY/T109,,,http://purl.bioontology.org/ontology/STY/T109,Organic Chemical,,,,
54878,C064076,compound 1929,,http://purl.bioontology.org/ontology/STY/T109,http://purl.bioontology.org/ontology/STY/T121,,http://purl.bioontology.org/ontology/STY/T109,Organic Chemical,http://purl.bioontology.org/ontology/STY/T121,Pharmacologic Substance,,
68467,C518612,"RSRC2 protein, human","arginine-serine-rich coiled-coil 2 protein, human",http://purl.bioontology.org/ontology/STY/T123,http://purl.bioontology.org/ontology/STY/T116,,http://purl.bioontology.org/ontology/STY/T123,Biologically Active Substance,http://purl.bioontology.org/ontology/STY/T116,"Amino Acid, Peptide, or Protein",,
66510,C496303,hemoglobin Tripoli,Hb Tripoli,http://purl.bioontology.org/ontology/STY/T123,http://purl.bioontology.org/ontology/STY/T116,,http://purl.bioontology.org/ontology/STY/T123,Biologically Active Substance,http://purl.bioontology.org/ontology/STY/T116,"Amino Acid, Peptide, or Protein",,
85410,C524644,Yi-Gan San,yokukansan|YiGan San|TJ-54|yoku-kan-san,http://purl.bioontology.org/ontology/STY/T123,http://purl.bioontology.org/ontology/STY/T109,http://purl.bioontology.org/ontology/STY/T121,http://purl.bioontology.org/ontology/STY/T123,Biologically Active Substance,http://purl.bioontology.org/ontology/STY/T109,Organic Chemical,http://purl.bioontology.org/ontology/STY/T121,Pharmacologic Substance
54886,C045046,3-(4-fluorobenzoyl)tropane,3-FBOT,http://purl.bioontology.org/ontology/STY/T109,http://purl.bioontology.org/ontology/STY/T121,,http://purl.bioontology.org/ontology/STY/T109,Organic Chemical,http://purl.bioontology.org/ontology/STY/T121,Pharmacologic Substance,,
65636,C556345,"CIB22 protein, Arabidopsis","AtCIB22 protein, Arabidopsis",http://purl.bioontology.org/ontology/STY/T123,http://purl.bioontology.org/ontology/STY/T116,,http://purl.bioontology.org/ontology/STY/T123,Biologically Active Substance,http://purl.bioontology.org/ontology/STY/T116,"Amino Acid, Peptide, or Protein",,


In [113]:
mesh_df.columns

Index(['Class ID', 'Preferred Label', 'Synonyms', 'ST1', 'ST2', 'ST3',
       'Semantic_type_url_x', 'type_x', 'Semantic_type_url_y', 'type_y',
       'Semantic_type_url', 'type'],
      dtype='object')

In [114]:
mesh_df= mesh_df[['Class ID','Preferred Label','Synonyms','type_x','type_y','type']]

In [122]:
mesh_df.sample(10)

Unnamed: 0,Class ID,Preferred Label,Synonyms,type_x,type_y,type
31864,C050934,neurofilament protein 150,neurofilament protein NF 150|NF-150,"Amino Acid, Peptide, or Protein",,
17925,C534405,cespihypotin T,,Organic Chemical,,
49460,C014507,"N,N-dimethyl-N-(3-chlorophenyl)guanidine","N,N-dimethyl-N-3-chlorophenylguanidine|N,N-dim...",Organic Chemical,Pharmacologic Substance,
78617,C534960,"UNC-108 protein, C elegans","Unc108 protein, C elegans|RAB-2 protein, C ele...",Enzyme,"Amino Acid, Peptide, or Protein",
70323,C572196,"TTC25 protein, Xenopus","TTC25.S protein, Xenopus|TTC25.L protein, Xenopus",Biologically Active Substance,"Amino Acid, Peptide, or Protein",
41624,D000494,Allosteric Regulation,"Regulations, Allosteric|Regulation, Allosteric...",Molecular Function,,
73868,C000597542,(Dpr3(octanoyl)Lys19(Cy5))ghrelin(1-19),,Biologically Active Substance,"Amino Acid, Peptide, or Protein",
17593,C022855,(tetraphenylthio)oxymolybdate,tetraphenylarsonium tetrakis(benzenethiolato)o...,Organic Chemical,,
69796,C082717,"cppC protein, Neisseria gonorrhoeae","cryptic plasmid protein C, Neisseria gonorrhoeae",Biologically Active Substance,"Amino Acid, Peptide, or Protein",
67874,C099483,"CB protein, N15 bacteriophage",,Biologically Active Substance,"Amino Acid, Peptide, or Protein",


In [117]:
print(mesh_df.shape)

(87350, 6)


In [123]:
mesh_df.rename(columns={'type': 'type_z'}, inplace=True)
mesh_df

Unnamed: 0,Class ID,Preferred Label,Synonyms,type_x,type_y,type_z
0,C000624633,technetium 99m hydroxyethylene-diphosphonate,99mTc-HDP|99mTc-hydroxyethylene-diphosphonate,"Indicator, Reagent, or Diagnostic Aid",Inorganic Chemical,
1,C020354,hexacyanoferrate II,hexacyanoiron(II)|ferrocyanate|ferrocyanide io...,"Indicator, Reagent, or Diagnostic Aid",Inorganic Chemical,
2,C044522,neptunium nitrate,"neptunium nitrate, 237Np-labeled cpd Np(5+)|ne...","Indicator, Reagent, or Diagnostic Aid",Inorganic Chemical,
3,D010772,Phosphotungstic Acid,"Tungsten hydroxide oxide phosphate|Acid, Phosp...","Indicator, Reagent, or Diagnostic Aid",Inorganic Chemical,
4,C045076,Fenton's reagent,Fenton reagent,"Indicator, Reagent, or Diagnostic Aid",Inorganic Chemical,
...,...,...,...,...,...,...
87345,,,,,,Organism Attribute
87346,,,,,,Professional or Occupational Group
87347,,,,,,Group Attribute
87348,,,,,,Archaeon


In [126]:
mesh_df.isnull().sum(axis = 0)

Class ID               0
Preferred Label        0
Synonyms           28522
type_x                 0
type_y             41464
type_z             85368
dtype: int64

In [125]:
mesh_df = mesh_df[mesh_df['Class ID'].notna()]

In [127]:
mesh_df

Unnamed: 0,Class ID,Preferred Label,Synonyms,type_x,type_y,type_z
0,C000624633,technetium 99m hydroxyethylene-diphosphonate,99mTc-HDP|99mTc-hydroxyethylene-diphosphonate,"Indicator, Reagent, or Diagnostic Aid",Inorganic Chemical,
1,C020354,hexacyanoferrate II,hexacyanoiron(II)|ferrocyanate|ferrocyanide io...,"Indicator, Reagent, or Diagnostic Aid",Inorganic Chemical,
2,C044522,neptunium nitrate,"neptunium nitrate, 237Np-labeled cpd Np(5+)|ne...","Indicator, Reagent, or Diagnostic Aid",Inorganic Chemical,
3,D010772,Phosphotungstic Acid,"Tungsten hydroxide oxide phosphate|Acid, Phosp...","Indicator, Reagent, or Diagnostic Aid",Inorganic Chemical,
4,C045076,Fenton's reagent,Fenton reagent,"Indicator, Reagent, or Diagnostic Aid",Inorganic Chemical,
...,...,...,...,...,...,...
87265,D008393,Marriage,"Union, Consensual|Age, Marriage|Ages, Marriage...",Qualitative Concept,Temporal Concept,
87266,D004868,Equipment Failure,"Misuses, Equipment|Failures, Equipment|Equipme...",Machine Activity,Phenomenon or Process,Activity
87267,D005298,Fertility,Fertility Preferences|Fertility Preference|Mar...,Organism Function,Entity,
87268,D016247,Information Storage and Retrieval,"Data Source|Retrieval, Data|Machine-Readable D...",Occupational Activity,Machine Activity,


In [149]:
def set_specific_type(type_x,type_y,type_z):
    if not pd.isnull(type_z):
        return type_z
    if not pd.isnull(type_y):
        return type_y
    return type_x

In [150]:
mesh_df['type'] = mesh_df.apply(lambda x: set_specific_type(x.type_x,x.type_y,x.type_z),axis=1)

In [152]:
mesh_df.sample(10)

Unnamed: 0,Class ID,Preferred Label,Synonyms,type_x,type_y,type_z,type
55888,C575821,"5,7-dihydroxy-4'-methoxy-8-prenylflavanone",,Organic Chemical,Pharmacologic Substance,,Pharmacologic Substance
79631,D010625,Phenylethanolamine N-Methyltransferase,Phenethanolamine N Methyltransferase|Noradrena...,Enzyme,"Amino Acid, Peptide, or Protein",,"Amino Acid, Peptide, or Protein"
71274,C486597,"Nub1 protein, mouse",negative regulator of ubiquitin-like proteins ...,Biologically Active Substance,"Amino Acid, Peptide, or Protein",,"Amino Acid, Peptide, or Protein"
7370,C000639493,Sphingorhabdus rigui,Sphingopyxis rigui,Bacterium,,,Bacterium
1183,C467528,rhizophorin E,"3beta,6alpha-diacetoxy-8(14),15-isopimaradien-...",Biologically Active Substance,Organic Chemical,,Organic Chemical
54993,C012148,A28695A,"septamycin, 6,16-didemethyl-27-methyl-",Organic Chemical,Pharmacologic Substance,,Pharmacologic Substance
62628,C514504,"Dynll2 protein, mouse","dynein light chain LC8-type 2 protein, mouse",Biologically Active Substance,"Amino Acid, Peptide, or Protein",,"Amino Acid, Peptide, or Protein"
69781,C077032,"FcrV protein, Streptococcus",,Biologically Active Substance,"Amino Acid, Peptide, or Protein",,"Amino Acid, Peptide, or Protein"
2414,C027329,naringenin chalcone,"2',4,4',6'-tetrahydroxychalcone|chalconaringen...",Biologically Active Substance,Organic Chemical,,Organic Chemical
34427,C082338,FN-C-H II peptide,,"Amino Acid, Peptide, or Protein",,,"Amino Acid, Peptide, or Protein"


In [153]:
mesh_df.to_csv('data/MESH_with_Semantic_types.csv',index=False)

## Chemicals in MESH

In [11]:
mesh_df = pd.read_csv('data/MESH_with_Semantic_types.csv')
mesh_df

Unnamed: 0,Class ID,Preferred Label,Synonyms,type_x,type_y,type_z,type
0,C000624633,technetium 99m hydroxyethylene-diphosphonate,99mTc-HDP|99mTc-hydroxyethylene-diphosphonate,"Indicator, Reagent, or Diagnostic Aid",Inorganic Chemical,,Inorganic Chemical
1,C020354,hexacyanoferrate II,hexacyanoiron(II)|ferrocyanate|ferrocyanide io...,"Indicator, Reagent, or Diagnostic Aid",Inorganic Chemical,,Inorganic Chemical
2,C044522,neptunium nitrate,"neptunium nitrate, 237Np-labeled cpd Np(5+)|ne...","Indicator, Reagent, or Diagnostic Aid",Inorganic Chemical,,Inorganic Chemical
3,D010772,Phosphotungstic Acid,"Tungsten hydroxide oxide phosphate|Acid, Phosp...","Indicator, Reagent, or Diagnostic Aid",Inorganic Chemical,,Inorganic Chemical
4,C045076,Fenton's reagent,Fenton reagent,"Indicator, Reagent, or Diagnostic Aid",Inorganic Chemical,,Inorganic Chemical
...,...,...,...,...,...,...,...
87228,D008393,Marriage,"Union, Consensual|Age, Marriage|Ages, Marriage...",Qualitative Concept,Temporal Concept,,Temporal Concept
87229,D004868,Equipment Failure,"Misuses, Equipment|Failures, Equipment|Equipme...",Machine Activity,Phenomenon or Process,Activity,Activity
87230,D005298,Fertility,Fertility Preferences|Fertility Preference|Mar...,Organism Function,Entity,,Entity
87231,D016247,Information Storage and Retrieval,"Data Source|Retrieval, Data|Machine-Readable D...",Occupational Activity,Machine Activity,,Machine Activity


In [160]:
list(mesh_df.type.unique())

['Inorganic Chemical',
 'Organic Chemical',
 'Immunologic Factor',
 'Indicator, Reagent, or Diagnostic Aid',
 'Biologically Active Substance',
 'Virus',
 'Bacterium',
 'Enzyme',
 'Amino Acid, Peptide, or Protein',
 'Disease or Syndrome',
 'Element, Ion, or Isotope',
 'Body Part, Organ, or Organ Component',
 'Biomedical or Dental Material',
 'Diagnostic Procedure',
 'Nucleic Acid, Nucleoside, or Nucleotide',
 'Antibiotic',
 'Pharmacologic Substance',
 'Plant',
 'Body Location or Region',
 'Geographic Area',
 'Congenital Abnormality',
 'Molecular Function',
 'Cell',
 'Therapeutic or Preventive Procedure',
 'Biomedical Occupation or Discipline',
 'Pathologic Function',
 'Intellectual Product',
 'Functional Concept',
 'Tissue',
 'Health Care Activity',
 'Eukaryote',
 'Population Group',
 'Hazardous or Poisonous Substance',
 'Vitamin',
 'Injury or Poisoning',
 'Body Space or Junction',
 'Laboratory Procedure',
 'Finding',
 'Medical Device',
 'Social Behavior',
 'Neoplastic Process',
 'Quant

In [12]:
chemical_types = ['Inorganic Chemical',
         'Organic Chemical',
         'Immunologic Factor',
         'Indicator, Reagent, or Diagnostic Aid',
         'Biologically Active Substance',
         'Element, Ion, or Isotope',
         'Antibiotic',
         'Pharmacologic Substance',
         'Molecular Function',
         'Hazardous or Poisonous Substance',
         'Vitamin',
         'Chemical Viewed Structurally',
         'Chemical',
         'Clinical Attribute',
         'Hormone',
         'Clinical Drug',
         'Substance',
         'Body Substance',
         'Chemical Viewed Functionally',
         'Laboratory or Test Result'
 ]

In [13]:
chemical_mesh_df = mesh_df[mesh_df.type.isin(chemical_types)]

In [14]:
chemical_mesh_df.sample(10)

Unnamed: 0,Class ID,Preferred Label,Synonyms,type_x,type_y,type_z,type
23861,C098546,ethyl tert-butyl ether,2-ethoxy-2-methylpropane|ethyl tertiary-butyl ...,Organic Chemical,,,Organic Chemical
59570,C058287,glidobactin H,Glidobactin H,"Amino Acid, Peptide, or Protein",Pharmacologic Substance,,Pharmacologic Substance
61526,D001278,Atractyloside,"Atractylic Acid|Acid, Atractylic|19-Norkaur-16...",Organic Chemical,Hazardous or Poisonous Substance,,Hazardous or Poisonous Substance
30236,C532539,luteolin 6-C-beta-boivinopyranoside,,Organic Chemical,,,Organic Chemical
3441,C495698,hu3S193-N-acetyl gamma calicheamicin dimethyl ...,hu3S193-CalichDMH,Organic Chemical,Immunologic Factor,,Immunologic Factor
85784,C000613469,tafasitamab,XmAb5574|XMAB-5574|MOR00208|MOR-208|MOR208 mon...,"Amino Acid, Peptide, or Protein",Immunologic Factor,Pharmacologic Substance,Pharmacologic Substance
15359,C013266,3-sulfonamido-4-chlorobenzoic acid,,Organic Chemical,,,Organic Chemical
102,C037129,"gold tetrabromide, acid","potassium tetrabromoaurate|gold tetrabromide, ...",Pharmacologic Substance,Inorganic Chemical,,Inorganic Chemical
61207,C000613050,L01-750,,Immunologic Factor,Pharmacologic Substance,,Pharmacologic Substance
48148,C555157,SGI-1252,,Organic Chemical,Pharmacologic Substance,,Pharmacologic Substance


In [15]:
print(chemical_mesh_df.shape)

(45292, 7)


In [177]:
chemical_mesh_df.to_csv('data/MESH_chemicals.csv',index=False)

In [11]:
chemical_mesh_df = pd.read_csv('data/MESH_chemicals.csv')
chemical_mesh_df

Unnamed: 0,Class ID,Preferred Label,Synonyms,type_x,type_y,type_z,type
0,C000624633,technetium 99m hydroxyethylene-diphosphonate,99mTc-HDP|99mTc-hydroxyethylene-diphosphonate,"Indicator, Reagent, or Diagnostic Aid",Inorganic Chemical,,Inorganic Chemical
1,C020354,hexacyanoferrate II,hexacyanoiron(II)|ferrocyanate|ferrocyanide io...,"Indicator, Reagent, or Diagnostic Aid",Inorganic Chemical,,Inorganic Chemical
2,C044522,neptunium nitrate,"neptunium nitrate, 237Np-labeled cpd Np(5+)|ne...","Indicator, Reagent, or Diagnostic Aid",Inorganic Chemical,,Inorganic Chemical
3,D010772,Phosphotungstic Acid,"Tungsten hydroxide oxide phosphate|Acid, Phosp...","Indicator, Reagent, or Diagnostic Aid",Inorganic Chemical,,Inorganic Chemical
4,C045076,Fenton's reagent,Fenton reagent,"Indicator, Reagent, or Diagnostic Aid",Inorganic Chemical,,Inorganic Chemical
...,...,...,...,...,...,...,...
45287,C031149,glycolic acid,"glycolate|glycolic acid, monopotassium salt|gl...",Chemical Viewed Structurally,"Indicator, Reagent, or Diagnostic Aid",,"Indicator, Reagent, or Diagnostic Aid"
45288,C029063,"fructose-1,6-diphosphate","fructose-1,6-diphosphate, disodium salt|fructo...",Biologically Active Substance,Chemical Viewed Structurally,,Chemical Viewed Structurally
45289,C444119,technetium Tc 99m depreotide,NeoTec|Tc-99m depreotide,"Amino Acid, Peptide, or Protein",Hormone,,Hormone
45290,D012662,Semen Preservation,"Frozen Semen|Preservation, Sperm|Semen, Frozen...",Health Care Activity,Laboratory Procedure,Body Substance,Body Substance


# Union of Databases

In [12]:
df = pd.merge(pubchem_df,chebi_df,on='inchikey',how='outer')
df

Unnamed: 0,cid,cmpdname,cmpdsynonym,inchikey,meshheadings,ChEBI_label,ChEBI_synonyms,ChEBI_parents,ChEBI_cross_reference,ChEBI_id
0,1,Acetylcarnitine,Acetyl-DL-carnitine|acetylcarnitine|DL-O-Acety...,RDHQFKQIGNGIED-UHFFFAOYSA-N,Acetylcarnitine,O-acetylcarnitine,3-(acetyloxy)-4-(trimethylazaniumyl)butanoate|...,http://purl.obolibrary.org/obo/CHEBI_17387,CAS:870-77-9|PMID:23315938|Reaxys:4136458,CHEBI:73024
1,4,1-Aminopropan-2-ol,1-Aminopropan-2-ol|1-AMINO-2-PROPANOL|78-96-6|...,HXKKHQJGJAFBHI-UHFFFAOYSA-N,,1-aminopropan-2-ol,1-aminopropan-2-ol|1-methyl-2-aminoethanol|2-h...,http://purl.obolibrary.org/obo/CHEBI_35681|htt...,HMDB:HMDB0012136|PMID:4362743|CAS:78-96-6|Reax...,CHEBI:19030
2,6,"1-Chloro-2,4-dinitrobenzene","1-chloro-2,4-dinitrobenzene|2,4-Dinitrochlorob...",VYZAHLCBVHPDDF-UHFFFAOYSA-N,Dinitrochlorobenzene,"1-chloro-2,4-dinitrobenzene","1-chloro-2,4-dinitrobenzene|1-Chloro-2,4-dinit...",http://purl.obolibrary.org/obo/CHEBI_35716|htt...,CAS:97-00-7|PMID:25515858|Reaxys:613161|PMID:1...,CHEBI:34718
3,7,9-Ethyladenine,9-Ethyladenine|2715-68-6|9-ethyl-9h-purin-6-am...,MUIPLRMGAXZWSQ-UHFFFAOYSA-N,,,,,,
4,8,"2,3-Dihydroxy-3-methylpentanoic acid","2,3-Dihydroxy-3-methylpentanoic acid|2,3-dihyd...",PDGXJDXVGMHUIR-UHFFFAOYSA-N,,"2,3-Dihydroxy-3-methylpentanoate","2,3-Dihydroxy-3-methylpentanoate|2,3-Dihydroxy...",http://purl.obolibrary.org/obo/CHEBI_24654,KEGG:C04104,CHEBI:882
...,...,...,...,...,...,...,...,...,...,...
246486,,,,RYZLCEZTTCNESU-ISYZDWFVSA-N,,"(2S,4S,5R,6R)-5-Acetamido-2-[(2R,3R,4S,5S,6R)-...","WURCS=2.0/5,7,6/[h2112h_2*NCC/3=O][a2112h-1b_1...",http://purl.obolibrary.org/obo/CHEBI_37909,GlyTouCan:G49104WF|GlyGen:G49104WF,CHEBI:151828
246487,,,,OETBDAZRINTQAP-SNUYCWBESA-N,,"(5Z,7E)-(1R,2R,3R)-3-(hydroxy-propyl)-19-nor-9...","(1R,2R,3R,5Z,7E,17beta)-17-[(2R)-6-hydroxy-6-m...",http://purl.obolibrary.org/obo/CHEBI_27300,,CHEBI:137133
246488,,,,DEWDMTSMCKXBNP-BYPYZUCNSA-M,,N-carbamoyl-L-methioninate,(2S)-2-(carbamoylamino)-4-(methylsulfanyl)buta...,http://purl.obolibrary.org/obo/CHEBI_58865,PMID:1732229,CHEBI:137116
246489,,,,GIFMPMIHCYRIMA-GAMJHBRKSA-N,,"[(2R,3S,4R,5R,6R)-5-Acetamido-6-[[(2R,3R,4R,5R...",beta-D-galacto-hexopyranosyl-(1->3)-[2-acetami...,http://purl.obolibrary.org/obo/CHEBI_78616,GlyTouCan:G71575PR|GlyGen:G71575PR,CHEBI:157053


In [13]:
df.isnull().sum(axis = 0)

cid                      115829
cmpdname                 115829
cmpdsynonym              115829
inchikey                  22476
meshheadings             115829
ChEBI_label              108419
ChEBI_synonyms           157565
ChEBI_parents            108419
ChEBI_cross_reference    139115
ChEBI_id                 108419
dtype: int64

In [14]:
df.fillna('', inplace=True)
df

Unnamed: 0,cid,cmpdname,cmpdsynonym,inchikey,meshheadings,ChEBI_label,ChEBI_synonyms,ChEBI_parents,ChEBI_cross_reference,ChEBI_id
0,1,Acetylcarnitine,Acetyl-DL-carnitine|acetylcarnitine|DL-O-Acety...,RDHQFKQIGNGIED-UHFFFAOYSA-N,Acetylcarnitine,O-acetylcarnitine,3-(acetyloxy)-4-(trimethylazaniumyl)butanoate|...,http://purl.obolibrary.org/obo/CHEBI_17387,CAS:870-77-9|PMID:23315938|Reaxys:4136458,CHEBI:73024
1,4,1-Aminopropan-2-ol,1-Aminopropan-2-ol|1-AMINO-2-PROPANOL|78-96-6|...,HXKKHQJGJAFBHI-UHFFFAOYSA-N,,1-aminopropan-2-ol,1-aminopropan-2-ol|1-methyl-2-aminoethanol|2-h...,http://purl.obolibrary.org/obo/CHEBI_35681|htt...,HMDB:HMDB0012136|PMID:4362743|CAS:78-96-6|Reax...,CHEBI:19030
2,6,"1-Chloro-2,4-dinitrobenzene","1-chloro-2,4-dinitrobenzene|2,4-Dinitrochlorob...",VYZAHLCBVHPDDF-UHFFFAOYSA-N,Dinitrochlorobenzene,"1-chloro-2,4-dinitrobenzene","1-chloro-2,4-dinitrobenzene|1-Chloro-2,4-dinit...",http://purl.obolibrary.org/obo/CHEBI_35716|htt...,CAS:97-00-7|PMID:25515858|Reaxys:613161|PMID:1...,CHEBI:34718
3,7,9-Ethyladenine,9-Ethyladenine|2715-68-6|9-ethyl-9h-purin-6-am...,MUIPLRMGAXZWSQ-UHFFFAOYSA-N,,,,,,
4,8,"2,3-Dihydroxy-3-methylpentanoic acid","2,3-Dihydroxy-3-methylpentanoic acid|2,3-dihyd...",PDGXJDXVGMHUIR-UHFFFAOYSA-N,,"2,3-Dihydroxy-3-methylpentanoate","2,3-Dihydroxy-3-methylpentanoate|2,3-Dihydroxy...",http://purl.obolibrary.org/obo/CHEBI_24654,KEGG:C04104,CHEBI:882
...,...,...,...,...,...,...,...,...,...,...
246486,,,,RYZLCEZTTCNESU-ISYZDWFVSA-N,,"(2S,4S,5R,6R)-5-Acetamido-2-[(2R,3R,4S,5S,6R)-...","WURCS=2.0/5,7,6/[h2112h_2*NCC/3=O][a2112h-1b_1...",http://purl.obolibrary.org/obo/CHEBI_37909,GlyTouCan:G49104WF|GlyGen:G49104WF,CHEBI:151828
246487,,,,OETBDAZRINTQAP-SNUYCWBESA-N,,"(5Z,7E)-(1R,2R,3R)-3-(hydroxy-propyl)-19-nor-9...","(1R,2R,3R,5Z,7E,17beta)-17-[(2R)-6-hydroxy-6-m...",http://purl.obolibrary.org/obo/CHEBI_27300,,CHEBI:137133
246488,,,,DEWDMTSMCKXBNP-BYPYZUCNSA-M,,N-carbamoyl-L-methioninate,(2S)-2-(carbamoylamino)-4-(methylsulfanyl)buta...,http://purl.obolibrary.org/obo/CHEBI_58865,PMID:1732229,CHEBI:137116
246489,,,,GIFMPMIHCYRIMA-GAMJHBRKSA-N,,"[(2R,3S,4R,5R,6R)-5-Acetamido-6-[[(2R,3R,4R,5R...",beta-D-galacto-hexopyranosyl-(1->3)-[2-acetami...,http://purl.obolibrary.org/obo/CHEBI_78616,GlyTouCan:G71575PR|GlyGen:G71575PR,CHEBI:157053


In [15]:
(df['meshheadings'].values == '').sum() 

238018

In [16]:
chemical_mesh_df.sample(10)

Unnamed: 0,Class ID,Preferred Label,Synonyms,type_x,type_y,type_z,type
27611,C000708352,trilaciclib,2'-((5-(4-Methyl-1-piperazinyl)-2-pyridinyl)am...,Organic Chemical,Pharmacologic Substance,,Pharmacologic Substance
31592,C571528,chloro(curcuminato)(p-cymene)ruthenium(II),(p-cymene)Ru(curcuminato)chloro,Organic Chemical,Pharmacologic Substance,,Pharmacologic Substance
5326,C502343,bromanil,,Organic Chemical,,,Organic Chemical
6125,C000626579,LSS-11,,Organic Chemical,,,Organic Chemical
43278,C058397,ganglioside GT1b-bovine serum albumin complex,GGT1b-BSA,"Amino Acid, Peptide, or Protein","Indicator, Reagent, or Diagnostic Aid",,"Indicator, Reagent, or Diagnostic Aid"
38014,C451860,6-bromo-2-(2-(3-(2-chloroethyl)-3-nitrosoureid...,6-Br-CE-NUE-B-IQ,Organic Chemical,Pharmacologic Substance,,Pharmacologic Substance
43311,C000619835,technetium 99m HYNIC-H6F,99mTc-HYNIC-H6F,"Amino Acid, Peptide, or Protein","Indicator, Reagent, or Diagnostic Aid",,"Indicator, Reagent, or Diagnostic Aid"
1592,C005356,sphingosine phosphorylcholine,lysosphingomyelin|sphingosylphosphocholine|sph...,Biologically Active Substance,Organic Chemical,,Organic Chemical
23329,C548136,papyriferic acid,,Organic Chemical,,,Organic Chemical
14168,C529319,"1,7-dihydroxy-3-methoxyacridone",KS-5 cpd,Organic Chemical,,,Organic Chemical


## Multiprocessing with Dask

In [136]:
from difflib import SequenceMatcher

def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

In [15]:
import dask
from dask.distributed import Client, progress
from dask import delayed

client = Client()
client

0,1
Client  Scheduler: tcp://127.0.0.1:34693  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 12  Memory: 7.45 GiB


In [16]:
@delayed(nout=2)
def check_mesh(chemical_mesh_df,i,pubchem_doc):
    for j in chemical_mesh_df.itertuples(index=False):
        if (j[1].lower() == i[1].lower()) or (j[1].lower in [item.lower() for item in pubchem_doc['synonyms']]):
            return True, j
    return False, j

In [17]:
@delayed(nout=4)
def mesh_addition(mesh_flag,mesh,pubchem_doc,new_syn_mesh,total_mesh_concepts): 
    if mesh_flag:
        mesh_id = mesh[0]
        print(mesh_id)
        if mesh[1] not in pubchem_doc['synonyms']:
            new_syn_mesh.append(mesh[1])
        semantic_type = mesh[6]
        total_mesh_concepts = total_mesh_concepts + 1
        return mesh_id,new_syn_mesh,semantic_type,total_mesh_concepts
    return '',[],'',total_mesh_concepts

In [None]:
documents = []
num = 0
total_mesh_concepts = 0

for i in df.itertuples(index=False):
    pubchem_doc = {}
    
    if len(i[1])==0 and len(i[5])>0:
            pubchem_doc['term'] = i[5]
    else:
            pubchem_doc['term'] = i[1]
            
    synonyms = []
    if len(i[2])>0:
        synonyms = synonyms + i[2].split("|")  
    if len(i[6])>0:
        possible_new_syn = i[6].split("|")
        if possible_new_syn not in synonyms:
            synonyms = synonyms + possible_new_syn
    pubchem_doc['synonyms']= synonyms
    
    mesh_id=''
    semantic_type =''
    new_syn_mesh = []
    
    mesh_flag,mesh = check_mesh(chemical_mesh_df,i,pubchem_doc)
    mesh_id,new_syn_mesh,semantic_type,total_mesh_concepts = mesh_addition(mesh_flag,mesh,pubchem_doc,new_syn_mesh,total_mesh_concepts)
    
    pubchem_doc['synonyms'] = pubchem_doc['synonyms'] + new_syn_mesh
    pubchem_doc['mesh_id'] = mesh_id
    pubchem_doc['semantic_type'] = semantic_type        
    pubchem_doc['cid']= i[0]
    pubchem_doc['chebi_id']= i[9] 
    pubchem_doc['inchikey'] = i[3]
    
    cross_references = []
    if len(i[9])>0:
        cross_references = cross_references + i[8].split("|")
    pubchem_doc['cross_references']= cross_references
    
    num +=1
    if num%1000 == 0:
        print(num/df.shape[0]*100,'% processed')
        
    documents.append(pubchem_doc)


results = dask.persist(*documents)

print(total_mesh_concepts, 'MESH ids added')    

## Processing in one core

In [16]:
import json
import os

In [17]:
def check_mesh(chemical_mesh_df,i,pubchem_doc):
    for j in chemical_mesh_df.itertuples(index=False):
        if (j[1].lower() == i[1].lower()) or (j[1].lower in [item.lower() for item in pubchem_doc['synonyms']]):
            return True, j
    return False, j

In [18]:
def mesh_addition(mesh_flag,mesh,pubchem_doc,new_syn_mesh,total_mesh_concepts): 
    if mesh_flag:
        mesh_id = mesh[0]
        if mesh[1] not in pubchem_doc['synonyms']:
            new_syn_mesh.append(mesh[1])
        semantic_type = mesh[6]
        total_mesh_concepts = total_mesh_concepts + 1
        return mesh_id,new_syn_mesh,semantic_type,total_mesh_concepts
    return '',[],'',total_mesh_concepts

In [49]:
documents = []
num = 0
total_mesh_concepts = 0
file_part = 'data/final/drugs_doc_1.json'
checkpoint_flag = True
length_file = 20000

for i in df.itertuples(index=False):
    num +=1
    if os.path.isfile(file_part):
        #print('Archivo',file_part,'encontrado')
        if num%length_file == 0:
            part = int(num/length_file)
            print('Part number',part)        
            file_part = 'data/final/drugs_doc_'+str(part+1)+'.json'
        checkpoint_flag = True

    else:
        pubchem_doc = {}

        if len(i[1])==0 and len(i[5])>0:
                pubchem_doc['term'] = i[5]
        else:
                pubchem_doc['term'] = i[1]

        synonyms = []
        if len(i[2])>0:
            synonyms = synonyms + i[2].split("|")  
        if len(i[6])>0:
            possible_new_syn = i[6].split("|")
            if possible_new_syn not in synonyms:
                synonyms = synonyms + possible_new_syn
        pubchem_doc['synonyms']= synonyms

        mesh_id=''
        semantic_type =''
        new_syn_mesh = []

        mesh_flag,mesh = check_mesh(chemical_mesh_df,i,pubchem_doc)
        mesh_id,new_syn_mesh,semantic_type,total_mesh_concepts = mesh_addition(mesh_flag,mesh,pubchem_doc,new_syn_mesh,total_mesh_concepts)
        
        
        pubchem_doc['synonyms'] = pubchem_doc['synonyms'] + new_syn_mesh
        pubchem_doc['mesh_headings'] = i[4]
        pubchem_doc['mesh_id'] = mesh_id
        pubchem_doc['semantic_type'] = semantic_type        
        pubchem_doc['cid']= i[0]
        pubchem_doc['chebi_id']= i[9] 
        pubchem_doc['inchikey'] = i[3]

        cross_references = []
        if len(i[9])>0:
            cross_references = cross_references + i[8].split("|")
        pubchem_doc['cross_references']= cross_references

        if num%(length_file/10) == 0:
            print(num)
            print(num/df.shape[0]*100,'% processed')
            
        documents.append(pubchem_doc)
        
        if num%length_file == 0:
            part = int(num/length_file)
            print('Part number',part)
            file_part = 'data/final/drugs_doc_'+str(part)+'.json'
            if checkpoint_flag:
                print('Processing file with checkpoint')
                with open(file_part, 'w') as fout:
                    json.dump(documents, fout)
                start = length_file
                end = start + length_file
            else:
                print('Processing file without checkpoint from',start)
                with open(file_part, 'w') as fout:
                    json.dump(documents[start:end], fout)
                start = end
                end = start + length_file
            file_part = 'data/final/drugs_doc_'+str(part+1)+'.json'
            checkpoint_flag = False
    


print(total_mesh_concepts, 'MESH ids added')    

2000
0.811388651106937 % processed
4000
1.622777302213874 % processed
6000
2.434165953320811 % processed
8000
3.245554604427748 % processed
10000
4.0569432555346845 % processed
12000
4.868331906641622 % processed
14000
5.679720557748559 % processed
16000
6.491109208855496 % processed
18000
7.302497859962433 % processed
20000
8.113886511069369 % processed
Part number 1
Processing file with checkpoint
22000
8.925275162176307 % processed
24000
9.736663813283243 % processed
26000
10.54805246439018 % processed
28000
11.359441115497118 % processed
30000
12.170829766604054 % processed
32000
12.982218417710992 % processed
34000
13.793607068817929 % processed
36000
14.604995719924865 % processed
38000
15.416384371031802 % processed
40000
16.227773022138738 % processed
Part number 2
Processing file without checkpoint from 20000
42000
17.039161673245676 % processed
44000
17.850550324352614 % processed
46000
18.66193897545955 % processed
48000
19.473327626566487 % processed
50000
20.28471627767342

In [22]:
mypath = 'data/final'
drug_files = [f for f in os.listdir(mypath) if os.path.isfile(os.path.join(mypath, f))]

In [33]:
drugs = []
for file in drug_files:
    with open(os.path.join(mypath,file), 'r') as fout:
        drug_file = json.loads(fout.read())
    drugs += drug_file

with open('data/drugs_doc_all.json', 'w') as fout:
    json.dump(drugs, fout)

print(len(drugs))

246491


## Load processed drugs json

In [34]:
with open('data/drugs_doc_all.json', 'r') as fout:
    drug_file = json.loads(fout.read())
    print(len(drug_file))

246491


In [39]:
drug_file[0].keys()

dict_keys(['term', 'synonyms', 'mesh_headings', 'mesh_id', 'semantic_type', 'cid', 'chebi_id', 'inchikey', 'cross_references'])

# CTD

In [25]:
important_columns_ctd = ['ChemicalName','ChemicalID', 'Synonyms']

In [26]:
ctd_df = pd.read_csv(
    "data/CTD_chemicals.csv",      # relative python path to subdirectory
    sep=',',
    usecols=important_columns_ctd,
    dtype=str
)

print(ctd_df.shape)
ctd_df.head()

(174239, 3)


Unnamed: 0,ChemicalName,ChemicalID,Synonyms
0,(0.017ferrocene)amylose,MESH:C089250,(0.017 ferrocene)amylose
1,001-C8-NBD,MESH:C114385,001 C8 NBD|H-MeTyr-Arg-MeArg-D-Leu-NH(CH2)8NH-...
2,001-C8 oligopeptide,MESH:C114386,001 C8 oligopeptide|H-MeTyr-Arg-MeArg-D-Leu-NH...
3,"0231A , Streptomyces",MESH:C434150,
4,"0231B, Streptomyces",MESH:C434149,


In [27]:
ctd_df = ctd_df[ctd_df['ChemicalID'].str.startswith('MESH')]
print(ctd_df.shape)

(174239, 3)


In [28]:
ctd_df['ChemicalID'] = ctd_df['ChemicalID'].apply(lambda x: x.split(':')[1])

In [138]:
#ctd_df['Synonyms'] = ctd_df['Synonyms'].apply(lambda x: [syn for syn in str(x).split('|')])

In [29]:
ctd_df.sample(10)

Unnamed: 0,ChemicalName,ChemicalID,Synonyms
29683,"3-hydroxymethylphenytoin N,N-dimethylglycine e...",C043113,"Glycine, N,N-dimethyl-, (R)-|HDH ODGE"
112711,LQB34 compound,C527703,
117411,methyl-(4-carboxyphenyl)glycine,C474696,alpha-methyl-(4-carboxyphenyl)glycine|MCPG cpd
12055,"2,2',4,5'-tetrachlorobiphenyl",C042128,"2,2',4,5'-TCB"
76728,CRL 41405,C077879,2-(hexahydro-1H-azepin-1-yl)-4'-aminopropiophe...
100226,hexa CAF protocol,C040421,hexa-CAF
74214,clathrimide B,C575041,
45670,"6-methyl-8-methylsulfonyl-1,3,4,6-tetrahydro-2...",C048443,"2H-5,1-Benzothiazocine, 1,3,4,6-tetrahydro-6-m..."
101338,HLA-DR4 Antigen,D015804,"Antigen, HLA-DR4|HLA-DR4|HLA DR4 Antigen"
71375,CEVAD protocol,C404862,CEVAD regimen


In [190]:
i = 0
num = 0
mesh_appended= set([])

for doc in drug_file:
    if doc['mesh_id'] == '':
        proposed_mesh = ctd_df.loc[ctd_df['ChemicalName']==doc['term']]
        if len(proposed_mesh)>0:
            new_mesh = str(proposed_mesh['ChemicalID'].values[0])
            doc['mesh_id'] = new_mesh
            mesh_appended.add(new_mesh)
            num +=1
        else:
            try:
                alternative_proposed_mesh = ctd_df[ctd_df['Synonyms'].str.contains(doc['term'], na=False, case = False,regex= False)]
                #print(alternative_proposed_mesh)
                if len(alternative_proposed_mesh)>0 and len(alternative_proposed_mesh)<2:
                    new_mesh = str(alternative_proposed_mesh['ChemicalID'].values[0])
                    doc['mesh_id'] = new_mesh
                    mesh_appended.add(new_mesh)
                    num +=1
                elif len(alternative_proposed_mesh)>1:
                    all_terms=[]
                    for j in alternative_proposed_mesh.itertuples(index=False):
                        all_terms.append(j[0])
                        all_terms += [f for f in j[2].split('|')]
                        
                        if doc['term'] in all_terms:
                            doc['mesh_id'] = j[1]
                            mesh_appended.add(j[1])
                            num +=1
                        elif similar(doc['term'],j[0])>0.85:
                            doc['mesh_id'] = j[1] 
                            mesh_appended.add(j[1])
                            num +=1
                        all_terms=[]
                        
                
            except:
                print('Error in:',doc['term'])
            
    i +=1
    if i%1000 == 0:
        print(i)
        print('No of mesh ids linked:',num)
            

1000
No of mesh ids linked: 90
2000
No of mesh ids linked: 126
3000
No of mesh ids linked: 163
4000
No of mesh ids linked: 253
5000
No of mesh ids linked: 321
6000
No of mesh ids linked: 367
7000
No of mesh ids linked: 411
8000
No of mesh ids linked: 451
9000
No of mesh ids linked: 467
10000
No of mesh ids linked: 503
11000
No of mesh ids linked: 528
12000
No of mesh ids linked: 539
13000
No of mesh ids linked: 551
14000
No of mesh ids linked: 601
15000
No of mesh ids linked: 629
16000
No of mesh ids linked: 645
17000
No of mesh ids linked: 673
18000
No of mesh ids linked: 821
19000
No of mesh ids linked: 1059
20000
No of mesh ids linked: 1329
21000
No of mesh ids linked: 1559
22000
No of mesh ids linked: 1766
23000
No of mesh ids linked: 1967
24000
No of mesh ids linked: 2214
25000
No of mesh ids linked: 2440
26000
No of mesh ids linked: 2702
27000
No of mesh ids linked: 3013
28000
No of mesh ids linked: 3283
29000
No of mesh ids linked: 3522
30000
No of mesh ids linked: 3725
31000
No

In [191]:
with open('data/drugs_with_mesh.json', 'w') as fout:
    json.dump(drug_file, fout)

In [None]:
with open('data/foo.txt', 'w') as fout:
    for row in mesh_appended:
        output.write(str(row) + '\n')

## Read docs with MESH

In [12]:
with open('data/drugs_with_mesh.json', 'r') as fout:
    drug_file = json.loads(fout.read())
    print(len(drug_file))

246491


In [40]:
drug_file[0].keys()

dict_keys(['term', 'synonyms', 'mesh_headings', 'mesh_id', 'semantic_type', 'cid', 'chebi_id', 'inchikey', 'cross_references'])

## Append non linked Mesh terms

In [33]:
with open('data/mesh_appended.txt','r') as f:
    mesh_appended = f.read().splitlines() 

In [35]:
len(mesh_appended)

21864

In [37]:
df_not_linked = ctd_df[~ctd_df.ChemicalID.isin(mesh_appended)]

In [41]:
df_not_linked.fillna('',inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [42]:
df_not_linked

Unnamed: 0,ChemicalName,ChemicalID,Synonyms
0,(0.017ferrocene)amylose,C089250,(0.017 ferrocene)amylose
1,001-C8-NBD,C114385,001 C8 NBD|H-MeTyr-Arg-MeArg-D-Leu-NH(CH2)8NH-...
2,001-C8 oligopeptide,C114386,001 C8 oligopeptide|H-MeTyr-Arg-MeArg-D-Leu-NH...
3,"0231A , Streptomyces",C434150,
4,"0231B, Streptomyces",C434149,
...,...,...,...
174234,"(Z,Z)-6,9-heneicosadien-11-ol",C482761,Z6Z9-11R-ol-C21
174235,"(Z,Z)-6,9-heptadecadiene",C000712888,"6,9-C17"
174236,"(Z,Z)-7,11-hexadecadienal",C558532,
174237,"(Z,Z)-dodeca-3,6-dien-1-ol",C483463,


In [46]:
for i in df_not_linked.itertuples(index=False):
    new_doc = {}
    new_doc['term'] = i[0]
    
    synonyms = []
    if len(i[2])>0:
        synonyms += i[2].split("|")
    new_doc['synonyms']= synonyms

    new_doc['mesh_headings'] = ''
    new_doc['mesh_id'] = i[1]
    new_doc['semantic_type'] = ''
    new_doc['cid'] = ''
    new_doc['chebi_id'] = ''
    new_doc['inchikey'] = ''
    new_doc['cross_references'] = ''
    
    drug_file.append(new_doc)

In [47]:
print(len(drug_file))

398866


## Resultant number of linked Mesh terms

In [48]:
from collections import Counter
counter = Counter(x['mesh_id'] for x in drug_file)

In [49]:
print('The number of terms with MESH value is:',len(drug_file)-counter[''])

The number of terms with MESH value is: 199920


## Save and load Doc with Mesh

In [50]:
with open('data/drugs.json', 'w') as fout:
    json.dump(drug_file, fout)

In [44]:
with open('data/drugs.json', 'r') as fout:
    drugs = json.loads(fout.read())
    print(len(drugs))

398866


# ATC

In [45]:
atc_df= pd.read_csv(
    "data/ATC.csv",      # relative python path to subdirectory
    sep=',',
    na_filter=False,
    quotechar="\"",       # single quote allowed as quote character
    usecols=['Class ID','Preferred Label','Synonyms','Semantic Types','ATC LEVEL'],
    dtype=str
)

print(atc_df.shape)
atc_df.head()

(6458, 5)


Unnamed: 0,Class ID,Preferred Label,Synonyms,Semantic Types,ATC LEVEL
0,http://purl.bioontology.org/ontology/UATC/A03AX13,silicones,,http://purl.bioontology.org/ontology/STY/T109|...,5
1,http://purl.bioontology.org/ontology/UATC/J01DB07,cefatrizine,,http://purl.bioontology.org/ontology/STY/T195|...,5
2,http://purl.bioontology.org/ontology/UATC/V09AA,"Technetium 99m compounds, central nervous syst...",Technetium (99mTc) compounds,http://purl.bioontology.org/ontology/STY/T130|...,4
3,http://purl.bioontology.org/ontology/UATC/G03GA04,urofollitropin,,http://purl.bioontology.org/ontology/STY/T116|...,5
4,http://purl.bioontology.org/ontology/UATC/D04AA10,promethazine,,http://purl.bioontology.org/ontology/STY/T109|...,5


In [46]:
atc_df['Class ID'] = atc_df['Class ID'].apply(lambda x: x.split('/')[-1])

In [47]:
atc_df['Semantic Types'] = atc_df['Semantic Types'].apply(lambda x: x.split('|')[-1].split('/')[-1])

In [48]:
semantic_type_df = pd.read_csv(
    "data/STY.csv",      # relative python path to subdirectory
    sep=',',
    dtype=str,
    usecols=['Class ID','Preferred Label']
)
semantic_type_df

Unnamed: 0,Class ID,Preferred Label
0,http://purl.bioontology.org/ontology/STY/T057,Occupational Activity
1,http://purl.bioontology.org/ontology/STY/T047,Disease or Syndrome
2,http://purl.bioontology.org/ontology/STY/T167,Substance
3,http://purl.bioontology.org/ontology/STY/T066,Machine Activity
4,http://purl.bioontology.org/ontology/STY/T184,Sign or Symptom
...,...,...
122,http://purl.bioontology.org/ontology/STY/T194,Archaeon
123,http://purl.bioontology.org/ontology/STY/T012,Bird
124,http://purl.bioontology.org/ontology/STY/T087,Amino Acid Sequence
125,http://purl.bioontology.org/ontology/STY/T122,Biomedical or Dental Material


In [49]:
semantic_type_df['Class ID'] = semantic_type_df['Class ID'].apply(lambda x: x.split('/')[-1])

In [50]:
atc_df = atc_df.merge(semantic_type_df,left_on='Semantic Types',right_on='Class ID',how='inner')

In [51]:
atc_df.drop(['Semantic Types','Class ID_y'],inplace=True,axis=1)

In [52]:
atc_df.rename(columns={'Preferred Label_y':'Semantic Type','Preferred Label_x':'Term','Class ID_x':'ATC'},inplace=True)

In [53]:
atc_df

Unnamed: 0,ATC,Term,Synonyms,ATC LEVEL,Semantic Type
0,A03AX13,silicones,,5,Biomedical or Dental Material
1,B05AA06,gelatin agents,,5,Biomedical or Dental Material
2,V07AA,Plasters,,4,Biomedical or Dental Material
3,A01AA,Caries prophylactic agents,,4,Biomedical or Dental Material
4,V03AE01,polystyrene sulfonate,,5,Biomedical or Dental Material
...,...,...,...,...,...
6326,A07FA02,saccharomyces boulardii,,5,Fungus
6327,B05AX03,blood plasma,,5,Body Substance
6328,G01AX14,lactobacillus,,5,Bacterium
6329,D08AL30,silver,,5,"Element, Ion, or Isotope"


In [54]:
i = 0
num = 0
atc_appended= set([])

for doc in drugs:
    proposed_atc = atc_df.loc[atc_df['Term']==doc['term']]
    if len(proposed_atc)>0:
        new_atc_id = str(proposed_atc['ATC'].values[0])
        new_semantic_type = str(proposed_atc['Semantic Type'].values[0])
        new_level = str(proposed_atc['ATC LEVEL'].values[0])
        doc['ATC'] = new_atc_id
        doc['semantic_type'] = new_semantic_type
        doc['ATC_level'] = new_level
        atc_appended.add(new_atc_id)
        num +=1
    elif len(proposed_atc)<1:
        try:
            alternative_proposed_atc = atc_df[atc_df['Synonyms'].str.contains(doc['term'], na=False, case = False,regex= False)]
            if len(alternative_proposed_atc)>0 and len(alternative_proposed_atc)<2:
                new_atc_id = str(alternative_proposed_atc['ATC'].values[0])
                new_semantic_type = str(alternative_proposed_atc['Semantic Type'].values[0])
                new_level = str(alternative_proposed_atc['ATC LEVEL'].values[0])
                doc['ATC'] = new_atc_id
                doc['semantic_type'] = new_semantic_type
                doc['ATC_level'] = new_level
                atc_appended.add(new_atc_id)
                num +=1
            elif len(alternative_proposed_atc)>1:
                all_terms=[]
                for j in alternative_proposed_atc.itertuples(index=False):
                    all_terms.append(j[1])
                    all_terms += [f for f in j[2].split('|')]

                    if doc['term'] in all_terms:
                        doc['ATC'] = j[0]
                        doc['semantic_type'] = j[4]
                        doc['ATC_level'] =j[3]
                        atc_appended.add(j[0])
                        num +=1
                    elif similar(doc['term'],j[0])>0.92:
                        doc['ATC'] = j[0]
                        doc['semantic_type'] = j[4]
                        doc['ATC_level'] =j[3]
                        atc_appended.add(j[0])
                        num +=1
                    all_terms=[]
            else:
                doc['ATC'] = ''
                doc['ATC_level'] = ''
                    
        except:
            print('Error in:',doc['term'])


        
            
    i +=1
    if i%30000 == 0:
        print(i)
        print('No of ATC ids linked:',num)

30000
No of ATC ids linked: 69
60000
No of ATC ids linked: 218
90000
No of ATC ids linked: 233
120000
No of ATC ids linked: 355
150000
No of ATC ids linked: 407
180000
No of ATC ids linked: 458
210000
No of ATC ids linked: 465
240000
No of ATC ids linked: 483
270000
No of ATC ids linked: 483
300000
No of ATC ids linked: 540
330000
No of ATC ids linked: 815
360000
No of ATC ids linked: 979
390000
No of ATC ids linked: 1179


## Append non linked ATC terms

In [55]:
len(atc_appended)

1115

In [56]:
atc_appended=list(atc_appended)

In [57]:
df_not_linked = atc_df[~atc_df['ATC'].isin(atc_appended)]

In [58]:
df_not_linked.fillna('',inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [59]:
drugs[0]

{'term': 'Acetylcarnitine',
 'synonyms': ['Acetyl-DL-carnitine',
  'acetylcarnitine',
  'DL-O-Acetylcarnitine',
  'DL-Acetylcarnitine',
  'O-acetylcarnitine',
  '3-acetyloxy-4-(trimethylazaniumyl)butanoate',
  '(+/-)-acetylcarnitine',
  '3-(acetyloxy)-4-(trimethylazaniumyl)butanoate',
  'Ammonium, (3-carboxy-2-hydroxypropyl)trimethyl-, hydroxide, inner salt, acetate, DL-',
  '14992-62-2',
  '870-77-9',
  '1-Propanaminium, 2-(acetoxy)-3-carboxy-N,N,N-trimethyl-, hydroxide, inner salt, (+-)- (9CI)',
  'L-ACETYLCARNITINE',
  '1-Propanaminium, 2-(acetyloxy)-3-carboxy-N,N,N-trimethyl-, inner salt',
  '3040-38-8',
  'bmse000142',
  'SCHEMBL69781',
  'DTXSID2048117',
  'CHEBI:73024',
  'LMFA07070060',
  'HY-126358',
  '3-(acetyloxy)-4-(trimethylammonio)butanoate',
  'CS-0102945',
  'FT-0778235',
  'Q27140241',
  '3-(acetyloxy)-4-(trimethylazaniumyl)butanoate',
  'acetylcarnitine',
  '3-(acetyloxy)-4-(trimethylammonio)butanoate',
  'Acetyl-DL-carnitine',
  'DL-O-Acetylcarnitine'],
 'mesh_headi

In [60]:
df_not_linked

Unnamed: 0,ATC,Term,Synonyms,ATC LEVEL,Semantic Type
0,A03AX13,silicones,,5,Biomedical or Dental Material
1,B05AA06,gelatin agents,,5,Biomedical or Dental Material
2,V07AA,Plasters,,4,Biomedical or Dental Material
3,A01AA,Caries prophylactic agents,,4,Biomedical or Dental Material
4,V03AE01,polystyrene sulfonate,,5,Biomedical or Dental Material
...,...,...,...,...,...
6326,A07FA02,saccharomyces boulardii,,5,Fungus
6327,B05AX03,blood plasma,,5,Body Substance
6328,G01AX14,lactobacillus,,5,Bacterium
6329,D08AL30,silver,,5,"Element, Ion, or Isotope"


In [62]:
for i in df_not_linked.itertuples(index=False):
    new_doc = {}
    new_doc['term'] = i[1]
    
    synonyms = []
    if len(i[2])>0:
        synonyms += i[2].split("|")
    new_doc['synonyms']= synonyms

    new_doc['mesh_headings'] = ''
    new_doc['mesh_id'] = ''
    new_doc['semantic_type'] = i[4]
    new_doc['cid'] = ''
    new_doc['chebi_id'] = ''
    new_doc['inchikey'] = ''
    new_doc['cross_references'] = ''
    new_doc['ATC'] = i[0]
    new_doc['ATC_level'] = i[3]
    
    drugs.append(new_doc)

In [63]:
print(len(drugs))

404082


In [64]:
with open('data/Chemical/drugs.json', 'w') as fout:
    json.dump(drugs, fout)

In [44]:
with open('data/Chemical/drugs.json', 'r') as fout:
    drugs = json.loads(fout.read())
    print(len(drugs))

398866


In [15]:
import pysolr
import math

solr = pysolr.Solr('http://localhost:8983/solr/gettingstarted', timeout=10)

In [77]:
solr.ping()

'{\n  "responseHeader":{\n    "zkConnected":null,\n    "status":0,\n    "QTime":25,\n    "params":{\n      "q":"{!lucene}*:*",\n      "distrib":"false",\n      "df":"_text_",\n      "rows":"10",\n      "echoParams":"all",\n      "rid":"-4"}},\n  "status":"OK"}\n'

In [74]:
import pysolr
import math

solr = pysolr.Solr('http://librairy.linkedddata.es/data/atc', timeout=2)

documents = []
num = 0;
for i in atc_data.itertuples(index=False):
    atc_document = {}
    if (len(i[0]) > 0 ):
        atc_document['id']= i[0].split("/")[-1]
    if (len(i[1]) > 0 ):
        atc_document['label_t']= i[1].lower()
    synonyms = []
    if (len(i[2]) > 0 ):
        synonyms.append(i[2].lower())
    atc_document['synonyms']= synonyms
    if (len(i[3]) > 0 ):
        atc_document['cui_s']= i[3]
    if (len(i[4]) > 0 ):
        atc_document['parent_s']= i[4].split("/")[-1]
    if (len(i[5]) > 0 ):
        atc_document['level_i']= i[5]    
    documents.append(atc_document) 
    num+=1
    if (len(documents) > 0 and len(documents) %100 == 0):
        solr.add(documents)
        solr.commit()
        documents = []
        print(num,"drugs added")

100 drugs added
200 drugs added
300 drugs added
400 drugs added
500 drugs added
600 drugs added
700 drugs added
800 drugs added
900 drugs added
1000 drugs added
1100 drugs added
1200 drugs added
1300 drugs added
1400 drugs added
1500 drugs added
1600 drugs added
1700 drugs added
1800 drugs added
1900 drugs added
2000 drugs added
2100 drugs added
2200 drugs added
2300 drugs added
2400 drugs added
2500 drugs added
2600 drugs added
2700 drugs added
2800 drugs added
2900 drugs added
3000 drugs added
3100 drugs added
3200 drugs added
3300 drugs added
3400 drugs added
3500 drugs added
3600 drugs added
3700 drugs added
3800 drugs added
3900 drugs added
4000 drugs added
4100 drugs added
4200 drugs added
4300 drugs added
4400 drugs added
4500 drugs added
4600 drugs added
4700 drugs added
4800 drugs added
4900 drugs added
5000 drugs added
5100 drugs added
5200 drugs added
5300 drugs added
5400 drugs added
5500 drugs added
5600 drugs added
5700 drugs added
5800 drugs added
5900 drugs added
6000 d

# Adding CIMA Codes

In [75]:
drugs_data = pd.read_csv(
    "data/drugs.csv",      # relative python path to subdirectory
    sep=';',           # Tab-separated value file.
    quotechar="\"",        # single quote allowed as quote character
    usecols=['Nº Registro', 'Medicamento', 'Cód. ATC']
)

print(drugs_data.shape) 
drugs_data.head()



(23407, 3)


Unnamed: 0,Nº Registro,Medicamento,Cód. ATC
0,40537,INACID 25 mg CAPSULAS DURAS,M01AB01
1,7235,PASSIFLORINE SOLUCION ORAL,N05CM
2,18329,TIOBARBITAL BRAUN 1 G,N01AF03
3,33561,DIHYDERGOT COMPRIMIDOS,N02CA01
4,34369,GLUCOSADA GRIFOLS 5% SOLUCION PARA PERFUSION,B05BA03


In [76]:
atc_regs = drugs_data.groupby('Cód. ATC')['Nº Registro'].apply(list).reset_index()
print(atc_regs)

     Cód. ATC                                       Nº Registro
0     A01AA01                                    [57745, 57746]
1       A01AB  [38791, 34484, 35528, 12912, 45563, 9957, 29898]
2     A01AB03                                    [84343, 80642]
3     A01AB09                                    [55969, 55962]
4     A01AB19                                           [39783]
...       ...                                               ...
2227    V10XX                              [05322001, 05322002]
2228  V10XX02                                        [03264001]
2229  V10XX03                                       [113873001]
2230  V10XX04                                      [1171226001]
2231   XXXXXX               [75668, 75670, 75659, 75661, 75663]

[2232 rows x 2 columns]


In [79]:
import pysolr

solr = pysolr.Solr('http://librairy.linkeddata.es/data/atc', timeout=2)

def add_num_regs(atc_code,num_regs):
    solr_query = "id:"+str(atc_code)
    for result in solr.search(q=solr_query,rows=1):
        result['cima_codes']=num_regs
        del (result['_version_'])
        return result
    return {}

sample_doc = {
  "id": "1212121212112121212",
  "cima_codes":["a","b","c"]
}
solr.add([sample_doc])
solr.commit()
solr.delete(id=sample_doc['id'])
solr.commit()
updated_drugs = []
num = 0;
for i in atc_regs.itertuples():
    atc_code = i[1]
    updated_drugs.append(add_num_regs(atc_code,[str(code) for code in i[2]])) 
    num+=1
    if (len(updated_drugs) == 0):
        solr.add(updated_drugs)
        solr.commit()
        updated_drugs = []
        print(num,"drugs updated")

100 drugs updated
200 drugs updated
300 drugs updated
400 drugs updated
500 drugs updated
600 drugs updated
700 drugs updated
800 drugs updated
900 drugs updated
1000 drugs updated
1100 drugs updated
1200 drugs updated
1300 drugs updated
1400 drugs updated
1500 drugs updated
1600 drugs updated
1700 drugs updated
1800 drugs updated
1900 drugs updated
2000 drugs updated
2100 drugs updated
2200 drugs updated


# Adding medical names

In [17]:
import pandas as pd

md_df = pd.read_excel('data/Medicines_output_european_public_assessment_reports.xlsx', skiprows=8, keep_default_na=False)
md_df.head()

Unnamed: 0,Category,Medicine name,Therapeutic area,International non-proprietary name (INN) / common name,Active substance,Product number,Patient safety,Authorisation status,ATC code,Additional monitoring,...,Vet pharmacotherapeutic group,Date of opinion,Decision date,Revision number,Condition / indication,Species,ATCvet code,First published,Revision date,URL
0,Human,Eliquis,"Arthroplasty, Venous Thromboembolism",apixaban,Apixaban,EMEA/H/C/002148,yes,Authorised,B01AF02,no,...,,,2020-03-26 01:00:00,20,For Eliquis 2.5 mg film-coated tablets:Prevent...,,,2018-08-03 16:27:00,2020-04-22 17:05:00,https://www.ema.europa.eu/en/medicines/human/E...
1,Human,Temozolomide Sun,"Glioma, Glioblastoma",temozolomide,temozolomide,EMEA/H/C/002198,no,Authorised,L01AX03,no,...,,,2020-03-11 01:00:00,16,Temozolomide Sun is indicated for the treatmen...,,,2018-08-03 00:00:00,2020-04-22 16:34:00,https://www.ema.europa.eu/en/medicines/human/E...
2,Human,Atazanavir Mylan,HIV Infections,atazanavir,atazanavir sulfate,EMEA/H/C/004048,no,Authorised,J05AE08,no,...,,,2020-03-12 01:00:00,8,"Atazanavir Mylan, co-administered with low dos...",,,2018-05-03 17:20:00,2020-04-22 16:00:00,https://www.ema.europa.eu/en/medicines/human/E...
3,Human,Vivanza,Erectile Dysfunction,vardenafil,vardenafil,EMEA/H/C/000488,no,Authorised,G04BE09,no,...,,,2020-03-19 01:00:00,27,Treatment of erectile dysfunction in adult men...,,,2018-04-19 00:00:00,2020-04-22 16:00:00,https://www.ema.europa.eu/en/medicines/human/E...
4,Human,Juluca,HIV Infections,dolutegravir / rilpivirine,"dolutegravir sodium, rilpivirine hydrochloride",EMEA/H/C/004427,yes,Authorised,J05AR,yes,...,,2018-03-22 01:00:00,2020-04-01 00:00:00,2,Juluca is indicated for the treatment of human...,,,2018-05-16 12:26:00,2020-04-22 15:44:00,https://www.ema.europa.eu/en/medicines/human/E...


In [30]:
import pysolr
import html2text
import urllib.request

solr = pysolr.Solr('http://librairy.linkeddata.es/data/atc', timeout=20)

h = html2text.HTML2Text()
h.ignore_links = True

def add_field_in_list(resource,field,value):
    if (len(value) > 0 ):
        if (not field in resource):
            resource[field] = []
        resource[field].append(value)  

def get_txt(url):
    fp = urllib.request.urlopen(url)
    mybytes = fp.read()
    mystr = mybytes.decode("utf8")
    fp.close()
    return h.handle(mystr)

documents = []
i = 1
for row in md_df.itertuples():
    atc_code = row[9]
    if (len(atc_code) > 1):
        for drug in solr.search("id:"+atc_code):
            add_field_in_list(drug,'category',row[1])
            add_field_in_list(drug,'medicines',row[2])
            add_field_in_list(drug,'therapeutic_area',row[3])
            add_field_in_list(drug,'emea_codes',row[6])
            add_field_in_list(drug,'overviews',row[30])
            solr.add([drug])
            solr.commit()
            #print(drug)
            if (i % 100 == 0):
                print(i,"drugs updated")
            i+=1

100 drugs updated
200 drugs updated
300 drugs updated
400 drugs updated
500 drugs updated
600 drugs updated
700 drugs updated
800 drugs updated
900 drugs updated
1000 drugs updated
1100 drugs updated
1200 drugs updated
1300 drugs updated
