### Preparing TWOSIDES for GNN
1. First we select only the TWOSIDES PSEs that are in SIDERs for the GNN
2. Then We split TWOSIDES into drugs that are available in SIDER

In [1]:
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

In [2]:
ts = pd.read_csv('../data/TWOSIDES-short.csv', sep=',')
sider = pd.read_csv('../data/se_all.csv', sep=',')
ts

Unnamed: 0,Drug1,Drug2,PSE
0,Temazepam,sildenafil,Arthralgia
1,Bumetanide,Oxytocin,Arthralgia
2,POLYETHYLENE GLYCOL 3350,Hydroxychloroquine,Arthralgia
3,Tamoxifen,Prednisone,Diarrhoea
4,Temazepam,sildenafil,Diarrhoea
...,...,...,...
42920386,Ketoprofen,montelukast,Activities of daily living impaired
42920387,Ketoprofen,montelukast,Impaired work ability
42920388,Ketoprofen,montelukast,Off label use
42920389,Ketoprofen,montelukast,Hypertension


In [3]:
sider

Unnamed: 0,DrugID,SEID,SE name
0,CID100000085,C0000729,Abdominal cramps
1,CID100000085,C0000737,Abdominal pain
2,CID100000085,C0151736,Accidental injury
3,CID100000085,C0002418,Amblyopia
4,CID100000085,C0002871,Anaemia
...,...,...,...
138894,CID171306834,C0042963,Vomiting
138895,CID171306834,C3665596,Warts
138896,CID171306834,C0043096,Weight decreased
138897,CID171306834,C0043094,Weight increased


In [4]:
SEs = sider['SE name'].unique().tolist()
PSEs = ts['PSE'].unique().tolist()
print('SIDERS:',len(SEs),', TWOSIDES:',len(PSEs))

SIDERS: 5805 , TWOSIDES: 12726


#### We will select side efects that are present in all ADR, PSE and SIDERS for the final model.

In [5]:
# Makes lower case list of SE names
SEs_lower = []
PSEs_lower = [] 
for SE in SEs: SEs_lower.append(SE.lower())
for SE in PSEs: PSEs_lower.append(SE.lower())
    
SE_final = []  # Candicate PSEs for GNN  
tmp = []

for index, SE in enumerate(PSEs_lower):
    if SE in SEs_lower:
        SE_final.append(PSEs[index])
    else:
        tmp.append(PSEs[index])
print('Only',len(SE_final),'side effect out of',len(PSEs), 'will be used at the model.')

Only 4029 side effect out of 12726 will be used at the model.


In [6]:
SE_final

['Arthralgia',
 'Diarrhoea',
 'Headache',
 'Vomiting',
 'Dyspepsia',
 'Renal impairment',
 'Drug hypersensitivity',
 'Cough',
 'Throat irritation',
 'Rhinorrhoea',
 'Malaise',
 'Hepatic enzyme increased',
 'Back pain',
 'Fluid retention',
 'Oedema peripheral',
 'Oedema',
 'Death',
 'Cerebrovascular accident',
 'Pain',
 'Wound',
 'Fatigue',
 'Gastrooesophageal reflux disease',
 'Sinusitis',
 'Urinary tract infection',
 'Fall',
 'Ocular icterus',
 'Bone pain',
 'Nausea',
 'Paraesthesia',
 'General physical health deterioration',
 'Abdominal pain',
 'Anxiety',
 'Emotional distress',
 'Injury',
 'Memory impairment',
 'Rash',
 'Alanine aminotransferase increased',
 'Drug eruption',
 'Chest pain',
 'Contusion',
 'Dyspnoea',
 'Stress',
 'Asthenia',
 'Depression',
 'Dysgeusia',
 'Weight decreased',
 'Pruritus',
 'Feeling abnormal',
 'Muscle spasms',
 'Nightmare',
 'Convulsion',
 'Hyperhidrosis',
 'Disturbance in attention',
 'Urinary retention',
 'Dizziness',
 'Haemoglobin decreased',
 'White 

In [9]:
tmp = ts.groupby(by='PSE')   
df = []   
for SE in tqdm(SE_final):
    PSE_df = tmp.get_group(SE)
    row =  {'PSE': SE, 'Count': len(PSE_df)}
    df.append(row)    

SE_count = pd.DataFrame.from_dict(df)
SE_count

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4029.0), HTML(value='')))




Unnamed: 0,PSE,Count
0,Arthralgia,119763
1,Diarrhoea,156590
2,Headache,138533
3,Vomiting,153059
4,Dyspepsia,63487
...,...,...
4024,Bladder necrosis,1
4025,Tibial torsion,1
4026,Foot and mouth disease,1
4027,Neuromuscular toxicity,1


In [19]:
# PSEs that will be used to build the GNN
#SEs_final = SE_count.loc[SE_count['Count'] > SE_count['Count'].mean()] # x0.5 for 1830
SEs_final = SE_count.loc[SE_count['Count'] > 100] # for 1830
SEs_final = SEs_final['PSE'].tolist()
len(SEs_final)

3347

In [20]:
SEs_final

['Arthralgia',
 'Diarrhoea',
 'Headache',
 'Vomiting',
 'Dyspepsia',
 'Renal impairment',
 'Drug hypersensitivity',
 'Cough',
 'Throat irritation',
 'Rhinorrhoea',
 'Malaise',
 'Hepatic enzyme increased',
 'Back pain',
 'Fluid retention',
 'Oedema peripheral',
 'Oedema',
 'Death',
 'Cerebrovascular accident',
 'Pain',
 'Wound',
 'Fatigue',
 'Gastrooesophageal reflux disease',
 'Sinusitis',
 'Urinary tract infection',
 'Fall',
 'Ocular icterus',
 'Bone pain',
 'Nausea',
 'Paraesthesia',
 'General physical health deterioration',
 'Abdominal pain',
 'Anxiety',
 'Emotional distress',
 'Injury',
 'Memory impairment',
 'Rash',
 'Alanine aminotransferase increased',
 'Drug eruption',
 'Chest pain',
 'Contusion',
 'Dyspnoea',
 'Stress',
 'Asthenia',
 'Depression',
 'Dysgeusia',
 'Weight decreased',
 'Pruritus',
 'Feeling abnormal',
 'Muscle spasms',
 'Nightmare',
 'Convulsion',
 'Hyperhidrosis',
 'Disturbance in attention',
 'Urinary retention',
 'Dizziness',
 'Haemoglobin decreased',
 'White 

In [30]:
#cleaned TS with selected PSEs
ts_final = ts.loc[ts['PSE'].isin(SEs_final)].reset_index(drop=True) 
ts_final

Unnamed: 0,Drug1,Drug2,PSE
0,Temazepam,sildenafil,Arthralgia
1,Bumetanide,Oxytocin,Arthralgia
2,POLYETHYLENE GLYCOL 3350,Hydroxychloroquine,Arthralgia
3,Tamoxifen,Prednisone,Diarrhoea
4,Temazepam,sildenafil,Diarrhoea
...,...,...,...
36784786,Ketoprofen,montelukast,Pulmonary embolism
36784787,Ketoprofen,montelukast,Cough
36784788,Ketoprofen,montelukast,Rash
36784789,Ketoprofen,montelukast,Alopecia


In [21]:
# list of every drug in TWOSIDES
tmp = ts['Drug1'].unique().tolist() + ts['Drug2'].unique().tolist()
ts_drugs = []
[ts_drugs.append(drug) for drug in tmp if drug not in ts_drugs]
print(len(tmp), len(ts_drugs))

3558 1918


In [22]:
# SIDER's drug list
drugs = pd.read_csv('../data/drug_names.tsv', sep='\t')
drug_names = drugs['name'].tolist()
len(drug_names)

1430

In [23]:
# Makes lower case list of drug names
ts_drugs_lower = [] 
for drug in ts_drugs:
    ts_drugs_lower.append(drug.lower())

drug_names_lower = []
for drug in drug_names:
    drug_names_lower.append(drug.lower()) 

In [24]:
train_drugs = []  # Drugs in SIDER, will be used to build the GNN 
eval_drugs = []   # Drugs not in SIDER, will be used to evaluate the GNN 

for index, drug in enumerate(ts_drugs_lower):
    if drug in drug_names_lower:
        train_drugs.append(ts_drugs[index])
    else:
        eval_drugs.append(ts_drugs[index])
print(len(train_drugs),len(eval_drugs))
        

953 965


In [37]:
%%time

train_list = []  # Drug 1 & 2 are in SIDER => model building
asses_list = []  # Drug 1 or 2 are in SIDER => model assesment 
eval_list = []   # Drug 1 & 2 are not in SIDER => model evaluation

for i in tqdm(range(len(ts_final))): 
    row =  {'Drug1': ts_final['Drug1'][i], 
            'Drug2': ts_final['Drug2'][i], 
            'PSE': ts_final['PSE'][i]} 

    if ts_final['Drug1'][i] in train_drugs and ts_final['Drug2'][i] in train_drugs:
        train_list.append(row)       
    elif ts_final['Drug1'][i] in eval_drugs and ts_final['Drug2'][i] in eval_drugs:
        eval_list.append(row)
    else:
        asses_list.append(row)

        
ts_train = pd.DataFrame.from_dict(train_list)
ts_asses = pd.DataFrame.from_dict(asses_list)
ts_eval = pd.DataFrame.from_dict(eval_list)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=36784791.0), HTML(value='')))


Wall time: 31min 19s


In [38]:
print(len(ts_train)/len(ts_final),len(ts_asses)/len(ts_final),len(ts_eval)/len(ts_final))

0.6007351516554763 0.345553410919203 0.053711437425320696


In [None]:
ts_train.to_csv('../data/TWOSIDE-train-PSE-3347.csv', index=False, sep = ',')
ts_asses.to_csv('../data/TWOSIDE-assesment-PSE-3347.csv', index=False, sep = ',')
ts_eval.to_csv('../data/TWOSIDE-evaluation-PSE-3347.csv', index=False, sep = ',')

In [2]:
ts_train = pd.read_csv('../data/TWOSIDE-train-PSE-3347.csv', sep=',')
drugid = pd.read_csv('../data/DrugID.csv', sep = ',')
drugid


Unnamed: 0,GraphID,DrugID,Name
0,1,85,carnitine
1,2,119,gamma-aminobutyric
2,3,137,5-aminolevulinic
3,4,143,leucovorin
4,5,146,5-methyltetrahydrofolate
...,...,...,...
1425,1426,56603655,pegaptanib
1426,1427,56842239,n-3
1427,1428,70683024,x
1428,1429,70695640,colestyramine


In [3]:
drugid_dic = {drugid['Name'][i].lower():drugid['DrugID'][i] for i in range(len(drugid))}
drugid_dic

{'carnitine': 85,
 'gamma-aminobutyric': 119,
 '5-aminolevulinic': 137,
 'leucovorin': 143,
 '5-methyltetrahydrofolate': 146,
 'pge2': 158,
 'prostacyclin': 159,
 'prostaglandin': 160,
 'acetate': 175,
 'acetylcholine': 187,
 'adenosine': 191,
 'glucose': 24748,
 'pge1': 214,
 'ammonia': 222,
 'arginine': 232,
 'benzyl': 2345,
 'betaine': 247,
 'calcium': 6116,
 'graphene': 297,
 'chloramphenicol': 298,
 'bile': 303,
 'choline': 20585,
 'citric': 311,
 'chloride': 312,
 'salicylate': 338,
 'd-cycloserine': 401,
 'bupropion': 444,
 'estradiol': 3267,
 'mannitol': 453,
 'eaca': 564,
 'n-acetylcysteine': 581,
 'cytarabine': 596,
 'mesna': 598,
 'lactate': 612,
 'dmso': 679,
 'dopamine': 681,
 'estrone': 698,
 'oestrogen': 699,
 'monoethanolamine': 700,
 'lindane': 727,
 'glutamine': 738,
 'glycine': 750,
 'glycerol': 10482134,
 'bicarbonate': 767,
 'lmwh': 772,
 'histamine': 774,
 'quinol': 785,
 'potassium': 24450,
 'kanamycin': 815,
 'epinephrine': 838,
 'thyroxine': 853,
 'triiodothyro

In [4]:
dd = ts_train['Drug1'].unique().tolist()
for drug in dd:
    if drug.lower() not in drugid_dic:
        print(drug)

In [5]:
# Conver drug names to DrugIDs
ts_train['Drug1'] = ts_train['Drug1'].str.lower()
ts_train['Drug2'] = ts_train['Drug2'].str.lower()
ts_train['Drug1'] = ts_train['Drug1'].map(drugid_dic)
ts_train['Drug2'] = ts_train['Drug2'].map(drugid_dic)
ts_train

Unnamed: 0,Drug1,Drug2,PSE
0,5391,5212,Arthralgia
1,2471,5771,Arthralgia
2,5376,4900,Diarrhoea
3,5391,5212,Diarrhoea
4,2471,5771,Diarrhoea
...,...,...,...
18492593,3825,4248,Pulmonary embolism
18492594,3825,4248,Cough
18492595,3825,4248,Rash
18492596,3825,4248,Alopecia


In [6]:
PSEs = ts_train['PSE'].unique().tolist()
x = ts_train.groupby(by=['Drug1','Drug2'])
df = []
for comb in tqdm(x.groups):
    row = {'Drug1':comb[0],'Drug2':comb[1]} 
    comb_df = x.get_group(comb)
    col = {PSE:0 for PSE in PSEs}
    for PSE in comb_df['PSE'].unique().tolist():
        col[PSE] = 1
        
    row.update(col)
    df.append(row)
    
ts = pd.DataFrame.from_dict(df)
ts


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=104222.0), HTML(value='')))




In [None]:
ts.to_csv('../data/GNN-TWOSIDE-train-PSE-3347.csv', index=False, sep = ',')

#### ================= testing ======================

In [12]:
ts_train = pd.read_csv('../data/GNN-TWOSIDE-train-PSE-964.csv', sep=',')
#PSEs = ts_train.columns.values.tolist()
#cols = {PSE:0 for PSE in PSEs}

In [40]:
#ts_train = pd.read_csv('../data/GNN-TWOSIDE-train-PSE-964.csv', sep=',')
#ts_asses = pd.read_csv('../data/TWOSIDE-assesment-PSE-964.csv', sep = ',')
#ts_eval = pd.read_csv('../data/TWOSIDE-evaluation-PSE-964.csv', sep = ',')

In [13]:
ts = ts_eval[PSEs]
ts

In [14]:
test = ts_train.copy()
test

Unnamed: 0,Drug1,Drug2,Arthralgia,Diarrhoea,Headache,Vomiting,Dyspepsia,Renal impairment,Cough,Rhinorrhoea,...,Hyperventilation,Pleuritic pain,Nail disorder,Aneurysm,Thrombophlebitis,Hypertensive crisis,Pneumonia bacterial,Hepatocellular injury,Shock haemorrhagic,Haemorrhagic stroke
0,85,232,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,85,2249,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,85,2585,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,85,3062,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,85,3446,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104217,54687131,54454,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
104218,54687131,60164,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
104219,56603655,2249,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
104220,56603655,2806,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [4]:
x = test.groupby(by=['Drug1','Drug2'])
len(x)

104222

In [11]:
x.get_group(('pegaptanib', 'Verteporfin'))

Unnamed: 0,Drug1,Drug2,PSE
2032276,pegaptanib,Verteporfin,Tachycardia
2032277,pegaptanib,Verteporfin,Dyspnoea
2032278,pegaptanib,Verteporfin,Visual acuity reduced
2032279,pegaptanib,Verteporfin,Vision blurred
2032280,pegaptanib,Verteporfin,Condition aggravated


In [30]:
df = []
for comb in tqdm(x.groups):
    row = {'Drug1':comb[0],'Drug2':comb[1]} 
    comb_df = x.get_group(comb)
    col = cols
    for PSE in comb_df.columns.values.tolist()[2:]:
        if PSE in col:
            col[PSE] = 1
    row.update(col)
    df.append(row)
    
ts = pd.DataFrame.from_dict(df)
ts

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=89632.0), HTML(value='')))




KeyboardInterrupt: 

In [44]:
ts.to_csv('../data/TWOSIDE-evaluation-PSE-964.csv', index=False, sep = ',')

In [43]:
ts

Unnamed: 0,Drug1,Drug2,Arthralgia,Diarrhoea,Headache,Vomiting,Dyspepsia,Renal impairment,Cough,Rhinorrhoea,...,Hyperventilation,Pleuritic pain,Nail disorder,Aneurysm,Thrombophlebitis,Hypertensive crisis,Pneumonia bacterial,Hepatocellular injury,Shock haemorrhagic,Haemorrhagic stroke
0,6-Aminocaproic Acid,ANTI-INHIBITOR COAGULANT COMPLEX,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,6-Aminocaproic Acid,"Albumin Human, USP",0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6-Aminocaproic Acid,Albuterol,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,6-Aminocaproic Acid,Aprotinin,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,6-Aminocaproic Acid,Azacitidine,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18111,zoledronic acid,trastuzumab,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
18112,zoledronic acid,urapidil,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
18113,zoledronic acid,vitamin D3,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
18114,zotepine,Magnesium Oxide,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
