### Preparing TWOSIDES for GNN
1. First we select only the TWOSIDES PSEs that are in SIDERs for the GNN
2. Then We split TWOSIDES into drugs that are available in SIDER

In [18]:
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

In [2]:
ts = pd.read_csv('../data/TWOSIDES-short.csv', sep=',')
sider = pd.read_csv('../data/se_all.csv', sep=',')
ts

Unnamed: 0,Drug1,Drug2,PSE
0,Temazepam,sildenafil,Arthralgia
1,Bumetanide,Oxytocin,Arthralgia
2,POLYETHYLENE GLYCOL 3350,Hydroxychloroquine,Arthralgia
3,Tamoxifen,Prednisone,Diarrhoea
4,Temazepam,sildenafil,Diarrhoea
...,...,...,...
42920386,Ketoprofen,montelukast,Activities of daily living impaired
42920387,Ketoprofen,montelukast,Impaired work ability
42920388,Ketoprofen,montelukast,Off label use
42920389,Ketoprofen,montelukast,Hypertension


In [3]:
sider

Unnamed: 0,DrugID,SEID,SE name
0,CID100000085,C0000729,Abdominal cramps
1,CID100000085,C0000737,Abdominal pain
2,CID100000085,C0151736,Accidental injury
3,CID100000085,C0002418,Amblyopia
4,CID100000085,C0002871,Anaemia
...,...,...,...
138894,CID171306834,C0042963,Vomiting
138895,CID171306834,C3665596,Warts
138896,CID171306834,C0043096,Weight decreased
138897,CID171306834,C0043094,Weight increased


In [4]:
SEs = sider['SE name'].unique().tolist()
PSEs = ts['PSE'].unique().tolist()
print('SIDERS:',len(SEs),', TWOSIDES:',len(PSEs))

SIDERS: 5805 , TWOSIDES: 12726


 We will select side efects that are present in all ADR, PSE and SIDERS for the final model.

In [7]:
# Makes lower case list of SE names
SEs_lower = []
PSEs_lower = [] 
for SE in SEs: SEs_lower.append(SE.lower())
for SE in PSEs: PSEs_lower.append(SE.lower())
    
SE_final = []  # Candicate PSEs for GNN  
tmp = []

for index, SE in enumerate(PSEs_lower):
    if SE in SEs_lower:
        SE_final.append(PSEs[index])
    else:
        tmp.append(PSEs[index])
print('Only',len(SE_final),'side effect out of',len(PSEs), 'will be used at the model.')

Only 4029 side effect out of 8697 will be used at the model.


In [9]:
SE_final

['Arthralgia',
 'Diarrhoea',
 'Headache',
 'Vomiting',
 'Dyspepsia',
 'Renal impairment',
 'Drug hypersensitivity',
 'Cough',
 'Throat irritation',
 'Rhinorrhoea',
 'Malaise',
 'Hepatic enzyme increased',
 'Back pain',
 'Fluid retention',
 'Oedema peripheral',
 'Oedema',
 'Death',
 'Cerebrovascular accident',
 'Pain',
 'Wound',
 'Fatigue',
 'Gastrooesophageal reflux disease',
 'Sinusitis',
 'Urinary tract infection',
 'Fall',
 'Ocular icterus',
 'Bone pain',
 'Nausea',
 'Paraesthesia',
 'General physical health deterioration',
 'Abdominal pain',
 'Anxiety',
 'Emotional distress',
 'Injury',
 'Memory impairment',
 'Rash',
 'Alanine aminotransferase increased',
 'Drug eruption',
 'Chest pain',
 'Contusion',
 'Dyspnoea',
 'Stress',
 'Asthenia',
 'Depression',
 'Dysgeusia',
 'Weight decreased',
 'Pruritus',
 'Feeling abnormal',
 'Muscle spasms',
 'Nightmare',
 'Convulsion',
 'Hyperhidrosis',
 'Disturbance in attention',
 'Urinary retention',
 'Dizziness',
 'Haemoglobin decreased',
 'White 

In [19]:
tmp = ts['PSE'].tolist()
df = []
for SE in tqdm(SE_final):
    if SE in tmp:
        row =  {'PSE': SE, 'Count': tmp.count(SE)}
        df.append(row)    
    else: pass

SE_count = pd.DataFrame.from_dict(df)
SE_count

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4029.0), HTML(value='')))


Wall time: 37min 35s


Unnamed: 0,PSE,Count
0,Arthralgia,119763
1,Diarrhoea,156590
2,Headache,138533
3,Vomiting,153059
4,Dyspepsia,63487
...,...,...
4024,Bladder necrosis,1
4025,Tibial torsion,1
4026,Foot and mouth disease,1
4027,Neuromuscular toxicity,1


In [51]:
# PSEs that will be used to build the GNN
SEs_final = SE_count.loc[SE_count['Count'] > SE_count['Count'].mean()] # x0.5 for 1830
SEs_final = SEs_final['PSE'].tolist()
len(SEs_final)

964

In [52]:
SEs_final

['Arthralgia',
 'Diarrhoea',
 'Headache',
 'Vomiting',
 'Dyspepsia',
 'Renal impairment',
 'Drug hypersensitivity',
 'Cough',
 'Throat irritation',
 'Rhinorrhoea',
 'Malaise',
 'Hepatic enzyme increased',
 'Back pain',
 'Fluid retention',
 'Oedema peripheral',
 'Oedema',
 'Death',
 'Cerebrovascular accident',
 'Pain',
 'Wound',
 'Fatigue',
 'Gastrooesophageal reflux disease',
 'Sinusitis',
 'Urinary tract infection',
 'Fall',
 'Bone pain',
 'Nausea',
 'Paraesthesia',
 'General physical health deterioration',
 'Abdominal pain',
 'Anxiety',
 'Emotional distress',
 'Injury',
 'Memory impairment',
 'Rash',
 'Alanine aminotransferase increased',
 'Drug eruption',
 'Chest pain',
 'Contusion',
 'Dyspnoea',
 'Stress',
 'Asthenia',
 'Depression',
 'Dysgeusia',
 'Weight decreased',
 'Pruritus',
 'Feeling abnormal',
 'Muscle spasms',
 'Nightmare',
 'Convulsion',
 'Hyperhidrosis',
 'Disturbance in attention',
 'Urinary retention',
 'Dizziness',
 'Haemoglobin decreased',
 'White blood cell count de

In [53]:
# list of every drug in TWOSIDES
tmp = ts['Drug1'].unique().tolist() + ts['Drug2'].unique().tolist()
ts_drugs = []
[ts_drugs.append(drug) for drug in tmp if drug not in ts_drugs]
print(len(tmp), len(ts_drugs))

3558 1918


In [54]:
# SIDER's drug list
drugs = pd.read_csv('../data/drug_names.tsv', sep='\t')
drug_names = drugs['name'].tolist()
len(drug_names)

1430

In [55]:
# Makes lower case list of drug names
ts_drugs_lower = [] 
for drug in ts_drugs:
    ts_drugs_lower.append(drug.lower())

drug_names_lower = []
for drug in drug_names:
    drug_names_lower.append(drug.lower()) 

In [56]:
train_drugs = []  # Drugs in SIDER, will be used to build the GNN 
eval_drugs = []   # Drugs not in SIDER, will be used to evaluate the GNN 

for index, drug in enumerate(ts_drugs_lower):
    if drug in drug_names_lower:
        train_drugs.append(ts_drugs[index])
    else:
        eval_drugs.append(ts_drugs[index])
print(len(train_drugs),len(eval_drugs))
        

953 965


In [57]:
%%time

train_list = []  # Drug 1 & 2 are in SIDER => model building
asses_list = []  # Drug 1 or 2 are in SIDER => model assesment 
eval_list = []   # Drug 1 & 2 are not in SIDER => model evaluation

for i in tqdm(range(len(ts))): 
    if ts['PSE'][i] in SEs_final:
        row =  {'Drug1': ts['Drug1'][i], 'Drug2': ts['Drug2'][i], 'PSE': ts['PSE'][i]} 

        if ts['Drug1'][i] in train_drugs and ts['Drug2'][i] in train_drugs:
            train_list.append(row)       
        elif ts['Drug1'][i] in eval_drugs and ts['Drug2'][i] in eval_drugs:
            eval_list.append(row)
        else:
            asses_list.append(row)
    else:
        pass
        
ts_train = pd.DataFrame.from_dict(train_list)
ts_asses = pd.DataFrame.from_dict(asses_list)
ts_eval = pd.DataFrame.from_dict(eval_list)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=42920391.0), HTML(value='')))


Wall time: 37min 44s


In [49]:
print(len(ts_train)/len(ts),len(ts_asses)/len(ts),len(ts_eval)/len(ts))

0.47247512260547675 0.27793742605932925 0.04368140541869714


In [58]:
ts_train.to_csv('../data/TWOSIDE-train-PSE-964.csv', index=False, sep = ',')
ts_asses.to_csv('../data/TWOSIDE-assesment-PSE-964.csv', index=False, sep = ',')
ts_eval.to_csv('../data/TWOSIDE-evaluation-PSE-964.csv', index=False, sep = ',')

In [59]:
ts_train

Unnamed: 0,Drug1,Drug2,PSE
0,Temazepam,sildenafil,Arthralgia
1,Bumetanide,Oxytocin,Arthralgia
2,Tamoxifen,Prednisone,Diarrhoea
3,Temazepam,sildenafil,Diarrhoea
4,Bumetanide,Oxytocin,Diarrhoea
...,...,...,...
18492593,Ketoprofen,montelukast,Pulmonary embolism
18492594,Ketoprofen,montelukast,Cough
18492595,Ketoprofen,montelukast,Rash
18492596,Ketoprofen,montelukast,Alopecia
