In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from pandas.tseries.offsets import DateOffset
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import MultiLabelBinarizer

In [50]:
# Load the two drug tables: drugs given per oncological treatment line, and
# the non-oncological drugs recorded per patient.
input_path = 'Input/'

t_line_drug= input_path + 'patient_oncologicalTreatmentLine_drugs.csv'
d_no_onco = input_path + '02_patient_nonOncologycalTreatments.csv'

treatment_line_drug = pd.read_csv(t_line_drug, delimiter=",")
drug_no_onco = pd.read_csv(d_no_onco, delimiter=",")

# Lower-case the drug names so the same drug always compares equal.
treatment_line_drug['drug_name'] = treatment_line_drug['drug_name'].str.lower()
drug_no_onco['drug_name'] = drug_no_onco['drug_name'].str.lower()

# Spelling variants to collapse onto a single canonical name.
replacement_mapping_dict = {
    'enalaprile': 'enalapril',
    'atenolole': 'atenolol'
}
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: the in-place form acts on an intermediate Series and is
# not guaranteed to update the DataFrame (it never does under Copy-on-Write).
drug_no_onco['drug_name'] = drug_no_onco['drug_name'].replace(replacement_mapping_dict)

display(treatment_line_drug.shape, treatment_line_drug.head())
display(drug_no_onco.shape, drug_no_onco.head())

(1874, 3)

Unnamed: 0,patient_id,oncologicalTreatmentLine_id,drug_name
0,1003706,1,cisplatin
1,1003706,1,pemetrexed
2,1003706,2,docetaxel
3,1007602,3,carboplatin
4,1007602,3,etoposide


(1322, 2)

Unnamed: 0,patient_id,drug_name
0,1098173,omeprazole
1,1098173,ranitidine
2,1100890,omeprazole
3,1103334,omeprazole
4,1103334,enalapril


# Select the drugs for each treatment line

In [51]:
# Attach every non-oncological drug of a patient to each of that patient's
# treatment lines. The line table has one row per (line, drug), so the join
# repeats rows; exact duplicates are dropped afterwards.
line_keys = treatment_line_drug[['patient_id', 'oncologicalTreatmentLine_id']]
non_onco_per_line = (
    line_keys
    .merge(drug_no_onco, how='inner', on=['patient_id'])
    .drop_duplicates(keep='first')
)

# Stack the oncological drugs underneath and order the rows by treatment line.
cancer_treatment = (
    pd.concat([non_onco_per_line, treatment_line_drug])
    .sort_values(by=['oncologicalTreatmentLine_id'])
)
cancer_treatment

Unnamed: 0,patient_id,oncologicalTreatmentLine_id,drug_name
0,1003706,1,cisplatin
1,1003706,1,pemetrexed
2,1003706,2,docetaxel
3,1007602,3,carboplatin
4,1007602,3,etoposide
...,...,...,...
2739,994263,1236,simvastatin
1871,994263,1236,docetaxel
1872,998317,1237,cisplatin
2740,998317,1237,omeprazole


In [52]:
# One row per (patient, treatment line); every remaining column is collapsed
# into a Python list holding that group's values.
group_keys = ['patient_id', 'oncologicalTreatmentLine_id']
df_drug = (
    cancer_treatment
    .groupby(by=group_keys)
    .agg(lambda column: column.tolist())
    .reset_index()
)
df_drug

Unnamed: 0,patient_id,oncologicalTreatmentLine_id,drug_name
0,3561,768,"[acetaminophen, corticoids, etoposide, carbopl..."
1,3877,787,"[etoposide, atenolol, carboplatin, omeprazole]"
2,6203,949,"[vinorelbine, cisplatin, omeprazole]"
3,6203,950,"[crizotinib, omeprazole]"
4,8359,1172,"[carboplatin, furosemide]"
...,...,...,...
1232,2819894,715,"[enalapril, pembrolizumab]"
1233,2819955,716,"[omeprazole, vinorelbine, carboplatin]"
1234,2819955,717,"[omeprazole, gemcitabine, carboplatin]"
1235,2822207,718,"[cisplatin, vinorelbine, omeprazole]"


In [53]:
# Every distinct drug name in the treatment-line table counts as oncological.
# Defined in this cell so it does not rely on a cell further down the
# notebook having been executed first.
onco_drug = set(treatment_line_drug.drug_name.unique())

# Alphabetically sorted drug list per treatment line, and the sorted subset of
# that list that is oncological.
sorted_drugs = df_drug['drug_name'].map(sorted)
onco_in_line = sorted_drugs.map(lambda drugs: sorted(onco_drug.intersection(drugs)))

df_drug = (
    df_drug
    .assign(
        drug_name=sorted_drugs,
        # Total number of drugs listed for the line.
        n_drugs=sorted_drugs.map(len).astype(int),
        # Number of distinct oncological drugs in the line.
        n_drug_onco=onco_in_line.map(len).astype(int),
        # Oncological drugs as a single comma-separated string.
        oncological_drug=onco_in_line.map(', '.join),
    )
    # Lines with the most oncological drugs first, ties broken by total drugs.
    .sort_values(by=['n_drug_onco', 'n_drugs'], ascending=False)
)
df_drug

Unnamed: 0,patient_id,oncologicalTreatmentLine_id,drug_name,n_drugs,n_drug_onco,oncological_drug
262,429640,814,"[cisplatin, docetaxel, pemetrexed, vinorelbine]",4,4,"cisplatin, docetaxel, pemetrexed, vinorelbine"
575,816028,1158,"[carboplatin, dexamethasone, lorazepam, omepra...",6,3,"carboplatin, paclitaxel, pemetrexed"
233,375428,781,"[antibiotics, carboplatin, paclitaxel, vinorel...",4,3,"carboplatin, paclitaxel, vinorelbine"
601,865317,1189,"[cisplatin, docetaxel, omeprazole, pemetrexed]",4,3,"cisplatin, docetaxel, pemetrexed"
614,925591,1207,"[cisplatin, gemcitabine, vinorelbine]",3,3,"cisplatin, gemcitabine, vinorelbine"
...,...,...,...,...,...,...
1201,2777603,683,[docetaxel],1,1,docetaxel
1209,2785277,691,[afatinib],1,1,afatinib
1216,2800873,699,[carboplatin],1,1,carboplatin
1218,2811038,701,[carboplatin],1,1,carboplatin


In [54]:
# Keep one row per distinct (drug list, oncological drugs) combination.
# Lists are unhashable, so duplicates are detected on the string form of
# each cell; the first occurrence of every combination is retained.
treatment = df_drug[['drug_name', 'oncological_drug']]
is_repeat = treatment.astype(str).duplicated()
treatment = treatment[~is_repeat].reset_index(drop=True)
treatment

Unnamed: 0,drug_name,oncological_drug
0,"[cisplatin, docetaxel, pemetrexed, vinorelbine]","cisplatin, docetaxel, pemetrexed, vinorelbine"
1,"[carboplatin, dexamethasone, lorazepam, omepra...","carboplatin, paclitaxel, pemetrexed"
2,"[antibiotics, carboplatin, paclitaxel, vinorel...","carboplatin, paclitaxel, vinorelbine"
3,"[cisplatin, docetaxel, omeprazole, pemetrexed]","cisplatin, docetaxel, pemetrexed"
4,"[cisplatin, gemcitabine, vinorelbine]","cisplatin, gemcitabine, vinorelbine"
...,...,...
535,[pembrolizumab],pembrolizumab
536,[necitumumab],necitumumab
537,[crizotinib],crizotinib
538,[alectinib],alectinib


In [55]:
# Persist the distinct treatment combinations. The target folder is created
# first because DataFrame.to_csv does not create missing parent directories.
# NOTE(review): drug_name holds Python lists, so the CSV presumably stores
# their string representation - confirm that downstream readers expect this.
output_file = Path('treatments/cancer_treatments.csv')
output_file.parent.mkdir(parents=True, exist_ok=True)
treatment.to_csv(output_file, index=False)

In [49]:
# Every distinct drug name in the treatment-line table counts as oncological.
onco_drug = set(treatment_line_drug.drug_name.unique())

# Recompute, row by row, the comma-separated list of oncological drugs that
# appear in each treatment's drug list.
# NOTE(review): `treatment` is assigned in the cell executed as In [54],
# while this cell carries execution count 49 - confirm the intended order.
for row in range(len(treatment)):
    shared = sorted(onco_drug & set(treatment.drug_name[row]))
    treatment.at[row, 'oncological_drug'] = ', '.join(shared)

treatment

Unnamed: 0,drug_name,oncological_drug
0,"[allopurinol, atorvastatin, carboplatin, genta...","carboplatin, vinorelbine"
1,"[acetaminophen, allopurinol, amlodipine, atorv...","carboplatin, vinorelbine"
2,"[acetaminophen, amlodipine, antibiotics, atorv...","carboplatin, paclitaxel"
3,"[allopurinol, atorvastatin, cisplatin, gentami...","cisplatin, vinorelbine"
4,"[allopurinol, atenolol, carboplatin, etoposide...","carboplatin, etoposide"
...,...,...
535,[bevacizumab],bevacizumab
536,[cisplatin],cisplatin
537,[necitumumab],necitumumab
538,[alectinib],alectinib
