In [1]:
import pandas as pd
import numpy as np
from pandas.tseries.offsets import DateOffset
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import MultiLabelBinarizer

In [12]:
input_path = 'Input/'

t_line_drug= input_path + 'patient_oncologicalTreatmentLine_drugs.csv'
d_no_onco = input_path + 'patient_nonOncologycalTreatments.csv'

treatment_line_drug = pd.read_csv(t_line_drug, delimiter=",")
drug_no_onco = pd.read_csv(d_no_onco, delimiter=",")

treatment_line_drug['drug_name'] = treatment_line_drug['drug_name'].str.lower()
drug_no_onco['drug_name'] = drug_no_onco['drug_name'].str.lower()

replacement_mapping_dict = {
    'enalaprile': 'enalapril',
    'atenolole': 'atenolol'
}
drug_no_onco['drug_name'].replace(replacement_mapping_dict, inplace=True)

display(treatment_line_drug.shape, treatment_line_drug.head())
display(drug_no_onco.shape, drug_no_onco.head())

(1874, 3)

Unnamed: 0,patient_id,oncologicalTreatmentLine_id,drug_name
0,1003706,1,cisplatin
1,1003706,1,pemetrexed
2,1003706,2,docetaxel
3,1007602,3,carboplatin
4,1007602,3,etoposide


(1325, 2)

Unnamed: 0,patient_id,drug_name
0,1098173,omeprazole
1,1098173,ranitidine
2,1100890,omeprazole
3,1103334,omeprazole
4,1103334,enalapril


# Select drugs by each line

In [3]:
cancer_treatment = pd.merge(treatment_line_drug[['patient_id', 'oncologicalTreatmentLine_id']], drug_no_onco, how='inner',
                          on=['patient_id'])
cancer_treatment.drop_duplicates(keep='first', inplace=True)
cancer_treatment = pd.concat([cancer_treatment, treatment_line_drug])

cancer_treatment = cancer_treatment.sort_values(by=['oncologicalTreatmentLine_id'])
cancer_treatment

Unnamed: 0,patient_id,oncologicalTreatmentLine_id,drug_name
0,1003706,1,cisplatin
1,1003706,1,pemetrexed
2,1003706,2,docetaxel
3,1007602,3,carboplatin
4,1007602,3,etoposide
...,...,...,...
2748,994263,1236,simvastatin
1871,994263,1236,docetaxel
1872,998317,1237,cisplatin
2749,998317,1237,omeprazole


In [4]:
df_drug = cancer_treatment.groupby(by=['patient_id','oncologicalTreatmentLine_id']).agg(lambda x: x.tolist()).reset_index()
df_drug

Unnamed: 0,patient_id,oncologicalTreatmentLine_id,drug_name
0,3561,768,"[acetaminophen, corticoids, etoposide, carbopl..."
1,3877,787,"[etoposide, atenolol, carboplatin, omeprazole]"
2,6203,949,"[vinorelbine, cisplatin, omeprazole]"
3,6203,950,"[crizotinib, omeprazole]"
4,8359,1172,"[carboplatin, furosemide]"
...,...,...,...
1232,2819894,715,"[enalapril, pembrolizumab]"
1233,2819955,716,"[omeprazole, vinorelbine, carboplatin]"
1234,2819955,717,"[omeprazole, gemcitabine, carboplatin]"
1235,2822207,718,"[cisplatin, vinorelbine, omeprazole]"


In [5]:
onco_drug = set(treatment_line_drug.drug_name.unique())
for i in range(df_drug.shape[0]):
    # == write the number of drug of the treatment
    drugs = df_drug.drug_name[i]
    drugs.sort()    
    df_drug.at[i,'n_drugs'] = len(drugs)
    #df_drug.at[i,'drug_name']=drugs
    
    # == write the number of oncological drugs of the treatment
    set_d = list(set.intersection(onco_drug, set(drugs)))
    set_d.sort()    
    df_drug.at[i,'n_drug_onco'] = len(set_d)
    #df_drug.at[i,'oncological_drug'] = ', '.join(list(set_d))
    
    # == write the treatment order by onco_drug + non-onco_drug
    non_onco_drug = list(set(drugs) - set(set_d))
    non_onco_drug.sort()
    df_drug.at[i,'treatment'] = ', '.join(list(set_d)+list(non_onco_drug))

df_drug['n_drugs'] = df_drug['n_drugs'].astype(int)
df_drug['n_drug_onco'] = df_drug['n_drug_onco'].astype(int)  
df_drug = df_drug.sort_values(by=['n_drug_onco', 'n_drugs'], ascending=False)
df_drug

Unnamed: 0,patient_id,oncologicalTreatmentLine_id,drug_name,n_drugs,n_drug_onco,treatment
262,429640,814,"[cisplatin, docetaxel, pemetrexed, vinorelbine]",4,4,"cisplatin, docetaxel, pemetrexed, vinorelbine"
575,816028,1158,"[carboplatin, dexamethasone, lorazepam, omepra...",6,3,"carboplatin, paclitaxel, pemetrexed, dexametha..."
233,375428,781,"[antibiotics, carboplatin, paclitaxel, vinorel...",4,3,"carboplatin, paclitaxel, vinorelbine, antibiotics"
601,865317,1189,"[cisplatin, docetaxel, omeprazole, pemetrexed]",4,3,"cisplatin, docetaxel, pemetrexed, omeprazole"
614,925591,1207,"[cisplatin, gemcitabine, vinorelbine]",3,3,"cisplatin, gemcitabine, vinorelbine"
...,...,...,...,...,...,...
1201,2777603,683,[docetaxel],1,1,docetaxel
1209,2785277,691,[afatinib],1,1,afatinib
1216,2800873,699,[carboplatin],1,1,carboplatin
1218,2811038,701,[carboplatin],1,1,carboplatin


In [6]:
treatment = df_drug[['treatment', 'n_drug_onco']]
treatment.drop_duplicates(keep='first', inplace=True)
#treatment = treatment.loc[treatment.astype(str).drop_duplicates().index].reset_index()
#treatment = treatment.drop(columns=['index'])
treatment

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Unnamed: 0,treatment,n_drug_onco
262,"cisplatin, docetaxel, pemetrexed, vinorelbine",4
575,"carboplatin, paclitaxel, pemetrexed, dexametha...",3
233,"carboplatin, paclitaxel, vinorelbine, antibiotics",3
601,"cisplatin, docetaxel, pemetrexed, omeprazole",3
614,"cisplatin, gemcitabine, vinorelbine",3
...,...,...
342,pembrolizumab,1
369,necitumumab,1
502,crizotinib,1
503,alectinib,1


In [7]:
treatment.to_csv('treatments/cancer_treatments.csv', index=False)

In [8]:
df_drug.loc[df_drug.patient_id==66695]

Unnamed: 0,patient_id,oncologicalTreatmentLine_id,drug_name,n_drugs,n_drug_onco,treatment
49,66695,1023,"[carboplatin, enalapril, omeprazole, pemetrexed]",4,2,"carboplatin, pemetrexed, enalapril, omeprazole"
50,66695,1024,"[docetaxel, enalapril, omeprazole]",3,1,"docetaxel, enalapril, omeprazole"


In [11]:
list(treatment_line_drug.drug_name.unique())

['Cisplatin',
 'Pemetrexed',
 'Docetaxel',
 'Carboplatin',
 'Etoposide',
 'Gemcitabine',
 'Vinorelbine',
 'Nivolumab',
 'Crizotinib',
 'Ceritinib',
 'Ipilimumab',
 'Paclitaxel',
 'Necitumumab',
 'Bevacizumab',
 'Afatinib',
 'Osimertinib',
 'Alectinib',
 'Pembrolizumab']