In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

np.random.seed(130298) # to have the same random numbers

In [2]:
%pwd

'C:\\Users\\Eleonora\\statistical learning for healthcare data\\heart-failure-project'

In [3]:
# Read the drug table and discard the index column that was saved into the CSV.
drugs = pd.read_csv('dat_md.csv').drop(columns='Unnamed: 0')
drugs.shape

(12654, 2)

We notice that this dataset has many more rows than the main dataset, so we want to check whether there are patients with more than one row in the table (i.e. patients who take more than one drug). Indeed, 10650 rows belong to a patient who already appears on an earlier row.

In [4]:
# Rows whose patient ID already occurred on an earlier row.
drugs.duplicated(subset='inpatient.number').sum()

10650

In [5]:
# Preview the first rows to see how patient IDs and drug names are laid out.
drugs.head()

Unnamed: 0,inpatient.number,Drug_name
0,857781,sulfotanshinone sodium injection
1,857781,Furosemide tablet
2,857781,Meglumine Adenosine Cyclophosphate for injection
3,857781,Furosemide injection
4,857781,Milrinone injection


We look at the different drugs administered to the patients.

In [6]:
# Number of rows recorded for each distinct drug name, most frequent first.
drugs['Drug_name'].value_counts()

Spironolactone tablet                               1833
Furosemide injection                                1718
Furosemide tablet                                   1641
Meglumine Adenosine Cyclophosphate for injection    1114
Deslanoside injection                               1015
Digoxin tablet                                       998
Atorvastatin calcium tablet                          822
Milrinone injection                                  707
sulfotanshinone sodium injection                     570
Benazepril hydrochloride tablet                      434
Valsartan Dispersible tablet                         348
Shenfu injection                                     338
Isosorbide Mononitrate Sustained Release tablet      326
Hydrochlorothiazide tablet                           283
Torasemide tablet                                    252
Nitroglycerin injection                              203
Isoprenaline Hydrochloride injection                  30
Dobutamine hydrochloride inject

First, we use a single category for Furosemide, whether given by injection or by tablet, since it is the same drug, only administered in different ways.

In [7]:
# Relabel both Furosemide formulations with a single common name.
furosemide_forms = ['Furosemide injection', 'Furosemide tablet']
is_furosemide = drugs['Drug_name'].isin(furosemide_forms)
drugs['Drug_name'] = drugs['Drug_name'].mask(is_furosemide, 'Furosemide')

Now we remove the rows corresponding to the two drugs taken by only a very small number of patients, since we will not be able to infer anything from so few patients taking them.

In [8]:
# Row labels of the two drugs that only a handful of patients received.
too_few1 = drugs.index[drugs['Drug_name'].eq('Isoprenaline Hydrochloride injection')].tolist()
too_few2 = drugs.index[drugs['Drug_name'].eq('Dobutamine hydrochloride injection')].tolist()

def union(lst1, lst2):
    """Return a list with every distinct element of `lst1` and `lst2` (order not guaranteed)."""
    distinct = set(lst1).union(set(lst2))
    return list(distinct)

# Drop every row of the two rare drugs and renumber the remaining rows from 0.
too_few = union(too_few1, too_few2)
drugs = drugs.drop(index=too_few).reset_index(drop=True)

In [9]:
# Count the rows that exactly repeat an earlier row.
drugs.duplicated().sum()

1446

Checking for duplicated rows, we notice that now there are 1446 repeated rows: these will correspond to the patients taking Furosemide both by injection and by tablet. So we remove these duplicates.

In [10]:
# Keep the first occurrence of each row and renumber the index from 0.
drugs = drugs.drop_duplicates().reset_index(drop=True)
drugs

Unnamed: 0,inpatient.number,Drug_name
0,857781,sulfotanshinone sodium injection
1,857781,Furosemide
2,857781,Meglumine Adenosine Cyclophosphate for injection
3,857781,Milrinone injection
4,857781,Deslanoside injection
...,...,...
11151,791864,Spironolactone tablet
11152,791864,Valsartan Dispersible tablet
11153,791864,Digoxin tablet
11154,791864,Deslanoside injection


# Grouping drugs according to their aim

We try to group the different drugs according to their aim and/or their acting principle.
From the literature, we found that these drugs can be subdivided into 5 main groups (a drug may belong to more than one group):
- diuretics: 'Furosemide', 'Spironolactone tablet', 'Hydrochlorothiazide tablet', 'Torasemide tablet'
- anti-hypertension: 'Spironolactone tablet', 'Benazepril hydrochloride tablet', 'Valsartan Dispersible tablet'
- heart failure: 'Meglumine Adenosine Cyclophosphate for injection', 'Deslanoside injection', 'Shenfu injection', 'Nitroglycerin injection'
- angina and other cardiac problems: 'Meglumine Adenosine Cyclophosphate for injection', 'Digoxin tablet', 'Milrinone injection', 'sulfotanshinone sodium injection', 'Benazepril hydrochloride tablet', 'Valsartan Dispersible tablet', 'Isosorbide Mononitrate Sustained Release tablet', 'Nitroglycerin injection'
- lowering lipids: 'Atorvastatin calcium tablet'

In [11]:
# Work on a copy so the per-drug table `drugs` is not modified by the grouping below.
drugs_by_aim = drugs.copy()

In [12]:
# Drug names belonging to each group; a drug may be listed in more than one group.
DIURETICS = [
    'Furosemide',
    'Spironolactone tablet',
    'Hydrochlorothiazide tablet',
    'Torasemide tablet',
]
HYPERTENSION = [
    'Spironolactone tablet',
    'Benazepril hydrochloride tablet',
    'Valsartan Dispersible tablet',
]
HEART_FAILURE = [
    'Meglumine Adenosine Cyclophosphate for injection',
    'Deslanoside injection',
    'Shenfu injection',
    'Nitroglycerin injection',
]
ANGINA_ETAL = [
    'Meglumine Adenosine Cyclophosphate for injection',
    'Digoxin tablet',
    'Milrinone injection',
    'sulfotanshinone sodium injection',
    'Benazepril hydrochloride tablet',
    'Valsartan Dispersible tablet',
    'Isosorbide Mononitrate Sustained Release tablet',
    'Nitroglycerin injection',
]
CHOLESTEROL = ['Atorvastatin calcium tablet']

# All groups in one list, in a fixed order that later code can iterate over.
list_of_groups = [DIURETICS, HYPERTENSION, HEART_FAILURE, ANGINA_ETAL, CHOLESTEROL]

In [13]:
drug_group = ['diuretics','hypertension','heart_failure','angina_etal','cholesterol']

# Every indicator column name needs its own list of member drugs.
assert len(drug_group) == len(list_of_groups)

# Add one 0/1 indicator column per group: 1 when the row's drug belongs to the group.
# `isin` tests the whole column at once and does not depend on how the rows are indexed.
for drug, members in zip(drug_group, list_of_groups):
    # Stored as Python ints in an object column: a plain int column would turn into
    # float once patients without any drug row introduce NaN in a later join.
    drugs_by_aim[drug] = drugs_by_aim['Drug_name'].isin(members).astype(int).astype(object)

# The individual drug name is not needed once the group indicators exist.
drugs_by_aim = drugs_by_aim.drop(columns='Drug_name')

We also drop the column containing the specific drug names and keep only the information about their aim.

In [14]:
# Inspect a single patient to see what the indicator table looks like
one_patient = drugs_by_aim['inpatient.number'] == 857781
drugs_by_aim.loc[one_patient]

Unnamed: 0,inpatient.number,diuretics,hypertension,heart_failure,angina_etal,cholesterol
0,857781,0,0,0,1,0
1,857781,1,0,0,0,0
2,857781,0,0,1,1,0
3,857781,0,0,0,1,0
4,857781,0,0,1,0,0
5,857781,1,0,0,0,0
6,857781,0,1,0,1,0
7,857781,0,0,0,0,1
8,857781,0,0,0,1,0


We don't need to drop duplicates; instead, we need one row per patient ID, with a 1 in each column where the patient has at least one 1 and a 0 otherwise. This can simply be achieved by taking, for each patient, the maximum value of each column.

In [15]:
# Group the indicator rows by patient and report how many distinct patients there are.
df_grouped = drugs_by_aim.groupby(by='inpatient.number')
df_grouped.ngroups

2004

In [16]:
# Per-patient maximum of each 0/1 indicator: 1 if any of the patient's rows has a 1.
# The result is indexed by inpatient.number, one row per patient.
drugs_by_patient = df_grouped.max()
drugs_by_patient.head()

Unnamed: 0_level_0,diuretics,hypertension,heart_failure,angina_etal,cholesterol
inpatient.number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
722128,1,1,1,1,0
723327,1,1,1,1,1
723617,1,1,0,0,1
724385,1,1,1,1,0
725509,1,1,1,1,0


Now we want to merge these two datasets on the patient ID (keeping the patient IDs of the main dataframe, because some patients have already been removed for other reasons). Notice that the number of patients in this dataset is 2004, while the original dataset has 2008. Thus, there should be 4 patients who are not registered because they are not taking any drug.

In [17]:
# Load the training set and use the patient ID as the row index.
X_train = pd.read_csv('train_data.csv').set_index('inpatient.number')
X_train.head()

Unnamed: 0_level_0,DestinationDischarge,admission.ward,admission.way,discharge.department,gender,body.temperature,pulse,respiration,systolic.blood.pressure,diastolic.blood.pressure,...,globulin,direct.bilirubin,total.bile.acid,total.protein,low.density.lipoprotein.cholesterol,triglyceride,high.density.lipoprotein.cholesterol,GCS,dischargeDay,ageCat
inpatient.number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
746794,Home,Cardiology,NonEmergency,Cardiology,1,37.1,99.0,15.0,172.0,86.0,...,27.5,3.2,1.8,65.9,3.29,1.91,0.98,15,5,"(59,69]"
830900,HealthcareFacility,Cardiology,NonEmergency,Cardiology,0,36.5,75.0,18.0,154.0,84.0,...,30.2,2.5,6.0,68.2,3.42,2.69,0.93,15,4,"(79,89]"
730511,Home,Cardiology,NonEmergency,Cardiology,0,38.6,96.0,19.0,130.0,80.0,...,29.6,11.1,3.8,62.0,2.73,1.19,0.93,15,10,"(79,89]"
790988,HealthcareFacility,Cardiology,NonEmergency,Cardiology,0,37.0,88.0,18.0,108.0,54.0,...,32.9,1.0,0.6,73.9,2.09,1.87,1.12,15,10,"(69,79]"
779438,,GeneralWard,NonEmergency,GeneralWard,0,36.5,106.0,18.0,150.0,90.0,...,32.6,9.7,2.3,65.6,3.42,1.43,0.79,15,5,"(79,89]"


In [18]:
def merge_drugs(data,drugs_by_patient):
    """Left-join the per-patient drug indicators onto `data` by index.

    Parameters
    ----------
    data : pandas.DataFrame
        Patient table indexed by patient ID; every one of its rows is kept.
    drugs_by_patient : pandas.DataFrame
        Drug-group indicators indexed by patient ID.

    Returns
    -------
    pandas.DataFrame
        The columns of `data` followed by the indicator columns, the latter cast
        to the nullable 'Int64' dtype. Patients that do not appear in
        `drugs_by_patient` get <NA> in every indicator column.
    """
    # rsuffix only applies when an indicator shares its name with a column of `data`
    data_final = data.join(drugs_by_patient, rsuffix = '_drug')
    # Names the indicator columns carry after the join (suffixed on a name clash).
    drug_cols = [
        f'{col}_drug' if col in data.columns else col
        for col in drugs_by_patient.columns
    ]
    # Cast by column name rather than by position: a positional slice assumes a
    # fixed number of indicators, and assigning through .iloc writes into the
    # existing columns, which may leave their dtype unchanged.
    return data_final.astype({col: 'Int64' for col in drug_cols})

In [19]:
# Attach the drug-group indicators to the training set, matching rows on the patient ID index.
# Note: this overwrites X_train, so running the cell twice joins the indicators a second time.
X_train = merge_drugs(X_train,drugs_by_patient)
X_train.head()

Unnamed: 0_level_0,DestinationDischarge,admission.ward,admission.way,discharge.department,gender,body.temperature,pulse,respiration,systolic.blood.pressure,diastolic.blood.pressure,...,triglyceride,high.density.lipoprotein.cholesterol,GCS,dischargeDay,ageCat,diuretics,hypertension,heart_failure,angina_etal,cholesterol
inpatient.number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
746794,Home,Cardiology,NonEmergency,Cardiology,1,37.1,99.0,15.0,172.0,86.0,...,1.91,0.98,15,5,"(59,69]",1,1,0,1,0
830900,HealthcareFacility,Cardiology,NonEmergency,Cardiology,0,36.5,75.0,18.0,154.0,84.0,...,2.69,0.93,15,4,"(79,89]",1,0,1,1,1
730511,Home,Cardiology,NonEmergency,Cardiology,0,38.6,96.0,19.0,130.0,80.0,...,1.19,0.93,15,10,"(79,89]",1,1,1,1,0
790988,HealthcareFacility,Cardiology,NonEmergency,Cardiology,0,37.0,88.0,18.0,108.0,54.0,...,1.87,1.12,15,10,"(69,79]",1,0,0,0,1
779438,,GeneralWard,NonEmergency,GeneralWard,0,36.5,106.0,18.0,150.0,90.0,...,1.43,0.79,15,5,"(79,89]",1,1,1,1,1


In [20]:
# Row positions (not patient IDs) of the rows left without drug data by the join.
rows_without_drugs = np.flatnonzero(X_train["diuretics"].isnull().to_numpy())
print(f'Patients not taking drugs: {rows_without_drugs}')

Patients not taking drugs: [210 590 919]


In [21]:
# Patients with no row in the drug table are treated as taking none of the groups: set their indicators to 0.
# The last five columns are the indicator columns appended by merge_drugs.
X_train.iloc[:,-5:] = X_train.iloc[:,-5:].fillna(0)

In [22]:
# Number of training patients flagged for each drug group (column-wise sum of the 0/1 indicators).
X_train.iloc[:,-5:].sum()

diuretics        1547
hypertension     1454
heart_failure    1279
angina_etal      1464
cholesterol       629
dtype: int64

In [23]:
# How many of the five drug groups each patient is flagged for, computed once up front.
groups_per_patient = X_train.iloc[:,-5:].sum(axis=1)
for i in range(6):
    n_patients = len(np.where(groups_per_patient==i)[0])
    print(f'Number of patients taking {i} drugs: {n_patients}')

Number of patients taking 0 drugs: 3
Number of patients taking 1 drugs: 7
Number of patients taking 2 drugs: 62
Number of patients taking 3 drugs: 193
Number of patients taking 4 drugs: 847
Number of patients taking 5 drugs: 455


In [24]:
# Load the test set and use the patient ID as the row index.
X_test = pd.read_csv('test_data.csv').set_index('inpatient.number')

In [25]:
# Attach the drug-group indicators to the test set, matching rows on the patient ID index.
# Note: this overwrites X_test, so running the cell twice joins the indicators a second time.
X_test = merge_drugs(X_test,drugs_by_patient)
X_test.head()

Unnamed: 0_level_0,DestinationDischarge,admission.ward,admission.way,discharge.department,gender,body.temperature,pulse,respiration,systolic.blood.pressure,diastolic.blood.pressure,...,triglyceride,high.density.lipoprotein.cholesterol,GCS,dischargeDay,ageCat,diuretics,hypertension,heart_failure,angina_etal,cholesterol
inpatient.number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
770068,Home,Cardiology,NonEmergency,Cardiology,1,36.5,80.0,18.0,125.0,80.0,...,,,15,62,"(69,79]",1,1,1,1,0
860037,Home,Cardiology,NonEmergency,Cardiology,1,36.0,98.0,18.0,150.0,87.0,...,0.46,1.06,15,9,"(59,69]",1,1,1,1,1
782110,Home,Cardiology,NonEmergency,Cardiology,1,36.5,80.0,18.0,110.0,80.0,...,,,15,64,"(69,79]",1,1,1,1,0
742279,Home,Others,Emergency,Cardiology,0,36.4,56.0,18.0,105.0,65.0,...,0.48,1.27,15,11,"(59,69]",1,1,0,1,1
734121,,Cardiology,Emergency,Cardiology,1,36.5,59.0,19.0,138.0,88.0,...,0.5,1.4,15,3,"(79,89]",1,1,0,1,0


In [26]:
# Row positions (not patient IDs) of the test rows left without drug data by the join.
test_rows_without_drugs = np.flatnonzero(X_test["diuretics"].isnull().to_numpy())
print(f'Patients not taking drugs: {test_rows_without_drugs}')

Patients not taking drugs: [359]


In [27]:
# Patients with no row in the drug table are treated as taking none of the groups: set their indicators to 0.
# The last five columns are the indicator columns appended by merge_drugs.
X_test.iloc[:,-5:] = X_test.iloc[:,-5:].fillna(0)

In [28]:
# Number of test patients flagged for each drug group (column-wise sum of the 0/1 indicators).
X_test.iloc[:,-5:].sum()

diuretics        386
hypertension     367
heart_failure    327
angina_etal      376
cholesterol      177
dtype: int64

In [29]:
# How many of the five drug groups each test patient is flagged for, computed once up front.
test_groups_per_patient = X_test.iloc[:,-5:].sum(axis=1)
for i in range(6):
    n_patients = len(np.where(test_groups_per_patient==i)[0])
    print(f'Number of patients taking {i} drugs in test set: {n_patients}')

Number of patients taking 0 drugs in test set: 1
Number of patients taking 1 drugs in test set: 5
Number of patients taking 2 drugs in test set: 11
Number of patients taking 3 drugs in test set: 42
Number of patients taking 4 drugs in test set: 210
Number of patients taking 5 drugs in test set: 128


In [30]:
# Save the training set with the drug-group indicators; the patient ID index is written as the first column.
X_train.to_csv('train_data_drugs.csv')

In [31]:
# Save the test set with the drug-group indicators; the patient ID index is written as the first column.
X_test.to_csv('test_data_drugs.csv')