In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

np.random.seed(130298) # to have the same random numbers

In [2]:
# NOTE(review): debugging aid only — the saved output of this cell exposes a
# user-specific absolute path; consider clearing it before sharing the notebook.
%pwd

'C:\\Users\\Eleonora\\statistical learning for healthcare data\\progetto'

In [3]:
# Load the drug records and discard the 'Unnamed: 0' column
# (presumably a row index saved with the CSV — TODO confirm).
drugs = pd.read_csv('dat_md.csv').drop(columns='Unnamed: 0')
drugs.shape

(12654, 2)

We notice that this dataset has many more rows than the other one, so we check whether some patients have more than one row in the table (i.e. patients who take more than one drug). This is indeed the case: 10650 rows repeat a patient identifier that has already appeared.

In [4]:
# Preview the first rows: one row per (patient, drug) pair.
drugs.head()

Unnamed: 0,inpatient.number,Drug_name
0,857781,sulfotanshinone sodium injection
1,857781,Furosemide tablet
2,857781,Meglumine Adenosine Cyclophosphate for injection
3,857781,Furosemide injection
4,857781,Milrinone injection


In [5]:
# Count the rows whose patient identifier already appeared in an earlier row.
int(drugs['inpatient.number'].duplicated().sum())

10650

We look at the different drugs administered to the patients.

In [6]:
# Number of rows recorded for each drug name.
drugs['Drug_name'].value_counts()

Spironolactone tablet                               1833
Furosemide injection                                1718
Furosemide tablet                                   1641
Meglumine Adenosine Cyclophosphate for injection    1114
Deslanoside injection                               1015
Digoxin tablet                                       998
Atorvastatin calcium tablet                          822
Milrinone injection                                  707
sulfotanshinone sodium injection                     570
Benazepril hydrochloride tablet                      434
Valsartan Dispersible tablet                         348
Shenfu injection                                     338
Isosorbide Mononitrate Sustained Release tablet      326
Hydrochlorothiazide tablet                           283
Torasemide tablet                                    252
Nitroglycerin injection                              203
Isoprenaline Hydrochloride injection                  30
Dobutamine hydrochloride inject

First, we merge Furosemide given by injection and Furosemide given by tablet into a single drug, since it is the same drug, only administered in different ways.

In [7]:
# Relabel both Furosemide formulations with the single name 'Furosemide';
# every other drug name is left unchanged.
furosemide_forms = ['Furosemide injection', 'Furosemide tablet']
new_col = drugs['Drug_name'].mask(drugs['Drug_name'].isin(furosemide_forms), 'Furosemide')
drugs['Drug_name'] = new_col

In [8]:
# Recount the rows per drug after merging the two Furosemide formulations.
drugs['Drug_name'].value_counts()

Furosemide                                          3359
Spironolactone tablet                               1833
Meglumine Adenosine Cyclophosphate for injection    1114
Deslanoside injection                               1015
Digoxin tablet                                       998
Atorvastatin calcium tablet                          822
Milrinone injection                                  707
sulfotanshinone sodium injection                     570
Benazepril hydrochloride tablet                      434
Valsartan Dispersible tablet                         348
Shenfu injection                                     338
Isosorbide Mononitrate Sustained Release tablet      326
Hydrochlorothiazide tablet                           283
Torasemide tablet                                    252
Nitroglycerin injection                              203
Isoprenaline Hydrochloride injection                  30
Dobutamine hydrochloride injection                    22
Name: Drug_name, dtype: int64

Now we remove the rows corresponding to the two drugs taken by only a very small number of patients, since we would not be able to infer anything from so few patients taking them.

In [9]:
# Row labels of the records for each of the two rarely prescribed drugs.
too_few1 = drugs.index[drugs['Drug_name'].eq('Isoprenaline Hydrochloride injection')].tolist()
too_few2 = drugs.index[drugs['Drug_name'].eq('Dobutamine hydrochloride injection')].tolist()

def union(lst1, lst2):
    """Return a list of the distinct elements found in either input list.

    Duplicates are removed; the order of the returned elements is not
    guaranteed because the combination goes through sets.
    """
    first, second = set(lst1), set(lst2)
    return list(first.union(second))

# Drop every row collected for the two rarely prescribed drugs,
# then renumber the remaining rows from 0.
too_few = union(too_few1, too_few2)
drugs = drugs.drop(index=too_few).reset_index(drop=True)

In [10]:
# Display the table after removing the two rarely prescribed drugs.
drugs

Unnamed: 0,inpatient.number,Drug_name
0,857781,sulfotanshinone sodium injection
1,857781,Furosemide
2,857781,Meglumine Adenosine Cyclophosphate for injection
3,857781,Furosemide
4,857781,Milrinone injection
...,...,...
12597,791864,Valsartan Dispersible tablet
12598,791864,Digoxin tablet
12599,791864,Deslanoside injection
12600,791864,Milrinone injection


In [11]:
# Count the rows that exactly repeat an earlier (patient, drug) row.
int(drugs.duplicated().sum())

1446

Checking again for duplicated rows, we notice that there are now 1446 repeated rows (these correspond to the patients who took Furosemide both by injection and by tablet). We remove these duplicates and obtain a dataset with 11156 rows.

In [12]:
# Keep a single copy of each (patient, drug) row and renumber the rows from 0.
drugs = drugs.drop_duplicates().reset_index(drop=True)
drugs

Unnamed: 0,inpatient.number,Drug_name
0,857781,sulfotanshinone sodium injection
1,857781,Furosemide
2,857781,Meglumine Adenosine Cyclophosphate for injection
3,857781,Milrinone injection
4,857781,Deslanoside injection
...,...,...
11151,791864,Spironolactone tablet
11152,791864,Valsartan Dispersible tablet
11153,791864,Digoxin tablet
11154,791864,Deslanoside injection


In [13]:
# Count the rows whose patient identifier already appeared in an earlier row.
int(drugs["inpatient.number"].duplicated().sum())

9152

However, we are still far from the number of patients (2008). This means that many patients take more than one medicine.

# Grouping drugs according to their aim

We group the different drugs according to their aim and/or their mechanism of action.
From the literature, we found that these drugs can be subdivided into 5 main groups (a drug may belong to more than one group):
- diuretics: 'Furosemide', 'Spironolactone tablet', 'Hydrochlorothiazide tablet', 'Torasemide tablet'
- anti-hypertension: 'Spironolactone tablet', 'Benazepril hydrochloride tablet', 'Valsartan Dispersible tablet'
- heart failure: 'Meglumine Adenosine Cyclophosphate for injection', 'Deslanoside injection', 'Shenfu injection', 'Nitroglycerin injection'
- angina and other cardiac problems: 'Meglumine Adenosine Cyclophosphate for injection', 'Digoxin tablet', 'Milrinone injection', 'sulfotanshinone sodium injection', 'Benazepril hydrochloride tablet', 'Valsartan Dispersible tablet', 'Isosorbide Mononitrate Sustained Release tablet', 'Nitroglycerin injection'
- lowering lipids: 'Atorvastatin calcium tablet'

In [14]:
# Work on a copy so that the aim flags are added without modifying `drugs`.
drugs_by_aim=drugs.copy()
drugs_by_aim

Unnamed: 0,inpatient.number,Drug_name
0,857781,sulfotanshinone sodium injection
1,857781,Furosemide
2,857781,Meglumine Adenosine Cyclophosphate for injection
3,857781,Milrinone injection
4,857781,Deslanoside injection
...,...,...
11151,791864,Spironolactone tablet
11152,791864,Valsartan Dispersible tablet
11153,791864,Digoxin tablet
11154,791864,Deslanoside injection


### Diuretics

In [15]:
# Flag (1/0) each row whose drug belongs to the diuretics group.
# A vectorised membership test replaces the per-row Python loop: it produces
# an integer Series instead of ints written into a copy of the string column,
# and it no longer depends on `drugs` having the same length and a 0..n-1
# index as `drugs_by_aim`.
DIURETICS = ['Furosemide', 'Spironolactone tablet', 'Hydrochlorothiazide tablet', 'Torasemide tablet']
diuretics = drugs_by_aim['Drug_name'].isin(DIURETICS).astype(int)

In [16]:
# Attach the flags as a new column. Plain assignment appends the column after
# the two existing ones (the same position insert(2, ...) targeted) and,
# unlike DataFrame.insert, does not raise when the cell is run a second time.
drugs_by_aim['diuretics'] = diuretics
drugs_by_aim

Unnamed: 0,inpatient.number,Drug_name,diuretics
0,857781,sulfotanshinone sodium injection,0
1,857781,Furosemide,1
2,857781,Meglumine Adenosine Cyclophosphate for injection,0
3,857781,Milrinone injection,0
4,857781,Deslanoside injection,0
...,...,...,...
11151,791864,Spironolactone tablet,1
11152,791864,Valsartan Dispersible tablet,0
11153,791864,Digoxin tablet,0
11154,791864,Deslanoside injection,0


### Hypertension

In [17]:
# Flag (1/0) each row whose drug belongs to the anti-hypertension group.
# A vectorised membership test replaces the per-row Python loop: it produces
# an integer Series instead of ints written into a copy of the string column,
# and it no longer depends on `drugs` having the same length and a 0..n-1
# index as `drugs_by_aim`.
HYPERTENSION = ['Spironolactone tablet', 'Benazepril hydrochloride tablet', 'Valsartan Dispersible tablet']
hypertension = drugs_by_aim['Drug_name'].isin(HYPERTENSION).astype(int)

In [18]:
# Attach the flags as a new column. Plain assignment appends the column after
# the three existing ones (the same position insert(3, ...) targeted) and,
# unlike DataFrame.insert, does not raise when the cell is run a second time.
drugs_by_aim['hypertension'] = hypertension
drugs_by_aim

Unnamed: 0,inpatient.number,Drug_name,diuretics,hypertension
0,857781,sulfotanshinone sodium injection,0,0
1,857781,Furosemide,1,0
2,857781,Meglumine Adenosine Cyclophosphate for injection,0,0
3,857781,Milrinone injection,0,0
4,857781,Deslanoside injection,0,0
...,...,...,...,...
11151,791864,Spironolactone tablet,1,1
11152,791864,Valsartan Dispersible tablet,0,1
11153,791864,Digoxin tablet,0,0
11154,791864,Deslanoside injection,0,0


### Heart failure

In [19]:
# Flag (1/0) each row whose drug belongs to the heart-failure group.
# A vectorised membership test replaces the per-row Python loop: it produces
# an integer Series instead of ints written into a copy of the string column,
# and it no longer depends on `drugs` having the same length and a 0..n-1
# index as `drugs_by_aim`.
HEART_FAILURE = ['Meglumine Adenosine Cyclophosphate for injection', 'Deslanoside injection', 'Shenfu injection',
                 'Nitroglycerin injection']
heart_failure = drugs_by_aim['Drug_name'].isin(HEART_FAILURE).astype(int)

In [20]:
# Attach the flags as a new column. Plain assignment appends the column after
# the four existing ones (the same position insert(4, ...) targeted) and,
# unlike DataFrame.insert, does not raise when the cell is run a second time.
drugs_by_aim['heart_failure'] = heart_failure
drugs_by_aim

Unnamed: 0,inpatient.number,Drug_name,diuretics,hypertension,heart_failure
0,857781,sulfotanshinone sodium injection,0,0,0
1,857781,Furosemide,1,0,0
2,857781,Meglumine Adenosine Cyclophosphate for injection,0,0,1
3,857781,Milrinone injection,0,0,0
4,857781,Deslanoside injection,0,0,1
...,...,...,...,...,...
11151,791864,Spironolactone tablet,1,1,0
11152,791864,Valsartan Dispersible tablet,0,1,0
11153,791864,Digoxin tablet,0,0,0
11154,791864,Deslanoside injection,0,0,1


### Angina and other cardiac problems

In [21]:
# Flag (1/0) each row whose drug belongs to the "angina and other cardiac
# problems" group.
# A vectorised membership test replaces the per-row Python loop: it produces
# an integer Series instead of ints written into a copy of the string column,
# and it no longer depends on `drugs` having the same length and a 0..n-1
# index as `drugs_by_aim`.
ANGINA_ETAL = ['Meglumine Adenosine Cyclophosphate for injection', 'Digoxin tablet', 'Milrinone injection',
               'sulfotanshinone sodium injection', 'Benazepril hydrochloride tablet', 'Valsartan Dispersible tablet',
               'Isosorbide Mononitrate Sustained Release tablet', 'Nitroglycerin injection']
angina_etal = drugs_by_aim['Drug_name'].isin(ANGINA_ETAL).astype(int)

In [22]:
# Attach the flags as a new column. Plain assignment appends the column after
# the five existing ones (the same position insert(5, ...) targeted) and,
# unlike DataFrame.insert, does not raise when the cell is run a second time.
drugs_by_aim['angina_etal'] = angina_etal
drugs_by_aim

Unnamed: 0,inpatient.number,Drug_name,diuretics,hypertension,heart_failure,angina_etal
0,857781,sulfotanshinone sodium injection,0,0,0,1
1,857781,Furosemide,1,0,0,0
2,857781,Meglumine Adenosine Cyclophosphate for injection,0,0,1,1
3,857781,Milrinone injection,0,0,0,1
4,857781,Deslanoside injection,0,0,1,0
...,...,...,...,...,...,...
11151,791864,Spironolactone tablet,1,1,0,0
11152,791864,Valsartan Dispersible tablet,0,1,0,1
11153,791864,Digoxin tablet,0,0,0,1
11154,791864,Deslanoside injection,0,0,1,0


### Cholesterol

In [23]:
# Flag (1/0) each row whose drug belongs to the lipid-lowering group.
# A vectorised membership test replaces the per-row Python loop: it produces
# an integer Series instead of ints written into a copy of the string column,
# and it no longer depends on `drugs` having the same length and a 0..n-1
# index as `drugs_by_aim`.
CHOLESTEROL = ['Atorvastatin calcium tablet']
cholesterol = drugs_by_aim['Drug_name'].isin(CHOLESTEROL).astype(int)

In [24]:
# Attach the flags as a new column. Plain assignment appends the column after
# the six existing ones (the same position insert(6, ...) targeted) and,
# unlike DataFrame.insert, does not raise when the cell is run a second time.
drugs_by_aim['cholesterol'] = cholesterol
drugs_by_aim

Unnamed: 0,inpatient.number,Drug_name,diuretics,hypertension,heart_failure,angina_etal,cholesterol
0,857781,sulfotanshinone sodium injection,0,0,0,1,0
1,857781,Furosemide,1,0,0,0,0
2,857781,Meglumine Adenosine Cyclophosphate for injection,0,0,1,1,0
3,857781,Milrinone injection,0,0,0,1,0
4,857781,Deslanoside injection,0,0,1,0,0
...,...,...,...,...,...,...,...
11151,791864,Spironolactone tablet,1,1,0,0,0
11152,791864,Valsartan Dispersible tablet,0,1,0,1,0
11153,791864,Digoxin tablet,0,0,0,1,0
11154,791864,Deslanoside injection,0,0,1,0,0


We obtained a complete dataset with new columns corresponding to the aim of the prescribed drug.

In [25]:
# Save the table that still contains the drug names next to the aim flags.
# NOTE(review): to_csv writes the row index by default, so reading this file
# back will presumably yield an extra unnamed column — confirm whether
# downstream readers rely on it before switching to index=False.
drugs_by_aim.to_csv('drugs_by_aim_with_NAME.csv')

Now we drop the column containing the specific drug names and keep only the aims of the drugs.
At this point we check again for duplicated rows (these correspond to patients taking more than one drug belonging to the same group or groups) and we drop them.

In [26]:
# Columns to remove so that only the patient identifier and the aim flags remain.
col_to_drop=['Drug_name']

In [27]:
# Remove the drug-name column, leaving the patient identifier and the aim flags.
drugs_by_aim = drugs_by_aim.drop(columns=col_to_drop)
drugs_by_aim

Unnamed: 0,inpatient.number,diuretics,hypertension,heart_failure,angina_etal,cholesterol
0,857781,0,0,0,1,0
1,857781,1,0,0,0,0
2,857781,0,0,1,1,0
3,857781,0,0,0,1,0
4,857781,0,0,1,0,0
...,...,...,...,...,...,...
11151,791864,1,1,0,0,0
11152,791864,0,1,0,1,0
11153,791864,0,0,0,1,0
11154,791864,0,0,1,0,0


In [28]:
# Count the rows that exactly repeat an earlier (patient, aim flags) row.
int(drugs_by_aim.duplicated().sum())

1812

In [29]:
# Keep a single copy of each (patient, aim flags) row and renumber the rows from 0.
drugs_by_aim = drugs_by_aim.drop_duplicates().reset_index(drop=True)
drugs_by_aim

Unnamed: 0,inpatient.number,diuretics,hypertension,heart_failure,angina_etal,cholesterol
0,857781,0,0,0,1,0
1,857781,1,0,0,0,0
2,857781,0,0,1,1,0
3,857781,0,0,1,0,0
4,857781,0,1,0,1,0
...,...,...,...,...,...,...
9339,791864,0,0,1,1,0
9340,791864,1,1,0,0,0
9341,791864,0,1,0,1,0
9342,791864,0,0,0,1,0


We obtained a dataset with 9344 rows (and 6 columns), which will be easier to merge and analyse than the starting one.

In [30]:
# Save the de-duplicated table of aim flags per patient.
# NOTE(review): to_csv writes the row index by default, so reading this file
# back will presumably yield an extra unnamed column — confirm whether
# downstream readers rely on it before switching to index=False.
drugs_by_aim.to_csv('drugs_by_aim.csv')

In [31]:
# Example of one patient: after de-duplication this patient still has 6 rows,
# one per distinct combination of aim flags -> these 6 rows should eventually
# be merged into a single row.
pat=drugs_by_aim[drugs_by_aim['inpatient.number']==857781]
pat

Unnamed: 0,inpatient.number,diuretics,hypertension,heart_failure,angina_etal,cholesterol
0,857781,0,0,0,1,0
1,857781,1,0,0,0,0
2,857781,0,0,1,1,0
3,857781,0,0,1,0,0
4,857781,0,1,0,1,0
5,857781,0,0,0,0,1


In [None]:
# TODO: join this table to the original patient table.
# NOTE(review): `tab` is not defined in the visible part of this notebook, and
# drugs_by_aim still holds several rows per patient — presumably these must be
# collapsed to one row per 'inpatient.number' before joining; confirm.
#final=tab.join(drugs_by_aim)
#final