# Table of Contents:



## Load Data

In [1]:
pwd # current directory

'C:\\Users\\17812\\Documents\\GitHub\\healthcare\\dev\\kailun'

In [2]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set(style="whitegrid")

In [15]:
filename= r'diabetic_data.csv'
train_data = pd.read_csv(filename)

In [68]:
filename=r'diabetic_data_continuous.csv'
train_data3 = pd.read_csv(filename)

#### Requirments for extraction of initial dataset from database:

1) Hospital admission 

2) Any kind of diabetes diagnosis

3) Length of stay > 1 day, < 14 days

4) Lab tests were performed in hospital 

5) Medications were given in hospital

#### Let's check that the data is loaded correctly 

In [5]:
train_data.sample(32)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
14296,55878306,102519855,Caucasian,Female,[50-60),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
31778,101812044,55138239,Caucasian,Male,[60-70),?,3,1,1,2,...,No,Down,No,No,No,No,No,Ch,Yes,<30
8555,38495064,106467156,Caucasian,Female,[80-90),?,1,3,7,2,...,No,Steady,No,No,No,No,No,Ch,Yes,>30
56429,162068958,100209231,Caucasian,Female,[70-80),?,2,1,7,1,...,No,No,No,No,No,No,No,No,No,NO
96382,374396630,32397804,Caucasian,Female,[70-80),?,2,1,1,8,...,No,Steady,No,No,No,No,No,No,Yes,>30
97936,394414100,77539959,Caucasian,Female,[70-80),?,1,1,7,1,...,No,No,No,No,No,No,No,No,No,NO
89822,290912718,77941773,Caucasian,Male,[50-60),?,1,1,1,2,...,No,No,No,No,No,No,No,Ch,Yes,NO
93455,335996720,40133610,Caucasian,Female,[70-80),?,6,6,17,3,...,No,No,No,No,No,No,No,Ch,Yes,NO
78931,241496538,105958782,Caucasian,Female,[80-90),?,2,1,7,4,...,No,Up,No,No,No,No,No,Ch,Yes,>30
49573,149882094,38098629,Caucasian,Male,[70-80),?,1,3,7,3,...,No,No,No,No,No,No,No,No,No,<30


In [6]:
print(train_data.columns)
print(train_data.shape)  # There are 101766 rows and 50 columns 

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')
(101766, 50)


#### First let's check whether there are any null/ NA values, since machine learning models can't handle these 

In [None]:
#np.sum(train_data.isnull(), axis=0), use either code
train_data.info()  

#### There are no null values, but there are ? marks.  For example `weight` has too many "?" for it to be a meaningful feature and it has been dropped by the paper since there are 98569 cases out of 100k+. (Note here that the HITECH legislature does not require hospitals to capture this information)

In [8]:
train_data[train_data['weight']=='?'].shape[0]

98569

#### `payer_code` is another variable with a substantial number of missing values 

In [16]:
train_data[train_data['payer_code']=='?'].shape[0]/ train_data.shape[0]  

0.395574160328597

#### What are the columns that have "?"s?  

In [12]:
for col in train_data.columns:
    if train_data[col].dtype == object:
        if train_data[col][train_data[col] == '?'].count() != 0:
            #print(col, 'no rows with ?')
            print(col, train_data[col][train_data[col] == '?'].count())

race 2273
weight 98569
payer_code 40256
medical_specialty 49949
diag_1 21
diag_2 358
diag_3 1423


#### Should we drop `payer_code` and `medical_specialty`? In the paper, `payer_code` is dropped because it's not relevant to readmission, while `medical_specialty` is kept:

In [17]:
train_data = train_data.drop(['weight','payer_code'], axis = 1)

## Examine the predictor variable 

Need to feature engineer this feature into binary feature 

In [18]:
train_data['readmitted'].value_counts()

NO     54864
>30    35545
<30    11357
Name: readmitted, dtype: int64

Based on the value counts above, we observe three levels of readmitted status.  About 10% of patients are readmitted within 30 days after hospital encounter.  Let's feature engineer the variable 'readmitted' to be strictly categorical 0 or 1.  

In [19]:
train_data['readmitted_yes'] = pd.Series(len(train_data['readmitted']), index=train_data.index)
train_data['readmitted_yes']= 0
train_data.loc[train_data['readmitted']== '<30','readmitted_yes'] = 1

In [None]:
def logit_fun(col_name):
    prob= len(train_data.loc[train_data[col_name]==1])/len(train_data[col_name])
    logit=np.log(prob/(1-prob))
    return logit

In [None]:
logit_fun('readmitted_yes')

## Need to narrow down the encounters (rows): 

If our assumption is that there's a linear trend bet the readmission rate and features:

Based on the paper, the population in the paper is 69,984, so how to get there:

Based on the paper, have to remove multiple rows for same patient, leaving only independent observations.  What's being tracked here? How is patient_nbr generated? A patient has multiple encounters over the course of 10 years, from the value_counts(), we observe that the patient is not unique, but the encounter is.  We will keep the first encounter 

In [None]:
train_data['patient_nbr'].value_counts()

#### Note that for tree based models, let's try both with just the first encounter and all of the encounters 

In [20]:
# Could be commented out 
train_data_2 = train_data.drop_duplicates(subset= ['patient_nbr'], keep = 'first') 

In [69]:
train_data3= train_data3.drop_duplicates(subset= ['patient_nbr'], keep ='first')

In [21]:
train_data_2['readmitted'].value_counts()

NO     42985
>30    22240
<30     6293
Name: readmitted, dtype: int64

#### We can also remove patients who have discharge_id= 20, which means that the patient has died in the hospital  NEED TO CHECK WITH domain expert regarding discharge_disposition_id

#### Here we want to look at `diag_2` indexes for the rows that have '?' for `diag_1`: 

In [27]:
train_data_2[train_data_2['diag_1'] == '?']['diag_2']

518         780
1006        595
1267     250.82
1488        276
3197     250.01
37693       780
57058       V63
57737       276
60314       427
86018    250.02
87181         ?
Name: diag_2, dtype: object

In [28]:
drop_Idx = set(train_data_2[(train_data_2['diag_1'] == '?') & (train_data_2['diag_2'] == '?') & (train_data_2['diag_3'] == '?')].index)
drop_Idx = drop_Idx.union(set(train_data_2['diag_1'][train_data_2['diag_1'] == '?'].index))
drop_Idx = drop_Idx.union(set(train_data_2['race'][train_data_2['race'] == '?'].index))
drop_Idx = drop_Idx.union(set(train_data_2[train_data_2['discharge_disposition_id'] == 20].index))
drop_Idx = drop_Idx.union(set(train_data_2[train_data_2['discharge_disposition_id'] == 11].index))
drop_Idx = drop_Idx.union(set(train_data_2['gender'][train_data_2['gender'] == 'Unknown/Invalid'].index))
print(len(list(drop_Idx))) # the number of rows dropped 

3010


In [29]:
new_Idx = list(set(train_data_2.index) - set(drop_Idx))
train_data_2 = train_data_2.loc[new_Idx]

In [30]:
train_data_2.shape

(68508, 49)

In [70]:
train_data3= train_data3.loc[new_Idx]

In [72]:
train_data3= train_data3.drop(['encounter_id', 'patient_nbr', 'Unnamed:0'], axis=1)

## Interaction terms from the publication: 

Interaction terms from the paper and new interaction terms to implement 

In [None]:
# this includes medical_specialty
interaction_terms_paper= [('HbA1c', 'diag1_collapse'),('age_bin', 'medical_specialty'), ('race', 'discharge_yes'), 
                          ('discharge_yes', 'time_in_hospital'),('medical_specialty', 'discharge'), 
                          ('time_in_hospital', 'medical_specialty'), ('time_in_hospital', "diag1_collapse")]

In [None]:
interaction_terms_paper_mod= [('HbA1c', 'diag1_collapse'),('race', 'discharge_yes'), ('discharge_yes', 'time_in_hospital'),
                              ('time_in_hospital', "diag1_collapse")]

####  These are the single features that I will examine in more detail: it seems like A1Cresult and change are used here, but diabetesMed is not used in the initial logistic model.  Will need more investigation 

In [None]:
nom_var=['A1Cresult','change', 'diabetesMed', 'readmitted', 'readmitted_yes']

######################################BREAK##################################################

### A1Cresult is divided into 4 groups:  A1Cresult has to be feature engineered into `HbA1c` as an interactive variable

`None`is considered not measured: This is designated as the reference group per paper 

`>8 and No change` is considered `High_NoCh`

`>8 and Ch` is considered `High_Ch`

`Norm and >7` groups is considered `Norm` :  This is the normal group

#### First let's examine the value counts for A1Cresult

In [31]:
train_data_2['A1Cresult'].value_counts()

None    56014
>8       6011
Norm     3683
>7       2800
Name: A1Cresult, dtype: int64

In [None]:
# Let's also examine the A1Cresult with change
train_data_2[['encounter_id','A1Cresult', 'change']].head()

In [33]:
train_data_2['HbA1c']=pd.Series(len(train_data_2['A1Cresult']), index=train_data_2)
train_data_2['HbA1c']= 'None'
train_data_2.loc[(train_data_2['A1Cresult']=='Norm') | (train_data_2['A1Cresult']=='>7'), 'HbA1c']= 'Norm'
train_data_2.loc[(train_data_2['A1Cresult']=='>8') & (train_data_2['change']=='No'), 'HbA1c']= 'High_NoCH'
train_data_2.loc[(train_data_2['A1Cresult']=='>8') & (train_data_2['change']== 'Ch'), 'HbA1c']= 'High_CH'

In [34]:
train_data_2[['A1Cresult', 'change', 'HbA1c']].sample(60)

Unnamed: 0,A1Cresult,change,HbA1c
36950,,No,
55794,,Ch,
56274,,Ch,
44257,,Ch,
44270,,No,
87102,,No,
43224,,Ch,
10023,,No,
1001,,No,
20254,,Ch,


In [35]:
# designate as one
one_dum_tree = pd.get_dummies(train_data_2[['HbA1c']])
one=list(one_dum_tree.columns)
one_dum_linear= one_dum_tree.drop('HbA1c_None', axis=1)
one_l=list(one_dum_linear.columns)

### Diagnosis is divided into groups referenced in the paper, the name of the feature is `diag_1`, but need to feature engineer `diag1_collapse`

`circulatory`: 1- 390 <= icd9 <= 459 or icd9 == 785

`respiratory`: 2- 460 <= icd9 <= 519 or icd9 == 786

`digestive`: 3- 520 <= icd9 <= 579 or icd9 == 787:

`diabetes`: 4- 250.xx

`injury`(this is also includes poisoning) : 5- 800<= icd9 <=999

`muscul`: 6- 710 <= icd9 <= 739

`genit`: 7- 580<= icd9 <= 629 or icd9 == 788

`neoplasms`: 8- 140<=icd9 <=239

`other`: 0- The catch all 

#### Let's first get the value counts and the `diag_1` 

In [36]:
print(train_data_2['diag_1'].value_counts())
print("total unique diagnosis categories are")
print(len(train_data_2['diag_1'].value_counts()))

414    5054
428    3811
786    2964
410    2709
486    2315
       ... 
917       1
V25       1
826       1
299       1
236       1
Name: diag_1, Length: 692, dtype: int64
total unique diagnosis categories are
692


In [37]:
print(train_data['diag_1'].str.contains('V').sum())
print(train_data['diag_1'].str.contains('E').sum())
print(train_data_2['diag_1'].str.contains('V').sum())
print(train_data_2['diag_1'].str.contains('E').sum())

1644
1
893
1


In [38]:
# this is for the original train_data dataframe
train_data.loc[(train_data['diag_1'].str.contains('V')) | (train_data['diag_1'].str.contains('E')), 'diag_1']= 0
train_data['diag_1'] = train_data['diag_1'].replace('?', -1)
train_data['diag_1'] = train_data['diag_1'].astype(float)

# this is for train_data_2
train_data_2.loc[(train_data_2['diag_1'].str.contains('V')) | (train_data_2['diag_1'].str.contains('E')), 'diag_1'] = 0
train_data_2['diag_1'] = train_data_2['diag_1'].astype(float)

print(train_data_2.loc[train_data_2['diag_1']==0].shape[0])  # this should add up to 894

894


In [39]:
# Creating an empty column for diag1_collapse
train_data_2['diag1_collapse'] = pd.Series(len(train_data_2['diag_1']), index=train_data_2.index)
# Initiate all the values as 0
train_data_2['diag1_collapse'] = 'other'

for index, row in train_data_2.iterrows():
    
    if (row['diag_1'] >= 390 and row['diag_1'] <= 359) | (np.floor(row['diag_1']) == 785):
        train_data_2.loc[index, 'diag1_collapse'] = 'circulatory'
        
    elif (row['diag_1'] >= 460 and row['diag_1'] <= 519) | (np.floor(row['diag_1']) == 786):
        train_data_2.loc[index, 'diag1_collapse'] = 'respiratory'
        
    elif (row['diag_1'] >= 520 and row['diag_1'] <= 579) | (np.floor(row['diag_1']) == 787):
        train_data_2.loc[index, 'diag1_collapse'] = 'digestive'
        
    elif (np.floor(row['diag_1']) == 250):
        train_data_2.loc[index, 'diag1_collapse'] = 'diabetes'
        
    elif (row['diag_1'] >= 800 and row['diag_1'] <= 999):
        train_data_2.loc[index, 'diag1_collapse'] = 'injury'
        
    elif (row['diag_1'] >= 710 and row['diag_1'] <= 739):
        train_data_2.loc[index, 'diag1_collapse'] = 'muscul'
        
    elif (row['diag_1'] >= 580 and row['diag_1'] <= 629) | (np.floor(row['diag_1']) == 788):
        train_data_2.loc[index, 'diag1_collapse'] = 'genit'
        
    elif (row['diag_1'] >= 140 and row['diag_1'] <= 239):
        train_data_2.loc[index, 'diag1_collapse'] = 'neoplasms'

In [40]:
train_data_2[['diag_1','diag1_collapse']].sample(50)

Unnamed: 0,diag_1,diag1_collapse
51809,349.0,other
86227,550.0,digestive
70148,715.0,muscul
63014,440.0,other
83472,997.0,injury
97538,427.0,other
32721,204.0,neoplasms
53226,8.0,other
47909,428.0,other
92338,578.0,digestive


In [41]:
two_dum_tree = pd.get_dummies(train_data_2[['diag1_collapse']])
two=list(two_dum_tree.columns)
two_dum_linear=two_dum_tree.drop('diag1_collapse_diabetes', axis=1)
two_l=list(two_dum_linear.columns)

### Age is divided into 3 groups

`30_60` : this age group is designated as the reference group, which means that for linear models, drop this group after onehot encoding to prevent multicollinearity 

`<30`

`60_100`

In [42]:
train_data_2['age'].value_counts()

[70-80)     17409
[60-70)     15328
[50-60)     12039
[80-90)     10959
[40-50)      6643
[30-40)      2619
[90-100)     1748
[20-30)      1085
[10-20)       526
[0-10)        152
Name: age, dtype: int64

In [43]:
train_data_2['age_bin'] = pd.Series(len(train_data_2['age']), index=train_data_2.index)
train_data_2['age_bin']= '60+'
train_data_2.loc[(train_data_2['age']== '[0-10)')|(train_data_2['age']== '[10-20)')|(train_data_2['age']=='[20-30)'),'age_bin'] = '<30'
train_data_2.loc[(train_data_2['age']== '[30-40)')|(train_data_2['age']== '[40-50)')|(train_data_2['age']== '[50-60)'), 'age_bin'] ='30_60' 

In [44]:
three_dum_tree = pd.get_dummies(train_data_2[['age_bin']])
three=list(three_dum_tree.columns)
three_dum_linear = three_dum_tree.drop('age_bin_30_60', axis=1)
three_l=list(three_dum_linear.columns)

### Medical Specialty represents the speciality of the admitting physician 

There are six groups represented by the paper: however from the data, there are 71 different groups. Below are the top 20 groups. Here we categorize the groups as the following:

`Missing`: ? 

`Inter_Med`: InternalMedicine

`GP`: Family/GeneralPractice 

`ER_Trauma`: Emergency/Trauma

`Cardiology`: Cardiology

`Surgery`: Surgery-General, Surgery-Cardio/Thoracic, Surgery Neuro, Surgery Vascular

`Other`: Catch all of the rest 

In [None]:
train_data_2['medical_specialty'].value_counts().head(20)

In [45]:
def conv_ms_cat(ms_cat):
    if ms_cat == '?': 
        result = 'missing'
    elif ms_cat == 'InternalMedicine':
        result = 'inter_med'
    elif ms_cat== 'Family/GeneralPractice':
        result = 'gp'
    elif ms_cat ==  'Emergency/Trauma':
        result = 'ER_trauma'
    elif ms_cat ==  'Cardiology':
        result = 'cardiology'
    elif (ms_cat ==  'Surgery-General') | (ms_cat== 'Surgery-Cardiovascular/Thoracic')| (ms_cat== 'Surgery-Neuro')| (ms_cat=='Surgery-Vascular'):
        result = 'surgery'
    else:
        result = 'other'
    return result

In [46]:
train_data_2['medical_specialty_short']=train_data_2['medical_specialty'].apply(lambda x: conv_ms_cat(x))

In [49]:
train_data_2[['medical_specialty_short', 'medical_specialty']].sample(50)

Unnamed: 0,medical_specialty_short,medical_specialty
12232,cardiology,Cardiology
16499,other,PhysicalMedicineandRehabilitation
62944,other,Radiologist
30407,inter_med,InternalMedicine
74699,missing,?
55794,missing,?
55633,other,Psychiatry
100656,missing,?
4788,missing,?
393,gp,Family/GeneralPractice


In [47]:
four_dum_tree = pd.get_dummies(train_data_2[['medical_specialty_short']])
four=list(four_dum_tree.columns)
four_dum_linear= four_dum_tree.drop('medical_specialty_short_cardiology', axis=1)
four_l=list(four_dum_linear.columns)

### Race is divided into four groups in the paper, here we'll divide into 5 groups 

AfricanAmerican 

Asian 

Caucasian 

Hispanic

Other 

In [None]:
train_data_2['race'].value_counts()

In [50]:
five_dum_tree= pd.get_dummies(train_data_2[['race']])
five=five_dum_tree.columns
five_dum_linear=five_dum_tree.drop('race_AfricanAmerican', axis=1)
five_l=five_dum_linear.columns

### Discharge is divided into two groups: 

We observe that 1 is the most prevalent, and 1 indicates that the patient is discharged to home, and there are only two groups, either discharge to home or not 

`discharge_other`: is a binary variable 

In [None]:
train_data_2['discharge_disposition_id'].value_counts()

In [51]:
train_data_2['discharge_other'] = pd.Series(len(train_data_2['discharge_disposition_id']), index=train_data_2.index)
train_data_2['discharge_other']= 0
train_data_2.loc[train_data_2['discharge_disposition_id']!= 1 ,'discharge_other'] = 1

In [64]:
#check with a comparison between discharge_disposition_id and discharge_other, make sure they match 
train_data_2[['discharge_disposition_id', 'discharge_other']].head()

Unnamed: 0,discharge_disposition_id,discharge_other
0,25,1
1,1,0
2,1,0
3,1,0
4,1,0


In [52]:
six_dum_tree= train_data_2[['discharge_other']]
six= six_dum_tree.columns
six_dum_linear=six_dum_tree
six_l= six_dum_tree.columns

### Admission Source ID is divided into three groups in the paper

But in the paper, the significant coefficient is on the "other" subcategory, so here we further divide the "other" group into 

In [55]:
idmapping = pd.read_csv('raw data/admission_source_id.csv')

In [57]:
train_data_2[train_data_2['admission_source_id']==1]

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,...,metformin-pioglitazone,change,diabetesMed,readmitted,readmitted_yes,HbA1c,diag1_collapse,age_bin,medical_specialty_short,discharge_other
0,2278392,8222157,Caucasian,Female,[0-10),6,25,1,1,Pediatrics-Endocrinology,...,No,No,No,NO,0,,diabetes,<30,other,1
23,183930,107400762,Caucasian,Female,[80-90),2,6,1,11,?,...,No,No,No,>30,0,,other,60+,missing,1
27,248916,115196778,Caucasian,Female,[50-60),1,1,1,2,Surgery-General,...,No,No,Yes,>30,0,,injury,30_60,surgery,0
42,421194,96435585,Caucasian,Female,[70-80),2,1,1,13,?,...,No,Ch,Yes,>30,0,,other,60+,missing,0
73,927786,60679647,Caucasian,Female,[70-80),3,1,1,14,Surgery-General,...,No,Ch,Yes,NO,0,,neoplasms,60+,surgery,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101740,443778398,134647673,Caucasian,Male,[40-50),3,1,1,1,?,...,No,Ch,Yes,NO,0,,other,30_60,missing,0
101745,443797076,183766055,Caucasian,Male,[50-60),2,1,1,3,?,...,No,Ch,Yes,NO,0,High_CH,other,30_60,missing,0
101749,443816024,106392411,Caucasian,Female,[70-80),3,6,1,3,Orthopedics,...,No,Ch,Yes,NO,0,Norm,muscul,60+,other,1
101751,443835140,175326800,Caucasian,Male,[70-80),3,6,1,13,?,...,No,Ch,Yes,NO,0,Norm,other,60+,missing,1


In [59]:
idmapping[idmapping['description'].str.contains(r'Transfer', na=False)] # Note that here that admission_type_id is string instead of int

Unnamed: 0,admission_source_id,description
3,4,Transfer from a hospital
4,5,Transfer from a Skilled Nursing Facility (SNF)
5,6,Transfer from another health care facility
9,10,Transfer from critial access hospital
16,18,Transfer From Another Home Health Agency
20,22,Transfer from hospital inpt/same fac reslt in...
23,25,Transfer from Ambulatory Surgery Center
24,26,Transfer from Hospice


In [60]:
# Convert admission source ID to category groups
def conv_adms_id(sid):
    try: 
        if  (sid == 1) | (sid ==2) | (sid==3): 
            result = 'referral'
        elif  sid == 7:
            result = 'emergency'
        elif (sid == 4) | (sid==5)|(sid==6)|(sid==10)| (sid==18)| (sid==22)| (sid==25)| (sid==26) :
            result = 'transfer'
        else:
            result = 'other'
        return result
    except ValueError:
        print('Source ID must be integer!')

In [61]:
train_data_2['admission_source_collapse']=train_data_2['admission_source_id'].apply(lambda x: conv_adms_id(x))

In [62]:
eight_dum_tree= pd.get_dummies(train_data_2[['admission_source_collapse']])
eight_dum_linear= eight_dum_tree.drop('admission_source_collapse_emergency', axis=1)

## Putting together the training_data set 

In [65]:
all_dum_linear= pd.concat([train_data_2[['encounter_id']],one_dum_linear, two_dum_linear, three_dum_linear, four_dum_linear, 
                           five_dum_linear,six_dum_linear, eight_dum_linear], axis=1, sort=False)

all_dum_tree= pd.concat([train_data_2[['encounter_id']], one_dum_tree, two_dum_tree, three_dum_tree, four_dum_tree, 
                         five_dum_tree, six_dum_tree, eight_dum_tree], axis=1, sort=False)

In [None]:
all_dum_linear.columns

In [67]:
len(all_dum_tree.columns)

34

#### Concatenating all the continuous variables together 

In [81]:
all_features = pd.concat([all_dum_linear, train_data3, train_data_2[['readmitted_yes']]], axis=1, sort=False)

In [82]:
all_features.columns

Index(['encounter_id', 'HbA1c_High_CH', 'HbA1c_High_NoCH', 'HbA1c_Norm',
       'diag1_collapse_circulatory', 'diag1_collapse_digestive',
       'diag1_collapse_genit', 'diag1_collapse_injury',
       'diag1_collapse_muscul', 'diag1_collapse_neoplasms',
       'diag1_collapse_other', 'diag1_collapse_respiratory', 'age_bin_60+',
       'age_bin_<30', 'medical_specialty_short_ER_trauma',
       'medical_specialty_short_gp', 'medical_specialty_short_inter_med',
       'medical_specialty_short_missing', 'medical_specialty_short_other',
       'medical_specialty_short_surgery', 'race_Asian', 'race_Caucasian',
       'race_Hispanic', 'race_Other', 'discharge_other',
       'admission_source_collapse_other', 'admission_source_collapse_referral',
       'admission_source_collapse_transfer', 'time_in_hospital',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_diagnoses', 'number_outpatient_log1p', 'number_emergency_log1p',
       'number_inpatient_log1p', 'readmi

In [83]:
all_features.to_csv('all_features.csv', index=False)

In [None]:
import itertools
column1=['one_l', 'three_l', 'two', 'five_l', 'four_l']
column2=['two_l', 'four_l', 'six', 'six', 'six']

#for col1, col2 in zip(column1, column2):
for r in itertools.product(one_l, two_l):
    all_dum_linear[r[0]+'_'+ r[1]]= one_dum_linear[r[0]]*two_dum_linear[r[1]]
    
for r in itertools.product(three_l, four_l):
    all_dum_linear[r[0]+'_'+ r[1]]= three_dum_linear[r[0]]*four_dum_linear[r[1]]
    
for r in itertools.product(two, six):
    all_dum_linear[r[0]+'_'+ r[1]]= two_dum_tree[r[0]]*six_dum_linear[r[1]]
    
for r in itertools.product(five_l, six):
    all_dum_linear[r[0]+'_'+ r[1]]= five_dum_linear[r[0]]*six_dum_linear[r[1]]
    
for r in itertools.product(four_l, six):
    all_dum_linear[r[0]+'_'+ r[1]]= four_dum_linear[r[0]]*six_dum_linear[r[1]]

In [None]:
all_dum_linear.to_csv('interaction_feat_kc.csv',index=True)

In [None]:
all_dum_linear.columns

In [None]:
interaction_terms_additional = [('num_medications','time_in_hospital'),
('num_medications','num_procedures'),
('time_in_hospital','num_lab_procedures'),
('num_medications','num_lab_procedures'),
('num_medications','number_diagnoses'),
('age','number_diagnoses'),
('change','num_medications'),
('number_diagnoses','time_in_hospital'),
('num_medications','numchange')]

## Visualizations Variable counts for readmitted_yes vs readmitted_no 

In [None]:
plt.figure(figsize=(15, 60))
for idx, col in enumerate(nom_var[0:3]):
    sns.catplot(y="readmitted_yes", hue=nom_var[idx], kind= "count", palette="muted", data=train_data_nom_var)

## Here we want to examine the mean probability of readmission among the groups and subgroups 

In [None]:
plt.figure(figsize=(16, 75))
for idx, col in enumerate(nom_var[0:3]):
    #plt.subplot(18, 2, idx + 1)
    sns.catplot(x=col, y="readmitted_yes", hue= "diabetesMed", data=train_data_nom_var, kind="bar")