<a href="https://colab.research.google.com/github/KendallScott/QTW/blob/main/Case%20Study%202/Case_Study_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import label_binarize, OneHotEncoder

In [14]:
df = pd.read_csv("https://raw.githubusercontent.com/KendallScott/QTW/main/Case%20Study%202/diabetic_data.csv")
ID_map = pd.read_csv("https://raw.githubusercontent.com/KendallScott/QTW/main/Case%20Study%202/IDs_mapping.csv")

admission_source = ID_map.drop(['admission_description', 'discharge_description'], axis=1)
admission = ID_map.drop(['admission_source_description', 'discharge_description'], axis=1)
discharge = ID_map.drop(['admission_description', 'admission_source_description'], axis=1)

df=pd.merge(df, admission, 
            left_on='admission_type_id', 
            right_on='id', 
            how='left' 
)


df=pd.merge(df, discharge, 
            left_on='discharge_disposition_id', 
            right_on='id', 
            how='left' 
)
df=pd.merge(df, admission_source, 
            left_on='admission_source_id', 
            right_on='id', 
            how='left' 
)

df = df[df.columns.drop(list(df.filter(regex='_x')))]
df = df[df.columns.drop(list(df.filter(regex='_y')))]

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 101766 entries, 0 to 101765
Data columns (total 54 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   encounter_id                  101766 non-null  int64 
 1   patient_nbr                   101766 non-null  int64 
 2   race                          101766 non-null  object
 3   gender                        101766 non-null  object
 4   age                           101766 non-null  object
 5   weight                        101766 non-null  object
 6   admission_type_id             101766 non-null  int64 
 7   discharge_disposition_id      101766 non-null  int64 
 8   admission_source_id           101766 non-null  int64 
 9   time_in_hospital              101766 non-null  int64 
 10  payer_code                    101766 non-null  object
 11  medical_specialty             101766 non-null  object
 12  num_lab_procedures            101766 non-null  int64 
 13 

Upon first inspection there do not appear to be any NAs

In [15]:
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,admission_description,discharge_description,id,admission_source_description
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,NO,,Not Mapped,1,Physician Referral
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,No,No,Ch,Yes,>30,Emergency,Discharged to home,7,Emergency Room
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,Yes,NO,Emergency,Discharged to home,7,Emergency Room
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,No,No,Ch,Yes,NO,Emergency,Discharged to home,7,Emergency Room
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,No,No,Ch,Yes,NO,Emergency,Discharged to home,7,Emergency Room


The weight column has a question mark (?), corresponding to missing weights, these are effectively NAs.
Also, target is not binary in this case, however, our aim is to predict rehospitalization within 30 days, a binary (yes or no) response.

We replaced the "?" values with NaN.

In [16]:
# Replacing the ? with nan
df=df.replace("?", np.nan)

In [17]:
# Get count duplicates single column using dataframe.pivot_table()
df2 = df.pivot_table(index = ['patient_nbr'], aggfunc ='size')
print(df2)

patient_nbr
135          2
378          1
729          1
774          1
927          1
            ..
189351095    1
189365864    1
189445127    1
189481478    1
189502619    1
Length: 71518, dtype: int64


When we look at the data by patient number, we can see that the number of rows drops from 101,766 down to 71,518. We will be dropping the repeated values.

In [18]:
#sorting the dataframe by patient_nbr and encounter_id, in order to filter out the repeat values 
df=df.sort_values(by=['patient_nbr', 'encounter_id'])

#filtering out repeat values by patient_nbr
df=df.groupby('patient_nbr').first()
len(df)

71518

In [19]:
for i in df.columns:
  count = df.loc[df[i]=='?',i].count()
  if count > 0:
    print('{}:'.format(i),
          '{}%'.format(np.round(np.divide(count,len(df))*100,2)))

Three variables have a large percentage of missing values, Weight, Payer_code and Medical_speciatly with about 97%, 40% and 49% of missing values accordingly. Race and diags 1-3 all have a smaller number of missing values.

In [50]:
df.describe()

Unnamed: 0,encounter_id,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,id
count,71518.0,71518.0,71518.0,71518.0,71518.0,71518.0,71518.0,71518.0,71518.0,71518.0,71518.0,71518.0,71518.0
mean,156815600.0,2.100254,3.593235,5.657457,4.28913,43.075478,1.430577,15.705025,0.280069,0.10354,0.177829,7.2457,5.657457
std,100376900.0,1.508301,5.269771,4.16258,2.94921,19.952338,1.759864,8.311163,1.068957,0.509187,0.60379,1.994674,4.16258
min,12522.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
25%,81458070.0,1.0,1.0,1.0,2.0,31.0,0.0,10.0,0.0,0.0,0.0,6.0,1.0
50%,144057400.0,1.0,1.0,7.0,3.0,44.0,1.0,14.0,0.0,0.0,0.0,8.0,7.0
75%,215551300.0,3.0,3.0,7.0,6.0,57.0,2.0,20.0,0.0,0.0,0.0,9.0,7.0
max,443867200.0,8.0,28.0,25.0,14.0,132.0,6.0,81.0,42.0,42.0,12.0,16.0,25.0


In [58]:
df

Unnamed: 0_level_0,encounter_id,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,...,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,admission_description,discharge_description,id,admission_source_description
patient_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
135,24437208,Caucasian,Female,[50-60),,2,1,1,8,,...,No,No,No,Ch,Yes,<30,Urgent,Discharged to home,1,Physician Referral
378,29758806,Caucasian,Female,[50-60),,3,1,1,2,,...,No,No,No,No,No,NO,Elective,Discharged to home,1,Physician Referral
729,189899286,Caucasian,Female,[80-90),,1,3,7,4,MC,...,No,No,No,No,Yes,NO,Emergency,Discharged/transferred to SNF,7,Emergency Room
774,64331490,Caucasian,Female,[80-90),,1,1,7,3,,...,No,No,No,Ch,Yes,NO,Emergency,Discharged to home,7,Emergency Room
927,14824206,AfricanAmerican,Female,[30-40),,1,1,7,5,,...,No,No,No,No,Yes,NO,Emergency,Discharged to home,7,Emergency Room
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189351095,418513058,Caucasian,Female,[80-90),,1,1,7,1,,...,No,No,No,No,No,NO,Emergency,Discharged to home,7,Emergency Room
189365864,359719064,Other,Male,[60-70),,1,1,7,3,HM,...,No,No,No,No,Yes,NO,Emergency,Discharged to home,7,Emergency Room
189445127,338462954,Caucasian,Female,[80-90),,1,1,7,3,,...,No,No,No,Ch,Yes,NO,Emergency,Discharged to home,7,Emergency Room
189481478,443811536,Caucasian,Female,[40-50),,1,4,7,14,MD,...,No,No,No,Ch,Yes,>30,Emergency,Discharged/transferred to ICF,7,Emergency Room


In [61]:
df[['readmitted','discharge_description']].groupby('discharge_description').describe()

Unnamed: 0_level_0,readmitted,readmitted,readmitted,readmitted
Unnamed: 0_level_1,count,unique,top,freq
discharge_description,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Admitted as an inpatient to this hospital,9,3,NO,6
Discharged to home,44485,3,NO,27159
Discharged/transferred to ICF,543,3,NO,319
Discharged/transferred to SNF,8836,3,NO,4853
Discharged/transferred to a federal health care facility.,3,1,NO,3
Discharged/transferred to a long term care hospital.,260,3,NO,173
Discharged/transferred to a nursing facility certified under Medicaid but not certified under Medicare.,25,3,NO,20
Discharged/transferred to another rehab fac including rehab units of a hospital .,1428,3,NO,702
Discharged/transferred to another short term hospital,1539,3,NO,889
Discharged/transferred to another type of inpatient care institution,913,3,NO,476


In [62]:
#filtering out expired and hospice patients
df = df[df["discharge_description"] != 'Expired']
df = df[df["discharge_description"] != 'Expired at home. Medicaid only, hospice.']
df = df[df["discharge_description"] != 'Hospice / home']
df = df[df["discharge_description"] != 'Hospice / medical facility']

In [59]:
df[['readmitted','admission_description']].groupby('admission_description').describe()

Unnamed: 0_level_0,readmitted,readmitted,readmitted,readmitted
Unnamed: 0_level_1,count,unique,top,freq
admission_description,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Elective,13979,3,NO,9042
Emergency,36702,3,NO,21778
Newborn,9,3,NO,6
Not Available,3209,3,NO,1870
Not Mapped,291,3,NO,201
Trauma Center,21,1,NO,21
Urgent,13076,3,NO,7806


Admission code 1-Emergency, 2-Urgent, 3-Elective, 4-Newborn, 5-NotAvailable, 6-NULL, 7-Trauma Center, 8-NotMapped.
For Not Mapped and Null admission categories, the missing payer code is seen on almost all patients. Based on the levels of the payer code category, it would seem as if the missing value corresponds to patients who did not disclose a payment method (insurance or self-pay), which in itself, is a category level.

In [21]:
df[['medical_specialty','admission_description']].groupby('admission_description').describe()

Unnamed: 0_level_0,medical_specialty,medical_specialty,medical_specialty,medical_specialty
Unnamed: 0_level_1,count,unique,top,freq
admission_description,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Elective,8995,54,Cardiology,1318.0
Emergency,15986,59,InternalMedicine,7299.0
Newborn,2,2,InternalMedicine,1.0
Not Available,1245,22,Family/GeneralPractice,490.0
Not Mapped,258,12,InternalMedicine,86.0
Trauma Center,0,0,,
Urgent,8969,57,Emergency/Trauma,2364.0


In [28]:
len(df)

71518

In [22]:
df.readmitted.value_counts()
# >30 does not count as within 30 days for our target, will replace

NO     42985
>30    22240
<30     6293
Name: readmitted, dtype: int64

In [63]:
y = df.readmitted.replace(to_replace='>30',value='NO')
y = label_binarize(y,classes=['NO','<30'])

In [66]:
# Make dataframe for model input
X = df.loc[:, df.columns != 'readmitted']

#remove repeated categorical columns for description, and discharge column, which has some values of the predictor
X = X[X.columns.drop(list(X.filter(regex='discharge')))]
X = X[X.columns.drop(list(X.filter(regex='_description')))]

#one hot encoding categorical variables for model
cols = X.columns
num_cols = X._get_numeric_data().columns
cat_cols = list(set(cols) - set(num_cols))
cat_cols

#creating dataframe of categorical columns
cat_df = X[cat_cols]
cat_df = pd.get_dummies(cat_df, columns=cat_df.columns)

#creating dataframe of numeric columns
num_df = X[num_cols]

X=cat_df.join(num_df)


X


Unnamed: 0_level_0,tolbutamide_No,tolbutamide_Steady,pioglitazone_Down,pioglitazone_No,pioglitazone_Steady,pioglitazone_Up,nateglinide_Down,nateglinide_No,nateglinide_Steady,nateglinide_Up,...,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,id
patient_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
135,1,0,0,1,0,0,0,1,0,0,...,1,8,77,6,33,0,0,0,8,1
378,1,0,0,1,0,0,0,1,0,0,...,1,2,49,1,11,0,0,0,3,1
729,1,0,0,1,0,0,0,1,0,0,...,7,4,68,2,23,0,0,0,9,7
774,1,0,0,1,0,0,0,1,0,0,...,7,3,46,0,20,0,0,0,9,7
927,1,0,0,1,0,0,0,1,0,0,...,7,5,49,0,5,0,0,0,3,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189351095,1,0,0,1,0,0,0,1,0,0,...,7,1,73,1,11,0,0,0,9,7
189365864,1,0,0,1,0,0,0,1,0,0,...,7,3,56,1,8,0,0,0,7,7
189445127,1,0,0,1,0,0,0,1,0,0,...,7,3,39,0,18,0,0,0,9,7
189481478,1,0,0,1,0,0,0,1,0,0,...,7,14,69,0,16,0,0,0,5,7


In [67]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

X_train, X_val, y_train, y_val = train_test_split(
     X, y, random_state=0)

model = LogisticRegression().fit(X_train, y_train)
model.score(X_val, y_val)

  y = column_or_1d(y, warn=True)


0.9116688582699675