In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)
import seaborn as sns
import matplotlib.pyplot as plt
import re
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('data_cleaned.csv')

In [3]:
data.head(60)

Unnamed: 0,gender,age,max_glu_serum,A1Cresult,admission_type_id,discharge_disposition_id,admission_source_id,medical_specialty,diag_1,diag_2,diag_3,change,diabetesMed,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,readmitted


In [4]:
sns.countplot(data=data, x="age", hue="readmitted")

ValueError: min() arg is an empty sequence

In [None]:
data = data[data['age']=='6']

In [None]:
data.shape

In [None]:
numerical = data[['time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications','number_outpatient', 'number_emergency',
       'number_inpatient']]

In [None]:
categorical = data[['gender', 'age','max_glu_serum', 'A1Cresult',  'admission_type_id', 'discharge_disposition_id',
       'admission_source_id', 'medical_specialty', 'diag_1', 'diag_2',
       'diag_3',  'change', 'diabetesMed']]

In [None]:
target = data[['readmitted']]

In [None]:
# categorical.to_csv('categorical.csv', index = False)
# numerical.to_csv('numerical.csv', index = False)
# target.to_csv('target.csv',index=False)

In [None]:
for col in categorical.columns:
    categorical[col] = categorical[col].astype('object')

In [None]:
ordinal_cat = ['age', 'max_glu_serum', 'A1Cresult']
one_hot_cat = ['medical_specialty','admission_type_id', 'discharge_disposition_id',
       'admission_source_id','diag_1','diag_2','diag_3','diabetesMed']

In [None]:
categorical['max_glu_serum'].value_counts()

In [None]:
categorical['A1Cresult'].value_counts()

In [None]:
scale_mapper = {"[0-10)":1, "[10-20)":2, "[20-30)":3,"[30-40)":4,"[40-50)":5,"[50-60)":6,"[60-70)":7,"[70-80)":8,"[80-90)":9,"[90-100)":10}
categorical["age"] = categorical["age"].replace(scale_mapper)

In [None]:
scale_mapper = {"Not_Performed":0,"Norm":1, "High":2}
categorical["max_glu_serum"] = categorical["max_glu_serum"].replace(scale_mapper)

In [None]:
scale_mapper = {"Not_Performed":0,"Norm":1, "High":2}
categorical["A1Cresult"] = categorical["A1Cresult"].replace(scale_mapper)

In [None]:
data = pd.concat((categorical,numerical,target),axis=1)
print(data.shape)
data.head()

In [None]:
data.to_csv('data_cleaned.csv', index = False)

## Models

In [None]:
# X/y split
X = data.drop('readmitted',axis = 1)
y = data['readmitted']

In [None]:
# train-test-split 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2, random_state=0) 

In [None]:
#splitting into categorical and numerical
X_train_num = X_train.select_dtypes(include = np.number)
X_train_cat = X_train.select_dtypes(include = object)

X_test_num = X_test.select_dtypes(include = np.number)
X_test_cat = X_test.select_dtypes(include = object)

In [None]:
X_train_cat

In [None]:
X_train_num

In [None]:
#scale numericals
from sklearn.preprocessing import MinMaxScaler

transformer = MinMaxScaler().fit(X_train_num) 
numericals_train_scaled = transformer.transform(X_train_num)
numericals_test_scaled = transformer.transform(X_test_num)

Encode categoricals

In [None]:
#encode categoricals
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(handle_unknown='ignore', drop='first').fit(X_train_cat)
categoricals_train_encoded = encoder.transform(X_train_cat).toarray()
categoricals_test_encoded = encoder.transform(X_test_cat).toarray()

In [None]:
X_train = np.concatenate((numericals_train_scaled,categoricals_train_encoded),axis=1)
X_test = np.concatenate((numericals_test_scaled,categoricals_test_encoded),axis=1)
X_train = pd.DataFrame(X_train)
X_train.head()


In [None]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression(random_state=0, solver='saga')
LR.fit(X_train, y_train)
LR.score(X_test, y_test) 

In [None]:
pred = LR.predict(X_test)
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,pred)

In [None]:
from sklearn.metrics import plot_confusion_matrix
plot_confusion_matrix(LR, X_test, y_test,cmap=plt.cm.Blues)  
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

clf = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20,
                             max_samples=0.2,
                             random_state = 42)
clf.fit(X_train, y_train)
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

y_pred = clf.predict(X_test)
display(y_test.value_counts())
display(confusion_matrix(y_test, y_pred))

In [None]:
from sklearn.metrics import plot_confusion_matrix
plot_confusion_matrix(clf, X_test, y_test,cmap=plt.cm.Blues)  
plt.show()

In [None]:
#complete the code here
model1 = clf
model2 = LR


model_pipeline = [model1, model2]
model_names = ['RandomForestClassifier', 'Logistic Regression']
scores = {}
for model, model_name in zip(model_pipeline, model_names):
    mean_score = np.mean(cross_val_score(model, X_train, y_train, cv=10))
    scores[model_name] = mean_score
print(scores)