In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

In [2]:
dataset = pd.read_csv('diabetic_data.csv')
dataset.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [3]:
from sklearn.preprocessing import LabelEncoder

diag_list = ['diag_1', 'diag_2', 'diag_3']

for item in diag_list:
    labelencoder = LabelEncoder()
    labelencoder.fit(dataset[item])
    dataset[item] = labelencoder.fit_transform(dataset[item])
    non_value = labelencoder.transform(['?'])
    dataset[item].replace(non_value, np.nan, inplace=True)

In [4]:
dataset.replace('?', np.nan, inplace=True)

In [5]:
replacement_dict = dict.fromkeys(dataset['weight'].unique())
replacement_dict.pop(np.nan)
for index, item in enumerate(replacement_dict):
    replacement_dict[item] = index
dataset['weight'].replace(replacement_dict, inplace=True)

In [6]:
gender_replacements = {
    'Female': 0,
    'Male': 1,
    'Unknown/Invalid': np.nan
}
dataset['gender'].replace(gender_replacements, inplace=True)

In [7]:
dataset['race'].unique()
race_replacements = {
    "Caucasian": 0,
    "AfricanAmerican": 1,
    "Hispanic": 2,
    "Other": 3,
    "Asian": 4
}
dataset['race'].replace(race_replacements, inplace=True)

In [8]:
# trying to break the ages into risk groups
age_replacements = {
    '[0-10)': 0,
    '[10-20)': 0,
    '[20-30)': 0,
    '[30-40)': 1,
    '[40-50)': 1,
    '[50-60)': 2,
    '[60-70)': 2,
    '[70-80)': 3,
    '[80-90)': 3,
    '[90-100)': 4
}
dataset['age'].replace(age_replacements, inplace=True)

In [9]:
readmitted_replacements = {
  ">30": 0,
  "<30": 1,
  "NO": 0
}
dataset['readmitted'].replace(readmitted_replacements, inplace=True)

In [10]:
medicine_replacements = {
    'No': 0,
    'Yes': 1,
    'Ch': 1,
    'Steady': 1,
    'Down': 2,
    'Up': 3
}
medicine_cols = dataset.iloc[:, 24:49]
for col in medicine_cols:
    dataset[col].replace(medicine_replacements, inplace=True)

In [11]:
misc_replacements = {
    'None': 0,
    '>300': 1,
    '>7': 1,
    'Norm': 2,
    '>200': 3,
    '>8': 3
}

for col in ['max_glu_serum', 'A1Cresult']:
    dataset[col].replace(misc_replacements, inplace=True)

In [12]:
dataset.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,0.0,0.0,0,,6,25,1,1,...,0,0,0,0,0,0,0,0,0,0
1,149190,55629189,0.0,0.0,0,,1,1,7,3,...,0,3,0,0,0,0,0,1,1,0
2,64410,86047875,1.0,0.0,0,,1,1,7,2,...,0,0,0,0,0,0,0,0,1,0
3,500364,82442376,0.0,1.0,1,,1,1,7,2,...,0,3,0,0,0,0,0,1,1,0
4,16680,42519267,0.0,1.0,1,,1,1,7,1,...,0,1,0,0,0,0,0,1,1,0


In [13]:
column_choices = list(range(3, 6)) + [9] + list(range(12, 49))
X = dataset.iloc[:, column_choices]
Y = dataset.iloc[:, 49].values

[[0.0 0 nan ... 0 0 0]
 [0.0 0 nan ... 0 1 1]
 [0.0 0 nan ... 0 0 1]
 ...
 [1.0 3 nan ... 0 1 1]
 [0.0 3 nan ... 0 1 1]
 [1.0 3 nan ... 0 0 0]]


In [14]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X = imputer.fit_transform(X)

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.1, random_state = 0)

In [16]:
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()
X_train = min_max_scaler.fit_transform(X_train)
X_test = min_max_scaler.transform(X_test)

from sklearn.preprocessing import StandardScaler
standard_scaler = StandardScaler()
X_train = standard_scaler.fit_transform(X_train)
X_test = standard_scaler.transform(X_test)

In [None]:
def calculate_specificity(true_negative, false_positive):
    return true_negative / (true_negative + false_positive)

def calculate_sensitivity(true_positive, false_positive):
    return true_positive / (true_positive + false_positive)

In [17]:
#Fitting the Logistic Regression Algorithm to the Training Set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, Y_train)
#predicting the Test set results
Y_pred = classifier.predict(X_test)

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_test, Y_pred)
accuracy_score(Y_test, Y_pred)

0.8874914021813894

In [19]:
from sklearn.neighbors import KNeighborsClassifier
neighbor_classifier = KNeighborsClassifier(n_neighbors=5)
neighbor_classifier.fit(X_train, Y_train)
#predicting the Test set results
Y_pred = neighbor_classifier.predict(X_test)

from sklearn.metrics import confusion_matrix
print(KNN Confusion Matrix)
cm = confusion_matrix(Y_test, Y_pred)
print(cm)
knn_accuracy = accuracy_score(Y_test, Y_pred)
print(z)

0.8785496708263731


In [20]:
from sklearn import tree
tree_classifier = tree.DecisionTreeClassifier()
tree_classifier.fit(X_train, Y_train)
#predicting the Test set results
Y_pred = tree_classifier.predict(X_test)

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_test, Y_pred)
z = accuracy_score(Y_test, Y_pred)
print(z)

0.7876584455143952


In [21]:
from sklearn import tree
tree_classifier = tree.DecisionTreeClassifier()
tree_classifier.fit(X_train, Y_train)
#predicting the Test set results
Y_pred = tree_classifier.predict(X_test)

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_test, Y_pred)
z = accuracy_score(Y_test, Y_pred)
print(z)

0.7931610494251744


In [22]:
from sklearn.ensemble import RandomForestClassifier
forest_classifer = RandomForestClassifier(max_depth=2, random_state=0)
forest_classifer.fit(X_train, Y_train)
#predicting the Test set results
Y_pred = forest_classifer.predict(X_test)

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_test, Y_pred)
z = accuracy_score(Y_test, Y_pred)
print(z)

0.888375749238479
