In [1]:
import pandas as pd
import numpy as np

In [2]:
# preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import RidgeClassifierCV
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Performance evaluation
from sklearn.metrics import accuracy_score, precision_score, f1_score, confusion_matrix, recall_score

In [104]:
data = pd.read_csv('./Healthcare-Diabetes.csv')

In [105]:
data.head(5)

Unnamed: 0,Id,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,1,6,148,72,35,0,33.6,0.627,50,1
1,2,1,85,66,29,0,26.6,0.351,31,0
2,3,8,183,64,0,0,23.3,0.672,32,1
3,4,1,89,66,23,94,28.1,0.167,21,0
4,5,0,137,40,35,168,43.1,2.288,33,1


In [107]:
data.drop('Id', axis = 1, inplace = True)

In [109]:
data.shape

(2768, 9)

In [14]:
X = data.iloc[:,:-1]

In [15]:
y = data.Outcome

In [16]:
X = X.to_numpy()
y = y.to_numpy()

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [38]:
print(f'X_train shape : {X_train.shape}')
print(f'X_test shape  : {X_test.shape}')
print(f'y_train shape : {y_train.shape}')
print(f'y_test shape  : {y_test.shape}')

X_train shape : (2214, 8)
X_test shape  : (554, 8)
y_train shape : (2214,)
y_test shape  : (554,)


In [19]:
std = StandardScaler()
X_train_scaled = std.fit_transform(X_train)
X_test_scaled = std.transform(X_test)

In [22]:
# Creating performance dataframe
training_performance = pd.DataFrame(columns = ['model_name', 'std_accuracy', 'std_precision', 'std_f1_score', 'std_recall'])
testing_performance = pd.DataFrame(columns = ['model_name', 'std_accuracy', 'std_precision', 'std_f1_score', 'std_recall'])

In [23]:
models = {
    'SGDClassifier' : SGDClassifier(),
    'RidgeClassifier' : RidgeClassifier(),
    'RidgeClassifierCV' : RidgeClassifierCV(),    
    'LogisticRegression' : LogisticRegression(max_iter = 2000),
    'LogisticRegressionCV' : LogisticRegressionCV(max_iter = 2000),    
    'DecisionTreeClassifier' : DecisionTreeClassifier(),
    'RandomForestClassifier' : RandomForestClassifier(),
}

In [24]:
for name, model in models.items():
    instance = model
    # Training
    instance.fit(X_train_scaled, y_train)
    
    # Prediction
    y_pred_train = instance.predict(X_train_scaled)
    y_pred_test = instance.predict(X_test_scaled)
        
    # Training Evaluation
    training_accuracy  = accuracy_score(y_train, y_pred_train) * 100
    training_precision = precision_score(y_train, y_pred_train) * 100
    training_f1_score  = f1_score(y_train, y_pred_train) * 100
    training_recall    = recall_score(y_train, y_pred_train) * 100

    # Testing Evaluaion
    testing_accuracy  = accuracy_score(y_test, y_pred_test) * 100
    testing_precision = precision_score(y_test, y_pred_test) * 100
    testing_f1_score  = f1_score(y_test, y_pred_test) * 100
    testing_recall    = recall_score(y_test, y_pred_test) * 100

    # Populating Dataframe
    training_evaluation = pd.DataFrame.from_dict(data = {
                                                            'model_name' : [name],
                                                            'std_accuracy' : [training_accuracy],
                                                            'std_precision' : [training_precision],
                                                            'std_f1_score' : [training_f1_score],
                                                            'std_recall' : [training_recall]
                                                        })
    
    testing_evaluation  = pd.DataFrame(data = {
                                                    'model_name' : [name],
                                                    'std_accuracy' : [testing_accuracy],
                                                    'std_precision' : [testing_precision],
                                                    'std_f1_score' : [testing_f1_score],
                                                    'std_recall' : [testing_recall]
                                                })

    training_performance = pd.concat([training_performance, training_evaluation], ignore_index = True)
    testing_performance  = pd.concat([testing_performance, testing_evaluation], ignore_index = True)

  training_performance = pd.concat([training_performance, training_evaluation], ignore_index = True)
  testing_performance  = pd.concat([testing_performance, testing_evaluation], ignore_index = True)


In [25]:
training_performance

Unnamed: 0,model_name,std_accuracy,std_precision,std_f1_score,std_recall
0,SGDClassifier,77.190605,77.398721,58.976442,47.637795
1,RidgeClassifier,78.590786,75.263158,64.414414,56.299213
2,RidgeClassifierCV,78.455285,75.132275,64.108352,55.905512
3,LogisticRegression,78.319783,74.143836,64.338782,56.824147
4,LogisticRegressionCV,78.229449,74.054983,64.136905,56.56168
5,DecisionTreeClassifier,100.0,100.0,100.0,100.0
6,RandomForestClassifier,100.0,100.0,100.0,100.0


In [26]:
testing_performance

Unnamed: 0,model_name,std_accuracy,std_precision,std_f1_score,std_recall
0,SGDClassifier,75.270758,73.451327,54.785479,43.684211
1,RidgeClassifier,76.895307,71.232877,61.904762,54.736842
2,RidgeClassifierCV,76.714801,71.034483,61.492537,54.210526
3,LogisticRegression,77.256318,71.333333,62.941176,56.315789
4,LogisticRegressionCV,77.075812,71.14094,62.536873,55.789474
5,DecisionTreeClassifier,99.097473,97.927461,98.694517,99.473684
6,RandomForestClassifier,99.638989,100.0,99.470899,98.947368


- Overall RandomForestClassifier is performing well