### Model Building without preprocessing

In [1]:
import pandas as pd
import numpy as np

In [84]:
# preprocessing
from sklearn.model_selection import train_test_split

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import RidgeClassifierCV
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Performance evaluation
from sklearn.metrics import accuracy_score, precision_score, f1_score, confusion_matrix, recall_score

In [2]:
data = pd.read_csv('./Healthcare-Diabetes.csv')

In [8]:
data.sample(5)

Unnamed: 0,Id,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
1091,1092,13,152,90,33,29,26.8,0.731,43,1
1701,1702,5,88,78,30,0,27.6,0.258,37,0
462,463,8,74,70,40,49,35.3,0.705,39,0
483,484,0,84,82,31,125,38.2,0.233,23,0
2388,2389,0,101,64,17,0,21.0,0.252,21,0


In [10]:
data.drop('Id', axis = 1, inplace = True)

In [31]:
X = data.iloc[:,:-1]

In [24]:
y = data.Outcome

In [32]:
X = X.to_numpy()
y = y.to_numpy()

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [38]:
print(f'X_train shape : {X_train.shape}')
print(f'X_test shape  : {X_test.shape}')
print(f'y_train shape : {y_train.shape}')
print(f'y_test shape  : {y_test.shape}')

X_train shape : (2214, 8)
X_test shape  : (554, 8)
y_train shape : (2214,)
y_test shape  : (554,)


In [99]:
# Creating performance dataframe
training_performance = pd.DataFrame(columns = ['model_name', 'accuracy', 'precision', 'f1_score', 'recall'])
testing_performance = pd.DataFrame(columns = ['model_name', 'accuracy', 'precision', 'f1_score', 'recall'])

In [85]:
models = {
    'SGDClassifier' : SGDClassifier(),
    'RidgeClassifier' : RidgeClassifier(),
    'RidgeClassifierCV' : RidgeClassifierCV(),    
    'LogisticRegression' : LogisticRegression(max_iter = 2000),
    'LogisticRegressionCV' : LogisticRegressionCV(max_iter = 2000),    
    'DecisionTreeClassifier' : DecisionTreeClassifier(),
    'RandomForestClassifier' : RandomForestClassifier(),
}

In [100]:
for name, model in models.items():
    instance = model
    # Training
    instance.fit(X_train, y_train)
    
    # Prediction
    y_pred_train = instance.predict(X_train)
    y_pred_test = instance.predict(X_test)
        
    # Training Evaluation
    training_accuracy  = accuracy_score(y_train, y_pred_train) * 100
    training_precision = precision_score(y_train, y_pred_train) * 100
    training_f1_score  = f1_score(y_train, y_pred_train) * 100
    training_recall    = recall_score(y_train, y_pred_train) * 100

    # Testing Evaluaion
    testing_accuracy  = accuracy_score(y_test, y_pred_test) * 100
    testing_precision = precision_score(y_test, y_pred_test) * 100
    testing_f1_score  = f1_score(y_test, y_pred_test) * 100
    testing_recall    = recall_score(y_test, y_pred_test) * 100

    # Populating Dataframe
    training_evaluation = pd.DataFrame.from_dict(data = {
                                                            'model_name' : [name],
                                                            'accuracy' : [training_accuracy],
                                                            'precision' : [training_precision],
                                                            'f1_score' : [training_f1_score],
                                                            'recall' : [training_recall]
                                                        })
    
    testing_evaluation  = pd.DataFrame(data = {
                                                    'model_name' : [name],
                                                    'accuracy' : [testing_accuracy],
                                                    'precision' : [testing_precision],
                                                    'f1_score' : [testing_f1_score],
                                                    'recall' : [testing_recall]
                                                })

    training_performance = pd.concat([training_performance, training_evaluation], ignore_index = True)
    testing_performance  = pd.concat([testing_performance, testing_evaluation], ignore_index = True)

  training_performance = pd.concat([training_performance, training_evaluation], ignore_index = True)
  testing_performance  = pd.concat([testing_performance, testing_evaluation], ignore_index = True)


In [105]:
training_performance

Unnamed: 0,model_name,accuracy,precision,f1_score,recall
0,SGDClassifier,50.948509,41.06946,57.841615,97.769029
1,RidgeClassifier,78.590786,75.263158,64.414414,56.299213
2,RidgeClassifierCV,78.455285,75.132275,64.108352,55.905512
3,LogisticRegression,78.635953,74.450085,65.04065,57.742782
4,LogisticRegressionCV,78.319783,74.143836,64.338782,56.824147
5,DecisionTreeClassifier,100.0,100.0,100.0,100.0
6,RandomForestClassifier,100.0,100.0,100.0,100.0


In [106]:
testing_performance

Unnamed: 0,model_name,accuracy,precision,f1_score,recall
0,SGDClassifier,51.263538,40.950226,57.278481,95.263158
1,RidgeClassifier,76.895307,71.232877,61.904762,54.736842
2,RidgeClassifierCV,76.714801,71.034483,61.492537,54.210526
3,LogisticRegression,77.256318,71.333333,62.941176,56.315789
4,LogisticRegressionCV,77.256318,71.333333,62.941176,56.315789
5,DecisionTreeClassifier,99.277978,98.4375,98.95288,99.473684
6,RandomForestClassifier,99.458484,100.0,99.204244,98.421053


- **RandomForestClassifier is the only algorithm which is perfoming well on Training and Testing**