## Model Training

#### 1.1 Import Data and Required Packages
##### Importing Pandas, Numpy, Matplotlib, Seaborn and Warings Library.

In [40]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Modelling
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from catboost import CatBoostClassifier
#otheres
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score

#### Import the CSV Data as Pandas DataFrame

In [41]:
df = pd.read_csv('data/diabetes.csv')

#### Show Top 5 Records

In [42]:
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


#### Preparing X and Y variables

In [43]:
X = df.drop(columns=['diabetes'],axis=1)

In [44]:
X.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level
0,Female,80.0,0,1,never,25.19,6.6,140
1,Female,54.0,0,0,No Info,27.32,6.6,80
2,Male,28.0,0,0,never,27.32,5.7,158
3,Female,36.0,0,0,current,23.45,5.0,155
4,Male,76.0,1,1,current,20.14,4.8,155


In [45]:
y = df['diabetes']

In [46]:
y

0        0
1        0
2        0
3        0
4        0
        ..
99995    0
99996    0
99997    0
99998    0
99999    0
Name: diabetes, Length: 100000, dtype: int64

In [None]:
# separate dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

In [47]:
numerical_columns = X.select_dtypes(exclude="object").columns
categorical_columns = X.select_dtypes(include="object").columns
print("numerical_columns: ",numerical_columns)
print("categorical_columns:",categorical_columns)

num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
        ])

            # Categorical pipeline
cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("one_hot_encoder", OneHotEncoder(handle_unknown='ignore'))
        ])
preprocessor = ColumnTransformer(
                transformers=[
                    ("num_pipeline", num_pipeline, numerical_columns),
                    ("cat_pipeline", cat_pipeline, categorical_columns)
                ]
            )

numerical_columns:  Index(['age', 'hypertension', 'heart_disease', 'bmi', 'HbA1c_level',
       'blood_glucose_level'],
      dtype='object')
categorical_columns: Index(['gender', 'smoking_history'], dtype='object')


In [48]:
X = preprocessor.fit_transform(X)

In [49]:
X.shape

(100000, 15)

In [50]:
# separate dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((80000, 15), (20000, 15))

In [52]:

models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'CatBoost': CatBoostClassifier(verbose=0, random_state=42),
    'Extra Trees': ExtraTreesClassifier(random_state=42),
    'SVC': SVC(probability=True, random_state=42),
    'KNeighbors': KNeighborsClassifier(),
    'GaussianNB': GaussianNB(),
    'MLP': MLPClassifier(random_state=42, max_iter=1000),
            }
model_list = []
r2_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate Train and Test dataset
    test_model_score = accuracy_score(y_train, y_train_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model performance for Training set')
    print("- test_model_score: {:.4f}".format(test_model_score))

    print('----------------------------------')
    print('\n')

Logistic Regression
Model performance for Training set
- test_model_score: 0.9607
----------------------------------




Random Forest
Model performance for Training set
- test_model_score: 0.9993
----------------------------------


Gradient Boosting
Model performance for Training set
- test_model_score: 0.9723
----------------------------------






AdaBoost
Model performance for Training set
- test_model_score: 0.9719
----------------------------------


CatBoost
Model performance for Training set
- test_model_score: 0.9785
----------------------------------


Extra Trees
Model performance for Training set
- test_model_score: 0.9993
----------------------------------


SVC
Model performance for Training set
- test_model_score: 0.9647
----------------------------------


KNeighbors
Model performance for Training set
- test_model_score: 0.9695
----------------------------------


GaussianNB
Model performance for Training set
- test_model_score: 0.6350
----------------------------------


MLP
Model performance for Training set
- test_model_score: 0.9726
----------------------------------




### Results

In [56]:
pd.DataFrame(list(zip(model_list, test_model_score)), columns=['Model Name', 'test_model_score']).sort_values(by=["test_model_score"],ascending=False)

TypeError: 'float' object is not iterable