# Model Training Demo 

In [1]:
import numpy as np             
import pandas as pd                

In [2]:
# Read the heart dataset CSV (path is relative to the notebook's working directory)
df = pd.read_csv(filepath_or_buffer='data/heart.csv')

In [3]:
# Preview the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [4]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

(918, 12)

In [6]:
# Count the missing values in each column
df.isna().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [7]:
# Separate the target label (y) from the predictor columns (X);
# `df` itself is left untouched because drop() returns a new frame.
y = df.loc[:, 'HeartDisease']
X = df.drop(columns='HeartDisease')

In [10]:
# Column names grouped by dtype so each group can get its own preprocessing
num_features = X.select_dtypes(include='number').columns
cat_features = X.select_dtypes(include='object').columns

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

In [15]:
# Numeric columns: fill missing values with the column mean, then standardise.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ]
)

# Categorical columns: fill missing values with the most frequent value,
# then one-hot encode.
# handle_unknown='ignore' encodes a category that was absent at fit time as
# all zeros for that feature instead of raising during transform(), so a
# held-out row with an unseen label cannot crash the evaluation step.
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ohe', OneHotEncoder(handle_unknown='ignore'))
    ]
)

In [16]:
from sklearn.compose import ColumnTransformer

In [17]:
# Route each column group through its own preprocessing pipeline.
# NOTE: the second transformer name is spelled 'cat_tranformation'; it is a
# lookup key on the fitted object, so the spelling is deliberately left as is.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_transformation', num_pipeline, num_features),
        ('cat_tranformation', cat_pipeline, cat_features),
    ]
)

In [18]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [19]:
# Fit the preprocessor on the training rows only, then reuse the fitted
# transformer on the test rows.
# NOTE(review): both names are rebound to the transformed output, so re-running
# this cell without first re-running the split cell presumably fails (the
# transformer selects columns by name) — confirm.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [21]:
# Dimensions of the transformed training matrix
X_train.shape

(734, 20)

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [25]:
# Candidate classifiers, keyed by the display name used in the printed report
# and in `model_score`.
# The tree-based estimators are stochastic, so they are seeded (same value as
# the train/test split) to make the reported scores repeatable across runs.
models = {
    'logistic regression' : LogisticRegression(),
    'support vector classifier' : SVC(),
    'k nearest classifier' : KNeighborsClassifier(),
    'Decision Tree Classifier' : DecisionTreeClassifier(random_state=42),
    'Random Forest Classifier' : RandomForestClassifier(random_state=42)
}

# Maps model display name -> accuracy on the held-out test split
model_score = {}

for mod_name, model in models.items():
    # Fit on the preprocessed training data, then predict on both splits so
    # train and test metrics can be compared.
    model.fit(X_train, y_train)

    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    print('Training Evaluation')
    print(mod_name)

    # Metrics are printed in the order: precision, recall, accuracy
    print(precision_score(y_train, y_pred_train))
    print(recall_score(y_train, y_pred_train))
    print(accuracy_score(y_train, y_pred_train))

    print('-' * 50)

    print('Testing Evaluation')

    # Same order as above: precision, recall, accuracy
    print(precision_score(y_test, y_pred_test))
    print(recall_score(y_test, y_pred_test))

    # Compute test accuracy once and reuse it for the report and the summary
    test_accuracy = accuracy_score(y_test, y_pred_test)
    print(test_accuracy)

    model_score[mod_name] = test_accuracy

    print('=' * 50)
    print('\n')

Training Evaluation
logistic regression
0.8710462287104623
0.8927680798004988
0.8692098092643051
--------------------------------------------------
Testing Evaluation
0.9
0.8411214953271028
0.8532608695652174


Training Evaluation
support vector classifier
0.8909952606635071
0.9376558603491272
0.9032697547683923
--------------------------------------------------
Testing Evaluation
0.8867924528301887
0.8785046728971962
0.8641304347826086


Training Evaluation
k nearest classifier
0.8741092636579573
0.9177057356608479
0.8828337874659401
--------------------------------------------------
Testing Evaluation
0.91
0.8504672897196262
0.8641304347826086


Training Evaluation
Decision Tree Classifier
1.0
1.0
1.0
--------------------------------------------------
Testing Evaluation
0.875
0.7850467289719626
0.8097826086956522


Training Evaluation
Random Forest Classifier
1.0
1.0
1.0
--------------------------------------------------
Testing Evaluation
0.9134615384615384
0.8878504672897196
0.8858695652173914

In [26]:
# Test-set accuracy recorded for each model by the training loop
model_score

{'logistic regression': 0.8532608695652174,
 'support vector classifier': 0.8641304347826086,
 'k nearest classifier': 0.8641304347826086,
 'Decision Tree Classifier': 0.8097826086956522,
 'Random Forest Classifier': 0.8858695652173914}

In [29]:
# Tabulate the recorded test accuracies, one row per model
pd.DataFrame(
    list(model_score.items()),
    columns=['models', 'accuracy_score'],
)

Unnamed: 0,models,accuracy_score
0,logistic regression,0.853261
1,support vector classifier,0.86413
2,k nearest classifier,0.86413
3,Decision Tree Classifier,0.809783
4,Random Forest Classifier,0.88587
