In [None]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
matplotlib.rcParams['figure.dpi'] = 144

import pandas as pd
import numpy as np

import random

# pandas' DataFrame.sample and scikit-learn estimators left at random_state=None
# draw from NumPy's global RNG, so seed it as well as Python's `random` module.
random.seed(50)
np.random.seed(50)

Inspiration from https://www.kaggle.com/code/andls555/heart-disease-prediction

Obtain dataset from here: https://www.kaggle.com/code/andls555/heart-disease-prediction/data

In [None]:
df = pd.read_csv('../data/heart_2020_cleaned.csv')

target = 'HeartDisease'
features = df.columns[df.columns != target]

# Feature matrix / label vector taken from the frame as loaded (before any cleanup).
X = df[features]
y = df[target]

# Only the last expression of a cell is rendered, so the preview goes at the end.
df.head()

First, let's see what our target looks like and how it is distributed.

In [None]:
# Count how many rows fall into each class of the target column.
df[target].value_counts()

In [None]:
# (n_rows, n_columns) of the loaded DataFrame.
df.shape

Let's take a quick look at the data.

In [None]:
# Pairwise plots over a random 1000-row sample of df.
sns.pairplot(df.sample(1000))

## Data Cleanup

Why are so few features plotted?

In [None]:
# Summary statistics; by default describe() reports only the numeric columns of a mixed frame.
df.describe()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Number of distinct values in each column.
df.nunique()

Let's handle all the categorical data. Some columns have only two values (which we can easily encode as 0/1), while others encode several categories, in which case we will want to take another approach.
Think about whether the data represented is ordinal or nominal.

In [None]:
# Preview the first rows before any encoding is applied.
df.head()

In [None]:
# Binary Yes/No columns (the target included) are encoded as 1/0.
selected_columns = ['HeartDisease', 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'PhysicalActivity', 'Asthma', 'KidneyDisease', 'SkinCancer']
# astype('int64') pins an integer dtype explicitly: pandas is deprecating the implicit
# downcast of the object columns returned by replace(), and columns left as object
# would drop out of numeric-only steps such as the correlation matrix.
df[selected_columns] = df[selected_columns].replace({'Yes':1, 'No':0}).astype('int64')

# Sex is binary in this dataset as well: Male -> 0, Female -> 1.
df['Sex'] = df['Sex'].replace({'Male':0, 'Female':1}).astype('int64')

In [None]:
# Preview again after the Yes/No and Sex encoding.
df.head()

In [None]:
# Frequency of each GenHealth value.
df.GenHealth.value_counts()

In [None]:
# Frequency of each Diabetic answer.
df.Diabetic.value_counts()

In [None]:
# Collapse the four Diabetic answers into a binary flag: both "Yes" variants -> 1,
# both "No" variants -> 0. astype('int64') pins the dtype instead of relying on
# replace() to downcast the object column implicitly (deprecated in recent pandas).
df['Diabetic'] = df['Diabetic'].replace({'Yes': 1, 'No':0, 'Yes (during pregnancy)':1, 'No, borderline diabetes':0}).astype('int64')


In [None]:
# Pairwise plots on a random 1000-row sample, colored by HeartDisease.
sns.pairplot(df.sample(1000), hue='HeartDisease')

Visualize categorical data

In [None]:
# Row counts per Sex, split by HeartDisease, on a random 1000-row sample.
sns.catplot(x="Sex", hue="HeartDisease", kind="count", data=df.sample(1000))
# Recall the encoding applied earlier: df['Sex'] = df['Sex'].replace({'Male':0, 'Female':1})

In [None]:
# Within each Sex group, the proportion of each HeartDisease value
# (value_counts(normalize=True)), unstacked to one column per value and drawn as bars.
df.groupby('Sex').HeartDisease.value_counts(normalize=True).unstack().plot.bar()

In [None]:
# GenHealth counts split by HeartDisease on a random 1000-row sample;
# the explicit `order` fixes the sequence of categories on the axis.
sns.catplot(x="GenHealth", hue="HeartDisease", kind='count', data=df.sample(1000),  order=['Poor',  'Fair', 'Good','Very good', 'Excellent'])

In [None]:
# Box plots of BMI for each GenHealth level, split by HeartDisease,
# drawn from a random 10000-row sample with the GenHealth levels in the given order.
sns.catplot(x="BMI", y="GenHealth", hue="HeartDisease", order=['Poor',  'Fair', 'Good','Very good', 'Excellent'],
            kind="box", data=df.sample(10000))

Heatmap to view correlations between numerical features

In [None]:
# Correlate numeric columns only: AgeCategory, Race and GenHealth are still strings
# at this point, and DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
correlation = df.select_dtypes(include='number').corr().round(2)
plt.figure(figsize = (14,7))
sns.heatmap(correlation, annot = True, cmap = 'YlOrBr')

In [None]:
# Absolute correlation of every column with the target, smallest first.
target_corr = correlation['HeartDisease'].abs().sort_values()

plt.figure(figsize=(13, 6))
plt.title('Distribution of correlation of features')
# The final entry is dropped: it is HeartDisease correlated with itself.
target_corr.iloc[:-1].plot.barh()
plt.show()

In [None]:
# KDE of DiffWalking per HeartDisease class on a random 1000-row sample.
# Compare with the next cell, which passes common_norm=False.
sns.displot(df.sample(1000), x="DiffWalking", hue="HeartDisease", kind="kde", fill=True)

In [None]:

# Same plot on 10000 rows with common_norm=False, so each HeartDisease class's
# density is normalized on its own.
sns.displot(df.sample(10000), x="DiffWalking", hue="HeartDisease", kind='kde', common_norm=False, fill=True)

In [None]:
# Per-class KDE of BMI on a random 100000-row sample, each class normalized on its own.
sns.displot(df.sample(100000), x="BMI", hue="HeartDisease", kind='kde', common_norm=False, fill=True)

## Data processing

- Check for missing values
- Onehot encoding
- Train test split

- Standardize data


In [None]:
# Per column: True if at least one value is missing.
df.isnull().any()

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split



In [None]:
# Size of the full frame before sub-sampling.
df.shape

In [None]:
target = 'HeartDisease'
features = df.columns[df.columns != target]

# Number of rows to work with; set to None (or 0) to use the full dataset.
sample_count = 100000

# Draw ONE sample and split it into features and label. Sampling X and y in two
# separate df.sample() calls picks different rows for each, so labels would no
# longer belong to their feature rows.
if sample_count:
    # min() guards against asking for more rows than exist (df.sample would raise);
    # random_state makes the selected subset reproducible.
    sampled = df.sample(min(sample_count, len(df)), random_state=50)
else:
    sampled = df

X = sampled[features]
y = sampled[target]

X.shape

In [None]:
# Preview the feature matrix.
X.head()

In [None]:
# Hold out 20% for testing. Stratifying on the label keeps the same class ratio in
# both splits, which matters when the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=24, stratify=y
)

Here, let's specify the columns we want to apply transformers to.

In [None]:
# Multi-valued string columns to one-hot encode.
categorical_columns = ['AgeCategory', 'Race', 'GenHealth']
# Continuous columns to standardize.
numerical_columns = ['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime']

# Every other feature: in neither of the two lists above.
rest_columns = X.columns[~X.columns.isin(categorical_columns + numerical_columns)]
rest_columns

In [None]:
# Preprocessing-only pipeline: one-hot encode the categorical columns, standardize the
# numerical ones, and pass every remaining column through unchanged.
pipe = Pipeline(
    [
        ("preprocessing", ColumnTransformer(
            [
                # handle_unknown="ignore": a category unseen during fit encodes as all
                # zeros instead of raising at transform time.
                ("onehotencoder", OneHotEncoder(handle_unknown="ignore"), categorical_columns),
                ("scale", StandardScaler(), numerical_columns),
            ],
            remainder="passthrough",
            # sparse_threshold=0 always returns a dense array. This replaces
            # OneHotEncoder(sparse=False), whose `sparse` argument was renamed to
            # `sparse_output` in scikit-learn 1.2 and removed in 1.4.
            sparse_threshold=0,
        ))
    ]
)




Let's make sure this works so far!

In [None]:
# Smoke test: fit the preprocessing pipeline on X and compare the shape before and after.
X_transformed = pipe.fit_transform(X)
X.shape, X_transformed.shape

In [None]:
# Names of the output columns produced by the fitted pipeline.
pipe.get_feature_names_out()

In [None]:
# First transformed row.
X_transformed[0]

## Model selection

In [None]:
def evaluate_model(model, x_test, y_test, should_print=True):
    """Evaluate a fitted binary classifier on a held-out set.

    Parameters
    ----------
    model : fitted estimator
        Must implement ``predict`` and either ``predict_proba`` or
        ``decision_function`` (a pipeline or a fitted search object works too).
    x_test : array-like
        Features passed to the model.
    y_test : array-like
        True binary labels for ``x_test``.
    should_print : bool, default True
        When true, print every scalar metric and the confusion matrix.

    Returns
    -------
    dict
        ``acc``, ``prec``, ``rec``, ``f1``, ``kappa`` and ``auc`` (scalars),
        ``fpr`` / ``tpr`` (ROC curve points) and ``cm`` (confusion matrix).
    """
    from sklearn import metrics

    # Hard class predictions drive the threshold-dependent metrics.
    y_pred = model.predict(x_test)

    acc = metrics.accuracy_score(y_test, y_pred)
    prec = metrics.precision_score(y_test, y_pred)
    rec = metrics.recall_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)
    kappa = metrics.cohen_kappa_score(y_test, y_pred)

    # ROC / AUC need a continuous score. Prefer the positive-class probability and
    # fall back to decision_function for models that expose no predict_proba.
    if hasattr(model, 'predict_proba'):
        y_score = model.predict_proba(x_test)[:, 1]
    else:
        y_score = model.decision_function(x_test)
    fpr, tpr, _ = metrics.roc_curve(y_test, y_score)
    auc = metrics.roc_auc_score(y_test, y_score)

    # Confusion matrix of true vs. predicted labels.
    cm = metrics.confusion_matrix(y_test, y_pred)

    if should_print:
        print('Accuracy:',acc)
        print('Precision:',prec)
        print('Recall:',rec)
        print('F1 Score:',f1)
        print('Cohens Kappa Score:',kappa)
        print('Area Under Curve:',auc)
        print('Confusion Matrix:\n',cm)

    return {'acc': acc, 'prec': prec, 'rec': rec, 'f1': f1, 'kappa': kappa,
            'fpr': fpr, 'tpr': tpr, 'auc': auc, 'cm': cm}

In [None]:
def get_pipeline(*steps):
    """Build the shared preprocessing pipeline, optionally followed by extra steps.

    Columns listed in ``categorical_columns`` are one-hot encoded, columns listed in
    ``numerical_columns`` are standardized, and all remaining columns pass through
    unchanged. Both lists are read from the notebook's global namespace.

    Parameters
    ----------
    *steps : (name, estimator) tuples
        Appended, in order, after the ``"preprocessing"`` step.

    Returns
    -------
    sklearn.pipeline.Pipeline
        A new, unfitted pipeline.
    """
    preprocessing = ColumnTransformer(
        [
            # handle_unknown="ignore": a category unseen during fit (e.g. missing from
            # a CV fold) encodes as all zeros instead of raising at transform time.
            ("onehotencoder", OneHotEncoder(handle_unknown="ignore"), categorical_columns),
            ("scale", StandardScaler(), numerical_columns),
        ],
        remainder="passthrough",
        # sparse_threshold=0 always returns a dense array. This replaces
        # OneHotEncoder(sparse=False), whose `sparse` argument was renamed to
        # `sparse_output` in scikit-learn 1.2 and removed in 1.4.
        sparse_threshold=0,
    )
    return Pipeline([("preprocessing", preprocessing), *steps])




In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier voting over 5 neighbours.
knn = KNeighborsClassifier(n_neighbors = 5)



In [None]:
# Shared preprocessing followed by the KNN classifier as the final step.
pipe = get_pipeline(('knn', knn))

pipe

In [None]:
# Fit preprocessing and classifier together on the training split.
pipe.fit(X_train, y_train)

In [None]:
# Print the metrics for the test split and keep them in `res`.
res = evaluate_model(pipe, X_test, y_test)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Plot the confusion matrix returned by evaluate_model. Grid lines are switched off
# on its axes (presumably so the seaborn theme's grid does not overlay the cells).
cm_display = ConfusionMatrixDisplay(res['cm']).plot()
cm_display.ax_.grid(False)

Repeat with another classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Decision tree capped at depth 15, placed behind the shared preprocessing step
# and fitted on the training split.
tree = DecisionTreeClassifier(max_depth=15)
pipe = get_pipeline(('tree', tree))
pipe.fit(X_train, y_train)


In [None]:
# Print the decision-tree metrics for the test split and keep them in `res`.
res = evaluate_model(pipe, X_test, y_test)

In [None]:
# Confusion matrix of the decision tree, with the axes grid switched off.
cm_display = ConfusionMatrixDisplay(res['cm']).plot()
cm_display.ax_.grid(False)

## Grid search

In [None]:
# Fresh, unfitted tree pipeline to hand to the grid search. The step name 'tree' is
# the prefix used by the `tree__...` keys of the parameter grid.
tree = DecisionTreeClassifier(max_depth=15)
pipe = get_pipeline(('tree', tree))

In [None]:
# Keys use the `<step name>__<parameter>` form so GridSearchCV can route each value
# to the 'tree' step of the pipeline.
param_grid = {
        # "tree__criterion": ['gini', 'entropy'],
        "tree__max_depth": range(5,20),
        # An integer min_samples_split must be at least 2; a value of 1 is rejected
        # by DecisionTreeClassifier, so every fit using it would fail.
        "tree__min_samples_split": range(2,10),
        # "tree__min_samples_leaf": range(1,10),
}


In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [None]:
# Exhaustive 5-fold search over param_grid, selecting on F1. With refit=True the best
# configuration is refit, so `grid` can be used directly for prediction afterwards.
# NOTE(review): n_jobs=1 runs every fit serially; consider raising it if this cell is slow.
grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, verbose=1, n_jobs=1, scoring='f1', refit=True)
grid.fit(X_train, y_train)

In [None]:
# Parameter combination that scored best in the search.
print(grid.best_params_) 

In [None]:
# Predict the test split with the refit best estimator.
grid_predictions = grid.predict(X_test)

In [None]:
from sklearn.metrics import classification_report
# Text report comparing the true test labels with the grid-search predictions.
print(classification_report(y_test, grid_predictions)) 

In [None]:
# Same metric summary as before, now for the tuned tree (the fitted search object is passed as the model).
res = evaluate_model(grid, X_test, y_test)

In [None]:
# Confusion matrix of the tuned tree, with the axes grid switched off.
cm_display = ConfusionMatrixDisplay(res['cm']).plot()
cm_display.ax_.grid(False)

## Ensemble Classifiers

In [None]:
from sklearn.ensemble import GradientBoostingClassifier


# Model persistence

Export the fitted model with pickle; we will use it in the next steps for ML serving.

In [None]:
# Values to try for the gradient-boosting step, which is named 'model' in the pipeline.
param_grid = {
    # "model__max_depth": [5, 6, 7],
    'model__learning_rate': [.01, 0.03, .1],
    'model__n_estimators': [200],
    # 'model__subsample': [.7, .8, .9]
}

gbc = GradientBoostingClassifier()
pipe = get_pipeline(('model', gbc))

# 3-fold search scored on F1 with three parallel workers; the best configuration is refit.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='f1',
    cv=3,
    n_jobs=3,
    verbose=3,
    refit=True,
)
grid.fit(X_train, y_train)

In [None]:
# Parameter combination that scored best in the gradient-boosting search.
print(grid.best_params_) 

In [None]:
# Metric summary for the tuned gradient-boosting pipeline on the test split.
res = evaluate_model(grid, X_test, y_test)

In [None]:
# Confusion matrix of the tuned gradient-boosting model, with the axes grid switched off.
cm_display = ConfusionMatrixDisplay(res['cm']).plot()
cm_display.ax_.grid(False)

In [None]:
import pickle
from pathlib import Path

# The refit pipeline (preprocessing + model) chosen by the grid search.
best_est = grid.best_estimator_

model_path = Path('../models/HF.pkl')
# open() does not create missing parent directories, so make sure the folder exists.
model_path.parent.mkdir(parents=True, exist_ok=True)
with model_path.open('wb') as f:
    pickle.dump(best_est, f)

In [None]:
# Predict one test row; the 3:4 slice keeps it a one-row DataFrame.
best_est.predict( X_test.iloc[3:4,:] )

In [None]:
# Class probabilities for the test rows at positions 2, 3 and 4.
best_est.predict_proba( X_test.iloc[2:5,:] )

In [None]:
 # Test rows at positions 3-5 serialized as a JSON list with one object per row.
 X_test.iloc[3:6,:].to_json(orient='records')

In [None]:
# True label of the test row at position 4.
y_test.iloc[4]