In [1]:
import warnings 
warnings.filterwarnings('ignore')

# basic libraries
import os
import numpy as np
import pandas as pd
import re
import string
from collections import Counter
import time

#visulaization modules
import missingno as msno
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
# !pip install pywaffle
# from pywaffle import Waffle

%matplotlib inline
init_notebook_mode(connected= True)

In [2]:
#Common model helpers
from sklearn.preprocessing import (StandardScaler,
                                   LabelEncoder,
                                   OneHotEncoder)
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, 
                             auc, 
                             precision_score,
                             recall_score,
                             f1_score, 
                             roc_auc_score,
                             confusion_matrix)
from sklearn.model_selection import (GridSearchCV,
                                     StratifiedKFold,
                                     cross_val_score)

In [3]:
# dimensionality reduction
from sklearn.decomposition import PCA
# from umap import UMAP
import pylab as pl

In [4]:
# imbalance dataset handling

from imblearn.datasets import make_imbalance
from imblearn.under_sampling import (RandomUnderSampler, 
                                     ClusterCentroids,
                                     TomekLinks,
                                     NeighbourhoodCleaningRule,
                                     EditedNearestNeighbours,
                                     NearMiss)


from imblearn.over_sampling import (SMOTE,
                                    ADASYN)

In [5]:
# model algorithams
from sklearn.ensemble import (RandomForestClassifier, 
                              AdaBoostClassifier, 
                              GradientBoostingClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier

# Preprocessing

In [6]:
from wrangle import wrangle_data

In [7]:
train, validate, test = wrangle_data()

In [8]:
target = 'stroke'
features = ['hypertension', 'heart_disease', 'ever_married', 'work_type', 'age_bins', 'glucose_bins', 'bmi_bins']

In [9]:
def preprocess():
    '''
    Actions:
    Returns: X_train, y_train, X_validate, y_validate, X_test, y_test
    Modules:
        1. import pandas as pd
        2. from wrangle import wrangle_data
    '''
    # get data
    train, validate, test = wrangle_data()
    
    # set target
    target = ['stroke']
    
    # set features of interest
    features = ['hypertension', 'heart_disease', 'ever_married', 'work_type', 'age_bins', 'glucose_bins', 'bmi_bins', 'smoking_status']
    
    # create train X, y
    X_train = train[features]
    y_train = train[target]
    
    # create validate X, y
    X_validate = validate[features]
    y_validate = validate[target]
    
    # create test X, y
    X_test = test[features]
    y_test = test[target]
    
    return X_train, y_train, X_validate, y_validate, X_test, y_test

In [10]:
X_train, y_train, X_validate, y_validate, X_test, y_test = preprocess()

KeyError: "['smoking_status'] not in index"

In [None]:
le = LabelEncoder()

In [None]:
# split train copy
X = train[features]
y_train = train['stroke']

ordinal = ['age_bins',  'glucose_bins', 'bmi_bins', 'hypertension', 'heart_disease']

for col in ordinal:
    X[col] = le.fit_transform(X[col])

nominal =  ['ever_married', 'work_type', 'smo']

## norminal data one hot encoding for categorical features
temp = X.drop(columns = nominal)
dummies = pd.get_dummies(X[nominal])
X = pd.concat([temp,dummies], axis = 1)

X_train = X

In [None]:
# split train copy
X = validate[features]
y_validate = validate['stroke']

ordinal = ['age_bins',  'glucose_bins', 'bmi_bins', 'hypertension', 'heart_disease']

for col in ordinal:
    X[col] = le.fit_transform(X[col])

nominal =  ['ever_married', 'work_type']

## norminal data one hot encoding for categorical features
temp = X.drop(columns = nominal)
dummies = pd.get_dummies(X[nominal])
X = pd.concat([temp,dummies], axis = 1)

X_validate = X

In [None]:
# oversamplling
smote = SMOTE()
X_resample, y_resample = smote.fit_resample(X_train, y_train.ravel())

In [None]:
X_resample.shape, X_train.shape

# Predictions

In [None]:
#### predictions with resampled data


def predictions(x_set,y_set):
    t1 = time.time()
    print('Classification Process Starts....')
    accuracy,precision,recall,f1,auc,conf_mat= [],[],[],[],[],[]
        
    random_state = 1017
    
    ##classifiers list 
    classifiers = []
    classifiers.append(SVC(random_state=random_state, probability = True))
    classifiers.append(DecisionTreeClassifier(random_state=random_state))
    classifiers.append(AdaBoostClassifier(DecisionTreeClassifier(random_state=random_state)))
    classifiers.append(RandomForestClassifier(random_state=random_state))
    classifiers.append(GradientBoostingClassifier(random_state=random_state))
    classifiers.append(KNeighborsClassifier())
    classifiers.append(LogisticRegression(random_state = random_state))
    classifiers.append(XGBClassifier(random_state = random_state, eval_metric = 'logloss', learning_rate = 0.054))
    


    for classifier in classifiers:
        
        t =time.time()
        print('fitting on classifier with parameters: {}'.format(classifier))
        
        #classifier and fitting
        clf = classifier
        clf.fit(x_set,y_set)
        
        #predictions
        y_preds = clf.predict(X_validate)
        y_probs = clf.predict_proba(X_validate)
        
        # metrics
        accuracy.append((round(accuracy_score(y_validate,y_preds),2))*100)
        precision.append((round(precision_score(y_validate,y_preds),2))*100)
        recall.append((round(recall_score(y_validate,y_preds),2))*100)
        f1.append((round(f1_score(y_validate,y_preds),2))*100)
        auc.append((round (roc_auc_score(y_validate,y_probs[:,1]), 2))*100)
        conf_mat.append(confusion_matrix(y_validate,y_preds))
        
        elapsed = time.time() - t
        print('Done and elapsed time is {}seconds'.format(round(elapsed,3)))
        print('\n')
    results_df = pd.DataFrame({"Accuracy Score":accuracy,"Precision Score":precision,
                        "Recall Score":recall, "f1 Score":f1,"AUC Score":auc,
                        "Confusion Matrix":conf_mat,
                        "Algorithm":["SVC","DecisionTree","AdaBoost",
                                     "RandomForest","GradientBoosting",
                                     "KNeighboors","LogisticRegression",
                                     "XGBoost"]})
    
    results_df = (results_df.sort_values(by = 'Algorithm', ascending = False)
                  .reset_index(drop =  True))
    t2 = time.time() - t1
    print('\nClassification is Completed and results are strored in dataframe.\ntotal time elapsed is {}seconds'.format(t2))
    print('***************************************************************\n\n')
    
    return results_df

In [None]:
# X = X.astype(float)

In [None]:
orig_results = predictions(X_train, y_train)

In [None]:
orig_results

In [None]:
resamp_results = predictions(X_resample, y_resample)

In [None]:
resamp_results

In [None]:
orig_results

In [None]:
# multi_visualize(data = [orig_results, resamp_results], vmin=30,vmax = 100)

# without heart disease

In [None]:
target = 'stroke'
features = ['hypertension', 'ever_married', 'work_type', 'age_bins', 'glucose_bins', 'bmi_bins']

In [None]:
# split train copy
X = validate[features]
y_validate = validate['stroke']

ordinal = ['age_bins',  'glucose_bins', 'bmi_bins', 'hypertension']

for col in ordinal:
    X[col] = le.fit_transform(X[col])

nominal =  ['ever_married', 'work_type']

## norminal data one hot encoding for categorical features
temp = X.drop(columns = nominal)
dummies = pd.get_dummies(X[nominal])
X = pd.concat([temp,dummies], axis = 1)

X_validate = X

In [None]:
# split train copy
X = train[features]
y_train = train['stroke']

ordinal = ['age_bins',  'glucose_bins', 'bmi_bins', 'hypertension']

for col in ordinal:
    X[col] = le.fit_transform(X[col])

nominal =  ['ever_married', 'work_type']

## norminal data one hot encoding for categorical features
temp = X.drop(columns = nominal)
dummies = pd.get_dummies(X[nominal])
X = pd.concat([temp,dummies], axis = 1)

X_train = X

In [None]:
# oversamplling
smote = SMOTE()
X_resample, y_resample = smote.fit_resample(X_train, y_train.ravel())

In [None]:
orig_results = predictions(X_train, y_train)
resamp_results = predictions(X_resample, y_resample)

In [None]:
orig_results

In [None]:
resamp_results

In [None]:
#### predictions with resampled data


def test_predictions(x_set,y_set):
    t1 = time.time()
    print('Classification Process Starts....')
    accuracy,precision,recall,f1,auc,conf_mat= [],[],[],[],[],[]
        
    random_state = 1017
    
    ##classifiers list 
    classifiers = []
    classifiers.append(SVC(random_state=random_state, probability = True))
    classifiers.append(LogisticRegression(random_state = random_state))


    for classifier in classifiers:
        
        t =time.time()
        print('fitting on classifier with parameters: {}'.format(classifier))
        
        #classifier and fitting
        clf = classifier
        clf.fit(x_set,y_set)
        
        #predictions
        y_preds = clf.predict(X_test)
        y_probs = clf.predict_proba(X_test)
        
        # metrics
        accuracy.append((round(accuracy_score(y_test,y_preds),2))*100)
        precision.append((round(precision_score(y_test,y_preds),2))*100)
        recall.append((round(recall_score(y_test,y_preds),2))*100)
        f1.append((round(f1_score(y_test,y_preds),2))*100)
        auc.append((round (roc_auc_score(y_test,y_probs[:,1]), 2))*100)
        conf_mat.append(confusion_matrix(y_test,y_preds))
        
        elapsed = time.time() - t
        print('Done and elapsed time is {}seconds'.format(round(elapsed,3)))
        print('\n')
    results_df = pd.DataFrame({"Accuracy Score":accuracy,"Precision Score":precision,
                        "Recall Score":recall, "f1 Score":f1,"AUC Score":auc,
                        "Confusion Matrix":conf_mat,
                        "Algorithm":["SVC", "LogisticRegression"]})
    
    results_df = (results_df.sort_values(by = 'Algorithm', ascending = False)
                  .reset_index(drop =  True))
    t2 = time.time() - t1
    print('\nClassification is Completed and results are strored in dataframe.\ntotal time elapsed is {}seconds'.format(t2))
    print('***************************************************************\n\n')
    
    return results_df

In [None]:
target = 'stroke'
features = ['hypertension', 'heart_disease', 'ever_married', 'work_type', 'age_bins', 'glucose_bins', 'bmi_bins']

In [None]:
# split train copy
X = train[features]
y_train = train['stroke']

ordinal = ['age_bins',  'glucose_bins', 'bmi_bins', 'hypertension', 'heart_disease']

for col in ordinal:
    X[col] = le.fit_transform(X[col])

nominal =  ['ever_married', 'work_type']

## nominal data one hot encoding for categorical features
temp = X.drop(columns = nominal)
dummies = pd.get_dummies(X[nominal])
X = pd.concat([temp,dummies], axis = 1)

X_train = X

In [None]:
# split train copy
X = test[features]
y_test = test['stroke']

ordinal = ['age_bins',  'glucose_bins', 'bmi_bins', 'hypertension', 'heart_disease']
nominal =  ['ever_married', 'work_type']


for col in ordinal:
    X[col] = le.fit_transform(X[col])

## norminal data one hot encoding for categorical features
temp = X.drop(columns = nominal)
dummies = pd.get_dummies(X[nominal])
X = pd.concat([temp,dummies], axis = 1)

X_test = X

In [None]:
# oversamplling
smote = SMOTE()
X_resample, y_resample = smote.fit_resample(X_train, y_train.ravel())

In [None]:
resamp_test = test_predictions(X_resample, y_resample)

In [None]:
resamp_test

Takeaways:
* SVC, Logistic Regression, Gradient Boost perform the best

Actions:
* Create a baseline and add to the function created
* Create a function specifically for this section

In [None]:
big_X = [X_train, X_validate, X_test]

In [None]:
X_resample

In [None]:
ordinal = ['age_bins',  'glucose_bins', 'bmi_bins', 'hypertension', 'heart_disease', 'smoking_status']

for X in big_X:
    for col in ordinal:
        X[col] = le.fit_transform(X[col])
    

In [None]:
converted = []
for X in big_X:
    temp = X.drop(['ever_married', 'work_type', 'smoking_status'], axis=1)
#     temp
    dummies = pd.get_dummies(X[nominal], drop_first=True)
    
    converted.append(pd.concat([temp,dummies], axis = 1))
    

In [None]:
X_train, y_train, X_resample, y_resample, X_validate, y_validate, X_test, y_test = preprocess()

In [None]:
X_train, X_validate, X_test = converted

In [None]:
def encode_features(X_list):
    '''
    Arguments: [X_train, X_validate, X_test]
    Actions:
        1. Encodes variables
        2. Creates new datasets with encoded variables  
    Returns: [X_train_encoded, X_validate_encoded, X_test_encoded] 
    Modules:
        1. import pandas as pd
        2. from sklearn.preprocessing import LabelEncoder
    '''
    # set ordinal variables
    ordinal = ['age_bins',  'glucose_bins', 'bmi_bins', 'hypertension', 'heart_disease']
    
    # set nominal variables
    nominal = ['ever_married', 'work_type', 'smoking_status']
    
    # initialize encoder
    le = LabelEncoder()
    
    # for each dataset
    for X in X_list:
        
        # for each ordinal variable in each dataset
        for col in ordinal:
            
            # fit and transform each and replace the values in the original
            X[col] = le.fit_transform(X[col])
    
    # initialize list
    converted = []
    
    # for each data set
    for X in X_list:
        
        # create temporrary dataset with pre-encoded variables
        temp = X.drop(nominal, axis=1)

        # get the dummy variables for each nominal variable
        dummies = pd.get_dummies(X[nominal], drop_first=True)

        # add new datasets with all encoded variables to the list
        converted.append(pd.concat([temp, dummies], axis = 1))
       
    # exit function and return the list of encoded datasets
    return converted
    

In [None]:
# encode_features([X_train, X_validate, X_test])

In [None]:
def preprocess():
    '''
    Actions:
        1. Gets data
        2. Creates X, y datasets for train, validate, and test
        3. Encodes all X datasets
        4. Oversamples using X train and y train
    Returns: X_train, y_train, X_resample, y_resample, X_validate, y_validate, X_test, y_test
    Modules:
        1. import pandas as pd
        2. from wrangle import wrangle_data
        3. from model import encode_features
        4. from imblearn.over_sampling import SMOTE
    '''
    # get data
    train, validate, test = wrangle_data()
    
    # set target
    target = 'stroke'
    
    # set features of interest
    features = ['hypertension', 'heart_disease', 'ever_married', 'work_type', 'smoking_status', 'age_bins', 'glucose_bins', 'bmi_bins']
    
    # create train X, y
    X_train = train[features]
    y_train = train[target]
    
    # create validate X, y
    X_validate = validate[features]
    y_validate = validate[target]
    
    # create test X, y
    X_test = test[features]
    y_test = test[target]
    
    # encoding variables
    X_train, X_validate, X_test = encode_features([X_train, X_validate, X_test])
    
    # initialize oversampling 
    smote = SMOTE(random_state=1017)
    # fit and resample using X and y train
    X_resample, y_resample = smote.fit_resample(X_train, y_train.ravel())
    
    # exit function and return all preprocessed datasets
    return X_train, y_train, X_resample, y_resample, X_validate, y_validate, X_test, y_test

In [None]:
X_train, y_train, X_resample, y_resample, X_validate, y_validate, X_test, y_test = preprocess()

In [None]:
pd.Series(y_resample).value_counts()

In [11]:
#### predictions with resampled data

def predictions(x_set,y_set, X_validate, y_validate):
    '''
    Actions: Gets dataframe with evaluation scores for SVC, GradientBoost, and LogisticRegression classifiers
    '''
    
    # initialize lists to hold metrics
    accuracy,precision,recall,f1,conf_mat= [],[],[],[],[]
    
    # set a random state
    random_state = 1017
    
    # set baseline predictions
    y_preds = np.zeros(len(X_validate)).astype(int)

    # adding metrics for baseline
    accuracy.append((round(accuracy_score(y_validate,y_preds),2))*100)
    precision.append((round(precision_score(y_validate,y_preds),2))*100)
    recall.append((round(recall_score(y_validate,y_preds),2))*100)
    f1.append((round(f1_score(y_validate,y_preds),2))*100)
    conf_mat.append(confusion_matrix(y_validate,y_preds))

    
    # intitializing different classifiers
    clf1 = SVC(random_state=random_state, probability=True)
    clf2 = GradientBoostingClassifier(random_state=random_state)
    clf3 = LogisticRegression(random_state = random_state)
    clf4 = LogisticRegression(C=.25, random_state = random_state)
    clf5 = LogisticRegression(C=.5, random_state = random_state)

    # initializing voting classifier with top three classifiers from above
    eclf = VotingClassifier(estimators=[
        ('svc', clf1),('gbc', clf2), ('lr', clf3), ('lr.5', clf4), ('lr.25', clf5)])
    
    
    # initialize classifier list
    classifiers = []
    
    # adding classification models to be used
    classifiers.append(clf1)
    classifiers.append(clf2)
    classifiers.append(clf3)    
    classifiers.append(clf4)
    classifiers.append(clf5)
    classifiers.append(eclf)
    
    # for each classification method in the list
    for clf in classifiers:
        
        # fit classifier
        clf.fit(x_set,y_set)
        
        # assign predictions to variable
        y_preds = clf.predict(X_validate)
        
        # appending the metrics to each repsective metric list
        accuracy.append((round(accuracy_score(y_validate,y_preds),2))*100)
        precision.append((round(precision_score(y_validate,y_preds),2))*100)
        recall.append((round(recall_score(y_validate,y_preds),2))*100)
        f1.append((round(f1_score(y_validate,y_preds),2))*100)
        conf_mat.append(confusion_matrix(y_validate,y_preds))

    # creating a dataframe with the metrics from the list and each algorithm name
    results_df = pd.DataFrame({"Recall Score":recall,
                               "Accuracy Score":accuracy,
                               "Precision Score":precision,
                               "f1 Score":f1,
                               "Confusion Matrix":conf_mat,
                               "Algorithm":["Baseline",
                                            "SVC",
                                            "GradientBoosting",
                                            "LogisticRegression",
                                            "LR C=.25",
                                            "LR C=.5",
                                            "VotingClassifier"]})
                                     
    # sorting algorithm name alphabetically and setting index to the algorithm name 
    results_df = results_df.sort_values(by = 'Algorithm').set_index('Algorithm')
    
    # exit function and return df
    return results_df

In [None]:
predictions_difference = predictions_train(X_resample, y_resample) -  predictions_validate(X_resample, y_resample)

In [None]:
predictions_validate(X_resample, y_resample)['Algorithm']

In [None]:
predictions_difference['Algorithm'] = predictions_validate(X_resample, y_resample)['Algorithm']

In [None]:
# intial predictions on train
predictions_train(X_resample, y_resample).T

In [None]:
# validate predictions scores
predictions_validate(X_resample, y_resample).T

In [None]:
# difference between train and validate
predictions_difference.T

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
random_state = 1017

In [None]:
# intitializing differenct classifiers
clf1 = SVC(random_state=random_state, probability=True)
clf2 = GradientBoostingClassifier(random_state=random_state)
clf3 = LogisticRegression(random_state = random_state)

eclf1 = VotingClassifier(estimators=[
    ('svc', clf1), ('lr', clf3), ('gbc', clf2)], voting='hard')

eclf2 = VotingClassifier(estimators=[
    ('svc', clf1), ('gbc', clf2), ('lr', clf3)], voting='soft')

eclf3 = VotingClassifier(estimators=[
    ('svc', clf1), ('gbc', clf2), ('lr', clf3)])



In [None]:
votes = [eclf1, eclf2, eclf3]
# initialize lists to hold metrics
accuracy,precision,recall,f1,conf_mat= [],[],[],[],[]

# for each classification method in the list
for vote in votes:

    vote.fit(X_resample, y_resample)

    # assign predictions to variable
    y_preds = vote.predict(X_train)

    # appending the metrics to each repsective metric list
    accuracy.append((round(accuracy_score(y_train,y_preds),2))*100)
    precision.append((round(precision_score(y_train,y_preds),2))*100)
    recall.append((round(recall_score(y_train,y_preds),2))*100)
    f1.append((round(f1_score(y_train,y_preds),2))*100)
    conf_mat.append(confusion_matrix(y_train,y_preds))

# creating a dataframe with the metrics from the list and each algorithm name
results_df = pd.DataFrame({"Recall Score":recall,
                           "Accuracy Score":accuracy,
                           "Precision Score":precision,
                           "f1 Score":f1,
                           "Confusion Matrix":conf_mat,
                           "Algorithm":['eclf1_hard', 'eclf2_soft', 'eclf3_uniform']})

# sorting algorithm name alphabetically and setting index to the algorithm name 
results_df.sort_values(by = 'Algorithm').set_index('Algorithm')


In [None]:
def voting_predictions(X_train, y_train, X_validate, y_validate):
    '''
    Actions: Gets dataframe with evaluation scores for VotingClassifier that uses SVC, GradientBoost, and LogisticRegression classifiers as voting parties
    '''
    # setting random state
    random_state = 1017
    
     # intitializing different classifiers
    clf1 = SVC(random_state=random_state, probability=True)
    clf2 = GradientBoostingClassifier(random_state=random_state)
    clf3 = LogisticRegression(random_state = random_state)
    clf4 = LogisticRegression(C=.25, random_state = random_state)
    clf5 = LogisticRegression(C=.5, random_state = random_state)

    # initializing voting classifier with top three classifiers from above
    eclf = VotingClassifier(estimators=[
        ('svc', clf1),('gbc', clf2), ('lr', clf3), ('lr.5', clf4), ('lr.25', clf5)])

    # fitting the model on the resampled train data
    eclf.fit(X_train, y_train)

    # assign predictions to variable
    y_preds = eclf.predict(X_validate)

    # initialize lists to hold metrics
    accuracy,precision,recall,f1,conf_mat= [],[],[],[],[]

    # appending the metrics to each repsective metric list
    accuracy.append((round(accuracy_score(y_validate,y_preds),2))*100)
    precision.append((round(precision_score(y_validate,y_preds),2))*100)
    recall.append((round(recall_score(y_validate,y_preds),2))*100)
    f1.append((round(f1_score(y_validate,y_preds),2))*100)
    conf_mat.append(confusion_matrix(y_validate,y_preds))

    # creating a dataframe with the metrics from the list and each algorithm name
    results_df = pd.DataFrame({"Recall Score":recall,
                               "Accuracy Score":accuracy,
                               "Precision Score":precision,
                               "f1 Score":f1,
                               "Confusion Matrix":conf_mat,
                               "Algorithm":'VotingClassifier_uniform'})

    # sorting algorithm name alphabetically and setting index to the algorithm name 
    return results_df.sort_values(by = 'Algorithm').set_index('Algorithm').T


In [None]:
voting_predictions(X_resample, y_resample, X_validate, y_validate)