In [34]:
import os
import requests
import pandas as pd
import numpy as np
#import altair as alt
#from vega_datasets import data
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import statsmodels.api as sm
from sklearn.feature_selection import RFECV
from sklearn.ensemble import *
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import *
from sklearn.metrics import *
from imblearn.over_sampling import SMOTE
from sklearn.inspection import permutation_importance
import re
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import NearestNeighbors
from imblearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import pickle

In [2]:
# Load the raw train/test splits and their pre-standardized counterparts.
# The files are read from the current working directory and have no extension.
train_df, test_df = (pd.read_csv(name) for name in ('train_df', 'test_df'))
train_df_std, test_df_std = (
    pd.read_csv(name) for name in ('train_df_std', 'test_df_std')
)

In [3]:
# Separate the 'Cancer' target column from the feature columns for every split.
X_train = train_df.drop(columns='Cancer')
y_train = train_df['Cancer']
X_test = test_df.drop(columns='Cancer')
y_test = test_df['Cancer']

# Same split for the pre-standardized frames.
X_train_std = train_df_std.drop(columns='Cancer')
y_train_std = train_df_std['Cancer']
X_test_std = test_df_std.drop(columns='Cancer')
y_test_std = test_df_std['Cancer']

In [4]:
# Oversample the standardized training split with SMOTE (fixed seed).
# Only the training data is resampled; the test split is left untouched.
# Note: this overwrites X_train_std / y_train_std, so re-running the cell
# resamples data that has already been resampled.
smote = SMOTE(random_state=42)
X_train_std, y_train_std = smote.fit_resample(X=X_train_std, y=y_train_std)

In [5]:
# Baseline NB Model
nb = GaussianNB()
nb.fit(X_train_std, y_train_std)

y_pred = nb.predict(X_test_std)
accuracy = accuracy_score(y_test_std, y_pred)
precision = precision_score(y_test_std, y_pred)
recall = recall_score(y_test_std, y_pred)
f1 = f1_score(y_test_std, y_pred)

print('Test Scores')
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print('********************')
%time

Test Scores
Accuracy: 0.5403415251174557
Precision: 0.1715045234318055
Recall: 0.7957032027053909
F1 Score: 0.2821869488536155
********************
CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 8.34 µs


## Naive Bayes Pipeline

In [40]:
#Column Transformer
cols_to_std=['Age Group','Num of Bad Mental Health Days','Years Since Last Checkup','Hours of Sleeping','Age Started Smoking',
 'Cigarettes per Day','Days Drinking','Income','BMI']

ct = ColumnTransformer([('std',StandardScaler(),cols_to_std)])


# NB pipeline
nb_pipeline = Pipeline([
    ('ct', ct),
    ('smote', SMOTE(random_state=42)),  
    ('nb', GaussianNB())
])

# NB Parameter Grid
# Note: more values have been tested than what is shown, small amounts of values were run at a time to increase runtime.
param_grid = {
    'nb__var_smoothing': [3.075,3.1,3.125,3.15],
}

# Grid search
grid_search = GridSearchCV(estimator=nb_pipeline, param_grid=param_grid, cv=5, scoring=['recall','precision','f1','accuracy'],refit='recall')
grid_search.fit(X_train, y_train)

#print("Results Table: ", pd.DataFrame(grid_search.cv_results_))
#pd.set_option('display.max_columns',None)
print("Best parameters found: ", grid_search.best_params_)
print("Cross-validation recall: ", np.round(grid_search.cv_results_['mean_test_recall'][grid_search.best_index_],4))
print("Cross-validation recall std: ", np.round(grid_search.cv_results_['std_test_recall'][grid_search.best_index_],4))
print("Cross-validation precision: ", np.round(grid_search.cv_results_['mean_test_precision'][grid_search.best_index_],4))
print("Cross-validation precision std: ", np.round(grid_search.cv_results_['std_test_precision'][grid_search.best_index_],4))
print("Cross-validation f1 score: ", np.round(grid_search.cv_results_['mean_test_f1'][grid_search.best_index_],4))
print("Cross-validation f1 std: ", np.round(grid_search.cv_results_['std_test_f1'][grid_search.best_index_],4))
print("Cross-validation accuracy score: ", np.round(grid_search.cv_results_['mean_test_accuracy'][grid_search.best_index_],4))
print("Cross-validation accuracy std: ", np.round(grid_search.cv_results_['std_test_accuracy'][grid_search.best_index_],4))

grid_nb = grid_search.best_estimator_
print(grid_nb)  

print('********************')
%time

Best parameters found:  {'nb__var_smoothing': 3.125}
Cross-validation recall:  0.8386
Cross-validation recall std:  0.0067
Cross-validation precision:  0.1821
Cross-validation precision std:  0.001
Cross-validation f1 score:  0.2992
Cross-validation f1 std:  0.0015
Cross-validation accuracy score:  0.5539
Cross-validation accuracy std:  0.0032
Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('std', StandardScaler(),
                                                  ['Age Group',
                                                   'Num of Bad Mental Health '
                                                   'Days',
                                                   'Years Since Last Checkup',
                                                   'Hours of Sleeping',
                                                   'Age Started Smoking',
                                                   'Cigarettes per Day',
                                                   'Days

In [41]:
# Testing Grid Search nb: score the tuned NB pipeline on the held-out test
# split. The pipeline applies its own scaling, so the raw X_test is used.
# (A commented-out validation-set block was removed from this cell: it used
# X_val / y_val, which are not defined anywhere in this notebook.)
y_pred = grid_nb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print('Test Scores')
for label, value in (('Accuracy', accuracy), ('Precision', precision),
                     ('Recall', recall), ('F1 Score', f1)):
    print(f'{label}: {value}')

Test Scores
Accuracy: 0.5509577159378388
Precision: 0.18103337198814587
Recall: 0.8384722498508056
F1 Score: 0.29777463793712466


## SGDClassifier (SVM) Pipeline

In [6]:
# Baseline SGDClassifier Model
sgdc = SGDClassifier()
sgdc.fit(X_train_std, y_train_std)

y_pred = sgdc.predict(X_test_std)
accuracy = accuracy_score(y_test_std, y_pred)
precision = precision_score(y_test_std, y_pred)
recall = recall_score(y_test_std, y_pred)
f1 = f1_score(y_test_std, y_pred)

print('Test Scores')
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print('********************')
%time

Test Scores
Accuracy: 0.6328379110950488
Precision: 0.20468174644923726
Recall: 0.774020290431669
F1 Score: 0.32375088405375047
********************
CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 10 µs


In [12]:
#Column Transformer
cols_to_std=['Age Group','Num of Bad Mental Health Days','Years Since Last Checkup','Hours of Sleeping','Age Started Smoking',
 'Cigarettes per Day','Days Drinking','Income','BMI']

ct = ColumnTransformer([('std',StandardScaler(),cols_to_std)])


# sgdc pipeline
sgdc_pipeline = Pipeline([
    ('ct', ct),
    ('smote', SMOTE(random_state=42)),  
    ('sgdc', SGDClassifier())
])

# sgdc Parameter Grid 
# Note: more values have been tested than what is shown, small amounts of values were run at a time to increase runtime.
param_grid = {
    'sgdc__loss': ['hinge'], #using only hinge loss in order to implement SVM
    'sgdc__penalty': ['l2'],
    'sgdc__learning_rate': ['adaptive'],
    'sgdc__alpha': [.75,.5,.25],
    'sgdc__eta0': [.01,.025,.05,.075]
}

# Grid search
grid_search = GridSearchCV(estimator=sgdc_pipeline, param_grid=param_grid, cv=5, scoring=['recall','precision','f1','accuracy'],refit='recall')
grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)
print("Cross-validation recall: ", np.round(grid_search.cv_results_['mean_test_recall'][grid_search.best_index_],4))
print("Cross-validation recall std: ", np.round(grid_search.cv_results_['std_test_recall'][grid_search.best_index_],4))
print("Cross-validation precision: ", np.round(grid_search.cv_results_['mean_test_precision'][grid_search.best_index_],4))
print("Cross-validation precision std: ", np.round(grid_search.cv_results_['std_test_precision'][grid_search.best_index_],4))
print("Cross-validation f1 score: ", np.round(grid_search.cv_results_['mean_test_f1'][grid_search.best_index_],4))
print("Cross-validation f1 std: ", np.round(grid_search.cv_results_['std_test_f1'][grid_search.best_index_],4))
print("Cross-validation accuracy score: ", np.round(grid_search.cv_results_['mean_test_accuracy'][grid_search.best_index_],4))
print("Cross-validation accuracy std: ", np.round(grid_search.cv_results_['std_test_accuracy'][grid_search.best_index_],4))

grid_sgdc = grid_search.best_estimator_
print(grid_sgdc)  

print('********************')
%time

Best parameters found:  {'sdgc__alpha': 0.5, 'sdgc__eta0': 0.025, 'sdgc__learning_rate': 'adaptive', 'sdgc__loss': 'hinge', 'sdgc__penalty': 'l2'}
Cross-validation recall:  0.8655
Cross-validation recall std:  0.0053
Cross-validation precision:  0.1799
Cross-validation precision std:  0.0007
Cross-validation f1 score:  0.2978
Cross-validation f1 std:  0.0012
Cross-validation accuracy score:  0.5366
Cross-validation accuracy std:  0.0011
Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('std', StandardScaler(),
                                                  ['Age Group',
                                                   'Num of Bad Mental Health '
                                                   'Days',
                                                   'Years Since Last Checkup',
                                                   'Hours of Sleeping',
                                                   'Age Started Smoking',
                                  

In [14]:
# Testing Grid Search sgdc
y_pred = grid_sgdc.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print('Test Scores')
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

Test Scores
Accuracy: 0.5363209251897362
Precision: 0.18014114151293797
Recall: 0.8683111199522578
F1 Score: 0.2983799302754802


## Random Forest Classifier

In [26]:
# Baseline RFC Model
rfc = RandomForestClassifier()

# Even though we are doing an rfc, we are using a standardized dataset 
# since we get better results with it than non-standardized data
rfc.fit(X_train_std, y_train_std)  

y_pred = rfc.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print('Test Scores')
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print('********************')
%time

Test Scores
Accuracy: 0.8410056017347307
Precision: 0.12574404761904762
Recall: 0.06723692062860553
F1 Score: 0.08762151652624757
********************
CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 6.44 µs


In [32]:
#Column Transformer
cols_to_std=['Age Group','Num of Bad Mental Health Days','Years Since Last Checkup','Hours of Sleeping','Age Started Smoking',
 'Cigarettes per Day','Days Drinking','Income','BMI']

ct = ColumnTransformer([('std',StandardScaler(),cols_to_std)])


# rfc pipeline
rfc_pipeline = Pipeline([
    ('ct', ct),
    ('smote', SMOTE(random_state=42)),  
    ('rfc', RandomForestClassifier())
])

# sgdc Parameter Grid  
# Note: more values have been tested than what is shown, small amounts of values were run at a time to increase runtime.
param_grid = {
    'rfc__n_estimators': [29,30,31],
    'rfc__criterion': ['gini'], #using gini to optimize run time
    'rfc__max_depth': [2,3,4],
    'rfc__random_state': [42]
}

# Grid search
grid_search = GridSearchCV(estimator=rfc_pipeline, param_grid=param_grid, cv=5, scoring=['recall','precision','f1','accuracy'],refit='recall')
grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)
print("Cross-validation recall: ", np.round(grid_search.cv_results_['mean_test_recall'][grid_search.best_index_],4))
print("Cross-validation recall std: ", np.round(grid_search.cv_results_['std_test_recall'][grid_search.best_index_],4))
print("Cross-validation precision: ", np.round(grid_search.cv_results_['mean_test_precision'][grid_search.best_index_],4))
print("Cross-validation precision std: ", np.round(grid_search.cv_results_['std_test_precision'][grid_search.best_index_],4))
print("Cross-validation f1 score: ", np.round(grid_search.cv_results_['mean_test_f1'][grid_search.best_index_],4))
print("Cross-validation f1 std: ", np.round(grid_search.cv_results_['std_test_f1'][grid_search.best_index_],4))
print("Cross-validation accuracy score: ", np.round(grid_search.cv_results_['mean_test_accuracy'][grid_search.best_index_],4))
print("Cross-validation accuracy std: ", np.round(grid_search.cv_results_['std_test_accuracy'][grid_search.best_index_],4))

grid_rfc = grid_search.best_estimator_
print(grid_rfc)  

print('********************')
%time

Best parameters found:  {'rfc__criterion': 'gini', 'rfc__max_depth': 3, 'rfc__n_estimators': 30, 'rfc__random_state': 42}
Cross-validation recall:  0.7859
Cross-validation recall std:  0.0051
Cross-validation precision:  0.1969
Cross-validation precision std:  0.0014
Cross-validation f1 score:  0.3149
Cross-validation f1 std:  0.0017
Cross-validation accuracy score:  0.6118
Cross-validation accuracy std:  0.0044
Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('std', StandardScaler(),
                                                  ['Age Group',
                                                   'Num of Bad Mental Health '
                                                   'Days',
                                                   'Years Since Last Checkup',
                                                   'Hours of Sleeping',
                                                   'Age Started Smoking',
                                                   'Cigaret

In [33]:
# Testing Grid Search rfc: score the tuned RFC pipeline on the held-out test
# split. The pipeline applies its own scaling, so the raw X_test is used.
y_pred = grid_rfc.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print('Test Scores')
for label, value in (('Accuracy', accuracy), ('Precision', precision),
                     ('Recall', recall), ('F1 Score', f1)):
    print(f'{label}: {value}')

Test Scores
Accuracy: 0.6132770148174919
Precision: 0.19674022066198596
Recall: 0.7803859160533121
F1 Score: 0.31425481635759206


In [35]:
# Persist the tuned SGDC and RFC pipelines to disk.
# The context manager guarantees each file handle is flushed and closed;
# passing a bare open(...) to pickle.dump leaves the handle open.
for model, path in ((grid_sgdc, 'sgdc_model.pkl'), (grid_rfc, 'rfc_model.pkl')):
    with open(path, 'wb') as model_file:
        pickle.dump(model, model_file)

In [None]:
# Persist the tuned NB pipeline to disk.
# The context manager guarantees the file handle is flushed and closed;
# passing a bare open(...) to pickle.dump leaves the handle open.
with open('nb_model.pkl', 'wb') as model_file:
    pickle.dump(grid_nb, model_file)