# data-mining-project

Use the "Run" button to execute the code.

In [2]:
'''
Warren Ball
sources : https://www.geeksforgeeks.org/how-to-reshape-pandas-series/
https://stackoverflow.com/questions/14863125/sklearn-logistic-regression-with-unbalanced-classes
https://machinelearningmastery.com/cost-sensitive-logistic-regression/
'''

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import confusion_matrix
from scipy.stats import spearmanr
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.metrics import classification_report


import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


In [3]:
df = pd.read_csv("diabetes_binary.csv")
df.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [4]:
df.columns

Index(['Diabetes_binary', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
       'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
       'Income'],
      dtype='object')

In [5]:
# Checking correlation with all features
Diabetes = df['Diabetes_binary']
#Every single column is in this array
column_array = ['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth', 'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education', 'Income']
all_correlations = []
for i in range (len(column_array)):
    current_index = df[column_array[i]]
    spearmanr_coefficient, p_value = spearmanr(Diabetes, current_index)
    all_correlations.append([column_array[i], spearmanr_coefficient])

all_correlations.sort(key = lambda all_correlations : all_correlations[1], reverse = True)
for i in range(len(all_correlations)):
    print("For " + str(all_correlations[i][0]) + ', correlation with diabetes: ' + str(all_correlations[i][1]))

For GenHlth, correlation with diabetes: 0.28769724599353597
For HighBP, correlation with diabetes: 0.26312878992336214
For BMI, correlation with diabetes: 0.22631421069982066
For DiffWalk, correlation with diabetes: 0.21834435192101798
For HighChol, correlation with diabetes: 0.20027619187912013
For Age, correlation with diabetes: 0.17768426329341755
For HeartDiseaseorAttack, correlation with diabetes: 0.1772822578072029
For PhysHlth, correlation with diabetes: 0.15675169713222603
For Stroke, correlation with diabetes: 0.10581606726811367
For CholCheck, correlation with diabetes: 0.06476081015893631
For Smoker, correlation with diabetes: 0.06078850564034085
For MentHlth, correlation with diabetes: 0.04004895281242356
For NoDocbcCost, correlation with diabetes: 0.031432763359259056
For Sex, correlation with diabetes: 0.03142999802068085
For AnyHealthcare, correlation with diabetes: 0.016255139545865795
For Fruits, correlation with diabetes: -0.04077922810406531
For Veggies, correlation 

In [6]:
#note, starting with highest correlation, i kept adding cols until scores get lower
#then i stopped
df = df[['Diabetes_binary', 'GenHlth', 'HighBP', 'BMI', 'DiffWalk', 'HighChol', 'Age']]

x_full, x_test, y_full, y_test = train_test_split(df[['GenHlth', 'HighBP', 'BMI', 'DiffWalk', 'HighChol', 'Age']], df['Diabetes_binary'], test_size = 0.15)

x_train, x_val, y_train, y_val = train_test_split(x_full, y_full, test_size = 0.15)


In [7]:
mod_obj = CategoricalNB(fit_prior=False).fit(x_train, y_train)

In [8]:
print(mod_obj.score(x_val, y_val))
pred_y = mod_obj.predict(x_val)
print(confusion_matrix(y_val, pred_y))
print(classification_report(y_val, pred_y))


0.715844798268666
[[19688  8141]
 [ 1050  3466]]
              precision    recall  f1-score   support

         0.0       0.95      0.71      0.81     27829
         1.0       0.30      0.77      0.43      4516

    accuracy                           0.72     32345
   macro avg       0.62      0.74      0.62     32345
weighted avg       0.86      0.72      0.76     32345



In [9]:
#from imblearn import over_sampling
from imblearn.over_sampling import RandomOverSampler

x_over, y_over = RandomOverSampler(random_state = 0).fit_resample(x_train, y_train)

#okay, with new data, we can use oversampled data to see if it improves naive bayes
#also we can stop fit_prior since it doesn't help anymore
mod_over_obj = CategoricalNB().fit(x_over, y_over)

In [10]:
print(mod_over_obj.score(x_val, y_val))
over_pred_y = mod_over_obj.predict(x_val)
print(confusion_matrix(y_val, over_pred_y))
print(classification_report(y_val, over_pred_y))

0.7152264646776936
[[19666  8163]
 [ 1048  3468]]
              precision    recall  f1-score   support

         0.0       0.95      0.71      0.81     27829
         1.0       0.30      0.77      0.43      4516

    accuracy                           0.72     32345
   macro avg       0.62      0.74      0.62     32345
weighted avg       0.86      0.72      0.76     32345



In [11]:
from imblearn.over_sampling import SMOTE

#note i tried a bunch of different k_neighbor. 10 seems to be the best
smote_x, smote_y = SMOTE(sampling_strategy = 'minority', k_neighbors = 10).fit_resample(x_train, y_train)
mod_obj_smote = CategoricalNB().fit(smote_x, smote_y)

In [12]:
print(mod_obj_smote.score(x_val, y_val))
smote_pred_y = mod_obj_smote.predict(x_val)
print(confusion_matrix(y_val, smote_pred_y))
print(classification_report(y_val, smote_pred_y))

0.7107744628226929
[[19498  8331]
 [ 1024  3492]]
              precision    recall  f1-score   support

         0.0       0.95      0.70      0.81     27829
         1.0       0.30      0.77      0.43      4516

    accuracy                           0.71     32345
   macro avg       0.62      0.74      0.62     32345
weighted avg       0.86      0.71      0.75     32345



In [13]:
from imblearn import under_sampling
from imblearn.under_sampling import RandomUnderSampler

x_under, y_under = RandomUnderSampler(random_state = 0).fit_resample(x_train,y_train)
mod_under = CategoricalNB().fit(x_under, y_under)

In [14]:
print(mod_under.score(x_val, y_val))
under_pred_y = mod_under.predict(x_val)
print(confusion_matrix(y_val, under_pred_y))
print(classification_report(y_val, under_pred_y))

0.7151646313185964
[[19664  8165]
 [ 1048  3468]]
              precision    recall  f1-score   support

         0.0       0.95      0.71      0.81     27829
         1.0       0.30      0.77      0.43      4516

    accuracy                           0.72     32345
   macro avg       0.62      0.74      0.62     32345
weighted avg       0.86      0.72      0.76     32345



it doesn't seem like it's much better.

okay, our best is SMOTE. At least in terms of TN. so we'll try this for test

In [15]:
print(mod_under.score(x_test, y_test))
under_pred_y = mod_under.predict(x_test)
print(confusion_matrix(y_test, under_pred_y))
print(classification_report(y_test, under_pred_y))

0.7221171029118049
[[23393  9397]
 [ 1177  4085]]
              precision    recall  f1-score   support

         0.0       0.95      0.71      0.82     32790
         1.0       0.30      0.78      0.44      5262

    accuracy                           0.72     38052
   macro avg       0.63      0.74      0.63     38052
weighted avg       0.86      0.72      0.76     38052



Oh wow, it seems like the SMOTE model actually works slightly better than predicted.
With this. I thnk this is the best that naive bayes can offer.

Below is what the function would look like without anything done to it. It achieves a high accuracy, but it's at
the expense of poorly predicted positive diabetes cases. This is useless for our purposes.

In [17]:

df = pd.read_csv("diabetes_binary.csv")

# #print(df)

df = df[['Diabetes_binary', 'GenHlth', 'HighBP', 'BMI', 'DiffWalk', 'HighChol', 'Age']]



x_full, x_test, y_full, y_test = train_test_split(df[['GenHlth', 'HighBP', 'BMI', 'DiffWalk', 'HighChol', 'Age']], df['Diabetes_binary'], test_size = 0.15)

x_train, x_val, y_train, y_val = train_test_split(x_full, y_full, test_size = 0.15)

ret_obj = CategoricalNB().fit(x_train, y_train)


print(ret_obj.score(x_test, y_test))

pred_y = ret_obj.predict(x_test)
print(confusion_matrix(y_test, pred_y))
print(classification_report(y_test, under_pred_y))

0.8341480079890676
[[29436  3273]
 [ 3038  2305]]
              precision    recall  f1-score   support

         0.0       0.86      0.65      0.74     32709
         1.0       0.14      0.36      0.20      5343

    accuracy                           0.61     38052
   macro avg       0.50      0.50      0.47     38052
weighted avg       0.76      0.61      0.66     38052

