In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from scipy.stats import spearmanr

df = pd.read_csv("diabetes_012_health_indicators_BRFSS2015.csv")
df['Diabetes_012'].replace({2.0:1.0}, inplace = True)

#Gathering the data 
#health_df = df[['HighBP', 'HighChol', 'BMI', 'Smoker', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'GenHlth', 'PhysHlth', 'DiffWalk', 'Age', 'Education', 'Income', 'Diabetes_012']]

#This is the dataset with all column names
#oversample_copy = df[['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth', 'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education', 'Income', 'Diabetes_012']]
oversample_copy = df[['Fruits', 'Veggies','Diabetes_012']]
oversample_copy.sample(5)

Unnamed: 0,Fruits,Veggies,Diabetes_012
202073,0.0,0.0,0.0
169745,1.0,1.0,0.0
4227,1.0,1.0,1.0
106925,0.0,0.0,0.0
71579,1.0,1.0,0.0


In [3]:
#Splitting into train then test set
x_train, x_test, y_train, y_test = train_test_split(oversample_copy.drop('Diabetes_012', axis = 1), oversample_copy['Diabetes_012'], test_size = 0.15)
#Splitting into train, validation, then test
x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, test_size = 0.15)

In [4]:
# Now preparing data for OVERSAMPLING using SMOTE which uses K-means clustering
x = x_tr
y = y_tr

In [5]:
y.value_counts()

0.0    154390
1.0     28893
Name: Diabetes_012, dtype: int64

In [6]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(sampling_strategy = 'minority', k_neighbors = 15)
X, Y = smote.fit_resample(x,y)
Y.value_counts()

0.0    154390
1.0    154390
Name: Diabetes_012, dtype: int64

In [7]:
LogReg = LogisticRegression(max_iter=1000).fit(X,Y)
print("Logistic Regression score: %" + str(round((LogReg.score(x_val,y_val)) * 100 , 2)))

Logistic Regression score: %56.48


In [8]:
y_predict = LogReg.predict(x_test)
print (y_test.values)
print (y_predict)

[0. 0. 0. ... 0. 0. 0.]
[1. 0. 0. ... 0. 1. 0.]


In [9]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

         0.0       0.86      0.57      0.69     31966
         1.0       0.19      0.51      0.27      6086

    accuracy                           0.56     38052
   macro avg       0.52      0.54      0.48     38052
weighted avg       0.75      0.56      0.62     38052



In [10]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_predict))

[[18312 13654]
 [ 2956  3130]]


In [11]:
#[ True Negative False positive] 
#[ False Negative True Positive ]
array = (confusion_matrix(y_test, y_predict))
print ("The amount of True Positives are: " + str(array[1][1]))
print ("The amount of True Negatives are: " + str(array[0][0]))

The amount of True Positives are: 3130
The amount of True Negatives are: 18312


In [12]:
#PREDICTION
Fruits_ = 1 # 0 = no, 1 = yes (Eat Fruits?)
Veggies_ = 0 # 0 = no, 1 = yes (Eat veggies?)
LogReg.predict(np.array([[Fruits_, Veggies_]]))[0]
# 0 = No Diabetes, 1 = Diabetes

1.0

In [14]:
# Checking correlation with all features
Diabetes = df['Fruits']
#Every single column is in this array
column_array = ['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth', 'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education', 'Income']
all_correlations = []
for i in range (len(column_array)):
    current_index = df[column_array[i]]
    spearmanr_coefficient, p_value = spearmanr(Diabetes, current_index)
    all_correlations.append([column_array[i], spearmanr_coefficient])

all_correlations.sort(key = lambda all_correlations : all_correlations[1], reverse = True)
for i in range(len(all_correlations)):
    print("For " + str(all_correlations[i][0]) + ', correlation with fruits: ' + str(all_correlations[i][1]))

For Fruits, correlation with fruits: 1.0
For Veggies, correlation with fruits: 0.25434224443197606
For PhysActivity, correlation with fruits: 0.1427558627380735
For Education, correlation with fruits: 0.11492851122865408
For Income, correlation with fruits: 0.07641482388218547
For Age, correlation with fruits: 0.06693417991910805
For AnyHealthcare, correlation with fruits: 0.03154391879221864
For CholCheck, correlation with fruits: 0.023849406401563314
For Stroke, correlation with fruits: -0.013389353021372062
For HeartDiseaseorAttack, correlation with fruits: -0.019790347908616793
For HvyAlcoholConsump, correlation with fruits: -0.035287732906116885
For HighBP, correlation with fruits: -0.0405546586179681
For HighChol, correlation with fruits: -0.04085908133163381
For NoDocbcCost, correlation with fruits: -0.04424268944660032
For PhysHlth, correlation with fruits: -0.04527815403860908
For DiffWalk, correlation with fruits: -0.04835167462915144
For MentHlth, correlation with fruits: -0