In [1]:
#Andreas Constantinou
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from scipy.stats import spearmanr

df = pd.read_csv("diabetes_012.csv")
df['Diabetes_012'].replace({2.0:1.0}, inplace = True)
oversample_copy = df[[ 'HighBP', 'HighChol', 'Diabetes_012']]
oversample_copy.sample(10)


Unnamed: 0,HighBP,HighChol,Diabetes_012
140531,0.0,0.0,0.0
253184,0.0,1.0,0.0
94015,1.0,0.0,0.0
24946,1.0,1.0,1.0
218434,1.0,0.0,0.0
148268,1.0,1.0,0.0
806,0.0,1.0,0.0
182253,0.0,1.0,1.0
196124,0.0,0.0,0.0
28239,0.0,0.0,0.0


In [2]:
x_train, x_test, y_train, y_test = train_test_split(oversample_copy[['HighBP', 'HighChol']], oversample_copy['Diabetes_012'], test_size = .15, train_size = .85)
x_train2, x_val, y_train2, y_val = train_test_split(x_train,y_train, test_size = .15, train_size = .85)

In [3]:
Diabetes = df['HighBP']
column_array = ['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth', 'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education', 'Income']
all_correlations = []
for i in range (len(column_array)):
    current_index = df[column_array[i]]
    spearmanr_coefficient, p_value = spearmanr(Diabetes, current_index)
    all_correlations.append([column_array[i], spearmanr_coefficient])

all_correlations.sort(key = lambda all_correlations : all_correlations[1], reverse = True)
for i in range(len(all_correlations)):
    print("For " + str(all_correlations[i][0]) + ', correlation with diabetes: ' + str(all_correlations[i][1]))

For HighBP, correlation with diabetes: 1.0
For Age, correlation with diabetes: 0.34453457401969934
For GenHlth, correlation with diabetes: 0.3033004153206846
For HighChol, correlation with diabetes: 0.29819929508040555
For BMI, correlation with diabetes: 0.2437560144296664
For DiffWalk, correlation with diabetes: 0.22361846590455572
For HeartDiseaseorAttack, correlation with diabetes: 0.2093612106900363
For PhysHlth, correlation with diabetes: 0.1495504814647678
For Stroke, correlation with diabetes: 0.12957491304959706
For CholCheck, correlation with diabetes: 0.09850827252915051
For Smoker, correlation with diabetes: 0.09699146704969772
For Sex, correlation with diabetes: 0.052206960687285774
For AnyHealthcare, correlation with diabetes: 0.038424768934771536
For MentHlth, correlation with diabetes: 0.02049928526481007
For NoDocbcCost, correlation with diabetes: 0.017357983577041575
For HvyAlcoholConsump, correlation with diabetes: -0.003971573995750689
For Fruits, correlation with di

In [4]:
x = x_train2
y = y_train2

In [5]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(sampling_strategy = 'minority', k_neighbors = 15)
X, Y = smote.fit_resample(x,y)
Y.value_counts()

0.0    154269
1.0    154269
Name: Diabetes_012, dtype: int64

In [6]:
LogReg = LogisticRegression(max_iter=1000).fit(X,Y)
print("Logistic Regression score: %" + str(round((LogReg.score(x_val,y_val)) * 100 , 2)))

Logistic Regression score: %64.59


In [7]:
#Person is healthy
HighBP_ = 0 
HighChol_ = 0 
LogReg.predict(np.array([[HighBP_, HighChol_]]))[0]
# 0 = No Diabetes, 1 = Diabetes



0.0

In [8]:
#Person has high cholesterol
HighBP_ = 0 
HighChol_ = 1 
LogReg.predict(np.array([[HighBP_,HighChol_]]))[0]
# 0 = No Diabetes, 1 = Diabetes



0.0

In [9]:
#Person has high blood pressure
HighBP_ = 1 
HighChol_ = 0 
LogReg.predict(np.array([[HighBP_,HighChol_]]))[0]
# 0 = No Diabetes, 1 = Diabetes



1.0

In [10]:
#Person has high blood pressure and high cholesterol
HighBP_ = 1 
HighChol_ = 1 
LogReg.predict(np.array([[HighBP_,HighChol_]]))[0]
# 0 = No Diabetes, 1 = Diabetes



1.0