In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

df = pd.read_csv('diabetes_012_health_indicators_BRFSS2015.csv')
df['Diabetes_012'].replace({2.0:1.0}, inplace = True)

In [5]:
df = df[['Diabetes_012', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth', 'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education', 'Income']]
#Top 5 Columns
#df = df[['Diabetes_012', 'HighBP', 'HighChol','BMI','GenHlth','DiffWalk']]

#For Undersampling, comment out this section below (Age and BMI binarization) to get better performance
df['Age'] = np.where(df['Age'].between(1,6), 0, df['Age']) #Between and including 1 and 6
df['Age'] = np.where(df['Age'].between(7,13), 1, df['Age']) #Between and including 7 and 13

df['BMI'] = np.where(df['BMI'].between(18.5, 29.9), 0, df['BMI']) 
df['BMI'] = np.where(df['BMI'].between(1, 18.4), 1, df['BMI'])
df['BMI'] = np.where(df['BMI'].between(30, 50), 1, df['BMI']) 
df.sample(5)

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
216501,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,6.0,8.0
170903,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,2.0,0.0,0.0,0.0,6.0,8.0
18675,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,3.0,30.0,0.0,0.0,0.0,1.0,6.0,8.0
253597,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,6.0,8.0
197743,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,4.0,8.0


In [6]:
# Train test split
x_train, x_test, y_train, y_test = train_test_split(df[['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth', 'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education', 'Income']], df['Diabetes_012'], test_size = 0.15)
#Top 5 Columns (Train test split)
#x_train, x_test, y_train, y_test = train_test_split(df[['HighBP', 'HighChol','BMI','GenHlth','DiffWalk']], df['Diabetes_012'], test_size = 0.15)

x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, test_size = 0.15)

In [7]:
############### CLASS_WEIGHT = BALANCED ###############

In [8]:
# Train model
RandForest = RandomForestClassifier(class_weight = "balanced").fit(x_tr, y_tr)
print("Random Forest score: %" + str(round((RandForest.score(x_val, y_val)) * 100 , 2)))

Random Forest score: %80.85


In [9]:
#CLASSIFICATION REPORT
y_predict = RandForest.predict(x_test)
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

         0.0       0.87      0.91      0.89     32050
         1.0       0.35      0.26      0.30      6002

    accuracy                           0.81     38052
   macro avg       0.61      0.58      0.59     38052
weighted avg       0.79      0.81      0.79     38052



In [10]:
#CONFUSION MATRIX
print(confusion_matrix(y_test, y_predict))
array = (confusion_matrix(y_test, y_predict))
print ("The amount of True Positives are: " + str(array[1][1]))
print ("The amount of True Negatives are: " + str(array[0][0]))

[[29121  2929]
 [ 4438  1564]]
The amount of True Positives are: 1564
The amount of True Negatives are: 29121


In [11]:
############### OVERSAMPLING ###############

In [12]:
from imblearn import over_sampling
from collections import Counter
from imblearn.over_sampling import RandomOverSampler

# OverSampling Data
ros = RandomOverSampler(random_state = 0)
x_resample, y_resample = ros.fit_resample(x_tr, y_tr)

In [13]:
#Training the model
RandForest_OverSamp = RandomForestClassifier().fit(x_resample, y_resample)
print("Random Forest score: %" + str(round((RandForest_OverSamp.score(x_val, y_val)) * 100, 2)))

Random Forest score: %78.57


In [14]:
#CLASSIFICATION REPORT
y_predict_OverSamp = RandForest_OverSamp.predict(x_test)
print(classification_report(y_test, y_predict_OverSamp))

              precision    recall  f1-score   support

         0.0       0.88      0.85      0.87     32050
         1.0       0.33      0.39      0.36      6002

    accuracy                           0.78     38052
   macro avg       0.61      0.62      0.61     38052
weighted avg       0.80      0.78      0.79     38052



In [15]:
#CONFUSION MATRIX
print(confusion_matrix(y_test, y_predict_OverSamp))
array_over = (confusion_matrix(y_test, y_predict_OverSamp))
print ("The amount of True Positives are: " + str(array_over[1][1]))
print ("The amount of True Negatives are: " + str(array_over[0][0]))

[[27342  4708]
 [ 3660  2342]]
The amount of True Positives are: 2342
The amount of True Negatives are: 27342


In [16]:
############### SMOTE-OVERSAMPLING ###############

In [17]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(sampling_strategy = 'minority', k_neighbors = 15)
x_smote, y_smote = smote.fit_resample(x_tr, y_tr)

In [18]:
#Training the model
RandForest_smote = RandomForestClassifier().fit(x_smote, y_smote)
print("Random Forest score: %" + str(round((RandForest_smote.score(x_val, y_val)) * 100, 2)))

Random Forest score: %82.51


In [19]:
#CLASSIFICATION REPORT
y_predict_smote = RandForest_smote.predict(x_test)
print(classification_report(y_test, y_predict_smote))

              precision    recall  f1-score   support

         0.0       0.87      0.93      0.90     32050
         1.0       0.40      0.25      0.30      6002

    accuracy                           0.82     38052
   macro avg       0.63      0.59      0.60     38052
weighted avg       0.79      0.82      0.80     38052



In [20]:
#CONFUSION MATRIX
print(confusion_matrix(y_test, y_predict_smote))
array_smote = (confusion_matrix(y_test, y_predict_smote))
print ("The amount of True Positives are: " + str(array_smote[1][1]))
print ("The amount of True Negatives are: " + str(array_smote[0][0]))

[[29788  2262]
 [ 4522  1480]]
The amount of True Positives are: 1480
The amount of True Negatives are: 29788


In [21]:
############### UNDERSAMPLING(Comment out Age and BMI binarization to get better results) ###############

In [22]:
from imblearn import under_sampling
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state = 0)
x_undersample, y_undersample = rus.fit_resample(x_tr, y_tr)

In [23]:
#Training the model
RandForest_under = RandomForestClassifier().fit(x_undersample, y_undersample)
print("Random Forest score: %" + str(round((RandForest_under.score(x_val, y_val)) * 100, 2)))

Random Forest score: %68.75


In [24]:
#CLASSIFICATION REPORT
y_predict_under = RandForest_under.predict(x_test)
print(classification_report(y_test, y_predict_under))

              precision    recall  f1-score   support

         0.0       0.93      0.68      0.78     32050
         1.0       0.30      0.74      0.43      6002

    accuracy                           0.69     38052
   macro avg       0.62      0.71      0.61     38052
weighted avg       0.83      0.69      0.73     38052



In [25]:
#CONFUSION MATRIX
print(confusion_matrix(y_test, y_predict_under))
array_under = (confusion_matrix(y_test, y_predict_under))
print ("The amount of True Positives are: " + str(array_under[1][1]))
print ("The amount of True Negatives are: " + str(array_under[0][0]))

[[21641 10409]
 [ 1542  4460]]
The amount of True Positives are: 4460
The amount of True Negatives are: 21641


In [26]:
############### NO UNDERSAMPLING, OVERSAMPLING, SMOTE, BALANCED WEIGHT ###############

In [27]:
# Train model
RandForest_nothing = RandomForestClassifier().fit(x_tr, y_tr)
print("Random Forest score: %" + str(round((RandForest_nothing.score(x_val, y_val)) * 100 , 2)))

Random Forest score: %83.35


In [28]:
#CLASSIFICATION REPORT
y_predict_nothing = RandForest_nothing.predict(x_test)
print(classification_report(y_test, y_predict_nothing))

              precision    recall  f1-score   support

         0.0       0.87      0.95      0.90     32050
         1.0       0.43      0.21      0.28      6002

    accuracy                           0.83     38052
   macro avg       0.65      0.58      0.59     38052
weighted avg       0.80      0.83      0.81     38052



In [30]:
#CONFUSION MATRIX
print(confusion_matrix(y_test, y_predict_nothing))
array_n = (confusion_matrix(y_test, y_predict_nothing))
print ("The amount of True Positives are: " + str(array_n[1][1]))
print ("The amount of True Negatives are: " + str(array_n[0][0]))

[[30378  1672]
 [ 4731  1271]]
The amount of True Positives are: 1271
The amount of True Negatives are: 30378
