<a href="https://colab.research.google.com/github/alexandriaorvis/predicting_diabetes/blob/main/random_forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import dependencies
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
pd.set_option('display.max_columns', None)

In [2]:
# Read in spreadsheet
from google.colab import drive
import os
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Read CSV file directly without changing directory
file_path = '/content/drive/My Drive/predicting_diabetes/Resources/diabetes_2_classes.csv'
diabetes_df = pd.read_csv(file_path)
diabetes_df.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


## Preparing data for Random Forest

In [4]:
# Set our target and feature variables for the RF model
y = diabetes_df['Diabetes_binary']
X = diabetes_df.drop(columns='Diabetes_binary')

In [5]:
# Use SKlearn to train the data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [6]:
# Create the StandardScaler instance
scaler = StandardScaler()

# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Random Forest Model

In [7]:
# Create the random forest classifier instance
rf_model = RandomForestClassifier(n_estimators=500, random_state=1)

In [8]:
# Fit the model and use .ravel()on the "y_train" data.
rf_model = rf_model.fit(X_train_scaled, y_train.ravel())

In [9]:
# Making predictions using the testing data
rf_predictions = rf_model.predict(X_test_scaled)

In [10]:
# Get the feature importance array
importances = rf_model.feature_importances_
# List the top 10 most important features
importances_sorted = sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)
importances_sorted

[(0.18353541000627333, 'BMI'),
 (0.1211699051829007, 'Age'),
 (0.09856545715166677, 'Income'),
 (0.08412441188908018, 'PhysHlth'),
 (0.07169226335995714, 'GenHlth'),
 (0.06984587695890691, 'Education'),
 (0.06341220604990261, 'MentHlth'),
 (0.043146298969736985, 'HighBP'),
 (0.03323610092494402, 'Smoker'),
 (0.03302956823085998, 'Fruits'),
 (0.027938287624013763, 'Sex'),
 (0.027002815609910902, 'HighChol'),
 (0.026566816620707295, 'PhysActivity'),
 (0.026457372245987872, 'Veggies'),
 (0.023695814305337123, 'DiffWalk'),
 (0.01905381521405324, 'HeartDiseaseorAttack'),
 (0.014949180140879498, 'NoDocbcCost'),
 (0.012346024981504474, 'Stroke'),
 (0.00873416623117736, 'AnyHealthcare'),
 (0.007824841074291962, 'HvyAlcoholConsump'),
 (0.003673367227907871, 'CholCheck')]

In [11]:
# Create and save the training classification report
rf_training_report = classification_report(y_test, rf_predictions)

# Print the training classification report
print(rf_training_report)

              precision    recall  f1-score   support

         0.0       0.88      0.97      0.92     54551
         1.0       0.50      0.17      0.25      8869

    accuracy                           0.86     63420
   macro avg       0.69      0.57      0.59     63420
weighted avg       0.82      0.86      0.83     63420



In [27]:
from sklearn.metrics import roc_auc_score
accuracy_score = roc_auc_score(y_test, rf_model.predict_proba(X_test_scaled)[:, 1])
accuracy_score

0.8004097437939114

# Model Optimization

## Scaling only continuous variables

Most of the features are catagorical. We attempted to optimize the data by scaling ONLY the continuous features.

In [13]:
# Copy the original df for scaling
scaled_df = diabetes_df.copy()

# Define the continuous columns to be scaled
scaled_cols = ['BMI', 'MentHlth', 'PhysHlth']

# Scale the columns in the copied dataframe
diabetes_scaled = StandardScaler().fit_transform(scaled_df[scaled_cols])

# Make these new scaled columns into a dataframe and display
diabetes_scaled = pd.DataFrame(diabetes_scaled, columns = ['BMI_scaled', 'MentHlth_scaled', 'PhysHlth_scaled'])

In [14]:
#Concatinate the original dataframe and the new scaled columns
diabetes_scaled = pd.concat([scaled_df, diabetes_scaled], axis=1)
diabetes_scaled.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,BMI_scaled,MentHlth_scaled,PhysHlth_scaled
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0,1.757936,1.998592,1.233999
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0,-0.511806,-0.42963,-0.486592
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0,-0.057858,3.617407,2.95459
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0,-0.209174,-0.42963,-0.486592
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0,-0.663122,-0.024926,-0.486592


In [15]:
# Drop columns that are going to be scaled
scaled_df = diabetes_scaled.drop(columns = scaled_cols)
scaled_df.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,DiffWalk,Sex,Age,Education,Income,BMI_scaled,MentHlth_scaled,PhysHlth_scaled
0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,5.0,1.0,0.0,9.0,4.0,3.0,1.757936,1.998592,1.233999
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,7.0,6.0,1.0,-0.511806,-0.42963,-0.486592
2,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,5.0,1.0,0.0,9.0,4.0,8.0,-0.057858,3.617407,2.95459
3,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,11.0,3.0,6.0,-0.209174,-0.42963,-0.486592
4,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,11.0,5.0,4.0,-0.663122,-0.024926,-0.486592


# Scaled Variables Random Forest


In [16]:
# Set our target and feature variables for the ML model
y = scaled_df['Diabetes_binary']
X = scaled_df.drop(columns='Diabetes_binary')

In [17]:
# Use SKlearn to train the data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [18]:
# Create the random forest classifier instance
rf_model_2 = RandomForestClassifier(n_estimators=500, random_state=1)

In [19]:
# Fit the model and use .ravel()on the "y_train" data.
rf_model_2 = rf_model_2.fit(X_train, y_train.ravel())

In [21]:
# Making predictions using the testing data
rf_predictions_2 = rf_model_2.predict(X_test)

In [23]:
# Create and save the training classification report
rf_training_report_2 = classification_report(y_test, rf_predictions)

# Print the training classification report
print(rf_training_report_2)

              precision    recall  f1-score   support

         0.0       0.88      0.97      0.92     54551
         1.0       0.50      0.17      0.25      8869

    accuracy                           0.86     63420
   macro avg       0.69      0.57      0.59     63420
weighted avg       0.83      0.86      0.83     63420



In [29]:
accuracy_score_2 = roc_auc_score(y_test, rf_model_2.predict_proba(X_test)[:, 1])
accuracy_score_2

0.8003314221816846

Scalign the continuous variables compared to scaling all of the variables does not make a significant difference in the accuracy, recall, or precision of the model.

## Feature Engineering

In [30]:
scaled_df["Age_/_BMI"] = scaled_df["BMI_scaled"] / scaled_df["Age"]
scaled_df["MentHlth_*_BMI"] = scaled_df["MentHlth_scaled"] * scaled_df["BMI_scaled"]
scaled_df["Age_*_BMI"] = scaled_df["BMI_scaled"] * scaled_df["Age"]
scaled_df["Income_*_BMI"] = scaled_df["Income"] * scaled_df["BMI_scaled"]
scaled_df["PhysHlth_*_MentHlth"] = scaled_df["PhysHlth_scaled"] * scaled_df["MentHlth_scaled"]
scaled_df.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,DiffWalk,Sex,Age,Education,Income,BMI_scaled,MentHlth_scaled,PhysHlth_scaled,Age_/_BMI,MentHlth_*_BMI,Age_*_BMI,Income_*_BMI,PhysHlth_*_MentHlth
0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,5.0,1.0,0.0,9.0,4.0,3.0,1.757936,1.998592,1.233999,0.195326,3.513396,15.821421,5.273807,2.46626
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,7.0,6.0,1.0,-0.511806,-0.42963,-0.486592,-0.073115,0.219887,-3.582643,-0.511806,0.209055
2,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,5.0,1.0,0.0,9.0,4.0,8.0,-0.057858,3.617407,2.95459,-0.006429,-0.209295,-0.52072,-0.462862,10.687953
3,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,11.0,3.0,6.0,-0.209174,-0.42963,-0.486592,-0.019016,0.089867,-2.300913,-1.255043,0.209055
4,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,11.0,5.0,4.0,-0.663122,-0.024926,-0.486592,-0.060284,0.016529,-7.294345,-2.652489,0.012129


In [31]:
# Dropping the least important columns
least_columns = ["CholCheck", "HvyAlcoholConsump", "AnyHealthcare", "Stroke", "NoDocbcCost"]
scaled_df.drop(columns=least_columns)

Unnamed: 0,Diabetes_binary,HighBP,HighChol,Smoker,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,GenHlth,DiffWalk,Sex,Age,Education,Income,BMI_scaled,MentHlth_scaled,PhysHlth_scaled,Age_/_BMI,MentHlth_*_BMI,Age_*_BMI,Income_*_BMI,PhysHlth_*_MentHlth
0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,5.0,1.0,0.0,9.0,4.0,3.0,1.757936,1.998592,1.233999,0.195326,3.513396,15.821421,5.273807,2.466260
1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0,7.0,6.0,1.0,-0.511806,-0.429630,-0.486592,-0.073115,0.219887,-3.582643,-0.511806,0.209055
2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,5.0,1.0,0.0,9.0,4.0,8.0,-0.057858,3.617407,2.954590,-0.006429,-0.209295,-0.520720,-0.462862,10.687953
3,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0,0.0,0.0,11.0,3.0,6.0,-0.209174,-0.429630,-0.486592,-0.019016,0.089867,-2.300913,-1.255043,0.209055
4,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,2.0,0.0,0.0,11.0,5.0,4.0,-0.663122,-0.024926,-0.486592,-0.060284,0.016529,-7.294345,-2.652489,0.012129
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253675,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,3.0,0.0,1.0,5.0,6.0,7.0,2.514516,-0.429630,0.086938,0.502903,-1.080311,12.572581,17.601614,-0.037351
253676,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0,11.0,2.0,4.0,-1.571019,-0.429630,-0.486592,-0.142820,0.674956,-17.281209,-6.284076,0.209055
253677,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,2.0,5.0,2.0,-0.057858,-0.429630,-0.486592,-0.028929,0.024857,-0.115716,-0.115716,0.209055
253678,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,3.0,0.0,1.0,7.0,5.0,1.0,-0.814438,-0.429630,-0.486592,-0.116348,0.349907,-5.701069,-0.814438,0.209055


In [32]:
# Change data types to string to turn categorical variables into dummy variables
catagorical_columns = ["GenHlth", "Age", "Income", "Education"]
scaled_df[catagorical_columns] = scaled_df[catagorical_columns].astype(object)
scaled_df.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,DiffWalk,Sex,Age,Education,Income,BMI_scaled,MentHlth_scaled,PhysHlth_scaled,Age_/_BMI,MentHlth_*_BMI,Age_*_BMI,Income_*_BMI,PhysHlth_*_MentHlth
0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,5.0,1.0,0.0,9.0,4.0,3.0,1.757936,1.998592,1.233999,0.195326,3.513396,15.821421,5.273807,2.46626
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,7.0,6.0,1.0,-0.511806,-0.42963,-0.486592,-0.073115,0.219887,-3.582643,-0.511806,0.209055
2,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,5.0,1.0,0.0,9.0,4.0,8.0,-0.057858,3.617407,2.95459,-0.006429,-0.209295,-0.52072,-0.462862,10.687953
3,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,11.0,3.0,6.0,-0.209174,-0.42963,-0.486592,-0.019016,0.089867,-2.300913,-1.255043,0.209055
4,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,11.0,5.0,4.0,-0.663122,-0.024926,-0.486592,-0.060284,0.016529,-7.294345,-2.652489,0.012129


### Splitting data

In [33]:
# Set our target and feature variables for the ML model
y = scaled_df['Diabetes_binary']
X = scaled_df.drop(columns='Diabetes_binary')

In [34]:
# Set catagorical variables as dummy variables
X = pd.get_dummies(X)
X.head()

Unnamed: 0,HighBP,HighChol,CholCheck,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,DiffWalk,Sex,BMI_scaled,MentHlth_scaled,PhysHlth_scaled,Age_/_BMI,MentHlth_*_BMI,Age_*_BMI,Income_*_BMI,PhysHlth_*_MentHlth,GenHlth_1.0,GenHlth_2.0,GenHlth_3.0,GenHlth_4.0,GenHlth_5.0,Age_1.0,Age_2.0,Age_3.0,Age_4.0,Age_5.0,Age_6.0,Age_7.0,Age_8.0,Age_9.0,Age_10.0,Age_11.0,Age_12.0,Age_13.0,Education_1.0,Education_2.0,Education_3.0,Education_4.0,Education_5.0,Education_6.0,Income_1.0,Income_2.0,Income_3.0,Income_4.0,Income_5.0,Income_6.0,Income_7.0,Income_8.0
0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.757936,1.998592,1.233999,0.195326,3.513396,15.821421,5.273807,2.46626,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False
1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.511806,-0.42963,-0.486592,-0.073115,0.219887,-3.582643,-0.511806,0.209055,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False
2,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,-0.057858,3.617407,2.95459,-0.006429,-0.209295,-0.52072,-0.462862,10.687953,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True
3,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,-0.209174,-0.42963,-0.486592,-0.019016,0.089867,-2.300913,-1.255043,0.209055,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False
4,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,-0.663122,-0.024926,-0.486592,-0.060284,0.016529,-7.294345,-2.652489,0.012129,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False


In [35]:
# Use SKlearn to train the data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [36]:
# Create the random forest classifier instance
rf_model_3 = RandomForestClassifier(n_estimators=500, random_state=1)

In [37]:
# Fit the model and use .ravel()on the "y_train" data.
rf_model_3 = rf_model_3.fit(X_train, y_train.ravel())

In [38]:
# Making predictions using the testing data
rf_predictions_3 = rf_model_3.predict(X_test)

In [39]:
# Create and save the training classification report
rf_training_report_3 = classification_report(y_test, rf_predictions)

# Print the training classification report
print(rf_training_report_3)

              precision    recall  f1-score   support

         0.0       0.88      0.97      0.92     54551
         1.0       0.50      0.17      0.25      8869

    accuracy                           0.86     63420
   macro avg       0.69      0.57      0.59     63420
weighted avg       0.83      0.86      0.83     63420



In [40]:
accuracy_score_3 = roc_auc_score(y_test, rf_model_3.predict_proba(X_test)[:, 1])
accuracy_score_3

0.7964348367131628

# All Classificatino Reports

In [41]:
print(rf_training_report)
print(rf_training_report_2)
print(rf_training_report_3)

              precision    recall  f1-score   support

         0.0       0.88      0.97      0.92     54551
         1.0       0.50      0.17      0.25      8869

    accuracy                           0.86     63420
   macro avg       0.69      0.57      0.59     63420
weighted avg       0.83      0.86      0.83     63420

              precision    recall  f1-score   support

         0.0       0.88      0.97      0.92     54551
         1.0       0.50      0.17      0.25      8869

    accuracy                           0.86     63420
   macro avg       0.69      0.57      0.59     63420
weighted avg       0.83      0.86      0.83     63420

              precision    recall  f1-score   support

         0.0       0.88      0.97      0.92     54551
         1.0       0.50      0.17      0.25      8869

    accuracy                           0.86     63420
   macro avg       0.69      0.57      0.59     63420
weighted avg       0.83      0.86      0.83     63420

