<a href="https://colab.research.google.com/github/alexandriaorvis/predicting_diabetes/blob/main/Scaled_Continuous_Random_Forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import dependencies
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
pd.set_option('display.max_columns', None)

In [2]:
# Mount Google Drive so that files stored there can be read by later cells
from google.colab import drive
import os
mount_point = '/content/drive'
drive.mount(mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Load the two-class diabetes dataset from the mounted Drive folder.
# An absolute path is used, so the working directory does not need to change.
file_path = '/content/drive/My Drive/predicting_diabetes/Resources/diabetes_2_classes.csv'
diabetes_df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows
diabetes_df.head(n=5)

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


The following code represents our efforts to fully understand the nature of each feature in order to determine how the features might be scaled to produce the most accurate model.
<br>-BMI: continuous variable
<br>-Age: cleaned to be categorical from 1 - 13
<br>-General Health: a categorical feature from 1 - 5
<br>-Mental Health: number of days, ranging from 0 - 30
<br>-Physical Health: number of days, ranging from 0 - 30
<br>-Education: Categorical variable from 1 - 6
<br>-Income: Categorical variable from 1 - 8


Scaled continuous variables random forest

In [8]:
# Work on a copy so the raw dataframe (diabetes_df) stays untouched
scaled_df = diabetes_df.copy()

# Continuous columns to standardize; every other column keeps its raw values
scaled_cols = ['BMI', 'MentHlth', 'PhysHlth']

# Standardize the continuous columns (zero mean, unit variance).
# NOTE(review): the scaler is fit on the full dataset, i.e. before the
# train/test split, so test-set statistics influence the scaling. Fitting on
# the training rows only would avoid that leakage.
scaled_values = StandardScaler().fit_transform(scaled_df[scaled_cols])

# Wrap the scaled array in a dataframe. Column names are derived from
# scaled_cols so the two can never drift apart, and the source index is reused
# so that a later column-wise concat aligns row for row even if the index is
# not a default RangeIndex.
diabetes_scaled = pd.DataFrame(
    scaled_values,
    columns=[f'{col}_scaled' for col in scaled_cols],
    index=scaled_df.index,
)

diabetes_scaled.head()


Unnamed: 0,BMI_scaled,MentHlth_scaled,PhysHlth_scaled
0,1.757936,1.998592,1.233999
1,-0.511806,-0.42963,-0.486592
2,-0.057858,3.617407,2.95459
3,-0.209174,-0.42963,-0.486592
4,-0.663122,-0.024926,-0.486592


In [9]:
# Concatenate the original dataframe and the new scaled columns side by side
frames_to_join = [scaled_df, diabetes_scaled]
diabetes_scaled = pd.concat(frames_to_join, axis='columns')

# Preview: raw columns followed by the three *_scaled columns
diabetes_scaled.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,BMI_scaled,MentHlth_scaled,PhysHlth_scaled
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0,1.757936,1.998592,1.233999
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0,-0.511806,-0.42963,-0.486592
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0,-0.057858,3.617407,2.95459
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0,-0.209174,-0.42963,-0.486592
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0,-0.663122,-0.024926,-0.486592


In [13]:
# Remove the raw (unscaled) versions of the continuous columns; only their
# *_scaled counterparts remain alongside the untouched features
scaled_df = diabetes_scaled.drop(scaled_cols, axis='columns')

In [14]:
# Check that the raw continuous columns are gone and the scaled ones are present
scaled_df.head(n=5)

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,DiffWalk,Sex,Age,Education,Income,BMI_scaled,MentHlth_scaled,PhysHlth_scaled
0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,5.0,1.0,0.0,9.0,4.0,3.0,1.757936,1.998592,1.233999
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,7.0,6.0,1.0,-0.511806,-0.42963,-0.486592
2,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,5.0,1.0,0.0,9.0,4.0,8.0,-0.057858,3.617407,2.95459
3,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,11.0,3.0,6.0,-0.209174,-0.42963,-0.486592
4,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,11.0,5.0,4.0,-0.663122,-0.024926,-0.486592


In [15]:
# Split the dataframe into the target (y) and the feature matrix (X)
target_col = 'Diabetes_binary'
y = scaled_df[target_col]
X = scaled_df.drop(target_col, axis='columns')

In [16]:
# Split features and target into training and testing sets.
# random_state is fixed so the split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [17]:
# Create the StandardScaler instance and fit it on the training rows only,
# so no test-set statistics are used by this scaler
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both the training and the testing features.
# NOTE(review): X already contains BMI_scaled / MentHlth_scaled /
# PhysHlth_scaled, so this step standardizes those columns a second time and
# also scales the binary and ordinal columns. Confirm this is intended for the
# "scaled continuous" experiment.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

# Scaled Continuous Random Forest Classification

In [18]:
# Instantiate the random forest classifier: 500 trees, with a fixed seed so
# the fitted model is reproducible
rf_model = RandomForestClassifier(
    n_estimators=500,
    random_state=78,
)

In [19]:
# Fit the model on the scaled training features.
# y_train is a pandas Series; Series.ravel() is deprecated in recent pandas
# releases, so the labels are handed over as a 1-D NumPy array via
# .to_numpy() instead.
rf_model = rf_model.fit(X_train_scaled, y_train.to_numpy())

In [20]:
# Predict the class label for every row of the scaled testing features
rf_predictions = rf_model.predict(X=X_test_scaled)

In [21]:
# Feature importances of the fitted forest, one value per column of X
importances = rf_model.feature_importances_

# Pair each importance with its feature name and list all features,
# most important first
importances_sorted = sorted(
    zip(importances, X.columns),
    reverse=True,
)
importances_sorted

[(0.184239643755795, 'BMI_scaled'),
 (0.1215411005058792, 'Age'),
 (0.09828969827062785, 'Income'),
 (0.0836137019487867, 'PhysHlth_scaled'),
 (0.07085887481717447, 'GenHlth'),
 (0.06914777431618098, 'Education'),
 (0.06363067460930899, 'MentHlth_scaled'),
 (0.04306473443080814, 'HighBP'),
 (0.03319645157727806, 'Smoker'),
 (0.03301885748534989, 'Fruits'),
 (0.027811841279045384, 'Sex'),
 (0.027352308440983387, 'HighChol'),
 (0.02639937350999149, 'Veggies'),
 (0.02639184551290278, 'PhysActivity'),
 (0.025012954708162716, 'DiffWalk'),
 (0.01881078434694898, 'HeartDiseaseorAttack'),
 (0.015084330683739083, 'NoDocbcCost'),
 (0.012403062667201759, 'Stroke'),
 (0.008695398739328944, 'AnyHealthcare'),
 (0.007762020465648849, 'HvyAlcoholConsump'),
 (0.003674567928857286, 'CholCheck')]

In [22]:
# Build the classification report from the held-out test labels and the
# model's test-set predictions. The variable name says "training", but the
# metrics below describe test performance.
rf_training_report = classification_report(
    y_true=y_test,
    y_pred=rf_predictions,
)

# Print so the report keeps its plain-text table layout
print(rf_training_report)

              precision    recall  f1-score   support

         0.0       0.88      0.97      0.92     54551
         1.0       0.50      0.17      0.25      8869

    accuracy                           0.86     63420
   macro avg       0.69      0.57      0.59     63420
weighted avg       0.82      0.86      0.83     63420



In [None]:
# Redefine X variable with new columns
X = diabetes_df.drop(columns='Diabetes_binary')

In [None]:
# Run Train Test Split on data scaled only for continuous variables
X_1_train, X_1_test, y_1_train, y_1_test = train_test_split(X_1, y, random_state=1)

In [None]:
# Fit the model and use .ravel()on the "y_train" data.
rf_model_1 = rf_model.fit(X_train_scaled, y_train.ravel())

In [None]:
# ROC AUC of the first forest on the held-out test set.
# predict_proba returns one column per class; column 1 is taken as the
# probability of the positive class (Diabetes_binary == 1.0).
positive_class_proba = rf_model.predict_proba(X_test_scaled)[:, 1]
roc_auc_score(y_test, positive_class_proba)

0.8006719112996465