In [1]:
# Import our dependencies
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as skl
import tensorflow as tf
from scipy.stats import linregress
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
pd.set_option('display.max_columns', None)

In [2]:
# Read in spreadsheet
from google.colab import drive
import os
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Read CSV file directly without changing directory
file_path = '/content/drive/My Drive/Colab Notebooks/Resources/diabetes_prediction_dataset.csv'
diabetes_df = pd.read_csv(file_path)
diabetes_df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [4]:
# Check the dtypes of the dataset. Looks like all values are set as float64
diabetes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [5]:
# Check gender column for value counts
diabetes_df['gender'].value_counts()

gender
Female    58552
Male      41430
Other        18
Name: count, dtype: int64

In [6]:
# Remove 'other' from gender column because it doesn't help us predict anything
diabetes_df = diabetes_df[diabetes_df['gender'] != 'Other']
diabetes_df['gender'].value_counts()

gender
Female    58552
Male      41430
Name: count, dtype: int64

In [7]:
# Set our target and feature variables for the ML model
y = diabetes_df['diabetes']
X = diabetes_df.drop(columns='diabetes')

In [8]:
# Encode the categorical data using pd.get_dummies
X = pd.get_dummies(X)

In [9]:
# Use SKlearn to train the data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [10]:
# Create the StandardScaler instance
scaler = StandardScaler()

# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [11]:
# Create the random forest classifier instance
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [12]:
# Fit the model and use .ravel()on the "y_train" data.
rf_model = rf_model.fit(X_train_scaled, y_train.ravel())

In [13]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

In [14]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [15]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,22757,94
Actual 1,654,1491


Accuracy Score : 0.9700752120339254
Classification Report
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     22851
           1       0.94      0.70      0.80      2145

    accuracy                           0.97     24996
   macro avg       0.96      0.85      0.89     24996
weighted avg       0.97      0.97      0.97     24996



In [16]:
# Get the feature importance array
importances = rf_model.feature_importances_
# List the top 10 most important features
importances_sorted = sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)
importances_sorted[:10]

[(0.39824483941083055, 'HbA1c_level'),
 (0.3235730667944942, 'blood_glucose_level'),
 (0.12423766891926505, 'bmi'),
 (0.1037387088280158, 'age'),
 (0.014464633536073171, 'hypertension'),
 (0.011046863224058867, 'heart_disease'),
 (0.004411657229451841, 'smoking_history_No Info'),
 (0.0038186431817052867, 'smoking_history_former'),
 (0.0036438388857344996, 'smoking_history_never'),
 (0.0026973276389077898, 'smoking_history_current')]

In [17]:
# Save the dataframe to another variable to use for feature engineering
diabetes2_df = diabetes_df
diabetes2_df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [18]:
# Remove a few features that weren't impactful
drop_columns = ['smoking_history']
diabetes2_df = diabetes2_df.drop(columns=drop_columns)
diabetes2_df

Unnamed: 0,gender,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,25.19,6.6,140,0
1,Female,54.0,0,0,27.32,6.6,80,0
2,Male,28.0,0,0,27.32,5.7,158,0
3,Female,36.0,0,0,23.45,5.0,155,0
4,Male,76.0,1,1,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...
99995,Female,80.0,0,0,27.32,6.2,90,0
99996,Female,2.0,0,0,17.37,6.5,100,0
99997,Male,66.0,0,0,27.83,5.7,155,0
99998,Female,24.0,0,0,35.42,4.0,100,0


In [19]:
# Want to determine the outliers of the BMI column to remove in order to optimize the model
#lower_quartile = diabetes2_df['bmi'].quantile(.25)
#upper_quartile = diabetes2_df['bmi'].quantile(.75)

#iqr = upper_quartile - lower_quartile
#upper_outlier = upper_quartile + 1.5 * iqr
#lower_outlier = lower_quartile -1.5 * iqr

#print(f'The lower boundary is {lower_outlier}')
#print(f'The upper boundary is {upper_outlier}')

In [20]:
# Filter out the BMI outliers in the data frame above and below the boundaries
#diabetes2_df = diabetes2_df[(diabetes2_df['bmi'] > lower_outlier) & (diabetes2_df['bmi'] < upper_outlier)]
#diabetes2_df['bmi'].describe()

In [21]:
# Set our target and feature variables for the ML model
y2 = diabetes2_df['diabetes']
X2 = diabetes2_df.drop(columns='diabetes')

In [22]:
# Encode the categorical data using pd.get_dummies
X2 = pd.get_dummies(X2)

In [23]:
# Use SKlearn to train the data
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, random_state=1)

In [24]:
# Copy the original df for scaling
scaled2_df = diabetes2_df.copy()

# Define the continuous columns to be scaled
scaled2_cols = ['bmi', 'age', 'HbA1c_level', 'blood_glucose_level']

# Scale the columns in the copied dataframe
diabetes2_scaled = StandardScaler().fit_transform(scaled2_df[scaled2_cols])

# Make these new scaled columns into a dataframe and display
diabetes2_scaled = pd.DataFrame(diabetes2_scaled, columns = ['bmi_scaled', 'age_scaled', 'HbA1c_level_scaled', 'blood_glucose_level_scaled'])


In [25]:
#Concatinate the original dataframe and the new scaled columns
diabetes2_scaled = pd.concat([scaled2_df, diabetes2_scaled], axis=1)


In [26]:
# Drop columns that are going to be scaled
scaled2_df = diabetes2_scaled.drop(columns = scaled2_cols)

In [27]:
# Create the random forest classifier instance
rf_model2 = RandomForestClassifier(n_estimators=500, random_state=78)

In [28]:
# Fit the model and use .ravel()on the "y_train" data.
rf_model2 = rf_model2.fit(X_train2, y_train2.ravel())

In [29]:
# Making predictions using the testing data
predictions2 = rf_model2.predict(X_test2)

In [30]:
# Calculating the confusion matrix
cm2 = confusion_matrix(y_test2, predictions2)
cm2_df = pd.DataFrame(
    cm2, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score2 = accuracy_score(y_test2, predictions2)

In [31]:
# Get the feature importance array
importances = rf_model2.feature_importances_
# List the top 10 most important features
importances_sorted2 = sorted(zip(rf_model2.feature_importances_, X.columns), reverse=True)
importances_sorted2[:10]

[(0.4023400495008674, 'HbA1c_level'),
 (0.3250510490144988, 'blood_glucose_level'),
 (0.14074578465215176, 'bmi'),
 (0.10514928235187565, 'age'),
 (0.013510869382652646, 'hypertension'),
 (0.010002005518222761, 'heart_disease'),
 (0.0016188184475364534, 'gender_Male'),
 (0.0015821411321944455, 'gender_Female')]

In [32]:
# Displaying results
print("Confusion Matrix")
display(cm2_df)
print(f"Accuracy Score : {acc_score2}")
print("Classification Report")
print(classification_report(y_test2, predictions2))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,22716,135
Actual 1,650,1495


Accuracy Score : 0.9685949751960313
Classification Report
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     22851
           1       0.92      0.70      0.79      2145

    accuracy                           0.97     24996
   macro avg       0.94      0.85      0.89     24996
weighted avg       0.97      0.97      0.97     24996



In [33]:
# Displaying results for both models side by side to see differences
print("Confusion Matrix Alpha")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report Alpha")
print(classification_report(y_test, predictions))

print("Confusion Matrix Beta")
display(cm_df)
print(f"Accuracy Score : {acc_score2}")
print("Classification Report Beta")
print(classification_report(y_test2, predictions2))

Confusion Matrix Alpha


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,22757,94
Actual 1,654,1491


Accuracy Score : 0.9700752120339254
Classification Report Alpha
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     22851
           1       0.94      0.70      0.80      2145

    accuracy                           0.97     24996
   macro avg       0.96      0.85      0.89     24996
weighted avg       0.97      0.97      0.97     24996

Confusion Matrix Beta


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,22757,94
Actual 1,654,1491


Accuracy Score : 0.9685949751960313
Classification Report Beta
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     22851
           1       0.92      0.70      0.79      2145

    accuracy                           0.97     24996
   macro avg       0.94      0.85      0.89     24996
weighted avg       0.97      0.97      0.97     24996

