In [17]:
# Import dependencies
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Read in spreadsheet
from google.colab import drive
import os
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Read CSV file directly without changing directory
file_path = '/content/drive/My Drive/Colab Notebooks/Resources/diabetes_binary_health_indicators_BRFSS2015.csv'
diabetes_df = pd.read_csv(file_path)
diabetes_df.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [30]:
diabetes_df['Age'].value_counts()

9.0     33244
10.0    32194
8.0     30832
7.0     26314
11.0    23533
6.0     19819
13.0    17363
5.0     16157
12.0    15980
4.0     13823
3.0     11123
2.0      7598
1.0      5700
Name: Age, dtype: int64

# Logistic Regression Analysis

In [4]:
# Set our target and feature variables for the ML model
y = diabetes_df['Diabetes_binary']
X = diabetes_df.drop(columns='Diabetes_binary')

In [5]:
# Use SKlearn to train the data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

## Need to scale only numerical data here

In [6]:
# Create the StandardScaler instance
scaler = StandardScaler()

# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [7]:
# Create a logistic regression model
classifier = LogisticRegression(solver='lbfgs', random_state=1)

# Define the logistic regression model
log_classifier = LogisticRegression(solver="lbfgs",max_iter=500)

# Train the model
log_classifier.fit(X_train,y_train)

In [9]:
# Score the model
print(f"Training Data Score: {log_classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {log_classifier.score(X_test, y_test)}")

Training Data Score: 0.8639177966992536
Testing Data Score: 0.8626458530432041


In [11]:
# Predict outcomes for test data set
predictions = log_classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
235899,0.0,0.0
74852,0.0,1.0
8205,0.0,0.0
127632,0.0,1.0
32021,0.0,0.0
...,...,...
108360,0.0,0.0
8531,0.0,0.0
183429,0.0,0.0
52932,0.0,1.0


In [15]:
# Create and save the training classification report
training_report = classification_report(y_test, predictions)

# Print the training classification report
print(training_report)

              precision    recall  f1-score   support

         0.0       0.88      0.98      0.92     54551
         1.0       0.53      0.15      0.23      8869

    accuracy                           0.86     63420
   macro avg       0.70      0.56      0.58     63420
weighted avg       0.83      0.86      0.83     63420



In [12]:
# Accuracy score for logistic regression
y_pred = log_classifier.predict(X_test)
print(f" Logistic regression model accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Logistic regression model accuracy: 0.863


# Random Forest Classification

In [18]:
# Create the random forest classifier instance
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [19]:
# Fit the model and use .ravel()on the "y_train" data.
rf_model = rf_model.fit(X_train_scaled, y_train.ravel())

In [20]:
# Making predictions using the testing data
rf_predictions = rf_model.predict(X_test_scaled)

In [25]:
# Get the feature importance array
importances = rf_model.feature_importances_
# List the top 10 most important features
importances_sorted = sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)
importances_sorted

[(0.18501261769696284, 'BMI'),
 (0.121675163514546, 'Age'),
 (0.09787708871566989, 'Income'),
 (0.083877112217441, 'PhysHlth'),
 (0.07196515787559798, 'GenHlth'),
 (0.06975689776582429, 'Education'),
 (0.06355341407413395, 'MentHlth'),
 (0.04252510348320702, 'HighBP'),
 (0.03316838247755782, 'Fruits'),
 (0.033119970093333194, 'Smoker'),
 (0.02781096172193543, 'Sex'),
 (0.027279910221981032, 'HighChol'),
 (0.026352166939613002, 'Veggies'),
 (0.026115079699863524, 'PhysActivity'),
 (0.023530441905937987, 'DiffWalk'),
 (0.018835993353355243, 'HeartDiseaseorAttack'),
 (0.015016990414439319, 'NoDocbcCost'),
 (0.012328455397597242, 'Stroke'),
 (0.008699430051438379, 'AnyHealthcare'),
 (0.007834437027709302, 'HvyAlcoholConsump'),
 (0.0036652253518555684, 'CholCheck')]

In [23]:
# Create and save the training classification report
rf_training_report = classification_report(y_test, rf_predictions)

# Print the training classification report
print(rf_training_report)

              precision    recall  f1-score   support

         0.0       0.88      0.97      0.92     54551
         1.0       0.50      0.17      0.25      8869

    accuracy                           0.86     63420
   macro avg       0.69      0.57      0.59     63420
weighted avg       0.83      0.86      0.83     63420



In [24]:
print(training_report)

              precision    recall  f1-score   support

         0.0       0.88      0.98      0.92     54551
         1.0       0.53      0.15      0.23      8869

    accuracy                           0.86     63420
   macro avg       0.70      0.56      0.58     63420
weighted avg       0.83      0.86      0.83     63420



In [29]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, rf_model.predict_proba(X_test_scaled)[:, 1])

0.8006719112996465