<a href="https://colab.research.google.com/github/alexandriaorvis/predicting_diabetes/blob/main/Scaled_Continuous_Logistic_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import dependencies
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
pd.set_option('display.max_columns', None)

In [2]:
# Read in spreadsheet
from google.colab import drive
import os
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Read CSV file directly without changing directory
file_path = '/content/drive/My Drive/Colab Notebooks/Resources/diabetes_2_classes.csv'
diabetes_df = pd.read_csv(file_path)
diabetes_df.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


##Preparing data for Logistic Regression

In [4]:
# Set our target and feature variables for the ML model
y = diabetes_df['Diabetes_binary']
X = diabetes_df.drop(columns='Diabetes_binary')

In [5]:
# Use SKlearn to train the data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [6]:
# Create the StandardScaler instance
scaler = StandardScaler()

# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

##Logistic Regression Model

In [7]:
# Define the logistic regression model
log_classifier = LogisticRegression(solver="lbfgs", random_state = 1, max_iter=450)

# Train the model
log_classifier.fit(X_train,y_train)

We tried multiple max_iter values and found that 450 was the lowest value at which the model converged. We didn't see increased accuracy with higher values.

In [8]:
# Score the model
print(f"Training Data Score: {log_classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {log_classifier.score(X_test, y_test)}")

Training Data Score: 0.8639177966992536
Testing Data Score: 0.8626458530432041


In [9]:
# Predict outcomes for test data set
predictions = log_classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
235899,0.0,0.0
74852,0.0,1.0
8205,0.0,0.0
127632,0.0,1.0
32021,0.0,0.0
...,...,...
108360,0.0,0.0
8531,0.0,0.0
183429,0.0,0.0
52932,0.0,1.0


Note that the model fails at predicting actual diabetes cases for the majority of the time.

In [10]:
# Create and save the training classification report
training_report = classification_report(y_test, predictions)

# Print the training classification report
print(training_report)

              precision    recall  f1-score   support

         0.0       0.88      0.98      0.92     54551
         1.0       0.53      0.15      0.23      8869

    accuracy                           0.86     63420
   macro avg       0.70      0.56      0.58     63420
weighted avg       0.83      0.86      0.83     63420



#Model Optimization


## Scaling only continuous variables

In [11]:
# Copy the original df for scaling
scaled_df = diabetes_df.copy()

# Define the continuous columns to be scaled
scaled_cols = ['BMI', 'MentHlth', 'PhysHlth']

# Scale the columns in the copied dataframe
diabetes_scaled = StandardScaler().fit_transform(scaled_df[scaled_cols])

# Make these new scaled columns into a dataframe and display
diabetes_scaled = pd.DataFrame(diabetes_scaled, columns = ['BMI_scaled', 'MentHlth_scaled', 'PhysHlth_scaled'])



In [12]:
#Concatinate the original dataframe and the new scaled columns
diabetes_scaled = pd.concat([scaled_df, diabetes_scaled], axis=1)
diabetes_scaled.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,BMI_scaled,MentHlth_scaled,PhysHlth_scaled
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0,1.757936,1.998592,1.233999
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0,-0.511806,-0.42963,-0.486592
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0,-0.057858,3.617407,2.95459
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0,-0.209174,-0.42963,-0.486592
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0,-0.663122,-0.024926,-0.486592


In [13]:
# Drop columns that are going to be scaled
scaled_df = diabetes_scaled.drop(columns = scaled_cols)
scaled_df.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,DiffWalk,Sex,Age,Education,Income,BMI_scaled,MentHlth_scaled,PhysHlth_scaled
0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,5.0,1.0,0.0,9.0,4.0,3.0,1.757936,1.998592,1.233999
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,7.0,6.0,1.0,-0.511806,-0.42963,-0.486592
2,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,5.0,1.0,0.0,9.0,4.0,8.0,-0.057858,3.617407,2.95459
3,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,11.0,3.0,6.0,-0.209174,-0.42963,-0.486592
4,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,11.0,5.0,4.0,-0.663122,-0.024926,-0.486592


# Scaled Variables Logistic Regression

In [14]:
# Set our target and feature variables for the ML model
y = scaled_df['Diabetes_binary']
X = scaled_df.drop(columns='Diabetes_binary')

In [15]:
# Use SKlearn to train the data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

##Scaled Variable Split Data

In [16]:
# Define the logistic regression model
log_classifier2 = LogisticRegression(solver="lbfgs", random_state = 1, max_iter=450)

# Train the model
log_classifier2.fit(X_train,y_train)

In [17]:
# Score the model
print(f"Training Data Score: {log_classifier2.score(X_train, y_train)}")
print(f"Testing Data Score: {log_classifier2.score(X_test, y_test)}")

Training Data Score: 0.8639493324923788
Testing Data Score: 0.8625985493535162


In [18]:
# Predict outcomes for test data set
predictions2 = log_classifier2.predict(X_test)
preds_v_actual = pd.DataFrame({"Prediction": predictions2, "Actual": y_test})
preds_v_actual.head()

Unnamed: 0,Prediction,Actual
235899,0.0,0.0
74852,0.0,1.0
8205,0.0,0.0
127632,0.0,1.0
32021,0.0,0.0


In [19]:
# Create and save the training classification report
training_report2 = classification_report(y_test, predictions2)

# Print the training classification report
print(training_report2)

              precision    recall  f1-score   support

         0.0       0.88      0.98      0.92     54551
         1.0       0.53      0.15      0.23      8869

    accuracy                           0.86     63420
   macro avg       0.70      0.56      0.58     63420
weighted avg       0.83      0.86      0.83     63420



In [20]:
# Accuracy score for logistic regression
y_pred = log_classifier2.predict(X_test)
print(f" Logistic regression model accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Logistic regression model accuracy: 0.863


Scaling the continuous variables compared to scaling all of the variables does not make a difference in the accuracy of the model


#Feature Engineering

In [21]:
# FEATURE ENGINEERING
scaled_df["Age_/_BMI"] = scaled_df["BMI_scaled"] / scaled_df["Age"]
scaled_df["MentHlth_*_BMI"] = scaled_df["MentHlth_scaled"] * scaled_df["BMI_scaled"]
scaled_df["Age_*_BMI"] = scaled_df["BMI_scaled"] * scaled_df["Age"]
scaled_df["Income_*_BMI"] = scaled_df["Income"] * scaled_df["BMI_scaled"]
scaled_df["PhysHlth_*_MentHlth"] = scaled_df["PhysHlth_scaled"] * scaled_df["MentHlth_scaled"]
scaled_df.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,DiffWalk,Sex,Age,Education,Income,BMI_scaled,MentHlth_scaled,PhysHlth_scaled,Age_/_BMI,MentHlth_*_BMI,Age_*_BMI,Income_*_BMI,PhysHlth_*_MentHlth
0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,5.0,1.0,0.0,9.0,4.0,3.0,1.757936,1.998592,1.233999,0.195326,3.513396,15.821421,5.273807,2.46626
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,7.0,6.0,1.0,-0.511806,-0.42963,-0.486592,-0.073115,0.219887,-3.582643,-0.511806,0.209055
2,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,5.0,1.0,0.0,9.0,4.0,8.0,-0.057858,3.617407,2.95459,-0.006429,-0.209295,-0.52072,-0.462862,10.687953
3,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,11.0,3.0,6.0,-0.209174,-0.42963,-0.486592,-0.019016,0.089867,-2.300913,-1.255043,0.209055
4,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,11.0,5.0,4.0,-0.663122,-0.024926,-0.486592,-0.060284,0.016529,-7.294345,-2.652489,0.012129


In [22]:
#Dropping least important columns
least_columns = ["CholCheck", "HvyAlcoholConsump", "AnyHealthcare", "Stroke", "NoDocbcCost"]
scaled_df.drop(columns = least_columns)

Unnamed: 0,Diabetes_binary,HighBP,HighChol,Smoker,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,GenHlth,DiffWalk,Sex,Age,Education,Income,BMI_scaled,MentHlth_scaled,PhysHlth_scaled,Age_/_BMI,MentHlth_*_BMI,Age_*_BMI,Income_*_BMI,PhysHlth_*_MentHlth
0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,5.0,1.0,0.0,9.0,4.0,3.0,1.757936,1.998592,1.233999,0.195326,3.513396,15.821421,5.273807,2.466260
1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0,7.0,6.0,1.0,-0.511806,-0.429630,-0.486592,-0.073115,0.219887,-3.582643,-0.511806,0.209055
2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,5.0,1.0,0.0,9.0,4.0,8.0,-0.057858,3.617407,2.954590,-0.006429,-0.209295,-0.520720,-0.462862,10.687953
3,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0,0.0,0.0,11.0,3.0,6.0,-0.209174,-0.429630,-0.486592,-0.019016,0.089867,-2.300913,-1.255043,0.209055
4,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,2.0,0.0,0.0,11.0,5.0,4.0,-0.663122,-0.024926,-0.486592,-0.060284,0.016529,-7.294345,-2.652489,0.012129
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253675,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,3.0,0.0,1.0,5.0,6.0,7.0,2.514516,-0.429630,0.086938,0.502903,-1.080311,12.572581,17.601614,-0.037351
253676,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0,11.0,2.0,4.0,-1.571019,-0.429630,-0.486592,-0.142820,0.674956,-17.281209,-6.284076,0.209055
253677,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,2.0,5.0,2.0,-0.057858,-0.429630,-0.486592,-0.028929,0.024857,-0.115716,-0.115716,0.209055
253678,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,3.0,0.0,1.0,7.0,5.0,1.0,-0.814438,-0.429630,-0.486592,-0.116348,0.349907,-5.701069,-0.814438,0.209055


In [23]:
get_dummies_columns = ["GenHlth", "Age", "Income", "Education"]
scaled_df[get_dummies_columns] = scaled_df[get_dummies_columns].astype(object)
scaled_df.head()


Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,DiffWalk,Sex,Age,Education,Income,BMI_scaled,MentHlth_scaled,PhysHlth_scaled,Age_/_BMI,MentHlth_*_BMI,Age_*_BMI,Income_*_BMI,PhysHlth_*_MentHlth
0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,5.0,1.0,0.0,9.0,4.0,3.0,1.757936,1.998592,1.233999,0.195326,3.513396,15.821421,5.273807,2.46626
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,7.0,6.0,1.0,-0.511806,-0.42963,-0.486592,-0.073115,0.219887,-3.582643,-0.511806,0.209055
2,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,5.0,1.0,0.0,9.0,4.0,8.0,-0.057858,3.617407,2.95459,-0.006429,-0.209295,-0.52072,-0.462862,10.687953
3,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,11.0,3.0,6.0,-0.209174,-0.42963,-0.486592,-0.019016,0.089867,-2.300913,-1.255043,0.209055
4,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,11.0,5.0,4.0,-0.663122,-0.024926,-0.486592,-0.060284,0.016529,-7.294345,-2.652489,0.012129


In [24]:
# Set our target and feature variables for the ML model
y = scaled_df['Diabetes_binary']
X = scaled_df.drop(columns='Diabetes_binary')

###Converting Categorical Data to Dummie Variables

In [25]:
# Generate dummy variables
X = pd.get_dummies(X)
X.head()

Unnamed: 0,HighBP,HighChol,CholCheck,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,DiffWalk,Sex,BMI_scaled,MentHlth_scaled,PhysHlth_scaled,Age_/_BMI,MentHlth_*_BMI,Age_*_BMI,Income_*_BMI,PhysHlth_*_MentHlth,GenHlth_1.0,GenHlth_2.0,GenHlth_3.0,GenHlth_4.0,GenHlth_5.0,Age_1.0,Age_2.0,Age_3.0,Age_4.0,Age_5.0,Age_6.0,Age_7.0,Age_8.0,Age_9.0,Age_10.0,Age_11.0,Age_12.0,Age_13.0,Education_1.0,Education_2.0,Education_3.0,Education_4.0,Education_5.0,Education_6.0,Income_1.0,Income_2.0,Income_3.0,Income_4.0,Income_5.0,Income_6.0,Income_7.0,Income_8.0
0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.757936,1.998592,1.233999,0.195326,3.513396,15.821421,5.273807,2.46626,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False
1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.511806,-0.42963,-0.486592,-0.073115,0.219887,-3.582643,-0.511806,0.209055,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False
2,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,-0.057858,3.617407,2.95459,-0.006429,-0.209295,-0.52072,-0.462862,10.687953,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True
3,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,-0.209174,-0.42963,-0.486592,-0.019016,0.089867,-2.300913,-1.255043,0.209055,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False
4,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,-0.663122,-0.024926,-0.486592,-0.060284,0.016529,-7.294345,-2.652489,0.012129,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False


In [26]:
# Use SKlearn to train the data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [27]:
# Define the logistic regression model
log_classifier3 = LogisticRegression(solver="lbfgs", random_state = 1, max_iter=1000)

# Train the model
log_classifier3.fit(X_train,y_train)

In [28]:
# Score the model
print(f"Training Data Score: {log_classifier3.score(X_train, y_train)}")
print(f"Testing Data Score: {log_classifier3.score(X_test, y_test)}")

Training Data Score: 0.865279091769158
Testing Data Score: 0.86480605487228


In [29]:
# Predict outcomes for test data set
predictions3 = log_classifier3.predict(X_test)
pd.DataFrame({"Prediction": predictions3, "Actual": y_test})

Unnamed: 0,Prediction,Actual
235899,0.0,0.0
74852,0.0,1.0
8205,0.0,0.0
127632,0.0,1.0
32021,0.0,0.0
...,...,...
108360,0.0,0.0
8531,0.0,0.0
183429,0.0,0.0
52932,0.0,1.0


In [30]:
# Create and save the training classification report
training_report3 = classification_report(y_test, predictions3)

# Print the training classification report
print(training_report3)



              precision    recall  f1-score   support

         0.0       0.88      0.98      0.93     54551
         1.0       0.56      0.15      0.24      8869

    accuracy                           0.86     63420
   macro avg       0.72      0.57      0.58     63420
weighted avg       0.83      0.86      0.83     63420



#ALL CLASSIFICATION REPORTS

In [32]:
print(training_report)
print(training_report2)
print(training_report3)

              precision    recall  f1-score   support

         0.0       0.88      0.98      0.92     54551
         1.0       0.53      0.15      0.23      8869

    accuracy                           0.86     63420
   macro avg       0.70      0.56      0.58     63420
weighted avg       0.83      0.86      0.83     63420

              precision    recall  f1-score   support

         0.0       0.88      0.98      0.92     54551
         1.0       0.53      0.15      0.23      8869

    accuracy                           0.86     63420
   macro avg       0.70      0.56      0.58     63420
weighted avg       0.83      0.86      0.83     63420

              precision    recall  f1-score   support

         0.0       0.88      0.98      0.93     54551
         1.0       0.56      0.15      0.24      8869

    accuracy                           0.86     63420
   macro avg       0.72      0.57      0.58     63420
weighted avg       0.83      0.86      0.83     63420

