# Diabetes Prediction Analysis, Results

"Patients Data ( Used for Heart Disease Prediction ).xlsx"

## Data loading


In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Read the patient workbook into a DataFrame and preview its first rows.
patients_path = 'Patients_Data.xlsx'  # Adjust path if needed
df = pd.read_excel(patients_path)
display(df.head())



Mounted at /content/drive


Unnamed: 0,PatientID,State,Sex,GeneralHealth,AgeCategory,HeightInMeters,WeightInKilograms,BMI,HadHeartAttack,HadAngina,...,ECigaretteUsage,ChestScan,RaceEthnicityCategory,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,1,Alabama,Female,Fair,Age 75 to 79,1.63,84.82,32.099998,0,1,...,Never used e-cigarettes in my entire life,1,"White only, Non-Hispanic",0,0,0,1,"No, did not receive any tetanus shot in the pa...",0,1
1,2,Alabama,Female,Very good,Age 65 to 69,1.6,71.669998,27.99,0,0,...,Never used e-cigarettes in my entire life,0,"White only, Non-Hispanic",0,0,1,1,"Yes, received Tdap",0,0
2,3,Alabama,Male,Excellent,Age 60 to 64,1.78,71.209999,22.530001,0,0,...,Never used e-cigarettes in my entire life,0,"White only, Non-Hispanic",1,0,0,0,"Yes, received tetanus shot but not sure what type",0,0
3,4,Alabama,Male,Very good,Age 70 to 74,1.78,95.25,30.129999,0,0,...,Never used e-cigarettes in my entire life,0,"White only, Non-Hispanic",0,0,1,1,"Yes, received tetanus shot but not sure what type",0,0
4,5,Alabama,Female,Good,Age 50 to 54,1.68,78.019997,27.76,0,0,...,Never used e-cigarettes in my entire life,1,"Black only, Non-Hispanic",0,0,1,0,"No, did not receive any tetanus shot in the pa...",0,0


## Identify key variables and data structure.

In [None]:
# Column names first, then dtypes and non-null counts.
print(df.columns)
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()

# View the unique categories for all categorical columns
categorical_columns = df.select_dtypes(include=['object', 'category']).columns
for col in categorical_columns:
    unique_values = df[col].unique()
    print(f"Column: {col}")
    print("Unique Categories:", unique_values)
    print("-" * 40)

Index(['PatientID', 'State', 'Sex', 'GeneralHealth', 'AgeCategory',
       'HeightInMeters', 'WeightInKilograms', 'BMI', 'HadHeartAttack',
       'HadAngina', 'HadStroke', 'HadAsthma', 'HadSkinCancer', 'HadCOPD',
       'HadDepressiveDisorder', 'HadKidneyDisease', 'HadArthritis',
       'HadDiabetes', 'DeafOrHardOfHearing', 'BlindOrVisionDifficulty',
       'DifficultyConcentrating', 'DifficultyWalking',
       'DifficultyDressingBathing', 'DifficultyErrands', 'SmokerStatus',
       'ECigaretteUsage', 'ChestScan', 'RaceEthnicityCategory',
       'AlcoholDrinkers', 'HIVTesting', 'FluVaxLast12', 'PneumoVaxEver',
       'TetanusLast10Tdap', 'HighRiskLastYear', 'CovidPos'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 237630 entries, 0 to 237629
Data columns (total 35 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   PatientID                  237630 non-null  int64  
 1   State       

## Performing Data Mutations
### Recode "HadDiabetes":

In [None]:
import numpy as np
def recode_diabetes(status):
    """Convert a raw ``HadDiabetes`` answer into a binary flag.

    Binary conversion: "Yes" becomes 1 and "No" becomes 0.
    Ambiguous cases: every other answer (for example "No, pre-diabetes or
    borderline diabetes" and "Yes, but only during pregnancy (female)")
    becomes NaN, so those entries can be removed and diabetes status keeps a
    clear definition.
    """
    if status == 'Yes':
        return 1
    return 0 if status == 'No' else np.nan

# Add the binary target column, then keep only the rows whose answer was an
# unambiguous "Yes" or "No" (everything else was recoded to NaN).
diabetes_flag = df['HadDiabetes'].map(recode_diabetes)
df['Diabetes'] = diabetes_flag
df = df.dropna(subset=['Diabetes'])

print(df.head())

   PatientID    State     Sex GeneralHealth   AgeCategory  HeightInMeters  \
0          1  Alabama  Female          Fair  Age 75 to 79            1.63   
1          2  Alabama  Female     Very good  Age 65 to 69            1.60   
2          3  Alabama    Male     Excellent  Age 60 to 64            1.78   
3          4  Alabama    Male     Very good  Age 70 to 74            1.78   
4          5  Alabama  Female          Good  Age 50 to 54            1.68   

   WeightInKilograms        BMI  HadHeartAttack  HadAngina  ...  ChestScan  \
0          84.820000  32.099998               0          1  ...          1   
1          71.669998  27.990000               0          0  ...          0   
2          71.209999  22.530001               0          0  ...          0   
3          95.250000  30.129999               0          0  ...          0   
4          78.019997  27.760000               0          0  ...          1   

      RaceEthnicityCategory  AlcoholDrinkers  HIVTesting  FluVaxLast

## Check the number of individuals with diabetes in comparison to those without.

In [None]:
# Class balance of the binary target (label 1 = diabetes, label 0 = no diabetes).
diabetes_counts = df['Diabetes'].value_counts()
with_diabetes, without_diabetes = diabetes_counts[1], diabetes_counts[0]
print('Number of individuals with diabetes: ', with_diabetes)
print('Number of individuals without diabetes:', without_diabetes)
print('Diabetic to non diabetic ratio:', with_diabetes / without_diabetes)

Number of individuals with diabetes:  33055
Number of individuals without diabetes: 197463
Diabetic to non diabetic ratio: 0.16739844932974784


### Mapping the age ranges to ordinal numbers, making it easier for models to understand the relationship

In [None]:
# Age brackets listed from youngest to oldest; a label's position in the list
# is the ordinal code it is mapped to (0 .. 12).
age_labels = [
    'Age 18 to 24',
    'Age 25 to 29',
    'Age 30 to 34',
    'Age 35 to 39',
    'Age 40 to 44',
    'Age 45 to 49',
    'Age 50 to 54',
    'Age 55 to 59',
    'Age 60 to 64',
    'Age 65 to 69',
    'Age 70 to 74',
    'Age 75 to 79',
    'Age 80 or older',
]
age_mapping = {label: code for code, label in enumerate(age_labels)}

# Keep the cell safe to re-run: once the labels have been replaced by their
# codes, casting to str and mapping again would turn every value into NaN, so
# the mapping is applied only while the column still holds the text labels.
if not pd.api.types.is_numeric_dtype(df['AgeCategory']):
    df['AgeCategory'] = df['AgeCategory'].astype(str).map(age_mapping)
print(df[['AgeCategory']].head())


   AgeCategory
0           11
1            9
2            8
3           10
4            6


## Data Selection and Wrangling
Lifestyle dataset:

In [None]:
# Lifestyle features plus the target, copied so later edits leave df untouched.
lifestyle_columns = ['BMI', 'SmokerStatus', 'ECigaretteUsage', 'AlcoholDrinkers', 'Diabetes']
lifestyle_df = df[lifestyle_columns].copy()
print("lifestyle_df:")
print(lifestyle_df.head())

# Ordinal codes: a label's position in each list is the code it is mapped to.
smoker_mapping = {
    label: code
    for code, label in enumerate([
        'Never smoked',
        'Former smoker',
        'Current smoker - now smokes some days',
        'Current smoker - now smokes every day',
    ])
}
ecig_mapping = {
    label: code
    for code, label in enumerate([
        'Never used e-cigarettes in my entire life',
        'Not at all (right now)',
        'Use them some days',
        'Use them every day',
    ])
}

# Replace the text answers with their codes, one column at a time.
for column, mapping in (('SmokerStatus', smoker_mapping), ('ECigaretteUsage', ecig_mapping)):
    lifestyle_df[column] = lifestyle_df[column].map(mapping)

print("\nTransformed lifestyle_df:")
print(lifestyle_df.head())

lifestyle_df:
         BMI   SmokerStatus                            ECigaretteUsage  \
0  32.099998  Former smoker  Never used e-cigarettes in my entire life   
1  27.990000  Former smoker  Never used e-cigarettes in my entire life   
2  22.530001   Never smoked  Never used e-cigarettes in my entire life   
3  30.129999  Former smoker  Never used e-cigarettes in my entire life   
4  27.760000   Never smoked  Never used e-cigarettes in my entire life   

   AlcoholDrinkers  Diabetes  
0                0       1.0  
1                0       0.0  
2                1       0.0  
3                0       1.0  
4                0       0.0  

Transformed lifestyle_df:
         BMI  SmokerStatus  ECigaretteUsage  AlcoholDrinkers  Diabetes
0  32.099998             1                0                0       1.0
1  27.990000             1                0                0       0.0
2  22.530001             0                0                1       0.0
3  30.129999             1                0 

## Data Selection & Permutations
### Demographic dataset:
States are grouped into regions to increase significance and reduce the complexity of the dataset. Regions are picked based on this map:
https://www.cpc.ncep.noaa.gov/products/analysis_monitoring/regional_monitoring/regions.shtml

Alaska was then included as part of the northwest region due to location. A new region was made for Hawaii, Guam, Puerto Rico, and the Virgin Islands due to their similar tropical climates.

In [None]:
# Lookup from state / territory name to analysis region.
# Every name appears exactly once: a repeated key in a dict literal is not an
# error, the last occurrence silently wins, which hides the real assignment.
state_to_region = {
    # NORTHEAST
    'Maine': 'Northeast', 'New Hampshire': 'Northeast', 'Vermont': 'Northeast',
    'Massachusetts': 'Northeast', 'Rhode Island': 'Northeast', 'Connecticut': 'Northeast',
    'New York': 'Northeast', 'New Jersey': 'Northeast', 'Pennsylvania': 'Northeast',

    # EAST NORTH CENTRAL
    'Ohio': 'East North Central', 'Indiana': 'East North Central',
    'Illinois': 'East North Central', 'Michigan': 'East North Central', 'Wisconsin': 'East North Central',

    # WEST NORTH CENTRAL
    'Minnesota': 'West North Central', 'Iowa': 'West North Central',
    'North Dakota': 'West North Central', 'South Dakota': 'West North Central',
    'Nebraska': 'West North Central', 'Kansas': 'West North Central',

    # CENTRAL
    # Missouri was previously also listed under West North Central; the Central
    # entry came last and therefore was the one in effect, so it is kept here.
    'Kentucky': 'Central', 'Tennessee': 'Central',
    'Arkansas': 'Central', 'Missouri': 'Central',

    # SOUTHEAST
    'West Virginia': 'Southeast', 'Virginia': 'Southeast', 'North Carolina': 'Southeast',
    'South Carolina': 'Southeast', 'Georgia': 'Southeast', 'Florida': 'Southeast',
    'Alabama': 'Southeast', 'Mississippi': 'Southeast', 'District of Columbia': 'Southeast',
    'Delaware': 'Southeast', 'Maryland': 'Southeast',

    # SOUTH
    'Texas': 'South', 'Oklahoma': 'South', 'Louisiana': 'South',

    # SOUTHWEST
    'New Mexico': 'Southwest', 'Arizona': 'Southwest',
    'Utah': 'Southwest', 'Colorado': 'Southwest',

    # WEST
    'California': 'West', 'Nevada': 'West',

    # NORTHWEST
    'Washington': 'Northwest', 'Oregon': 'Northwest', 'Idaho': 'Northwest',
    'Montana': 'Northwest', 'Wyoming': 'Northwest', 'Alaska': 'Northwest',

    # TROPICAL (custom region)
    'Hawaii': 'Tropical',
    'Guam': 'Tropical',
    'Puerto Rico': 'Tropical',
    'Virgin Islands': 'Tropical',
}

# Attach each row's region; a state missing from the lookup maps to NaN.
df['Region'] = df['State'].map(state_to_region)

# Demographic features plus the target, copied so edits do not touch df.
demographic_columns = [
    'Sex', 'Region', 'AgeCategory', 'HeightInMeters', 'RaceEthnicityCategory', 'Diabetes',
]
demographic_df = df[demographic_columns].copy()
sex_codes = {'Female': 0, 'Male': 1}
demographic_df['Sex'] = demographic_df['Sex'].map(sex_codes)

# One hot encoding (the first level of each column is dropped as the baseline)
demographic_df = pd.get_dummies(
    demographic_df,
    columns=['Region', 'RaceEthnicityCategory'],
    drop_first=True,
)
print(demographic_df.head())

   Sex  AgeCategory  HeightInMeters  Diabetes  Region_East North Central  \
0    0           11            1.63       1.0                      False   
1    0            9            1.60       0.0                      False   
2    1            8            1.78       0.0                      False   
3    1           10            1.78       1.0                      False   
4    0            6            1.68       0.0                      False   

   Region_Northeast  Region_Northwest  Region_South  Region_Southeast  \
0             False             False         False              True   
1             False             False         False              True   
2             False             False         False              True   
3             False             False         False              True   
4             False             False         False              True   

   Region_Southwest  Region_Tropical  Region_West  Region_West North Central  \
0             False     

## Model testing: Random Forest, Logistic Regression, K-Nearest Neighbors
### Utilizing a 5-fold cross validation and then testing each model again without the cross validation for an unbiased evaluation

In [None]:
# Train and evaluate models using stratified k-fold cross-validation without SMOTE
def train_evaluate_models_no_smote(X, y, dataset_name="Dataset", n_splits=5):
    """Cross-validate and test three classifiers without any resampling.

    The data is split 80/20, stratified on ``y``. Each model is scored with
    stratified ``n_splits``-fold cross-validation on the training part, then
    fitted on the whole training part and evaluated on the held-out test part.
    Everything is printed; nothing is returned.

    Parameters
    ----------
    X : features (the notebook passes a pandas DataFrame).
    y : binary target aligned with ``X``.
    dataset_name : label used in the printed section headers.
    n_splits : number of cross-validation folds.
    """
    # Split data into training and test sets (80-20) with stratification
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # The three models to evaluate. Class imbalance is handled through
    # class_weight='balanced' where the estimator supports it.
    models = {
        'Random Forest': RandomForestClassifier(random_state=42, class_weight='balanced'),
        'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced'),
        #'Support Vector Machine': SVC(random_state=42, class_weight='balanced'),
        'K-Nearest Neighbors': KNeighborsClassifier()
    }

    # Stratified folds keep the class ratio of y_train in every fold
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    scoring = ['accuracy', 'precision', 'recall', 'f1']

    print(f"---- {dataset_name} Cross-Validation Results ({n_splits} folds) ----")
    for name, model in models.items():
        cv_results = cross_validate(model, X_train, y_train, cv=cv, scoring=scoring)
        print(f"\nModel: {name}")
        print("  Accuracy:  {:.4f}".format(np.mean(cv_results['test_accuracy'])))
        print("  Precision: {:.4f}".format(np.mean(cv_results['test_precision'])))
        print("  Recall:    {:.4f}".format(np.mean(cv_results['test_recall'])))
        print("  F1 Score:  {:.4f}".format(np.mean(cv_results['test_f1'])))

    # Train each model on the entire training set and evaluate on the test set.
    # No cross-validation here: a single fit followed by one held-out evaluation.
    print(f"\n---- {dataset_name} Test Set Evaluation  ----")
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(f"\nModel: {name}")
        print(classification_report(y_test, y_pred))
        print("Confusion Matrix:")
        print(confusion_matrix(y_test, y_pred))
        print("-" * 50)

# Testing the demographic dataset: target column apart, everything else as features
y_demo = demographic_df['Diabetes']
X_demo = demographic_df.drop(columns='Diabetes')
train_evaluate_models_no_smote(X_demo, y_demo, dataset_name="Demographic Model")

# Testing the lifestyle dataset: same split of target and features
y_lifestyle = lifestyle_df['Diabetes']
X_lifestyle = lifestyle_df.drop(columns='Diabetes')
train_evaluate_models_no_smote(X_lifestyle, y_lifestyle, dataset_name="Lifestyle Model")


### Retesting the models again with inclusion of SMOTE (Synthetic Minority Over-sampling Technique) to address class imbalances by creating synthetic samples of the minority class, improving model performance and generalization

In [None]:
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
#from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE



def train_evaluate_models(X, y, dataset_name="Dataset", n_splits=5):
    """Cross-validate and test three classifiers with SMOTE oversampling.

    The data is split 80/20, stratified on ``y``. SMOTE is wrapped together
    with each classifier in an imbalanced-learn pipeline, so synthetic samples
    are generated from the training rows of each fit only; validation folds
    and the test set always contain real, untouched rows. Everything is
    printed; nothing is returned.

    Parameters
    ----------
    X : features (the notebook passes a pandas DataFrame).
    y : binary target aligned with ``X``.
    dataset_name : label used in the printed section headers.
    n_splits : number of cross-validation folds.
    """
    # Imported here so the function does not depend on another cell for it;
    # this pipeline applies samplers during fit only, never during predict.
    from imblearn.pipeline import Pipeline as ImbPipeline

    # Split data (80-20)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # The three models to evaluate
    models = {
        'Random Forest': RandomForestClassifier(random_state=22),
        'Logistic Regression': LogisticRegression(max_iter=1000, random_state=22),
        #'Support Vector Machine': SVC(random_state=42),
        'K-Nearest Neighbors': KNeighborsClassifier()
    }

    def with_smote(model):
        # Oversample first, then fit the classifier on the balanced data.
        return ImbPipeline([
            ('smote', SMOTE(random_state=42)),
            ('model', model),
        ])

    # Stratified folds over the ORIGINAL training data. Resampling before
    # splitting would let synthetic points built from validation rows leak
    # into the training folds and inflate every cross-validation score.
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=22)
    scoring = ['accuracy', 'precision', 'recall', 'f1']

    print(f"---- {dataset_name} Cross-Validation Results ({n_splits} folds) ----")
    for name, model in models.items():
        cv_results = cross_validate(with_smote(model), X_train, y_train,
                                    cv=cv, scoring=scoring)
        print(f"\nModel: {name}")
        print("  Accuracy:  {:.4f}".format(np.mean(cv_results['test_accuracy'])))
        print("  Precision: {:.4f}".format(np.mean(cv_results['test_precision'])))
        print("  Recall:    {:.4f}".format(np.mean(cv_results['test_recall'])))
        print("  F1 Score:  {:.4f}".format(np.mean(cv_results['test_f1'])))

    # Fit on the full (oversampled) training set and evaluate on the held-out
    # test set, which keeps its original class distribution.
    print(f"\n---- {dataset_name} Test Set Evaluation ----")
    for name, model in models.items():
        fitted = with_smote(model).fit(X_train, y_train)
        y_pred = fitted.predict(X_test)
        print(f"\nModel: {name}")
        print(classification_report(y_test, y_pred))
        print("Confusion Matrix:")
        print(confusion_matrix(y_test, y_pred))
        print("-" * 50)


In [None]:
# 1. Demographic Model (SMOTE)
# Target is the 'Diabetes' column; every other column of demographic_df is a feature
y_demo = demographic_df['Diabetes']
X_demo = demographic_df.drop(columns='Diabetes')

train_evaluate_models(X_demo, y_demo, dataset_name="Demographic Model")


---- Demographic Model Cross-Validation Results (5 folds) ----

Model: Random Forest
  Accuracy:  0.6993
  Precision: 0.6764
  Recall:    0.7642
  F1 Score:  0.7177

Model: Logistic Regression
  Accuracy:  0.6391
  Precision: 0.6287
  Recall:    0.6796
  F1 Score:  0.6531

Model: K-Nearest Neighbors
  Accuracy:  0.5816
  Precision: 0.7714
  Recall:    0.2321
  F1 Score:  0.3568

---- Demographic Model Test Set Evaluation ----

Model: Random Forest
              precision    recall  f1-score   support

         0.0       0.90      0.64      0.74     39493
         1.0       0.21      0.57      0.30      6611

    accuracy                           0.63     46104
   macro avg       0.55      0.60      0.52     46104
weighted avg       0.80      0.63      0.68     46104

Confusion Matrix:
[[25104 14389]
 [ 2845  3766]]
--------------------------------------------------

Model: Logistic Regression
              precision    recall  f1-score   support

         0.0       0.92      0.60     

In [None]:
# 2. Lifestyle Model (SMOTE)
# Target is the 'Diabetes' column; every other column of lifestyle_df is a feature
y_lifestyle = lifestyle_df['Diabetes']
X_lifestyle = lifestyle_df.drop(columns='Diabetes')

train_evaluate_models(X_lifestyle, y_lifestyle, dataset_name="Lifestyle Model")


---- Lifestyle Model Cross-Validation Results (5 folds) ----

Model: Random Forest
  Accuracy:  0.7175
  Precision: 0.7215
  Recall:    0.7086
  F1 Score:  0.7150

Model: Logistic Regression
  Accuracy:  0.6506
  Precision: 0.6542
  Recall:    0.6387
  F1 Score:  0.6464

Model: K-Nearest Neighbors
  Accuracy:  0.6238
  Precision: 0.7794
  Recall:    0.3454
  F1 Score:  0.4786

---- Lifestyle Model Test Set Evaluation ----

Model: Random Forest
              precision    recall  f1-score   support

         0.0       0.89      0.74      0.81     39493
         1.0       0.23      0.47      0.31      6611

    accuracy                           0.70     46104
   macro avg       0.56      0.60      0.56     46104
weighted avg       0.80      0.70      0.73     46104

Confusion Matrix:
[[29028 10465]
 [ 3516  3095]]
--------------------------------------------------

Model: Logistic Regression
              precision    recall  f1-score   support

         0.0       0.92      0.66      0.7

In [None]:

import plotly.express as px