In [1]:
import pandas as pd

# Load the dataset
file_path = r"C:\Users\t14\Downloads\archive (2)\diabetes_012_health_indicators_BRFSS2015.csv"
df = pd.read_csv(file_path)

# Preview
print("Shape:", df.shape)
df.head()


Shape: (253680, 22)


Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [4]:
import numpy as np

# Convert to numeric and replace 0s
df['BMI'] = pd.to_numeric(df['BMI'], errors='coerce')
df['BMI'] = df['BMI'].replace(0, np.nan)

# Fill missing BMI with column mean
df['BMI'].fillna(df['BMI'].mean(), inplace=True)




In [5]:
from sklearn.preprocessing import MinMaxScaler

# Columns to normalize
numeric_cols = ['BMI', 'MentHlth', 'PhysHlth']

# Apply Min-Max Scaling
scaler = MinMaxScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])


In [9]:
from sklearn.model_selection import train_test_split

# Define target and features
X = df.drop(columns=['Diabetes_012'])  # All columns except the target
y = df['Diabetes_012']                 # Target column

# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [10]:
from sklearn.linear_model import LogisticRegression

# Initialize and train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)


In [11]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Make predictions
y_pred = model.predict(X_test)

# Print performance metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision (macro):", precision_score(y_test, y_pred, average='macro'))
print("Recall (macro):", recall_score(y_test, y_pred, average='macro'))
print("F1 Score (macro):", f1_score(y_test, y_pred, average='macro'))

# Full report
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.848155156102176
Precision (macro): 0.466121189077682
Recall (macro): 0.3861479955170702
F1 Score (macro): 0.396231762230333

Classification Report:
               precision    recall  f1-score   support

         0.0       0.86      0.98      0.92     42795
         1.0       0.00      0.00      0.00       944
         2.0       0.53      0.18      0.27      6997

    accuracy                           0.85     50736
   macro avg       0.47      0.39      0.40     50736
weighted avg       0.80      0.85      0.81     50736



In [13]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Random Forest F1 Score:", f1_score(y_test, y_pred_rf, average='macro'))


Random Forest Accuracy: 0.8421830652790918
Random Forest F1 Score: 0.39678250951970734


In [7]:
from pycaret.classification import *

# Setup PyCaret with your target
# Removed the 'silent' parameter as it's not supported in this version
clf = setup(data=df, target='Diabetes_012', session_id=123, normalize=True, html=False)

# Compare models
best_model = compare_models()


                    Description             Value
0                    Session id               123
1                        Target      Diabetes_012
2                   Target type        Multiclass
3           Original data shape      (253680, 22)
4        Transformed data shape      (253680, 22)
5   Transformed train set shape      (177576, 22)
6    Transformed test set shape       (76104, 22)
7              Numeric features                21
8                    Preprocess              True
9               Imputation type            simple
10           Numeric imputation              mean
11       Categorical imputation              mode
12                    Normalize              True
13             Normalize method            zscore
14               Fold Generator   StratifiedKFold
15                  Fold Number                10
16                     CPU Jobs                -1
17                      Use GPU             False
18               Log Experiment             False


                                                                                                                       

                                    Model  Accuracy     AUC  Recall   Prec.  \
lightgbm  Light Gradient Boosting Machine    0.8499  0.8234  0.8499  0.8061   
gbc          Gradient Boosting Classifier    0.8498  0.0000  0.8498  0.8064   
ada                  Ada Boost Classifier    0.8481  0.0000  0.8481  0.8040   
lr                    Logistic Regression    0.8467  0.0000  0.8467  0.8001   
ridge                    Ridge Classifier    0.8452  0.0000  0.8452  0.7972   
lda          Linear Discriminant Analysis    0.8445  0.0000  0.8445  0.8003   
svm                   SVM - Linear Kernel    0.8424  0.0000  0.8424  0.7097   
dummy                    Dummy Classifier    0.8424  0.5000  0.8424  0.7097   
rf               Random Forest Classifier    0.8417  0.7906  0.8417  0.7953   
et                 Extra Trees Classifier    0.8333  0.7704  0.8333  0.7872   
knn                K Neighbors Classifier    0.8309  0.7186  0.8309  0.7848   
dt               Decision Tree Classifier    0.7668 

