In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

In [2]:
df=pd.read_csv('Vitals_with_RiskLevels.csv')

In [3]:
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,risk
0,0,80.0,0,1,25.19,6.6,140,1
1,0,54.0,0,0,27.32,6.6,80,1
2,1,28.0,0,0,27.32,5.7,158,0
3,0,36.0,0,0,23.45,5.0,155,0
4,1,76.0,1,1,20.14,4.8,155,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95873 entries, 0 to 95872
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   gender               95873 non-null  int64  
 1   age                  95873 non-null  float64
 2   hypertension         95873 non-null  int64  
 3   heart_disease        95873 non-null  int64  
 4   bmi                  95873 non-null  float64
 5   HbA1c_level          95873 non-null  float64
 6   blood_glucose_level  95873 non-null  int64  
 7   risk                 95873 non-null  int64  
dtypes: float64(3), int64(5)
memory usage: 5.9 MB


In [5]:
# Count total duplicate rows
duplicates_count = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates_count}")

# Show the actual duplicate rows (if any)
duplicates = df[df.duplicated()]
print(duplicates.head())


Number of duplicate rows: 8888
      gender   age  hypertension  heart_disease    bmi  HbA1c_level  \
663        0  80.0             0              0  27.32          6.0   
1878       0  49.0             0              0  27.32          6.1   
2068       0  62.0             0              0  27.32          5.8   
2271       0  24.0             0              0  27.32          4.0   
2537       1  52.0             0              0  27.32          5.0   

      blood_glucose_level  risk  
663                   200     1  
1878                  160     0  
2068                   90     0  
2271                  130     0  
2537                  159     0  


In [6]:
# Drop duplicates
df = df.drop_duplicates()

print("Shape after removing duplicates:", df.shape)


Shape after removing duplicates: (86985, 8)


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 86985 entries, 0 to 95872
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   gender               86985 non-null  int64  
 1   age                  86985 non-null  float64
 2   hypertension         86985 non-null  int64  
 3   heart_disease        86985 non-null  int64  
 4   bmi                  86985 non-null  float64
 5   HbA1c_level          86985 non-null  float64
 6   blood_glucose_level  86985 non-null  int64  
 7   risk                 86985 non-null  int64  
dtypes: float64(3), int64(5)
memory usage: 6.0 MB


In [8]:
X = df.drop('risk', axis=1)
y = df['risk']

rf = RandomForestClassifier(random_state=42)
rf.fit(X, y)

importances = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)
print(importances)

selected_features = importances.head(5).index.tolist()
print("\nSelected Features:", selected_features)

X_selected = X[selected_features]

HbA1c_level            0.502815
blood_glucose_level    0.237012
hypertension           0.165404
heart_disease          0.078167
age                    0.012990
bmi                    0.003234
gender                 0.000379
dtype: float64

Selected Features: ['HbA1c_level', 'blood_glucose_level', 'hypertension', 'heart_disease', 'age']


In [10]:
# Drop leakage columns (used in risk definition)
leakage_cols = ["HbA1c_level", "blood_glucose_level", "hypertension", "heart_disease", "bmi"]

X = df.drop(columns=leakage_cols + ["risk"])
y = df["risk"]

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train model
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Evaluate
y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.67      0.88      0.76     10970
           1       0.56      0.27      0.37      6427

    accuracy                           0.65     17397
   macro avg       0.62      0.57      0.56     17397
weighted avg       0.63      0.65      0.61     17397



In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

# -----------------------------
# 1. Load dataset
# -----------------------------

# Drop duplicate rows
df = df.drop_duplicates()

# Drop leakage features (used in risk definition)
leakage_cols = ["HbA1c_level", "blood_glucose_level", "hypertension", "heart_disease", "bmi"]
X = df.drop(columns=leakage_cols + ["risk"])
y = df["risk"]

# -----------------------------
# 2. Feature Engineering
# -----------------------------

# Age groups
df["age_group"] = pd.cut(df["age"], 
                         bins=[0, 30, 50, 70, 120], 
                         labels=["young", "adult", "mid_age", "elderly"])

# BMI categories (we still use BMI indirectly)
df["bmi_category"] = pd.cut(df["bmi"], 
                            bins=[0, 18.5, 25, 30, 100], 
                            labels=["underweight", "normal", "overweight", "obese"])

# Interaction feature: Age × BMI
df["age_bmi_interaction"] = df["age"] * df["bmi"]

# Replace X with engineered features + original safe ones
X = df[["age", "gender", "bmi_category", "age_group", "age_bmi_interaction"]]

# -----------------------------
# 3. Encoding
# -----------------------------
X = pd.get_dummies(X, drop_first=True)

# -----------------------------
# 4. Handle Imbalance
# -----------------------------
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# -----------------------------
# 5. Train/Test Split
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
)

# -----------------------------
# 6. Models
# -----------------------------
models = {
    "RandomForest": RandomForestClassifier(random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "LogisticRegression": LogisticRegression(max_iter=500, random_state=42)
}

# -----------------------------
# 7. Evaluate with Cross-Validation
# -----------------------------
for name, model in models.items():
    scores = cross_val_score(model, X_resampled, y_resampled, cv=5, scoring="accuracy")
    print(f"\n{name} - CV Mean Accuracy: {scores.mean():.4f}")

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"{name} Classification Report:\n", classification_report(y_test, y_pred))



RandomForest - CV Mean Accuracy: 0.6352
RandomForest Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.64      0.64     10971
           1       0.64      0.63      0.63     10970

    accuracy                           0.64     21941
   macro avg       0.64      0.64      0.64     21941
weighted avg       0.64      0.64      0.64     21941


GradientBoosting - CV Mean Accuracy: 0.6180
GradientBoosting Classification Report:
               precision    recall  f1-score   support

           0       0.61      0.68      0.64     10971
           1       0.64      0.57      0.60     10970

    accuracy                           0.62     21941
   macro avg       0.62      0.62      0.62     21941
weighted avg       0.62      0.62      0.62     21941



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


LogisticRegression - CV Mean Accuracy: 0.6036
LogisticRegression Classification Report:
               precision    recall  f1-score   support

           0       0.59      0.66      0.62     10971
           1       0.62      0.55      0.58     10970

    accuracy                           0.60     21941
   macro avg       0.61      0.60      0.60     21941
weighted avg       0.61      0.60      0.60     21941



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
!pip install xgboost



In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
import xgboost as xgb

df = df.drop_duplicates()

# Drop leakage columns
leakage_cols = ["HbA1c_level", "blood_glucose_level", "hypertension", "heart_disease", "bmi"]
X = df.drop(columns=leakage_cols + ["risk"])
y = df["risk"]

# Feature engineering
df["age_group"] = pd.cut(df["age"], bins=[0,30,50,70,120], labels=["young","adult","mid_age","elderly"])
df["bmi_category"] = pd.cut(df["bmi"], bins=[0,18.5,25,30,100], labels=["underweight","normal","overweight","obese"])
df["age_bmi_interaction"] = df["age"] * df["bmi"]

# Final feature set
X = df[["age", "gender", "bmi_category", "age_group", "age_bmi_interaction"]]
X = pd.get_dummies(X, drop_first=True)

# Balance dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
)

# -----------------------------
# 2. XGBoost Classifier + Grid Search
# -----------------------------
xgb_clf = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric="logloss",
    random_state=42
)

param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [3, 5, 7],
    "learning_rate": [0.01, 0.1, 0.2],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(
    estimator=xgb_clf,
    param_grid=param_grid,
    scoring="f1",  # optimize for F1-score (balance precision & recall)
    cv=cv,
    n_jobs=-1,
    verbose=1
)

grid.fit(X_train, y_train)

# -----------------------------
# 3. Best Model Evaluation
# -----------------------------
best_model = grid.best_estimator_
print("Best Parameters:", grid.best_params_)

y_pred = best_model.predict(X_test)
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Fitting 5 folds for each of 108 candidates, totalling 540 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 300, 'subsample': 1.0}

Classification Report:
               precision    recall  f1-score   support

           0       0.61      0.69      0.65     10971
           1       0.64      0.56      0.60     10970

    accuracy                           0.62     21941
   macro avg       0.63      0.62      0.62     21941
weighted avg       0.63      0.62      0.62     21941



In [14]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [26]:
import joblib

joblib.dump(rf, "risk_model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(pca, "pca.pkl")
joblib.dump(X.columns, "feature_columns.pkl")


['feature_columns.pkl']

In [17]:
from sklearn.decomposition import PCA

# Pick min(n_features, 10) so it never breaks
n_features = X_resampled.shape[1]
pca = PCA(n_components=min(10, n_features))

X_resampled_pca = pca.fit_transform(X_resampled)
X_test_pca = pca.transform(X_test_scaled)




In [22]:
print(df["risk"].value_counts())
print(df["risk"].unique())


risk
0    54851
1    32134
Name: count, dtype: int64
[1 0]


In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 86985 entries, 0 to 95872
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   gender               86985 non-null  int64   
 1   age                  86985 non-null  float64 
 2   hypertension         86985 non-null  int64   
 3   heart_disease        86985 non-null  int64   
 4   bmi                  86985 non-null  float64 
 5   HbA1c_level          86985 non-null  float64 
 6   blood_glucose_level  86985 non-null  int64   
 7   risk                 86985 non-null  int64   
 8   age_group            86985 non-null  category
 9   bmi_category         86985 non-null  category
 10  age_bmi_interaction  86985 non-null  float64 
dtypes: category(2), float64(4), int64(5)
memory usage: 6.8 MB


In [24]:

features = ["age", "bmi", "blood_glucose_level", "gender", "risk"]
df_small = df[features].dropna()

df_small["gender"] = df_small["gender"].map({"Male": 1, "Female": 0})

X = df_small.drop("risk", axis=1)
y = df_small["risk"].astype(int)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=42, n_estimators=300, max_depth=7)
rf.fit(X_train_scaled, y_train)

from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

y_pred = rf.predict(X_test_scaled)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, rf.predict_proba(X_test_scaled)[:,1]))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

import joblib
joblib.dump(rf, "risk_model_simple.pkl")
joblib.dump(scaler, "scaler_simple.pkl")


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.97      0.83     16456
           1       0.89      0.37      0.52      9640

    accuracy                           0.75     26096
   macro avg       0.81      0.67      0.67     26096
weighted avg       0.78      0.75      0.71     26096

ROC-AUC: 0.7525196576007035
Confusion Matrix:
 [[16011   445]
 [ 6112  3528]]


['scaler_simple.pkl']

In [25]:
import pandas as pd
import joblib

# Load simple model
model = joblib.load("risk_model_simple.pkl")
scaler = joblib.load("scaler_simple.pkl")

def predict_new(age, bmi, gender, blood_glucose):
    # Prepare new data
    new_data = pd.DataFrame([{
        "age": age,
        "bmi": bmi,
        "blood_glucose_level": blood_glucose,
        "gender": 1 if gender == "Male" else 0
    }])

    # Scale
    new_scaled = scaler.transform(new_data)

    # Predict
    pred = model.predict(new_scaled)[0]
    prob = model.predict_proba(new_scaled).max()

    return pred, prob

# Example
print(predict_new(40, 28.0, "Male", 150))
print(predict_new(25, 20.0, "Male", 100))   
print(predict_new(70, 40.0, "Female", 300))
print(predict_new(50, 30.0, "Male", 189))

# as it moves close to 1 the model is very confident with the decision

(0, 0.7933502754771135)
(0, 0.7206810808737298)
(1, 0.995819871255556)
(1, 0.9958467409352365)


In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 86985 entries, 0 to 95872
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   gender               86985 non-null  int64   
 1   age                  86985 non-null  float64 
 2   hypertension         86985 non-null  int64   
 3   heart_disease        86985 non-null  int64   
 4   bmi                  86985 non-null  float64 
 5   HbA1c_level          86985 non-null  float64 
 6   blood_glucose_level  86985 non-null  int64   
 7   risk                 86985 non-null  int64   
 8   age_group            86985 non-null  category
 9   bmi_category         86985 non-null  category
 10  age_bmi_interaction  86985 non-null  float64 
dtypes: category(2), float64(4), int64(5)
memory usage: 6.8 MB


In [36]:

features = ["age", "bmi", "blood_glucose_level", "gender", "risk","HbA1c_level"]
df_small = df[features].dropna()

df_small["gender"] = df_small["gender"].map({"Male": 1, "Female": 0})

X = df_small.drop("risk", axis=1)
y = df_small["risk"].astype(int)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=42, n_estimators=300, max_depth=7)
rf.fit(X_train_scaled, y_train)

from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

y_pred = rf.predict(X_test_scaled)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, rf.predict_proba(X_test_scaled)[:,1]))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

import joblib
joblib.dump(rf, "risk_model_simple.pkl")
joblib.dump(scaler, "scaler_simple.pkl")


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


Classification Report:
               precision    recall  f1-score   support

           0       0.90      1.00      0.95     16456
           1       1.00      0.81      0.89      9640

    accuracy                           0.93     26096
   macro avg       0.95      0.90      0.92     26096
weighted avg       0.94      0.93      0.93     26096

ROC-AUC: 0.9654214741132899
Confusion Matrix:
 [[16456     0]
 [ 1871  7769]]


['scaler_simple.pkl']