In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#modelling libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import joblib
from sklearn.pipeline import Pipeline

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

In [2]:
%pwd

'c:\\Users\\ASUS\\Desktop\\diabetes_prediction\\data\\raw'

In [3]:
df_train = pd.read_csv('train.csv')

In [4]:
# Create age groups
df_train['age_group'] = pd.cut(
    df_train['age'],
    bins=[18, 29, 44, 59, 120],
    labels=['18-29', '30-44', '45-59', '60+']
)

# Create BMI categories
df_train['bmi_category'] = pd.cut(
    df_train['bmi'],
    bins=[0, 18.5, 24.9, 29.9, 100],
    labels=['Underweight', 'Normal', 'Overweight', 'Obese']
)

# Create waist-to-hip ratio risk indicator
df_train['waist_hip_risk'] = np.where(
    ((df_train['gender'] == 'Male') & (df_train['waist_to_hip_ratio'] > 0.90)) |
    ((df_train['gender'] == 'Female') & (df_train['waist_to_hip_ratio'] > 0.85)),
    1, 0
)


# Create physical activity level categories
df_train['activity_level'] = pd.cut(
    df_train['physical_activity_minutes_per_week'],
    bins=[0, 75, 150, 10000],
    labels=['Low', 'Moderate', 'High']
)

# Create sedentary lifestyle indicator
df_train['sedentary_lifestyle'] = np.where(
    (df_train['screen_time_hours_per_day'] > 6) &
    (df_train['physical_activity_minutes_per_week'] < 75),
    1, 0
)


# Create sleep quality categories
df_train['sleep_category'] = pd.cut(
    df_train['sleep_hours_per_day'],
    bins=[0, 6, 8, 24],
    labels=['Short', 'Normal', 'Long']
)

# Create alcohol consumption risk categories
df_train['alcohol_risk'] = pd.cut(
    df_train['alcohol_consumption_per_week'],
    bins=[0, 2, 7, 100],
    labels=['Low', 'Moderate', 'High']
)


# Create pulse pressure feature
df_train['pulse_pressure'] = df_train['systolic_bp'] - df_train['diastolic_bp']

# Create hypertension category


conditions_bp = [
    (df_train['systolic_bp'] >= 140) | (df_train['diastolic_bp'] >= 90),
    (df_train['systolic_bp'] >= 120) | (df_train['diastolic_bp'] >= 80)
]

choices_bp = ['High', 'Elevated']


# Create heart rate risk categories
df_train['heart_rate_risk'] = pd.cut(
    df_train['heart_rate'],
    bins=[0, 60, 80, 200],
    labels=['Low', 'Normal', 'High']
)

# Create lipid ratios
df_train['ldl_hdl_ratio'] = df_train['ldl_cholesterol'] / df_train['hdl_cholesterol']
df_train['total_hdl_ratio'] = df_train['cholesterol_total'] / df_train['hdl_cholesterol']


# Create high triglycerides flag
df_train['high_triglycerides'] = np.where(df_train['triglycerides'] > 150, 1, 0)

# Create metabolic syndrome flag (composite feature)
df_train['metabolic_syndrome'] = np.where(
    (df_train['bmi'] >= 30) &
    (df_train['waist_hip_risk'] == 1) &
    (df_train['triglycerides'] > 150) &
    (df_train['hdl_cholesterol'] < 40) &
    ((df_train['systolic_bp'] >= 130) | (df_train['diastolic_bp'] >= 85)),
    1, 0
)

# Create lifestyle risk score(0-5)
df_train['lifestyle_risk_score'] = (
    (df_train['physical_activity_minutes_per_week'] < 75).astype(int) +
    (df_train['diet_score'] < 5).astype(int) +
    (df_train['screen_time_hours_per_day'] > 6).astype(int) +
    (df_train['sleep_hours_per_day'] < 6).astype(int) +
    (df_train['alcohol_consumption_per_week'] > 7).astype(int)
)

# Create interaction features
df_train['age_bmi_interaction'] = df_train['age'] * df_train['bmi']
df_train['activity_bmi_interaction'] = df_train['physical_activity_minutes_per_week'] * df_train['bmi']


# Drop ID
df_train = df_train.drop(columns=['id'])

# Optional: check new features
df_train.head()

Unnamed: 0,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,...,alcohol_risk,pulse_pressure,heart_rate_risk,ldl_hdl_ratio,total_hdl_ratio,high_triglycerides,metabolic_syndrome,lifestyle_risk_score,age_bmi_interaction,activity_bmi_interaction
0,31,1,45,7.7,6.8,6.1,33.4,0.93,112,70,...,Low,42,Normal,1.965517,3.431034,0,0,2,1035.4,1503.0
1,50,2,73,5.7,6.5,5.8,23.8,0.83,120,77,...,Low,43,Normal,2.42,3.98,0,0,1,1190.0,1737.4
2,32,3,158,8.5,7.4,9.1,24.1,0.83,95,89,...,Moderate,6,Normal,1.932203,3.186441,0,0,1,771.2,3807.8
3,54,3,77,4.6,7.0,9.2,26.6,0.83,121,69,...,Moderate,52,Normal,1.574074,3.37037,0,0,2,1436.4,2048.2
4,54,1,55,5.7,6.2,5.1,28.8,0.9,108,60,...,Low,48,High,2.673469,4.204082,0,0,1,1555.2,1584.0


In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700000 entries, 0 to 699999
Data columns (total 41 columns):
 #   Column                              Non-Null Count   Dtype   
---  ------                              --------------   -----   
 0   age                                 700000 non-null  int64   
 1   alcohol_consumption_per_week        700000 non-null  int64   
 2   physical_activity_minutes_per_week  700000 non-null  int64   
 3   diet_score                          700000 non-null  float64 
 4   sleep_hours_per_day                 700000 non-null  float64 
 5   screen_time_hours_per_day           700000 non-null  float64 
 6   bmi                                 700000 non-null  float64 
 7   waist_to_hip_ratio                  700000 non-null  float64 
 8   systolic_bp                         700000 non-null  int64   
 9   diastolic_bp                        700000 non-null  int64   
 10  heart_rate                          700000 non-null  int64   
 11  cholesterol_t

In [6]:
# Identify categorical and numerical columns
categorical_cols = df_train.select_dtypes(
    include=["object", "category"]
).columns.tolist()

numerical_cols = df_train.select_dtypes(
    include=["int64", "float64"]
).columns.tolist()

print("Categorical Columns:", categorical_cols)
print("Numerical Columns:", numerical_cols)

Categorical Columns: ['gender', 'ethnicity', 'education_level', 'income_level', 'smoking_status', 'employment_status', 'age_group', 'bmi_category', 'activity_level', 'sleep_category', 'alcohol_risk', 'heart_rate_risk']
Numerical Columns: ['age', 'alcohol_consumption_per_week', 'physical_activity_minutes_per_week', 'diet_score', 'sleep_hours_per_day', 'screen_time_hours_per_day', 'bmi', 'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate', 'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol', 'triglycerides', 'family_history_diabetes', 'hypertension_history', 'cardiovascular_history', 'diagnosed_diabetes', 'waist_hip_risk', 'sedentary_lifestyle', 'pulse_pressure', 'ldl_hdl_ratio', 'total_hdl_ratio', 'high_triglycerides', 'metabolic_syndrome', 'lifestyle_risk_score', 'age_bmi_interaction', 'activity_bmi_interaction']


In [7]:
import os

target_col = 'diagnosed_diabetes'  
X = df_train.drop(columns=[target_col])
y = df_train[target_col]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

ordinal_cols = ['education_level', 'income_level', 'sleep_category',
                'activity_level', 'bmi_category', 'age_group']

nominal_cols = ['gender', 'ethnicity', 'smoking_status']

numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()


preprocessor = ColumnTransformer(
    transformers=[
        # Ordinal features → preserve order
        ("ord", OrdinalEncoder(), ordinal_cols),

        # Nominal features → one-hot encode
        ("nom", OneHotEncoder(handle_unknown="ignore", sparse_output=False), nominal_cols),

        # Numeric features
        ("num", StandardScaler(), numerical_cols)
    ]
)

# Fit and Transform Train Data
X_train_processed = preprocessor.fit_transform(X_train)

# Transform Test Data
X_test_processed = preprocessor.transform(X_test)


# Step 5: Save Preprocessor

os.makedirs("artifacts", exist_ok=True)
joblib.dump(preprocessor, "artifacts/preprocessor.pkl")

print("Preprocessor saved successfully!")


Preprocessor saved successfully!


In [8]:
X_train_processed, X_test_processed


(array([[ 1.        ,  2.        ,  1.        , ...,  1.80592405,
          0.55959194, -0.45617755],
        [ 1.        ,  4.        ,  1.        , ..., -0.44835119,
         -0.79671785, -0.69966835],
        [ 0.        ,  3.        ,  0.        , ..., -1.57548882,
         -0.16644295,  0.86253917],
        ...,
        [ 0.        ,  3.        ,  1.        , ...,  0.67878643,
          1.91764283, -1.24786786],
        [ 0.        ,  4.        ,  1.        , ..., -0.44835119,
          1.10977666, -0.1750968 ],
        [ 1.        ,  2.        ,  1.        , ..., -1.57548882,
          0.90955121,  1.38035987]], shape=(560000, 45)),
 array([[ 1.        ,  2.        ,  1.        , ..., -0.44835119,
          0.23574904,  0.96886502],
        [ 0.        ,  3.        ,  1.        , ...,  1.80592405,
          0.07324722, -0.60346877],
        [ 1.        ,  2.        ,  1.        , ...,  0.67878643,
         -1.40842112,  0.69108298],
        ...,
        [ 1.        ,  2.        ,

Create an evaluate function to give all metrics after model training

In [9]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

def evaluate_model_classification(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='weighted')  # weighted handles class imbalance
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    report = classification_report(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred)
    
    return {
        "accuracy": acc,
        "f1_score": f1,
        "precision": precision,
        "recall": recall,
        "classification_report": report,
        "confusion_matrix": cm
    }


In [15]:
from sklearn.ensemble import VotingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
import joblib, os

lgbm = LGBMClassifier(
    n_estimators=300,
    max_depth=10,
    random_state=42,
    n_jobs=-1
)

xgb = XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    eval_metric='logloss',
    random_state=42,
    n_jobs=4
)

ensemble_model = VotingClassifier(
    estimators=[
        ('lgbm', lgbm),
        ('xgb', xgb)
    ],
    voting='soft'
)

print("Training ensemble...")
ensemble_model.fit(X_train_processed, y_train)
print("Training completed")

y_train_pred = ensemble_model.predict(X_train_processed)
# --- Probability-based prediction (threshold tuning) ---
# --- Find best threshold ---
import numpy as np
from sklearn.metrics import f1_score, classification_report

y_test_proba = ensemble_model.predict_proba(X_test_processed)[:, 1]

thresholds = np.arange(0.30, 0.70, 0.01)

best_f1 = 0
best_threshold = 0.5

for t in thresholds:
    y_pred_tmp = (y_test_proba >= t).astype(int)
    f1 = f1_score(y_test, y_pred_tmp, average='weighted')

    if f1 > best_f1:
        best_f1 = f1
        best_threshold = t

print(f"Best Threshold: {best_threshold:.2f}")
print(f"Best F1 Score : {best_f1:.4f}")

# --- FINAL predictions using best threshold ---
y_test_pred = (y_test_proba >= best_threshold).astype(int)

print("Final Test F1 Score:", f1_score(y_test, y_test_pred, average='weighted'))
print(classification_report(y_test, y_test_pred))



print("Training Accuracy :", accuracy_score(y_train, y_train_pred))
print("Testing Accuracy  :", accuracy_score(y_test, y_test_pred))
print("Test F1 Score     :", f1_score(y_test, y_test_pred, average='weighted'))
print(classification_report(y_test, y_test_pred))

os.makedirs("artifacts", exist_ok=True)
joblib.dump(
    {
        "model": ensemble_model,
        "threshold": 0.56
    },
    "artifacts/best_model.pkl"
)

joblib.dump(preprocessor, "artifacts/preprocessor.pkl")

print("Best Model + Threshold Saved")


Training ensemble...
[LightGBM] [Info] Number of positive: 348936, number of negative: 211064
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.160320 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2798
[LightGBM] [Info] Number of data points in the train set: 560000, number of used features: 45
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.623100 -> initscore=0.502727
[LightGBM] [Info] Start training from score 0.502727
Training completed




Best Threshold: 0.56
Best F1 Score : 0.6764
Final Test F1 Score: 0.676400549013492
              precision    recall  f1-score   support

         0.0       0.57      0.55      0.56     52629
         1.0       0.74      0.75      0.74     87371

    accuracy                           0.68    140000
   macro avg       0.66      0.65      0.65    140000
weighted avg       0.68      0.68      0.68    140000

Training Accuracy : 0.6927375
Testing Accuracy  : 0.6775142857142857
Test F1 Score     : 0.676400549013492
              precision    recall  f1-score   support

         0.0       0.57      0.55      0.56     52629
         1.0       0.74      0.75      0.74     87371

    accuracy                           0.68    140000
   macro avg       0.66      0.65      0.65    140000
weighted avg       0.68      0.68      0.68    140000

Best Model + Threshold Saved
