In [1]:
import numpy as np
import pandas as pd
import joblib   
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import joblib
from sklearn.pipeline import Pipeline

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

In [2]:
# Show the current working directory; the relative "raw/..." paths used in the
# cells below are resolved against it.
%pwd

'c:\\Users\\ASUS\\Desktop\\diabetes_prediction\\data'

In [3]:
# Read the raw test split (path is relative to the working directory) and
# preview its first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, so the CSV appears to
# carry a saved index; it is left in place here — confirm the fitted
# preprocessor ignores it.
test_csv_path = "raw/test.csv"
df_test = pd.read_csv(test_csv_path)
df_test.head()

Unnamed: 0,id,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,...,triglycerides,gender,ethnicity,education_level,income_level,smoking_status,employment_status,family_history_diabetes,hypertension_history,cardiovascular_history
0,700000,45,4,100,4.3,6.8,6.2,25.5,0.84,123,...,111,Female,White,Highschool,Middle,Former,Employed,0,0,0
1,700001,35,1,87,3.5,4.6,9.0,28.6,0.88,120,...,145,Female,White,Highschool,Middle,Never,Unemployed,0,0,0
2,700002,45,1,61,7.6,6.8,7.0,28.5,0.94,112,...,184,Male,White,Highschool,Low,Never,Employed,0,0,0
3,700003,55,2,81,7.3,7.3,5.0,26.9,0.91,114,...,128,Male,White,Graduate,Middle,Former,Employed,0,0,0
4,700004,77,2,29,7.3,7.6,8.5,22.0,0.83,131,...,133,Male,White,Graduate,Low,Current,Unemployed,0,0,0


In [4]:
# Feature engineering for the test split, added to df_test in place.
# It is meant to be the SAME set of derived columns that was built for the
# training data, so thresholds and bin edges are kept exactly as they were.
#
# pd.cut is used with its defaults (right=True, include_lowest=False), so every
# interval is (low, high]: a value equal to the first edge (e.g. age 18, or 0
# for the zero-based bins) gets no label and becomes NaN.
# NOTE(review): left unchanged on purpose so test features match training —
# confirm how the fitted preprocessor treats those NaNs.


def _bucketize(column, edges, names):
    """Bin df_test[column] into the labelled intervals defined by `edges`."""
    return pd.cut(df_test[column], bins=edges, labels=names)


# Aliases for raw columns that feed more than one derived feature.
activity_minutes = df_test['physical_activity_minutes_per_week']
screen_hours = df_test['screen_time_hours_per_day']
sleep_hours = df_test['sleep_hours_per_day']
weekly_alcohol = df_test['alcohol_consumption_per_week']
hdl = df_test['hdl_cholesterol']

# --- Demographic / body-composition buckets ---
df_test['age_group'] = _bucketize(
    'age', [18, 29, 44, 59, 120], ['18-29', '30-44', '45-59', '60+']
)
df_test['bmi_category'] = _bucketize(
    'bmi', [0, 18.5, 24.9, 29.9, 100],
    ['Underweight', 'Normal', 'Overweight', 'Obese']
)

# Waist-to-hip risk uses a gender-specific cut-off (0.90 male, 0.85 female);
# rows with any other gender value are flagged 0.
is_male = df_test['gender'] == 'Male'
is_female = df_test['gender'] == 'Female'
waist_hip = df_test['waist_to_hip_ratio']
elevated_waist_hip = (is_male & (waist_hip > 0.90)) | (is_female & (waist_hip > 0.85))
df_test['waist_hip_risk'] = np.where(elevated_waist_hip, 1, 0)

# --- Lifestyle features ---
df_test['activity_level'] = _bucketize(
    'physical_activity_minutes_per_week', [0, 75, 150, 10000],
    ['Low', 'Moderate', 'High']
)

# Sedentary = more than 6 h of screen time AND under 75 active minutes a week.
is_sedentary = (screen_hours > 6) & (activity_minutes < 75)
df_test['sedentary_lifestyle'] = np.where(is_sedentary, 1, 0)

df_test['sleep_category'] = _bucketize(
    'sleep_hours_per_day', [0, 6, 8, 24], ['Short', 'Normal', 'Long']
)
df_test['alcohol_risk'] = _bucketize(
    'alcohol_consumption_per_week', [0, 2, 7, 100], ['Low', 'Moderate', 'High']
)

# --- Cardiovascular features ---
df_test['pulse_pressure'] = df_test['systolic_bp'].sub(df_test['diastolic_bp'])
df_test['heart_rate_risk'] = _bucketize(
    'heart_rate', [0, 60, 80, 200], ['Low', 'Normal', 'High']
)

# --- Lipid features (both ratios are relative to HDL) ---
df_test['ldl_hdl_ratio'] = df_test['ldl_cholesterol'].div(hdl)
df_test['total_hdl_ratio'] = df_test['cholesterol_total'].div(hdl)
df_test['high_triglycerides'] = df_test['triglycerides'].gt(150).astype(int)

# Metabolic-syndrome flag: all five conditions must hold at once.
raised_blood_pressure = (df_test['systolic_bp'] >= 130) | (df_test['diastolic_bp'] >= 85)
meets_all_criteria = (
    df_test['bmi'].ge(30)
    & df_test['waist_hip_risk'].eq(1)
    & df_test['triglycerides'].gt(150)
    & hdl.lt(40)
    & raised_blood_pressure
)
df_test['metabolic_syndrome'] = np.where(meets_all_criteria, 1, 0)

# Lifestyle risk score: number of the five risk conditions a row satisfies (0-5).
lifestyle_risk_flags = [
    activity_minutes < 75,
    df_test['diet_score'] < 5,
    screen_hours > 6,
    sleep_hours < 6,
    weekly_alcohol > 7,
]
df_test['lifestyle_risk_score'] = sum(flag.astype(int) for flag in lifestyle_risk_flags)

# --- Interaction terms ---
df_test['age_bmi_interaction'] = df_test['age'].mul(df_test['bmi'])
df_test['activity_bmi_interaction'] = activity_minutes.mul(df_test['bmi'])




In [5]:
# Drop the target column if it accidentally exists, so only features reach the
# preprocessor. Both spellings are removed: this cell originally looked only
# for "diabetes_diagnosed", while the submission cell names the target
# "diagnosed_diabetes"; with errors="ignore" a misspelt name silently drops
# nothing.
X_test_final = df_test.drop(
    columns=["diabetes_diagnosed", "diagnosed_diabetes"],
    errors="ignore",
)

# Load the saved preprocessor (joblib is imported in the first cell).
# NOTE: joblib.load unpickles the file, which can execute arbitrary code —
# only load artifacts that you produced yourself or otherwise trust.
preprocessor = joblib.load("raw/artifacts/preprocessor.pkl")

# Transform ONLY: the loaded preprocessor is applied as-is and never refitted
# on the test data.
X_test_processed = preprocessor.transform(X_test_final)

print("df_test processed successfully!")


df_test processed successfully!


In [6]:
# Display the transformed test feature matrix (NumPy abbreviates the repr of
# large arrays, so only the corners and the shape are shown).
X_test_processed 

array([[ 1.        ,  3.        ,  1.        , ...,  0.67878643,
        -0.45633459,  0.37325614],
       [ 1.        ,  3.        ,  2.        , ...,  1.80592405,
        -0.88145094,  0.32584678],
       [ 1.        ,  1.        ,  1.        , ...,  0.67878643,
        -0.06458914, -0.24927941],
       ...,
       [ 1.        ,  3.        ,  1.        , ..., -0.44835119,
         0.80247412,  3.26936977],
       [ 1.        ,  1.        ,  1.        , ...,  0.67878643,
         0.0720865 , -0.05296783],
       [ 0.        ,  4.        ,  1.        , ..., -1.57548882,
        -0.22651059, -0.08127536]], shape=(300000, 45))

In [11]:


# Load the saved model bundle and unpack the two entries used downstream:
# the estimator stored under "model" and the decision cut-off under "threshold".
model_bundle_path = "raw/artifacts/best_model.pkl"
bundle = joblib.load(model_bundle_path)
model, threshold = bundle["model"], bundle["threshold"]


In [12]:
# Score the processed test rows. Column index 1 of predict_proba is kept as the
# probability (presumably the positive class — confirm against model.classes_),
# then the stored threshold turns it into a hard 0/1 label.
class_probabilities = model.predict_proba(X_test_processed)
y_pred_prob = class_probabilities[:, 1]
y_pred = (y_pred_prob >= threshold).astype(int)




In [16]:
# Inspect the predicted probabilities (column 1 of predict_proba) that are
# written to the submission file in the next cell.
y_pred_prob

array([0.53008591, 0.67688924, 0.77724733, ..., 0.51172092, 0.62400727,
       0.59258163], shape=(300000,))

In [17]:
# Build the submission table: each test id paired with its predicted
# probability (the thresholded y_pred labels are not written).
submission = df_test[["id"]].copy()
submission["diagnosed_diabetes"] = y_pred_prob

# Write the table to disk without the DataFrame index column.
submission_path = "submission.csv"
submission.to_csv(submission_path, index=False)
print("submission.csv saved successfully!")



submission.csv saved successfully!
