In [67]:
import pandas as pd

train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")


In [68]:
train_data.drop(["id"] ,axis= 1 ,inplace= True )
test_data.drop(["id"] ,axis= 1 ,inplace= True )

In [69]:
#Label Encoding
from sklearn.preprocessing import LabelEncoder

objects = [
    "gender",
    "ethnicity",
    "education_level",
    "income_level",
    "smoking_status",
    "employment_status"
]

for cat in objects:
    le = LabelEncoder()
    combined = pd.concat([train_data[cat] , test_data[cat]] , axis = 0).astype(str)
    le.fit(combined)
    train_data[cat] = le.transform(train_data[cat].astype(str))
    test_data[cat] = le.transform(test_data[cat].astype(str))

In [70]:
train_data["age_bmi"] = train_data["age"] * train_data["bmi"]
train_data["bmi_category"] = pd.cut(train_data["bmi"], bins=[0, 18.5, 25, 30, 100], labels=[0, 1, 2, 3]).astype(int)
train_data["bp_difference"] = train_data["systolic_bp"] - train_data["diastolic_bp"]
train_data["cholesterol_ratio"] = train_data["cholesterol_total"] / train_data["hdl_cholesterol"]

test_data["age_bmi"] = test_data["age"] * test_data["bmi"]
test_data["bmi_category"] = pd.cut(test_data["bmi"], bins=[0, 18.5, 25, 30, 100], labels=[0, 1, 2, 3]).astype(int)
test_data["bp_difference"] = test_data["systolic_bp"] - test_data["diastolic_bp"]
test_data["cholesterol_ratio"] = test_data["cholesterol_total"] / test_data["hdl_cholesterol"]

In [71]:
x = train_data.drop(["diagnosed_diabetes"] , axis=1)
y = train_data["diagnosed_diabetes"]


In [72]:
import xgboost as xgb

xgb_model = xgb.XGBRegressor(
    enable_categorical=True,
    n_estimators=150,           # Fewer trees
    max_depth=4,                # Much shallower trees (was 6)
    learning_rate=0.05,         # Slower learning (was 0.1)
    subsample=0.7,              # Less data per tree (was 0.8)
    colsample_bytree=0.7,       # Fewer features per tree (was 0.8)
    reg_alpha=1.0,              # Stronger L1 regularization (was 0.1)
    reg_lambda=10.0,            # Much stronger L2 regularization (was 1.0)
    gamma=1.0,                  # Minimum loss reduction (was 0)
    min_child_weight=10,        # More samples needed (was 3)
    random_state=42,
    n_jobs=-1,
    tree_method='hist'
)

xgb_model.fit(x , y )

In [73]:
y_test_prediction = xgb_model.predict(test_data)

In [74]:
#Precision
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
import numpy as np

x_prediction = xgb_model.predict(x)
mse_error = mean_squared_error(y , x_prediction)
rmse_error = np.sqrt(mse_error)
cv_score = cross_val_score(xgb_model , x , y , scoring="neg_mean_squared_error" , cv= 3 , n_jobs=-1)
cv_rmse = np.sqrt(-cv_score.mean())
print(rmse_error)

0.45255483380938005


In [75]:
#Submisson 
test = pd.read_csv("test.csv")

submisson = pd.DataFrame({
    'id' : test["id"],
    "loan_paid_back" : y_test_prediction
})

submisson.to_csv("submisson.csv" , index= False)