In [48]:
import pandas as pd
import numpy as np

In [49]:
df_train = pd.read_csv('./train.csv')
df_test = pd.read_csv('./test.csv')

# EDA

In [50]:
# Get common columns
common_cols = set(df_train.columns) & set(df_test.columns)

# Check if they have the same columns (excluding target column)
train_cols = set(df_train.columns) - {'diagnosed_diabetes'}
test_cols = set(df_test.columns)

if train_cols == test_cols:
    print("✓ Both datasets have the same columns (excluding target)")
else:
    print("✗ Datasets do NOT have the same columns")

print(f"\nComparing data types for {len(common_cols)} common columns:")
print("-" * 60)

for col in sorted(common_cols):
    train_dtype = df_train[col].dtype
    test_dtype = df_test[col].dtype
    
    if train_dtype == test_dtype:
        status = "✓"
    else:
        status = "✗"
    
    print(f"{status} {col:<35} | Train: {str(train_dtype):<10} | Test: {str(test_dtype)}")

✓ Both datasets have the same columns (excluding target)

Comparing data types for 25 common columns:
------------------------------------------------------------
✓ age                                 | Train: int64      | Test: int64
✓ alcohol_consumption_per_week        | Train: int64      | Test: int64
✓ bmi                                 | Train: float64    | Test: float64
✓ cardiovascular_history              | Train: int64      | Test: int64
✓ cholesterol_total                   | Train: int64      | Test: int64
✓ diastolic_bp                        | Train: int64      | Test: int64
✓ diet_score                          | Train: float64    | Test: float64
✓ education_level                     | Train: object     | Test: object
✓ employment_status                   | Train: object     | Test: object
✓ ethnicity                           | Train: object     | Test: object
✓ family_history_diabetes             | Train: int64      | Test: int64
✓ gender                              

In [51]:
null_counts_train = df_train.isnull().sum()
total_nulls_train = null_counts_train.sum()

for col in df_train.columns:
    null_count = null_counts_train[col]
    if null_count > 0:
        print(f"{col:<35} | {null_count:>5} nulls")

if total_nulls_train == 0:
    print("✓ No null values found in train dataset")
else:
    print(f"\nTotal null values in train: {total_nulls_train}")

✓ No null values found in train dataset


In [52]:
# Numerical features summary (limited output)
numerical_cols = df_train.select_dtypes(include=[np.number]).columns.tolist()
numerical_cols = [col for col in numerical_cols if col not in ['id', 'diagnosed_diabetes']]

print(f"Numerical features ({len(numerical_cols)}):")
print(df_train[numerical_cols].describe().T[['mean', 'std', 'min', 'max']].round(2))

Numerical features (18):
                                      mean    std     min     max
age                                  50.36  11.66   19.00   89.00
alcohol_consumption_per_week          2.07   1.05    1.00    9.00
physical_activity_minutes_per_week   80.23  51.20    1.00  747.00
diet_score                            5.96   1.46    0.10    9.90
sleep_hours_per_day                   7.00   0.90    3.10    9.90
screen_time_hours_per_day             6.01   2.02    0.60   16.50
bmi                                  25.87   2.86   15.10   38.40
waist_to_hip_ratio                    0.86   0.04    0.68    1.05
systolic_bp                         116.29  11.01   91.00  163.00
diastolic_bp                         75.44   6.83   51.00  104.00
heart_rate                           70.17   6.94   42.00  101.00
cholesterol_total                   186.82  16.73  117.00  289.00
hdl_cholesterol                      53.82   8.27   21.00   90.00
ldl_cholesterol                     102.91  19.02  

In [53]:
# Categorical features
categorical_cols = df_train.select_dtypes(include=['object']).columns.tolist()
print(f"Categorical features ({len(categorical_cols)}):")
for col in categorical_cols:
    unique_count = df_train[col].nunique()
    print(f"  {col}: {unique_count} unique values")

Categorical features (6):
  gender: 3 unique values
  ethnicity: 5 unique values
  education_level: 4 unique values
  income_level: 5 unique values
  smoking_status: 3 unique values
  employment_status: 4 unique values


In [54]:
# Correlation with target (top 10)
correlations = df_train[numerical_cols + ['diagnosed_diabetes']].corr()['diagnosed_diabetes'].abs().sort_values(ascending=False)
print("Top correlations with target:")
print(correlations.head(11).round(3))  # 11 to exclude target itself

Top correlations with target:
diagnosed_diabetes                    1.000
family_history_diabetes               0.211
physical_activity_minutes_per_week    0.170
age                                   0.161
systolic_bp                           0.107
bmi                                   0.106
ldl_cholesterol                       0.103
triglycerides                         0.091
cholesterol_total                     0.088
waist_to_hip_ratio                    0.081
hdl_cholesterol                       0.053
Name: diagnosed_diabetes, dtype: float64


In [55]:
# Check for outliers (IQR method) - only show features with significant outliers
outlier_summary = {}
for col in numerical_cols:
    Q1 = df_train[col].quantile(0.25)
    Q3 = df_train[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = ((df_train[col] < lower_bound) | (df_train[col] > upper_bound)).sum()
    if outliers > 0:
        outlier_summary[col] = outliers

if outlier_summary:
    print("Features with outliers (>1.5*IQR):")
    for col, count in sorted(outlier_summary.items(), key=lambda x: x[1], reverse=True)[:10]:
        pct = (count / len(df_train)) * 100
        print(f"  {col}: {count} ({pct:.1f}%)")
else:
    print("No significant outliers detected")

Features with outliers (>1.5*IQR):
  hypertension_history: 127393 (18.2%)
  family_history_diabetes: 104581 (14.9%)
  physical_activity_minutes_per_week: 33490 (4.8%)
  cardiovascular_history: 21227 (3.0%)
  triglycerides: 9053 (1.3%)
  waist_to_hip_ratio: 6159 (0.9%)
  sleep_hours_per_day: 6152 (0.9%)
  diastolic_bp: 5752 (0.8%)
  hdl_cholesterol: 4693 (0.7%)
  bmi: 4254 (0.6%)


In [56]:
# Target by categorical features (top categories only)
for col in categorical_cols:
    target_by_cat = df_train.groupby(col)['diagnosed_diabetes'].mean().sort_values(ascending=False)
    print(target_by_cat.head(5).round(3))
    print("\n")

gender
Other     0.641
Male      0.624
Female    0.622
Name: diagnosed_diabetes, dtype: float64


ethnicity
Other       0.636
Asian       0.628
White       0.624
Black       0.624
Hispanic    0.616
Name: diagnosed_diabetes, dtype: float64


education_level
No formal       0.636
Graduate        0.627
Highschool      0.621
Postgraduate    0.617
Name: diagnosed_diabetes, dtype: float64


income_level
Low             0.630
Lower-Middle    0.627
High            0.624
Upper-Middle    0.620
Middle          0.620
Name: diagnosed_diabetes, dtype: float64


smoking_status
Former     0.625
Current    0.623
Never      0.623
Name: diagnosed_diabetes, dtype: float64


employment_status
Employed      0.625
Unemployed    0.622
Student       0.622
Retired       0.618
Name: diagnosed_diabetes, dtype: float64




# Feature Engineering


In [57]:
# Ordinal encoding for variables with natural order
education_order = {'No formal': 0, 'Highschool': 1, 'Graduate': 2, 'Postgraduate': 3}
income_order = {'Low': 0, 'Lower-Middle': 1, 'Middle': 2, 'Upper-Middle': 3, 'High': 4}

df_train['education_level_enc'] = df_train['education_level'].map(education_order)
df_train['income_level_enc'] = df_train['income_level'].map(income_order)

df_test['education_level_enc'] = df_test['education_level'].map(education_order)
df_test['income_level_enc'] = df_test['income_level'].map(income_order)

# Verify mappings worked
print(f"Education missing: {df_train['education_level_enc'].isna().sum()} train, {df_test['education_level_enc'].isna().sum()} test")
print(f"Income missing: {df_train['income_level_enc'].isna().sum()} train, {df_test['income_level_enc'].isna().sum()} test")

Education missing: 0 train, 0 test
Income missing: 0 train, 0 test


In [58]:
# Label encoding for nominal categorical variables (for tree-based models)
from sklearn.preprocessing import LabelEncoder

nominal_cols = ['gender', 'ethnicity', 'smoking_status', 'employment_status']
label_encoders = {}

for col in nominal_cols:
    le = LabelEncoder()
    # Fit on combined data to ensure consistency
    le.fit(pd.concat([df_train[col], df_test[col]]))
    df_train[f'{col}_enc'] = le.transform(df_train[col])
    df_test[f'{col}_enc'] = le.transform(df_test[col])
    label_encoders[col] = le

print("Label encoding complete for:", nominal_cols)


Label encoding complete for: ['gender', 'ethnicity', 'smoking_status', 'employment_status']


In [59]:
# Define feature columns for modeling
encoded_cat_cols = [f'{col}_enc' for col in categorical_cols]
feature_cols = numerical_cols + encoded_cat_cols

print(f"Total features: {len(feature_cols)}")
print(f"  Numerical: {len(numerical_cols)}")
print(f"  Categorical (encoded): {len(encoded_cat_cols)}")


Total features: 24
  Numerical: 18
  Categorical (encoded): 6


In [60]:
# One-Hot Encoding for linear models (Logistic Regression, SVM, etc.)
# Only for nominal variables (no natural order)
nominal_cols = ['gender', 'ethnicity', 'smoking_status', 'employment_status']

# Create one-hot encoded features (drop_first=True to avoid multicollinearity)
df_train_ohe = pd.get_dummies(df_train[nominal_cols], drop_first=True)
df_test_ohe = pd.get_dummies(df_test[nominal_cols], drop_first=True)

# Ensure both have same columns
missing_in_test = set(df_train_ohe.columns) - set(df_test_ohe.columns)
for col in missing_in_test:
    df_test_ohe[col] = 0
df_test_ohe = df_test_ohe[df_train_ohe.columns]

# Add to dataframes
df_train = pd.concat([df_train, df_train_ohe], axis=1)
df_test = pd.concat([df_test, df_test_ohe], axis=1)

print(f"One-Hot features added: {list(df_train_ohe.columns)}")


One-Hot features added: ['gender_Male', 'gender_Other', 'ethnicity_Black', 'ethnicity_Hispanic', 'ethnicity_Other', 'ethnicity_White', 'smoking_status_Former', 'smoking_status_Never', 'employment_status_Retired', 'employment_status_Student', 'employment_status_Unemployed']


In [61]:
# Feature sets for different model types
ohe_cols = list(df_train_ohe.columns)
ordinal_cols = ['education_level_enc', 'income_level_enc']

# For tree-based models (XGBoost, LightGBM, RandomForest) - can use label encoding
features_tree = numerical_cols + encoded_cat_cols

# For linear models (Logistic Regression, SVM) - use ordinal + one-hot
features_linear = numerical_cols + ordinal_cols + ohe_cols

print(f"Features for tree models: {len(features_tree)}")
print(f"Features for linear models: {len(features_linear)}")


Features for tree models: 24
Features for linear models: 31


# Modeling

In [62]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

# Prepare data
X = df_train[features_tree]
y = df_train['diagnosed_diabetes']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Train: {X_train.shape[0]:,} samples")
print(f"Val: {X_val.shape[0]:,} samples")


Train: 560,000 samples
Val: 140,000 samples


In [63]:
# Prepare scaled data for linear models
X_linear = df_train[features_linear]
X_train_lin, X_val_lin, _, _ = train_test_split(X_linear, y, test_size=0.2, random_state=42, stratify=y)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_lin)
X_val_scaled = scaler.transform(X_val_lin)

print("Data scaled for linear models")


Data scaled for linear models


In [64]:
%%time
# 1. Logistic Regression
lr = LogisticRegression(max_iter=1000, random_state=42)
lr.fit(X_train_scaled, y_train)
lr_pred = lr.predict(X_val_scaled)
lr_acc = accuracy_score(y_val, lr_pred)
print(f"Logistic Regression: {lr_acc:.4f}")


Logistic Regression: 0.6644
CPU times: total: 1.61 s
Wall time: 413 ms


In [65]:
%%time
# 2. Random Forest
rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_val)
rf_acc = accuracy_score(y_val, rf_pred)
print(f"Random Forest: {rf_acc:.4f}")


Random Forest: 0.6650
CPU times: total: 3min 6s
Wall time: 18.3 s


In [66]:
%%time
# 3. Gradient Boosting
gb = GradientBoostingClassifier(n_estimators=100, max_depth=5, random_state=42)
gb.fit(X_train, y_train)
gb_pred = gb.predict(X_val)
gb_acc = accuracy_score(y_val, gb_pred)
print(f"Gradient Boosting: {gb_acc:.4f}")


Gradient Boosting: 0.6770
CPU times: total: 3min 49s
Wall time: 3min 52s


In [67]:
%%time
# 4. XGBoost (GPU)
from xgboost import XGBClassifier

xgb = XGBClassifier(
    n_estimators=100, 
    max_depth=6, 
    learning_rate=0.1, 
    random_state=42,
    tree_method='hist',
    device='cuda',
    verbosity=0
)
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_val)
xgb_acc = accuracy_score(y_val, xgb_pred)
print(f"XGBoost (GPU): {xgb_acc:.4f}")


XGBoost (GPU): 0.6783
CPU times: total: 6.19 s
Wall time: 1.96 s


In [68]:
%%time
# 5. LightGBM (GPU)
from lightgbm import LGBMClassifier

lgbm = LGBMClassifier(
    n_estimators=100, 
    max_depth=6, 
    learning_rate=0.1, 
    random_state=42, 
    device='gpu',  # <-- GPU acceleration
    verbose=-1
)
lgbm.fit(X_train, y_train)
lgbm_pred = lgbm.predict(X_val)
lgbm_acc = accuracy_score(y_val, lgbm_pred)
print(f"LightGBM (GPU): {lgbm_acc:.4f}")


LightGBM (GPU): 0.6794
CPU times: total: 13 s
Wall time: 2.66 s


In [69]:
# Results comparison
results = pd.DataFrame({
    'Model': ['Logistic Regression', 'Random Forest', 'Gradient Boosting', 'XGBoost', 'LightGBM'],
    'Accuracy': [lr_acc, rf_acc, gb_acc, xgb_acc, lgbm_acc]
}).sort_values('Accuracy', ascending=False)

print("=" * 40)
print("MODEL COMPARISON (Accuracy)")
print("=" * 40)
print(results.to_string(index=False))
print("=" * 40)
best_model = results.iloc[0]['Model']
best_acc = results.iloc[0]['Accuracy']
print(f"Best: {best_model} ({best_acc:.4f})")


MODEL COMPARISON (Accuracy)
              Model  Accuracy
           LightGBM  0.679407
            XGBoost  0.678307
  Gradient Boosting  0.676964
      Random Forest  0.664950
Logistic Regression  0.664357
Best: LightGBM (0.6794)


We are going to continue with LightGBM and XGBoost

# Model Improvement


In [70]:
import optuna
from sklearn.model_selection import cross_val_score
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [71]:
%%time
# Hyperparameter tuning for XGBoost with Optuna (GPU)
def objective_xgb(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'random_state': 42,
        'tree_method': 'hist',
        'device': 'cuda',
        'verbosity': 0
    }
    
    model = XGBClassifier(**params)
    scores = cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy')
    return scores.mean()

study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(objective_xgb, n_trials=30, show_progress_bar=True)

print(f"Best XGBoost accuracy (CV): {study_xgb.best_value:.4f}")


Best trial: 5. Best value: 0.683361: 100%|██████████| 30/30 [08:20<00:00, 16.68s/it]

Best XGBoost accuracy (CV): 0.6834
CPU times: total: 14min 29s
Wall time: 8min 20s





In [72]:
# Best XGBoost parameters found
print("Best XGBoost params:")
for k, v in study_xgb.best_params.items():
    print(f"  {k}: {v}")


Best XGBoost params:
  n_estimators: 466
  max_depth: 5
  learning_rate: 0.13685396236512357
  min_child_weight: 9
  subsample: 0.8167677754456445
  colsample_bytree: 0.6056940961298176
  reg_alpha: 1.5277483435281087e-06
  reg_lambda: 1.1879967032598484


In [73]:
%%time
# Train optimized XGBoost (GPU)
xgb_opt = XGBClassifier(**study_xgb.best_params, random_state=42, tree_method='hist', device='cuda', verbosity=0)
xgb_opt.fit(X_train, y_train)
xgb_opt_pred = xgb_opt.predict(X_val)
xgb_opt_acc = accuracy_score(y_val, xgb_opt_pred)

print(f"Optimized XGBoost (GPU): {xgb_opt_acc:.4f} (before: {xgb_acc:.4f})")
print(f"Improvement: {(xgb_opt_acc - xgb_acc)*100:.2f}%")


Optimized XGBoost (GPU): 0.6833 (before: 0.6783)
Improvement: 0.50%
CPU times: total: 10.6 s
Wall time: 5.75 s


In [74]:
%%time
# Hyperparameter tuning for LightGBM with Optuna (GPU)
def objective_lgbm(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'random_state': 42,
        'device': 'gpu',
        'verbose': -1
    }
    
    model = LGBMClassifier(**params)
    scores = cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy')
    return scores.mean()

study_lgbm = optuna.create_study(direction='maximize')
study_lgbm.optimize(objective_lgbm, n_trials=30, show_progress_bar=True)

print(f"Best LightGBM accuracy (CV): {study_lgbm.best_value:.4f}")


Best trial: 28. Best value: 0.683673: 100%|██████████| 30/30 [14:40<00:00, 29.34s/it]

Best LightGBM accuracy (CV): 0.6837
CPU times: total: 1h 22min 46s
Wall time: 14min 40s





In [75]:
# Best LightGBM parameters found
print("Best LightGBM params:")
for k, v in study_lgbm.best_params.items():
    print(f"  {k}: {v}")


Best LightGBM params:
  n_estimators: 352
  max_depth: 11
  learning_rate: 0.06501279852289088
  num_leaves: 122
  min_child_samples: 43
  subsample: 0.7435239638223387
  colsample_bytree: 0.6009309753449563
  reg_alpha: 0.048005549238116146
  reg_lambda: 0.781592542334054


In [76]:
%%time
# Train optimized LightGBM (GPU)
lgbm_opt = LGBMClassifier(**study_lgbm.best_params, random_state=42, device='gpu', verbose=-1)
lgbm_opt.fit(X_train, y_train)
lgbm_opt_pred = lgbm_opt.predict(X_val)
lgbm_opt_acc = accuracy_score(y_val, lgbm_opt_pred)

print(f"Optimized LightGBM (GPU): {lgbm_opt_acc:.4f} (before: {lgbm_acc:.4f})")
print(f"Improvement: {(lgbm_opt_acc - lgbm_acc)*100:.2f}%")


Optimized LightGBM (GPU): 0.6849 (before: 0.6794)
Improvement: 0.55%
CPU times: total: 1min 36s
Wall time: 17.1 s


In [77]:
# Final comparison
final_results = pd.DataFrame({
    'Model': ['LightGBM (base)', 'LightGBM (tuned)', 'XGBoost (base)', 'XGBoost (tuned)'],
    'Accuracy': [lgbm_acc, lgbm_opt_acc, xgb_acc, xgb_opt_acc]
}).sort_values('Accuracy', ascending=False)

print("FINAL COMPARISON")

print(final_results.to_string(index=False))

# Select best model
if lgbm_opt_acc >= xgb_opt_acc:
    best_final_model = lgbm_opt
    best_final_name = "LightGBM (tuned)"
    best_final_acc = lgbm_opt_acc
else:
    best_final_model = xgb_opt
    best_final_name = "XGBoost (tuned)"
    best_final_acc = xgb_opt_acc

print(f"\nBest model: {best_final_name} ({best_final_acc:.4f})")


FINAL COMPARISON
           Model  Accuracy
LightGBM (tuned)  0.684886
 XGBoost (tuned)  0.683336
 LightGBM (base)  0.679407
  XGBoost (base)  0.678307

Best model: LightGBM (tuned) (0.6849)


# Kaggle Submission


In [None]:
# Retrain best model on full training data (GPU)
X_full = df_train[features_tree]
y_full = df_train['diagnosed_diabetes']

final_model = LGBMClassifier(**study_lgbm.best_params, random_state=42, device='gpu', verbose=-1)
final_model.fit(X_full, y_full)

print(f"Final model trained on {len(X_full):,} samples")


Final model (GPU) trained on 700,000 samples


In [79]:
# Prepare test data and make predictions
X_test = df_test[features_tree]
test_predictions = final_model.predict(X_test)

print(f"Predictions made for {len(test_predictions):,} test samples")
print(f"Prediction distribution: {pd.Series(test_predictions).value_counts(normalize=True).round(3).to_dict()}")


Predictions made for 300,000 test samples
Prediction distribution: {1.0: 0.689, 0.0: 0.311}


In [80]:
# Create submission file
submission = pd.DataFrame({
    'id': df_test['id'],
    'diagnosed_diabetes': test_predictions.astype(int)
})

submission.to_csv('submission.csv', index=False)

print(f"Submission saved to 'submission.csv'")
print(f"Shape: {submission.shape}")
submission.head(10)


Submission saved to 'submission.csv'
Shape: (300000, 2)


Unnamed: 0,id,diagnosed_diabetes
0,700000,1
1,700001,1
2,700002,1
3,700003,0
4,700004,1
5,700005,1
6,700006,1
7,700007,1
8,700008,1
9,700009,1
