In [None]:
!pip install xgboost



In [None]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
#from ydata_profiling import ProfileReport
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the labelled training set and the unlabelled scoring set from the
# working directory, in that order.
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

In [None]:
# Columns holding string categories that must be integer-encoded before modelling.
categorical_cols = [
    'Gender',
    'family_history_with_overweight',
    'FAVC',
    'CAEC',
    'SMOKE',
    'SCC',
    'CALC',
    'MTRANS',
]

# Kept as a one-element list; downstream code reads the name via target_col[0].
target_col = ['WeightCategory']

In [None]:
from sklearn.preprocessing import LabelEncoder

# NOTE: this cell overwrites columns of `train` and `test` in place. Re-running
# it without reloading the CSVs would re-fit the encoders on already-encoded
# values, so the stored encoders would no longer map back to the original labels.

# One fitted encoder per categorical feature, keyed by column name, so the
# integer codes can be mapped back to the original strings later.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    # Learn the vocabulary from the training data only ...
    le.fit(train[col].astype(str))
    train[col] = le.transform(train[col].astype(str))
    # ... and reuse it for the test data (raises if test holds an unseen label).
    test[col] = le.transform(test[col].astype(str))
    label_encoders[col] = le

# The target is encoded on the training frame only; the test frame is not
# touched here (presumably it carries no target column - confirm).
target_encoder = LabelEncoder()
target_encoder.fit(train[target_col[0]].astype(str))
train[target_col[0]] = target_encoder.transform(train[target_col[0]].astype(str))

In [None]:
# Target vector: the (already integer-encoded) weight category.
y = train['WeightCategory']
# Feature matrix: everything except the target and the row identifier.
x = train.drop(columns=['WeightCategory', 'id'])

In [None]:
# Keep the test-set identifiers aside and build the matching feature matrix
# without the id column.
a = test['id']
z = test.drop(columns=['id'])

In [None]:
# Hold out 20% of the labelled rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
pip install optuna

Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.5.0-py3-none-any.whl (400 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.5.0


In [None]:
import optuna
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score


In [None]:
from sklearn.model_selection import StratifiedKFold, cross_validate

def objective(trial):
    """Optuna objective for a narrow XGBoost search around a known-good region.

    Samples one hyper-parameter configuration from ``trial``, builds an
    ``XGBClassifier`` with it and scores it with 5-fold stratified
    cross-validation on the module-level ``X_train`` / ``y_train``.

    NOTE: a later cell in this notebook defines another function that is also
    called ``objective``; whichever definition ran last is the one
    ``study.optimize(objective, ...)`` picks up.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to draw the hyper-parameter values.

    Returns
    -------
    float
        Mean accuracy over the five validation folds.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 570, 700),
        'max_depth': trial.suggest_int('max_depth', 5, 8),
        'learning_rate': trial.suggest_float('learning_rate', 0.0475, 0.0495, step=0.0001),
        'subsample': trial.suggest_float('subsample', 0.518, 0.538),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.407, 0.5),
        'gamma': trial.suggest_float('gamma', 0.590, 0.896),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 4),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.25, 0.6),
        'reg_lambda': trial.suggest_float('reg_lambda', 1.4, 2.8),
        'random_state': 42,
        'eval_metric': 'mlogloss',
        # `use_label_encoder` is intentionally not passed: it is deprecated in
        # recent XGBoost releases and only produces an "unused parameter" warning.
        'n_jobs': -1
    }

    model = XGBClassifier(**params)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    scores = cross_validate(
        model,
        X_train,
        y_train,
        cv=cv,
        scoring='accuracy',
        # The booster already uses every core (n_jobs=-1 above); running the
        # folds sequentially avoids stacking a second layer of parallelism on
        # top of it.
        n_jobs=1,
        return_train_score=False
    )

    return scores['test_score'].mean()


2025-10-24 10:46:38,769] Trial 171 finished with value: 0.9097859886592282 and parameters: {'n_estimators': 591, 'max_depth': 8, 'learning_rate': 0.0486, 'subsample': 0.5294362645863591, 'colsample_bytree': 0.40810876257770307, 'gamma': 0.5914696370814657, 'min_child_weight': 1, 'reg_alpha': 0.35196879936563696, 'reg_lambda': 1.5459952657701226}. Best is trial 171 with value: 0.9097859886592282.

2025-10-24 14:19:29,939] Trial 211 finished with value: 0.907210989185313 and parameters: {'n_estimators': 579, 'max_depth': 8, 'learning_rate': 0.0487, 'subsample': 0.5360427193925438, 'colsample_bytree': 0.4101385332050717, 'gamma': 0.5915864928857023, 'min_child_weight': 1, 'reg_alpha': 0.44530608551240997, 'reg_lambda': 2.099943285494622}. Best is trial 211 with value: 0.907210989185313.

[I 2025-10-24 19:09:23,089] Trial 171 finished with value: 0.9072915044571532 and parameters: {'n_estimators': 603, 'max_depth': 8, 'learning_rate': 0.0486, 'subsample': 0.5256748066272386, 'colsample_bytree': 0.4120439796865733, 'gamma': 0.5956033365053646, 'min_child_weight': 1, 'reg_alpha': 0.5431248836578476, 'reg_lambda': 2.6978553976423196}. Best is trial 171 with value: 0.9072915044571532.

In [None]:
import optuna
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from scipy import stats

# -----------------------------
# 1) Preprocessing functions
# -----------------------------
def add_derived_features(df):
    """Return a copy of ``df`` with engineered columns appended.

    Each feature is added only when all of its source columns are present, so
    the function is safe to call on frames that lack some of them. The input
    frame is never modified.

    Columns that may be added, in this order:
      * ``Height_m`` - ``Height``, with values above 3 divided by 100
        (presumably centimetres converted to metres).
      * ``BMI`` - ``Weight / Height_m**2`` (with a tiny epsilon in the divisor).
      * ``Healthy_Lifestyle_Score`` - weighted sum of ``FCVC``, ``FAF``, ``CH2O``.
      * ``BMI_x_activity`` - ``BMI * PhysicalActivity``.
      * ``Water_per_meal`` - ``CH2O / NMeals``.
      * ``Activity_to_Tech`` - ``PhysicalActivity / TechUse``.
    """
    out = df.copy()
    eps = 1e-9  # keeps the divisions finite when a denominator is zero

    def has(*names):
        # Checked against the live frame so columns created earlier in this
        # call (BMI in particular) count as present.
        return all(name in out.columns for name in names)

    def to_metres(h):
        if h > 3:
            return h / 100.0
        return h

    if has('Weight', 'Height'):
        out['Height_m'] = out['Height'].apply(to_metres)
        out['BMI'] = out['Weight'] / (out['Height_m'] ** 2 + eps)

    if has('FCVC', 'FAF', 'CH2O'):
        out['Healthy_Lifestyle_Score'] = 0.3*out['FCVC'] + 0.3*out['FAF'] + 0.4*out['CH2O']

    if has('BMI', 'PhysicalActivity'):
        out['BMI_x_activity'] = out['BMI'] * out['PhysicalActivity']

    if has('CH2O', 'NMeals'):
        out['Water_per_meal'] = out['CH2O'] / (out['NMeals'] + eps)

    if has('TechUse', 'PhysicalActivity'):
        out['Activity_to_Tech'] = out['PhysicalActivity'] / (out['TechUse'] + eps)

    return out

def log_transform_skewed(df, numeric_cols):
    """Return a copy of ``df`` with ``log1p`` applied to strongly skewed columns.

    A column listed in ``numeric_cols`` is transformed only when every one of
    its values is strictly positive and the absolute sample skewness exceeds
    1.0. All other columns are left untouched, and the input frame is not
    modified.

    The decision is made from the data passed in, so calling this on two
    different frames may transform different sets of columns.
    """
    out = df.copy()
    for name in numeric_cols:
        values = out[name]
        # Skip columns containing zero, negative or missing entries (a missing
        # value fails the `> 0` comparison as well).
        if not (values > 0).all():
            continue
        skewness = stats.skew(values.dropna())
        if abs(skewness) > 1.0:
            out[name] = np.log1p(values)
    return out

def prepare_Xy(X):
    """Run the feature-engineering steps on ``X`` and report its column groups.

    Derived features are added first. The numeric / categorical column lists
    are then read from the dtypes of that enriched frame, and finally the
    skew-driven log transform is applied to the numeric columns.

    Returns
    -------
    tuple
        ``(prepared_frame, numeric_cols, cat_cols)`` where the two lists hold
        the names of the numeric and the object/category columns.
    """
    enriched = add_derived_features(X)

    # Column groups are taken before the log transform is applied.
    numeric_cols = enriched.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = enriched.select_dtypes(include=['object','category']).columns.tolist()

    prepared = log_transform_skewed(enriched, numeric_cols)
    return prepared, numeric_cols, cat_cols

# -----------------------------
# 2) Build pipeline
# -----------------------------
def make_model_pipeline(params, numeric_cols, cat_cols):
    """Build a preprocessing + XGBoost classification pipeline.

    Parameters
    ----------
    params : dict or None
        Keyword arguments for ``XGBClassifier``. They are layered on top of the
        defaults set here (``objective``, ``eval_metric``, ``n_jobs``), so a
        caller may override any of those without triggering a duplicate-keyword
        ``TypeError``. ``None`` is treated as an empty dict.
    numeric_cols : list of str
        Columns that are median-imputed and then standardised.
    cat_cols : list of str
        Columns that are mode-imputed and then one-hot encoded; categories not
        seen during fitting are ignored at transform time.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with the steps ``'preproc'`` and ``'model'``. Columns
        that appear in neither list are dropped by the preprocessor.
    """
    num_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    cat_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ohe', OneHotEncoder(handle_unknown='ignore'))
    ])
    preproc = ColumnTransformer([
        ('num', num_transformer, numeric_cols),
        ('cat', cat_transformer, cat_cols)
    ], remainder='drop')

    # Defaults first, caller-supplied values last so they win on conflict.
    # `use_label_encoder` is intentionally not passed: it is deprecated in
    # recent XGBoost releases and only produces an "unused parameter" warning.
    xgb_kwargs = {
        'objective': 'multi:softprob',
        'eval_metric': 'mlogloss',
        'n_jobs': -1,
        **(params or {}),
    }
    xgb = XGBClassifier(**xgb_kwargs)

    pipe = Pipeline([
        ('preproc', preproc),
        ('model', xgb)
    ])
    return pipe

# -----------------------------
# 3) Prepare data once
# -----------------------------
# Labels are used as-is; y_full is only a second name for y_train.
y_full = y_train
# Feature engineering runs once here, on the X_train built in an earlier cell,
# rather than inside every Optuna trial.
Xp_full, numeric_cols, cat_cols = prepare_Xy(X_train)

# -----------------------------
# 4) Optuna objective
# -----------------------------
def objective(trial):
    """Optuna objective for the wide search over the preprocessing pipeline.

    Draws a booster type, a grow policy and the numeric hyper-parameters from
    ``trial``, builds the pipeline via ``make_model_pipeline`` and returns its
    mean accuracy under 3-fold stratified cross-validation on the module-level
    ``Xp_full`` / ``y_full``.

    NOTE: this definition replaces the earlier function in this notebook that
    is also named ``objective``.
    """
    # The two categorical choices are drawn first, ahead of the numeric ranges.
    chosen_booster = trial.suggest_categorical('booster', ['gbtree','dart'])
    chosen_policy = trial.suggest_categorical('grow_policy', ['depthwise','lossguide'])

    search_point = dict(
        n_estimators=trial.suggest_int('n_estimators', 300, 600),
        max_depth=trial.suggest_int('max_depth', 3, 12),
        learning_rate=trial.suggest_float('learning_rate', 1e-3, 0.2, log=True),
        subsample=trial.suggest_float('subsample', 0.5, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.3, 1.0),
        gamma=trial.suggest_float('gamma', 0.0, 1.0),
        min_child_weight=trial.suggest_int('min_child_weight', 1, 10),
        reg_alpha=trial.suggest_float('reg_alpha', 0.0, 1.0),
        reg_lambda=trial.suggest_float('reg_lambda', 0.5, 5.0),
        booster=chosen_booster,
        grow_policy=chosen_policy,
        verbosity=0,
    )

    candidate = make_model_pipeline(search_point, numeric_cols, cat_cols)
    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    cv_result = cross_validate(
        candidate,
        Xp_full,
        y_full,
        cv=folds,
        scoring='accuracy',
        n_jobs=1,
    )
    return cv_result['test_score'].mean()

# -----------------------------
# 5) Run Optuna study
# -----------------------------
# TPE is already Optuna's default sampler for single-objective studies; it is
# created explicitly here only to fix its seed, so that a re-run proposes the
# same sequence of trials.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100, show_progress_bar=True)

print("Best trial params:", study.best_trial.params)
print("Best trial accuracy:", study.best_value)


[I 2025-10-25 07:42:22,833] A new study created in memory with name: no-name-aa4b9aa0-1d55-48a4-adee-00318d6ef097


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-10-25 07:42:55,949] Trial 0 finished with value: 0.8940125543215838 and parameters: {'booster': 'gbtree', 'grow_policy': 'lossguide', 'n_estimators': 598, 'max_depth': 8, 'learning_rate': 0.005165675722849387, 'subsample': 0.5786021354250325, 'colsample_bytree': 0.7669456973305728, 'gamma': 0.2515724524363616, 'min_child_weight': 6, 'reg_alpha': 0.16741539875601652, 'reg_lambda': 0.9394583934569212}. Best is trial 0 with value: 0.8940125543215838.
[I 2025-10-25 07:43:12,235] Trial 1 finished with value: 0.9007725736359248 and parameters: {'booster': 'gbtree', 'grow_policy': 'lossguide', 'n_estimators': 386, 'max_depth': 7, 'learning_rate': 0.05171214923253784, 'subsample': 0.5531447366270147, 'colsample_bytree': 0.7399632883884177, 'gamma': 0.18066089612211322, 'min_child_weight': 9, 'reg_alpha': 0.9882158546818965, 'reg_lambda': 3.5068395053606967}. Best is trial 1 with value: 0.9007725736359248.
[I 2025-10-25 08:06:13,677] Trial 2 finished with value: 0.8939320779011749 and p

In [None]:
# Persist trials in a local SQLite file so an interrupted search can be resumed.
storage_name = "sqlite:///xgb_optuna_v1.db"
study = optuna.create_study(
    study_name="xgb_tuning_v4",
    storage=storage_name,
    direction="maximize",
    # Re-attach to the stored study on re-run instead of failing because a
    # study with this name already exists in the database.
    load_if_exists=True,
)

# Seed the search with one hand-picked configuration so it is evaluated first.
# skip_if_exists avoids queueing the same point again when this cell is re-run
# against the persisted study.
# NOTE(review): these values are close to, but not identical to, the best
# trials in the logs kept above - confirm they are the intended starting point.
study.enqueue_trial(
    {
        'n_estimators': 625,
        'max_depth': 8,
        'learning_rate': 0.0486,
        'subsample': 0.5256748066272386,
        'colsample_bytree': 0.4120439796865733,
        'gamma': 0.590603365053646,
        'min_child_weight': 2,
        'reg_alpha': 0.54312,
        'reg_lambda': 2.7
    },
    skip_if_exists=True,
)

# NOTE(review): two functions named `objective` are defined in this notebook;
# the one executed most recently is what gets optimised here. The seed above has
# no 'booster' / 'grow_policy' entries, so it matches the narrow-range objective.
study.optimize(objective, n_trials=300, show_progress_bar=True)


[I 2025-10-25 05:06:11,425] A new study created in RDB with name: xgb_tuning_v4


  0%|          | 0/300 [00:00<?, ?it/s]

[I 2025-10-25 05:06:37,320] Trial 0 finished with value: 0.9055210425869781 and parameters: {'n_estimators': 625, 'max_depth': 8, 'learning_rate': 0.0486, 'subsample': 0.5256748066272386, 'colsample_bytree': 0.4120439796865733, 'gamma': 0.590603365053646, 'min_child_weight': 2, 'reg_alpha': 0.54312, 'reg_lambda': 2.7}. Best is trial 0 with value: 0.9055210425869781.
[I 2025-10-25 05:06:59,973] Trial 1 finished with value: 0.9043942172746859 and parameters: {'n_estimators': 661, 'max_depth': 8, 'learning_rate': 0.0486, 'subsample': 0.5310233472365982, 'colsample_bytree': 0.4821274854704905, 'gamma': 0.7594675603754767, 'min_child_weight': 3, 'reg_alpha': 0.5965084991146913, 'reg_lambda': 1.84331808971243}. Best is trial 0 with value: 0.9055210425869781.
[I 2025-10-25 05:07:19,743] Trial 2 finished with value: 0.9057623617813073 and parameters: {'n_estimators': 595, 'max_depth': 8, 'learning_rate': 0.0484, 'subsample': 0.5323315564766823, 'colsample_bytree': 0.42767251064516315, 'gamma':

KeyboardInterrupt: 