In [1]:
# XGBoost + SMOTE (One-Hot) with MTRANS & SMOKE dropped + Hyperparameter Tuning + Submission
import warnings, pandas as pd, numpy as np
from pathlib import Path

from sklearn.model_selection import StratifiedKFold, cross_val_score, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# Suppress all warnings for the rest of the session.
warnings.filterwarnings(action="ignore")

# -----------------------------
# 1) Load data
# -----------------------------
# Name of the label column expected in the training file.
TARGET = "WeightCategory"

# Input files, given relative to the current working directory.
TRAIN, TEST, SAMPLE_SUB = map(Path, ("train.csv", "test.csv", "sample_submission.csv"))

# Read the three CSVs in the order train -> test -> sample submission.
train, test, sample_sub = (pd.read_csv(src) for src in (TRAIN, TEST, SAMPLE_SUB))

# Stop immediately when the training file does not carry the label column.
assert TARGET in train.columns, f"{TARGET} not found in train columns"

# -----------------------------
# 2) Drop requested columns
# -----------------------------
# Features removed from both frames. errors="ignore" skips any name that is
# not present, so only the columns that actually exist are dropped.
cols_to_drop = ["MTRANS", "SMOKE"]
train = train.drop(columns=cols_to_drop, errors="ignore")
test = test.drop(columns=cols_to_drop, errors="ignore")

# -----------------------------
# 3) Split features/target
# -----------------------------
# Feature frame: every training column except the label.
X = train.drop(columns=TARGET)
# Label column exactly as read from the file.
y_raw = train[TARGET]

# XGBoost expects numeric labels, so encode the raw labels as integers.
# The fitted encoder `le` is kept to map integer predictions back later.
le = LabelEncoder().fit(y_raw)
y = le.transform(y_raw)
print("Classes:", list(le.classes_))

# -----------------------------
# 4) Column types
# -----------------------------
# The first column of the sample submission is treated as the row identifier
# (section 9 uses the same convention). An identifier carries no signal about
# the target, and SMOTE would interpolate it into meaningless synthetic values,
# so it must not be listed as a feature. Columns that appear in neither list
# below are discarded by the ColumnTransformer, whose `remainder` defaults to
# "drop"; X itself is left untouched.
row_id_col = sample_sub.columns[0] if len(sample_sub.columns) else None
feature_view = X.drop(columns=[row_id_col], errors="ignore")

cat_cols = feature_view.select_dtypes(include=["object", "category"]).columns.tolist()
num_cols = feature_view.select_dtypes(include=[np.number]).columns.tolist()
print(f"Numeric: {len(num_cols)} | Categorical: {len(cat_cols)}")
if row_id_col in X.columns:
    print(f"Excluded identifier column from features: {row_id_col}")

# -----------------------------
# 5) Preprocessing (One-Hot, DENSE so SMOTE can work)
#    Dense output is requested through `sparse_output` (scikit-learn >= 1.2);
#    if the constructor rejects that keyword, the older `sparse` is used.
# -----------------------------
def _dense_one_hot_encoder():
    """Build a OneHotEncoder that ignores unseen categories and returns dense arrays.

    The `sparse_output` keyword is tried first; when the constructor raises
    TypeError, the call is repeated with the legacy `sparse` keyword.
    """
    try:
        return OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        return OneHotEncoder(handle_unknown="ignore", sparse=False)


# Numeric columns: fill missing values with the column median.
numeric_transformer = SimpleImputer(strategy="median")

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode.
ohe = _dense_one_hot_encoder()
categorical_transformer = ImbPipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", ohe),
    ]
)

# Route each column group to its transformer.
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols),
    ]
)

# -----------------------------
# 6) Define SMOTE + XGBoost in a pipeline
#    SMOTE happens *after* preprocessing, *inside* CV folds — avoids leakage.
# -----------------------------
# One seed shared by the resampler, the booster and the fold splitter.
SEED = 42

smote = SMOTE(random_state=SEED)

xgb = XGBClassifier(
    # solid baseline; tuning will override
    n_estimators=400,
    learning_rate=0.1,
    max_depth=6,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0.0,
    reg_lambda=1.0,
    reg_alpha=0.0,
    eval_metric="mlogloss",
    tree_method="hist",      # fast & memory-friendly
    # One thread per booster: cross_val_score below and the randomized search
    # in section 7 already fan the fits out over all cores (n_jobs=-1).
    # Letting every one of those workers also request all cores creates thread
    # contention, which the XGBoost documentation warns slows both layers down.
    n_jobs=1,
    random_state=SEED,
)

# Preprocess -> oversample -> classify, as a single estimator.
pipe = ImbPipeline([
    ("preprocess", preprocess),
    ("smote", smote),
    ("xgb", xgb),
])

# Stratified, shuffled 5-fold split reused for the baseline and the search.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

# Baseline CV accuracy with SMOTE
baseline_scores = cross_val_score(pipe, X, y, cv=cv, scoring="accuracy", n_jobs=-1)
print(f"Baseline (with SMOTE) CV Accuracy: mean={baseline_scores.mean():.5f}  std={baseline_scores.std():.5f}")

# -----------------------------
# 7) Hyperparameter Tuning (Randomized Search)
# -----------------------------
# Candidate values per parameter; the "xgb__" prefix addresses the "xgb" step
# of the pipeline.
param_distributions = {
    "xgb__n_estimators":        [300, 500, 800, 1000, 1200],
    "xgb__learning_rate":       [0.03, 0.05, 0.07, 0.1, 0.15],
    "xgb__max_depth":           [3, 4, 5, 6, 7, 8],
    "xgb__min_child_weight":    [1, 2, 3, 4, 5],
    "xgb__subsample":           [0.6, 0.7, 0.8, 0.9, 1.0],
    "xgb__colsample_bytree":    [0.6, 0.7, 0.8, 0.9, 1.0],
    "xgb__gamma":               [0, 0.5, 1, 1.5, 2],
    "xgb__reg_lambda":          [0.5, 1.0, 1.5, 2.0, 3.0],
    "xgb__reg_alpha":           [0.0, 0.1, 0.5, 1.0],
    # Optional: SMOTE's neighbourhood size can be searched too when classes
    # are very imbalanced:
    # "smote__k_neighbors":       [3, 5, 7],
}

# Search configuration, kept apart from the estimator and the grid.
search_settings = dict(
    n_iter=25,            # 20–30 for speed; 50–80 for more thorough
    scoring="accuracy",
    cv=cv,
    n_jobs=-1,
    random_state=42,
    verbose=1,
    refit=True,           # retrain on FULL train set with best params
)
search = RandomizedSearchCV(pipe, param_distributions, **search_settings)
search.fit(X, y)

# Pipeline refitted with the winning parameter combination.
best_model = search.best_estimator_

print("\nBest CV Accuracy:", f"{search.best_score_:.6f}")
print("Best Params:")
for param_name, param_value in search.best_params_.items():
    print(f"  {param_name}: {param_value}")

# -----------------------------
# 8) Predict on test (inverse-transform labels)
# -----------------------------
# The model predicts integer class codes; map them back to the original labels.
test_pred_int = best_model.predict(test)
test_pred = le.inverse_transform(test_pred_int)

# -----------------------------
# 9) Build Kaggle submission
# -----------------------------
# Column names come from the sample submission: first column = identifier,
# second column = prediction (falling back to TARGET if there is no second).
ss_cols = sample_sub.columns.tolist()
id_col = ss_cols[0]
target_col = ss_cols[1] if len(ss_cols) > 1 else TARGET

# Choose the identifier values, most reliable source first.
if id_col in test.columns:
    id_values = test[id_col].values
elif len(sample_sub) == len(test_pred):
    # test.csv has no identifier column, but the sample submission lists the
    # identifiers the grader expects.
    # NOTE(review): assumes sample_submission rows follow test.csv row order.
    id_values = sample_sub[id_col].values
else:
    # Last resort: positional ids. Reported loudly via print because warnings
    # are silenced at the top of this cell.
    id_values = np.arange(len(test_pred))
    print(f"NOTE: '{id_col}' not found in test and sample_submission length differs; "
          "using 0..n-1 as ids")

sub_df = pd.DataFrame({id_col: id_values, target_col: test_pred})

# align to sample_submission header if possible
if set(sub_df.columns) == set(sample_sub.columns):
    sub_df = sub_df[sample_sub.columns]

out_path = Path("submission_xgboost_smote.csv")
sub_df.to_csv(out_path, index=False)
print(f"\nSaved submission -> {out_path.resolve()}")
sub_df.head()


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject