In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import joblib

# ---------------------------------------------------------------------------
# Load the training data and build the feature matrix / target vector.
# ---------------------------------------------------------------------------
train_df = pd.read_csv("train_A.csv")

# Separate target: encode the string labels as 0 ("on_time") / 1 ("late").
# Series.map yields NaN for any value that is not a key of the mapping, so
# fail fast with a clear message instead of letting NaN labels reach the
# stratified split and the classifier.
label_map = {"on_time": 0, "late": 1}
y = train_df["is_late"].map(label_map)
unmapped = y.isna()
if unmapped.any():
    unexpected = train_df.loc[unmapped, "is_late"].unique().tolist()
    raise ValueError(
        f"Unexpected values in 'is_late' (expected one of {list(label_map)}): "
        f"{unexpected}"
    )
X = train_df.drop(columns=["id", "is_late"])

# One-hot encoding
X = pd.get_dummies(X)

# Save feature columns so that other data can later be aligned to exactly
# this column set and order.
feature_columns = X.columns

# Train-validation split (Stratified): 20% held out, class proportions of y
# preserved in both parts, fixed seed for reproducibility.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)
# Hyper-parameters for the gradient-boosted classifier, gathered in one place
# and unpacked into the constructor below.
xgb_params = dict(
    n_estimators=500,
    max_depth=15,
    learning_rate=0.008283,
    subsample=0.735985,
    colsample_bytree=0.577932,
    colsample_bylevel=0.820142,
    min_child_weight=1,
    gamma=0.310924,
    reg_alpha=0.006871,
    reg_lambda=0.008965,
    random_state=42,
    eval_metric="logloss",
    n_jobs=-1,
)
model = xgb.XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of class 1 on the held-out
# validation rows.
proba_valid = model.predict_proba(X_valid)[:, 1]

# Sweep 100 evenly spaced decision cut-offs in [0.1, 0.9] and score each one
# by macro-averaged F1 on the validation set.
thresholds = np.linspace(0.1, 0.9, 100)
f1_scores = []
for cutoff in thresholds:
    hard_preds = (proba_valid >= cutoff).astype(int)
    f1_scores.append(f1_score(y_valid, hard_preds, average="macro"))

# Keep the first cut-off that reaches the highest macro F1.
best_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_idx]
best_macro_f1 = f1_scores[best_idx]

print("Best Threshold:", best_threshold)
print("Validation Macro F1:", best_macro_f1)

# Persist everything needed to score new data: the fitted model, the
# training-time column layout and the tuned decision threshold.
bundle = {
    "model": model,
    "feature_columns": feature_columns,
    "threshold": best_threshold,
}
joblib.dump(bundle, "xgb_bundle.pkl")

print("✓ Model bundle saved")
print("\nGenerating submission file...")

# Read the test set and report its dimensions.
test_df = pd.read_csv("test_A.csv")
print("Test dataset shape:", test_df.shape)

# Keep the row identifiers for the output file.
test_ids = test_df["id"]

# Build the test feature matrix in one chain:
#   1. drop the id column (and the target, should it be present),
#   2. one-hot encode,
#   3. align to the training column layout, filling dummy columns that are
#      absent from the test data with 0 and discarding any extra columns.
X_test = (
    test_df
    .drop(columns=["id", "is_late"], errors="ignore")
    .pipe(pd.get_dummies)
    .reindex(columns=feature_columns, fill_value=0)
)

# Class-1 probabilities, then hard 0/1 decisions using the tuned threshold.
proba_test = model.predict_proba(X_test)[:, 1]
pred_binary = (proba_test >= best_threshold).astype(int)

# Translate 0/1 into the required string labels via a lookup table
# (index 0 -> "on_time", index 1 -> "late").
label_lookup = np.array(["on_time", "late"])
pred_labels = label_lookup[pred_binary]

# Assemble the two-column submission and write it out without the index.
submission_columns = {
    "id": test_ids,
    "prediction": pred_labels,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv("submission.csv", index=False)

print("✓ submission.csv created successfully")
print(submission.head())

Best Threshold: 0.3424242424242424
Validation Macro F1: 0.5439643339270116
✓ Model bundle saved

Generating submission file...
Test dataset shape: (600, 18)
✓ submission.csv created successfully
   id prediction
0   1    on_time
1   2       late
2   3       late
3   4       late
4   5       late
