In [94]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE
import xgboost as xgb
import warnings
import joblib

# Install a blanket "ignore" filter: every warning category is silenced from
# here on, which also hides deprecation notices raised by the libraries used
# in this notebook.
warnings.simplefilter('ignore')

In [95]:
# Directory holding the competition CSV files (Kaggle input mount).
DATA_DIR = '/kaggle/input/datasets/fathichellyy/insat-competition'

df_train = pd.read_csv(f'{DATA_DIR}/train.csv')
df_test  = pd.read_csv(f'{DATA_DIR}/test.csv')

TARGET = "Purchased_Coverage_Bundle"

# Row identifier: not a model feature, but the submission file needs it.
ID_COL = "User_ID"

columns_to_drop = [
    "User_ID", "Policy_Cancelled_Post_Purchase",
    "Employer_ID", "Policy_Start_Week", "Policy_Start_Day",
    "Broker_ID"  # too many nulls
]

df_train = df_train.drop(columns=columns_to_drop, errors="ignore")

# Keep ID_COL in df_test. Dropping it here made the submission cell fall back
# to np.arange(len(df_test)) and write fabricated IDs. The identifier never
# reaches the model: the test feature matrix is built by reindexing df_test to
# the training feature columns, which no longer contain ID_COL.
test_columns_to_drop = [col for col in columns_to_drop if col != ID_COL]
df_test = df_test.drop(columns=test_columns_to_drop, errors="ignore")

In [96]:
# Partition the training columns by dtype. TARGET is removed from the numeric
# list (if present there) so it is never imputed as a feature.
numeric_frame = df_train.select_dtypes(include=[np.number])
num_cols = numeric_frame.columns.drop(TARGET, errors='ignore').tolist()
cat_cols = df_train.select_dtypes(include=['object', 'category']).columns.tolist()

# Fill values are computed on the training frame only and then reused for the
# test frame: median for numeric columns, most frequent value for categoricals.
fill_values = {col: df_train[col].median() for col in num_cols}
fill_values.update({col: df_train[col].mode()[0] for col in cat_cols})

for col, fill in fill_values.items():
    for frame in (df_train, df_test):
        frame[col] = frame[col].fillna(fill)

In [97]:
# Raw target and feature matrix from the training frame; the test frame is
# aligned to exactly the same columns, in the same order.
y = df_train[TARGET].values
X = df_train.drop(columns=[TARGET])
X_test = df_test.reindex(columns=X.columns)

# Ordinal-encode the categorical columns. The encoder is fit on the training
# features; categories it has not seen are encoded as -1.
oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
oe.fit(X[cat_cols])
X[cat_cols] = oe.transform(X[cat_cols])
X_test[cat_cols] = oe.transform(X_test[cat_cols])

# Standardise every feature column.
# NOTE(review): the scaler is fit on all of X, i.e. before the train/validation
# split, so its statistics include the rows later held out for validation.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X_test_scaled = scaler.transform(X_test)

# Map the raw target labels to consecutive integer codes.
le = LabelEncoder().fit(y)
y_enc = le.transform(y)
NUM_CLASSES = len(le.classes_)
print(f"Number of classes: {NUM_CLASSES}")

Number of classes: 10


In [98]:
# Hold out 10% of the rows for validation, stratified on the encoded target so
# both parts keep the same class proportions; seeded for reproducibility.
split_parts = train_test_split(
    X_scaled,
    y_enc,
    test_size=0.1,
    stratify=y_enc,
    random_state=42,
)
X_train, X_val, y_train, y_val = split_parts

In [99]:
# Oversample the minority classes of the training split with SMOTE so every
# class ends up with as many rows as the largest one.
class_counts = pd.Series(y_train).value_counts()
print("Before SMOTE:\n", class_counts.sort_index())

# SMOTE interpolates between a sample and its k nearest same-class neighbours,
# so k must stay below the size of the rarest class.
min_samples = int(class_counts.min())
if min_samples < 2:
    # min(5, min_samples - 1) would be 0 here, which SMOTE rejects with an
    # opaque parameter error; fail early with an actionable message instead.
    raise ValueError(
        "SMOTE needs at least 2 samples in every class, but the rarest class "
        f"in y_train has {min_samples}."
    )
k_neighbors = min(5, min_samples - 1)

smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("\nAfter SMOTE:\n", pd.Series(y_train_res).value_counts().sort_index())

Before SMOTE:
 0      741
1     1463
2    32522
3     4348
4    12562
5      431
6      647
7     2057
8        5
9        5
Name: count, dtype: int64

After SMOTE:
 0    32522
1    32522
2    32522
3    32522
4    32522
5    32522
6    32522
7    32522
8    32522
9    32522
Name: count, dtype: int64


In [116]:
# Multiclass gradient-boosted trees trained on the SMOTE-resampled split.
# `use_label_encoder` is intentionally not passed: it is deprecated and no
# longer a recognised parameter in current XGBoost releases, and the labels
# are already consecutive integer codes from the LabelEncoder.
xgb_model = xgb.XGBClassifier(
    objective='multi:softmax',   # predict() yields class indices directly
    num_class=NUM_CLASSES,
    eval_metric='mlogloss',
    n_estimators=150,
    max_depth=8,
    learning_rate=0.6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

# The untouched (non-resampled) validation split is passed as eval_set, so the
# logged mlogloss is measured on real rows only; verbose=50 prints it every
# 50 boosting rounds.
xgb_model.fit(
    X_train_res, y_train_res,
    eval_set=[(X_val, y_val)],
    verbose=50
)

[0]	validation_0-mlogloss:1.68832
[49]	validation_0-mlogloss:0.70885


In [117]:
# Score the model on the hold-out split. `y_pred` was never assigned anywhere
# in the notebook, so this cell only worked with leftover kernel state and
# raised NameError on a fresh Restart & Run All.
y_pred = xgb_model.predict(X_val)

print("\nAccuracy       :", accuracy_score(y_val, y_pred))
print("F1 Weighted    :", f1_score(y_val, y_pred, average='weighted'))
print("F1 Macro       :", f1_score(y_val, y_pred, average='macro'))
print("\nClassification Report:\n", classification_report(y_val, y_pred))


Accuracy       : 0.7148020371283063
F1 Weighted    : 0.709313418548962
F1 Macro       : 0.5220380174413924

Classification Report:
               precision    recall  f1-score   support

           0       0.58      0.48      0.52        82
           1       0.66      0.55      0.60       162
           2       0.81      0.86      0.83      3614
           3       0.39      0.38      0.38       483
           4       0.59      0.52      0.56      1396
           5       0.72      0.69      0.70        48
           6       0.54      0.47      0.50        72
           7       0.60      0.60      0.60       229
           8       0.00      0.00      0.00         1

    accuracy                           0.71      6087
   macro avg       0.54      0.50      0.52      6087
weighted avg       0.71      0.71      0.71      6087



In [118]:
# Persist the fitted model together with every preprocessing object and
# statistic the earlier cells produced, so the same pipeline can be replayed
# at inference time. joblib is already imported in the first cell.
train_medians = {name: df_train[name].median() for name in num_cols}
train_modes = {name: df_train[name].mode()[0] for name in cat_cols}

artifact = dict(
    model=xgb_model,
    features=list(X.columns),   # feature columns of X, in training order (pre-scaling)
    cat_cols=cat_cols,          # categorical column names
    num_cols=num_cols,          # numeric column names
    medians=train_medians,      # per-column median of df_train
    modes=train_modes,          # per-column most frequent value of df_train
    oe=oe,                      # fitted OrdinalEncoder
    scaler=scaler,              # fitted StandardScaler
    le=le,                      # fitted LabelEncoder
)

joblib.dump(artifact, "model.joblib")
print("✅ Saved!")

✅ Saved!


In [119]:
# Predict encoded classes for the scaled test features, then map the codes
# back to the original target labels.
y_test_pred = xgb_model.predict(X_test_scaled)
y_test_labels = le.inverse_transform(y_test_pred)

# Use the identifiers carried by df_test when the column is present; otherwise
# fall back to a 0..n-1 positional range.
if "User_ID" in df_test.columns:
    user_ids = df_test["User_ID"]
else:
    user_ids = np.arange(len(df_test))

# Write the two-column submission file.
submission = pd.DataFrame({"User_ID": user_ids, TARGET: y_test_labels})
submission.to_csv("xgb_submission.csv", index=False)
print("✅ Submission file saved: xgb_submission.csv")

✅ Submission file saved: xgb_submission.csv
