In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

# Tree / Boosting models
from sklearn.ensemble import RandomForestClassifier

import lightgbm as lgb
import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Kaggle input locations of the competition's train and test CSV files.
TRAIN_PATH = '/kaggle/input/mock-test-2-mse-2/train.csv'
TEST_PATH  = '/kaggle/input/mock-test-2-mse-2/test.csv'

# Read both splits with identical (default) pandas parsing options.
train_df, test_df = pd.read_csv(TRAIN_PATH), pd.read_csv(TEST_PATH)

In [None]:
# Preview the first rows of the training frame (rendered as the cell output).
train_df.head()

In [None]:
# Column roles used throughout the notebook.
ID_COL     = 'id'      # row identifier; excluded from the feature matrices
TARGET_COL = 'Status'  # label column to predict; change to the target column of the dataset

In [None]:
# Rows without a target carry no supervision signal. Filling them with the most
# frequent class would fabricate labels and bias every model toward the majority
# class, so unlabeled rows are dropped instead of imputed.
missing_target = train_df[TARGET_COL].isna()
if missing_target.any():
    print(f"Dropping {int(missing_target.sum())} training rows with missing '{TARGET_COL}'")
    # reset_index keeps the frame on a clean 0..n-1 index for later positional use
    train_df = train_df.loc[~missing_target].reset_index(drop=True)

In [None]:
# Summary statistics of the training frame (rendered as the cell output).
train_df.describe()

In [None]:
# Column dtypes and non-null counts of the training frame.
train_df.info()

In [None]:
# Features: everything except the target and (when present) the id column.
X = train_df.drop(columns=[TARGET_COL, ID_COL], errors='ignore')

# Encode the target labels as integers; the fitted encoder is reused later
# to map predictions back to the original label values.
label_encoder = LabelEncoder().fit(train_df[TARGET_COL])
y = label_encoder.transform(train_df[TARGET_COL])

# Keep the test ids aside for the submission file and drop them from the features.
test_ids = test_df[ID_COL].copy()
X_test = test_df.drop(columns=[ID_COL])

In [None]:
# Partition the feature names by dtype: int64/float64 columns are treated as
# numeric, every other dtype as categorical.
_numeric_dtypes = ['int64', 'float64']
num_cols = X.select_dtypes(include=_numeric_dtypes).columns
cat_cols = X.select_dtypes(exclude=_numeric_dtypes).columns

In [None]:
# Numeric branch: median imputation followed by standard scaling.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: most-frequent imputation followed by one-hot encoding;
# categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, num_cols),
    ('cat', categorical_transformer, cat_cols),
])

In [None]:
def cap_outliers(df, cols, lower=1, upper=99):
    """Winsorize the given columns at percentile bounds.

    Parameters
    ----------
    df : DataFrame whose columns are clipped; it is copied, not modified.
    cols : iterable of column names to clip.
    lower, upper : percentiles (0-100) computed per column on ``df`` itself
        and used as the clipping floor and ceiling.

    Returns
    -------
    A copy of ``df`` with every column in ``cols`` clipped to its own
    [lower, upper] percentile range.
    """
    capped = df.copy()
    q_low, q_high = lower / 100, upper / 100
    for name in cols:
        bounds = capped[name].quantile([q_low, q_high])
        floor, ceiling = bounds.iloc[0], bounds.iloc[1]
        capped[name] = capped[name].clip(floor, ceiling)
    return capped

# Apply capping ONLY on feature matrices (never on target or id).
# The clipping bounds are learned from the TRAINING features and reused for the
# test features: clipping the test set at its own percentiles would transform
# train and test with different thresholds.
LOWER_PCT, UPPER_PCT = 1, 99

# Clip the test features first, while X still holds the raw (uncapped) values
# the bounds must be computed from.
X_test = X_test.copy()
for col in num_cols:
    train_lo, train_hi = X[col].quantile([LOWER_PCT / 100, UPPER_PCT / 100])
    X_test[col] = X_test[col].clip(train_lo, train_hi)

X = cap_outliers(X, num_cols, lower=LOWER_PCT, upper=UPPER_PCT)

In [None]:
# One small box plot per numeric feature; only the first six to avoid clutter.
for col in num_cols[:6]:
    fig, ax = plt.subplots(figsize=(5, 2))
    sns.boxplot(x=X[col], ax=ax)
    ax.set_title(f"Boxplot: {col}")
    plt.show()

In [None]:
# A correlation matrix needs at least two numeric features.
if len(num_cols) > 1:
    corr = X[num_cols].corr()
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.heatmap(corr, cmap='coolwarm', center=0, ax=ax)
    ax.set_title("Correlation Matrix (Numerical Features)")
    plt.show()

In [None]:
# Pair plot coloured by target; only drawn when there are at most five numeric features.
if len(num_cols) <= 5:
    pairplot_columns = [*num_cols, TARGET_COL]
    sns.pairplot(train_df[pairplot_columns], hue=TARGET_COL)
    plt.show()

In [None]:
# Candidate classifiers compared by cross-validated log loss.
# All three are regularised (shallow trees, row/column subsampling, L1/L2
# penalties) and share random_state=42.
models = {
    'RandomForest': RandomForestClassifier(
        n_estimators=600,
        criterion="log_loss",
        max_depth=8,
        min_samples_split=25,
        min_samples_leaf=15,
        max_features=0.7,
        class_weight="balanced",
        bootstrap=True,
        n_jobs=-1,
        random_state=42,
    ),

    'LightGBM': lgb.LGBMClassifier(
        objective="multiclass",
        metric="multi_logloss",
        learning_rate=0.05,
        n_estimators=800,
        num_leaves=15,
        max_depth=5,

        min_child_samples=50,
        subsample=0.7,
        # LightGBM only performs row subsampling (bagging) when the bagging
        # frequency is non-zero; with the default of 0, subsample=0.7 is ignored.
        subsample_freq=1,
        colsample_bytree=0.7,

        reg_alpha=1.5,
        reg_lambda=3.0,

        class_weight="balanced",
        random_state=42,
        n_jobs=-1,
        verbose=-1,
    ),

    'XGBoost': xgb.XGBClassifier(
        objective="multi:softprob",
        eval_metric="mlogloss",

        learning_rate=0.05,
        n_estimators=700,
        max_depth=4,

        min_child_weight=10,
        subsample=0.7,
        colsample_bytree=0.7,

        reg_alpha=1.5,
        reg_lambda=3.0,

        random_state=42,
        n_jobs=-1,
    ),
}



In [None]:
#FOR LARGE DATASET >3k

# models = {
#     'RandomForest': RandomForestClassifier(
#     n_estimators=1000,
#     criterion="log_loss",
#     max_depth=12,
#     min_samples_split=15,
#     min_samples_leaf=8,
#     max_features="sqrt",
#     class_weight="balanced",
#     bootstrap=True,
#     n_jobs=-1,
#     random_state=42
# ),

#     'LightGBM': lgb.LGBMClassifier(
#     objective="multiclass",
#     metric="multi_logloss",

#     learning_rate=0.03,
#     n_estimators=3000,
#     num_leaves=31,
#     max_depth=6,

#     min_child_samples=30,
#     subsample=0.8,
#     subsample_freq=1,
#     colsample_bytree=0.8,

#     reg_alpha=1.0,
#     reg_lambda=2.0,

#     class_weight="balanced",
#     random_state=42,
#     n_jobs=-1

# ),

#     'XGBoost': xgb.XGBClassifier(
#     objective="multi:softprob",
#     eval_metric="mlogloss",

#     learning_rate=0.03,
#     n_estimators=2500,
#     max_depth=6,

#     min_child_weight=5,
#     subsample=0.8,
#     colsample_bytree=0.8,

#     reg_alpha=1.0,
#     reg_lambda=2.0,

#     tree_method="hist",
#     random_state=42,
#     n_jobs=-1
# )
# }


In [None]:
from sklearn.base import clone

# 5-fold stratified CV; each candidate is scored by its mean validation log loss.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Full set of encoded class ids. Passed to log_loss so the metric stays defined
# when a validation fold happens to miss a rare class (otherwise log_loss infers
# the classes from y_val and rejects the wider probability matrix).
all_labels = np.arange(len(label_encoder.classes_))

results = {}

for name, model in models.items():
    losses = []
    for train_idx, val_idx in skf.split(X, y):
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y[train_idx], y[val_idx]

        # IMPORTANT: clone model for each fold (fixes LightGBM/XGBoost feature mismatch).
        # The preprocessor is cloned as well so no fitted state is shared between
        # folds or with the pipelines built later from the same template object.
        pipe = Pipeline(steps=[
            ('preprocessor', clone(preprocessor)),
            ('model', clone(model))
        ])

        pipe.fit(X_tr, y_tr)
        val_pred = pipe.predict_proba(X_val)
        losses.append(log_loss(y_val, val_pred, labels=all_labels))

    results[name] = np.mean(losses)
    print(f"{name} CV LogLoss: {results[name]:.5f}")



In [None]:
# Rank candidates by CV log loss, lowest (best) first.
sorted_models = sorted(results.items(), key=lambda item: item[1])

# The two best-scoring candidates feed the final blend.
best_model_name, best_score = sorted_models[0]
second_model_name, second_score = sorted_models[1]
best_model, second_model = models[best_model_name], models[second_model_name]

print("Top-2 Models Selected:")
for picked_name, picked_score in sorted_models[:2]:
    print(f"{picked_name} (LogLoss: {picked_score:.5f})")

In [None]:
USE_CALIBRATION_FOR_LOGLOSS = True  # keep True for log-loss competitions

from sklearn.base import clone
from sklearn.calibration import CalibratedClassifierCV

# Fitted end-to-end pipelines for the two selected models, keyed by model name.
pipelines = {}

for name, model in [(best_model_name, best_model), (second_model_name, second_model)]:
    # Work on fresh copies so the templates in `models` / `preprocessor` stay
    # unfitted and the two pipelines do not share any fitted state.
    final_estimator = clone(model)
    if USE_CALIBRATION_FOR_LOGLOSS:
        # Isotonic calibration with an internal 3-fold split
        final_estimator = CalibratedClassifierCV(
            estimator=final_estimator,
            method='isotonic',
            cv=3
        )

    pipe = Pipeline(steps=[
        ('preprocessor', clone(preprocessor)),
        ('model', final_estimator)
    ])

    # Fit and register INSIDE the loop: doing this once after the loop only
    # trains the last pipeline and leaves pipelines[best_model_name] undefined.
    pipe.fit(X, y)
    pipelines[name] = pipe

In [None]:
# Output switches: each flag enables one of the submission cells below.
SUBMIT_LABELS        = False  # True → accuracy / precision submission
SUBMIT_PROBABILITIES = True   # True → log_loss submission

In [None]:
# Blend weights for the best and second-best model (they sum to 1).
W1, W2 = 0.65, 0.35

best_pipe = pipelines[best_model_name]
second_pipe = pipelines[second_model_name]

# Class probabilities of each model on the test features.
proba_1 = best_pipe.predict_proba(X_test)
proba_2 = second_pipe.predict_proba(X_test)

# Weighted average of the two probability matrices.
y_pred_prob = W1 * proba_1 + W2 * proba_2

# Hard labels (still integer-encoded) come from the best model alone, not from the blend.
y_pred_labels_enc = best_pipe.predict(X_test)

In [None]:
if SUBMIT_PROBABILITIES:
    # predict_proba columns follow the encoded class order 0..K-1, so the
    # column positions themselves are the encoded class ids.
    n_classes = y_pred_prob.shape[1]
    encoded_classes = np.arange(n_classes)

    # Map the encoded ids back to the original label values for the headers.
    original_labels = label_encoder.inverse_transform(encoded_classes)
    submission_cols = [f"{TARGET_COL}_{cls}" for cls in original_labels]

    submission = pd.DataFrame(y_pred_prob, columns=submission_cols)

    # The id column goes first; fall back to a running index when no ids are available.
    if test_ids is not None:
        id_values = test_ids
    else:
        id_values = range(len(submission))
    submission.insert(0, ID_COL, id_values)

    # ---- REORDER EXACTLY LIKE sample_submission.csv IF AVAILABLE ----
    # try:
    #     sample_sub = pd.read_csv('sample_submission.csv')
    #     ordered_cols = sample_sub.columns.tolist()
    #     submission = submission[ordered_cols]
    #     print('Reordered columns using sample_submission.csv')
    # except Exception:
    #     print('sample_submission.csv not found – using inverse-transformed class order')

    submission.to_csv('Submission.csv', index=False)
    print('Submission.csv generated (probabilities)')
    print(submission.head())

In [None]:
if SUBMIT_LABELS:
    # For accuracy / precision metrics: submit hard labels from model.predict(),
    # decoded back to the original label values.
    y_pred_labels = label_encoder.inverse_transform(y_pred_labels_enc)

    # Start from the id column of the test frame and attach the predictions.
    labels_df = test_df[[ID_COL]].copy()
    labels_df[TARGET_COL] = y_pred_labels

    labels_df.to_csv('Submission_labels.csv', index=False)
    print('Submission_labels.csv generated (labels)')
