In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

# Tree / Boosting models
from sklearn.ensemble import RandomForestClassifier

import lightgbm as lgb
import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Input locations and the column roles used throughout the notebook
TRAIN_PATH = '/kaggle/input/mock-test-2-mse-2/train.csv'
TEST_PATH  = '/kaggle/input/mock-test-2-mse-2/test.csv'
TARGET_COL = 'Status'
ID_COL     = 'id'

# Read both CSV files with the same default settings
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

In [None]:
# Preview the first rows of the training data
train_df.head()

In [None]:
# Summary statistics of the training data
train_df.describe()

In [None]:
# Column dtypes and non-null counts of the training data
train_df.info()

In [None]:
# Encode the target as integer class ids; label_encoder is kept so predictions
# can be mapped back to the original labels later.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(train_df[TARGET_COL])

# ID_COL is a row identifier that is copied verbatim into the submission.
# Keep it out of the feature frames so it is not picked up by the dtype-based
# column lists derived from X (and therefore not scaled, capped or modelled).
X = train_df.drop(columns=[TARGET_COL, ID_COL], errors='ignore')
X_test = test_df.drop(columns=[ID_COL], errors='ignore')

In [None]:
# Columns stored as int64/float64 go to the numeric branch; every other dtype
# is handled by the categorical branch.
NUMERIC_DTYPES = ['int64', 'float64']
num_cols = X.select_dtypes(include=NUMERIC_DTYPES).columns
cat_cols = X.select_dtypes(exclude=NUMERIC_DTYPES).columns


In [None]:
# Numeric branch: median imputation followed by standard scaling
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: most-frequent imputation followed by one-hot encoding;
# categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its branch
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, num_cols),
    ('cat', categorical_transformer, cat_cols),
])

In [None]:
def cap_outliers(df, cols, lower=1, upper=99):
    """Winsorize the given columns at percentile bounds.

    Parameters
    ----------
    df : DataFrame to process; it is copied, not modified.
    cols : iterable of column names to cap.
    lower, upper : lower and upper percentiles (0-100) used as clip limits.

    Returns
    -------
    A copy of ``df`` (all columns) in which each column in ``cols`` is clipped
    to its own ``lower``/``upper`` percentile values.
    """
    capped = df.copy()
    q_low, q_high = lower / 100, upper / 100
    for name in cols:
        series = capped[name]
        limits = series.quantile([q_low, q_high])
        capped[name] = series.clip(limits.iloc[0], limits.iloc[1])
    return capped

# ---- APPLY OUTLIER CAPPING (ALWAYS) ----
CAP_LOWER, CAP_UPPER = 1, 99

# Never cap the identifier: clipping it would alter the ids that are later
# written to the submission file.
cap_cols = [c for c in num_cols if c != ID_COL]

if cap_cols:
    # Percentile limits are taken from the training data only and reused for
    # the test set, so both splits receive the same transformation.
    cap_bounds = train_df[cap_cols].quantile([CAP_LOWER / 100, CAP_UPPER / 100])

    # cap_outliers returns the whole frame; select the capped columns so the
    # assignment has matching columns on both sides.
    train_df[cap_cols] = cap_outliers(train_df, cap_cols, CAP_LOWER, CAP_UPPER)[cap_cols]

    for col in cap_cols:
        lo, hi = cap_bounds[col]
        test_df[col] = test_df[col].clip(lo, hi)

# Re-create X and X_test after capping
X = train_df.drop(columns=[TARGET_COL])
X_test = test_df.copy()

In [None]:
# One small boxplot per numeric column (first 6 only, to avoid clutter)
for col in num_cols[:6]:
    fig, ax = plt.subplots(figsize=(5, 2))
    sns.boxplot(x=train_df[col], ax=ax)
    ax.set_title(f"Boxplot: {col}")
    plt.show()



In [None]:
# A correlation matrix needs at least two numeric columns
if len(num_cols) > 1:
    corr = train_df[num_cols].corr()
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.heatmap(corr, cmap='coolwarm', center=0, ax=ax)
    ax.set_title("Correlation Matrix (Numerical Features)")
    plt.show()


In [None]:
# Pairwise scatter plots are only drawn when there are at most 5 numeric columns
if len(num_cols) <= 5:
    pairplot_cols = [*num_cols, TARGET_COL]
    sns.pairplot(train_df[pairplot_cols], hue=TARGET_COL)
    plt.show()

In [None]:
# Candidate classifiers compared by cross-validated log loss.
# Keys are display names; values are unfitted estimators.
models = {
    'RandomForest': RandomForestClassifier(
        n_estimators=500,
        max_depth=None,
        min_samples_split=5,
        min_samples_leaf=2,
        n_jobs=-1,
        random_state=42
    ),

    'LightGBM': lgb.LGBMClassifier(
        objective='multiclass',
        n_estimators=700,
        learning_rate=0.03,
        num_leaves=31,
        subsample=0.8,
        # LightGBM only applies row subsampling when the bagging frequency is
        # non-zero; with the default of 0 the subsample setting is ignored.
        subsample_freq=1,
        colsample_bytree=0.8,
        random_state=42
    ),

    'XGBoost': xgb.XGBClassifier(
        objective='multi:softprob',
        eval_metric='mlogloss',
        n_estimators=700,
        learning_rate=0.03,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        tree_method='hist',
        random_state=42
    )
}


In [None]:
from sklearn.base import clone

# 5-fold stratified CV; the same splitter (and seed) is used for every model
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Mean validation log loss per model name
results = {}

for name, model in models.items():
    losses = []
    for train_idx, val_idx in skf.split(X, y):
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y[train_idx], y[val_idx]

        # Fresh, unfitted copies for each fold: cloning the model avoids the
        # LightGBM/XGBoost feature mismatch, and cloning the preprocessor keeps
        # the shared `preprocessor` object from being refit in place.
        model_clone = clone(model)

        pipe = Pipeline(steps=[
            ('preprocessor', clone(preprocessor)),
            ('model', model_clone)
        ])

        pipe.fit(X_tr, y_tr)
        val_pred = pipe.predict_proba(X_val)

        # Pass the fitted class order explicitly so the probability columns
        # stay aligned even if a validation fold is missing a class.
        losses.append(log_loss(y_val, val_pred, labels=pipe.classes_))

    results[name] = np.mean(losses)
    print(f"{name} CV LogLoss: {results[name]:.5f}")



In [None]:
# Lower log loss is better: pick the entry with the smallest CV score
best_model_name = min(results.items(), key=lambda entry: entry[1])[0]
best_model = models[best_model_name]

print(f"\nBest Model Selected: {best_model_name}")

In [None]:
from sklearn.calibration import CalibratedClassifierCV

USE_CALIBRATION_FOR_LOGLOSS = True  # True for LogLoss, False for Accuracy/F1

# The final estimator is the selected model, optionally wrapped in a
# 3-fold isotonic calibrator.
final_estimator = best_model
if USE_CALIBRATION_FOR_LOGLOSS:
    calibrated_model = CalibratedClassifierCV(
        estimator=best_model,
        method='isotonic',
        cv=3
    )
    final_estimator = calibrated_model

# Same preprocessing as in cross-validation, followed by the final estimator
final_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', final_estimator)
])


In [None]:
# Fit the preprocessing and the final estimator on the full training data
final_pipeline.fit(X, y)

In [None]:
# Output switches: each flag independently enables one of the submission cells below.
SUBMIT_PROBABILITIES = True   # True → write Submission.csv with per-class probabilities (log_loss metric)
SUBMIT_LABELS        = False  # True → write Submission_labels.csv with predicted labels (accuracy / precision metrics)


In [None]:
# Hard class predictions; these are integer-encoded because the pipeline was
# fitted on the label-encoded target y.
y_pred_labels_enc = final_pipeline.predict(X_test)
# Per-class probabilities, one column per fitted class
y_pred_prob = final_pipeline.predict_proba(X_test)

In [None]:

if SUBMIT_PROBABILITIES:
    # The columns of y_pred_prob follow the fitted model's classes_, which are
    # the integer codes produced by label_encoder. Map them back to the
    # original target labels so the columns read e.g. "<TARGET_COL>_<label>"
    # instead of "<TARGET_COL>_0", "<TARGET_COL>_1", ...
    encoded_classes = final_pipeline.named_steps['model'].classes_
    class_labels = label_encoder.inverse_transform(encoded_classes)

    submission_cols = [f"{TARGET_COL}_{label}" for label in class_labels]

    submission = pd.DataFrame(y_pred_prob, columns=submission_cols)
    # Insert ids positionally so a non-default test_df index cannot misalign them
    submission.insert(0, ID_COL, test_df[ID_COL].to_numpy())

    # NOTE: column order follows the encoder's class order. If the sample
    # submission uses different names or a different order, rename / reorder
    # `submission` here before writing it.

    submission.to_csv('Submission.csv', index=False)
    print('Submission.csv generated (probabilities)')

In [None]:
if SUBMIT_LABELS:
    # For accuracy / precision metrics: submit the predicted class itself.
    # Encoded predictions are translated back to the original target labels.
    y_pred_labels = label_encoder.inverse_transform(y_pred_labels_enc)

    pred_col = f'{TARGET_COL}_pred'
    labels_df = test_df[[ID_COL]].assign(**{pred_col: y_pred_labels})

    labels_df.to_csv('Submission_labels.csv', index=False)
    print('Submission_labels.csv generated (labels)')