In [None]:
#  Imports
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from lightgbm import LGBMClassifier
import matplotlib.pyplot as plt

# === 📥 Load and Clean Data ===
# Three CSV files are read from the working directory: training features,
# training labels, and the test features to predict on.
train_features = pd.read_csv("train.csv")
train_labels = pd.read_csv("train_labels.csv")
test_df = pd.read_csv("test.csv")

# Drop a stray 'Class' column from the feature table (if there is one) so the
# merge below yields a single, unsuffixed 'Class' column from the labels file.
train_features = train_features.drop(columns=['Class'], errors='ignore')

# Inner join on 'Id': rows without a match on the other side are dropped.
train_df = train_features.merge(train_labels, on='Id')
if 'Class' not in train_df.columns:
    # Explicit raise instead of `assert`, which is stripped under `python -O`.
    raise ValueError(
        "Merged training data has no 'Class' column; check train_labels.csv"
    )

# === 🎯 Prepare Feature and Target Matrices ===
# 'Id' is an identifier, not a feature; 'Class' is the prediction target.
X = train_df.drop(columns=['Id', 'Class'])
y = train_df['Class']

# Test ids are kept aside so they can be written next to the predictions.
X_test = test_df.drop(columns=['Id'])
test_ids = test_df['Id']

# === 🧼 Impute Missing Values ===
# Column means are learned from the training features only and then applied
# to both the training and the test features.
imputer = SimpleImputer(strategy='mean').fit(X)
X = imputer.transform(X)
X_test = imputer.transform(X_test)

# === 🌟 Feature Selection using LightGBM Importance ===
# A LightGBM model is fitted on the full imputed training set solely to rank
# the feature columns by importance.
selector_model = LGBMClassifier(n_estimators=200, random_state=42)
selector_model.fit(X, y)

importances = selector_model.feature_importances_
# Keep at most 200 features. Clamping to the real column count keeps `top_k`
# equal to the number of selected columns when fewer than 200 are available
# (`top_k` is reused later as the bar count of the importance plot).
top_k = min(200, X.shape[1])
# argsort is ascending, so the last `top_k` positions are the most important
# columns, ordered from least to most important.
top_indices = np.argsort(importances)[-top_k:]

# Apply the same column subset to the training and the test matrices.
X = X[:, top_indices]
X_test = X_test[:, top_indices]

# === 🔁 Stratified K-Fold + Soft Voting Ensemble ===
# NOTE(review): the imputer and the feature ranking earlier in this script
# were fitted on all training rows, including each fold's validation rows,
# so the fold scores printed below are likely optimistic.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1_scores = []
n_classes = len(np.unique(y))
# Running sum of per-class test probabilities across folds (soft voting).
test_preds = np.zeros((X_test.shape[0], n_classes))

# The splitter yields positional row indices. Indexing a plain array keeps the
# lookup positional; `y[train_idx]` on a pandas Series is label-based and is
# only correct while the Series carries a default 0..n-1 index.
y_values = np.asarray(y)

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y_values), 1):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y_values[train_idx], y_values[val_idx]

    model = LGBMClassifier(
        objective='multiclass',
        learning_rate=0.02,
        n_estimators=1000,
        num_leaves=31,
        class_weight='balanced',
        random_state=42
    )

    # The eval_set only makes LightGBM track multi_logloss on the validation
    # fold; no early stopping is configured here.
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='multi_logloss'
    )

    # Score this fold on its held-out rows.
    y_val_pred = model.predict(X_val)
    f1 = f1_score(y_val, y_val_pred, average='macro')
    f1_scores.append(f1)
    print(f"✅ Fold {fold} Macro F1: {f1:.4f}")

    # Accumulate this fold's test probabilities; they are averaged later.
    test_preds += model.predict_proba(X_test)

# === 📈 Final Predictions ===
# Average the summed fold probabilities (soft voting).
mean_test_proba = test_preds / kf.get_n_splits()

# argmax yields a column position, not a class label. The probability columns
# follow the sorted unique training labels, which is the order np.unique
# returns, so map positions back to labels. This is a no-op when the labels
# are already 0..n_classes-1.
class_labels = np.unique(y)
test_preds_final = class_labels[np.argmax(mean_test_proba, axis=1)]

submission = pd.DataFrame({
    'Id': test_ids,
    'Class': test_preds_final
})
submission.to_csv("submission_final.csv", index=False)

# === 📊 Results Summary ===
print(f"\n✅ Mean CV Macro F1: {np.mean(f1_scores):.4f}")
print(f"Folds: min={np.min(f1_scores):.4f}, max={np.max(f1_scores):.4f}, std={np.std(f1_scores):.4f}")
print("\n📤 Final submission saved as 'submission_final.csv'")
print(submission.head())

# === 📉 Plot Top Feature Importances ===
# Derive the bar count from the selection itself: when the data has fewer
# columns than the requested top-k, `top_indices` is shorter than `top_k` and
# a fixed range would not match the number of bar heights.
n_shown = len(top_indices)

plt.figure(figsize=(12, 4))
# Bars appear in selection order (ascending importance), not by column index.
plt.bar(range(n_shown), importances[top_indices])
plt.title(f"Top {n_shown} Feature Importances from LightGBM")
plt.xlabel("Feature Index (Top K)")
plt.ylabel("Importance")
plt.tight_layout()
plt.show()
