# 3. Modeling and Evaluation

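This notebook trains and compares two multi-label classifiers that predict each Pokemon's type(s):
1. **XGBoost:** trained on the K-Means features.
2. **MLP (Neural Network):** trained on the histogram features.

Both models are evaluated on the same held-out test split using Hamming loss and macro-averaged F1.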

In [None]:
import os
import pickle

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf

from sklearn.metrics import accuracy_score, f1_score, hamming_loss, classification_report
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer

from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

from xgboost import XGBClassifier

# Constants
# Directory the feature and label CSVs are read from. The path is relative,
# so it resolves against the kernel's current working directory.
DATA_DIR = "../data/processed"
# Random seed; passed as `random_state` to the train/test split so the same
# rows land in the test set on every run.
SEED = 42

In [None]:
# Load Data
# Two alternative feature tables (one per model) plus the label table.
X_kmeans = pd.read_csv(os.path.join(DATA_DIR, "X_kmeans.csv"))
X_hist = pd.read_csv(os.path.join(DATA_DIR, "X_hist.csv"))
y_labels = pd.read_csv(os.path.join(DATA_DIR, "y_labels.csv"))

# The train/test split indexes all three tables with the same row positions,
# so a length mismatch would either raise later or silently pair features
# with the wrong labels. Fail early with a clear message instead.
if not (len(X_kmeans) == len(X_hist) == len(y_labels)):
    raise ValueError(
        "Feature and label tables are not row-aligned: "
        f"X_kmeans={len(X_kmeans)}, X_hist={len(X_hist)}, y_labels={len(y_labels)}"
    )

# Preprocess Labels (Multi-label Binarization)
# Some Pokemon have 1 type, some have 2. Build one list of types per row,
# skipping a missing `type2`. Iterating the two columns directly avoids the
# per-row Series construction that `DataFrame.iterrows()` performs.
y_list = [
    [type1, type2] if pd.notna(type2) else [type1]
    for type1, type2 in zip(y_labels['type1'], y_labels['type2'])
]

# One binary indicator column per distinct type.
mlb = MultiLabelBinarizer()
y_encoded = mlb.fit_transform(y_list)
print(f"Classes: {mlb.classes_}")
print(f"y shape: {y_encoded.shape}")

In [None]:
# Split Data
# Draw one set of row positions and reuse it for every table, so both models
# are trained and evaluated on exactly the same samples.
n_samples = len(y_encoded)
train_idx, test_idx = train_test_split(
    range(n_samples),
    test_size=0.2,
    random_state=SEED,
)

# K-Means features feed the XGBoost model.
X_train_xgb, X_test_xgb = X_kmeans.iloc[train_idx], X_kmeans.iloc[test_idx]

# Histogram features feed the MLP.
X_train_mlp, X_test_mlp = X_hist.iloc[train_idx], X_hist.iloc[test_idx]

# Binarized type labels, shared by both models.
y_train, y_test = y_encoded[train_idx], y_encoded[test_idx]

## Model A: XGBoost (K-Means Features)

In [None]:
# Base binary classifier; MultiOutputClassifier fits one copy of it per label
# column, so each type is learned as an independent yes/no problem.
# - `random_state=SEED` pins the estimator's RNG so reruns are comparable.
# - `use_label_encoder` is intentionally not passed: it is deprecated in newer
#   XGBoost releases, where supplying it only produces a warning.
xgb_estimator = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=SEED,
)
clf_xgb = MultiOutputClassifier(xgb_estimator)

print("Training XGBoost...")
clf_xgb.fit(X_train_xgb, y_train)
# 0/1 indicator matrix with the same layout as y_test.
y_pred_xgb = clf_xgb.predict(X_test_xgb)

# Hamming loss: fraction of individual label slots predicted wrong (lower is better).
# Macro F1: unweighted mean of the per-label F1 scores (higher is better).
print("XGBoost Evaluation:")
print(f"Hamming Loss: {hamming_loss(y_test, y_pred_xgb):.4f}")
print(f"F1 Score (Macro): {f1_score(y_test, y_pred_xgb, average='macro'):.4f}")

## Model B: MLP (Histogram Features)

In [None]:
# Seed Python's `random`, NumPy and TensorFlow in one call so weight
# initialisation, dropout masks and batch shuffling are repeatable. Doing it
# here (not only at the top of the notebook) means re-running this cell alone
# also rebuilds the same model. Requires TensorFlow >= 2.7.
tf.keras.utils.set_random_seed(SEED)

# Probability above which a label counts as predicted.
DECISION_THRESHOLD = 0.5

input_dim = X_train_mlp.shape[1]
output_dim = y_encoded.shape[1]

model_mlp = Sequential([
    Dense(128, activation='relu', input_shape=(input_dim,)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(output_dim, activation='sigmoid') # Sigmoid for multi-label
])

# Binary cross-entropy scores every output unit as its own yes/no problem,
# which matches the independent sigmoid outputs above.
model_mlp.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

print("Training MLP...")
# validation_split holds out part of the training rows for per-epoch monitoring;
# the test set is never seen during fitting.
history = model_mlp.fit(X_train_mlp, y_train, epochs=50, batch_size=32, validation_split=0.1, verbose=1)

# Per-label probabilities -> hard 0/1 predictions.
y_pred_probs_mlp = model_mlp.predict(X_test_mlp)
y_pred_mlp = (y_pred_probs_mlp > DECISION_THRESHOLD).astype(int)

print("MLP Evaluation:")
print(f"Hamming Loss: {hamming_loss(y_test, y_pred_mlp):.4f}")
print(f"F1 Score (Macro): {f1_score(y_test, y_pred_mlp, average='macro'):.4f}")

## Comparison & Analysis

In [None]:
# Compare Metrics
# Both prediction matrices are scored against the same y_test, so the two
# rows of the table are directly comparable.
predictions = {'XGBoost': y_pred_xgb, 'MLP': y_pred_mlp}

metrics = {
    'Model': list(predictions),
    'Hamming Loss': [hamming_loss(y_test, y_pred) for y_pred in predictions.values()],
    'F1 Score (Macro)': [
        f1_score(y_test, y_pred, average='macro') for y_pred in predictions.values()
    ],
}

df_res = pd.DataFrame(metrics)
print(df_res)

# One bar chart per metric, side by side.
df_res.plot(
    x='Model',
    kind='bar',
    subplots=True,
    layout=(1, 2),
    figsize=(12, 5),
)
plt.show()

### Save Models

In [None]:
# Persist the fitted artifacts. `joblib` is imported in the notebook's import
# cell rather than here, so all dependencies are visible in one place.
# NOTE: joblib files are pickle-based; only load them from trusted sources.
MODEL_DIR = "../models"
os.makedirs(MODEL_DIR, exist_ok=True)

# Save XGB
joblib.dump(clf_xgb, os.path.join(MODEL_DIR, "xgboost_model.pkl"))

# Save MLP
# The ".h5" extension selects Keras' HDF5 format; the file name is kept as-is
# because other code may load the model by this path.
model_mlp.save(os.path.join(MODEL_DIR, "mlp_model.h5"))

# Save Binarizer
# Needed at inference time to map predicted indicator columns back to type names.
joblib.dump(mlb, os.path.join(MODEL_DIR, "mlb.pkl"))

print("Models saved.")