In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer

In [None]:
# Load the Pokemon table from the project-relative CSV file.
data = pd.read_csv('data/pokemon.csv')

# Preview the first rows. Leaving the frame as the cell's last expression lets
# Jupyter render it with its rich (HTML) display instead of plain text.
data.head()

### Cleaning

In [None]:
# --- Missing values ---------------------------------------------------------
# NOTE(review): both imputers and the quantile caps below are computed on the
# full table, i.e. before the train/test split done in a later cell, so test
# rows influence the fill values and caps. Consider fitting on training rows only.

# Numeric stat columns: fill gaps with each column's median.
num_cols = ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']

imputer = SimpleImputer(strategy='median')
data[num_cols] = imputer.fit_transform(data[num_cols])

# Categorical type columns: fill gaps with each column's most frequent value.
# NOTE(review): a missing 'Type 2' presumably means "no secondary type" rather
# than an unknown one -- confirm that most-frequent imputation is intended.
cat_cols = ['Type 1', 'Type 2']
imputer = SimpleImputer(strategy='most_frequent')
data[cat_cols] = imputer.fit_transform(data[cat_cols])

# --- Outlier capping --------------------------------------------------------
# Clamp every numeric column to its own [1st, 99th] percentile range. A single
# clip() replaces the two separate lower/upper np.where passes.
for col in num_cols:
    q_low = data[col].quantile(0.01)
    q_hi = data[col].quantile(0.99)
    data[col] = data[col].clip(lower=q_low, upper=q_hi)

### Features

In [None]:
# One-hot encode the categorical type columns as dense indicator columns;
# drop='first' leaves out the first category of each encoded column.
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_cols = encoder.fit_transform(data[cat_cols])

# Carry over data's index: the column-wise concat below aligns rows by index
# label, so a fresh 0..n-1 index would misalign rows (and introduce NaNs)
# whenever data does not have a default RangeIndex.
encoded_cols = pd.DataFrame(
    encoded_cols,
    columns=encoder.get_feature_names_out(cat_cols),
    index=data.index,
)

# Replace the original categorical columns with their indicator columns.
data = data.drop(cat_cols, axis=1)
data = pd.concat([data, encoded_cols], axis=1)

# Cast the 'Legendary' column to integers (presumably a boolean flag -> 0/1;
# verify against the CSV).
data['Legendary'] = data['Legendary'].astype(int)

In [None]:
# Target is 'Combat Power'; the feature matrix is every remaining column
# except 'Name'.
y = data['Combat Power']
X = data.drop(['Name', 'Combat Power'], axis=1)

# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# First-level regressors for the stack, each seeded with 42.
# CatBoost is created with silent=True.
xgb_regressor = xgb.XGBRegressor(random_state=42)
lgbm_regressor = lgb.LGBMRegressor(random_state=42)
catboost_regressor = CatBoostRegressor(random_seed=42, silent=True)

# (name, estimator) pairs in the form StackingRegressor expects.
base_models = [
    ('xgboost', xgb_regressor),
    ('lightgbm', lgbm_regressor),
    ('catboost', catboost_regressor),
]

# Final estimator: a plain linear regression, passed to StackingRegressor as
# final_estimator.
meta_model = LinearRegression()

### Training the stacking ensemble

In [None]:
# Assemble the stacked ensemble from the base models and the meta-model,
# using 5-fold cross-validation (cv=5), all available cores (n_jobs=-1)
# and verbose progress output.
stacking_model = StackingRegressor(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# Fit on the training split.
stacking_model.fit(X_train, y_train)

# Predict the target for the held-out test rows.
y_pred_stack = stacking_model.predict(X_test)

### Evaluation

In [None]:
# Score the stacked ensemble on the held-out split: RMSE is the square root
# of the mean squared error between true and predicted values.
mse_stack = mean_squared_error(y_test, y_pred_stack)
rmse_stack = np.sqrt(mse_stack)

# Average target value in the test set, used to put the error in context.
mean_combat_power = y_test.mean()

# RMSE expressed as a percentage of that average.
relative_rmse = 100 * (rmse_stack / mean_combat_power)

print(f'Stacking Model RMSE: {rmse_stack}')
print(f'RMSE as a percentage of the mean combat power: {relative_rmse:.2f}%')