In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
import xgboost as xgb

In [None]:
# Load the dataset from the relative CSV path and print its first rows as a
# quick check that the file was parsed.
data = pd.read_csv('data/pokemon.csv')
head_rows = data.head()
print(head_rows)

### Cleaning

In [None]:
# --- Missing values ---------------------------------------------------------
# NOTE: the imputers and the quantile caps below are computed on the full
# `data` frame, i.e. before the train/test split done in the model cell, so
# rows that later end up in the test set contribute to these statistics.
num_cols = ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']
cat_cols = ['Type 1', 'Type 2']

# Numerical columns are filled with the column median, categorical columns
# with the most frequent value. After the loop `imputer` refers to the
# categorical imputer (the last one fitted).
# NOTE(review): if a missing 'Type 2' actually means "no secondary type",
# filling it with the most frequent value invents a type - confirm against
# the dataset.
for cols, strategy in ((num_cols, 'median'), (cat_cols, 'most_frequent')):
    imputer = SimpleImputer(strategy=strategy)
    data[cols] = imputer.fit_transform(data[cols])

# --- Outlier capping --------------------------------------------------------
# Clamp every numerical column to its own 1st-99th percentile range.
for col in num_cols:
    lower_cap, upper_cap = data[col].quantile([0.01, 0.99])
    data[col] = data[col].clip(lower=lower_cap, upper=upper_cap)

### Encoding

In [None]:
# One-hot encode the categorical columns. `drop='first'` omits the first
# category of each column and `sparse_output=False` returns a dense array.
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_cols = encoder.fit_transform(data[cat_cols])

# Reuse `data`'s index for the encoded frame: pd.concat(axis=1) aligns rows by
# index label, so a freshly numbered 0..n-1 index would misalign rows and
# introduce NaNs whenever `data` carries a non-default index (for example
# after rows were filtered out or an index column was set).
encoded_cols = pd.DataFrame(
    encoded_cols,
    columns=encoder.get_feature_names_out(cat_cols),
    index=data.index,
)

# Replace the original categorical columns with their indicator columns.
data = pd.concat([data.drop(columns=cat_cols), encoded_cols], axis=1)

# Cast 'Legendary' to int so the column is numeric.
data['Legendary'] = data['Legendary'].astype(int)

### Model

In [None]:
# Target and features: 'Combat Power' is predicted from every remaining
# column except 'Name'.
y = data['Combat Power']
X = data.drop(['Name', 'Combat Power'], axis=1)

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Hyper-parameter grid for XGBoost: three candidate values for each of the
# five parameters, i.e. 3**5 = 243 combinations.
param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
    subsample=[0.7, 0.8, 0.9],
    colsample_bytree=[0.7, 0.8, 0.9],
)

In [None]:
# Exhaustive search over `param_grid` with 5-fold cross-validation on the
# training split. No `scoring` is passed, so candidates are ranked by the
# estimator's own `score` method rather than by the RMSE reported later.
xgboost_model = xgb.XGBRegressor(random_state=42)

# `fit` returns the fitted search object, so construction and fitting chain.
grid_search = GridSearchCV(
    xgboost_model,
    param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
).fit(X_train, y_train)

best_params = grid_search.best_params_
print(f'Best parameters found for XGBoost: {best_params}')

In [None]:
# XGBoost model with best parameters.
# GridSearchCV refits by default (refit=True): once the search finishes it
# trains a clone of the base estimator (XGBRegressor(random_state=42)) with
# `best_params` on the whole of X_train / y_train. That is the same model this
# cell previously trained a second time, so reuse it instead of repeating the
# fit. If the search is ever created with refit=False, `best_estimator_` is
# not available and the model must be built and fitted explicitly again.
best_xgboost_model = grid_search.best_estimator_

y_pred_xgb = best_xgboost_model.predict(X_test)

### Model Evaluation

In [None]:
# Root-mean-squared error of the predictions on the held-out test split.
mse_xgb = mean_squared_error(y_true=y_test, y_pred=y_pred_xgb)
rmse_xgb = np.sqrt(mse_xgb)
print(f'XGBoost RMSE: {rmse_xgb}')

# Put the error in context by expressing it as a percentage of the mean
# target value in the test split.
mean_combat_power = y_test.mean()
relative_rmse = rmse_xgb / mean_combat_power * 100
print(f'RMSE as a percentage of the mean combat power: {relative_rmse:.2f}%')