In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer

In [None]:
# Load the Pokemon dataset from the project-relative CSV file.
data = pd.read_csv('data/pokemon.csv')

# Preview the first rows. Leaving the frame as the cell's last expression lets
# Jupyter render it as a rich table instead of the plain text print() produces.
data.head()

### Data Cleaning

In [None]:
# Fill missing values column-group by column-group:
#   * numerical stat columns  -> column median
#   * categorical type columns -> most frequent value
# NOTE(review): the imputers are fit on the full dataset, i.e. before the
# train/test split performed later in the notebook.

num_cols = ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']
cat_cols = ['Type 1', 'Type 2']

# Each pair is (columns to impute, SimpleImputer strategy); numerical first,
# categorical second, so `imputer` ends up bound to the categorical imputer.
for cols, strategy in ((num_cols, 'median'), (cat_cols, 'most_frequent')):
    imputer = SimpleImputer(strategy=strategy)
    data[cols] = imputer.fit_transform(data[cols])

In [None]:
# Winsorize each numerical column: values below the 1st percentile are raised
# to it, values above the 99th percentile are lowered to it.
for col in num_cols:
    # Both bounds are computed from the column before any capping is applied.
    q_low, q_hi = data[col].quantile([0.01, 0.99])
    data[col] = data[col].clip(lower=q_low, upper=q_hi)

### Feature Engineering

In [None]:
# One-hot encode the categorical type columns. drop='first' omits the first
# category of each column, so k categories become k-1 indicator columns.
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_cols = encoder.fit_transform(data[cat_cols])

# Give the encoded frame the same index as `data`: pd.concat(axis=1) aligns
# rows by index label, so a fresh 0..n-1 index would misalign rows (and
# introduce NaNs) whenever `data` does not carry the default RangeIndex.
encoded_cols = pd.DataFrame(
    encoded_cols,
    columns=encoder.get_feature_names_out(cat_cols),
    index=data.index,
)

# Replace the original categorical columns with their encoded counterparts.
data = data.drop(cat_cols, axis=1)
data = pd.concat([data, encoded_cols], axis=1)

# Cast 'Legendary' to int so the model receives a numeric flag
# (presumably a boolean column mapping to 0/1 -- TODO confirm dtype).
data['Legendary'] = data['Legendary'].astype(int)

### Model Building

In [None]:
# Target is the 'Combat Power' column; features are every remaining column
# except the 'Name' column.
y = data['Combat Power']
X = data.drop(['Name', 'Combat Power'], axis=1)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Regression model (Random Forest Regressor): 100 trees, seeded, fitted on the
# training split. fit() returns the estimator itself, so it can be chained.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predictions for the held-out test rows.
y_pred = model.predict(X_test)

### Evaluate the model

In [None]:
# Mean squared error of the test-set predictions, and its square root (RMSE),
# which is expressed in the same units as the target.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error: {rmse}')

In [None]:
# Spread of the target over the whole dataset (max minus min), as a scale
# reference for the RMSE.
combat_power = data['Combat Power']
combat_power_range = combat_power.max() - combat_power.min()
print(f'Combat Power Range: {combat_power_range}')

In [None]:
# Average of the target over the whole dataset (not only the test split).
combat_power = data['Combat Power']
mean_combat_power = combat_power.mean()
print(f'Mean Combat Power: {mean_combat_power}')

In [None]:
# Express the RMSE relative to the mean combat power, as a percentage.
# NOTE(review): the mean is taken over the full dataset while the RMSE comes
# from the test split only -- confirm this is the intended baseline.
rmse_to_mean_ratio = rmse / mean_combat_power
relative_rmse = rmse_to_mean_ratio * 100
print(f'Relative RMSE: {relative_rmse:.2f}%')