In [1]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
# Read the battle results; `data` stays as loaded and `df` is the working copy.
data = pd.read_csv('../data/battle_results_top_10.csv')
df = data.copy()

# One-hot encode the two categorical columns, then cast every boolean column
# of the encoded frame to 0/1 integers.
df_encoded = pd.get_dummies(df, columns=['type', 'pokemon_class'])
bool_columns = df_encoded.select_dtypes(include='bool').columns
df_encoded = df_encoded.astype({column: int for column in bool_columns})
df_encoded.shape

## RandomForestClassifier

In [None]:
# Columns excluded by the disabled "drop" variant of the feature selection
# below; not used by the active code path in this cell.
cols_to_drop = [
    'battle_id', 'name', 'pokemon_id', 'opponent_id',
    'is_winner', 'opponent_name', 'opponent_primary_type', 'primary_type',
]
# Features the classifier is trained on.
cols_to_use = ['attack', 'defense', 'speed', 'hp', 'effectiveness', 'attack_first']

# Disabled alternative: train on every encoded column except cols_to_drop.
# X = df_encoded.drop(columns=cols_to_drop)
X = df_encoded[cols_to_use]
y = df_encoded['is_winner']

# Disabled: persist the list of training columns.
# encoded_columns = X.columns.tolist()
# joblib.dump(encoded_columns, './models/encoded_columns.pkl')

# 80/20 hold-out split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so `model` is the trained forest.
model = RandomForestClassifier(
    n_estimators=100, random_state=42, class_weight='balanced'
).fit(X_train, y_train)

# Mean accuracy on the held-out 20%.
accuracy = model.score(X_test, y_test)
print(f'Modellens noggrannhet: {accuracy * 100:.2f}%')


## Confusion Matrix / Classification Report 

In [None]:
# Predict on the hold-out set with the classifier fitted in the previous cell.
y_pred = model.predict(X_test)

# Rows are the true classes, columns the predicted classes.
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix: \n', conf_matrix, end='\n')
print('____________________________', end='\n\n')

# zero_division=0: when a class is never predicted (or never present) its
# precision/recall is undefined. Reporting it as 0 avoids showing a perfect
# 1.0 score for a class the model never got right, which zero_division=1 did.
report = classification_report(y_test, y_pred, zero_division=0)
print(f'Report:\n{report}')

## Correlation

In [None]:
# Pearson correlation (the pandas default) between speed and the win label.
speed_values = df_encoded['speed']
winner_values = df_encoded['is_winner']
correlation = speed_values.corr(winner_values)
print(f"Korrelation mellan speed och is_winner: {correlation}")

## CV Score

In [None]:
# 5-fold cross-validation on the training split only (the hold-out set is not
# touched). cross_val_score fits clones of `model` on each fold, so the
# already-fitted `model` is left as is and no extra fit/predict is needed here.
# scoring='accuracy' is the classifier default, spelled out to match the labels
# printed below.
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')

print("Cross-Validated Accuracy Scores: \n", cv_scores)
print("Mean Cross-Validated Accuracy: \n", cv_scores.mean())

## GridSearch

In [None]:
# Search space: 3 * 3 * 3 * 3 * 2 = 162 parameter combinations, each evaluated
# with 5-fold cross-validation on the training split.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# NOTE(review): this rebinds `model` to a fresh forest that, unlike the
# baseline classifier trained earlier in this notebook, has no
# class_weight='balanced'. GridSearchCV fits clones, so `model` itself stays
# unfitted after this cell - confirm both are intended.
model = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,   # use all available cores
    verbose=2,
)
grid_search.fit(X_train, y_train)

print('Best Hyperparameters: ', grid_search.best_params_)

# Best parameter combination, refitted on the whole training split.
best_model = grid_search.best_estimator_

# Evaluate the tuned model on the hold-out set.
y_pred = best_model.predict(X_test)

print('Confusion Matrix: ', confusion_matrix(y_test, y_pred), sep='\n')
print('Classification Report:', classification_report(y_test, y_pred), sep='\n')
print('Accuracy Score: ', accuracy_score(y_test, y_pred))

In [None]:
# Persist the tuned classifier found by the grid search above.
best_model = grid_search.best_estimator_

# Create the target directory first: joblib.dump raises FileNotFoundError if
# it does not exist, which would discard the result of a long grid search.
model_path = '../models/rf_best_model.pkl'
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_model, model_path)

## Random Forest Regressor

In [None]:
# Re-create the 80/20 split (same seed as the classifier section) so this cell
# does not depend on which earlier cell last bound X_train / X_test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# NOTE: `param_grid`, `grid_search` and `y_pred` are rebound here and now refer
# to the regressor search; re-running the classifier-saving cell after this one
# would pick up the regressor instead.
param_grid = {
  'n_estimators': [100, 200, 300], 
  'max_depth': [None, 10, 20, 30],
  'max_features': ['sqrt', 'log2', None]
}

# 3 * 4 * 3 = 36 combinations, each evaluated with 5-fold cross-validation.
rf_regressor = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(estimator=rf_regressor, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)

# Best combination, refitted on the whole training split.
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)

# Regression metrics on the hold-out set (the target is `is_winner`).
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Best Hyperparameters:", grid_search.best_params_)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("R-squared:", r2)

# Create the target directory first: joblib.dump raises FileNotFoundError if
# it does not exist, which would discard the fitted model.
regressor_path = '../models/best_random_forrest_regressor.pkl'
Path(regressor_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_rf, regressor_path)

## Feature importance

In [None]:
# Feature importances of the tuned regressor, one value per column of X.
importances = best_rf.feature_importances_
feature_names = X.columns
features = feature_names  # alias kept so both existing names stay bound

# Sort once: ascending order drives the chart, the reversed view the listing.
sorted_idx = importances.argsort()
indices = sorted_idx[::-1]

print("Feature Importances: ")
for idx in indices:
    print(f"{feature_names[idx]}: {importances[idx] * 100:.2f}%")

# With plain array inputs plotly names the axes "x" and "y"; give the chart
# readable axis labels and a title so it stands on its own.
fig = px.bar(
    x=feature_names[sorted_idx],
    y=importances[sorted_idx],
    labels={'x': 'Feature', 'y': 'Importance'},
    title='Random forest regressor feature importances',
)
fig.show()