In [32]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, KFold, cross_val_score, RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix,  classification_report
from sklearn.preprocessing import OneHotEncoder
import joblib
import plotly.express as px


In [33]:
# Load the raw battle results and keep `data` untouched; all downstream work
# happens on the `df` copy.
data = pd.read_csv('../data/battle_results_top_10.csv')

# `data.copy()` already returns a DataFrame, so no further wrapping is needed.
df = data.copy()

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Fit a dense one-hot encoder on the two categorical columns and write it to
# disk. Categories not seen during fitting are ignored at transform time
# (handle_unknown='ignore').
categorical_cols = ['type', 'pokemon_class']

encoder = OneHotEncoder(
    sparse_output=False,
    handle_unknown='ignore',
).fit(df[categorical_cols])

joblib.dump(encoder, '../models/onehot_encoder.pkl')

In [36]:
# Reload the persisted encoder and replace the categorical columns of `df`
# with their one-hot encoded counterparts.
encoder = joblib.load('../models/onehot_encoder.pkl')
categorical_cols = ['type', 'pokemon_class']

# Give the encoded frame the same index as `df`: pd.concat(axis=1) aligns on
# the index, so a fresh RangeIndex would misalign rows (and introduce NaNs)
# whenever `df` does not carry a default 0..n-1 index.
onehot_frame = pd.DataFrame(
    encoder.transform(df[categorical_cols]),
    columns=encoder.get_feature_names_out(categorical_cols),
    index=df.index,
)

df_encoded = pd.concat([df.drop(columns=categorical_cols), onehot_frame], axis=1)


## XGBoost

In [None]:
# Train a baseline XGBoost classifier on the numeric battle features plus the
# one-hot encoded pokemon_class columns, and score it on a 20% hold-out split.
cols_to_use = [
    'attack', 'defense', 'speed', 'hp', 'effectiveness', 'attack_first',
    'pokemon_class_Legendary', 'pokemon_class_Mythical', 'pokemon_class_Normal',
]

X = df_encoded[cols_to_use]
y = df_encoded['is_winner']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed the classifier as well as the split so the baseline is reproducible.
model = xgb.XGBClassifier(eval_metric='logloss', random_state=42)
model.fit(X_train, y_train)
accuracy = model.score(X_test, y_test)

# Persist the feature names, in training order, next to the other artefacts.
encoded_columns = X.columns.tolist()
joblib.dump(encoded_columns, '../models/encoded_columns.pkl')

# Hold-out accuracy as a percentage (the message is intentionally in Swedish).
print(f'Modellens noggrannhet: {accuracy * 100:.2f}%')


## Confusion Matrix & Classification Report

In [None]:
# Evaluate the baseline model on the hold-out split: confusion matrix first,
# then the per-class precision/recall/F1 report.
separator = '____________________________________'

y_pred = model.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred, zero_division=1)

print('Confusion Matrix: \n', conf_matrix)
print(separator, end='\n\n')
print(f'Report:\n{report}')

## Cross Validation Accuracy for XGBoost

In [None]:
# 5-fold shuffled cross-validation of the baseline estimator on the full
# feature matrix; report the mean accuracy across folds.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)
scores = cross_val_score(model, X, y, scoring='accuracy', cv=kf)

mean_accuracy = scores.mean()
print(f"Cross-Validation Accuracy: {mean_accuracy:.2f}")

## XGBoost x RandomizedSearchCV

In [None]:
# Hyper-parameter search space: every entry is a discrete list of candidate
# values that RandomizedSearchCV samples from.
search_space = {
    'n_estimators': [100, 200, 300],
    'max_depth': [1, 3, 5],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.3, 0.6, 1.0],
    'gamma': [0, 0.1, 0.2],
    'reg_alpha': [0, 0.1, 0.2],
    # NOTE(review): 'lambda' is the native-booster name for L2 regularisation;
    # the scikit-learn wrapper documents it as `reg_lambda` — confirm this key
    # is intended. The value list [1, 1.5, 0.2] also looks like it may have
    # been meant as [1, 1.5, 2] — confirm.
    'lambda': [1, 1.5, 0.2],
    'min_child_weight': [1],
}

# 100 sampled candidates, each scored by F1 with 5-fold CV on the training split.
random_search = RandomizedSearchCV(
    estimator=xgb.XGBClassifier(random_state=42),
    param_distributions=search_space,
    n_iter=100,
    scoring='f1',
    cv=5,
    verbose=1,
    random_state=42,
)

random_search.fit(X_train, y_train)


In [None]:
# Take the refitted best estimator from the search, report its parameters and
# predict on the hold-out split (this overwrites the baseline `y_pred`).
best_model = random_search.best_estimator_
print(f"Best parameters found:\n{random_search.best_params_}")

y_pred = best_model.predict(X_test)

## Feature importance

In [None]:
# Feature importances of the tuned model: printed as a ranking (largest first)
# and drawn as a bar chart (smallest first).
importances = best_model.feature_importances_
feature_names = X.columns

# One argsort serves both views: descending for the printout, ascending for the chart.
sorted_idx = np.argsort(importances)
indices = sorted_idx[::-1]

print("Feature Importances: ")
for idx in indices:
    print(f"{feature_names[idx]}: {importances[idx] * 100:.2f}%")

# Title and axis labels let the figure stand on its own when skimmed.
fig = px.bar(
    x=feature_names[sorted_idx],
    y=importances[sorted_idx],
    labels={'x': 'Feature', 'y': 'Importance'},
    title='XGBoost feature importances (tuned model)',
)
fig.show()

In [None]:
# Hold-out evaluation for the current `y_pred`: confusion matrix, per-class
# report and overall accuracy, separated by rule lines.
separator = '____________________________________'

conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred, zero_division=1)

print('Confusion Matrix: \n', conf_matrix)
print(separator, end='\n\n')
print(f'Report:\n{report}')
print(separator, end='\n\n')
print('Accuracy Score: \n', accuracy_score(y_test, y_pred))



In [None]:
# Print the tuned model's feature importances as percentages, largest first.
feature_names = X.columns
importances = best_model.feature_importances_

# argsort is ascending; reverse it to rank from most to least important.
indices = np.argsort(importances)[::-1]

print("Feature Importances: ")
for position in range(X.shape[1]):
    column_idx = indices[position]
    print(f"{feature_names[column_idx]}: {importances[column_idx] * 100:.2f}%")

In [None]:
# Bar chart of the tuned model's feature importances, ordered ascending.
features = X.columns
importance = best_model.feature_importances_
sorted_idx = np.argsort(importance)

fig = px.bar(
    x=features[sorted_idx],
    y=importance[sorted_idx],
)
fig.show()

In [None]:
# Persist the fitted classifier to disk.
# NOTE(review): this saves `model` (the baseline fitted before the search),
# not the tuned `best_model` from RandomizedSearchCV — confirm which one is
# intended to be shipped.
joblib.dump(model, '../models/xgboost_model.pkl')