In [None]:
import json
import pandas as pd
import os
from tqdm.notebook import tqdm
import numpy as np

In [None]:
# --- Locate the data directory, resolved against the current working directory ---
DATA_FOLDER = 'data'
DATA_PATH = os.path.abspath(DATA_FOLDER)
DATA_PATH

In [None]:
# Load the training data from file into the train_data list.
# The file is read as JSON Lines: each non-blank line is parsed as one JSON object.

train_file_path = os.path.join(DATA_PATH, 'train.jsonl')
train_data = []

# Read and decode the file line by line
print(f"Loading data from '{train_file_path}'...")
try:
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(train_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Skip blank lines (e.g. a stray empty line at the end of the file),
            # which json.loads() would reject.
            if not line.strip():
                continue
            # json.loads() parses one line (one JSON object) into a Python dictionary
            train_data.append(json.loads(line))
    print(f'Successfully loaded {len(train_data)} battles.')

except FileNotFoundError:
    print(f"ERROR: Could not find the training file at '{train_file_path}'.")
    print('Please make sure you have added the competition data to this notebook.')

except IOError:
    print(f"An error occurred while reading the file '{train_file_path}'.")

In [None]:
# OPTIONAL
# Sneak a peek into the just loaded battle data.

# Number of timeline turns kept in the preview printed below.
PREVIEW_TURNS = 2

print("\n--- Structure of the first train battle: ---")
if train_data:
    first_battle = train_data[0]
    full_timeline = first_battle.get('battle_timeline', [])

    # Shallow copy: only the top-level 'battle_timeline' entry is replaced,
    # so the battle stored in train_data keeps its full timeline.
    battle_for_display = first_battle.copy()
    battle_for_display['battle_timeline'] = full_timeline[:PREVIEW_TURNS]

    # Use json.dumps for pretty-printing the dictionary
    print(json.dumps(battle_for_display, indent=4))

    # Print the notice exactly when turns were dropped from the preview.
    if len(full_timeline) > PREVIEW_TURNS:
        print("    ...")
        print("    [battle_timeline has been truncated for display]")

# Feature Engineering

At this point all the training data has been decoded from JSON into the `train_data` list (of nested structures), so it's time to work on the features, using techniques such as feature regularization and selection.

In [None]:
# Different extract* functions: they differentiate on the number of features they extract.
# Only one of them must be called (otherwise the last called one overwrites the features dataframe).

POKEMON_STATS = ['hp', 'atk', 'def', 'spa', 'spd', 'spe']

# Placeholder written to text (name/type) features when a value is absent, so that
# those columns contain only strings.
MISSING_CATEGORY = 'missing'


def _type_pair(pokemon: dict) -> tuple[str, str]:
    """Return the first two entries of pokemon['types'], padded with MISSING_CATEGORY."""
    types = list(pokemon.get('types') or [])
    types += [MISSING_CATEGORY] * (2 - len(types))
    return types[0], types[1]


def _is_text_feature(column: str) -> bool:
    """Tell whether a feature column built below holds a name or a type string."""
    return '_name' in column or '_types_' in column


def extract_basic_features(data: list[dict]) -> pd.DataFrame:
    """
    A very basic feature extraction function.
    It only uses the aggregated base stats of the player's team and opponent's lead.

    For every battle it produces:
      * mean/min/max/std of each base stat over 'p1_team_details';
      * name, level and two types of each team member (columns suffixed 1, 2, ...);
      * base stats, name, level and two types of 'p2_lead_details';
      * 'battle_id', and 'player_won' (as int) when the battle carries it.

    Args:
        data: list of battle dictionaries.

    Returns:
        One row per battle. Missing numeric values are 0; missing name/type
        values are MISSING_CATEGORY, so text columns never mix strings and numbers.
    """
    feature_list = []
    for battle in tqdm(data, desc="Extracting features"):
        battle_features = {}

        # --- Player 1 Team Features ---
        p1_team = battle.get('p1_team_details', [])
        if p1_team:
            for stat in POKEMON_STATS:
                # Collect the stat once and reuse it for all four aggregates.
                values = [p.get(f'base_{stat}', 0) for p in p1_team]
                battle_features[f'p1_mean_{stat}'] = np.mean(values)
                battle_features[f'p1_min_{stat}'] = np.min(values)
                battle_features[f'p1_max_{stat}'] = np.max(values)
                battle_features[f'p1_std_{stat}'] = np.std(values)

            for i, p in enumerate(p1_team, start=1):
                type_1, type_2 = _type_pair(p)
                battle_features[f'p1_name_{i}'] = p.get('name', MISSING_CATEGORY)
                battle_features[f'p1_level_{i}'] = p.get('level', 0)
                battle_features[f'p1_types_1_{i}'] = type_1
                battle_features[f'p1_types_2_{i}'] = type_2

        # --- Player 2 Lead Features ---
        p2_lead = battle.get('p2_lead_details')
        if p2_lead:
            # Player 2's lead Pokemon's stats
            for stat in POKEMON_STATS:
                battle_features[f'p2_lead_{stat}'] = p2_lead.get(f'base_{stat}', 0)

            type_1, type_2 = _type_pair(p2_lead)
            battle_features['p2_lead_name'] = p2_lead.get('name', MISSING_CATEGORY)
            battle_features['p2_lead_level'] = p2_lead.get('level', 0)
            battle_features['p2_lead_types_1'] = type_1
            battle_features['p2_lead_types_2'] = type_2

        # We also need the ID and the target variable (if it exists)
        battle_features['battle_id'] = battle.get('battle_id')
        if 'player_won' in battle:
            battle_features['player_won'] = int(battle['player_won'])

        feature_list.append(battle_features)

    features_df = pd.DataFrame(feature_list)

    # Rows with fewer team members leave gaps in the per-member columns:
    # fill text gaps with the string placeholder and every other gap with 0.
    fill_values = {
        column: MISSING_CATEGORY if _is_text_feature(column) else 0
        for column in features_df.columns
    }
    return features_df.fillna(fill_values)

In [None]:
# Build the feature table for the training battles and keep it in `train_df`.
print("Processing training data...")
train_df = extract_basic_features(train_data)

In [None]:
# Optional: sneak a peek into the features dataframe (its last rows).
train_df.tail()

In [None]:
# Optional: sneak a peek into text columns.
# Columns are selected by name: any column whose name contains 'name' or 'type'.
text_cols = [col for col in train_df.columns if 'name' in col or 'type' in col]
train_df[text_cols].tail()

# Models Training and Comparison

At this point we have all the selected (possibly regularized) features in the `train_df` dataframe, so it's time to train the various models on them.

In [None]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import accuracy_score

from prettytable import PrettyTable

In [None]:
# Encode all the text columns as integer codes.
# A separate fitted encoder is kept for every column: refitting one shared encoder
# would leave it holding only the classes of the last column it saw, so the other
# mappings could neither be applied to new data nor decoded afterwards.
label_encoders = {}

text_cols = [col for col in train_df.columns if 'name' in col or 'type' in col]
for col in text_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    label_encoders[col] = le

In [None]:
# Define our features (X) and target (y)
features = [col for col in train_df.columns if col not in ['battle_id', 'player_won']]

# Any data scaling/regularization goes here
# NOTE(review): the scaler is fitted on every row of train_df, and the split below
# uses the unscaled train_df[features], not train_df_scaled.
scaler = StandardScaler().fit(train_df[features])
train_df_scaled = scaler.transform(train_df[features])

# Split the data (and decide whether to use scaling or not).
# A fixed seed puts the same rows in the train/test parts on every run,
# so scores can be compared between runs.
RANDOM_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    train_df[features],
    train_df['player_won'],
    test_size=0.3,
    random_state=RANDOM_SEED,
)

In [None]:
# PCA
# PCA keeps the directions of largest variance, so the features are standardized
# first; otherwise columns with large numeric ranges dominate the components.
# Both transforms are fitted on the training split only and then applied to the
# test split, so no test-set statistics are used during fitting.
# Note: X_train / X_test are overwritten here, so re-running this cell without
# re-running the split cell applies the reduction a second time.
N_PCA_COMPONENTS = 10

pca_scaler = StandardScaler()
pca = PCA(n_components=N_PCA_COMPONENTS)
X_train = pca.fit_transform(pca_scaler.fit_transform(X_train))
X_test = pca.transform(pca_scaler.transform(X_test))

In [None]:
# Models list (just add models with their parameters!)
# The tree-based estimators get a fixed seed so that repeated runs
# give the same fitted models and the same scores.
MODEL_SEED = 42

models = [
    LogisticRegression(max_iter=10_000),
    LogisticRegressionCV(max_iter=10_000),
    GaussianNB(),
    DecisionTreeClassifier(random_state=MODEL_SEED),
    RandomForestClassifier(n_estimators=100, random_state=MODEL_SEED),
    KNeighborsClassifier(n_neighbors=3),
]

In [None]:
# Model testing with normal training:
# every model is fitted on the training split and scored on the held-out split.
models_result = []

for model in models:
    model_name = type(model).__name__
    model.fit(X_train, y_train)
    holdout_score = model.score(X_test, y_test)
    models_result.append([model_name, holdout_score])

In [None]:
# Model testing with K-fold cross-validation training - Alternative to the above!
N_SPLITS = 18
models_result = []

# The splitter does not depend on the model, so it is built once.
kfold = KFold(n_splits=N_SPLITS)

for model in models:
    # Scaling is made part of the cross-validated estimator, so in every fold the
    # scaler is fitted on that fold's training rows only; scaling the whole
    # dataset beforehand would let the validation rows influence the scaling.
    scaled_model = make_pipeline(StandardScaler(), model)
    cv_results = cross_val_score(
        scaled_model,
        train_df[features],
        train_df['player_won'],
        cv=kfold,
        scoring='accuracy',
    )
    models_result.append([model.__class__.__name__, cv_results.mean()])

In [None]:
# Show the collected scores as a table, sorted ascending by score
# (so the best-scoring model is the last row).
results_table = PrettyTable()
results_table.field_names = ['Model Name', 'Accuracy']
results_table.align['Model Name'] = 'r'
results_table.align['Accuracy'] = 'l'

# Scores are displayed as percentages rounded to three decimals.
rows_as_percent = [[name, round(score * 100, 3)] for name, score in models_result]
rows_as_percent.sort(key=lambda row: row[1])
results_table.add_rows(rows_as_percent)
print(results_table)