In [1]:
import json
import pandas as pd
import os
from tqdm.notebook import tqdm
import numpy as np

from sklearn.model_selection import train_test_split, ShuffleSplit, KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, HistGradientBoostingClassifier, BaggingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC

from sklearn.metrics import accuracy_score

from prettytable import PrettyTable

In [2]:
# --- Define the path to our data ---
DATA_FOLDER = 'data'
DATA_PATH = os.path.join(os.path.abspath(''), DATA_FOLDER)
DATA_PATH

'C:\\Users\\emazep\\Dropbox\\uni\\2025-2026\\FDS\\FDS-Pokemon\\data'

In [3]:
# Load the training data from file into train_data list

train_file_path = os.path.join(DATA_PATH, 'train.jsonl')
train_data = []

# Read and decode the file line by line
print(f"Loading data from '{train_file_path}'...")
try:
    with open(train_file_path, 'r') as f:
        for line in f:
            # json.loads() parses one line (one JSON object) into a Python dictionary
            train_data.append(json.loads(line))
    print(f'Successfully loaded {len(train_data)} battles.')

except FileNotFoundError:
    print(f"ERROR: Could not find the training file at '{train_file_path}'.")
    print('Please make sure you have added the competition data to this notebook.')

except IOError:
    print(f"An error occurred while reading the file '{train_file_path}'.")

Loading data from 'C:\Users\emazep\Dropbox\uni\2025-2026\FDS\FDS-Pokemon\data\train.jsonl'...
Successfully loaded 10000 battles.


In [4]:
# OPTIONAL
# Sneak a peek into the just loaded battle data.

print("\n--- Structure of the first train battle: ---")
if train_data:
    first_battle = train_data[0]
    
    # To keep the output clean, we can create a copy and truncate the timeline
    battle_for_display = first_battle.copy()
    battle_for_display['battle_timeline'] = battle_for_display.get('battle_timeline', [])[:5]
    
    # Use json.dumps for pretty-printing the dictionary
    print(json.dumps(battle_for_display, indent=4))
    if len(first_battle.get('battle_timeline', [])) > 5:
        print("    ...")
        print("    [battle_timeline has been truncated for display]")


--- Structure of the first train battle: ---
{
    "player_won": true,
    "p1_team_details": [
        {
            "name": "starmie",
            "level": 100,
            "types": [
                "psychic",
                "water"
            ],
            "base_hp": 60,
            "base_atk": 75,
            "base_def": 85,
            "base_spa": 100,
            "base_spd": 100,
            "base_spe": 115
        },
        {
            "name": "exeggutor",
            "level": 100,
            "types": [
                "grass",
                "psychic"
            ],
            "base_hp": 95,
            "base_atk": 95,
            "base_def": 85,
            "base_spa": 125,
            "base_spd": 125,
            "base_spe": 55
        },
        {
            "name": "chansey",
            "level": 100,
            "types": [
                "normal",
                "notype"
            ],
            "base_hp": 250,
            "base_atk": 5,
            "base_d

# Feature Engineering

At this point we have all the training data, decoded from JSON, in the `train_data` list (of nested structures), so it's time to work on the features, through techniques such as features regularization and selection.

In [5]:
# Different extract* functions: they differentiate on the number of features they extract.
# Only one of them must be called (otherwise the last called one overwrites the features dataframe).

POKEMON_STATS = ['hp', 'atk', 'def', 'spa', 'spd', 'spe']

def extract_features(data: list[dict]) -> pd.DataFrame:
    """
    This is the most complete features extractor.
    """
    feature_list = []
    for battle in tqdm(data, desc="Extracting features"):
        battle_features = {}
        
        # --- Player 1 Team Features ---
        #p1_team = battle.get('p1_team_details', [])
        p1_team = battle['p1_team_details']
        #if p1_team:
        for stat in POKEMON_STATS:
            battle_features[f'p1_mean_{stat}'] = np.mean([p[f'base_{stat}'] for p in p1_team])
            #battle_features[f'p1_min_{stat}'] = np.min([p.get(f'base_{stat}', 0) for p in p1_team])
            #battle_features[f'p1_max_{stat}'] = np.max([p.get(f'base_{stat}', 0) for p in p1_team])
            #battle_features[f'p1_std_{stat}'] = np.std([p.get(f'base_{stat}', 0) for p in p1_team])
        
        i = 1
        for p in p1_team:
            #battle_features[f'p1_name_{i}'] = p['name']
            #battle_features[f'p1_level_{i}'] = p['level']
            #battle_features[f'p1_types_1_{i}'] = p['types'][0]
            #battle_features[f'p1_types_2_{i}'] = p['types'][1]
            i += 1            
        
        battle_features['p1_mean_level'] = np.mean([p.get('level', 0) for p in p1_team])

        # --- Player 2 Lead battle_features ---
        p2_lead = battle['p2_lead_details']
        #if p2_lead:
        # Player 2's lead Pokémon's stats
        for stat in POKEMON_STATS:
            battle_features[f'p2_lead_{stat}'] = p2_lead[f'base_{stat}']
            
        #battle_features['p2_lead_name'] = p2_lead['name']
        battle_features['p2_lead_level'] = p2_lead['level']
        #battle_features['p2_lead_types_1'] = p2_lead['types'][0]
        #battle_features['p2_lead_types_2'] = p2_lead['types'][1]

        # We also need the ID and the target variable (if they exist)
        battle_features['battle_id'] = battle.get('battle_id')
        battle_features['player_won'] = int(battle['player_won'])

        # Moves
        #battle_features['moves'] = len(battle['battle_timeline'])

        battle_features['p1_mean_hp_pct'] = np.mean([move['p1_pokemon_state']['hp_pct'] for move in battle['battle_timeline']])
        battle_features['p2_mean_hp_pct'] = np.mean([move['p2_pokemon_state']['hp_pct'] for move in battle['battle_timeline']])

        '''
        battle_features['hp_advantage'] = \
            sum([move['p1_pokemon_state']['hp_pct'] >= move['p2_pokemon_state']['hp_pct'] for move in battle['battle_timeline']])
        '''

        battle_features['hp_last_advantage'] = \
            battle['battle_timeline'][-1]['p1_pokemon_state']['hp_pct'] >= \
            battle['battle_timeline'][-1]['p2_pokemon_state']['hp_pct']

        battle_features['p1_hp_shortage'] = sum(
            [move['p1_pokemon_state']['hp_pct'] < 0.1 for move in battle['battle_timeline']]
        )
        battle_features['p2_hp_shortage'] = sum(
            [move['p2_pokemon_state']['hp_pct'] < 0.1 for move in battle['battle_timeline']]
        )


        battle_features['p1_mean_accuracy'] = \
            np.mean([move['p1_move_details']['accuracy'] for move in battle['battle_timeline'] if move['p1_move_details']])
        battle_features['p2_mean_accuracy'] = \
            np.mean([move['p2_move_details']['accuracy'] for move in battle['battle_timeline'] if move['p2_move_details']])

        # This feature doesn't seem to give any advantage.
        '''        
        battle_features['p1_mean_base_power'] = \
            np.mean([move['p1_move_details']['base_power'] for move in battle['battle_timeline'] if move['p1_move_details']])
        battle_features['p2_mean_base_power'] = \
            np.mean([move['p2_move_details']['base_power'] for move in battle['battle_timeline'] if move['p2_move_details']])
        '''

        battle_features['power_advantage'] = \
            sum([move['p1_move_details']['base_power'] <= move['p2_move_details']['base_power'] for move in battle['battle_timeline'] \
                 if move['p1_move_details'] and move['p2_move_details']])
        
        # This feature doesn't seem to give any advantage.
        ''' 
        battle_features['priority_advantage'] = \
            sum([move['p1_move_details']['priority'] < move['p2_move_details']['priority'] for move in battle['battle_timeline'] \
                 if move['p1_move_details'] and move['p2_move_details']])
        '''
        
        battle_features['p1_mean_priority'] = \
            np.mean([move['p1_move_details']['priority'] for move in battle['battle_timeline'] if move['p1_move_details']])
        battle_features['p2_mean_priority'] = \
            np.mean([move['p2_move_details']['priority'] for move in battle['battle_timeline'] if move['p2_move_details']])

        battle_features['priority_advantage'] = \
            sum([move['p1_move_details']['priority'] > move['p2_move_details']['priority'] for move in battle['battle_timeline'] \
                 if move['p1_move_details'] and move['p2_move_details']])
            
        feature_list.append(battle_features)
        
    return pd.DataFrame(feature_list).fillna(0)

In [48]:
# Alternative extract_features()
def extract_features(data: list[dict]) -> pd.DataFrame:
    """
    This function extracts features only from the battle moves
    """
    feature_list = []

    for battle in tqdm(data, desc="Extracting features"):
        battle_features, battle_features_tmp = {}, {}

        # We also need the ID and the target variable (if they exist)
        battle_features['battle_id'] = battle.get('battle_id')
        battle_features['player_won'] = int(battle['player_won'])

        battle_features_tmp['p1_hp_shortage'] = sum(
            [move['p1_pokemon_state']['hp_pct'] < 0.02 for move in battle['battle_timeline']]
        )
        battle_features_tmp['p2_hp_shortage'] = sum(
            [move['p2_pokemon_state']['hp_pct'] < 0.02 for move in battle['battle_timeline']]
        )

        battle_features['p1_hp_shortage_advantage'] = battle_features_tmp['p1_hp_shortage'] < battle_features_tmp['p2_hp_shortage']

        battle_features['p1_hp_last_advantage'] = \
            battle['battle_timeline'][-1]['p1_pokemon_state']['hp_pct'] >= \
            battle['battle_timeline'][-1]['p2_pokemon_state']['hp_pct']

        battle_features['p1_mean_accuracy'] = \
            np.mean([move['p1_move_details']['accuracy'] for move in battle['battle_timeline'] if move['p1_move_details']])
        battle_features['p2_mean_accuracy'] = \
            np.mean([move['p2_move_details']['accuracy'] for move in battle['battle_timeline'] if move['p2_move_details']])

        battle_features['p1_mean_priority'] = \
            np.mean([move['p1_move_details']['priority'] for move in battle['battle_timeline'] if move['p1_move_details']])
        battle_features['p2_mean_priority'] = \
            np.mean([move['p2_move_details']['priority'] for move in battle['battle_timeline'] if move['p2_move_details']])

        '''        
        battle_features['p1_advantage'] = \
        battle_features['p1_hp_shortage_advantage'] and \
        battle_features['p1_hp_last_advantage']
        '''
        
        feature_list.append(battle_features)
        
    return pd.DataFrame(feature_list).fillna(0)

In [6]:
print("Processing training data...")
train_df = extract_features(train_data)

Processing training data...


Extracting features:   0%|          | 0/10000 [00:00<?, ?it/s]

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [7]:
# Optional: sneak a peek into the features dataframe.
train_df.head()

Unnamed: 0,p1_mean_hp,p1_mean_atk,p1_mean_def,p1_mean_spa,p1_mean_spd,p1_mean_spe,p1_mean_level,p2_lead_hp,p2_lead_atk,p2_lead_def,...,p2_mean_hp_pct,hp_last_advantage,p1_hp_shortage,p2_hp_shortage,p1_mean_accuracy,p2_mean_accuracy,power_advantage,p1_mean_priority,p2_mean_priority,priority_advantage
0,115.833333,72.5,63.333333,100.0,100.0,80.0,100.0,60,75,85,...,0.559756,True,1,4,0.925926,0.9875,6,0.0,0.0,0
1,123.333333,72.5,65.833333,90.0,90.0,61.666667,100.0,55,50,45,...,0.623,False,3,0,0.963043,0.969565,9,-0.043478,0.0,0
2,124.166667,84.166667,71.666667,90.0,90.0,65.833333,100.0,250,5,5,...,0.785333,False,1,0,0.944444,0.943182,15,0.0,0.0,0
3,121.666667,77.5,65.833333,103.333333,103.333333,75.833333,100.0,75,100,95,...,0.680667,False,5,4,0.954348,0.94,15,0.0,0.0,0
4,114.166667,75.833333,79.166667,97.5,97.5,72.5,100.0,60,75,85,...,0.650333,True,1,2,0.990385,0.971154,14,0.0,0.0,0


In [117]:
# Optional: sneak a peek into text columns.
text_cols = [col for col in train_df.columns if 'name' in col or 'type' in col]
train_df[text_cols].tail()

0


# Models Training and Comparison

At this point we have all the selected features in to the `train_df` dataframe, so it's time to train the various models on them.

In [37]:
# Encode all the text columns
le = LabelEncoder()

text_cols = [col for col in train_df.columns if 'name' in col or 'type' in col]
for col in text_cols: 
    train_df[col] = le.fit_transform(train_df[col])

In [8]:
# Define our features (X) and target (y)
features = [col for col in train_df.columns if col not in ['battle_id', 'player_won']]

# Any data scaling/regularization goes here
scaler = StandardScaler().fit(train_df[features])
train_df_scaled = scaler.transform(train_df[features])

# Split the data (and decide whether to use scaling or not).
X_train, X_test, y_train, y_test = train_test_split(train_df_scaled, train_df['player_won'], test_size=0.25)

In [104]:
# PCA (optional)
pca = PCA(n_components=22)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

In [9]:
# Models list (just add models with their parameters!)
models = [
    LogisticRegression(max_iter=10_000),
    LogisticRegressionCV(max_iter=10_000),
    SGDClassifier(max_iter=10_000, tol=1e-3),
    GaussianNB(),
    DecisionTreeClassifier(random_state=0, criterion='entropy', max_depth=5),
    HistGradientBoostingClassifier(max_iter=10_000),
    BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5),
    LinearSVC(random_state=0,dual=False),
    RandomForestClassifier(n_estimators=100),
    KNeighborsClassifier(n_neighbors=20),
    #GaussianProcessClassifier(),
    AdaBoostClassifier(),
    MLPClassifier(max_iter=10_000)
]

In [21]:
# Model testing with normal training
models_result = []

for model in models:
    model.fit(X_train, y_train)
    models_result.append([model.__class__.__name__, model.score(X_test, y_test)])

In [32]:
# Model testing with cross-validation training - Alternative to the above!
models_result = []

for model in models:
    cv = ShuffleSplit(n_splits=4, test_size=0.25, random_state=0)
    cv_results = cross_val_score(model, train_df_scaled, train_df['player_won'], cv=cv)
    print(f'{model.__class__.__name__:<32} mean: {cv_results.mean():.3f}\tmin: {cv_results.min():.3f}\tmax: {cv_results.max():.3f}')
    models_result.append([model.__class__.__name__, cv_results.mean()])

LogisticRegression               mean: 0.745	min: 0.740	max: 0.748
LogisticRegressionCV             mean: 0.745	min: 0.740	max: 0.748
SGDClassifier                    mean: 0.721	min: 0.709	max: 0.732
GaussianNB                       mean: 0.606	min: 0.492	max: 0.724
DecisionTreeClassifier           mean: 0.729	min: 0.724	max: 0.735
HistGradientBoostingClassifier   mean: 0.722	min: 0.717	max: 0.730
BaggingClassifier                mean: 0.713	min: 0.706	max: 0.723
LinearSVC                        mean: 0.744	min: 0.736	max: 0.748
RandomForestClassifier           mean: 0.746	min: 0.739	max: 0.754
KNeighborsClassifier             mean: 0.711	min: 0.702	max: 0.722
AdaBoostClassifier               mean: 0.740	min: 0.735	max: 0.748
MLPClassifier                    mean: 0.699	min: 0.688	max: 0.705


In [33]:
results_table = PrettyTable()
results_table.field_names = ['Model Name', 'Accuracy']
results_table.align['Model Name'] = 'r'
results_table.align['Accuracy'] = 'l'
results_table.add_rows(sorted([[result[0], round(result[1]*100, 3)] for result in models_result], key=lambda row: row[1]))
print(results_table)

+--------------------------------+----------+
|                     Model Name | Accuracy |
+--------------------------------+----------+
|                     GaussianNB | 60.61    |
|                  MLPClassifier | 69.94    |
|           KNeighborsClassifier | 71.1     |
|              BaggingClassifier | 71.3     |
|                  SGDClassifier | 72.09    |
| HistGradientBoostingClassifier | 72.19    |
|         DecisionTreeClassifier | 72.94    |
|             AdaBoostClassifier | 74.03    |
|                      LinearSVC | 74.41    |
|             LogisticRegression | 74.53    |
|           LogisticRegressionCV | 74.53    |
|         RandomForestClassifier | 74.62    |
+--------------------------------+----------+
