In [55]:
import pandas as pd
import numpy as np 
import sklearn as skl
train_df = pd.read_json('train.jsonl', lines=True)
test_df = pd.read_json('test.jsonl', lines=True)


# Dictionaries/usefull data

In [92]:
pokemon_base_stats_nested = {
    'alakazam': {
        'name': {'value': 'alakazam'},
        'level': {'value': None},  # placeholder if unknown
        'types': {'value': []},    # placeholder if unknown
        'base_hp': {'value': 55},
        'base_atk': {'value': 50},
        'base_def': {'value': 45},
        'base_spa': {'value': 135},
        'base_spd': {'value': 81},
        'base_spe': {'value': 120}
    },
    'articuno': {
        'name': {'value': 'articuno'},
        'level': {'value': None},
        'types': {'value': []},
        'base_hp': {'value': 90},
        'base_atk': {'value': 85},
        'base_def': {'value': 100},
        'base_spa': {'value': 125},
        'base_spd': {'value': 97},
        'base_spe': {'value': 85}
    },
    'chansey': {
        'name': {'value': 'chansey'},
        'level': {'value': None},
        'types': {'value': []},
        'base_hp': {'value': 250},
        'base_atk': {'value': 5},
        'base_def': {'value': 5},
        'base_spa': {'value': 105},
        'base_spd': {'value': 83},
        'base_spe': {'value': 50}
    },
    'charizard': {
        'name': {'value': 'charizard'},
        'level': {'value': None},
        'types': {'value': []},
        'base_hp': {'value': 78},
        'base_atk': {'value': 84},
        'base_def': {'value': 78},
        'base_spa': {'value': 85},
        'base_spd': {'value': 85},
        'base_spe': {'value': 100}
    },
    'cloyster': {
        'name': {'value': 'cloyster'},
        'level': {'value': None},
        'types': {'value': []},
        'base_hp': {'value': 50},
        'base_atk': {'value': 95},
        'base_def': {'value': 180},
        'base_spa': {'value': 85},
        'base_spd': {'value': 96},
        'base_spe': {'value': 70}
    },
    'dragonite': {
        'name': {'value': 'dragonite'},
        'level': {'value': None},
        'types': {'value': []},
        'base_hp': {'value': 91},
        'base_atk': {'value': 134},
        'base_def': {'value': 95},
        'base_spa': {'value': 100},
        'base_spd': {'value': 100},
        'base_spe': {'value': 80}
    },
    'exeggutor': {
        'name': {'value': 'exeggutor'},
        'level': {'value': None},
        'types': {'value': []},
        'base_hp': {'value': 95},
        'base_atk': {'value': 95},
        'base_def': {'value': 85},
        'base_spa': {'value': 125},
        'base_spd': {'value': 91},
        'base_spe': {'value': 55}
    },
    'gengar': {
        'name': {'value': 'gengar'},
        'level': {'value': None},
        'types': {'value': []},
        'base_hp': {'value': 60},
        'base_atk': {'value': 65},
        'base_def': {'value': 60},
        'base_spa': {'value': 130},
        'base_spd': {'value': 85},
        'base_spe': {'value': 110}
    },
    'golem': {
        'name': {'value': 'golem'},
        'level': {'value': None},
        'types': {'value': []},
        'base_hp': {'value': 80},
        'base_atk': {'value': 110},
        'base_def': {'value': 130},
        'base_spa': {'value': 55},
        'base_spd': {'value': 84},
        'base_spe': {'value': 45}
    },
    'jolteon': {
        'name': {'value': 'jolteon'},
        'level': {'value': None},
        'types': {'value': []},
        'base_hp': {'value': 65},
        'base_atk': {'value': 65},
        'base_def': {'value': 60},
        'base_spa': {'value': 110},
        'base_spd': {'value': 86},
        'base_spe': {'value': 130}
    },
    'jynx': {
        'name': {'value': 'jynx'},
        'level': {'value': None},
        'types': {'value': []},
        'base_hp': {'value': 65},
        'base_atk': {'value': 50},
        'base_def': {'value': 35},
        'base_spa': {'value': 95},
        'base_spd': {'value': 68},
        'base_spe': {'value': 95}
    },
    'lapras': {
        'name': {'value': 'lapras'},
        'level': {'value': None},
        'types': {'value': []},
        'base_hp': {'value': 130},
        'base_atk': {'value': 85},
        'base_def': {'value': 80},
        'base_spa': {'value': 95},
        'base_spd': {'value': 90},
        'base_spe': {'value': 60}
    },
    'persian': {
        'name': {'value': 'persian'},
        'level': {'value': None},
        'types': {'value': []},
        'base_hp': {'value': 65},
        'base_atk': {'value': 70},
        'base_def': {'value': 60},
        'base_spa': {'value': 65},
        'base_spd': {'value': 75},
        'base_spe': {'value': 115}
    },
    'rhydon': {
        'name': {'value': 'rhydon'},
        'level': {'value': None},
        'types': {'value': []},
        'base_hp': {'value': 105},
        'base_atk': {'value': 130},
        'base_def': {'value': 120},
        'base_spa': {'value': 45},
        'base_spd': {'value': 88},
        'base_spe': {'value': 40}
    },
    'slowbro': {
        'name': {'value': 'slowbro'},
        'level': {'value': None},
        'types': {'value': []},
        'base_hp': {'value': 95},
        'base_atk': {'value': 75},
        'base_def': {'value': 110},
        'base_spa': {'value': 80},
        'base_spd': {'value': 78},
        'base_spe': {'value': 30}
    },
    'snorlax': {
        'name': {'value': 'snorlax'},
        'level': {'value': None},
        'types': {'value': []},
        'base_hp': {'value': 160},
        'base_atk': {'value': 110},
        'base_def': {'value': 65},
        'base_spa': {'value': 65},
        'base_spd': {'value': 86},
        'base_spe': {'value': 30}
    },
    'starmie': {
        'name': {'value': 'starmie'},
        'level': {'value': None},
        'types': {'value': []},
        'base_hp': {'value': 60},
        'base_atk': {'value': 75},
        'base_def': {'value': 85},
        'base_spa': {'value': 100},
        'base_spd': {'value': 87},
        'base_spe': {'value': 115}
    },
    'tauros': {
        'name': {'value': 'tauros'},
        'level': {'value': None},
        'types': {'value': []},
        'base_hp': {'value': 75},
        'base_atk': {'value': 100},
        'base_def': {'value': 95},
        'base_spa': {'value': 70},
        'base_spd': {'value': 90},
        'base_spe': {'value': 110}
    },
    'victreebel': {
        'name': {'value': 'victreebel'},
        'level': {'value': None},
        'types': {'value': []},
        'base_hp': {'value': 80},
        'base_atk': {'value': 105},
        'base_def': {'value': 65},
        'base_spa': {'value': 100},
        'base_spd': {'value': 84},
        'base_spe': {'value': 70}
    },
    'zapdos': {
        'name': {'value': 'zapdos'},
        'level': {'value': None},
        'types': {'value': []},
        'base_hp': {'value': 90},
        'base_atk': {'value': 90},
        'base_def': {'value': 85},
        'base_spa': {'value': 125},
        'base_spd': {'value': 98},
        'base_spe': {'value': 100}
    }
}


# Flatten base stats into arrays
pokemon_embeddings = {}
for name, data in pokemon_base_stats_nested.items():
    pokemon_embeddings[name] = np.array([
        data['base_hp']['value'],
        data['base_atk']['value'],
        data['base_def']['value'],
        data['base_spa']['value'],
        data['base_spd']['value'],
        data['base_spe']['value']
    ])


In [58]:
# Mappa di efficacia dei tipi per la Generazione 1
# Nota: 'Special' in Gen 1 copre sia Atk Sp. che Def Sp.
# Non ci sono tipi Dark, Steel, o Fairy.
TYPE_CHART_GEN1 = {
    'NORMAL': {'ROCK': 0.5, 'GHOST': 0.0},
    'FIRE': {'FIRE': 0.5, 'WATER': 0.5, 'GRASS': 2.0, 'ICE': 2.0, 'BUG': 2.0, 'ROCK': 0.5},
    'WATER': {'FIRE': 2.0, 'WATER': 0.5, 'GRASS': 0.5, 'GROUND': 2.0, 'ROCK': 2.0, 'DRAGON': 0.5},
    'ELECTRIC': {'WATER': 2.0, 'ELECTRIC': 0.5, 'GRASS': 0.5, 'GROUND': 0.0, 'FLYING': 2.0, 'DRAGON': 0.5},
    'GRASS': {'FIRE': 0.5, 'WATER': 2.0, 'ELECTRIC': 1.0, 'GRASS': 0.5, 'POISON': 0.5, 'GROUND': 2.0, 'FLYING': 0.5, 'BUG': 0.5, 'ROCK': 2.0, 'DRAGON': 0.5},
    'ICE': {'WATER': 0.5, 'GRASS': 2.0, 'ICE': 0.5, 'GROUND': 2.0, 'FLYING': 2.0, 'DRAGON': 2.0},
    'FIGHTING': {'NORMAL': 2.0, 'POISON': 0.5, 'FLYING': 0.5, 'PSYCHIC': 0.5, 'BUG': 0.5, 'ROCK': 2.0, 'GHOST': 0.0},
    'POISON': {'GRASS': 2.0, 'POISON': 0.5, 'GROUND': 0.5, 'BUG': 2.0, 'ROCK': 0.5, 'GHOST': 0.5},
    'GROUND': {'FIRE': 2.0, 'ELECTRIC': 2.0, 'GRASS': 0.5, 'POISON': 2.0, 'FLYING': 0.0, 'BUG': 0.5, 'ROCK': 2.0},
    'FLYING': {'ELECTRIC': 0.5, 'GRASS': 2.0, 'FIGHTING': 2.0, 'BUG': 2.0, 'ROCK': 0.5},
    'PSYCHIC': {'FIGHTING': 2.0, 'POISON': 2.0, 'PSYCHIC': 0.5, 'GHOST': 1.0}, # In Gen 1, Psychic era immune a Ghost per un bug, ma i dati Showdown potrebbero averlo corretto. Assumiamo 1.0 per sicurezza, o 0.0 se il bug è emulato. Qui usiamo 1.0.
    'BUG': {'FIRE': 0.5, 'GRASS': 2.0, 'FIGHTING': 0.5, 'POISON': 2.0, 'FLYING': 0.5, 'PSYCHIC': 2.0},
    'ROCK': {'FIRE': 2.0, 'ICE': 2.0, 'FIGHTING': 0.5, 'GROUND': 0.5, 'FLYING': 2.0, 'BUG': 2.0},
    'GHOST': {'NORMAL': 0.0, 'PSYCHIC': 0.0, 'GHOST': 2.0}, # Famoso bug: Lick (Ghost) non colpisce Psychic.
    'DRAGON': {'DRAGON': 2.0},
}

# Funzione helper per calcolare l'efficacia
def get_type_effectiveness(move_type, target_types):
    if move_type not in TYPE_CHART_GEN1:
        return 1.0
    
    multiplier = 1.0
    chart_for_move = TYPE_CHART_GEN1[move_type]
    
    for target_type in target_types:
        if target_type in chart_for_move:
            multiplier *= chart_for_move[target_type]
            
    return multiplier

# Pokémon dominanti nel metagame Gen 1 OU (S-Tier e A-Tier)
# La loro presenza è un segnale fortissimo.
META_THREATS_GEN1 = {
    'Snorlax', 'Tauros', 'Chansey', 'Alakazam', 'Starmie', 'Exeggutor', 
    'Zapdos', 'Jolteon', 'Rhydon', 'Golem', 'Lapras'
}

# Mosse di setup o status chiave
STATUS_MOVES = {'Thunder Wave', 'Sleep Powder', 'Sing', 'Toxic', 'Lovely Kiss', 'Spore', 'Stun Spore', 'Glare'}
SETUP_MOVES = {'Amnesia', 'Swords Dance', 'Agility', 'Growth'}

In [72]:


# Define strength-based encoding
POKEMON_RANKING = {
    'alakazam': 1, 'dragonite': 2, 'zapdos': 3, 'articuno': 4, 'gengar': 5,
    'snorlax': 6, 'lapras': 7, 'jolteon': 8, 'exeggutor': 9, 'rhydon': 10,
    'charizard': 11, 'starmie': 12, 'cloyster': 13, 'chansey': 14, 'victreebel': 15,
    'slowbro': 16, 'persian': 17, 'jynx': 18, 'golem': 19, 'tauros': 20
}

POKEMON_LIST = sorted(POKEMON_RANKING, key=lambda k: POKEMON_RANKING[k])  # sorted by rank

def encode_pokemon_set(pokemon_set):
    """Convert a set of Pokémon names into a multi-hot vector based on strength ranking."""
    vector = np.zeros(len(POKEMON_LIST), dtype=int)
    if not pokemon_set:
        return vector
    for p in pokemon_set:
        rank = POKEMON_RANKING.get(p)
        if rank:  # skip unknown Pokémon
            vector[rank - 1] = 1  # ranks start at 1, indices start at 0
    return vector


def get_pokemons_seen_in_battle(battle_timeline):
    """
    Extracts the set of Pokémon seen for each player during the battle.

    Args:
        battle_timeline (list): list of turn dictionaries. Each contains
            'p1_pokemon_state' and 'p2_pokemon_state' dicts with a 'name' key.

    Returns:
        (p1_seen, p2_seen): sets of Pokémon names for player 1 and player 2.
    """
    p1_seen = set()
    p2_seen = set()

    for turn in battle_timeline:
        # Player 1 Pokémon
        if 'p1_pokemon_state' in turn and isinstance(turn['p1_pokemon_state'], dict):
            name = turn['p1_pokemon_state'].get('name')
            if name:
                p1_seen.add(name)

        # Player 2 Pokémon
        if 'p2_pokemon_state' in turn and isinstance(turn['p2_pokemon_state'], dict):
            name = turn['p2_pokemon_state'].get('name')
            if name:
                p2_seen.add(name)

    return p1_seen, p2_seen

# --- helper function ---
def get_base_stats(name):
    """Return base stats from the nested dictionary or default neutral values."""
    base = pokemon_base_stats_nested.get(name.lower())
    if base:
        return {
            'base_hp': base['base_hp']['value'],
            'base_atk': base['base_atk']['value'],
            'base_def': base['base_def']['value'],
            'base_spa': base['base_spa']['value'],
            'base_spd': base['base_spd']['value'],
            'base_spe': base['base_spe']['value']
        }
    # fallback if not in the dictionary
    return {
        'base_hp': 80,
        'base_atk': 80,
        'base_def': 80,
        'base_spa': 80,
        'base_spd': 80,
        'base_spe': 80
    }






## Data exploration

In [59]:
import json

def print_json_structure(data, indent=0):
    prefix = " " * indent
    if isinstance(data, dict):
        for key, value in data.items():
            print(f"{prefix}- {key} ({type(value).__name__})")
            print_json_structure(value, indent + 2)
    elif isinstance(data, list):
        print(f"{prefix}- [list of {len(data)} items]")
        if len(data) > 0:
            print_json_structure(data[0], indent + 2)
    else:
        print(f"{prefix}- value ({type(data).__name__})")


def describe_jsonl_structure(file_path):
    """Reads the first JSON line and prints its structure."""
    with open(file_path, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            data = json.loads(line)
            print_json_structure(data)
            break

# Example usage:
describe_jsonl_structure("train.jsonl")


- player_won (bool)
  - value (bool)
- p1_team_details (list)
  - [list of 6 items]
    - name (str)
      - value (str)
    - level (int)
      - value (int)
    - types (list)
      - [list of 2 items]
        - value (str)
    - base_hp (int)
      - value (int)
    - base_atk (int)
      - value (int)
    - base_def (int)
      - value (int)
    - base_spa (int)
      - value (int)
    - base_spd (int)
      - value (int)
    - base_spe (int)
      - value (int)
- p2_lead_details (dict)
  - name (str)
    - value (str)
  - level (int)
    - value (int)
  - types (list)
    - [list of 2 items]
      - value (str)
  - base_hp (int)
    - value (int)
  - base_atk (int)
    - value (int)
  - base_def (int)
    - value (int)
  - base_spa (int)
    - value (int)
  - base_spd (int)
    - value (int)
  - base_spe (int)
    - value (int)
- battle_timeline (list)
  - [list of 30 items]
    - turn (int)
      - value (int)
    - p1_pokemon_state (dict)
      - name (str)
        - value (str)

In [60]:
import json

# assume TYPE_CHART_GEN1, get_type_effectiveness, META_THREATS_GEN1, STATUS_MOVES, SETUP_MOVES are already defined

def analyze_turn_both_teams(turn_info, team1_state, team2_state):
    p1_state = turn_info["p1_pokemon_state"]
    p2_state = turn_info["p2_pokemon_state"]
    p1_move = turn_info.get("p1_move_details")
    p2_move = turn_info.get("p2_move_details")
    
    p1_name = p1_state["name"]
    p2_name = p2_state["name"]

    print(f"\nTurn {turn_info['turn']}:")

    # Player 1 move
    if p1_move:
        move_name = p1_move['name']
        move_type = p1_move['type'].upper()
        effectiveness = get_type_effectiveness(move_type, p2_state.get("types", ["notype"]))
        status_flag = move_name in STATUS_MOVES
        setup_flag = move_name in SETUP_MOVES
        print(f"  {p1_name} used {move_name} ({move_type}) -> Effectiveness: x{effectiveness}")
        if status_flag: print("    -> Status move")
        if setup_flag: print("    -> Setup move")
    else:
        print(f"  {p1_name} did not move.")

    # Player 2 move
    if p2_move:
        move_name = p2_move['name']
        move_type = p2_move['type'].upper()
        effectiveness = get_type_effectiveness(move_type, p1_state.get("types", ["notype"]))
        status_flag = move_name in STATUS_MOVES
        setup_flag = move_name in SETUP_MOVES
        print(f"  {p2_name} used {move_name} ({move_type}) -> Effectiveness: x{effectiveness}")
        if status_flag: print("    -> Status move")
        if setup_flag: print("    -> Setup move")
    else:
        print(f"  {p2_name} did not move.")

    # Update HP states
    team1_state[p1_name] = p1_state['hp_pct']
    if p2_name not in team2_state:
        team2_state[p2_name] = p2_state['hp_pct']  # add new Pokémon to team 2
    else:
        team2_state[p2_name] = p2_state['hp_pct']

    # Show team states
    print("\n  Player 1 Team Status:")
    for poke, hp in team1_state.items():
        print(f"    - {poke}: {hp*100:.1f}% HP")

    print("\n  Player 2 Team Status:")
    for poke, hp in team2_state.items():
        print(f"    - {poke}: {hp*100:.1f}% HP")

def simulate_first_battle_both_teams(jsonl_path):
    with open(jsonl_path, "r") as f:
        battle = json.loads(f.readline().strip())

    print("="*60)
    print(f"Battle ID: {battle['battle_id']} | Player won: {battle['player_won']}")
    print("="*60)

    # Player 1 team
    team1_state = {}
    print("\nPlayer 1 Team:")
    for p in battle["p1_team_details"]:
        print(f"  - {p['name']} (Lv {p['level']}, {', '.join(p['types'])})")
        team1_state[p['name']] = 1.0  # full HP

    # Player 2 initial lead
    team2_state = {}
    p2 = battle["p2_lead_details"]
    print("\nOpponent Lead:")
    print(f"  - {p2['name']} (Lv {p2['level']}, {', '.join(p2['types'])})")
    team2_state[p2['name']] = 1.0

    # Process turns
    for turn_info in battle["battle_timeline"]:
        analyze_turn_both_teams(turn_info, team1_state, team2_state)

    print("\n" + "="*60)
    print("Battle Ended")
    print("="*60)

# Example usage:
simulate_first_battle_both_teams("train.jsonl")


Battle ID: 0 | Player won: True

Player 1 Team:
  - starmie (Lv 100, psychic, water)
  - exeggutor (Lv 100, grass, psychic)
  - chansey (Lv 100, normal, notype)
  - snorlax (Lv 100, normal, notype)
  - tauros (Lv 100, normal, notype)
  - alakazam (Lv 100, notype, psychic)

Opponent Lead:
  - starmie (Lv 100, psychic, water)

Turn 1:
  starmie used icebeam (ICE) -> Effectiveness: x1.0
  exeggutor did not move.

  Player 1 Team Status:
    - starmie: 100.0% HP
    - exeggutor: 100.0% HP
    - chansey: 100.0% HP
    - snorlax: 100.0% HP
    - tauros: 100.0% HP
    - alakazam: 100.0% HP

  Player 2 Team Status:
    - starmie: 100.0% HP
    - exeggutor: 69.0% HP

Turn 2:
  exeggutor did not move.
  starmie did not move.

  Player 1 Team Status:
    - starmie: 100.0% HP
    - exeggutor: 100.0% HP
    - chansey: 100.0% HP
    - snorlax: 100.0% HP
    - tauros: 100.0% HP
    - alakazam: 100.0% HP

  Player 2 Team Status:
    - starmie: 100.0% HP
    - exeggutor: 69.0% HP

Turn 3:
  exeggutor u

In [61]:
from tqdm.auto import tqdm

def get_all_pokemons_used(df):
    all_pokemons = set()
    
    for _, row in tqdm(df.iterrows(), total=df.shape[0], desc="Collecting Pokémon names"):
        # Player 1 team
        for p in row.get('p1_team_details', []):
            if 'name' in p:
                all_pokemons.add(p['name'])
        
        # Player 2 lead
        p2_lead = row.get('p2_lead_details', {})
        if 'name' in p2_lead:
            all_pokemons.add(p2_lead['name'])
        
        # Pokémon seen in battle timeline
        timeline = row.get('battle_timeline', [])
        for turn in timeline:
            # Player 1
            p1_state = turn.get('p1_pokemon_state')
            if p1_state and 'name' in p1_state:
                all_pokemons.add(p1_state['name'])
            
            # Player 2
            p2_state = turn.get('p2_pokemon_state')
            if p2_state and 'name' in p2_state:
                all_pokemons.add(p2_state['name'])
    
    return sorted(all_pokemons)  # sorted list for easier reference

# Usage
all_pokemons = get_all_pokemons_used(train_df)
print(f"Total unique Pokémon: {len(all_pokemons)}")
print(all_pokemons)


Collecting Pokémon names:   0%|          | 0/10000 [00:00<?, ?it/s]

Total unique Pokémon: 20
['alakazam', 'articuno', 'chansey', 'charizard', 'cloyster', 'dragonite', 'exeggutor', 'gengar', 'golem', 'jolteon', 'jynx', 'lapras', 'persian', 'rhydon', 'slowbro', 'snorlax', 'starmie', 'tauros', 'victreebel', 'zapdos']


## Feature creation

In [103]:
def create_advanced_features_gen2(df):
    processed_data = []
    embedding_dim = 6  # hp, atk, def, spa, spd, spe

    for _, row in tqdm(df.iterrows(), total=df.shape[0], desc="Generating advanced features"):
        p1_team = row['p1_team_details']
        p2_lead = row['p2_lead_details']
        timeline = row['battle_timeline']
        p1_lead = p1_team[0]

        # --- Ensure Pokémon stats exist ---
        def fill_missing_stats(pokemon):
            name = pokemon.get('name', '').lower()
            if not name:
                return pokemon
            if any(k not in pokemon for k in ['base_hp', 'base_atk', 'base_def', 'base_spa', 'base_spd', 'base_spe']):
                base_info = pokemon_base_stats_nested.get(name)
                if base_info:
                    for stat in ['base_hp', 'base_atk', 'base_def', 'base_spa', 'base_spd', 'base_spe']:
                        pokemon[stat] = pokemon.get(stat, base_info[stat]['value'])
                else:
                    # Unknown Pokémon, use neutral defaults
                    pokemon.update({s: 80 for s in ['base_hp','base_atk','base_def','base_spa','base_spd','base_spe']})
            return pokemon

        p1_lead = fill_missing_stats(p1_lead)
        p2_lead = fill_missing_stats(p2_lead)
        for mon in p1_team:
            fill_missing_stats(mon)

        # --- Basic numeric features ---
        feat_lead_speed_diff = p1_lead['base_spe'] - p2_lead['base_spe']
        feat_end_boost_diff = 0
        feat_num_turns = 0
        feat_total_damage_dealt = 0
        feat_total_healing_done = 0
        feat_status_turns = 0
        feat_first_faint_turn = 0
        feat_damage_diff_turn25 = 0
        feat_damage_diff_turn30 = 0

        # --- Track seen Pokémon ---
        p1_seen_status = {p['name']: {'hp_pct': 100, 'status': None} for p in p1_team}
        p2_seen_status = {p2_lead['name']: {'hp_pct': 100, 'status': None}}

        # --- Timeline analysis ---
        if timeline:
            feat_num_turns = timeline[-1].get('turn', 0)
            for turn in timeline:
                p1_state = turn.get('p1_pokemon_state', {})
                p2_state = turn.get('p2_pokemon_state', {})
                turn_num = turn.get('turn', 0)

                # Update player 1 status
                if p1_state and p1_state.get('name'):
                    name = p1_state['name']
                    p1_seen_status.setdefault(name, {'hp_pct': 100, 'status': None})
                    prev_hp = p1_seen_status[name]['hp_pct']
                    p1_seen_status[name]['hp_pct'] = p1_state.get('hp_pct', prev_hp)
                    p1_seen_status[name]['status'] = p1_state.get('status', p1_seen_status[name]['status'])

                # Update player 2 status
                if p2_state and p2_state.get('name'):
                    name = p2_state['name']
                    p2_seen_status.setdefault(name, {'hp_pct': 100, 'status': None})
                    prev_hp = p2_seen_status[name]['hp_pct']
                    p2_seen_status[name]['hp_pct'] = p2_state.get('hp_pct', prev_hp)
                    p2_seen_status[name]['status'] = p2_state.get('status', p2_seen_status[name]['status'])

                # Track total damage / healing
                if p2_state.get('hp_pct') is not None:
                    feat_total_damage_dealt += max(0, 100 - p2_state['hp_pct'])
                if p1_state.get('hp_pct') is not None:
                    feat_total_healing_done += max(0, p1_state['hp_pct'] - 100)

                # Count status turns
                for s in p1_seen_status.values():
                    if s['status'] is not None:
                        feat_status_turns += 1
                for s in p2_seen_status.values():
                    if s['status'] is not None:
                        feat_status_turns += 1

                # First faint turn
                if not feat_first_faint_turn:
                    p1_faints = any(s['hp_pct'] <= 0 for s in p1_seen_status.values())
                    p2_faints = any(s['hp_pct'] <= 0 for s in p2_seen_status.values())
                    if p1_faints or p2_faints:
                        feat_first_faint_turn = turn_num

                # HP difference snapshot at turn 25
                if turn_num == 25 or (turn_num == feat_num_turns and turn_num < 25):
                    total_hp_p1 = sum(p['hp_pct'] for p in p1_seen_status.values())
                    total_hp_p2 = sum(p['hp_pct'] for p in p2_seen_status.values())
                    feat_damage_diff_turn25 = (600 - total_hp_p1) - (600 - total_hp_p2)

                # HP difference snapshot at turn 30
                if turn_num == 30 or (turn_num == feat_num_turns and turn_num < 30):
                    total_hp_p1 = sum(p['hp_pct'] for p in p1_seen_status.values())
                    total_hp_p2 = sum(p['hp_pct'] for p in p2_seen_status.values())
                    feat_damage_diff_turn30 = (600 - total_hp_p1) - (600 - total_hp_p2)

            # --- End-of-battle boosts ---
            last_turn = timeline[-1]
            p1_state_last = last_turn.get('p1_pokemon_state', {})
            p2_state_last = last_turn.get('p2_pokemon_state', {})
            p1_boosts = sum(p1_state_last.get('boosts', {}).values()) if p1_state_last else 0
            p2_boosts = sum(p2_state_last.get('boosts', {}).values()) if p2_state_last else 0
            feat_end_boost_diff = p1_boosts - p2_boosts

        # --- Derived battle features ---
        p1_total_hp_seen = sum(p['hp_pct'] for p in p1_seen_status.values())
        p2_total_hp_seen = sum(p['hp_pct'] for p in p2_seen_status.values())
        feat_hp_advantage_seen = p1_total_hp_seen - p2_total_hp_seen
        feat_mons_revealed_diff = len(p2_seen_status) - len(p1_seen_status)
        p1_team_status_count = sum(1 for p in p1_seen_status.values() if p['status'] is not None)
        p2_team_status_count = sum(1 for p in p2_seen_status.values() if p['status'] is not None)
        feat_team_status_diff = p1_team_status_count - p2_team_status_count

        # --- Type advantage, meta, and setup features ---
        p1_lead_types = p1_lead.get('types', [])
        p2_lead_types = p2_lead.get('types', [])
        feat_lead_type_adv = np.prod([get_type_effectiveness(t1, p2_lead_types) for t1 in p1_lead_types]) if p1_lead_types else 1.0
        feat_p1_meta_count = sum(1 for p in p1_team if p['name'] in META_THREATS_GEN1)
        feat_p2_meta_count = 1 if p2_lead['name'] in META_THREATS_GEN1 else 0
        feat_meta_diff = feat_p1_meta_count - feat_p2_meta_count

        p1_moves = p1_lead.get('moves', [])
        p2_moves = p2_lead.get('moves', [])
        feat_p1_status_setup_count = sum(1 for m in p1_moves if m in STATUS_MOVES or m in SETUP_MOVES)
        feat_p2_status_setup_count = sum(1 for m in p2_moves if m in STATUS_MOVES or m in SETUP_MOVES)
        feat_status_setup_diff = feat_p1_status_setup_count - feat_p2_status_setup_count

        p1_seen_set, p2_seen_set = get_pokemons_seen_in_battle(timeline)
        p1_seen_encoded = encode_pokemon_set(p1_seen_set)
        p2_seen_encoded = encode_pokemon_set(p2_seen_set)

        # --- NEW: Lead embeddings ---
        p1_lead_embedding = pokemon_embeddings.get(p1_lead['name'], np.zeros(embedding_dim))
        p2_lead_embedding = pokemon_embeddings.get(p2_lead['name'], np.zeros(embedding_dim))

        feat_total_stats_diff = sum(
            p1_lead.get(stat, 0) - p2_lead.get(stat, 0)
            for stat in ['base_hp', 'base_atk', 'base_def', 'base_spe', 'base_spa', 'base_spd']
        )

        # --- Combine all features ---
        feature_dict = {
            'battle_id': row['battle_id'],
            'lead_speed_diff': feat_lead_speed_diff,
            'hp_advantage_seen': feat_hp_advantage_seen,
            'mons_revealed_diff': feat_mons_revealed_diff,
            'team_status_diff': feat_team_status_diff,
            'end_boost_diff': feat_end_boost_diff,
            'total_damage_dealt': feat_total_damage_dealt,
            'total_healing_done': feat_total_healing_done,
            'status_turns': feat_status_turns,
            'first_faint_turn': feat_first_faint_turn,
            'lead_type_adv': feat_lead_type_adv,
            'meta_diff': feat_meta_diff,
            'status_setup_diff': feat_status_setup_diff,
            'p1_seen_pokemons': p1_seen_encoded,
            'p2_seen_pokemons': p2_seen_encoded,
            'total_stats_diff': feat_total_stats_diff,
            'damage_diff_turn25': feat_damage_diff_turn25,
            'damage_diff_turn30': feat_damage_diff_turn30
        }

        # Add embeddings as separate numeric features
        for stat_name, val in zip(['hp','atk','def','spa','spd','spe'], p1_lead_embedding):
            feature_dict[f'p1_lead_{stat_name}'] = val
        for stat_name, val in zip(['hp','atk','def','spa','spd','spe'], p2_lead_embedding):
            feature_dict[f'p2_lead_{stat_name}'] = val

        processed_data.append(feature_dict)

    return pd.DataFrame(processed_data).set_index('battle_id')


In [104]:
# Subset already created
fraction_to_use = 1.0
train_df_subset = train_df.sample(frac=fraction_to_use, random_state=42).reset_index(drop=True)
test_df_subset = test_df.sample(frac=fraction_to_use, random_state=42).reset_index(drop=True)

# Feature engineering
X_train_features = create_advanced_features_gen2(train_df_subset)
X_test_features = create_advanced_features_gen2(test_df_subset)


# Target
y_train = train_df_subset.set_index('battle_id')['player_won']
y_test = test_df_subset.set_index('battle_id')

# Train/validation split on the subset ONLY
from sklearn.model_selection import train_test_split

X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train_features,  # must match y_train
    y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train
)

print("Shapes:")
print(X_train_split.shape, X_val_split.shape, y_train_split.shape, y_val_split.shape)


Generating advanced features:   0%|          | 0/10000 [00:00<?, ?it/s]

Generating advanced features:   0%|          | 0/5000 [00:00<?, ?it/s]

Shapes:
(8000, 29) (2000, 29) (8000,) (2000,)


## Train/grid search

In [144]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier


# --- 1. Expand multi-hot Pokémon columns ---
def expand_seen_pokemon_features(df, prefix='p1_seen_pokemons'):
    """Expands multi-hot Pokémon vectors into separate numeric columns."""
    if prefix not in df.columns:
        return df
    num_pokemon = len(df[prefix].iloc[0])
    expanded = pd.DataFrame(
        df[prefix].tolist(),
        index=df.index,
        columns=[f"{prefix}_{i}" for i in range(num_pokemon)]
    )
    df = df.drop(columns=[prefix])
    return pd.concat([df, expanded], axis=1)


for col_prefix in ['p1_seen_pokemons', 'p2_seen_pokemons']:
    X_train_split = expand_seen_pokemon_features(X_train_split, col_prefix)
    X_val_split   = expand_seen_pokemon_features(X_val_split, col_prefix)


# --- 2. Identify numeric columns ---
pokemon_columns = [
    c for c in X_train_split.columns
    if c.startswith('p1_seen_pokemons_') or c.startswith('p2_seen_pokemons_')
]

lead_embedding_columns = [
    'p1_lead_hp','p1_lead_atk','p1_lead_def','p1_lead_spa','p1_lead_spd','p1_lead_spe',
    'p2_lead_hp','p2_lead_atk','p2_lead_def','p2_lead_spa','p2_lead_spd','p2_lead_spe'
]

numeric_features = [
    'lead_speed_diff', 'hp_advantage_seen', 'status_setup_diff', 'total_stats_diff',
    'lead_type_adv', 'damage_diff_turn30', 'damage_diff_turn25', 'total_damage_dealt',
    'total_healing_done', 'status_turns', 'first_faint_turn', 'team_status_diff',
    'end_boost_diff', 'meta_diff'
] + pokemon_columns + lead_embedding_columns

# --- 3. Define numeric transformer ---
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features)])


# --- 4. Define models and param grids ---
models = {
    'LogisticRegression': LogisticRegression(max_iter=5000, random_state=42),
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    'XGBoost': XGBClassifier(n_estimators=150, use_label_encoder=False, eval_metric='logloss', random_state=42),
    'LightGBM': LGBMClassifier(n_estimators=100, random_state=42),
    'CatBoost': CatBoostClassifier(iterations=200, verbose=0, random_state=42, depth=8)
}

param_grids = {
    'LogisticRegression': {'classifier__C': [0.01, 0.1, 1, 10]},
    'RandomForest': {
        'classifier__max_depth': [None, 5, 10],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__n_estimators': [100, 200]
    },
    'XGBoost': {
        'classifier__max_depth': [3, 5, 7],
        'classifier__learning_rate': [0.01, 0.1, 0.2],
        'classifier__subsample': [0.7, 0.85, 1.0],
        'classifier__colsample_bytree': [0.7, 0.85, 1.0]
    },
    'LightGBM': {
        'classifier__max_depth': [5, 10, -1],
        'classifier__num_leaves': [31, 50, 100],
        'classifier__learning_rate': [0.01, 0.05, 0.1],
        'classifier__n_estimators': [100, 200]
    },
    'CatBoost': {
        'classifier__depth': [6, 8, 10],
        'classifier__learning_rate': [0.01, 0.05, 0.1],
        'classifier__iterations': [200, 400],
        'classifier__l2_leaf_reg': [3, 5, 7]
    }
}



# --- 5. Run pipeline with GridSearchCV ---
results = {}

for name, model in models.items():
    print(f"\n--- Testing {name} ---")
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])

    if name in param_grids:
        grid = GridSearchCV(pipeline, param_grids[name], cv=5, scoring='accuracy', n_jobs=-1, verbose=0)
        grid.fit(X_train_split, y_train_split)
        best_model = grid.best_estimator_
        val_pred = best_model.predict(X_val_split)
        acc = accuracy_score(y_val_split, val_pred)
        print(f"Best Params: {grid.best_params_}")
        print(f"Validation Accuracy: {acc:.4f}")
        results[name] = {'model': best_model, 'val_accuracy': acc}
    else:
        pipeline.fit(X_train_split, y_train_split)
        val_pred = pipeline.predict(X_val_split)
        acc = accuracy_score(y_val_split, val_pred)
        print(f"Validation Accuracy: {acc:.4f}")
        results[name] = {'model': pipeline, 'val_accuracy': acc}



--- Testing LogisticRegression ---
Best Params: {'classifier__C': 0.01}
Validation Accuracy: 0.7620

--- Testing RandomForest ---
Best Params: {'classifier__max_depth': None, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200}
Validation Accuracy: 0.8190

--- Testing XGBoost ---


Parameters: { "use_label_encoder" } are not used.



Best Params: {'classifier__colsample_bytree': 0.85, 'classifier__learning_rate': 0.2, 'classifier__max_depth': 3, 'classifier__subsample': 1.0}
Validation Accuracy: 0.8350

--- Testing LightGBM ---
[LightGBM] [Info] Number of positive: 4000, number of negative: 4000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001439 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1637
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 62
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Best Params: {'classifier__learning_rate': 0.1, 'classifier__max_depth': 5, 'classifier__n_estimators': 100, 'classifier__num_leaves': 31}
Validation Accuracy: 0.8320

--- Testing CatBoost ---




Best Params: {'classifier__depth': 8, 'classifier__iterations': 400, 'classifier__l2_leaf_reg': 7, 'classifier__learning_rate': 0.05}
Validation Accuracy: 0.8335


In [128]:
import optuna
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# --- 1. Scale numeric features ---
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_split[numeric_features])
X_val_scaled = scaler.transform(X_val_split[numeric_features])

# --- 2. Optuna objective for XGBoost ---
def xgb_objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 7),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'n_estimators': 300,
        'random_state': 42,
        'use_label_encoder': False,
        'eval_metric': 'logloss'
    }

    model = XGBClassifier(**params)
    model.fit(
        X_train_scaled, y_train_split,
        eval_set=[(X_val_scaled, y_val_split)],
        verbose=False
    )

    val_pred = model.predict(X_val_scaled)
    return accuracy_score(y_val_split, val_pred)

# --- 3. Run Optuna study ---
study = optuna.create_study(direction='maximize')
study.optimize(xgb_objective, n_trials=20)

best_xgb_params = study.best_params
best_xgb = XGBClassifier(**best_xgb_params, n_estimators=300, 
                         random_state=42, use_label_encoder=False, eval_metric='logloss')
best_xgb.fit(X_train_scaled, y_train_split,
             eval_set=[(X_val_scaled, y_val_split)],
             verbose=False)

print("Best XGBoost Accuracy:", accuracy_score(y_val_split, best_xgb.predict(X_val_scaled)))

# --- 4. Prepare other top models ---
rf = RandomForestClassifier(n_estimators=100, max_depth=None, min_samples_split=5, random_state=42, n_jobs=-1)
lgbm = LGBMClassifier(n_estimators=100, random_state=42)
cat = CatBoostClassifier(iterations=200, depth=8, verbose=0, random_state=42)

# Fit non-XGB models on scaled data
rf.fit(X_train_scaled, y_train_split)
lgbm.fit(X_train_scaled, y_train_split)
cat.fit(X_train_scaled, y_train_split)

# --- 5. Create stacking ensemble ---
estimators = [
    ('rf', rf),
    ('lgbm', lgbm),
    ('cat', cat),
    ('xgb', best_xgb)
]

stacking_model = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(max_iter=2000),
    n_jobs=-1
)

stacking_model.fit(X_train_scaled, y_train_split)
stack_val_pred = stacking_model.predict(X_val_scaled)
print("Stacked Model Validation Accuracy:", accuracy_score(y_val_split, stack_val_pred))


[I 2025-11-04 16:38:18,178] A new study created in memory with name: no-name-9dce1f08-95bf-440b-a97f-1051e4a96098
Parameters: { "use_label_encoder" } are not used.

[I 2025-11-04 16:38:18,964] Trial 0 finished with value: 0.824 and parameters: {'max_depth': 7, 'learning_rate': 0.09539776156507826, 'subsample': 0.9361578333842957, 'colsample_bytree': 0.9150019326239296}. Best is trial 0 with value: 0.824.
Parameters: { "use_label_encoder" } are not used.

[I 2025-11-04 16:38:19,550] Trial 1 finished with value: 0.8225 and parameters: {'max_depth': 4, 'learning_rate': 0.0136791451352636, 'subsample': 0.9831439588801081, 'colsample_bytree': 0.6390352590217884}. Best is trial 0 with value: 0.824.
Parameters: { "use_label_encoder" } are not used.

[I 2025-11-04 16:38:20,158] Trial 2 finished with value: 0.8335 and parameters: {'max_depth': 5, 'learning_rate': 0.029914219758641336, 'subsample': 0.8352371141170586, 'colsample_bytree': 0.8350462840637206}. Best is trial 2 with value: 0.8335.
P

Best XGBoost Accuracy: 0.8355
[LightGBM] [Info] Number of positive: 4000, number of negative: 4000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001021 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1637
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 62
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Stacked Model Validation Accuracy: 0.8335




In [129]:
# --- Add best XGBoost model to results ---
results['XGBoost_Optuna'] = {
    'model': best_xgb,
    'val_accuracy': accuracy_score(y_val_split, best_xgb.predict(X_val_scaled))
}

# --- Add stacking model to results ---
results['StackingModel'] = {
    'model': stacking_model,
    'val_accuracy': accuracy_score(y_val_split, stack_val_pred)
}

# Optional: print updated results summary
for name, res in results.items():
    print(f"{name}: Validation Accuracy = {res['val_accuracy']:.4f}")


LogisticRegression: Validation Accuracy = 0.7620
RandomForest: Validation Accuracy = 0.8165
XGBoost: Validation Accuracy = 0.8290
LightGBM: Validation Accuracy = 0.8295
CatBoost: Validation Accuracy = 0.8285
XGBoost_Optuna: Validation Accuracy = 0.8355
StackingModel: Validation Accuracy = 0.8335


In [147]:


# --- 6. Build stacking ensemble of tree-based models ---
tree_model_names = ['RandomForest', 'XGBoost', 'LightGBM', 'CatBoost']
forest_estimators = [(name, results[name]['model']) for name in tree_model_names if name in results]

stacking_model = StackingClassifier(
    estimators=forest_estimators,
    final_estimator=LogisticRegression(max_iter=4000, random_state=42),
    n_jobs=-1,
    passthrough=True
)

stacking_model.fit(X_train_split, y_train_split)
stack_val_pred = stacking_model.predict(X_val_split)
stack_acc = accuracy_score(y_val_split, stack_val_pred)
print(f"\nForest Ensemble Validation Accuracy: {stack_acc:.4f}")

results['ForestEnsemble'] = {'model': stacking_model, 'val_accuracy': stack_acc}



Forest Ensemble Validation Accuracy: 0.8315


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## some tests

In [122]:
# List feature columns in training and test sets
train_cols = list(X_train_scaled.shape[1] if isinstance(X_train_scaled, pd.DataFrame) else X_train_split.columns)
test_cols = list(X_test_features.columns)

print(f"Number of training features: {len(train_cols)}")
print(f"Number of test features: {len(test_cols)}\n")

# Find columns present in test but not in train
extra_in_test = set(test_cols) - set(train_cols)
if extra_in_test:
    print("Columns in test but not in train:")
    for col in extra_in_test:
        print("  ", col)
else:
    print("No extra columns in test set.")

# Find columns present in train but missing in test
missing_in_test = set(train_cols) - set(test_cols)
if missing_in_test:
    print("\nColumns in train but missing in test:")
    for col in missing_in_test:
        print("  ", col)
else:
    print("\nNo missing columns in test set.")


Number of training features: 67
Number of test features: 67

No extra columns in test set.

No missing columns in test set.


In [77]:
best_xgb = results['CatBoost']['model']
importances = best_xgb.named_steps['classifier'].feature_importances_
feature_names = best_xgb.named_steps['preprocessor'].get_feature_names_out()
feat_importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
feat_importance_df.sort_values(by='importance', ascending=False).head(20)


Unnamed: 0,feature,importance
52,num__p2_seen_pokemons_19,7.850602
11,num__mean_damage_per_turn_diff,7.159298
6,num__damage_diff_turn30,6.891976
1,num__hp_advantage_seen,6.771579
12,num__final_hp_advantage,5.352119
10,num__damage_diff_total,4.537775
8,num__lead_survival_diff,4.215665
5,num__damage_diff_turn25,4.059536
38,num__p2_seen_pokemons_5,3.498687
35,num__p2_seen_pokemons_2,3.380534


In [175]:
best_xgb = results['XGBoost']['model']
importances = best_xgb.named_steps['classifier'].feature_importances_
feature_names = best_xgb.named_steps['preprocessor'].get_feature_names_out()
feat_importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
feat_importance_df.sort_values(by='importance', ascending=False).head(20)


Unnamed: 0,feature,importance
45,num__p2_seen_pokemons_19,0.093362
1,num__hp_advantage_seen,0.076756
66,cat__p2_lead_name_1,0.027961
77,cat__p2_lead_name_12,0.027311
31,num__p2_seen_pokemons_5,0.02648
35,num__p2_seen_pokemons_9,0.025512
44,num__p2_seen_pokemons_18,0.025248
41,num__p2_seen_pokemons_15,0.023901
28,num__p2_seen_pokemons_2,0.022457
21,num__p1_seen_pokemons_15,0.01973


In [None]:
For gen1:

--- Testing LogisticRegression ---
Best Params: {'classifier__C': 0.1}
Validation Accuracy: 0.7855

--- Testing RandomForest ---
Best Params: {'classifier__max_depth': None, 'classifier__min_samples_split': 5}
Validation Accuracy: 0.7335

--- Testing XGBoost ---
Best Params: {'classifier__learning_rate': 0.2, 'classifier__max_depth': 3}
Validation Accuracy: 0.7655

## create submission file

In [138]:
# --- 6. Predict on the full test set using the best model ---
# Expand Pokémon features in the full test set
for col_prefix in ['p1_seen_pokemons', 'p2_seen_pokemons']:
    X_test_features = expand_seen_pokemon_features(X_test_features, col_prefix)

# --- Align test features to training features ---
train_cols = X_train_split.columns.tolist()
for col in train_cols:
    if col not in X_test_features.columns:
        X_test_features[col] = 0
X_test_features = X_test_features[train_cols]

# --- Scale numeric features using the training scaler ---
#X_test_scaled = scaler.transform(X_test_features[numeric_features])

# Select best model
best_model_name = max(results, key=lambda k: results[k]['val_accuracy'])
best_model = results['CatBoost']['model']

# Predict
xgb_test_pred = best_model.predict(X_test_features)
xgb_test_pred_int = xgb_test_pred.astype(int)

# Save battle IDs
test_battle_ids = test_df_subset['battle_id']

# --- Create submission DataFrame ---
submission_xgb = pd.DataFrame({
    'battle_id': test_battle_ids,
    'player_won': xgb_test_pred_int
})

# Save CSV
submission_xgb.to_csv('submission.csv', index=False)

print("\n-------------------------------------------------")
print(f"File 'submission.csv' creato con successo usando {best_model_name}!")
print("Ora conterrà 1 e 0, e colonne separate.")
print("-------------------------------------------------")

submission_xgb.head()



-------------------------------------------------
File 'submission.csv' creato con successo usando LightGBM!
Ora conterrà 1 e 0, e colonne separate.
-------------------------------------------------


Unnamed: 0,battle_id,player_won
0,1501,1
1,2586,0
2,2653,0
3,1055,1
4,705,1
