****Caricamento test.jsonl e train.jsonl****

In [1]:
import pandas as pd
import numpy as np
import json
import warnings
from tqdm.auto import tqdm

# --- 1. Import di Scikit-learn ---
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression   
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.exceptions import ConvergenceWarning
from sklearn.feature_selection import SelectKBest, f_classif


warnings.filterwarnings('ignore')  # Suppress all warnings

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
try:
    train_df = pd.read_json('kaggle/input/fds-pokemon-battles-prediction-2025/train.jsonl', lines=True)
    test_df = pd.read_json('kaggle/input/fds-pokemon-battles-prediction-2025/test.jsonl', lines=True)
    
    # Convertiamo subito il target in 1 (Vittoria) e 0 (Sconfitta)
    if 'player_won' in train_df.columns:
        train_df['player_won'] = train_df['player_won'].astype(int)
    print("Caricamento riuscito!")
except Exception as e:
    print(f"Errore nel caricamento dati: {e}")

#####
riga_da_rimuovere = 4877

# Usiamo un controllo per sicurezza, nel caso la riga non esista
if riga_da_rimuovere in train_df.index:
    train_df = train_df.drop(riga_da_rimuovere)
    print(f"Riga {riga_da_rimuovere} rimossa con successo.")
else:
    print(f"Riga {riga_da_rimuovere} non trovata (forse già rimossa o non presente).")

filtro_livello_100 = train_df['p1_team_details'].apply(
    lambda team_list: all(pokemon.get('level') == 100 for pokemon in team_list)
)

train_df = train_df[filtro_livello_100]

Caricamento riuscito!
Riga 4877 rimossa con successo.


In [3]:
# Mappa di efficacia dei tipi per la Generazione 1
# Nota: 'Special' in Gen 1 copre sia Atk Sp. che Def Sp.
# Non ci sono tipi Dark, Steel, o Fairy.
# Source: https://pokemondb.net/type
TYPE_CHART_GEN1 = {
    'NORMAL': {'ROCK': 0.5, 'GHOST': 0.0},
    'FIRE': {'FIRE': 0.5, 'WATER': 0.5, 'GRASS': 2.0, 'ICE': 2.0, 'BUG': 2.0, 'ROCK': 0.5},
    'WATER': {'FIRE': 2.0, 'WATER': 0.5, 'GRASS': 0.5, 'GROUND': 2.0, 'ROCK': 2.0, 'DRAGON': 0.5},
    'ELECTRIC': {'WATER': 2.0, 'ELECTRIC': 0.5, 'GRASS': 0.5, 'GROUND': 0.0, 'FLYING': 2.0, 'DRAGON': 0.5},
    'GRASS': {'FIRE': 0.5, 'WATER': 2.0, 'ELECTRIC': 1.0, 'GRASS': 0.5, 'POISON': 0.5, 'GROUND': 2.0, 'FLYING': 0.5, 'BUG': 0.5, 'ROCK': 2.0, 'DRAGON': 0.5},
    'ICE': {'WATER': 0.5, 'GRASS': 2.0, 'ICE': 0.5, 'GROUND': 2.0, 'FLYING': 2.0, 'DRAGON': 2.0},
    'FIGHTING': {'NORMAL': 2.0, 'POISON': 0.5, 'FLYING': 0.5, 'PSYCHIC': 0.5, 'BUG': 0.5, 'ROCK': 2.0, 'GHOST': 0.0},
    'POISON': {'GRASS': 2.0, 'POISON': 0.5, 'GROUND': 0.5, 'BUG': 2.0, 'ROCK': 0.5, 'GHOST': 0.5},
    'GROUND': {'FIRE': 2.0, 'ELECTRIC': 2.0, 'GRASS': 0.5, 'POISON': 2.0, 'FLYING': 0.0, 'BUG': 0.5, 'ROCK': 2.0},
    'FLYING': {'ELECTRIC': 0.5, 'GRASS': 2.0, 'FIGHTING': 2.0, 'BUG': 2.0, 'ROCK': 0.5},
    'PSYCHIC': {'FIGHTING': 2.0, 'POISON': 2.0, 'PSYCHIC': 0.5, 'GHOST': 1.0}, # In Gen 1, Psychic era immune a Ghost per un bug, ma i dati Showdown potrebbero averlo corretto. Assumiamo 1.0 per sicurezza, o 0.0 se il bug è emulato. Qui usiamo 1.0.
    'BUG': {'FIRE': 0.5, 'GRASS': 2.0, 'FIGHTING': 0.5, 'POISON': 2.0, 'FLYING': 0.5, 'PSYCHIC': 2.0},
    'ROCK': {'FIRE': 2.0, 'ICE': 2.0, 'FIGHTING': 0.5, 'GROUND': 0.5, 'FLYING': 2.0, 'BUG': 2.0},
    'GHOST': {'NORMAL': 0.0, 'PSYCHIC': 0.0, 'GHOST': 2.0}, # Famoso bug: Lick (Ghost) non colpisce Psychic.
    'DRAGON': {'DRAGON': 2.0},
}

# Funzione helper per calcolare l'efficacia
def get_type_effectiveness(move_type, target_types):
    if move_type not in TYPE_CHART_GEN1:
        return 1.0
    
    multiplier = 1.0
    chart_for_move = TYPE_CHART_GEN1[move_type]
    
    for target_type in target_types:
        if target_type in chart_for_move:
            multiplier *= chart_for_move[target_type]
            
    return multiplier

# Pokémon dominanti nel metagame Gen 1 OU (S-Tier e A-Tier)
# La loro presenza è un segnale fortissimo.
META_THREATS_GEN1 = {
    'Snorlax', 'Tauros', 'Chansey', 'Alakazam', 'Starmie', 'Exeggutor', 
    'Zapdos', 'Jolteon', 'Rhydon', 'Golem', 'Lapras'
}

# Mosse di setup o status chiave
STATUS_MOVES = {'Thunder Wave', 'Sleep Powder', 'Sing', 'Toxic', 'Lovely Kiss', 'Spore', 'Stun Spore', 'Glare'}
SETUP_MOVES = {'Amnesia', 'Swords Dance', 'Agility', 'Growth'}

def get_best_stab_advantage(attacker_types, defender_types):
    """
    Calculates the best possible STAB multiplier an attacker has against a defender.
    """
    # Clean 'notype' from lists
    attacker_types = [t for t in attacker_types if t.upper() != 'NOTYPE']
    defender_types = [t for t in defender_types if t.upper() != 'NOTYPE']

    if not attacker_types:
        return 1.0 # No types, no STAB advantage

    best_multiplier = 0.0
    
    for move_type in attacker_types:
        # Get the multiplier for this STAB type against the defender's types
        multiplier = get_type_effectiveness(move_type.upper(), [t.upper() for t in defender_types])
        
        # We're looking for the *best* STAB move
        if multiplier > best_multiplier:
            best_multiplier = multiplier
            
    # If no STAB move is effective (e.g., Normal vs. Ghost), multiplier is 0.
    # Otherwise, it's the best one we found.
    # If best_multiplier is 0, we should return 0, not 1.
    if best_multiplier == 0.0:
        # Check if any type was effective at all, even if not > 0
        # This handles neutral hits (1.0)
        is_neutral = any(get_type_effectiveness(t.upper(), [d.upper() for d in defender_types]) >= 1.0 for t in attacker_types)
        if is_neutral:
             return 1.0
        
    return best_multiplier

### Ranking of most important stats in a pokemon battle.
- Pokemon type
- Speed stat
- Moveset
    - STAB attack
    - Coverage
    - Utility/Status
- HP, Attack, Defense, Special Attack, and Special Defense

In [4]:
def create_advanced_features(df):
    processed_data = []
    for _, row in tqdm(df.iterrows(), total=df.shape[0], desc="Creazione features"):
        p1_team = row['p1_team_details']
        p2_lead = row['p2_lead_details']
        timeline = row['battle_timeline']
        p1_lead = p1_team[0]
        
        # --- Static Features ---
        feat_lead_speed_diff = p1_lead['base_spe'] - p2_lead['base_spe']
        feat_lead_hp_diff = p1_lead['base_hp'] - p2_lead['base_hp']
        feat_lead_atk_diff = p1_lead['base_atk'] - p2_lead['base_atk']
        feat_lead_def_diff = p1_lead['base_def'] - p2_lead['base_def']
        feat_lead_spa_diff = p1_lead['base_spa'] - p2_lead['base_spa']
        feat_lead_spd_diff = p1_lead['base_spd'] - p2_lead['base_spd']
        
        # --- Lead Type Advantage ---
        p1_lead_types = [t for t in p1_lead['types'] if t != 'notype']
        p2_lead_types = [t for t in p2_lead['types'] if t != 'notype']
        
        p1_best_adv = get_best_stab_advantage(p1_lead_types, p2_lead_types)
        p2_best_adv = get_best_stab_advantage(p2_lead_types, p1_lead_types)
        
        # A positive number means P1's STABs are more effective
        feat_lead_type_adv_diff = p1_best_adv - p2_best_adv

        # Statistiche aggregate P1
        feat_p1_team_avg_atk = np.mean([p['base_atk'] for p in p1_team])
        feat_p1_team_avg_spe = np.mean([p['base_spe'] for p in p1_team])
        feat_p1_team_max_hp = np.max([p['base_hp'] for p in p1_team])
        
        # Using 'base_spa' as it's identical to 'base_spd' in Gen 1 data
        feat_p1_team_avg_special = np.mean([p['base_spa'] for p in p1_team])
        
        # Meta threats
        feat_p1_team_meta_count = sum(1 for p in p1_team if p['name'].title() in META_THREATS_GEN1)
        feat_p2_lead_is_meta = 1 if p2_lead['name'].title() in META_THREATS_GEN1 else 0
        
        p2_lead_speed = p2_lead['base_spe']
        feat_team_speed_adv_vs_lead = sum(1 for p in p1_team if p['base_spe'] > p2_lead_speed)

        # --- Dynamic Features (Timeline) ---
        p1_seen_status = {p['name']: {'hp_pct': 100, 'status': None} for p in p1_team}
        p2_seen_status = {p2_lead['name']: {'hp_pct': 100, 'status': None}}
        
        feat_end_boost_diff = 0
        p1_status_moves = 0
        p1_setup_moves = 0
        p2_status_moves = 0
        p2_setup_moves = 0
        
        p1_total_bp = 0
        p2_total_bp = 0
        p1_confused_turns = 0
        p2_confused_turns = 0
        p1_active_hp_end = 100 
        p2_active_hp_end = 100 
        
        last_turn_num = 0 

        if timeline:
            last_turn_num = timeline[-1].get('turn', 0)
            for turn in timeline:
                p1_state = turn.get('p1_pokemon_state')
                if p1_state and p1_state.get('name'):
                    p1_name = p1_state['name']
                    p1_seen_status.setdefault(p1_name, {'hp_pct': 100, 'status': None})
                    p1_seen_status[p1_name]['hp_pct'] = p1_state.get('hp_pct', p1_seen_status[p1_name]['hp_pct'])
                    p1_seen_status[p1_name]['status'] = p1_state.get('status', p1_seen_status[p1_name]['status'])
                    
                    if 'confusion' in p1_state.get('volatile_effects', []):
                        p1_confused_turns += 1
                    
                p2_state = turn.get('p2_pokemon_state')
                if p2_state and p2_state.get('name'):
                    p2_name = p2_state['name']
                    p2_seen_status.setdefault(p2_name, {'hp_pct': 100, 'status': None})
                    p2_seen_status[p2_name]['hp_pct'] = p2_state.get('hp_pct', p2_seen_status[p2_name]['hp_pct'])
                    p2_seen_status[p2_name]['status'] = p2_state.get('status', p2_seen_status[p2_name]['status'])
                    
                    if 'confusion' in p2_state.get('volatile_effects', []):
                        p2_confused_turns += 1

                p1_move = turn.get('p1_move_details')
                if p1_move:
                    move_name_p1 = p1_move.get('name', '').title()
                    if move_name_p1 in STATUS_MOVES: p1_status_moves += 1
                    if move_name_p1 in SETUP_MOVES: p1_setup_moves += 1
                    if p1_move.get('base_power'):
                        p1_total_bp += p1_move['base_power']
                    
                p2_move = turn.get('p2_move_details')
                if p2_move:
                    move_name_p2 = p2_move.get('name', '').title()
                    if move_name_p2 in STATUS_MOVES: p2_status_moves += 1
                    if move_name_p2 in SETUP_MOVES: p2_setup_moves += 1
                    if p2_move.get('base_power'):
                        p2_total_bp += p2_move['base_power']

                if turn.get('turn') == last_turn_num:
                    p1_boosts = sum(p1_state.get('boosts', {}).values()) if p1_state else 0
                    p2_boosts = sum(p2_state.get('boosts', {}).values()) if p2_state else 0
                    feat_end_boost_diff = p1_boosts - p2_boosts
                    
                    p1_active_hp_end = p1_state.get('hp_pct', 0) if p1_state else 0
                    p2_active_hp_end = p2_state.get('hp_pct', 0) if p2_state else 0

        # Calcoli finali
        p1_total_hp_seen = sum(p['hp_pct'] for p in p1_seen_status.values())
        p2_total_hp_seen = sum(p['hp_pct'] for p in p2_seen_status.values())
        feat_hp_advantage_seen = p1_total_hp_seen - p2_total_hp_seen
        
        feat_mons_revealed_diff = len(p2_seen_status) - len(p1_seen_status)
        
        p1_team_status_count = sum(1 for p in p1_seen_status.values() if p['status'] is not None)
        p2_team_status_count = sum(1 for p in p2_seen_status.values() if p['status'] is not None)
        feat_team_status_diff = p1_team_status_count - p2_team_status_count

        feat_status_move_diff = p1_status_moves - p2_status_moves
        feat_setup_move_diff = p1_setup_moves - p2_setup_moves
        
        p1_fainted_count = sum(1 for p in p1_seen_status.values() if p['hp_pct'] == 0)
        p2_fainted_count = sum(1 for p in p2_seen_status.values() if p['hp_pct'] == 0)
        feat_fainted_mons_diff = p2_fainted_count - p1_fainted_count 

        feat_hp_advantage_active = p1_active_hp_end - p2_active_hp_end
        
        feat_total_base_power_diff = p1_total_bp - p2_total_bp
        feat_volatile_status_diff = p2_confused_turns - p1_confused_turns 

        processed_data.append({
            'battle_id': row['battle_id'],
            # Categoriche
            'p1_lead_name': p1_lead['name'], 
            'p2_lead_name': p2_lead['name'],
            # Numeriche (Core)
            'lead_speed_diff': feat_lead_speed_diff,
            'hp_advantage_seen': feat_hp_advantage_seen,
            'mons_revealed_diff': feat_mons_revealed_diff,
            'team_status_diff': feat_team_status_diff,
            'end_boost_diff': feat_end_boost_diff,
            # Numeriche (Aggregati Team e Meta)
            'p1_team_avg_atk': feat_p1_team_avg_atk,
            'p1_team_avg_spe': feat_p1_team_avg_spe,
            'p1_team_max_hp': feat_p1_team_max_hp,
            'p1_team_meta_count': feat_p1_team_meta_count,
            'p2_lead_is_meta': feat_p2_lead_is_meta,
            # Numeriche (Aggregati Mosse)
            'status_move_diff': feat_status_move_diff,
            'setup_move_diff': feat_setup_move_diff,
            'total_base_power_diff': feat_total_base_power_diff,
            'volatile_status_diff': feat_volatile_status_diff,
            # Numeriche (Lead Diffs)
            'lead_hp_diff': feat_lead_hp_diff,
            'lead_atk_diff': feat_lead_atk_diff,
            'lead_def_diff': feat_lead_def_diff,
            'lead_spa_diff': feat_lead_spa_diff,
            'lead_spd_diff': feat_lead_spd_diff,
            # Numeriche (Momentum)
            'team_speed_adv_vs_lead': feat_team_speed_adv_vs_lead,
            'fainted_mons_diff': feat_fainted_mons_diff,
            'hp_advantage_active': feat_hp_advantage_active,
            
            # --- ADD NEW FEATURES HERE ---
            'lead_type_adv_diff': feat_lead_type_adv_diff,
            'p1_team_avg_special': feat_p1_team_avg_special
            
        })
    return pd.DataFrame(processed_data).set_index('battle_id')

In [5]:
print("Inizio feature engineering avanzata sul set di training...")
X_train_features = create_advanced_features(train_df)

print("\nInizio feature engineering avanzata sul set di test...")
X_test_features = create_advanced_features(test_df)

# Definiamo la nostra variabile target 'y'
y_train = train_df.set_index('battle_id')['player_won']

# Allineiamo X e y
y_train = y_train.loc[X_train_features.index]

print("\nFeature engineering completato. Esempio di dati trasformati:")
print(X_train_features.head())

Inizio feature engineering avanzata sul set di training...


Creazione features:   0%|          | 0/9996 [00:00<?, ?it/s]

Creazione features: 100%|██████████| 9996/9996 [00:02<00:00, 4605.90it/s]



Inizio feature engineering avanzata sul set di test...


Creazione features: 100%|██████████| 5000/5000 [00:01<00:00, 4545.63it/s]



Feature engineering completato. Esempio di dati trasformati:
          p1_lead_name p2_lead_name  ...  lead_type_adv_diff  p1_team_avg_special
battle_id                            ...                                         
0              starmie      starmie  ...                 0.0           100.000000
1                 jynx     alakazam  ...                 0.5            90.000000
2            exeggutor      chansey  ...                 0.0            90.000000
3               gengar       tauros  ...                 1.0           103.333333
4             alakazam      starmie  ...                -0.5            97.500000

[5 rows x 26 columns]


In [6]:
# --- 1. Definizione delle Feature ---

numeric_features = [
    'lead_speed_diff',
    'hp_advantage_seen',
    'mons_revealed_diff',
    'team_status_diff',
    'end_boost_diff',
    
    'p1_team_avg_atk',
    'p1_team_avg_spe',
    'p1_team_max_hp',
    'p1_team_meta_count',
    'p2_lead_is_meta',
    
    'lead_type_adv_diff',
    'p1_team_avg_special',
    
    'status_move_diff',
    'setup_move_diff',
    'total_base_power_diff',
    'volatile_status_diff',

    'lead_hp_diff',
    'lead_atk_diff',
    'lead_def_diff',
    'lead_spa_diff',
    'lead_spd_diff',

    'team_speed_adv_vs_lead',
    'fainted_mons_diff',
    'hp_advantage_active'
]

categorical_features = ['p1_lead_name', 'p2_lead_name']


# --- 2. Creazione del Preprocessor ---

# Creiamo i trasformatori (StandardScaler e OneHotEncoder)
numeric_transformer = Pipeline(steps=[('scaler', RobustScaler())])
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop'
)

In [7]:

# Dividiamo i dati di training per una validazione locale
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train_features, 
    y_train, 
    test_size=0.2, # 20% per la validazione
    random_state=42,
    stratify=y_train # Mantiene l'equilibrio delle classi
)

print(f"Dimensione Training Split: {X_train_split.shape}")
print(f"Dimensione Validation Split: {X_val_split.shape}")

# 1. Creiamo la pipeline con un modello "di default"
# Usiamo C=1.0 come valore predefinito
baseline_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000, solver='liblinear', C=1.0))
])

# 2. Alleniamo il modello base SUL SOLO SET DI TRAINING SPLIT
print("\nAllenamento del modello baseline...")
baseline_pipeline.fit(X_train_split, y_train_split)

# 3. Valutiamo il modello base SUL SET DI VALIDAZIONE
y_val_pred = baseline_pipeline.predict(X_val_split)
val_accuracy = accuracy_score(y_val_split, y_val_pred)

print(f"\n--- Risultati Modello Baseline ---")
print(f"Accuracy sul Validation Set: {val_accuracy:.4f}")
print("---------------------------------")

Dimensione Training Split: (7996, 26)
Dimensione Validation Split: (2000, 26)

Allenamento del modello baseline...

--- Risultati Modello Baseline ---
Accuracy sul Validation Set: 0.8050
---------------------------------


In [9]:
from sklearn.model_selection import GridSearchCV
print("\nL'ottimizzazione degli iperparametri...")
# 1. Creiamo la pipeline (la stessa di prima, ma senza 'C' definito)
# La pipeline che verrà testata da GridSearchCV
tuning_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('selectkbest', SelectKBest(score_func=f_classif)),  # <-- NUOVA FASE
    ('classifier', LogisticRegression(random_state=42, max_iter=1000, solver='liblinear'))
])

# 2. Definiamo la griglia dei parametri
# Vogliamo testare diversi valori per 'classifier__C'
param_grid = {
    'selectkbest__k': [30, 40, 50], # Quanti features tenere
    'classifier__penalty': ['l1'],
    'classifier__C': [100, 200, 500],       # La nuova grid per C
    'classifier__solver': ['liblinear'] 
}

# 3. Impostiamo GridSearchCV
# cv=5 significa 5-fold cross-validation
# scoring='accuracy' è la nostra metrica
# n_jobs=-1 usa tutti i processori
grid_search = GridSearchCV(
    tuning_pipeline, 
    param_grid, 
    cv=10, 
    scoring='accuracy', 
    n_jobs=-1,
    verbose=1 # Mostra i log
)

# 4. Avviamo la ricerca sull'INTERO set di training
# (GridSearchCV gestirà internamente le divisioni di cross-validation)
grid_search.fit(X_train_features, y_train)

# 5. Analizziamo i risultati
print("\n--- Risultati GridSearchCV ---")
print(f"Migliori parametri trovati: {grid_search.best_params_}")
print(f"Migliore Accuracy (media CV): {grid_search.best_score_:.4f}")
print("------------------------------")

best_index = grid_search.best_index_
# (std_test_score è la colonna che la contiene)
std_dev_del_modello = grid_search.cv_results_['std_test_score'][best_index]

print(f"Accuracy media: {grid_search.best_score_ * 100:.2f}%")
print(f"Deviazione Standard (Stabilità) del modello: {std_dev_del_modello * 100:.2f}%")


L'ottimizzazione degli iperparametri...
Fitting 10 folds for each of 9 candidates, totalling 90 fits

--- Risultati GridSearchCV ---
Migliori parametri trovati: {'classifier__C': 500, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear', 'selectkbest__k': 50}
Migliore Accuracy (media CV): 0.8337
------------------------------
Accuracy media: 83.37%
Deviazione Standard (Stabilità) del modello: 1.34%


In [10]:
# 1. Il nostro modello finale è il 'best_estimator_' trovato da GridSearch
final_model = grid_search.best_estimator_

print("\nGenerazione delle predizioni sul set di test...")
# 2. Usiamo il modello finale per predire sul set di test (X_test_features)
test_predictions = final_model.predict(X_test_features)

# Ignoriamo i warning per pulizia
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# --- FASE 1: VALIDAZIONE ---

# Dividi i dati di training per la validazione
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train_features, y_train, test_size=0.20, random_state=42, stratify=y_train
)

# Allena un modello base (es. 'baseline_pipeline') sulla parte splittata
baseline_pipeline.fit(X_train_split, y_train_split)

# Calcola l'accuracy sul set di validazione
y_val_pred = baseline_pipeline.predict(X_val_split)
val_accuracy = accuracy_score(y_val_split, y_val_pred)

print(f"--- Accuracy di Validazione (stima): {val_accuracy * 100:.2f}% ---")

# -----------------------------------------------------------------

# --- FASE 2: ALLENAMENTO DEL MODELLO OTTIMIZZATO ---

# Allena il GridSearchCV su TUTTI i dati di training
# (Questo è il 'grid_search' definito nel Passaggio 6 della risposta precedente)
grid_search.fit(X_train_features, y_train)

print(f"\n--- Risultati GridSearchCV ---")
print(f"Migliori parametri trovati: {grid_search.best_params_}")
print(f"Migliore Accuracy (media CV): {grid_search.best_score_:.4f}")

# 1. Il tuo modello finale e ottimizzato è pronto
final_model = grid_search.best_estimator_
cv_accuracy = grid_search.best_score_

# 2. Usa il tuo modello finale (ottimizzato) per fare previsioni
#    sul set di validazione (X_val_split)
y_val_pred = final_model.predict(X_val_split)

# 3. Calcola l'accuracy confrontando le previsioni (y_val_pred)
#    con le risposte vere (y_val_split)
final_model_accuracy = accuracy_score(y_val_split, y_val_pred)

print(f"Accuracy del 'final_model' sul set di validazione: {final_model_accuracy * 100:.2f}%")

print(f"Accuracy (media da Cross-Validation): {cv_accuracy * 100:.2f}%")



Generazione delle predizioni sul set di test...
--- Accuracy di Validazione (stima): 80.50% ---
Fitting 10 folds for each of 9 candidates, totalling 90 fits

--- Risultati GridSearchCV ---
Migliori parametri trovati: {'classifier__C': 500, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear', 'selectkbest__k': 50}
Migliore Accuracy (media CV): 0.8337
Accuracy del 'final_model' sul set di validazione: 83.25%
Accuracy (media da Cross-Validation): 83.37%


In [11]:
import pandas as pd
import numpy as np

# 1. Estrai il modello migliore (la pipeline completa) da GridSearchCV
final_model = grid_search.best_estimator_

# 2. Estrai i passaggi della pipeline
preprocessor = final_model.named_steps['preprocessor']
selector = final_model.named_steps['selectkbest']
classifier = final_model.named_steps['classifier']

# 3. Estrai i coefficienti (l'importanza) - questo è corretto (avrà lunghezza 40)
coefficients = classifier.coef_[0]

# 4.
# Dobbiamo ottenere i nomi di TUTTE le feature DAL PREPROCESSOR
# e poi filtrarli in base a QUALI feature sono state scelte da SelectKBest.

try:
    # A. Ottieni tutti i 62 nomi delle feature dal preprocessor
    all_feature_names = preprocessor.get_feature_names_out()
    
    # B. Ottieni la "maschera" booleana da SelectKBest (un array di True/False lungo 62)
    mask = selector.get_support()
    
    # C. Applica la maschera ai nomi per ottenere solo i 40 nomi selezionati
    # Converti i nomi in un array numpy per un facile filtraggio
    selected_feature_names = np.array(all_feature_names)[mask]

except Exception as e:
    print(f"Errore con get_feature_names_out o get_support: {e}")
    # Il tuo fallback originale (potrebbe non funzionare con SelectKBest)
    try:
        categorical_names = preprocessor.named_transformers_['cat'] \
                                         .named_steps['onehot'] \
                                         .get_feature_names_out(categorical_features)
    except AttributeError:
        categorical_names = preprocessor.named_transformers_['cat'] \
                                         .named_steps['onehot'] \
                                         .get_feature_names_out()
    all_feature_names = numeric_features + list(categorical_names)
    mask = selector.get_support()
    selected_feature_names = np.array(all_feature_names)[mask]


# 5. Controlla le lunghezze (ora 40 == 40)
if len(selected_feature_names) != len(coefficients):
    print(f"ERRORE: La lunghezza dei nomi ({len(selected_feature_names)}) non corrisponde a quella dei coefficienti ({len(coefficients)})")
    print("C'è ancora un problema logico nel recuperare i nomi o la maschera.")
else:
    # 6. Crea un DataFrame per visualizzarli in modo chiaro
    importance_df = pd.DataFrame({
        'Feature': selected_feature_names, # Usa i nomi filtrati
        'Coefficient': coefficients
    })

    # 7. Aggiungi il 'Coefficiente Assoluto' per ordinare per impatto (sia positivo che negativo)
    importance_df['Impact'] = importance_df['Coefficient'].abs()
    importance_df = importance_df.sort_values(by='Impact', ascending=False)

    # 8. Stampa i risultati
    print("--- Importanza delle Feature (Coefficienti del Modello) ---")
    print(importance_df.to_string())

--- Importanza delle Feature (Coefficienti del Modello) ---
                        Feature  Coefficient      Impact
3         num__team_status_diff   206.208007  206.208007
1        num__hp_advantage_seen   105.771754  105.771754
2       num__mons_revealed_diff   105.030090  105.030090
38    cat__p2_lead_name_chansey    -9.428868    9.428868
22    cat__p1_lead_name_chansey     9.233678    9.233678
23   cat__p1_lead_name_cloyster    -7.605669    7.605669
39   cat__p2_lead_name_cloyster     6.898832    6.898832
32    cat__p1_lead_name_snorlax     4.068268    4.068268
47    cat__p2_lead_name_snorlax    -4.000245    4.000245
42      cat__p2_lead_name_golem     3.522784    3.522784
21   cat__p1_lead_name_articuno    -3.206037    3.206037
14           num__lead_def_diff     2.627333    2.627333
44     cat__p2_lead_name_lapras    -2.486974    2.486974
26      cat__p1_lead_name_golem    -1.963348    1.963348
29     cat__p1_lead_name_lapras     1.877254    1.877254
33    cat__p1_lead_name_star

In [12]:
# 1. Estrai il modello finale
final_model = grid_search.best_estimator_

# 2. Genera le predizioni (saranno True/False)
test_predictions_bool = final_model.predict(X_test_features)

# 3. Converti True/False in 1/0 ---
# .astype(int) converte True -> 1 e False -> 0
test_predictions_int = test_predictions_bool.astype(int)

# 4. Prendi i battle_id dall'indice
test_battle_ids = X_test_features.index

# 5. Crea il DataFrame con le due colonne CORRETTE
submission_df = pd.DataFrame({
    'battle_id': test_battle_ids,
    'player_won': test_predictions_int  # Usa la versione 1/0
})

# 6. Salva SENZA l'indice di pandas ---
# Aggiungendo 'index=False' si risolve il problema della "stessa colonna".
submission_df.to_csv('submission_predictions.csv', index=False)

print("\n-------------------------------------------------")
print("File 'submission_predictions.csv' creato con successo!")
print("Ora conterrà 1 e 0, e colonne separate.")
print("-------------------------------------------------")

# Stampa un'anteprima
print(submission_df.head())


-------------------------------------------------
File 'submission_predictions.csv' creato con successo!
Ora conterrà 1 e 0, e colonne separate.
-------------------------------------------------
   battle_id  player_won
0          0           0
1          1           1
2          2           1
3          3           1
4          4           1
