# Premier League V4.5: Re-Optimizing for Draws

A accuracy baixou porque mudámos as regras do jogo (pesos) mas mantivemos a estratégia antiga.
Nesta etapa, vamos correr o **Grid Search** novamente, mas desta vez informando o Grid Search de que os empates são importantes.

Imports e Configuração

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import re
import os
import codecs
import requests
import kagglehub # Mantemos a integra√ß√£o do Kaggle
from bs4 import BeautifulSoup
import json
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.metrics import accuracy_score, confusion_matrix # <--- AQUI ESTAVA A FALTA
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import numpy as np

sns.set_style("whitegrid")

# --- CONFIGURATION ---
# Local CSV cache of the league results (read and written by get_main_data).
DATA_FILE = 'europe_football_full.csv' 
# Local CSV cache of the scraped Understat xG data.
XG_FILE = 'europe_football_xg.csv'
# CSV with squad market values per team and season.
MARKET_VALUE_FILE = 'market_values.csv'
# First and last season start years (both inclusive) passed to the loaders.
START_YEAR = 2014 
END_YEAR = 2025

## 1. Data Acquisition (Recolha de Dados)
Vamos buscar dados reais do `football-data.co.uk`. Vamos carregar várias temporadas consecutivas para que o modelo tenha histórico suficiente para aprender padrões.

* **FTHG**: Full Time Home Goals
* **FTAG**: Full Time Away Goals
* **FTR**: Full Time Result (H=Home, D=Draw, A=Away)

In [None]:
# [CELL: Real Market Value Data Processing (Kaggle)]
import kagglehub
import pandas as pd
import os

MARKET_VALUE_FILE = 'market_values.csv'

def prepare_market_values():
    """Create ``MARKET_VALUE_FILE`` (squad value per club and season) from Kaggle data.

    Does nothing when the file already exists locally. Otherwise downloads the
    ``davidcariboo/player-scores`` dataset through ``kagglehub``, assigns every
    player valuation to a season (a season starts in August), keeps the latest
    valuation of each player per club and season, sums those per club/season
    and writes the result (columns ``Team``, ``Season``, ``Value`` in millions
    of EUR) to ``MARKET_VALUE_FILE``.

    Any failure is reported on stdout and swallowed on purpose: the rest of the
    notebook falls back to the tier-based estimate when the CSV is missing.

    Returns:
        None.
    """
    if os.path.exists(MARKET_VALUE_FILE):
        print("‚úÖ Dados de Valor de Mercado j√° existem localmente.")
        return

    print("‚¨áÔ∏è A baixar dados do Transfermarkt via Kagglehub...")
    try:
        # Download the dataset (kagglehub returns the local directory).
        path = kagglehub.dataset_download("davidcariboo/player-scores")
        print(f"üìÇ Dataset baixado em: {path}")

        print("‚öôÔ∏è A processar valores hist√≥ricos de plant√©is...")
        # player_valuations.csv: valuation history per player
        # clubs.csv: maps club_id -> club name
        valuations = pd.read_csv(os.path.join(path, "player_valuations.csv"))
        clubs = pd.read_csv(os.path.join(path, "clubs.csv"))

        # Season = year in which the season started (Aug-Dec -> same year,
        # Jan-Jul -> previous year). Vectorised instead of a per-row apply.
        valuations['date'] = pd.to_datetime(valuations['date'])
        valuation_dates = valuations['date']
        valuations['Season'] = (
            valuation_dates.dt.year - (valuation_dates.dt.month <= 7).astype(int)
        )

        # NOTE(review): this assumes 'current_club_id' is the club the player
        # belonged to at the valuation date - confirm against the dataset docs.
        val_merged = valuations.merge(
            clubs[['club_id', 'name']],
            left_on='current_club_id',
            right_on='club_id',
            how='left',
        )

        # A player is usually valued several times per season. Summing every
        # record would count such players multiple times, so keep only the
        # latest valuation of each player for a given club and season.
        if 'player_id' in val_merged.columns:
            val_merged = (
                val_merged.sort_values('date')
                .drop_duplicates(subset=['player_id', 'name', 'Season'], keep='last')
            )

        # Squad value = sum of the retained player valuations per club/season.
        squad_values = (
            val_merged.groupby(['name', 'Season'])['market_value_in_eur']
            .sum()
            .reset_index()
        )

        # Raw Transfermarkt club names are stored; normalisation happens later.
        squad_values.rename(columns={'name': 'Team', 'market_value_in_eur': 'Value'}, inplace=True)

        # Express values in millions of EUR for readability.
        squad_values['Value'] = squad_values['Value'] / 1_000_000

        squad_values.to_csv(MARKET_VALUE_FILE, index=False)
        print(f"‚úÖ 'market_values.csv' criado com sucesso! ({len(squad_values)} registos)")

    except Exception as e:
        # Deliberate best-effort: report and let the caller use the fallback.
        print(f"‚ö†Ô∏è Erro ao processar dados do Kaggle: {e}")
        print("   -> O sistema usar√° o m√©todo de estimativa (Tier System) como fallback.")

# Run the preparation step (returns immediately when the CSV already exists)
prepare_market_values()

In [None]:
# --- FUN√á√ÉO 1: Scraper Understat (Agora com Champions) ---
def scrape_understat_season(year, league_name, timeout=30):
    """Scrape finished matches with xG for one league season from understat.com.

    Args:
        year: Start year of the season (``2023`` means 2023/2024).
        league_name: League slug used in the Understat URL, e.g. 'EPL',
            'Bundesliga', 'La_liga', 'Ligue_1', 'Serie_A', 'Champions_League'.
        timeout: Seconds to wait for the HTTP response. Without a timeout a
            single stalled connection would block the whole scraping loop.

    Returns:
        A DataFrame with columns Date, HomeTeam, AwayTeam, FTHG, FTAG,
        Home_xG, Away_xG and League, one row per match flagged ``isResult``.
        An empty DataFrame is returned on a non-200 response, when the
        embedded ``datesData`` payload is not found, or on any error
        (the error is printed, not raised).
    """
    print(f"üï∑Ô∏è A recolher xG ({league_name}) de {year}/{year+1}...")
    url = f"https://understat.com/league/{league_name}/{year}"
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            return pd.DataFrame()

        # The page embeds the fixtures as an escaped JSON string literal.
        payload = re.search(r"datesData\s*=\s*JSON\.parse\('(.*?)'\)", response.text)
        if not payload:
            return pd.DataFrame()

        json_data = codecs.decode(payload.group(1), 'unicode_escape')
        fixtures = json.loads(json_data)

        # Keep only matches that already have a result.
        matches = [
            {
                'Date': m['datetime'][:10],
                'HomeTeam': m['h']['title'],
                'AwayTeam': m['a']['title'],
                'FTHG': int(m['goals']['h']),  # actual goals
                'FTAG': int(m['goals']['a']),
                'Home_xG': float(m['xG']['h']),
                'Away_xG': float(m['xG']['a']),
                'League': league_name,
            }
            for m in fixtures
            if m['isResult']
        ]
        return pd.DataFrame(matches)
    except Exception as e:
        # Best-effort scraper: a failed season (including a timeout) is
        # reported and skipped so the remaining seasons can still be collected.
        print(f"‚ö†Ô∏è Erro no ano {year} ({league_name}): {e}")
        return pd.DataFrame()

# --- FUN√á√ÉO 2: Carregar Dados Ligas (Football-Data) ---
def get_main_data(start, end):
    """Load league results, from the local cache or from football-data.co.uk.

    When ``DATA_FILE`` exists it is read and returned as-is (dates are not
    parsed on this path). Otherwise one CSV per division and season is
    downloaded, concatenated, stripped of rows without a valid 'Date' or
    'FTR', sorted by date, cached to ``DATA_FILE`` and returned.

    Args:
        start: First season start year (inclusive).
        end: Last season start year (inclusive).

    Returns:
        A DataFrame with the match rows.

    Raises:
        ValueError: If no season file could be downloaded at all.
    """
    if os.path.exists(DATA_FILE):
        print(f"üìÇ Carregando dados locais: {DATA_FILE}")
        df = pd.read_csv(DATA_FILE)
        return df

    print("üåê A descarregar dados das Ligas (Football-Data)...")
    dfs = []
    base_url = "https://www.football-data.co.uk/mmz4281/{}/{}.csv"
    divisions = ['E0', 'D1', 'SP1', 'F1', 'I1']

    for year in range(start, end + 1):
        # Season code is the two-digit start and end years, e.g. 2014 -> "1415".
        season = f"{str(year)[-2:]}{str(year+1)[-2:]}"
        for div in divisions:
            url = base_url.format(season, div)
            try:
                df = pd.read_csv(url)
                df['Div'] = div
                df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')
                dfs.append(df)
            except Exception as e:
                # Best-effort download: a missing or broken season file is
                # reported and skipped. Catching Exception (not a bare except)
                # keeps Ctrl+C able to abort the loop.
                print(f"   -> Aviso: {div} {season} ignorado ({e})")

    if not dfs:
        raise ValueError(
            "No league data could be downloaded from football-data.co.uk "
            f"for seasons {start}-{end}"
        )

    full_df = pd.concat(dfs, ignore_index=True).dropna(subset=['Date', 'FTR'])
    # Sort before caching so the cached file and the returned frame agree.
    full_df = full_df.sort_values('Date').reset_index(drop=True)
    full_df.to_csv(DATA_FILE, index=False)
    return full_df

# --- FUN√á√ÉO 3: Limpeza de Nomes (GLOBAL + EUROPE) ---
# Alias -> canonical team name. Built once at import time instead of on every
# call, since clean_team_name is applied to whole DataFrame columns.
_TEAM_NAME_MAP = {
    # --- ENGLAND ---
    'Manchester United': 'Man United', 'Manchester City': 'Man City',
    'Newcastle United': 'Newcastle', 'West Ham United': 'West Ham',
    'Wolverhampton Wanderers': 'Wolves', 'Brighton': 'Brighton',
    'Leicester City': 'Leicester', 'Leeds United': 'Leeds',
    'Tottenham Hotspur': 'Tottenham', 'Nottingham Forest': "Nott'm Forest",
    'Sheffield United': 'Sheffield United', 'Luton': 'Luton',
    'Brentford': 'Brentford', 'Bournemouth': 'Bournemouth',

    # --- GERMANY ---
    'Bayern Munich': 'Bayern Munich', 'Bayern München': 'Bayern Munich',
    'Borussia Dortmund': 'Borussia Dortmund', 'Dortmund': 'Borussia Dortmund',
    'Bayer Leverkusen': 'Bayer Leverkusen', 'Leverkusen': 'Bayer Leverkusen',
    'RB Leipzig': 'RB Leipzig', 'Leipzig': 'RB Leipzig',
    'Borussia Monchengladbach': 'Borussia M.Gladbach', "M'gladbach": 'Borussia M.Gladbach',
    'Eintracht Frankfurt': 'Eintracht Frankfurt', 'Frankfurt': 'Eintracht Frankfurt',
    'Wolfsburg': 'Wolfsburg', 'VfL Wolfsburg': 'Wolfsburg',
    'Mainz 05': 'Mainz 05', 'Mainz': 'Mainz 05',
    'Stuttgart': 'VfB Stuttgart', 'VfB Stuttgart': 'VfB Stuttgart',
    'Freiburg': 'Freiburg', 'SC Freiburg': 'Freiburg',
    'Union Berlin': 'Union Berlin', 'FC Union Berlin': 'Union Berlin',
    'Bochum': 'VfL Bochum', 'VfL Bochum': 'VfL Bochum',
    'Koln': 'FC Koln', 'FC Köln': 'FC Koln',
    'Hertha': 'Hertha Berlin', 'Hertha BSC': 'Hertha Berlin',
    'Schalke 04': 'Schalke 04', 'Schalke': 'Schalke 04',
    # Mis-encoded spellings of the two umlaut names above; kept so inputs
    # that went through the same encoding damage still resolve.
    'Bayern M√ºnchen': 'Bayern Munich', 'FC K√∂ln': 'FC Koln',

    # --- SPAIN ---
    'Ath Bilbao': 'Athletic Club', 'Athletic Bilbao': 'Athletic Club',
    'Atl Madrid': 'Atletico Madrid', 'Atletico': 'Atletico Madrid',
    'Barcelona': 'Barcelona', 'Real Madrid': 'Real Madrid',
    'Betis': 'Real Betis', 'Real Betis': 'Real Betis',
    'Celta': 'Celta Vigo', 'Celta Vigo': 'Celta Vigo',
    'Espanol': 'Espanyol', 'Espanyol': 'Espanyol',
    'Sociedad': 'Real Sociedad', 'Real Sociedad': 'Real Sociedad',
    'Valencia': 'Valencia', 'Valladolid': 'Real Valladolid',
    'Villarreal': 'Villarreal', 'Girona': 'Girona',
    'Alaves': 'Alaves', 'Cadiz': 'Cadiz', 'Almeria': 'Almeria',

    # --- FRANCE ---
    'Paris SG': 'Paris Saint Germain', 'PSG': 'Paris Saint Germain',
    'Marseille': 'Marseille', 'Lyon': 'Lyon', 'Monaco': 'Monaco',
    'Lille': 'Lille', 'Nice': 'Nice', 'Rennes': 'Rennes',
    'Lens': 'Lens', 'Montpellier': 'Montpellier', 'Nantes': 'Nantes',
    'Reims': 'Reims', 'Strasbourg': 'Strasbourg', 'Toulouse': 'Toulouse',
    'Brest': 'Brest', 'Lorient': 'Lorient', 'Metz': 'Metz',
    'St Etienne': 'Saint-Etienne', 'Saint-Etienne': 'Saint-Etienne',

    # --- ITALY ---
    'Inter': 'Inter', 'Internazionale': 'Inter',
    'Milan': 'AC Milan', 'Juventus': 'Juventus', 'Roma': 'Roma',
    'Lazio': 'Lazio', 'Napoli': 'Napoli', 'Atalanta': 'Atalanta',
    'Fiorentina': 'Fiorentina', 'Torino': 'Torino', 'Udinese': 'Udinese',
    'Bologna': 'Bologna', 'Verona': 'Verona', 'Hellas Verona': 'Verona',
    'Empoli': 'Empoli', 'Lecce': 'Lecce', 'Sassuolo': 'Sassuolo',
    'Monza': 'Monza', 'Genoa': 'Genoa', 'Salernitana': 'Salernitana',

    # --- OTHERS (CHAMPIONS LEAGUE) ---
    'Benfica': 'Benfica', 'Sporting CP': 'Sporting CP', 'Porto': 'Porto',
    'Ajax': 'Ajax', 'PSV Eindhoven': 'PSV Eindhoven', 'Feyenoord': 'Feyenoord',
    'Club Brugge': 'Club Brugge', 'Shakhtar Donetsk': 'Shakhtar Donetsk',
    'Galatasaray': 'Galatasaray', 'Celtic': 'Celtic', 'Rangers': 'Rangers',
    'Salzburg': 'RB Salzburg', 'Red Bull Salzburg': 'RB Salzburg',
}


def clean_team_name(name):
    """Map a team-name alias to the canonical name used across the datasets.

    Args:
        name: Team name as found in one of the data sources.

    Returns:
        The canonical name when ``name`` is a known alias, otherwise ``name``
        unchanged.
    """
    return _TEAM_NAME_MAP.get(name, name)

# ==========================================
# EXECUTION
# ==========================================

# 1. Load the main league data (local cache or download)
df_main = get_main_data(START_YEAR, END_YEAR)
# Parse 'Date' again: the cached path of get_main_data returns the CSV as read,
# without converting the column to datetimes. Unparseable dates are dropped.
df_main['Date'] = pd.to_datetime(df_main['Date'], dayfirst=True, errors='coerce')
df_main = df_main.dropna(subset=['Date'])

# 2. Load xG and Champions League matches (Understat), from cache or by scraping
if os.path.exists(XG_FILE):
    print("üìÇ Carregando dados Understat locais...")
    df_understat = pd.read_csv(XG_FILE)
else:
    print("üåê A iniciar scraping Understat (Ligas + Champions)...")
    dfs = []
    # One request per league (plus Champions League) and season
    for y in range(START_YEAR, END_YEAR+1):
        dfs.append(scrape_understat_season(y, 'EPL'))
        dfs.append(scrape_understat_season(y, 'Bundesliga'))
        dfs.append(scrape_understat_season(y, 'La_liga'))
        dfs.append(scrape_understat_season(y, 'Ligue_1'))
        dfs.append(scrape_understat_season(y, 'Serie_A'))
        dfs.append(scrape_understat_season(y, 'Champions_League')) # Champions League
    
    df_understat = pd.concat(dfs, ignore_index=True)
    if not df_understat.empty:
        # Normalise team names before caching so the cached file is ready to merge
        df_understat['HomeTeam'] = df_understat['HomeTeam'].apply(clean_team_name)
        df_understat['AwayTeam'] = df_understat['AwayTeam'].apply(clean_team_name)
        df_understat.to_csv(XG_FILE, index=False)
    else:
        df_understat = pd.DataFrame()

# 3. MERGE
# df_main holds the league matches; df_understat holds xG and Champions League matches.
# df_understat enriches df_main with xG, and its Champions League rows are appended.

if not df_understat.empty:
    # Strip the time component on both sides so the merge key is the calendar day
    df_understat['Date'] = pd.to_datetime(df_understat['Date']).dt.normalize()
    df_main['Date'] = df_main['Date'].dt.normalize()
    
    # A) Split league matches from Champions League matches
    # League matches (merged onto df_main below)
    df_leagues = df_understat[df_understat['League'] != 'Champions_League']
    
    # Champions League matches (given the main dataset's structure so they can be appended)
    df_cl = df_understat[df_understat['League'] == 'Champions_League'].copy()
    df_cl['Div'] = 'CL' # Division code used for the Champions League
    
    # The main dataset stores FTR (Full Time Result) as H/D/A; Understat only has goals.
    # Derive FTR for the Champions League rows from the goal columns.
    def get_res(row):
        if row['FTHG'] > row['FTAG']: return 'H'
        elif row['FTHG'] < row['FTAG']: return 'A'
        else: return 'D'
    df_cl['FTR'] = df_cl.apply(get_res, axis=1)
    
    # Keep only the columns shared with the main dataset before concatenating
    cols_common = ['Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'Div', 'Home_xG', 'Away_xG']
    df_cl_clean = df_cl[cols_common]
    
    # B) Merge xG into the league matches
    print("üîÑ A realizar Merge (Ligas)...")
    # Drop any pre-existing xG columns so the merge does not create suffixed duplicates
    cols_exclude = [c for c in df_main.columns if 'xG' in c]
    df_main = df_main.drop(columns=cols_exclude)
    
    # Left merge: league matches without an Understat counterpart keep NaN xG
    df_final = df_main.merge(
        df_leagues[['Date', 'HomeTeam', 'AwayTeam', 'Home_xG', 'Away_xG']],
        on=['Date', 'HomeTeam', 'AwayTeam'],
        how='left'
    )
    
    # C) Append the Champions League matches
    print(f"üá™üá∫ A adicionar {len(df_cl_clean)} jogos da Champions League...")
    df_final = pd.concat([df_final, df_cl_clean], ignore_index=True)
    
else:
    # No Understat data available: continue with the league data only
    df_final = df_main.copy()

# Drop rows dated after today, then sort chronologically
# (the ELO computation in feature_engineering walks the rows in date order)
hoje = pd.Timestamp.now().normalize()
df_final = df_final[df_final['Date'] <= hoje]
df = df_final.sort_values(['Date']).reset_index(drop=True)
# Matches without xG get a constant 1.0 for both sides
df = df.fillna({'Home_xG': 1.0, 'Away_xG': 1.0})

print(f"‚úÖ Total Jogos (Big 5 + UCL): {len(df)}")
# NOTE(review): `display` is not imported here; presumably the IPython/Jupyter builtin.
display(df.tail(3))

## 2. Feature Engineering Completa (ELO + Stats + Odds)

Aqui adicionamos as colunas B365H, B365D, B365A (Odds da Bet365).

In [None]:
# [CELL: Feature Engineering v7.0 - Champions League Logic]
def feature_engineering(df):
    """Build the model features from the chronological match history.

    Adds season, division code, market values, running league points and
    positions, motivation, rest days and fatigue flags, ELO ratings, rolling
    5-match averages of match statistics and implied odds probabilities.

    Args:
        df: Match DataFrame with at least 'Date' (datetime), 'Div',
            'HomeTeam', 'AwayTeam' and 'FTR'. Statistic columns (FTHG, FTAG,
            HS, AS, HST, AST, HC, AC, Home_xG, Away_xG) and Bet365 odds
            (B365H, B365D, B365A) are used when present.

    Returns:
        A tuple ``(df_clean, existing_features, elo_dict, le_div)``:
        ``df_clean`` keeps only rows with a result and positive home odds,
        ``existing_features`` is the list of feature column names (in a
        deterministic order), ``elo_dict`` maps each team to its ELO after
        the last processed match, and ``le_div`` is the fitted division
        ``LabelEncoder``.
    """
    print("‚öôÔ∏è Gerando Features (Com L√≥gica Champions League)...")
    df = df.copy()

    # 1. PREPARATION
    # Season = year in which the season started (August onwards).
    df['Season'] = df['Date'].apply(lambda x: x.year if x.month > 7 else x.year - 1).astype(int)
    df = df.sort_values('Date')

    le_div = LabelEncoder()
    df['Div_Code'] = le_div.fit_transform(df['Div'])
    print(f"   Ligas: {le_div.classes_}")  # lets the user check that 'CL' is present

    # ---------------------------------------------------------
    # 2. MARKET VALUE
    # ---------------------------------------------------------
    # real_values[season][team] -> squad value, loaded from the CSV when available.
    real_values = {}
    if os.path.exists('market_values.csv'):
        try:
            mv_df = pd.read_csv('market_values.csv')
            mv_df.columns = [c.strip().capitalize() for c in mv_df.columns]
            if 'Year' in mv_df.columns:
                mv_df.rename(columns={'Year': 'Season'}, inplace=True)

            if 'Season' in mv_df.columns and 'Value' in mv_df.columns:
                def normalize_tm_name(name):
                    # Substring-based mapping of club names to the canonical
                    # names used in the match data; unmatched names come back
                    # lower-cased.
                    name = str(name).lower()
                    if 'manchester city' in name: return 'Man City'
                    if 'manchester united' in name: return 'Man United'
                    if 'paris saint-germain' in name: return 'Paris Saint Germain'
                    if 'leverkusen' in name: return 'Bayer Leverkusen'
                    if 'monchengladbach' in name: return 'Borussia M.Gladbach'
                    if 'inter' in name: return 'Inter'
                    if 'milan' in name: return 'AC Milan'
                    if 'sporting cp' in name: return 'Sporting CP'
                    if 'benfica' in name: return 'Benfica'
                    if 'porto' in name: return 'Porto'
                    return name

                for _, row in mv_df.iterrows():
                    try:
                        s = int(row['Season'])
                    except (TypeError, ValueError):
                        # Rows without a usable season are skipped.
                        continue
                    t = normalize_tm_name(row['Team'])
                    v = row['Value']
                    if s not in real_values:
                        real_values[s] = {}
                    # Store under both the normalised and the raw name.
                    real_values[s][t] = v
                    real_values[s][row['Team']] = v
        except Exception as e:
            # Best-effort: on any problem keep what was loaded so far and let
            # get_market_value fall back to the tier estimate - but say so.
            print(f"   -> Aviso: falha ao ler 'market_values.csv' ({e}); a usar estimativa por tiers.")

    def get_market_value(team, season):
        # Lookup order: exact name, cleaned name, substring match, tier fallback.
        if season in real_values:
            if team in real_values[season]: return real_values[season][team]
            tc = clean_team_name(team)
            if tc in real_values[season]: return real_values[season][tc]
            for key in real_values[season]:
                if isinstance(key, str) and (team in key or key in team): return real_values[season][key]

        tier_1 = ['Man City', 'Arsenal', 'Liverpool', 'Real Madrid', 'Barcelona', 'Bayern Munich', 'Paris Saint Germain', 'Inter']
        tier_2 = ['Man United', 'Chelsea', 'Tottenham', 'Newcastle', 'Atletico Madrid', 'Borussia Dortmund', 'Bayer Leverkusen', 'RB Leipzig', 'Juventus', 'AC Milan', 'Napoli', 'Benfica', 'Porto', 'Sporting CP']
        if team in tier_1: return 900
        if team in tier_2: return 500
        return 150

    df['Home_Value'] = df.apply(lambda x: get_market_value(x['HomeTeam'], x['Season']), axis=1)
    df['Away_Value'] = df.apply(lambda x: get_market_value(x['AwayTeam'], x['Season']), axis=1)
    df['Value_Ratio'] = np.log1p(df['Home_Value']) - np.log1p(df['Away_Value'])

    # ---------------------------------------------------------
    # 3. POINTS AND MOTIVATION (adapted for the Champions League)
    # ---------------------------------------------------------
    # standings[season][div][team] -> {'pts': ..., 'games': ...} before the current match.
    standings = {}
    df['Home_Pts'] = 0
    df['Away_Pts'] = 0
    df['Home_Pos'] = 10
    df['Away_Pos'] = 10
    df['Home_Game_Num'] = 0
    df['Away_Game_Num'] = 0

    # Competition type flag (0 = league, 1 = Champions League)
    df['Is_Cup'] = df['Div'].apply(lambda x: 1 if x == 'CL' else 0)

    for i, row in df.iterrows():
        season = row['Season']
        div = row['Div']
        h, a, res = row['HomeTeam'], row['AwayTeam'], row['FTR']

        if season not in standings: standings[season] = {}
        if div not in standings[season]: standings[season][div] = {}
        if h not in standings[season][div]: standings[season][div][h] = {'pts': 0, 'games': 0}
        if a not in standings[season][div]: standings[season][div][a] = {'pts': 0, 'games': 0}

        # Points/positions are only meaningful for leagues; CL rows get dummies.
        if div != 'CL':
            # Values are written BEFORE this match is counted (no leakage).
            df.at[i, 'Home_Pts'] = standings[season][div][h]['pts']
            df.at[i, 'Away_Pts'] = standings[season][div][a]['pts']

            teams_sorted = sorted(standings[season][div].items(), key=lambda x: x[1]['pts'], reverse=True)
            ranks = {t: r+1 for r, (t, data) in enumerate(teams_sorted)}
            df.at[i, 'Home_Pos'] = ranks.get(h, 10)
            df.at[i, 'Away_Pos'] = ranks.get(a, 10)
        else:
            df.at[i, 'Home_Pts'] = 0
            df.at[i, 'Away_Pts'] = 0
            df.at[i, 'Home_Pos'] = 1  # dummy position for CL rows
            df.at[i, 'Away_Pos'] = 1

        df.at[i, 'Home_Game_Num'] = standings[season][div][h]['games'] + 1
        df.at[i, 'Away_Game_Num'] = standings[season][div][a]['games'] + 1

        # Update the running table with this match's result.
        pts_h = 3 if res == 'H' else 1 if res == 'D' else 0
        pts_a = 3 if res == 'A' else 1 if res == 'D' else 0
        standings[season][div][h]['pts'] += pts_h
        standings[season][div][a]['pts'] += pts_a
        standings[season][div][h]['games'] += 1
        standings[season][div][a]['games'] += 1

    def get_motivation(game_num, pos, is_cup):
        if is_cup: return 1.3  # Champions League: highest motivation value
        if game_num < 30: return 1.0
        if pos <= 6: return 1.2
        if pos >= 16: return 1.3
        return 0.5

    df['Home_Motiv'] = df.apply(lambda x: get_motivation(x['Home_Game_Num'], x['Home_Pos'], x['Is_Cup']), axis=1)
    df['Away_Motiv'] = df.apply(lambda x: get_motivation(x['Away_Game_Num'], x['Away_Pos'], x['Is_Cup']), axis=1)

    # ---------------------------------------------------------
    # 4. FATIGUE AND ELO
    # ---------------------------------------------------------
    # NOTE(review): diff() runs within HomeTeam (resp. AwayTeam) groups, so
    # this is the gap to the team's previous HOME (resp. AWAY) match in the
    # data, not to its previous match overall - confirm this is what the
    # fatigue flag below is meant to use.
    df['Rest_Home'] = df.groupby('HomeTeam')['Date'].diff().dt.days.fillna(7).clip(upper=15)
    df['Rest_Away'] = df.groupby('AwayTeam')['Date'].diff().dt.days.fillna(7).clip(upper=15)

    def check_euro_fatigue(rest, value):
        # Flag high-value squads playing on short rest.
        if value > 400 and rest < 4: return 1
        return 0

    df['Home_Fatigue'] = df.apply(lambda x: check_euro_fatigue(x['Rest_Home'], x['Home_Value']), axis=1)
    df['Away_Fatigue'] = df.apply(lambda x: check_euro_fatigue(x['Rest_Away'], x['Away_Value']), axis=1)

    df['HomeElo'] = 1500.0
    df['AwayElo'] = 1500.0
    elo_dict = {}
    k_factor = 20

    # ELO is global: league and Champions League matches share one rating pool.
    for i, row in df.iterrows():
        h, a, res = row['HomeTeam'], row['AwayTeam'], row['FTR']
        h_elo = elo_dict.get(h, 1500.0)
        a_elo = elo_dict.get(a, 1500.0)
        # Store the pre-match ratings as features, then update with the result.
        df.at[i, 'HomeElo'] = h_elo
        df.at[i, 'AwayElo'] = a_elo
        actual = 1 if res == 'H' else 0.5 if res == 'D' else 0
        exp = 1 / (1 + 10**((a_elo - h_elo)/400))
        elo_dict[h] = h_elo + k_factor * (actual - exp)
        elo_dict[a] = a_elo - k_factor * (actual - exp)

    df['EloDiff'] = df['HomeElo'] - df['AwayElo']

    # ---------------------------------------------------------
    # 5. ROLLING STATS
    # ---------------------------------------------------------
    # Only the statistic columns actually present are averaged.
    cols_to_avg = ['FTHG', 'FTAG', 'HS', 'AS', 'HST', 'AST', 'HC', 'AC', 'Home_xG', 'Away_xG']
    cols_to_avg = [c for c in cols_to_avg if c in df.columns]

    home_stats = df[['Date', 'HomeTeam'] + cols_to_avg].rename(columns={'HomeTeam': 'Team'})
    away_stats = df[['Date', 'AwayTeam'] + cols_to_avg].rename(columns={'AwayTeam': 'Team'})

    for col in cols_to_avg:
        home_stats.rename(columns={col: f'Stat_{col}'}, inplace=True)
        away_stats.rename(columns={col: f'Stat_{col}'}, inplace=True)

    # NOTE(review): each Stat_ column keeps its home/away meaning, so a team's
    # rolling Stat_FTHG mixes goals it scored (home rows) with goals it
    # conceded (away rows) - confirm this is intended.
    all_stats = pd.concat([home_stats, away_stats]).sort_values(['Team', 'Date'])

    for col in [c for c in all_stats.columns if 'Stat_' in c]:
        # shift(1) excludes the current match from its own average.
        all_stats[f'Avg_{col}_L5'] = all_stats.groupby('Team')[col].transform(lambda x: x.shift(1).rolling(5, min_periods=3).mean()).fillna(0)

    feat_cols = ['Date', 'Team'] + [c for c in all_stats.columns if 'Avg_' in c]
    df = df.merge(all_stats[feat_cols], left_on=['Date', 'HomeTeam'], right_on=['Date', 'Team'], how='left').drop(columns=['Team'])
    df = df.rename(columns={c: f'Home_{c}' for c in feat_cols if 'Avg_' in c})
    df = df.merge(all_stats[feat_cols], left_on=['Date', 'AwayTeam'], right_on=['Date', 'Team'], how='left').drop(columns=['Team'])
    df = df.rename(columns={c: f'Away_{c}' for c in feat_cols if 'Avg_' in c})

    # Odds features: missing odds columns are created as 0 so the code below works.
    if 'B365H' not in df.columns: df['B365H'] = 0
    if 'B365D' not in df.columns: df['B365D'] = 0
    if 'B365A' not in df.columns: df['B365A'] = 0

    # Implied probability = 1/odd; rows without a positive odd get 0.
    df['Imp_Home'] = np.where(df['B365H']>0, 1/df['B365H'], 0)
    df['Imp_Draw'] = np.where(df['B365D']>0, 1/df['B365D'], 0)
    df['Imp_Away'] = np.where(df['B365A']>0, 1/df['B365A'], 0)

    # Double-chance estimates (1X / X2 / 12)
    df['Imp_1X'] = df['Imp_Home'] + df['Imp_Draw']
    df['Imp_X2'] = df['Imp_Draw'] + df['Imp_Away']
    df['Imp_12'] = df['Imp_Home'] + df['Imp_Away']

    # FINAL FEATURE LIST
    features_needed = [
        'Div_Code', 'Is_Cup',
        'HomeElo', 'AwayElo', 'EloDiff',
        'Rest_Home', 'Rest_Away',
        'Home_Value', 'Away_Value', 'Value_Ratio',
        'Home_Fatigue', 'Away_Fatigue', 'Home_Motiv', 'Away_Motiv',
        'Imp_Home', 'Imp_Draw', 'Imp_Away',
        'Imp_1X', 'Imp_X2', 'Imp_12',
        'Home_Pts', 'Away_Pts', 'Home_Pos', 'Away_Pos'
    ]
    features_needed += [c for c in df.columns if 'Home_Avg_' in c or 'Away_Avg_' in c]
    # De-duplicate while preserving order. list(set(...)) would order the
    # columns by string hash, which changes between interpreter sessions and
    # makes the feature matrix layout (and training runs) non-reproducible.
    features_needed = list(dict.fromkeys(features_needed))
    existing_features = [f for f in features_needed if f in df.columns]

    print("üßπ Limpeza Final (Removendo jogos sem Odds para Treino)...")
    # Training only uses matches with a result and valid home odds; rows
    # without odds still contributed to the ELO and rolling statistics above.
    df_clean = df.dropna(subset=['FTR']).copy()
    df_clean = df_clean[df_clean['Imp_Home'] > 0]

    df_clean[existing_features] = df_clean[existing_features].fillna(0)
    df_clean.replace([np.inf, -np.inf], 0, inplace=True)

    return df_clean, existing_features, elo_dict, le_div

# Build the training frame, the feature list, the final ELO table and the division encoder
df_ready, features, current_elos, le_div = feature_engineering(df)
print(f"‚úÖ Features updated. Total features: {len(features)}")

## 3. Preparação e Treino do Modelo
Treino Intensivo: Grid Search (Hyperparameter Tuning). Aqui é onde "apertamos" o modelo. Vamos testar várias combinações. Nota: Isto pode demorar 2 ou 3 minutos a correr.

In [None]:
# [CELL: Full optimised training - imports fixed]

# Safety cleanup: replace infinities and NaNs with 0 across the whole frame
print("üßπ A limpar valores infinitos/nulos...")
df_ready.replace([np.inf, -np.inf], 0, inplace=True)
df_ready.fillna(0, inplace=True)

# 1. Prepare the data
# NOTE(review): `target` is assigned but not referenced again in this cell.
target = 'Target'
le = LabelEncoder()
df_ready['Target'] = le.fit_transform(df_ready['FTR']) # 0=Away, 1=Draw, 2=Home

# Positional 80/20 split (first 80% of rows train, last 20% test, no shuffling)
split_index = int(len(df_ready) * 0.80)
train = df_ready.iloc[:split_index]
test = df_ready.iloc[split_index:]

X_train, y_train = train[features], train['Target']
X_test, y_test = test[features], test['Target']

print(f"üèãÔ∏è A iniciar Otimiza√ß√£o Dupla em {len(X_train)} jogos...")
# Shared cross-validation splitter for both grid searches
tscv = TimeSeriesSplit(n_splits=3)

# --- 1. NORMAL MODEL (3-class H/D/A) ---
print("\nüîç A otimizar Modelo Normal (1X2)...")
xgb_multi = xgb.XGBClassifier(objective='multi:softprob', random_state=42, eval_metric='mlogloss')
param_grid_multi = {'n_estimators': [200, 300], 'max_depth': [3, 4], 'learning_rate': [0.01, 0.03], 'subsample': [0.8]}

grid_multi = GridSearchCV(estimator=xgb_multi, param_grid=param_grid_multi, cv=tscv, scoring='neg_log_loss', n_jobs=-1, verbose=1)

# Draw rows get weight 1.15, all other rows weight 1.0
sample_weights = np.ones(len(y_train))
draw_code = le.transform(['D'])[0]
sample_weights[y_train == draw_code] = 1.15

grid_multi.fit(X_train, y_train, sample_weight=sample_weights)
model_multi = grid_multi.best_estimator_
print(f"‚úÖ Melhores Params (Normal): {grid_multi.best_params_}")

# --- 2. SNIPER MODEL (binary: class 2 vs. the rest) ---
print("\nüîç A otimizar Modelo Sniper (Bin√°rio)...")
y_train_win = (y_train == 2).astype(int); y_test_win = (y_test == 2).astype(int)
xgb_sniper = xgb.XGBClassifier(objective='binary:logistic', random_state=42, eval_metric='logloss')
param_grid_sniper = {'n_estimators': [150, 200, 250], 'max_depth': [3, 4, 5], 'learning_rate': [0.01, 0.02, 0.03], 'subsample': [0.8]}

grid_sniper = GridSearchCV(estimator=xgb_sniper, param_grid=param_grid_sniper, cv=tscv, scoring='neg_log_loss', n_jobs=-1, verbose=1)
grid_sniper.fit(X_train, y_train_win)
model_sniper = grid_sniper.best_estimator_
print(f"‚úÖ Melhores Params (Sniper): {grid_sniper.best_params_}")

# --- 3. SHIELD MODEL (binary: class 0 vs. the rest) ---
# Reuses the Sniper grid search's best hyper-parameters; no separate search is run.
print("\nüõ°Ô∏è A treinar Modelo Shield...")
y_train_1x = (y_train != 0).astype(int)
model_shield = xgb.XGBClassifier(**grid_sniper.best_params_, objective='binary:logistic', random_state=42)
model_shield.fit(X_train, y_train_1x)

# --- 4. VISUALISATION ---
# Confusion matrices on the held-out 20% for the Normal and Sniper models
print("\nüìä RELAT√ìRIO VISUAL FINAL")
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

preds_multi = model_multi.predict(X_test)
acc_multi = accuracy_score(y_test, preds_multi)
cm_multi = confusion_matrix(y_test, preds_multi)
sns.heatmap(cm_multi, annot=True, fmt='d', cmap='Blues', xticklabels=['Away', 'Draw', 'Home'], yticklabels=['Away', 'Draw', 'Home'], ax=axes[0])
axes[0].set_title(f'Modelo Normal (Acc: {acc_multi:.1%})')

preds_sniper = model_sniper.predict(X_test)
acc_sniper = accuracy_score(y_test_win, preds_sniper)
cm_sniper = confusion_matrix(y_test_win, preds_sniper)
sns.heatmap(cm_sniper, annot=True, fmt='d', cmap='Greens', xticklabels=['Not Win', 'Win'], yticklabels=['Not Win', 'Win'], ax=axes[1])
axes[1].set_title(f'Modelo Sniper (Acc: {acc_sniper:.1%})')

plt.tight_layout()
plt.show()

In [None]:
# [CELL: Feature Importance]
# Pair every feature name with the importance reported by the 3-class model,
# then rank the table from most to least important.
feature_imp = pd.DataFrame(
    {
        'Feature': features,
        'Importance': model_multi.feature_importances_,
    }
)
feature_imp = feature_imp.sort_values('Importance', ascending=False)
feature_imp = feature_imp.reset_index(drop=True)

print("üìã TOP FEATURES (General Model):")
# Same importances expressed as percentages with two decimals.
feature_imp['Importance %'] = feature_imp['Importance'].mul(100).round(2)
display(feature_imp.loc[:, ['Feature', 'Importance %']])

## 4. Aplicação na "Vida Real"
Aqui está a função final. Ela usa o dicionário `current_elos` (que contém os valores mais recentes após o último jogo do dataset) para fazer previsões sobre jogos futuros.

In [None]:
# [CELL: Prediction Function v8.1 - League validation (guards against mismatched teams)]
def predict_match_advanced(date_str, home_team, away_team,
                           odd_h, odd_d, odd_a,
                           division='E0',  # league code; callers should always pass it explicitly
                           odd_1x=None, odd_12=None, odd_x2=None):
    """Print a model-vs-market betting report for one upcoming fixture.

    Builds a single-row feature vector from the notebook globals (`df_ready`,
    `features`, `current_elos`, `le_div`), queries `model_multi`,
    `model_sniper` and `model_shield`, and prints the predicted probabilities,
    a per-market value scan and a final pick. Nothing is returned: every exit
    path returns None and all results go to stdout.

    Args:
        date_str: Match date, parsed with `pd.to_datetime`. Only rows of
            `df_ready` dated strictly before it feed the team features.
        home_team: Home team name as it appears in `df_ready`.
        away_team: Away team name as it appears in `df_ready`.
        odd_h: Decimal odd for a home win. Must be a positive number.
        odd_d: Decimal odd for a draw. Must be a positive number.
        odd_a: Decimal odd for an away win. Must be a positive number.
        division: League code ('E0', 'D1', 'SP1', 'F1', 'I1' or 'CL').
            'CL' skips the team/league check and sets the cup features.
        odd_1x: Optional double-chance odd (home or draw).
        odd_12: Optional double-chance odd (home or away).
        odd_x2: Optional double-chance odd (draw or away).

    Returns:
        None. The function stops early, after printing a message, when the
        1X2 odds are unusable, when a team's usual league differs from
        `division`, when there is no history before the match date, or when
        no market has an odd above 1.
    """
    # The 1X2 odds are inverted below to build implied-probability features,
    # so zero / None / non-numeric values are rejected here instead of
    # surfacing later as ZeroDivisionError or TypeError.
    try:
        odds_usable = all(float(o) > 0 for o in (odd_h, odd_d, odd_a))
    except (TypeError, ValueError):
        odds_usable = False
    if not odds_usable:
        print("Erro: odds 1X2 devem ser valores positivos.")
        return

    match_date = pd.to_datetime(date_str)

    div_map = {
        'E0': 'Premier League üá¨üáß', 'D1': 'Bundesliga üá©üá™',
        'SP1': 'La Liga üá™üá∏', 'F1': 'Ligue 1 üá´üá∑',
        'I1': 'Serie A üáÆüáπ', 'CL': 'Champions League üá™üá∫'
    }
    div_name = div_map.get(division, division)

    # --- 0. SAFETY VALIDATION ---
    # A team's "home" league is the most frequent non-CL division among its
    # last 20 rows in df_ready. Abort if it differs from the requested one.
    if division != 'CL':  # any team may appear in the Champions League
        print("üõ°Ô∏è A validar equipas...")
        for team in (home_team, away_team):
            recent = df_ready[(df_ready['HomeTeam'] == team) | (df_ready['AwayTeam'] == team)].tail(20)
            domestic = recent.loc[recent['Div'] != 'CL', 'Div'].value_counts()
            if domestic.empty:
                continue  # unknown team or CL-only history: nothing to check
            main_league = domestic.index[0]  # value_counts sorts most common first
            if main_league != division:
                print(f"‚ùå ERRO CR√çTICO: {team} joga na {div_map.get(main_league, main_league)}, n√£o na {div_name}!")
                print("   -> Corre a fun√ß√£o com a divis√£o correta ou muda a equipa.")
                return

    print(f"\nüîÆ PREVIS√ÉO AVAN√áADA ({div_name}): {home_team} vs {away_team} ({date_str})")
    print("=" * 100)

    # Only matches played strictly before the fixture may inform the features.
    past_data = df_ready[df_ready['Date'] < match_date].copy()
    if past_data.empty:
        print("‚ö†Ô∏è Erro: Sem dados hist√≥ricos suficientes.")
        return

    # --- 1. CONTEXT & FEATURES ---
    def team_history(team):
        """Return every past match in which `team` played, home or away."""
        return past_data[(past_data['HomeTeam'] == team) | (past_data['AwayTeam'] == team)]

    def get_market_value(team, games):
        """Return the squad value from the team's latest match, else a tier default."""
        if not games.empty:
            last = games.iloc[-1]
            side = 'Home_Value' if last['HomeTeam'] == team else 'Away_Value'
            return last.get(side, 150)
        # Fallback for teams with no history at all.
        tier_1 = ('Man City', 'Real Madrid', 'Bayern Munich', 'Paris Saint Germain', 'Inter')
        return 800 if team in tier_1 else 200

    def get_context(team, games):
        """Return (motivation, table position, rest days) for `team`."""
        if games.empty:
            return 0.5, 10, 7
        last = games.iloc[-1]
        pos = last['Home_Pos'] if last['HomeTeam'] == team else last['Away_Pos']
        rest = (match_date - last['Date']).days

        if division == 'CL':
            motiv = 1.3
        elif len(games) > 28:
            # NOTE(review): len(games) counts every past match of the team in
            # past_data, not just the current season, so this branch is taken
            # for almost any established team - confirm this matches how the
            # training feature was built.
            motiv = 0.5 if 6 < pos < 16 else 1.2
        else:
            motiv = 1.0
        return motiv, pos, rest

    home_games = team_history(home_team)
    away_games = team_history(away_team)

    h_motiv, h_pos, h_rest = get_context(home_team, home_games)
    a_motiv, a_pos, a_rest = get_context(away_team, away_games)
    h_val = get_market_value(home_team, home_games)
    a_val = get_market_value(away_team, away_games)
    h_elo = current_elos.get(home_team, 1500)
    a_elo = current_elos.get(away_team, 1500)

    input_data = {
        'Home_Motiv': h_motiv, 'Away_Motiv': a_motiv,
        'Rest_Home': h_rest, 'Rest_Away': a_rest,
        'Home_Value': h_val, 'Away_Value': a_val,
        'Value_Ratio': np.log1p(h_val) - np.log1p(a_val),
        'Is_Cup': 1 if division == 'CL' else 0,
        # "Fatigue" flags a high-value squad playing again within 4 days.
        'Home_Fatigue': 1 if (h_val > 400 and h_rest < 4) else 0,
        'Away_Fatigue': 1 if (a_val > 400 and a_rest < 4) else 0,
        'HomeElo': h_elo, 'AwayElo': a_elo,
        'EloDiff': h_elo - a_elo,
        'Home_Pts': 0, 'Away_Pts': 0,
        'Home_Pos': h_pos, 'Away_Pos': a_pos,
    }

    # Deliberate best effort: a division the encoder cannot handle maps to 0.
    try:
        input_data['Div_Code'] = le_div.transform([division])[0]
    except Exception:
        input_data['Div_Code'] = 0

    # Bookmaker implied probabilities. Missing double-chance odds are
    # approximated by summing the two matching 1X2 implied probabilities.
    input_data['Imp_Home'] = 1 / odd_h
    input_data['Imp_Draw'] = 1 / odd_d
    input_data['Imp_Away'] = 1 / odd_a
    input_data['Imp_1X'] = 1 / odd_1x if odd_1x else (1 / odd_h + 1 / odd_d)
    input_data['Imp_X2'] = 1 / odd_x2 if odd_x2 else (1 / odd_d + 1 / odd_a)
    input_data['Imp_12'] = 1 / odd_12 if odd_12 else (1 / odd_h + 1 / odd_a)

    def fill_stats(team, games, prefix):
        """Copy the team's latest rolling 'Avg_' stats into `prefix` features.

        For each model feature containing both 'Avg_' and `prefix`, the value
        comes from the team's most recent match: the Home_ column when it
        played at home, the Away_ column otherwise. The feature is set to 0
        when the Home_ column does not exist in the data.
        """
        if games.empty:
            return
        last = games.iloc[-1]
        was_home = last['HomeTeam'] == team
        for f in features:
            if 'Avg_' not in f or prefix not in f:
                continue
            stat = f.replace(prefix, "")
            home_col = f"Home_{stat}"
            if home_col not in last:
                input_data[f] = 0
            elif was_home:
                input_data[f] = last[home_col]
            else:
                input_data[f] = last.get(f"Away_{stat}", 0)

    fill_stats(home_team, home_games, "Home_")
    fill_stats(away_team, away_games, "Away_")

    # Anything still missing falls back to the dataset-wide column mean.
    for f in features:
        if f not in input_data:
            input_data[f] = df_ready[f].mean()

    # --- 2. EXECUTION ---
    X_new = pd.DataFrame([input_data])[features]
    probs = model_multi.predict_proba(X_new)[0]
    prob_a, prob_d, prob_h = probs[0], probs[1], probs[2]  # class order: away, draw, home
    conf_win = model_sniper.predict_proba(X_new)[0][1]
    # Deliberate best effort: if the Shield model is unavailable or fails,
    # fall back to the multiclass estimate of "home or draw".
    try:
        conf_shield = model_shield.predict_proba(X_new)[0][1]
    except Exception:
        conf_shield = prob_h + prob_d

    # --- 3. VISUAL REPORT ---
    print("üìä PROBABILIDADES (IA):")
    print(f"   üè† Casa: {prob_h:.1%} (Sniper: {conf_win:.1%})")
    print(f"   ü§ù Empate: {prob_d:.1%}")
    print(f"   ‚úàÔ∏è Fora: {prob_a:.1%}")
    print("-" * 100)

    opportunities = []

    def analyze(name, odd, prob):
        """Print one market line and record its expected value; skip odds <= 1."""
        if not odd or odd <= 1:
            return
        implied_prob = 1 / odd
        fair_odd = 1 / prob if prob > 0 else 99.0
        ev = (prob * odd) - 1  # expected profit per unit staked
        status = "üíé VALOR!" if ev > 0 else ("‚úÖ JUSTO" if ev > -0.05 else "‚ùå FRACO")
        print(f"   ‚Ä¢ {name:<35} | Odd: {odd:.2f} ({implied_prob:.1%}) | IA: {fair_odd:.2f} ({prob:.1%}) | {status}")
        opportunities.append({"name": name, "odd": odd, "prob": prob, "ev": ev})

    print("üí∞ SCANNER DE MERCADO (Compara√ß√£o de Percentagens):")
    analyze(f"Vitoria {home_team}", odd_h, prob_h)
    analyze("Empate", odd_d, prob_d)
    analyze(f"Vitoria {away_team}", odd_a, prob_a)

    # NOTE(review): this blend averages the multiclass P(home or draw) with the
    # Sniper's P(home win) before mixing in the Shield, which pulls the 1X
    # estimate downwards. Kept as-is because it may be intentional; confirm.
    prob_1x_val = ((prob_h + prob_d) + conf_win) / 2
    prob_1x_val = (prob_1x_val + conf_shield) / 2

    if odd_1x: analyze(f"DC 1X ({home_team} ou Empate)", odd_1x, prob_1x_val)
    if odd_x2: analyze(f"DC X2 ({away_team} ou Empate)", odd_x2, (prob_a + prob_d))
    if odd_12: analyze(f"DC 12 ({home_team} ou {away_team})", odd_12, (prob_h + prob_a))

    print("-" * 100)

    # --- 4. FINAL VERDICT ---
    # Every market is skipped when its odd is <= 1; without this guard the
    # indexing below would raise IndexError.
    if not opportunities:
        print("Nenhum mercado com odd superior a 1 para analisar.")
        return

    opportunities.sort(key=lambda x: x['ev'], reverse=True)
    best = opportunities[0]
    # max() returns the first maximum, so ties resolve in EV order just like a
    # stable descending sort would.
    most_likely = max(opportunities, key=lambda x: x['prob'])

    final_pick = best
    reason = "Melhor valor matem√°tico dispon√≠vel (EV Positivo)."

    # Prefer a dominant favourite when the best-value option has little edge.
    if most_likely['prob'] > 0.65 and best['ev'] < 0.05:
        final_pick = most_likely
        reason = f"Probabilidade Dominante ({final_pick['prob']:.1%}). Aposta 'Banker'."

    print(f"üèÜ ESCOLHA RACIONAL (Dinheiro): üëâ {final_pick['name']} (Odd: {final_pick['odd']})")
    print(f"   üìù Motivo: {reason}")
    print(f"   üìâ Confian√ßa IA: {final_pick['prob']:.1%}")
    print("")

    if most_likely['name'] != final_pick['name']:
        print(f"üé≤ RESULTADO MAIS PROV√ÅVEL:   üëâ {most_likely['name']} ({most_likely['prob']:.1%})")
        print("   ‚ö†Ô∏è Nota: Este √© o desfecho que a IA acha que vai acontecer, mas a Odd paga pouco.")
    else:
        print("üé≤ RESULTADO MAIS PROV√ÅVEL:   (Igual √† Escolha Racional)")

In [None]:
# Example: a Champions League fixture (division='CL' skips the league check).
predict_match_advanced(
    date_str='2025-12-09',
    home_team='Inter',
    away_team='Liverpool',
    odd_h=2.02, odd_d=3.60, odd_a=3.25,
    division='CL',
    odd_1x=1.30, odd_12=1.25, odd_x2=1.65,
)

In [None]:
# Example: a Serie A fixture (division code 'I1').
predict_match_advanced(
    date_str='2025-12-08',
    home_team='Torino',
    away_team='Milan',
    odd_h=5.25, odd_d=3.70, odd_a=1.61,
    division='I1',
    odd_1x=2.12, odd_12=1.24, odd_x2=1.15,
)

In [None]:
# Example: a Premier League fixture (division code 'E0').
predict_match_advanced(
    date_str='2025-12-08',
    home_team='Wolves',
    away_team='Man United',
    odd_h=5.00, odd_d=4.20, odd_a=1.56,
    division='E0',
    odd_1x=2.25, odd_12=1.19, odd_x2=1.15,
)