# Premier League V4.5: Re-Optimizing for Draws

A accuracy baixou porque mudámos as regras do jogo (pesos) mas mantivemos a estratégia antiga.
Nesta etapa, vamos correr o **Grid Search** novamente, mas desta vez informando o Grid Search de que os empates são importantes.

Imports e Configuração

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import joblib # Para salvar o modelo
import re
import os
import codecs
import requests
from bs4 import BeautifulSoup
import json
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("whitegrid")

## 1. Data Acquisition (Recolha de Dados)
Vamos buscar dados reais do `football-data.co.uk`. Vamos carregar várias temporadas consecutivas para que o modelo tenha histórico suficiente para aprender padrões.

* **FTHG**: Full Time Home Goals
* **FTAG**: Full Time Away Goals
* **FTR**: Full Time Result (H=Home, D=Draw, A=Away)

In [None]:
# --- CONFIGURATION ---
# Local CSV cache of the Football-Data results (read and written by get_main_data).
DATA_FILE = 'premier_league_v3_full.csv'
# Local CSV cache of the scraped Understat xG data.
XG_FILE = 'premier_league_xg_data.csv'
# Season start years to load. get_main_data iterates range(start, end + 1),
# while the xG scrape below iterates range(START_YEAR, END_YEAR).
START_YEAR = 2005
END_YEAR = 2025

# --- FUNCTION 1: Understat xG scraper ---
def scrape_understat_season(year, timeout=30):
    """Scrape per-match xG for one EPL season page on Understat.

    Args:
        year: Season start year; used to build the Understat league URL.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        DataFrame with columns Date (first 10 characters of the match
        datetime), HomeTeam, AwayTeam, Home_xG and Away_xG, one row per
        match flagged ``isResult``. An empty, column-less DataFrame is
        returned on a non-200 response, when the embedded JSON cannot be
        found, or on any error (which is printed rather than raised).
    """
    print(f"üï∑Ô∏è A recolher xG de {year}/{year+1}...")
    url = f"https://understat.com/league/EPL/{year}"
    try:
        # Without a timeout a stalled connection would hang the notebook forever.
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            return pd.DataFrame()

        # The fixtures are embedded in the page as an escaped JSON string
        # literal; a regex avoids depending on the surrounding HTML layout.
        match = re.search(r"datesData\s*=\s*JSON\.parse\('(.*?)'\)", response.text)
        if not match:
            print(f"‚ö†Ô∏è Sem dados para {year}")
            return pd.DataFrame()

        json_data = codecs.decode(match.group(1), 'unicode_escape')
        data = json.loads(json_data)

        # Keep only fixtures that already have a result.
        matches = [
            {
                'Date': m['datetime'][:10],
                'HomeTeam': m['h']['title'],
                'AwayTeam': m['a']['title'],
                'Home_xG': float(m['xG']['h']),
                'Away_xG': float(m['xG']['a']),
            }
            for m in data
            if m['isResult']
        ]
        return pd.DataFrame(matches)
    except Exception as e:
        # Deliberate best-effort: one failed season must not abort the whole scrape.
        print(f"‚ö†Ô∏è Erro no ano {year}: {e}")
        return pd.DataFrame()

# --- FUNCTION 2: Load the main results data (Football-Data) ---
def get_main_data(start, end):
    """Return the match results, sorted chronologically.

    If DATA_FILE exists it is used as a cache; otherwise one CSV per season
    is downloaded from football-data.co.uk, concatenated, cleaned of rows
    without a Date or FTR, written to DATA_FILE and returned.

    Args:
        start: First season start year (inclusive).
        end: Last season start year (inclusive).

    Returns:
        DataFrame ordered by 'Date' with a fresh 0..n-1 index.

    Raises:
        ValueError: If no season could be downloaded.
    """
    if os.path.exists(DATA_FILE):
        print(f"üìÇ Carregando dados locais: {DATA_FILE}")
        df = pd.read_csv(DATA_FILE)
        df['Date'] = pd.to_datetime(df['Date'])
        # Same ordering guarantee as the download path: downstream ELO and
        # rolling features depend on chronological row order.
        return df.sort_values('Date', kind='stable').reset_index(drop=True)

    print("üåê A descarregar dados do Football-Data...")
    dfs = []
    base_url = "https://www.football-data.co.uk/mmz4281/{}/{}.csv"
    for year in range(start, end + 1):
        # Season code is the two-digit start and end years, e.g. 2005 -> "0506".
        season = f"{str(year)[-2:]}{str(year+1)[-2:]}"
        try:
            df = pd.read_csv(base_url.format(season, "E0"))
            df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')
        except Exception as e:
            # Still best-effort, but report which season was skipped and why
            # (and no longer swallow KeyboardInterrupt like a bare except).
            print(f"Falha ao descarregar a temporada {season}: {e}")
            continue
        dfs.append(df)

    if not dfs:
        raise ValueError(
            f"Nenhuma temporada descarregada entre {start} e {end}"
        )

    full_df = pd.concat(dfs, ignore_index=True).dropna(subset=['Date', 'FTR'])
    # Sort before caching so the cached and fresh paths return the same thing.
    full_df = full_df.sort_values('Date', kind='stable').reset_index(drop=True)
    full_df.to_csv(DATA_FILE, index=False)
    return full_df

# --- FUNCTION 3: Team-name normalisation (crucial for the merge) ---
def clean_team_name(name):
    """Return the short form of a team name, or *name* itself if unmapped.

    Presumably maps long (Understat-style) names onto the short names used
    by the main results dataset -- verify against both sources.
    """
    # Only names that actually change are listed. Names already in short
    # form ('West Ham', 'Tottenham', 'Brighton', 'Sheffield United', ...)
    # fall through the lookup and are returned unchanged.
    aliases = dict([
        ('Manchester United', 'Man United'),
        ('Manchester City', 'Man City'),
        ('Newcastle United', 'Newcastle'),
        ('West Ham United', 'West Ham'),
        ('Wolverhampton Wanderers', 'Wolves'),
        ('Leicester City', 'Leicester'),
        ('Leeds United', 'Leeds'),
        ('Tottenham Hotspur', 'Tottenham'),
        ('Nottingham Forest', "Nott'm Forest"),
        ('Ipswich Town', 'Ipswich'),
        ('Hull City', 'Hull'),
        ('Stoke City', 'Stoke'),
        ('Swansea City', 'Swansea'),
        ('Cardiff City', 'Cardiff'),
        ('Huddersfield Town', 'Huddersfield'),
        ('West Bromwich Albion', 'West Brom'),
        ('Norwich City', 'Norwich'),
        ('Queens Park Rangers', 'QPR'),
    ])
    try:
        return aliases[name]
    except KeyError:
        return name

In [None]:
# 1. Load the main results data
df = get_main_data(START_YEAR, END_YEAR)

# 2. Load the xG data from the local cache, or scrape it
if os.path.exists(XG_FILE):
    print("üìÇ Carregando xG local...")
    df_xg = pd.read_csv(XG_FILE)
else:
    print("üåê A iniciar scraping xG...")
    dfs_xg = [scrape_understat_season(y) for y in range(START_YEAR, END_YEAR)]
    # Failed or unavailable seasons come back as empty, column-less frames.
    xg_frames = [season for season in dfs_xg if not season.empty]
    if xg_frames:
        df_xg = pd.concat(xg_frames, ignore_index=True)
        # Normalise team names right after the scrape
        df_xg['HomeTeam'] = df_xg['HomeTeam'].apply(clean_team_name)
        df_xg['AwayTeam'] = df_xg['AwayTeam'].apply(clean_team_name)
        df_xg.to_csv(XG_FILE, index=False)
    else:
        # Nothing could be scraped. Continue with a typed, empty xG table so
        # the merge below still works (every match gets the 1.0 fallback),
        # and do NOT cache it, so a later run retries the scrape.
        df_xg = pd.DataFrame({
            'Date': pd.Series(dtype='datetime64[ns]'),
            'HomeTeam': pd.Series(dtype='object'),
            'AwayTeam': pd.Series(dtype='object'),
            'Home_xG': pd.Series(dtype='float64'),
            'Away_xG': pd.Series(dtype='float64'),
        })

# 3. MERGE PREPARATION
# Normalise dates (strip the time component) so both sides join on the day
df['Date'] = pd.to_datetime(df['Date']).dt.normalize()
df_xg['Date'] = pd.to_datetime(df_xg['Date']).dt.normalize()

# Apply the same team-name normalisation to the main frame
df['HomeTeam'] = df['HomeTeam'].apply(clean_team_name)
df['AwayTeam'] = df['AwayTeam'].apply(clean_team_name)

# Drop any xG columns already present in the main frame (e.g. from a cached
# file) so the merge does not produce duplicated/suffixed columns
cols_exclude = [c for c in df.columns if 'xG' in c]
df_clean = df.drop(columns=cols_exclude)

# 4. DIAGNOSTIC: home teams in the main dataset that never appear in the xG data
set_main = set(df_clean['HomeTeam'].unique())
set_xg = set(df_xg['HomeTeam'].unique())
missing_in_xg = set_main - set_xg
# NOTE(review): the original comment mentioned filtering to recent (post-2014)
# teams, but no filter is applied, so this count covers every season loaded.
print(f"‚ö†Ô∏è Equipas no dataset principal sem correspond√™ncia no xG: {len(missing_in_xg)}")
# To list them, uncomment the line below:
# print(missing_in_xg)

# 5. FINAL MERGE (left join: every match is kept, xG is NaN when unmatched)
print("üîÑ A realizar o Merge...")
df_final = df_clean.merge(
    df_xg[['Date', 'HomeTeam', 'AwayTeam', 'Home_xG', 'Away_xG']],
    on=['Date', 'HomeTeam', 'AwayTeam'],
    how='left'
)

# Merge success statistics
missing_count = df_final['Home_xG'].isna().sum()
total = len(df_final)
print(f"‚úÖ Merge conclu√≠do! Jogos com xG: {total - missing_count} / {total}")
print(f"üìâ Jogos sem xG (Preenchidos com 1.0): {missing_count}")

# Fill matches without xG with a neutral 1.0 for each side
df = df_final.fillna({'Home_xG': 1.0, 'Away_xG': 1.0})

display(df.tail(3))

## 2. Feature Engineering Completa (ELO + Stats + Odds)

Aqui adicionamos as colunas B365H, B365D, B365A (Odds da Bet365).

In [None]:
# Feature engineering: pre-match ELO, rolling team form (incl. xG) and odds.
def prepare_features(df, window=5):
    """Build leak-free pre-match features for every match in *df*.

    Args:
        df: Match table with at least Date, HomeTeam, AwayTeam, FTR, FTHG,
            FTAG, HS, HST, HC, AS, AST, AC, Home_xG and Away_xG. Bet365 odds
            (B365H/B365D/B365A) are used when present.
        window: Number of previous matches per team in the rolling averages.

    Returns:
        (features_df, elo_dict) where features_df is a chronologically
        sorted copy of *df* with HomeElo/AwayElo/EloDiff, Home_<metric> and
        Away_<metric> rolling averages (the raw per-match Home_xG/Away_xG
        are replaced by their rolling averages) and, if odds exist,
        Prob_Home/Prob_Draw/Prob_Away; elo_dict maps each team to its ELO
        after the last match processed.
    """
    # ELO updates and rolling windows are only meaningful (and leak-free)
    # when matches are processed oldest first, so enforce that here instead
    # of trusting the caller. sort_values returns a new frame, so the
    # caller's DataFrame is never mutated.
    df = df.sort_values('Date', kind='stable').reset_index(drop=True)

    # --- 1. ELO SYSTEM ---
    elo_dict = {}
    k_factor = 20
    home_elos, away_elos = [], []

    for h, a, res in zip(df['HomeTeam'], df['AwayTeam'], df['FTR']):
        h_elo = elo_dict.get(h, 1500.0)
        a_elo = elo_dict.get(a, 1500.0)

        # Record the ratings *before* the match: these are the features.
        home_elos.append(h_elo)
        away_elos.append(a_elo)

        # Match score from the home side's perspective.
        if res == 'H':
            val = 1
        elif res == 'D':
            val = 0.5
        else:
            val = 0

        exp_h = 1 / (1 + 10**((a_elo - h_elo)/400))
        elo_dict[h] = h_elo + k_factor * (val - exp_h)
        elo_dict[a] = a_elo + k_factor * ((1-val) - (1-exp_h))

    df['HomeElo'] = pd.Series(home_elos, index=df.index, dtype='float64')
    df['AwayElo'] = pd.Series(away_elos, index=df.index, dtype='float64')
    df['EloDiff'] = df['HomeElo'] - df['AwayElo']

    # --- 2. ROLLING STATS (including xG) ---
    # Reshape to one row per (team, match) so home and away games feed the
    # same per-team history. Raw xG is used only to build that history.
    stat_names = ['Date', 'Team', 'Goals', 'Conceded', 'Shots', 'SoT', 'Corners', 'xG']

    home_stats = df[['Date', 'HomeTeam', 'FTHG', 'FTAG', 'HS', 'HST', 'HC', 'Home_xG']].copy()
    home_stats.columns = stat_names
    home_stats['Points'] = df['FTR'].map({'H': 3, 'D': 1, 'A': 0})

    away_stats = df[['Date', 'AwayTeam', 'FTAG', 'FTHG', 'AS', 'AST', 'AC', 'Away_xG']].copy()
    away_stats.columns = stat_names
    away_stats['Points'] = df['FTR'].map({'A': 3, 'D': 1, 'H': 0})

    all_stats = pd.concat([home_stats, away_stats]).sort_values(['Team', 'Date'])

    metrics = ['Points', 'Goals', 'Conceded', 'Shots', 'SoT', 'Corners', 'xG']
    for m in metrics:
        # shift(1) excludes the current match, so each average only sees
        # matches played strictly before it.
        all_stats[f'Avg_{m}'] = all_stats.groupby('Team')[m].transform(
            lambda x: x.shift(1).rolling(window, min_periods=3).mean()
        )

    # Drop the raw per-match xG before merging: it would clash with the
    # Home_xG/Away_xG rolling-average names and would leak the match outcome.
    cols_to_drop = ['Home_xG', 'Away_xG']
    df = df.drop(columns=[c for c in cols_to_drop if c in df.columns])

    # Attach each side's rolling averages to the match row.
    avg_cols = [f'Avg_{m}' for m in metrics]
    for side, team_col in (('Home', 'HomeTeam'), ('Away', 'AwayTeam')):
        df = df.merge(all_stats[['Date', 'Team'] + avg_cols],
                      left_on=['Date', team_col], right_on=['Date', 'Team'],
                      how='left').drop(columns=['Team'])
        df = df.rename(columns={f'Avg_{m}': f'{side}_{m}' for m in metrics})

    # --- 3. ODDS ---
    if 'B365H' in df.columns:
        # Raw inverse odds (bookmaker margin is not removed).
        df['Prob_Home'] = 1 / df['B365H']
        df['Prob_Draw'] = 1 / df['B365D']
        df['Prob_Away'] = 1 / df['B365A']
        df = df.dropna(subset=['Prob_Home'])

    # Final clean-up: teams with fewer than 3 previous matches have no
    # rolling average yet; those gaps become 0.
    rolling_cols = [f for f in df.columns if f.startswith("Home_") or f.startswith("Away_")]
    df[rolling_cols] = df[rolling_cols].fillna(0)

    return df.dropna(axis=1, how='all'), elo_dict

# Run the feature pipeline on the merged frame. elo_tracker holds each team's
# ELO after the last processed match and is reused by predict_smart.
df_processed, elo_tracker = prepare_features(df)
print("‚úÖ Features calculadas! Colunas de xG inclu√≠das e sem erros.")
print("Exemplo de colunas:", [c for c in df_processed.columns if 'xG' in c])

## 3. Preparação e Treino do Modelo
Treino Intensivo: Grid Search (Hyperparameter Tuning). Aqui é onde "apertamos" o modelo. Vamos testar várias combinações. Nota: isto pode demorar 2 ou 3 minutos a correr.

In [None]:
# Training cell (v6): load a cached model, or tune and train a new one
MODEL_FILE = 'model_xgboost_v6_xg.pkl'
ENCODER_FILE = 'label_encoder_v6.pkl'

# 1. Features and target
cols_to_use = ['HomeElo', 'AwayElo', 'EloDiff', 'Prob_Home', 'Prob_Draw', 'Prob_Away'] + \
              [c for c in df_processed.columns if 'Home_' in c or 'Away_' in c]
# Keep only the columns that actually exist (e.g. Prob_* are absent without odds)
features = [f for f in cols_to_use if f in df_processed.columns]

le = LabelEncoder()
df_processed['Target'] = le.fit_transform(df_processed['FTR'])

# Positional split: first 90% of rows for training, last 10% for testing
split = int(len(df_processed) * 0.90)
train = df_processed.iloc[:split]
test = df_processed.iloc[split:]
X_train, y_train = train[features], train['Target']
X_test, y_test = test[features], test['Target']

# 2. Reuse a previously saved model when both artefacts exist
if os.path.exists(MODEL_FILE) and os.path.exists(ENCODER_FILE):
    print(f"üìÇ Modelo '{MODEL_FILE}' encontrado! A carregar...")
    # NOTE(review): joblib.load unpickles arbitrary code; only load files
    # this notebook wrote itself.
    model_final = joblib.load(MODEL_FILE)
    le = joblib.load(ENCODER_FILE)
    print("‚úÖ Modelo carregado com sucesso!")

else:
    print("‚ö†Ô∏è Modelo novo. A iniciar Grid Search (Isto demora uns minutos)...")

    # Draw up-weighting ("soft boosting"). Built once and passed to BOTH the
    # grid search and the final fit, so the hyperparameters are tuned under
    # the same weighting the final model is trained with.
    weights = np.ones(len(y_train))
    draw_idx = le.transform(['D'])[0]
    weights[(y_train == draw_idx).to_numpy()] = 1.3

    # Grid Search
    xgb_model = xgb.XGBClassifier(random_state=42, objective='multi:softprob', eval_metric='mlogloss')
    param_grid = {
        'n_estimators': [150, 200],
        'max_depth': [3, 4],
        'learning_rate': [0.03, 0.05],
        'gamma': [0, 0.1],
        'min_child_weight': [1, 3]
    }

    # Time-ordered folds: each fold validates on rows after its training rows
    tscv = TimeSeriesSplit(n_splits=3)
    grid = GridSearchCV(xgb_model, param_grid, cv=tscv, scoring='accuracy', verbose=1)
    # sample_weight has one entry per training row, so GridSearchCV slices it
    # per fold together with X and y.
    grid.fit(X_train, y_train, sample_weight=weights)

    best_params = grid.best_params_
    print(f"‚úÖ Melhores par√¢metros: {best_params}")

    # Final fit on the full training set with the same draw weights
    print("‚öñÔ∏è A treinar modelo final com pesos...")
    model_final = xgb.XGBClassifier(**best_params, random_state=42, objective='multi:softprob')
    model_final.fit(X_train, y_train, sample_weight=weights)

    joblib.dump(model_final, MODEL_FILE)
    joblib.dump(le, ENCODER_FILE)
    print("üíæ Modelo salvo no disco!")

### Matriz de Confusão e accuracy
Vamos ver visualmente onde o modelo erra.
* Eixo Y: O que realmente aconteceu.
* Eixo X: O que o modelo previu.

In [None]:
# Evaluation on the held-out test rows
preds = model_final.predict(X_test)
acc = accuracy_score(y_test, preds)
print(f"üéØ Accuracy Final (com xG): {acc:.2%}")

# Confusion matrix, as announced in the section heading:
# rows (Y axis) = what actually happened, columns (X axis) = what was predicted.
class_ids = np.arange(len(le.classes_))
cm = confusion_matrix(y_test, preds, labels=class_ids)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=list(le.classes_), yticklabels=list(le.classes_))
plt.xlabel('Previsto')
plt.ylabel('Real')
plt.show()

# Feature importances as reported by the fitted model
importances = pd.Series(model_final.feature_importances_, index=features).sort_values(ascending=False)
print("\nTop 5 Fatores mais importantes:")
print(importances.head(5))

## 4. Aplicação na "Vida Real"
Aqui está a função final. Ela usa o dicionário `elo_tracker` (que contém os valores mais recentes após o último jogo do dataset) para fazer previsões sobre jogos futuros.

In [None]:
def predict_smart(home, away, odd_h, odd_d, odd_a):
    """Print outcome probabilities and a verdict for one upcoming fixture.

    Relies on the notebook globals elo_tracker, df_processed and features;
    the model and label encoder are taken from memory or loaded from disk.

    Args:
        home: Home team name (normalised with clean_team_name before lookup).
        away: Away team name (normalised with clean_team_name before lookup).
        odd_h: Odds for a home win; 1/odd is fed to the model as Prob_Home.
        odd_d: Odds for a draw.
        odd_a: Odds for an away win.

    Raises:
        ValueError: If any of the odds is not a positive number.
    """
    global model_final, le

    # Reject odds that would divide by zero or give a negative "probability".
    for label, odd in (('odd_h', odd_h), ('odd_d', odd_d), ('odd_a', odd_a)):
        if not odd > 0:
            raise ValueError(f"{label} tem de ser uma odd positiva, recebido {odd!r}")

    # --- 1. MODEL LOADING ---
    # Use the in-memory model if there is one; otherwise load the v6_xg file
    # and keep it in the global so later calls do not hit the disk again.
    try:
        model = model_final
    except NameError:
        print("‚ö†Ô∏è Modelo n√£o est√° na mem√≥ria. A carregar do disco...")
        model = model_final = joblib.load('model_xgboost_v6_xg.pkl')

    # Same for the label encoder.
    try:
        encoder = le
    except NameError:
        encoder = le = joblib.load('label_encoder_v6.pkl')

    # --- 2. PREPARE INPUT ---
    # (Assumes elo_tracker and df_processed are in memory.)
    # Look teams up under the same normalised names used in the dataset.
    home_key = clean_team_name(home)
    away_key = clean_team_name(away)

    h_elo = elo_tracker.get(home_key, 1500)
    a_elo = elo_tracker.get(away_key, 1500)

    input_data = {
        'HomeElo': h_elo, 'AwayElo': a_elo, 'EloDiff': h_elo - a_elo,
        'Prob_Home': 1/odd_h, 'Prob_Draw': 1/odd_d, 'Prob_Away': 1/odd_a
    }

    # Rolling form (xG, shots, ...) from each team's most recent match,
    # whichever side it played on. Teams with no history get zeros.
    home_form = _latest_team_form(home_key, 'Home_')
    away_form = _latest_team_form(away_key, 'Away_')

    for feat in features:
        if feat in input_data:
            continue
        if 'Home_' in feat:
            input_data[feat] = home_form.get(feat, 0)
        elif 'Away_' in feat:
            input_data[feat] = away_form.get(feat, 0)

    X_input = pd.DataFrame([input_data])[features]

    # --- 3. PREDICTION ---
    probs = model.predict_proba(X_input)[0]

    # Map probabilities to labels through the encoder's own class order
    # instead of assuming a fixed position for 'A', 'D' and 'H'.
    class_order = encoder.classes_
    prob_dict = {class_label: prob for class_label, prob in zip(class_order, probs)}

    p_home = prob_dict.get('H', 0)
    p_draw = prob_dict.get('D', 0)
    p_away = prob_dict.get('A', 0)

    print(f"\nüß† An√°lise: {home} vs {away}")
    print(f"   Probabilidades: Casa {p_home:.1%} | Empate {p_draw:.1%} | Fora {p_away:.1%}")
    print(f"   (xG M√©dio Recente: {home} {input_data.get('Home_xG',0):.2f} vs {away} {input_data.get('Away_xG',0):.2f})")

    # --- 4. VERDICT ---
    # A clear favourite (> 45%) wins outright; otherwise a draw is called
    # from 28% upwards; failing both, report the most likely outcome.
    prediction = "Inconclusivo"

    if p_home > 0.45:
        prediction = f"Vit√≥ria {home}"
    elif p_away > 0.45:
        prediction = f"Vit√≥ria {away}"
    elif p_draw > 0.28:
        prediction = "EMPATE (Risco calculado)"
    else:
        max_prob = max(p_home, p_draw, p_away)
        if max_prob == p_home: prediction = f"Tend√™ncia {home}"
        elif max_prob == p_away: prediction = f"Tend√™ncia {away}"
        else: prediction = "Tend√™ncia Empate"

    print(f"   >> Veredicto IA: {prediction}")


def _latest_team_form(team, prefix):
    """Return the team's latest rolling stats, keyed as '<prefix><metric>'.

    Uses the team's most recent row in df_processed regardless of whether it
    played home or away there, and relabels that side's columns with
    *prefix* ('Home_' or 'Away_'). Returns {} when the team has no matches.
    Note the stored averages describe form *before* that latest match.
    """
    played = df_processed[(df_processed['HomeTeam'] == team) | (df_processed['AwayTeam'] == team)]
    if played.empty:
        return {}
    last = played.sort_values('Date', kind='stable').iloc[-1]
    side = 'Home_' if last['HomeTeam'] == team else 'Away_'
    return {
        prefix + str(col)[len(side):]: last[col]
        for col in last.index
        if str(col).startswith(side)
    }

# Example call: fixture plus its home / draw / away odds
predict_smart('Aston Villa', 'Arsenal', 4.05, 3.45, 1.84)