In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import joblib
import re
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Silence every warning category for the rest of the run.
warnings.filterwarnings('ignore')

# Title banner: rule, heading, rule - emitted as three lines by one print call.
print(
    "==================================================",
    "   FOOTBALL PREDICTOR (DRAW OVERSAMPLING)         ",
    "==================================================",
    sep="\n",
)

# ==========================================
# 1. LOAD & CLEAN
# ==========================================
# Load the raw match table. Everything below depends on it, so a missing
# file stops the run immediately.
try:
    df = pd.read_csv("match_data.csv")
except FileNotFoundError:
    print("Error: match_data.csv not found.")
    # exit() is a convenience helper injected by the `site` module for
    # interactive use and would report success (status 0). Raising
    # SystemExit works everywhere and signals failure to the caller.
    raise SystemExit(1)

def extract_date(url):
    """Return the match date embedded in a match URL, or ``pd.NaT``.

    The URL is searched for the first ``<letters>-<1-2 digits>-<4 digits>``
    token (for example ``August-12-2023``), which is then parsed as
    full month name, day and year.

    Args:
        url: The match URL. Any value is accepted; it is converted with
            ``str`` before searching.

    Returns:
        A ``pandas.Timestamp`` for the parsed date, or ``pd.NaT`` when the
        URL holds no such token or the token is not a valid date.
    """
    match = re.search(r'([A-Za-z]+-\d{1,2}-\d{4})', str(url))
    if match is None:
        return pd.NaT
    try:
        # errors='coerce' already turns an unparseable token into NaT.
        return pd.to_datetime(match.group(1), format='%B-%d-%Y', errors='coerce')
    except (ValueError, TypeError):
        # Stay best-effort for parsing problems only; anything else
        # (KeyboardInterrupt, genuine bugs) is allowed to propagate.
        return pd.NaT

# Parse each match's date out of its URL, then order matches chronologically
# (rows whose URL yields no date get NaT, which sort_values places last).
df['date'] = df['match_url'].apply(extract_date)
df = df.sort_values(by='date').reset_index(drop=True)

# Numeric Cleaning
stats_cols = ["xg", "possession", "shots_onTarget", "corners", "fouls", "team_points", "team_score"]
for side in ['home', 'away']:
    # Possession: strip a trailing '%', default unparseable values to 50,
    # and store the result as a 0-1 fraction.
    p_col = f"{side}_team_possession"
    if p_col in df.columns:
        df[p_col] = pd.to_numeric(df[p_col].astype(str).str.rstrip('%'), errors='coerce').fillna(50) / 100.0

    for s in stats_cols:
        # Only xg and possession need the "team_" infix added
        # ("<side>_team_xg"). Entries already spelled "team_..." map to
        # "<side>_team_points" / "<side>_team_score"; adding the infix again
        # would produce "<side>_team_team_score", which never exists, and
        # the column would be skipped silently.
        col = f"{side}_team_{s}" if s in ('xg', 'possession') else f"{side}_{s}"
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
            # Impute gaps with the team's own median first, then with the
            # column mean for teams that have no valid value at all.
            # NOTE(review): this also imputes missing *scores*, i.e. it
            # invents a result for such rows - confirm that is wanted
            # rather than dropping them.
            df[col] = df[col].fillna(df.groupby(f'{side}_team_name')[col].transform('median'))
            df[col] = df[col].fillna(df[col].mean())

# ==========================================
# 2. FEATURE ENGINEERING
# ==========================================

# --- STANDARD ELO ---
def calculate_elo(df, k_factor=20, home_advantage=60, initial_rating=1500):
    """Compute pre-match Elo ratings for every match, in row order.

    Rows are processed top to bottom, so ``df`` must already be sorted
    chronologically for the ratings to be meaningful.

    Args:
        df: DataFrame with columns ``home_team_name``, ``away_team_name``,
            ``home_team_score`` and ``away_team_score``.
        k_factor: Maximum rating change per match.
        home_advantage: Rating points added to the home side when computing
            the expected result. Callers that rebuild the rating gap
            themselves must add the same value.
        initial_rating: Rating assigned to every team before its first match.

    Returns:
        Tuple ``(elo_h, elo_a, team_elos)``: two lists holding the home and
        away rating *before* each match (one entry per row), and a dict
        mapping each team to its rating after the last row.
    """
    teams = set(df['home_team_name']).union(df['away_team_name'])
    team_elos = {team: initial_rating for team in teams}
    elo_h, elo_a = [], []

    # Zip only the four needed columns: much cheaper than iterrows(), which
    # builds a Series for every row.
    fixtures = zip(
        df['home_team_name'], df['away_team_name'],
        df['home_team_score'], df['away_team_score'],
    )
    for h, a, h_score, a_score in fixtures:
        rh, ra = team_elos[h], team_elos[a]
        # Record ratings before the update so the feature never sees the
        # result of the match it describes.
        elo_h.append(rh)
        elo_a.append(ra)

        # Actual result from the home side's view: win 1, draw 0.5, loss 0.
        # A NaN score fails both comparisons and is therefore scored as 0.
        if h_score > a_score:
            res = 1
        elif h_score == a_score:
            res = 0.5
        else:
            res = 0

        # Expected home result from the logistic Elo curve (400-point scale).
        dr = (rh + home_advantage) - ra
        e_h = 1 / (1 + 10 ** (-dr / 400))

        # Zero-sum update: what the home side gains, the away side loses.
        change = k_factor * (res - e_h)
        team_elos[h] = rh + change
        team_elos[a] = ra - change

    return elo_h, elo_a, team_elos

# Pre-match ratings for both sides, plus each team's rating after the last row.
home_ratings, away_ratings, current_elos = calculate_elo(df)
df['home_elo'] = home_ratings
df['away_elo'] = away_ratings
# Rating gap from the home side's view, including the 60-point home bonus.
df['diff_elo'] = df['home_elo'] + 60 - df['away_elo']

# --- Rest Days ---
# One row per (team, match date), whichever side the team played on.
home_fixtures = df[['date', 'home_team_name']].rename(columns={'home_team_name': 'team'})
away_fixtures = df[['date', 'away_team_name']].rename(columns={'away_team_name': 'team'})
long_df = pd.concat([home_fixtures, away_fixtures]).sort_values(['team', 'date'])

# Days since the same team's previous match; 7 when there is no previous
# match, and capped at 14.
previous_date = long_df.groupby('team')['date'].shift(1)
days_since = (long_df['date'] - previous_date).dt.days
long_df['rest'] = days_since.fillna(7).clip(upper=14)

# Lookup keyed by (date, team) so each match can fetch both sides' rest.
rest_map = {
    (match_date, team): rest
    for match_date, team, rest in zip(long_df['date'], long_df['team'], long_df['rest'])
}

home_rest = df.apply(lambda row: rest_map.get((row['date'], row['home_team_name']), 7), axis=1)
away_rest = df.apply(lambda row: rest_map.get((row['date'], row['away_team_name']), 7), axis=1)
df['diff_rest'] = home_rest - away_rest

# --- Rolling Stats (EWMA 10) ---
# Points earned in each match: 3 for a win, 1 for a draw, 0 for a loss.
df['home_team_points'] = np.select([df['home_team_score']>df['away_team_score'], df['home_team_score']==df['away_team_score']], [3, 1], 0)
df['away_team_points'] = np.select([df['away_team_score']>df['home_team_score'], df['away_team_score']==df['home_team_score']], [3, 1], 0)

roll_feats = ['team_xg', 'team_possession','passing_onTarget','shots_onTarget', 'corners', 'team_points', 'fouls']

# One frame per side, keyed by (match_url, team), so a team's home and away
# matches can be stacked into a single per-team history.
h_d = df[['date', 'match_url', 'home_team_name']].rename(columns={'home_team_name':'team'})
a_d = df[['date', 'match_url', 'away_team_name']].rename(columns={'away_team_name':'team'})

missing_cols = []
for f in roll_feats:
    for prefix, side_df in (('home', h_d), ('away', a_d)):
        src = f"{prefix}_{f}"
        if src in df.columns:
            # Coerce here as well: not every rolling feature goes through
            # the numeric-cleaning pass (e.g. passing_onTarget).
            side_df[f] = pd.to_numeric(df[src], errors='coerce')
        else:
            # Keep the column as NaN so the roll_* selection below always
            # finds it instead of raising KeyError.
            side_df[f] = np.nan
            missing_cols.append(src)
if missing_cols:
    print(f"Warning: columns missing from data, their rolling features will be 0: {missing_cols}")

stacked = pd.concat([h_d, a_d]).sort_values(['team', 'date'])
for f in roll_feats:
    # shift(1) drops the current match from its own average, so each value
    # is built from the team's earlier matches only.
    stacked[f'roll_{f}'] = stacked.groupby('team')[f].transform(lambda x: x.shift(1).ewm(span=10, min_periods=1).mean())

# Attach each team's rolling values back to its match row, once per side.
roll_cols = [f'roll_{f}' for f in roll_feats]
team_rolls = stacked[['match_url', 'team'] + roll_cols]
for prefix in ('home', 'away'):
    df = (
        df.merge(team_rolls, left_on=['match_url', f'{prefix}_team_name'], right_on=['match_url', 'team'], how='left')
        .drop(columns=['team'])
        .rename(columns={c: f'{prefix}_{c}' for c in roll_cols})
    )

for f in roll_feats:
    df[f'diff_{f}'] = df[f'home_roll_{f}'] - df[f'away_roll_{f}']

# Fill remaining gaps (e.g. a team's first match has no history) with 0.
# 'date' is excluded: a blanket fillna(0) would overwrite NaT with the
# integer 0 and leave the column with mixed types.
df = df.fillna({c: 0 for c in df.columns if c != 'date'})

# ==========================================
# 3. PREPARE DATA
# ==========================================
# Model inputs: Elo levels and gap, rest gap, and rolling-form levels and gaps.
features = [
    'diff_elo', 'home_elo', 'away_elo',
    'diff_rest', 'diff_team_points', 'diff_team_xg',
    'diff_shots_onTarget', 'diff_corners',
    'home_roll_team_xg', 'away_roll_team_xg',
    'home_roll_team_possession', 'away_roll_team_possession',
    'home_roll_shots_onTarget', 'away_roll_shots_onTarget',
    'home_roll_corners', 'away_roll_corners',
    'home_roll_fouls', 'away_roll_fouls',
    'home_roll_passing_onTarget', 'away_roll_passing_onTarget',
]

# Target from the home side's view: 2 = home win, 1 = draw, 0 = away win.
home_goals = df['home_team_score']
away_goals = df['away_team_score']
conditions = [
    home_goals > away_goals,
    home_goals == away_goals,
    home_goals < away_goals,
]
y = np.select(conditions, [2, 1, 0])
X = df[features]

# Positional hold-out: the first 85% of rows train, the last 15% test.
split = int(len(df) * 0.85)
X_train, y_train = X.iloc[:split], y[:split]
X_test, y_test = X.iloc[split:], y[split:]

print(f"Original Train Size: {len(X_train)} matches")

# --- THE FIX: OVERSAMPLING DRAWS ---
# Positions of the draws (class 1) inside the training slice.
train_indices = np.flatnonzero(y_train == 1)
# Take one extra copy of every draw, which doubles the draws in training.
X_train_draws = X_train.iloc[train_indices]
y_train_draws = y_train[train_indices]

# Append the copies after the original training rows.
# NOTE(review): the copies are exact duplicates, so any later
# cross-validation over this set can place a row and its copy in different
# folds - confirm this is acceptable.
X_train_augmented = pd.concat([X_train, X_train_draws])
y_train_augmented = np.concatenate([y_train, y_train_draws])

print(f"Augmented Train Size: {len(X_train_augmented)} matches (Draws Doubled)")

# ==========================================
# 4. TRAINING
# ==========================================

# Base Models (Standard Params)
xgb_clf = xgb.XGBClassifier(
    n_estimators=300, learning_rate=0.01, max_depth=4,
    subsample=0.7, colsample_bytree=0.8, gamma=1,
    objective='multi:softprob', num_class=3, random_state=42
)

rf_clf = RandomForestClassifier(
    n_estimators=200, max_depth=10, min_samples_leaf=3,
    max_features=0.5, random_state=42
)

# Scale inside the pipeline so the scaler is refit on each training fold.
# multi_class is left unset on purpose: with lbfgs a multi-class target is
# already fitted as a multinomial model, and the explicit argument is
# deprecated in recent scikit-learn releases.
lr_clf = make_pipeline(StandardScaler(), LogisticRegression(C=0.1, solver='lbfgs', max_iter=1000))

# Stacking: a logistic regression combines the three base models'
# cross-validated (cv=5) predictions.
stacking_ensemble = StackingClassifier(
    estimators=[('xgb', xgb_clf), ('rf', rf_clf), ('lr', lr_clf)],
    final_estimator=LogisticRegression(),
    cv=5, n_jobs=-1
)

# Calibrate (SWITCH TO SIGMOID)
# Sigmoid allows shifts in probability better than Isotonic for oversampled data
calibrated = CalibratedClassifierCV(stacking_ensemble, method='sigmoid', cv=3)

print("Training on Augmented Data...")
calibrated.fit(X_train_augmented, y_train_augmented)

# The status string previously held a mis-encoded check-mark emoji.
print("✅ Model Trained.")

# ==========================================
# 5. EVALUATION
# ==========================================
# Score the calibrated ensemble on the hold-out rows.
preds = calibrated.predict(X_test)
acc = accuracy_score(y_test, preds)

print("==================================================")
print(f"   AUGMENTED ACCURACY: {acc:.2%}   ")
print("==================================================")

# Pin the label set so the matrix is always 3x3 and the three target names
# always line up, even if a class never occurs in the hold-out rows or in
# the predictions.
class_labels = [0, 1, 2]

# Confusion Matrix (rows = actual, columns = predicted; order Away, Draw, Home)
cm = confusion_matrix(y_test, preds, labels=class_labels)
print("\nConfusion Matrix (Middle = Draws):")
print(cm)

print("\nClassification Report:")
print(classification_report(y_test, preds, labels=class_labels, target_names=['Away', 'Draw', 'Home']))

# Save the model together with what is needed to build features for new
# fixtures: the feature order, the latest Elo per team, and the rolling
# columns of the most recent 1000 matches.
joblib.dump({
    'model': calibrated,
    'features': features,
    'elo_dict': current_elos,
    'df_recent': df[['date', 'home_team_name', 'away_team_name'] + [c for c in df.columns if 'roll_' in c]].tail(1000)
}, 'football_model_final.pkl')
# The status string previously held a mis-encoded check-mark emoji.
print("✅ Saved.")

   FOOTBALL PREDICTOR (DRAW OVERSAMPLING)         
Original Train Size: 1081 matches
Augmented Train Size: 1329 matches (Draws Doubled)
Training on Augmented Data...
✅ Model Trained.
   AUGMENTED ACCURACY: 54.45%   

Confusion Matrix (Middle = Draws):
[[42  1 26]
 [ 5  4 22]
 [27  6 58]]

Classification Report:
              precision    recall  f1-score   support

        Away       0.57      0.61      0.59        69
        Draw       0.36      0.13      0.19        31
        Home       0.55      0.64      0.59        91

    accuracy                           0.54       191
   macro avg       0.49      0.46      0.46       191
weighted avg       0.52      0.54      0.52       191

✅ Saved.
