In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import re
import warnings
import joblib
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, log_loss
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.calibration import CalibratedClassifierCV

warnings.filterwarnings('ignore')

print("==================================================")
print("   HYPERPARAMETER TUNING ENGINE (ELO + ENSEMBLE)  ")
print("==================================================")

# ==========================================
# 1. LOAD & CLEAN (Same Feature Engineering)
# ==========================================
# We must recreate the exact features (ELO, Rolling, Rest) first
df = pd.read_csv("match_data.csv")

def extract_date(url):
    try:
        match = re.search(r'([A-Za-z]+-\d{1,2}-\d{4})', str(url))
        if match: return pd.to_datetime(match.group(1), format='%B-%d-%Y', errors='coerce')
    except: pass
    return pd.NaT

df['date'] = df['match_url'].apply(extract_date)
df = df.sort_values(by='date').reset_index(drop=True)

# Basic Numeric Cleaning & Imputation
stats_cols = ["xg", "possession", "shots_onTarget", "corners", "fouls", "team_points"]
for side in ['home', 'away']:
    p_col = f"{side}_team_possession"
    if p_col in df.columns:
        df[p_col] = pd.to_numeric(df[p_col].astype(str).str.rstrip('%'), errors='coerce').fillna(50) / 100.0
    
    # Calculate points if missing (3 for win logic)
    if 'team_points' not in df.columns and f'{side}_team_score' in df.columns:
        # (Logic handled in rolling block below)
        pass

    for s in stats_cols:
        col = f"{side}_{s}" if s not in ['xg', 'possession', 'team_points'] else f"{side}_team_{s}"
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
            # Group median fill
            df[col] = df[col].fillna(df.groupby(f'{side}_team_name')[col].transform('median'))
            df[col] = df[col].fillna(df[col].mean()) # Fallback

# --- ELO CALCULATION ---
def calculate_elo(df):
    k_factor = 20          # Standard K-Factor (Stable)
    home_advantage = 60   # You found this works best for PL
    
    # Initialize all teams at 1500
    team_elos = {team: 1500 for team in set(df['home_team_name']).union(set(df['away_team_name']))}
    elo_h, elo_a = [], []
    
    for _, row in df.iterrows():
        h, a = row['home_team_name'], row['away_team_name']
        rh, ra = team_elos[h], team_elos[a]
        
        # Store ratings BEFORE the match (Predictive)
        elo_h.append(rh); elo_a.append(ra)
        
        # Result (1, 0.5, 0)
        if row['home_team_score'] > row['away_team_score']: res = 1
        elif row['home_team_score'] == row['away_team_score']: res = 0.5
        else: res = 0
        
        # Expected Result
        # No "Goal Difference" multiplier here. 
        # This keeps ratings tighter and improves Draw predictions.
        dr = (rh + home_advantage) - ra
        e_h = 1 / (1 + 10 ** (-dr / 400))
        
        # Update
        change = k_factor * (res - e_h)
        team_elos[h] = rh + change
        team_elos[a] = ra - change
        
    return elo_h, elo_a 

df['home_elo'], df['away_elo'] = calculate_elo(df)
df['diff_elo'] = (df['home_elo'] + 65) - df['away_elo']

# --- REST DAYS ---
def calc_rest(df):
    long_df = pd.concat([
        df[['date', 'home_team_name']].rename(columns={'home_team_name':'team'}),
        df[['date', 'away_team_name']].rename(columns={'away_team_name':'team'})
    ]).sort_values(['team', 'date'])
    long_df['rest'] = (long_df['date'] - long_df.groupby('team')['date'].shift(1)).dt.days.fillna(7).clip(upper=14)
    return dict(zip(zip(long_df['date'], long_df['team']), long_df['rest']))

rest_map = calc_rest(df)
df['diff_rest'] = df.apply(lambda x: rest_map.get((x['date'], x['home_team_name']),7), axis=1) - \
                  df.apply(lambda x: rest_map.get((x['date'], x['away_team_name']),7), axis=1)

# ==========================================
# 2. ROLLING AVERAGES ENGINE (UPGRADED TO EWMA)
# ==========================================
print("Generating Smart Form (EWMA)...")

# Recalculate Points for Form
df['home_team_points'] = np.select([df['home_team_score']>df['away_team_score'], df['home_team_score']==df['away_team_score']], [3, 1], 0)
df['away_team_points'] = np.select([df['away_team_score']>df['home_team_score'], df['away_team_score']==df['home_team_score']], [3, 1], 0)

# Add 'team_score' (Goals) to calculate Efficiency later
roll_feats = ['team_xg', 'team_possession', 'shots_onTarget', 'corners', 'team_points', 'fouls', 'team_score']

h_d = df[['date', 'match_url', 'home_team_name']].rename(columns={'home_team_name':'team'})
a_d = df[['date', 'match_url', 'away_team_name']].rename(columns={'away_team_name':'team'})

for f in roll_feats:
    c_h = f"home_{f}" if f in ['team_points', 'team_xg', 'team_possession', 'team_score'] else f"home_{f}"
    c_a = f"away_{f}" if f in ['team_points', 'team_xg', 'team_possession', 'team_score'] else f"away_{f}"
    if c_h in df.columns: h_d[f] = df[c_h]
    if c_a in df.columns: a_d[f] = df[c_a]

stacked = pd.concat([h_d, a_d]).sort_values(['team', 'date'])

for f in roll_feats:
    if f in stacked.columns:
        # --- THE SMART CHANGE: EWM (Exponential Weighted Mean) ---
        # span=5 means the last 5 games matter, but the most recent one counts for ~33% of the weight
        stacked[f'roll_{f}'] = stacked.groupby('team')[f].transform(lambda x: x.shift(1).ewm(span=5, min_periods=1).mean())

df = df.merge(stacked[['match_url', 'team'] + [f'roll_{f}' for f in roll_feats]], left_on=['match_url', 'home_team_name'], right_on=['match_url', 'team'], how='left').drop(columns=['team']).rename(columns={f'roll_{f}': f'home_roll_{f}' for f in roll_feats})
df = df.merge(stacked[['match_url', 'team'] + [f'roll_{f}' for f in roll_feats]], left_on=['match_url', 'away_team_name'], right_on=['match_url', 'team'], how='left').drop(columns=['team']).rename(columns={f'roll_{f}': f'away_roll_{f}' for f in roll_feats})

# Calculate Differentials for rest
for f in roll_feats:
    df[f'diff_{f}'] = df[f'home_roll_{f}'] - df[f'away_roll_{f}']

df = df.fillna(0)

# ==========================================
# 2. PREPARE FOR TUNING
# ==========================================
conditions = [
    (df['home_team_score'] > df['away_team_score']),
    (df['home_team_score'] == df['away_team_score']),
    (df['home_team_score'] < df['away_team_score'])
]
df['match_outcome'] = np.select(conditions, [2, 1, 0])

features = [
    'diff_elo', 'home_elo', 'away_elo',
    'diff_rest', 'diff_team_points', 'diff_team_xg', 
    'diff_shots_onTarget', 'diff_corners',
    'home_roll_team_xg', 'away_roll_team_xg',
    'home_roll_team_possession', 'away_roll_team_possession',
    'home_roll_shots_onTarget', 'away_roll_shots_onTarget',
    'home_roll_corners', 'away_roll_corners',
    'home_roll_fouls', 'away_roll_fouls',
]

X = df[features]
y = df['match_outcome']
split = int(len(df) * 0.8)

X_train, X_test, y_train, y_test = X[:split], X[split:], y[:split], y[split:]

# Time Series Split (Crucial for validity)
# 4 splits = Test on last 20%, then last 40%, etc.
tscv = TimeSeriesSplit(n_splits=5)

print(f"Tuning on {len(X)} matches...")

# ==========================================
# 3. TUNE XGBOOST
# ==========================================
print("\n--- Tuning XGBoost ---")
xgb_param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.03, 0.05],
    'max_depth': [4, 5],
    'subsample': [0.7, 0.8],
    'colsample_bytree': [0.7, 0.8],
    'gamma': [1] # Regularization
}

xgb_model = xgb.XGBClassifier(objective='multi:softprob', num_class=3, tree_method='hist', random_state=42)
xgb_search = GridSearchCV(xgb_model, xgb_param_grid, cv=tscv, scoring='accuracy', n_jobs=-1, verbose=1)
xgb_search.fit(X, y)

print(f"✅ Best XGB Params: {xgb_search.best_params_}")
print(f"✅ Best XGB Score: {xgb_search.best_score_:.2%}")
best_xgb = xgb_search.best_estimator_

# ==========================================
# 4. TUNE RANDOM FOREST
# ==========================================
print("\n--- Tuning Random Forest ---")
rf_param_grid = {
    'n_estimators': [200, 400],
    'max_depth': [5, 8, 10,12], # Keep shallow to prevent overfitting noise
    'min_samples_leaf': [3, 5, 10], # Higher = more generalized
    'max_features': ['sqrt', 0.5]
}

rf_model = RandomForestClassifier(random_state=42,class_weight='balanced')
rf_search = GridSearchCV(rf_model, rf_param_grid, cv=tscv, scoring='accuracy', n_jobs=-1, verbose=1)
rf_search.fit(X, y)

print(f"✅ Best RF Params: {rf_search.best_params_}")
print(f"✅ Best RF Score: {rf_search.best_score_:.2%}")
best_rf = rf_search.best_estimator_

# ==========================================
# 5. TUNE LOGISTIC REGRESSION
# ==========================================
print("\n--- Tuning Logistic Regression ---")
lr_param_grid = {
    'C': [0.01, 0.1, 1.0, 10.0], # Regularization strength
    'solver': ['lbfgs', 'newton-cg']
}

scaler = StandardScaler() # LR needs scaled data
X_scaled = scaler.fit_transform(X)

lr_model = LogisticRegression(multi_class='multinomial', max_iter=10000,class_weight='balanced')
lr_search = GridSearchCV(lr_model, lr_param_grid, cv=tscv, scoring='accuracy', n_jobs=-1)
lr_search.fit(X_scaled, y)

print(f"✅ Best LR Params: {lr_search.best_params_}")
print(f"✅ Best LR Score: {lr_search.best_score_:.2%}")
best_lr = lr_search.best_estimator_


# --- 4. The Ensemble (UPGRADED TO STACKING) ---
print("Training Stacking Ensemble (The Supervisor)...")

# The "Final Estimator" decides how to mix the models based on their outputs.
# Logistic Regression is excellent for this "Arbitration" role.
stacking_ensemble = StackingClassifier(
    estimators=[
        ('xgb', best_xgb),
        ('rf', best_rf),
        ('lr', best_lr)
    ],
    final_estimator=LogisticRegression(),
    cv=5, # Internal Cross-Validation to train the supervisor without leakage
    n_jobs=-1
)

# Note: We wrap this in Calibration just like before
calibrated_stack = CalibratedClassifierCV(
    estimator=stacking_ensemble,
    method='isotonic',
    cv=3
)

calibrated_stack.fit(X, y)
print("✅ Smart Ensemble Trained & Calibrated.")

# Update the variable name for saving
calibrated_ensemble = calibrated_stack

# ==========================================
# 7. FINAL EVALUATION
# ==========================================
preds = calibrated_ensemble.predict(X_test)
probs = calibrated_ensemble.predict_proba(X_test)
acc = accuracy_score(y_test, preds)

print("==================================================")
print(f"   CALIBRATED ACCURACY: {acc:.2%}   ")
print("==================================================")

# Check the new Probabilities
print("Sample Probabilities (First 5 matches):")
print(probs[:5])

# Value Bets Check
results = pd.DataFrame({
    'Home': df.iloc[split:]['home_team_name'],
    'Away': df.iloc[split:]['away_team_name'],
    'Actual': y_test.values,
    'Pred': preds,
    'Conf': np.max(probs, axis=1)
})

for t in [0.45, 0.50, 0.60, 0.70, 0.80]:
    sub = results[results['Conf'] > t]
    if len(sub) > 0:
        print(f"Threshold > {t:.2f}: {len(sub)} matches | Accuracy: {accuracy_score(sub['Actual'], sub['Pred']):.2%}")

# Save the CALIBRATED model
joblib.dump({
    'model': calibrated_ensemble, 
    'features': features,
    'elo_dict': dict(zip(df['home_team_name'], df['home_elo'])),
    'df_recent': df[['date', 'home_team_name', 'away_team_name'] + [c for c in df.columns if 'roll_' in c]].tail(1000)
}, 'football_model_final.pkl')

print("✅ Calibrated Model saved.")

   HYPERPARAMETER TUNING ENGINE (ELO + ENSEMBLE)  
Generating Smart Form (EWMA)...
Tuning on 1272 matches...

--- Tuning XGBoost ---
Fitting 5 folds for each of 72 candidates, totalling 360 fits
✅ Best XGB Params: {'colsample_bytree': 0.8, 'gamma': 1, 'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 100, 'subsample': 0.7}
✅ Best XGB Score: 51.60%

--- Tuning Random Forest ---
Fitting 5 folds for each of 48 candidates, totalling 240 fits
✅ Best RF Params: {'max_depth': 12, 'max_features': 'sqrt', 'min_samples_leaf': 3, 'n_estimators': 400}
✅ Best RF Score: 49.34%

--- Tuning Logistic Regression ---
✅ Best LR Params: {'C': 0.01, 'solver': 'lbfgs'}
✅ Best LR Score: 49.81%
Training Stacking Ensemble (The Supervisor)...
✅ Smart Ensemble Trained & Calibrated.
   CALIBRATED ACCURACY: 63.14%   
Sample Probabilities (First 5 matches):
[[0.09973734 0.34912205 0.55114061]
 [0.46309597 0.19801065 0.33889338]
 [0.57251035 0.22114087 0.20634878]
 [0.14946671 0.1353177  0.71521558]
 [0.18539342

In [50]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, preds)
print(cm)
print(f"Accuracy: {accuracy_score(y_test, preds):.2%}")

[[55  0 29]
 [10  7 30]
 [25  0 99]]
Accuracy: 63.14%
