In [62]:
import pandas as pd
import numpy as np
import xgboost as xgb
import joblib
import re
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import accuracy_score

warnings.filterwarnings('ignore')

print("==================================================")
print("   FOOTBALL MATCH PREDICTOR (GOALS ADDED)         ")
print("==================================================")

# ==========================================
# 1. LOAD & CLEAN DATA
# ==========================================
try:
    df = pd.read_csv("match_data.csv")
except FileNotFoundError:
    print("Error: match_data.csv not found.")
    exit()

def extract_date(url):
    try:
        match = re.search(r'([A-Za-z]+-\d{1,2}-\d{4})', str(url))
        if match: return pd.to_datetime(match.group(1), format='%B-%d-%Y', errors='coerce')
    except: pass
    return pd.NaT

df['date'] = df['match_url'].apply(extract_date)
df = df.sort_values(by='date').reset_index(drop=True)

# Numeric Cleaning
stats_cols = ["xg", "possession", "shots_onTarget", "corners", "fouls", "team_points"]
for side in ['home', 'away']:
    p_col = f"{side}_team_possession"
    if p_col in df.columns:
        df[p_col] = pd.to_numeric(df[p_col].astype(str).str.rstrip('%'), errors='coerce').fillna(50) / 100.0
    
    for s in stats_cols:
        col = f"{side}_{s}" if s not in ['xg', 'possession', 'team_points'] else f"{side}_team_{s}"
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
            df[col] = df[col].fillna(df.groupby(f'{side}_team_name')[col].transform('median'))
            df[col] = df[col].fillna(df[col].mean())

# ==========================================
# 2. FEATURE ENGINEERING ENGINE
# ==========================================

# --- ELO (Standard +70 Home) ---
def calculate_elo(df):
    k_factor = 20
    home_advantage = 70
    team_elos = {team: 1500 for team in set(df['home_team_name']).union(set(df['away_team_name']))}
    elo_h, elo_a = [], []
    
    for _, row in df.iterrows():
        h, a = row['home_team_name'], row['away_team_name']
        rh, ra = team_elos[h], team_elos[a]
        elo_h.append(rh); elo_a.append(ra)
        
        if row['home_team_score'] > row['away_team_score']: res = 1
        elif row['home_team_score'] == row['away_team_score']: res = 0.5
        else: res = 0
        
        dr = (rh + home_advantage) - ra
        e_h = 1 / (1 + 10 ** (-dr / 400))
        
        change = k_factor * (res - e_h)
        team_elos[h] = rh + change
        team_elos[a] = ra - change
        
    return elo_h, elo_a, team_elos

df['home_elo'], df['away_elo'], current_elos = calculate_elo(df)
df['diff_elo'] = (df['home_elo'] + 70) - df['away_elo']

# --- Rest Days ---
long_df = pd.concat([
    df[['date', 'home_team_name']].rename(columns={'home_team_name':'team'}),
    df[['date', 'away_team_name']].rename(columns={'away_team_name':'team'})
]).sort_values(['team', 'date'])
long_df['rest'] = (long_df['date'] - long_df.groupby('team')['date'].shift(1)).dt.days.fillna(7).clip(upper=14)
rest_map = dict(zip(zip(long_df['date'], long_df['team']), long_df['rest']))

df['diff_rest'] = df.apply(lambda x: rest_map.get((x['date'], x['home_team_name']),7), axis=1) - \
                  df.apply(lambda x: rest_map.get((x['date'], x['away_team_name']),7), axis=1)

# --- NEW: GOALS & CONCEDED COLUMNS ---
# We need to explicitly calculate what each team scored and conceded to roll it
df['home_goals_scored'] = df['home_team_score']
df['home_goals_conceded'] = df['away_team_score']
df['away_goals_scored'] = df['away_team_score']
df['away_goals_conceded'] = df['home_team_score']

# --- Rolling Stats (EWMA) ---
df['home_team_points'] = np.select([df['home_team_score']>df['away_team_score'], df['home_team_score']==df['away_team_score']], [3, 1], 0)
df['away_team_points'] = np.select([df['away_team_score']>df['home_team_score'], df['away_team_score']==df['home_team_score']], [3, 1], 0)

# ADDED: 'goals_scored', 'goals_conceded'
roll_feats = ['team_xg', 'team_possession', 'shots_onTarget', 'corners', 'team_points', 'fouls', 'goals_scored', 'goals_conceded']

h_d = df[['date', 'match_url', 'home_team_name']].rename(columns={'home_team_name':'team'})
a_d = df[['date', 'match_url', 'away_team_name']].rename(columns={'away_team_name':'team'})

for f in roll_feats:
    # Logic to find the right column name in the source DF
    if f in ['goals_scored', 'goals_conceded']:
        col_h, col_a = f"home_{f}", f"away_{f}"
    elif f in ['team_points', 'team_xg', 'team_possession']:
        col_h, col_a = f"home_{f}", f"away_{f}"
    else:
        col_h, col_a = f"home_{f}", f"away_{f}"
        
    if col_h in df.columns: h_d[f] = df[col_h]
    if col_a in df.columns: a_d[f] = df[col_a]

stacked = pd.concat([h_d, a_d]).sort_values(['team', 'date'])

for f in roll_feats:
    if f in stacked.columns:
        # EWMA Span 10 (Longer term view helps stability)
        stacked[f'roll_{f}'] = stacked.groupby('team')[f].transform(lambda x: x.shift(1).ewm(span=10, min_periods=1).mean())

df = df.merge(stacked[['match_url', 'team'] + [f'roll_{f}' for f in roll_feats]], left_on=['match_url', 'home_team_name'], right_on=['match_url', 'team'], how='left').drop(columns=['team']).rename(columns={f'roll_{f}': f'home_roll_{f}' for f in roll_feats})
df = df.merge(stacked[['match_url', 'team'] + [f'roll_{f}' for f in roll_feats]], left_on=['match_url', 'away_team_name'], right_on=['match_url', 'team'], how='left').drop(columns=['team']).rename(columns={f'roll_{f}': f'away_roll_{f}' for f in roll_feats})

for f in roll_feats:
    df[f'diff_{f}'] = df[f'home_roll_{f}'] - df[f'away_roll_{f}']

df = df.fillna(0)

# ==========================================
# 3. MODEL CONFIGURATION
# ==========================================
features = [
    'diff_elo', 'home_elo', 'away_elo',
    'diff_rest', 'diff_team_points', 
    'diff_team_xg', 'diff_team_possession', 
    'diff_shots_onTarget', 'diff_corners',
    # NEW DIFFS
    'diff_goals_scored', 'diff_goals_conceded', 
    
    # Raw Rolling Context
    'home_roll_team_xg', 'away_roll_team_xg',
    'home_roll_goals_scored', 'away_roll_goals_scored', # Crucial for "Can they finish?"
    'home_roll_goals_conceded', 'away_roll_goals_conceded' # Crucial for "Is defense leaky?"
]

conditions = [
    (df['home_team_score'] > df['away_team_score']),
    (df['home_team_score'] == df['away_team_score']),
    (df['home_team_score'] < df['away_team_score'])
]
y = np.select(conditions, [2, 1, 0])
X = df[features]

print(f"Training on {len(X)} matches with {len(features)} features...")

# --- Stacking Ensemble ---
xgb_clf = xgb.XGBClassifier(
    n_estimators=300, learning_rate=0.01, max_depth=3, 
    subsample=0.7, colsample_bytree=0.8, gamma=3, # Slightly reduced gamma
    objective='multi:softprob', num_class=3, random_state=42
)

rf_clf = RandomForestClassifier(
    n_estimators=200, max_depth=8, min_samples_leaf=3, 
    max_features=0.5, random_state=42
)

lr_clf = make_pipeline(StandardScaler(), LogisticRegression(C=0.1, solver='lbfgs', multi_class='multinomial', max_iter=1000))

stacking_ensemble = StackingClassifier(
    estimators=[('xgb', xgb_clf), ('rf', rf_clf), ('lr', lr_clf)],
    final_estimator=LogisticRegression(),
    cv=5,
    n_jobs=-1
)

calibrated = CalibratedClassifierCV(stacking_ensemble, method='isotonic', cv=3)
calibrated.fit(X, y)

print("✅ Enhanced Model Trained (Goals Added).")

# Save
joblib.dump({
    'model': calibrated, 
    'features': features,
    'elo_dict': current_elos,
    'df_recent': df[['date', 'home_team_name', 'away_team_name'] + [c for c in df.columns if 'roll_' in c]].tail(1000)
}, 'football_model_final.pkl')

print("✅ Saved to 'football_model_final.pkl'")

   FOOTBALL MATCH PREDICTOR (GOALS ADDED)         
Training on 1272 matches with 17 features...
✅ Enhanced Model Trained (Goals Added).
✅ Saved to 'football_model_final.pkl'


In [None]:
from sklearn.metrics import confusion_matrix

split = int(len(X) * 0.8)
X_train, y_train = X[:split], y[:split]
X_test, y_test = X[split:], y[split:]


# ==========================================
# 7. FINAL EVALUATION
# ==========================================
preds = calibrated.predict(X_test)
probs = calibrated.predict_proba(X_test)
acc = accuracy_score(y_test, preds)

print("==================================================")
print(f"   CALIBRATED ACCURACY: {acc:.2%}   ")
print("==================================================")

# Check the new Probabilities
print("Sample Probabilities (First 5 matches):")
print(probs[:5])

# Value Bets Check
results = pd.DataFrame({
    'Home': df.iloc[split:]['home_team_name'],
    'Away': df.iloc[split:]['away_team_name'],
    'Actual': y_test,
    'Pred': preds,
    'Conf': np.max(probs, axis=1)
})

for t in [0.45, 0.50, 0.60, 0.70, 0.80]:
    sub = results[results['Conf'] > t]
    if len(sub) > 0:
        print(f"Threshold > {t:.2f}: {len(sub)} matches | Accuracy: {accuracy_score(sub['Actual'], sub['Pred']):.2%}")

# Save the CALIBRATED model
joblib.dump({
    'model': calibrated, 
    'features': features,
    'elo_dict': dict(zip(df['home_team_name'], df['home_elo'])),
    'df_recent': df[['date', 'home_team_name', 'away_team_name'] + [c for c in df.columns if 'roll_' in c]].tail(1000)
}, 'football_model_final.pkl')

print("✅ Calibrated Model saved.")

cm = confusion_matrix(y_test, preds)
print(cm)
print(f"Accuracy: {accuracy_score(y_test, preds):.2%}")

   CALIBRATED ACCURACY: 60.00%   
Sample Probabilities (First 5 matches):
[[0.13950142 0.25849408 0.6020045 ]
 [0.52919244 0.20402902 0.26677854]
 [0.47610285 0.21800867 0.30588848]
 [0.1257092  0.20255163 0.67173916]
 [0.20530875 0.21450411 0.58018714]]


AttributeError: 'numpy.ndarray' object has no attribute 'values'