In [None]:
# OpenMP-related environment variables for macOS (if needed), plus a filter
# that hides pandas FutureWarnings for cleaner output.
import os
import warnings

os.environ.update({
    # Allow more than one OpenMP runtime to be loaded in the process
    'KMP_DUPLICATE_LIB_OK': 'True',
    # libomp library / include locations (for XGBoost)
    'LDFLAGS': '-L/opt/homebrew/opt/libomp/lib',
    'CPPFLAGS': '-I/opt/homebrew/opt/libomp/include',
    # Alternative approach - point the dynamic loader at libomp directly
    'DYLD_LIBRARY_PATH': '/opt/homebrew/opt/libomp/lib',
})

warnings.filterwarnings(action='ignore', category=FutureWarning, module='pandas')


In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Read the raw match table from disk
matches = pd.read_csv("matches.csv")

# Parse the date strings into datetimes; the .dt accessors further down rely on this
matches["date"] = pd.to_datetime(matches["date"])

# Integer category code for the venue (home/away) column
matches["venue_code"] = matches["venue"].astype("category").cat.codes

# Integer category code for the opponent column
matches["opp_code"] = matches["opponent"].astype("category").cat.codes

# Kick-off hour: drop everything from the first ":" onwards, then cast to int
matches["hour"] = (
    matches["time"]
    .str.replace(":.+", "", regex=True)
    .astype("int")
)

# Day of the week as a number
matches["day_code"] = matches["date"].dt.dayofweek

# Label to predict: 1 when the result is a win ('W'), otherwise 0
matches["target"] = matches["result"].eq('W').astype("int")

# === HIGH-VALUE FEATURES ===
# Expected-goals (xG) based features
matches["xg_diff"] = matches["xg"].sub(matches["xga"])
matches["xg_per_shot"] = matches["xg"].div(matches["sh"].replace(0, 1))

# Efficiency ratios; a zero denominator is swapped for 1 to avoid dividing by zero
matches["goals_per_xg"] = matches["gf"].div(matches["xg"].replace(0, 1))
matches["shots_accuracy"] = matches["sot"].div(matches["sh"].replace(0, 1))
matches["possession_efficiency"] = matches["gf"].div(matches["poss"].add(1))

# Integer category code for the formation column
matches["formation_code"] = matches["formation"].astype("category").cat.codes

# Calendar month of the match, used as a season-stage indicator
matches["season_stage"] = matches["date"].dt.month

# Import required libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score

# Create rolling averages function
def rolling_averages(group, cols, new_cols, window=3):
    """
    Add trailing rolling means of ``cols`` to one team's matches.

    The rows are sorted by ``date`` and, for every row, the mean of the
    previous ``window`` rows is stored under the matching name in
    ``new_cols``. The current row is excluded from its own window, so each
    value only reflects matches played earlier.

    Args:
        group: DataFrame with a ``date`` column and every column in ``cols``.
        cols: Source column names to average.
        new_cols: Output column names, one per entry of ``cols`` (same order).
        window: Number of preceding rows in each average (default 3).

    Returns:
        A new, date-sorted DataFrame with ``new_cols`` added. Rows where any
        rolling value is missing (e.g. fewer than ``window`` earlier rows)
        are dropped. ``group`` itself is not modified.
    """
    # sort_values returns a new frame, so the caller's data is left untouched
    ordered = group.sort_values("date")
    # closed='left' takes the current row out of its own window
    ordered[new_cols] = ordered[cols].rolling(window, closed='left').mean()
    return ordered.dropna(subset=new_cols)

# Per-match statistics that are turned into trailing rolling averages.
# xg_per_shot and possession_efficiency belong here rather than in the raw
# predictor list: both are computed from the match's own statistics
# (possession_efficiency is built from gf, the goals scored in that very
# match), so feeding the raw values to the model leaks the result it is
# trying to predict, and neither value is known before kick-off.
cols = [
    "gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt", "xg", "xga",
    "xg_diff", "goals_per_xg", "shots_accuracy",
    "xg_per_shot", "possession_efficiency",
]
new_cols = [f"{c}_rolling" for c in cols]

# Calculate rolling averages team by team. Iterating the groups hands each
# helper call the full sub-frame (including the "team" column), and
# ignore_index=True gives the combined table a fresh 0..n-1 index.
matches_rolling = pd.concat(
    [
        rolling_averages(team_matches, cols, new_cols)
        for _, team_matches in matches.groupby("team")
    ],
    ignore_index=True,
)

# Define predictors: only values available before the match is played
predictors = ["venue_code", "opp_code", "hour", "day_code"]
updated_predictors = predictors + ["formation_code", "season_stage"] + new_cols

# === ENHANCED MODEL TRAINING FOR 2025-2026 PREDICTIONS ===

def train_final_model(data, predictors, cutoff_date='2025-06-01'):
    """
    Train the final models on every match dated strictly before ``cutoff_date``.

    No test split is held out: all qualifying rows are used for fitting, so
    the returned models are intended for 2025-2026 season predictions rather
    than for evaluation.

    Args:
        data: DataFrame with a ``date`` column, a ``target`` column and every
            column named in ``predictors``.
        predictors: List of feature column names fed to the models.
        cutoff_date: Rows whose ``date`` is on or after this value are
            excluded from training.

    Returns:
        ``(rf_model, xgb_model, has_xgb)``. ``xgb_model`` is ``None`` and
        ``has_xgb`` is ``False`` when XGBoost could not be imported or fitted.

    Raises:
        ValueError: If no rows fall before ``cutoff_date``.
    """
    # Use ALL historical data before the cutoff for training (no test split)
    train_data = data[data["date"] < cutoff_date]
    if train_data.empty:
        raise ValueError(f"No training rows with date before {cutoff_date!r}")

    features = train_data[predictors]
    labels = train_data["target"]

    # Random Forest is always trained
    rf_model = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)
    rf_model.fit(features, labels)

    # XGBoost is optional. The broad catch is deliberate best-effort: a broken
    # native library load may surface as something other than ImportError.
    # The reason is printed instead of being dropped silently.
    try:
        import xgboost as xgb
        xgb_model = xgb.XGBClassifier(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=1, eval_metric='logloss')
        xgb_model.fit(features, labels)
    except Exception as exc:
        print(f"⚠️  XGBoost unavailable ({type(exc).__name__}: {exc}) - using Random Forest only")
        return rf_model, None, False

    # Both models trained successfully
    return rf_model, xgb_model, True

# Print the training banner, then fit the final models on the rolling-feature table
print(
    "🚀 TRAINING FINAL MODELS FOR 2025-2026 SEASON",
    "=" * 60,
    "📊 Using ALL historical data for training...",
    sep="\n",
)

final_rf, final_xgb, has_xgb = train_final_model(
    data=matches_rolling,
    predictors=updated_predictors,
)

# Get latest team stats for rolling averages
def get_latest_team_stats(team_name, data, n_matches=3):
    """
    Average the rolling-stat columns over a team's most recent rows.

    Rows of ``data`` whose ``team`` equals ``team_name`` are ordered by
    ``date``; the mean of the module-level ``new_cols`` columns is taken over
    the last ``n_matches`` of them, or over all of them when the team has
    fewer rows than that. Returns the per-column means as a Series.
    """
    is_team = data["team"] == team_name
    history = data.loc[is_team].sort_values("date")
    # A trailing slice that asks for more rows than exist simply returns
    # every row, which covers the short-history case without a branch.
    recent = history.iloc[-n_matches:]
    return recent[new_cols].mean()

# Create team mappings
class MissingDict(dict):
    """dict whose subscript lookup returns the key itself when it is absent."""

    def __missing__(self, key):
        # Called by dict.__getitem__ for unknown keys; nothing is stored.
        return key

# Full club names paired with their shortened spellings
map_values = {
    "Brighton and Hove Albion": "Brighton",
    "Manchester United": "Manchester Utd",
    "Newcastle United": "Newcastle Utd",
    "Tottenham Hotspur": "Tottenham",
    "West Ham United": "West Ham",
    "Wolverhampton Wanderers": "Wolves",
}

# Lookup table: names listed above are shortened, any other name maps to itself
mapping = MissingDict(map_values)

# Unique team names (as spelled in the "team" column), alphabetically
teams = sorted(matches["team"].unique())

# opp_code encodes the "opponent" column, so a team's own code must be looked
# up by name in that column. Reading opp_code off one of the team's own rows
# would return the code of whoever it played in that match instead.
# NOTE(review): assumes `mapping` translates "team" spellings into "opponent"
# spellings - confirm against matches.csv. Teams that never appear as an
# opponent get -1.
opponent_codes = dict(zip(matches["opponent"], matches["opp_code"]))
team_codes = {team: opponent_codes.get(mapping[team], -1) for team in teams}

print(f"✅ Models trained on {len(matches_rolling)} historical matches")
print("🎯 Ready for 2025-2026 season predictions!")
print("=" * 60)






⚽ EPL MATCH PREDICTION SYSTEM
🤖 TRAINING MODELS...
--------------------------------------------------
🌲 Random Forest Precision: 69.9%
⚠️  XGBoost not available - using Random Forest (69.9%)

📊 FEATURE IMPORTANCE - TOP 10 PREDICTORS
--------------------------------------------------
 1. possession_efficiency        39.7%
 2. xg_per_shot                   9.1%
 3. xg_diff_rolling               4.7%
 4. xg_rolling                    4.4%
 5. opp_code                      4.2%
 6. dist_rolling                  4.1%
 7. sh_rolling                    4.0%
 8. xga_rolling                   3.9%
 9. shots_accuracy_rolling        3.9%
10. goals_per_xg_rolling          3.7%

📈 PERFORMANCE SUMMARY
--------------------------------------------------
🎯 Model Precision:        69.9%
⚽ Win Rate (Test Set):    37.8%
📊 Better than Random:    85%
🔥 Top Feature:           possession_efficiency
