In [None]:
# Suppress pandas FutureWarnings for cleaner output
import warnings
warnings.filterwarnings('ignore', category=FutureWarning, module='pandas')


: 

In [None]:
# === PREDICTION FUNCTION FOR NEW FIXTURES ===

def predict_match(team, opponent, venue, date, time, formation="4-2-3-1"):
    """
    Predict the outcome of a single upcoming fixture.

    Builds one feature row in the same order as the training predictors
    (venue, opponent, hour, weekday, xG per shot, possession efficiency,
    formation, month, then every column in ``new_cols``) and scores it with
    the module-level ``final_rf`` model.

    Args:
        team: Team name as it appears in the "team" column of ``matches``.
        opponent: Opponent name; it is passed through ``mapping`` before
            being looked up in the "opponent" column of ``matches``.
        venue: "Home" is encoded as 1; any other value is encoded as 0.
        date: Anything ``pd.to_datetime`` can parse.
        time: Kick-off time string; the part before the first ":" is the hour.
        formation: Formation label; labels not present in ``matches`` get 0.

    Returns:
        dict echoing the inputs plus "win_probability" (index 1 of the
        model's ``predict_proba`` row) and "prediction" ("Win" when that
        probability exceeds 0.5, otherwise "Loss/Draw").
    """
    # Convert inputs to same format as training data
    venue_code = 1 if venue == "Home" else 0

    # "opp_code" holds the category codes of the "opponent" column, so a
    # club's code has to be read from rows where that club IS the opponent.
    # (Reading it from rows where the club is the "team" would return the
    # code of whoever that club happened to play first.)
    mapped_opponent = mapping[opponent]
    opponent_rows = matches.loc[matches["opponent"] == mapped_opponent, "opp_code"]
    # Unknown opponents fall back to 0.
    # NOTE(review): category codes start at 0, so this default coincides
    # with a real opponent's code.
    opp_code = opponent_rows.iloc[0] if not opponent_rows.empty else 0

    # Parse time and date
    hour = int(time.split(":")[0])
    match_date = pd.to_datetime(date)
    day_code = match_date.dayofweek
    season_stage = match_date.month

    # Formation code, with 0 for formations never seen in the data
    formation_rows = matches.loc[matches["formation"] == formation, "formation_code"]
    formation_code = formation_rows.iloc[0] if not formation_rows.empty else 0

    # Latest per-team statistics, indexed by the names in new_cols
    team_stats = get_latest_team_stats(team, matches_rolling)

    if pd.isna(team_stats).any():
        # Some statistic is missing: fall back to the team's overall means,
        # or to fixed defaults when the team has no rows at all.
        team_data = matches[matches["team"] == team]
        has_history = len(team_data) > 0
        xg_per_shot = team_data["xg_per_shot"].mean() if has_history else 0.1
        possession_efficiency = (
            team_data["possession_efficiency"].mean() if has_history else 0.02
        )
    else:
        shots = team_stats["sh_rolling"]
        xg_per_shot = team_stats["xg_rolling"] / shots if shots > 0 else 0.1
        possession_efficiency = team_stats["gf_rolling"] / 50  # Rough possession estimate

    # Feature order must match the predictor order used for training.
    features = [
        venue_code, opp_code, hour, day_code,
        xg_per_shot, possession_efficiency, formation_code, season_stage,
    ]

    # Rolling columns, with 0 standing in for anything missing
    features.extend(
        team_stats[col]
        if col in team_stats.index and not pd.isna(team_stats[col])
        else 0
        for col in new_cols
    )

    # Make prediction
    rf_prob = final_rf.predict_proba([features])[0][1]

    return {
        "team": team,
        "opponent": opponent,
        "venue": venue,
        "date": date,
        "time": time,
        "win_probability": rf_prob,
        "prediction": "Win" if rf_prob > 0.5 else "Loss/Draw",
    }

def predict_fixtures(fixtures_df):
    """
    Run predict_match over every row of a fixtures table.

    fixtures_df must provide the columns team, opponent, venue, date and
    time. The result is a DataFrame with one row per fixture, built from the
    dicts returned by predict_match.
    """
    wanted = ("team", "opponent", "venue", "date", "time")
    rows = [
        predict_match(*(fixture[name] for name in wanted))
        for _, fixture in fixtures_df.iterrows()
    ]
    return pd.DataFrame(rows)

# Tell the notebook user the helpers are defined and how to call them.
print(
    "✅ Prediction functions ready!",
    "📝 Usage: predict_match(team, opponent, venue, date, time)",
    "📊 Or: predict_fixtures(fixtures_dataframe)",
    sep="\n",
)


In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Read the raw match log; every engineered column below is added to this frame.
matches = pd.read_csv("matches.csv")

# Parse the date strings into datetimes so the .dt accessors below work.
matches["date"] = pd.to_datetime(matches["date"])

# --- Basic encodings and the prediction target ---
matches = matches.assign(
    # integer category code of the home/away label
    venue_code=lambda df: df["venue"].astype("category").cat.codes,
    # integer category code of the opponent name
    opp_code=lambda df: df["opponent"].astype("category").cat.codes,
    # kick-off hour: drop everything from the first ":" onwards
    hour=lambda df: df["time"].str.replace(":.+", "", regex=True).astype("int"),
    # day of the week as a number
    day_code=lambda df: df["date"].dt.dayofweek,
    # 1 when the team won, 0 otherwise
    target=lambda df: df["result"].eq('W').astype("int"),
)

# === HIGH-VALUE FEATURES ===
matches = matches.assign(
    # xG-based features
    xg_diff=lambda df: df["xg"].sub(df["xga"]),
    # zero denominators are swapped for 1 to avoid dividing by zero
    xg_per_shot=lambda df: df["xg"].div(df["sh"].replace(0, 1)),
    # efficiency metrics
    goals_per_xg=lambda df: df["gf"].div(df["xg"].replace(0, 1)),
    shots_accuracy=lambda df: df["sot"].div(df["sh"].replace(0, 1)),
    possession_efficiency=lambda df: df["gf"].div(df["poss"].add(1)),
)

# Formation as a category code, and the calendar month as season context.
matches = matches.assign(
    formation_code=lambda df: df["formation"].astype("category").cat.codes,
    season_stage=lambda df: df["date"].dt.month,
)

# Import required libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score

# Create rolling averages function
def rolling_averages(group, cols, new_cols, window=3):
    """
    Add lagged rolling means of ``cols`` to one team's matches.

    The rows are sorted by "date" and, for every row, the mean of the
    previous ``window`` rows is computed (``closed='left'`` leaves the
    current row out of its own window). Rows without a full window of
    earlier matches are dropped.

    Args:
        group: DataFrame with a "date" column and every column in ``cols``.
        cols: Source column names to average.
        new_cols: Output column names, one per entry of ``cols``, same order.
        window: Number of preceding rows in each average (default 3, the
            value that was previously hard-coded).

    Returns:
        A new, date-sorted DataFrame with ``new_cols`` added; the frame
        passed in is not modified.

    Raises:
        ValueError: If ``cols`` and ``new_cols`` differ in length.
    """
    if len(cols) != len(new_cols):
        raise ValueError("cols and new_cols must have the same length")

    # sort_values returns a new frame, so the caller's data stays untouched
    ordered = group.sort_values("date")
    lagged_means = ordered[cols].rolling(window, closed='left').mean()
    ordered[new_cols] = lagged_means
    # The first `window` rows have no complete history and come out as NaN
    return ordered.dropna(subset=new_cols)

# Per-match statistics (including the xG and efficiency metrics) that get a
# lagged rolling-average counterpart named "<column>_rolling".
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt", "xg", "xga", "xg_diff", "goals_per_xg", "shots_accuracy"]
new_cols = [f"{c}_rolling" for c in cols]

# Calculate rolling averages team by team and stack the results.
# Iterating the groupby hands each helper call the complete sub-frame,
# "team" column included. groupby(...).apply(...) passing the grouping column
# to the applied function is deprecated in newer pandas releases, and later
# code filters this frame on "team". ignore_index=True produces the same
# 0..n-1 index the frame was previously given by hand.
matches_rolling = pd.concat(
    [
        rolling_averages(team_matches, cols, new_cols)
        for _, team_matches in matches.groupby("team")
    ],
    ignore_index=True,
)

# Define predictors
predictors = ["venue_code", "opp_code", "hour", "day_code"]
updated_predictors = predictors + ["xg_per_shot", "possession_efficiency", "formation_code", "season_stage"] + new_cols

# === ENHANCED MODEL TRAINING FOR 2025-2026 PREDICTIONS ===

def train_final_model(data, predictors, cutoff_date='2025-06-01'):
    """
    Fit the production random forest on every match dated before cutoff_date.

    No hold-out split is made: all rows of `data` whose "date" is earlier
    than `cutoff_date` are used, with `predictors` as the feature columns and
    "target" as the label. Returns the fitted RandomForestClassifier.
    """
    history = data.loc[data["date"] < cutoff_date]

    forest = RandomForestClassifier(
        n_estimators=50,
        min_samples_split=10,
        random_state=1,  # fixed seed
    )
    # fit() returns the estimator itself
    return forest.fit(history[predictors], history["target"])

# Announce the training run, then fit the model on the rolling-feature table.
print(
    "🚀 TRAINING FINAL MODELS FOR 2025-2026 SEASON",
    "=" * 60,
    "📊 Using ALL historical data for training...",
    sep="\n",
)

final_rf = train_final_model(data=matches_rolling, predictors=updated_predictors)

# Get latest team stats for rolling averages
def get_latest_team_stats(team_name, data, n_matches=3):
    """
    Return a team's rolling-average features for its next match.

    Training rows carry, in each ``*_rolling`` column, the mean of that
    statistic over the matches played before the row's own match. The
    equivalent for an upcoming fixture is the mean of the raw statistics in
    ``cols`` over the team's most recent ``n_matches`` rows. Averaging the
    stored ``*_rolling`` values instead would leave out the latest match and
    blend in older ones.

    Args:
        team_name: Value to match against the "team" column of ``data``.
        data: DataFrame with "team", "date" and every column in ``cols``.
        n_matches: How many of the most recent rows to average. If the team
            has fewer rows, all of them are used.

    Returns:
        Series indexed by ``new_cols``. It is all NaN when the team has no
        rows in ``data``.
    """
    team_rows = data[data["team"] == team_name].sort_values("date")
    # A negative-start slice already yields every row when fewer than
    # n_matches exist, so no separate short-history branch is needed.
    recent = team_rows.iloc[-n_matches:]
    latest = recent[cols].mean()
    # cols and new_cols are parallel lists; relabel so callers can keep
    # indexing by the "*_rolling" names.
    latest.index = new_cols
    return latest

# Create team mappings
class MissingDict(dict):
    """dict whose subscript lookup returns the key itself when it is absent."""

    def __missing__(self, key):
        # Called by dict.__getitem__ for unknown keys; nothing is stored.
        return key

# Alternative spellings of club names. Names without an entry map to
# themselves through MissingDict.
# NOTE(review): presumably the keys follow the "team" column spelling and
# the values the "opponent" column spelling — confirm against matches.csv.
map_values = {
    "Brighton and Hove Albion": "Brighton",
    "Manchester United": "Manchester Utd",
    "Newcastle United": "Newcastle Utd",
    "Tottenham Hotspur": "Tottenham",
    "West Ham United": "West Ham",
    "Wolverhampton Wanderers": "Wolves",
}

mapping = MissingDict(**map_values)

# Get unique teams and their codes.
# "opp_code" is the category code of the "opponent" column, so each club's
# own code is read from rows where it appears as the opponent. Taking
# "opp_code" from the club's own rows would give the code of its first
# opponent instead. Clubs never listed as an opponent get 0.
teams = sorted(matches["team"].unique())
opponent_codes = dict(zip(matches["opponent"], matches["opp_code"]))
team_codes = {team: opponent_codes.get(mapping[team], 0) for team in teams}

print(f"✅ Models trained on {len(matches_rolling)} historical matches")
print("🎯 Ready for 2025-2026 season predictions!")
print("=" * 60)

