# FPL AI Advisor v3.0 - Advanced Data Integration 🚀

This version is a major overhaul designed to work directly with your local historical data structure.

**Key Improvements:**
1.  **Historical Data Loading**: The code now reads all your `gw_data/GW*.json` files to build a complete picture of the season.
2.  **Dynamic Feature Engineering**: We will create "form" metrics (e.g., goals scored, ICT index) based on a **5-gameweek rolling average**. This is the crucial fix for the match prediction model.
3.  **Accurate Training Set**: A proper training dataset is constructed from past matches using these new dynamic features. This will stop the `0-0` predictions.

In [28]:
import pandas as pd
import numpy as np
import json
import os
import glob # To find files matching a pattern
from tqdm import tqdm # For progress bars

# --- Machine Learning ---
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import PoissonRegressor
from sklearn.metrics import mean_absolute_error

# --- Visualization & Utilities ---
from IPython.display import display, Markdown, HTML

# --- Settings ---
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

DATA_DIR = 'data/' # Set your main data directory here
print("Libraries imported and project structure configured.")

Libraries imported and project structure configured.


In [None]:
# --- Load Bootstrap (static) data ---
with open(os.path.join(DATA_DIR, 'bootstrap-static.json'), 'r', encoding='utf-8') as f:
    bootstrap_data = json.load(f)

players_master_df = pd.DataFrame(bootstrap_data['elements'])
teams_df = pd.DataFrame(bootstrap_data['teams'])
teams_map = teams_df.set_index('id')['name'].to_dict()

# --- Load ALL historical gameweek data ---
gw_files = glob.glob(os.path.join(DATA_DIR, 'gw_data', 'GW*.json'))
historical_data = []

print(f"Found {len(gw_files)} gameweek files. Loading...")

for gw_file in tqdm(gw_files):
    gw_num = int(os.path.basename(gw_file).replace('GW', '').replace('.json', ''))
    with open(gw_file, 'r', encoding='utf-8') as f:
        gw_data = json.load(f)
        
    gw_df = pd.DataFrame(gw_data['elements'])
    gw_df['gw'] = gw_num
    historical_data.append(gw_df)

# Combine all gameweeks into a single dataframe
historical_df = pd.concat(historical_data).sort_values(by=['id', 'gw']).reset_index(drop=True)

print("\nHistorical data for all players and gameweeks loaded successfully.")
display(historical_df[['id', 'gw', 'total_points', 'goals_scored', 'assists', 'minutes']].head())

Found 38 gameweek files. Loading...


  0%|          | 0/38 [00:00<?, ?it/s]


TypeError: list indices must be integers or slices, not str

## The Fix: Engineering Dynamic, Form-Based Features

This is the most important step to fix the match predictions. We will calculate each team's performance (goals, ICT, etc.) for every gameweek. Then, we'll compute a **5-gameweek rolling average** for these stats. This gives our model a sense of a team's recent "form," which is a powerful predictor of future results.

In [None]:
# Add team_id to the historical data for grouping
player_team_map = players_master_df.set_index('id')['team'].to_dict()
historical_df['team_id'] = historical_df['id'].map(player_team_map)

# --- Aggregate player stats to get team-level stats for each gameweek ---
team_gw_stats = historical_df.groupby(['team_id', 'gw']).agg(
    goals_scored=('goals_scored', 'sum'),
    assists=('assists', 'sum'),
    ict_index=('ict_index', 'sum'),
    minutes=('minutes', 'sum')
).reset_index()

# We also need goals conceded. We find this by looking at the opponent's goals.
# First, let's get a list of all historical fixtures
fixtures_df = pd.read_csv('https://fixturedownload.com/feed/epl/2023') # Using an external source for ease
fixtures_df['home_id'] = fixtures_df['Home Team'].map({v: k for k, v in teams_map.items()})
fixtures_df['away_id'] = fixtures_df['Away Team'].map({v: k for k, v in teams_map.items()})

# --- Calculate Goals Conceded ---
goals_conceded = []
for _, row in team_gw_stats.iterrows():
    team = row['team_id']
    gw = row['gw']
    
    # Find the fixture for this team in this gameweek
    fixture = fixtures_df[(fixtures_df['Gameweek'] == gw) & ((fixtures_df['home_id'] == team) | (fixtures_df['away_id'] == team))]
    if fixture.empty:
        goals_conceded.append(0)
        continue
    
    if fixture.iloc[0]['home_id'] == team:
        goals_conceded.append(fixture.iloc[0]['Result'].split(' - ')[1]) # They were home, take away score
    else:
        goals_conceded.append(fixture.iloc[0]['Result'].split(' - ')[0]) # They were away, take home score
        
team_gw_stats['goals_conceded'] = pd.to_numeric(goals_conceded)


# --- Calculate 5-GW Rolling Averages ---
# Sort values to ensure the rolling window is correct
team_gw_stats = team_gw_stats.sort_values(by=['team_id', 'gw'])

stats_to_roll = ['goals_scored', 'goals_conceded', 'ict_index']
for stat in stats_to_roll:
    # We use .shift(1) to ensure we're only using data *before* the current gameweek
    team_gw_stats[f'rolling_{stat}'] = team_gw_stats.groupby('team_id')[stat].shift(1).rolling(window=5, min_periods=1).mean()

print("5-Gameweek rolling average features created.")
display(team_gw_stats[team_gw_stats['team_id'] == 1].tail(10).style.format(precision=2)) # Example for Arsenal

In [None]:
# --- Build the final training set for the match prediction model ---
training_data = []
# We can only start from GW6 since we need a 5-week history
for gw in range(6, historical_df['gw'].max() + 1):
    gw_fixtures = fixtures_df[fixtures_df['Gameweek'] == gw]
    for _, fixture in gw_fixtures.iterrows():
        home_team = fixture['home_id']
        away_team = fixture['away_id']
        
        # Get the rolling stats for each team *before* this gameweek
        home_stats = team_gw_stats[(team_gw_stats['team_id'] == home_team) & (team_gw_stats['gw'] == gw)]
        away_stats = team_gw_stats[(team_gw_stats['team_id'] == away_team) & (team_gw_stats['gw'] == gw)]
        
        if home_stats.empty or away_stats.empty: continue
            
        training_data.append({
            'gw': gw,
            'home_team': home_team,
            'away_team': away_team,
            'home_goals_actual': int(fixture['Result'].split(' - ')[0]),
            'away_goals_actual': int(fixture['Result'].split(' - ')[1]),
            'home_rolling_gs': home_stats.iloc[0]['rolling_goals_scored'],
            'home_rolling_gc': home_stats.iloc[0]['rolling_goals_conceded'],
            'home_rolling_ict': home_stats.iloc[0]['rolling_ict_index'],
            'away_rolling_gs': away_stats.iloc[0]['rolling_goals_scored'],
            'away_rolling_gc': away_stats.iloc[0]['rolling_goals_conceded'],
            'away_rolling_ict': away_stats.iloc[0]['rolling_ict_index'],
        })

match_train_df = pd.DataFrame(training_data).dropna()

# --- Train the Poisson Models with our new features ---
features = [
    'home_rolling_gs', 'home_rolling_gc', 'home_rolling_ict',
    'away_rolling_gs', 'away_rolling_gc', 'away_rolling_ict'
]
X = match_train_df[features]
y_home = match_train_df['home_goals_actual']
y_away = match_train_df['away_goals_actual']

poisson_home = PoissonRegressor().fit(X, y_home)
poisson_away = PoissonRegressor().fit(X, y_away)

print("Match prediction models re-trained with dynamic 'form' features.")
display(match_train_df.head().style.format(precision=2))

In [None]:
def predict_upcoming_fixtures(fixtures_df, team_gw_stats, teams_map, home_model, away_model):
    """Predicts scores for the next gameweek using the latest rolling stats."""
    
    # Find the next gameweek
    next_gw = events_df[events_df['is_next'] == True]['id'].values[0]
    upcoming_fixtures = fixtures_df[fixtures_df['Gameweek'] == next_gw]
    
    if upcoming_fixtures.empty:
        print("No fixtures found for the next gameweek.")
        return
        
    predictions = []
    # Get the latest available stats for each team (from the last completed GW)
    latest_stats = team_gw_stats.loc[team_gw_stats.groupby('team_id')['gw'].idxmax()]
    
    for _, fixture in upcoming_fixtures.iterrows():
        home_team = fixture['home_id']
        away_team = fixture['away_id']
        
        home_form = latest_stats[latest_stats['team_id'] == home_team]
        away_form = latest_stats[latest_stats['team_id'] == away_team]
        
        if home_form.empty or away_form.empty: continue
            
        feature_vector = pd.DataFrame([{
            'home_rolling_gs': home_form.iloc[0]['rolling_goals_scored'],
            'home_rolling_gc': home_form.iloc[0]['rolling_goals_conceded'],
            'home_rolling_ict': home_form.iloc[0]['rolling_ict_index'],
            'away_rolling_gs': away_form.iloc[0]['rolling_goals_scored'],
            'away_rolling_gc': away_form.iloc[0]['rolling_goals_conceded'],
            'away_rolling_ict': away_form.iloc[0]['rolling_ict_index'],
        }])
        
        home_goals = int(round(home_model.predict(feature_vector)[0]))
        away_goals = int(round(away_model.predict(feature_vector)[0]))
        
        predictions.append({
            'Home Team': teams_map[home_team],
            'Away Team': teams_map[away_team],
            'Predicted Score': f"{home_goals} - {away_goals}"
        })
        
    return pd.DataFrame(predictions)


# --- Generate and Display Predictions ---
final_predictions = predict_upcoming_fixtures(fixtures_df, team_gw_stats, teams_map, poisson_home, poisson_away)

display(HTML("<h3>✅ Match Predictions (Fixed)</h3>"))
display(final_predictions.style.hide(axis="index"))