In [0]:
import pyspark.sql.functions as F
from pyspark.sql.window import Window

In [0]:
try:
    ENV = dbutils.widgets.get("ENV")
except Exception:
    ENV = "dev"

try:
    PROTOCOL = dbutils.widgets.get("PROTOCOL")
except Exception:
    PROTOCOL = "HIST"

#ensure valid ENV and PROTOCOL
valid_envs = {"dev", "test", "prod"}
valid_protocols = {"HIST", "INCR"}

# Validate ENV
if ENV not in valid_envs:
    print(f"Invalid ENV: {ENV}. Must be one of {valid_envs}. Exiting notebook.")
    dbutils.notebook.exit("Invalid ENV")

# Validate PROTOCOL
if PROTOCOL not in valid_protocols:
    print(f"Invalid PROTOCOL: {PROTOCOL}. Must be one of {valid_protocols}. Exiting notebook.")
    dbutils.notebook.exit("Invalid PROTOCOL")
    
silver_schema = f"fpl_silver_{ENV}"
feature_schema = f"fpl_feature_{ENV}"

rolling_window_size = 5

In [0]:
fixtures_df = spark.read.table(f"{silver_schema}.fixtures")
player_features_df = spark.read.table(f"{feature_schema}.player_features")
team_features_df = spark.read.table(f"{feature_schema}.team_features")
gameweek_stats_df = spark.read.table("fpl_silver_dev.gameweek_stats")
players_df = spark.read.table("fpl_silver_dev.players")

In [0]:
max_gameweek_row = team_features_df.agg(F.max("gameweek_key").alias("max_gameweek_key")).collect()[0]
max_gameweek = max_gameweek_row["max_gameweek_key"]
next_gameweek_key = max_gameweek + 1

next_fixtures_df = fixtures_df.filter(
    F.col("gameweek_key") == next_gameweek_key
).select(
    "fixture_key", 
    "home_team_key", 
    "away_team_key", 
    "gameweek_key", 
    "season_key"
)

display(next_fixtures_df)

fixture_key,home_team_key,away_team_key,gameweek_key,season_key
202526091,36,2,20252610,202526
202526092,90,3,20252610,202526
202526093,31,94,20252610,202526
202526094,54,39,20252610,202526
202526097,17,1,20252610,202526
202526099,6,8,20252610,202526
202526095,14,7,20252610,202526
202526100,21,4,20252610,202526
202526096,43,91,20252610,202526
202526098,56,11,20252610,202526


In [0]:
teams_next_fixtures_df = next_fixtures_df.select(
    "fixture_key", 
    "gameweek_key", 
    "season_key",
    F.col("home_team_key").alias("team_key")
).unionByName(
    next_fixtures_df.select(
        "fixture_key", 
        "gameweek_key", 
        "season_key",
        F.col("away_team_key").alias("team_key")
    )
)

# assumes if player played for a team last week they will again. slightly flawed but will be able to know this when selecting the team
players_next_fixtures_df = teams_next_fixtures_df.alias("tf").join(
    players_df.alias("pf"),
    (teams_next_fixtures_df.team_key == players_df.team_key) &
    (players_df.last_gameweek_key == max_gameweek),
    "inner"
).select(
    "tf.fixture_key",
    "tf.team_key",
    "tf.season_key",
    "tf.gameweek_key",
    "pf.player_key"
)

# Identify rolling stat columns from player_features_df
exclude_cols = ["player_key", "team_key", "fixture_key", "season_key", "gameweek_key", "player_id",
                "player_season_key", "player_fixture_key", "position_key", "was_home", "opponent_team_key", "exp_stats_available", "def_con_available", "total_points", "last_updated", "match_points"]
rolling_stat_cols = [c for c in player_features_df.columns if c not in exclude_cols]

# For each player, get last games (excluding the next fixture) from gameweek_stats_df
player_recent_df = players_next_fixtures_df.alias("nf").join(
    gameweek_stats_df.alias("gs"),
    (F.col("nf.player_key") == F.col("gs.player_key")) &
    (F.col("nf.team_key") == F.col("gs.team_key")) &
    (F.col("nf.season_key") == F.col("gs.season_key")) &
    (F.col("gs.gameweek_key") < F.col("nf.gameweek_key"))
)

window_spec = Window.partitionBy(
    "nf.player_key", "nf.team_key", "nf.fixture_key"
).orderBy(F.col("gs.gameweek_key").desc())

player_recent_df = player_recent_df.withColumn(
    "rank", F.row_number().over(window_spec)
).filter(
    F.col("rank") <= rolling_window
).drop("rank", "nf.gameweek_key", "nf.fixture_key", "nf.season_key")

display(player_recent_df)

fixture_key,team_key,season_key,gameweek_key,player_key,fixture_key.1,player_id,assists,bonus,bps,clean_sheets,clearances_blocks_interceptions,creativity,defensive_contribution,expected_assists,expected_goal_involvements,expected_goals,expected_goals_conceded,fixture,goals_conceded,goals_scored,ict_index,influence,minutes,own_goals,penalties_missed,penalties_saved,recoveries,red_cards,saves,starts,tackles,threat,total_points,value,was_home,yellow_cards,exp_stats_available,def_con_available,season_key.1,gameweek_key.1,player_season_key,team_key.1,player_key.1,position_key,opponent_team_key,team_score,opponent_score,player_fixture_key,minutes_points,assist_points,goal_points,clean_sheet_points,defensive_contribution_points,penalty_miss_points,goals_conceded_points,yellow_card_points,red_card_points,own_goal_points,last_updated
202526091,36,202526,20252610,15157,202526088,170,1,0,24,0,1,34.2,3,0.28,0.28,0.0,0.67,88,2,0,5.9,24.4,31,0,0,0,2,0,0,0,0,0.0,4,50,False,0,True,True,202526,20252609,202526170,36,15157,3,1,2,4,15157202526088,1,3,0,0,0,0,0,0,0,0,2025-11-18T19:31:00.619Z
202526091,36,202526,20252610,15157,202526071,170,0,0,1,0,2,0.1,3,0.0,0.0,0.0,0.75,71,1,0,0.1,1.2,19,0,0,0,1,0,0,0,0,0.0,1,50,True,0,True,True,202526,20252608,202526170,36,15157,3,4,2,1,15157202526071,1,0,0,0,0,0,0,0,0,0,2025-11-18T19:31:00.619Z
202526091,36,202526,20252610,15157,202526070,170,0,0,6,0,0,20.1,1,0.06,0.06,0.0,0.0,70,0,0,2.5,5.2,13,0,0,0,1,0,0,0,0,0.0,1,50,False,0,True,True,202526,20252607,202526170,36,15157,3,39,1,0,15157202526070,1,0,0,0,0,0,0,0,0,0,2025-11-18T19:31:00.619Z
202526091,36,202526,20252610,15157,202526053,170,0,0,0,0,0,0.0,0,0.0,0.0,0.0,0.0,53,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0,50,False,0,True,True,202526,20252606,202526170,36,15157,3,8,3,1,15157202526053,0,0,0,0,0,0,0,0,0,0,2025-11-18T19:31:00.619Z
202526091,36,202526,20252610,15157,202526043,170,0,0,5,0,1,11.3,4,0.03,0.03,0.0,0.31,43,1,0,1.7,5.2,27,0,0,0,3,0,0,0,0,0.0,1,50,True,0,True,True,202526,20252605,202526170,36,15157,3,6,2,1,15157202526043,1,0,0,0,0,0,0,0,0,0,2025-11-18T19:31:00.619Z
202526098,11,202526,20252610,17761,202526086,291,0,0,6,0,6,14.5,7,0.06,0.06,0.0,2.08,86,3,0,3.3,16.0,90,0,0,0,0,0,0,1,1,2.0,1,55,True,0,True,True,202526,20252609,202526291,11,17761,2,6,0,3,17761202526086,2,0,0,0,0,0,-1,0,0,0,2025-11-18T19:31:00.619Z
202526098,11,202526,20252610,17761,202526076,291,0,0,8,0,11,0.0,13,0.0,0.0,0.0,2.39,76,2,0,2.9,27.2,90,0,0,0,1,0,0,1,2,2.0,3,55,False,0,True,True,202526,20252608,202526291,11,17761,2,43,0,2,17761202526076,2,0,0,0,2,0,-1,0,0,0,2025-11-18T19:31:00.619Z
202526098,11,202526,20252610,17761,202526066,291,0,1,22,0,9,16.9,11,0.26,0.26,0.0,1.65,66,1,0,6.0,34.6,90,0,0,0,6,0,0,1,2,8.0,5,55,True,0,True,True,202526,20252607,202526291,11,17761,2,31,2,1,17761202526066,2,0,0,0,2,0,0,0,0,0,2025-11-18T19:31:00.619Z
202526098,11,202526,20252610,17761,202526055,291,0,0,10,0,7,15.7,7,0.11,0.11,0.0,1.17,55,1,0,3.7,21.6,90,0,0,0,3,0,0,1,0,0.0,1,55,True,1,True,True,202526,20252606,202526291,11,17761,2,21,1,1,17761202526055,2,0,0,0,0,0,0,-1,0,0,2025-11-18T19:31:00.619Z
202526098,11,202526,20252610,17761,202526046,291,0,0,12,0,11,7.8,14,0.02,0.02,0.0,0.9,46,2,0,4.2,34.6,90,0,0,0,2,0,0,1,3,0.0,3,55,False,0,True,True,202526,20252605,202526291,11,17761,2,14,1,2,17761202526046,2,0,0,0,2,0,-1,0,0,0,2025-11-18T19:31:00.619Z


In [0]:
def build_team_features(
    gameweek_stats_df,
    rolling_window_size,
    silver_schema,
    ignore_current_gameweek = True
):    
    
    fixtures_df = spark.read.table(f"{silver_schema}.fixtures")
    teams_df = spark.read.table(f"{silver_schema}.teams")

    #Aggregate xG, xA, and exp_stats_available per team per fixture
    team_xg_xa_df = gameweek_stats_df.groupBy(
        "fixture_key", "team_key"
        ).agg(
            F.sum("expected_goals").alias("team_expected_goals"),
            F.sum("expected_assists").alias("team_expected_assists"),
            F.max("exp_stats_available").alias("team_exp_stats_available")
        ).withColumn(
            "team_expected_goal_involvements",
            F.col("team_expected_goals") + F.col("team_expected_assists")
        )

    #Create opponent xG/xA aggregates
    opponent_xg_xa_df = team_xg_xa_df.select(
            "fixture_key",
            F.col("team_key").alias("opponent_team_key"),
            F.col("team_expected_goals").alias("expected_goals_against"),
            F.col("team_expected_assists").alias("expected_assists_against"),
            F.col("team_expected_goal_involvements").alias("expected_goal_involvements_against")
        )

    #Transform fixtures into team-level records
    home_df = fixtures_df.select(
            "fixture_key",
            "season_key",
            "gameweek_key",
            F.col("home_team_key").alias("team_key"),
            F.col("away_team_key").alias("opponent_team_key"),
            F.lit(True).alias("is_home"),
            F.col("home_team_score").alias("goals_for"),
            F.col("away_team_score").alias("goals_against")
        )

    away_df = fixtures_df.select(
            "fixture_key",
            "season_key",
            "gameweek_key",
            F.col("away_team_key").alias("team_key"),
            F.col("home_team_key").alias("opponent_team_key"),
            F.lit(False).alias("is_home"),
            F.col("away_team_score").alias("goals_for"),
            F.col("home_team_score").alias("goals_against")
        )

    team_fixtures_df = home_df.unionByName(away_df)

    # Window specs for rolling metrics up to previous GW
    base_window = Window.partitionBy("team_key", "season_key").orderBy("gameweek_key")

    if ignore_current_gameweek:
        rolling_window = base_window.rowsBetween(-rolling_window_size, -1)
    else:
        rolling_window = base_window

    first_gw = F.col("gameweek_key") == 1
    
    #Add match-level metrics
    team_fixtures_df = team_fixtures_df.withColumns({
        "goal_diff": F.col("goals_for") - F.col("goals_against"),
        "match_points": F.when(F.col("goals_for") > F.col("goals_against"), F.lit(3))
                        .when(F.col("goals_for") == F.col("goals_against"), F.lit(1))
                        .otherwise(F.lit(0))
    })

    #Join team xG/xA and opponent xG/xA
    team_fixtures_df = team_fixtures_df.join(
        team_xg_xa_df, 
        on=["fixture_key", "team_key"], 
        how="left"
        ).join(
            opponent_xg_xa_df, 
            on=["fixture_key", "opponent_team_key"], 
            how="left"
        )

    # Rolling metrics expressions
    rolling_exprs = {
        "rolling_points": F.when(first_gw, None).otherwise(F.sum("match_points").over(rolling_window)),
        "home_rolling_points": F.when(first_gw, None).otherwise(F.sum(F.when(F.col("is_home"), F.col("match_points")).otherwise(0)).over(rolling_window)),
        "away_rolling_points": F.when(first_gw, None).otherwise(F.sum(F.when(~F.col("is_home"), F.col("match_points")).otherwise(0)).over(rolling_window)),
        "rolling_team_expected_goals": F.when(first_gw, None).otherwise(F.sum("team_expected_goals").over(rolling_window)),
        "rolling_team_expected_assists": F.when(first_gw, None).otherwise(F.sum("team_expected_assists").over(rolling_window)),
        "rolling_team_expected_goal_involvements": F.when(first_gw, None).otherwise(F.sum("team_expected_goal_involvements").over(rolling_window)),
        "rolling_expected_goals_against": F.when(first_gw, None).otherwise(F.sum("expected_goals_against").over(rolling_window)),
        "rolling_expected_assists_against": F.when(first_gw, None).otherwise(F.sum("expected_assists_against").over(rolling_window)),
        "rolling_expected_goal_involvements_against": F.when(first_gw, None).otherwise(F.sum("expected_goal_involvements_against").over(rolling_window)),
        "rolling_goal_difference": F.when(first_gw, None).otherwise(F.sum("goal_diff").over(rolling_window)),
        "rolling_games_played": F.when(first_gw, None).otherwise(F.count("fixture_key").over(rolling_window)),
        "avg_team_expected_goals": F.when(first_gw, None).otherwise(F.round(F.col("rolling_team_expected_goals") / F.col("rolling_games_played"), 3)),
        "avg_team_expected_assists": F.when(first_gw, None).otherwise(F.round(F.col("rolling_team_expected_assists") / F.col("rolling_games_played"), 3)),
        "avg_team_expected_goal_involvements": F.when(first_gw, None).otherwise(F.round(F.col("rolling_team_expected_goal_involvements") / F.col("rolling_games_played"), 3)),
        "avg_expected_goals_against": F.when(first_gw, None).otherwise(F.round(F.col("rolling_expected_goals_against") / F.col("rolling_games_played"), 3)),
        "avg_expected_assists_against": F.when(first_gw, None).otherwise(F.round(F.col("rolling_expected_assists_against") / F.col("rolling_games_played"), 3)),
        "avg_expected_goal_involvements_against": F.when(first_gw, None).otherwise(F.round(F.col("rolling_expected_goal_involvements_against") / F.col("rolling_games_played"), 3)),
        "avg_goal_difference": F.when(first_gw, None).otherwise(F.round(F.col("rolling_goal_difference") / F.col("rolling_games_played"), 3))
    }

    team_fixtures_df = team_fixtures_df.withColumns(rolling_exprs)

    #Join team metadata
    team_features_df = team_fixtures_df.join(
        teams_df.select("team_key", "team_name", "team_name_short", "is_promoted", "is_relegated", "season_key"),
        on=["team_key", "season_key"],
        how="left"
    )

    #Select final columns
    team_features_df = team_features_df.select(
        "team_key", "team_name", "team_name_short", "season_key", "gameweek_key", "fixture_key",
        "is_home", "goals_for", "goals_against", "goal_diff", "match_points",
        "team_expected_goals", "team_expected_assists", "team_expected_goal_involvements",
        "expected_goals_against", "expected_assists_against", "expected_goal_involvements_against",
        "team_exp_stats_available",
        "rolling_points", "home_rolling_points", "away_rolling_points",
        "rolling_team_expected_goals", "rolling_team_expected_assists", "rolling_team_expected_goal_involvements",
        "rolling_expected_goals_against", "rolling_expected_assists_against", "rolling_expected_goal_involvements_against",
        "rolling_goal_difference", "rolling_games_played",
        "avg_team_expected_goals", "avg_team_expected_assists", "avg_team_expected_goal_involvements",
        "avg_expected_goals_against", "avg_expected_assists_against", "avg_expected_goal_involvements_against",
        "avg_goal_difference",
        "is_promoted", "is_relegated"
    )

    return team_features_df

In [0]:
team_features = build_team_features(
    gameweek_stats_df = player_recent_df,
    rolling_window_size = rolling_window_size,
    silver_schema = silver_schema,
    ignore_current_gameweek = True
)

display(team_features)

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-8287318452954709>, line 8[0m
[1;32m      1[0m team_features [38;5;241m=[39m build_team_features(
[1;32m      2[0m     gameweek_stats_df [38;5;241m=[39m player_recent_df,
[1;32m      3[0m     rolling_window_size [38;5;241m=[39m rolling_window_size,
[1;32m      4[0m     silver_schema [38;5;241m=[39m silver_schema,
[1;32m      5[0m     ignore_current_gameweek [38;5;241m=[39m [38;5;28;01mTrue[39;00m
[1;32m      6[0m )
[0;32m----> 8[0m display(team_features)

File [0;32m/databricks/python_shell/lib/dbruntime/display.py:133[0m, in [0;36mDisplay.display[0;34m(self, input, *args, **kwargs)[0m
[1;32m    131[0m     [38;5;28;01mpass[39;00m
[1;32m    132[0m [38;5;28;01melif[39;00m [38;5;28mself[39m[38;5;241m.[39m_cf_helper [38;5;129;01mis[39;00m [38;5;129;01mn