In [0]:
from pyspark.sql import functions as F, types as T
from pyspark.sql.window import Window
from delta.tables import DeltaTable
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.types import *
from functools import reduce

#Functions

In [0]:
def write_to_table(
    df: DataFrame,
    table_name: str,
    mode: str = "overwrite",
    merge_schema: bool = False,
    partition_by: list[str] = None,
    path: str = None,
    save_as_table: bool = True
) -> None:
    """
    Generalised Delta write helper for bronze layer.

    Parameters:
    - df (DataFrame): Spark DataFrame to write.
    - table_name (str): Name of the Delta table (used if save_as_table=True).
    - mode (str): Write mode ('overwrite', 'append', 'ignore', 'error', etc.).
    - merge_schema (bool): Whether to merge schema on write.
    - partition_by (list[str], optional): List of columns to partition by.
    - path (str, optional): Path to save the Delta table (used if save_as_table=False).
    - save_as_table (bool): If True, saves as managed table; else saves to path.

    Raises:
    - ValueError: If neither save_as_table nor path is properly specified.
    """

    df_with_ts = df.withColumn("last_updated", F.current_timestamp())

    writer = df_with_ts.write.format("delta").mode(mode)

    if merge_schema:
        writer = writer.option("mergeSchema", "true")
    elif mode == "overwrite":
        writer = writer.option("overwriteSchema", "true")

    if partition_by:
        writer = writer.partitionBy(*partition_by)

    if save_as_table:
        writer.saveAsTable(table_name)
    elif path:
        writer.save(path)
    else:
        raise ValueError("Either save_as_table must be True or a path must be provided.")

#Variables

In [0]:
try:
    ENV = dbutils.widgets.get("ENV")
except Exception:
    ENV = "dev"

try:
    PROTOCOL = dbutils.widgets.get("PROTOCOL")
except Exception:
    PROTOCOL = "HIST"

#ensure valid ENV and PROTOCOL
valid_envs = {"dev", "test", "prod"}
valid_protocols = {"HIST", "INCR"}

# Validate ENV
if ENV not in valid_envs:
    print(f"Invalid ENV: {ENV}. Must be one of {valid_envs}. Exiting notebook.")
    dbutils.notebook.exit("Invalid ENV")

# Validate PROTOCOL
if PROTOCOL not in valid_protocols:
    print(f"Invalid PROTOCOL: {PROTOCOL}. Must be one of {valid_protocols}. Exiting notebook.")
    dbutils.notebook.exit("Invalid PROTOCOL")
    
silver_schema = f"fpl_silver_{ENV}"
feature_schema = f"fpl_feature_{ENV}"
rolling_window_size = 5 

#Load Source Tables

In [0]:
stats_df = spark.table(f"{silver_schema}.gameweek_stats")

#Team Feature Engineering

In [0]:
def build_team_features(
    gameweek_stats_df,
    rolling_window_size,
    silver_schema,
    training = True
):    
    if training:
        fixtures_df = spark.read.table(f"{silver_schema}.fixtures").filter(F.col("home_team_score").isNotNull())
    else: 
        fixtures_df = spark.read.table(f"{silver_schema}.fixtures")
    teams_df = spark.read.table(f"{silver_schema}.teams")

    #Aggregate xG, xA, and exp_stats_available per team per fixture
    team_xg_xa_df = gameweek_stats_df.groupBy(
        "fixture_key", "team_key"
        ).agg(
            F.sum("expected_goals").alias("team_expected_goals"),
            F.sum("expected_assists").alias("team_expected_assists"),
            F.max("exp_stats_available").alias("team_exp_stats_available")
        ).withColumn(
            "team_expected_goal_involvements",
            F.col("team_expected_goals") + F.col("team_expected_assists")
        )

    #Create opponent xG/xA aggregates
    opponent_xg_xa_df = team_xg_xa_df.select(
            "fixture_key",
            F.col("team_key").alias("opponent_team_key"),
            F.col("team_expected_goals").alias("expected_goals_against"),
            F.col("team_expected_assists").alias("expected_assists_against"),
            F.col("team_expected_goal_involvements").alias("expected_goal_involvements_against")
        )

    #Transform fixtures into team-level records
    home_df = fixtures_df.select(
            "fixture_key",
            "season_key",
            "gameweek_key",
            F.col("home_team_key").alias("team_key"),
            F.col("away_team_key").alias("opponent_team_key"),
            F.lit(True).alias("is_home"),
            F.col("home_team_score").alias("goals_for"),
            F.col("away_team_score").alias("goals_against")
        )

    away_df = fixtures_df.select(
            "fixture_key",
            "season_key",
            "gameweek_key",
            F.col("away_team_key").alias("team_key"),
            F.col("home_team_key").alias("opponent_team_key"),
            F.lit(False).alias("is_home"),
            F.col("away_team_score").alias("goals_for"),
            F.col("home_team_score").alias("goals_against")
        )

    team_fixtures_df = home_df.unionByName(away_df)

    # Window specs for rolling metrics up to previous GW
    base_window = Window.partitionBy("team_key", "season_key").orderBy("gameweek_key")

    if training:
        rolling_window = base_window.rowsBetween(-rolling_window_size, -1)
    else:
        rolling_window = base_window

    first_gw = F.col("gameweek_key") == 1
    
    #Add match-level metrics
    team_fixtures_df = team_fixtures_df.withColumns({
        "goal_diff": F.col("goals_for") - F.col("goals_against"),
        "match_points": F.when(F.col("goals_for") > F.col("goals_against"), F.lit(3))
                        .when(F.col("goals_for") == F.col("goals_against"), F.lit(1))
                        .otherwise(F.lit(0))
    })

    #Join team xG/xA and opponent xG/xA
    team_fixtures_df = team_fixtures_df.join(
        team_xg_xa_df, 
        on=["fixture_key", "team_key"], 
        how="left"
        ).join(
            opponent_xg_xa_df, 
            on=["fixture_key", "opponent_team_key"], 
            how="left"
        )

    # Rolling metrics expressions
    rolling_exprs = {
        "rolling_points": F.when(first_gw, None).otherwise(F.sum("match_points").over(rolling_window)),
        "home_rolling_points": F.when(first_gw, None).otherwise(F.sum(F.when(F.col("is_home"), F.col("match_points")).otherwise(0)).over(rolling_window)),
        "away_rolling_points": F.when(first_gw, None).otherwise(F.sum(F.when(~F.col("is_home"), F.col("match_points")).otherwise(0)).over(rolling_window)),
        "rolling_team_expected_goals": F.when(first_gw, None).otherwise(F.sum("team_expected_goals").over(rolling_window)),
        "rolling_team_expected_assists": F.when(first_gw, None).otherwise(F.sum("team_expected_assists").over(rolling_window)),
        "rolling_team_expected_goal_involvements": F.when(first_gw, None).otherwise(F.sum("team_expected_goal_involvements").over(rolling_window)),
        "rolling_expected_goals_against": F.when(first_gw, None).otherwise(F.sum("expected_goals_against").over(rolling_window)),
        "rolling_expected_assists_against": F.when(first_gw, None).otherwise(F.sum("expected_assists_against").over(rolling_window)),
        "rolling_expected_goal_involvements_against": F.when(first_gw, None).otherwise(F.sum("expected_goal_involvements_against").over(rolling_window)),
        "rolling_goal_difference": F.when(first_gw, None).otherwise(F.sum("goal_diff").over(rolling_window)),
        "rolling_games_played": F.when(first_gw, None).otherwise(F.count("fixture_key").over(rolling_window)),
        "avg_team_expected_goals": F.when(first_gw, None).otherwise(F.round(F.col("rolling_team_expected_goals") / F.col("rolling_games_played"), 3)),
        "avg_team_expected_assists": F.when(first_gw, None).otherwise(F.round(F.col("rolling_team_expected_assists") / F.col("rolling_games_played"), 3)),
        "avg_team_expected_goal_involvements": F.when(first_gw, None).otherwise(F.round(F.col("rolling_team_expected_goal_involvements") / F.col("rolling_games_played"), 3)),
        "avg_expected_goals_against": F.when(first_gw, None).otherwise(F.round(F.col("rolling_expected_goals_against") / F.col("rolling_games_played"), 3)),
        "avg_expected_assists_against": F.when(first_gw, None).otherwise(F.round(F.col("rolling_expected_assists_against") / F.col("rolling_games_played"), 3)),
        "avg_expected_goal_involvements_against": F.when(first_gw, None).otherwise(F.round(F.col("rolling_expected_goal_involvements_against") / F.col("rolling_games_played"), 3)),
        "avg_goal_difference": F.when(first_gw, None).otherwise(F.round(F.col("rolling_goal_difference") / F.col("rolling_games_played"), 3))
    }

    team_fixtures_df = team_fixtures_df.withColumns(rolling_exprs)

    #Join team metadata
    team_features_df = team_fixtures_df.join(
        teams_df.select("team_key", "team_name", "team_name_short", "is_promoted", "is_relegated", "season_key"),
        on=["team_key", "season_key"],
        how="left"
    )

    #Select final columns
    team_features_df = team_features_df.select(
        "team_key", "team_name", "team_name_short", "season_key", "gameweek_key", "fixture_key",
        "is_home", "goals_for", "goals_against", "goal_diff", "match_points",
        "team_expected_goals", "team_expected_assists", "team_expected_goal_involvements",
        "expected_goals_against", "expected_assists_against", "expected_goal_involvements_against",
        "team_exp_stats_available",
        "rolling_points", "home_rolling_points", "away_rolling_points",
        "rolling_team_expected_goals", "rolling_team_expected_assists", "rolling_team_expected_goal_involvements",
        "rolling_expected_goals_against", "rolling_expected_assists_against", "rolling_expected_goal_involvements_against",
        "rolling_goal_difference", "rolling_games_played",
        "avg_team_expected_goals", "avg_team_expected_assists", "avg_team_expected_goal_involvements",
        "avg_expected_goals_against", "avg_expected_assists_against", "avg_expected_goal_involvements_against",
        "avg_goal_difference",
        "is_promoted", "is_relegated"
    )

    return team_features_df

In [0]:
team_features_df = build_team_features(
    gameweek_stats_df = stats_df,
    rolling_window_size = rolling_window_size,
    silver_schema = silver_schema,
    training = True
)

write_to_table(
    df=team_features_df,
    table_name=f"{feature_schema}.team_features_backup",
    mode="overwrite"
)

display(team_features_df)

team_key,team_name,team_name_short,season_key,gameweek_key,fixture_key,is_home,goals_for,goals_against,goal_diff,match_points,team_expected_goals,team_expected_assists,team_expected_goal_involvements,expected_goals_against,expected_assists_against,expected_goal_involvements_against,team_exp_stats_available,rolling_points,home_rolling_points,away_rolling_points,rolling_team_expected_goals,rolling_team_expected_assists,rolling_team_expected_goal_involvements,rolling_expected_goals_against,rolling_expected_assists_against,rolling_expected_goal_involvements_against,rolling_goal_difference,rolling_games_played,avg_team_expected_goals,avg_team_expected_assists,avg_team_expected_goal_involvements,avg_expected_goals_against,avg_expected_assists_against,avg_expected_goal_involvements_against,avg_goal_difference,is_promoted,is_relegated
1,Manchester United,MUN,201617,20161701,201617009,False,3,1,2,3,,,,,,,False,,,,,,,,,,,0,,,,,,,,False,False
1,Manchester United,MUN,201617,20161702,201617011,True,2,0,2,3,,,,,,,False,3.0,0.0,3.0,,,,,,,2.0,1,,,,,,,2.0,False,False
1,Manchester United,MUN,201617,20161703,201617024,False,1,0,1,3,,,,,,,False,6.0,3.0,3.0,,,,,,,4.0,2,,,,,,,2.0,False,False
1,Manchester United,MUN,201617,20161704,201617035,True,1,2,-1,0,,,,,,,False,9.0,3.0,6.0,,,,,,,5.0,3,,,,,,,1.667,False,False
1,Manchester United,MUN,201617,20161705,201617050,False,1,3,-2,0,,,,,,,False,9.0,3.0,6.0,,,,,,,4.0,4,,,,,,,1.0,False,False
1,Manchester United,MUN,201617,20161706,201617054,True,4,1,3,3,,,,,,,False,9.0,3.0,6.0,,,,,,,2.0,5,,,,,,,0.4,False,False
1,Manchester United,MUN,201617,20161707,201617065,True,1,1,0,1,,,,,,,False,9.0,6.0,3.0,,,,,,,3.0,5,,,,,,,0.6,False,False
1,Manchester United,MUN,201617,20161708,201617075,False,0,0,0,1,,,,,,,False,7.0,4.0,3.0,,,,,,,1.0,5,,,,,,,0.2,False,False
1,Manchester United,MUN,201617,20161709,201617084,False,0,4,-4,0,,,,,,,False,5.0,4.0,1.0,,,,,,,0.0,5,,,,,,,0.0,False,False
1,Manchester United,MUN,201617,20161710,201617093,True,0,0,0,1,,,,,,,False,5.0,4.0,1.0,,,,,,,-3.0,5,,,,,,,-0.6,False,False


#Player Features

In [0]:
# Prepare rolling window up to previous Gameweek per player per season
player_base_window = Window.partitionBy("player_key", "season_key", "team_key").orderBy("gameweek_key")
player_rolling_window = player_base_window.rowsBetween(-rolling_window_size, -1)

player_first_gw = F.row_number().over(player_base_window) == 1

# Calculate rolling player stats (including new FPL point features)
player_rolling_exprs = {
    "rolling_expected_goals": F.when(player_first_gw, None).otherwise(F.sum("expected_goals").over(player_rolling_window)),
    "rolling_expected_assists": F.when(player_first_gw, None).otherwise(F.sum("expected_assists").over(player_rolling_window)),
    "rolling_expected_goal_involvements": F.when(player_first_gw, None).otherwise(F.sum("expected_goal_involvements").over(player_rolling_window)),
    "rolling_goals_scored": F.when(player_first_gw, None).otherwise(F.sum("goals_scored").over(player_rolling_window)),
    "rolling_assists": F.when(player_first_gw, None).otherwise(F.sum("assists").over(player_rolling_window)),
    "rolling_total_points": F.when(player_first_gw, None).otherwise(F.sum("total_points").over(player_rolling_window)),
    "rolling_minutes": F.when(player_first_gw, None).otherwise(F.sum("minutes").over(player_rolling_window)),
    "rolling_clean_sheets": F.when(player_first_gw, None).otherwise(F.sum("clean_sheets").over(player_rolling_window)),
    "rolling_bps": F.when(player_first_gw, None).otherwise(F.sum("bps").over(player_rolling_window)),
    "rolling_ict_index": F.when(player_first_gw, None).otherwise(F.sum("ict_index").over(player_rolling_window)),
    "rolling_influence": F.when(player_first_gw, None).otherwise(F.sum("influence").over(player_rolling_window)),
    "rolling_creativity": F.when(player_first_gw, None).otherwise(F.sum("creativity").over(player_rolling_window)),
    "rolling_threat": F.when(player_first_gw, None).otherwise(F.sum("threat").over(player_rolling_window)),
    "rolling_defensive_contribution": F.when(player_first_gw, None).otherwise(F.sum("defensive_contribution").over(player_rolling_window)),
    "rolling_clearances_blocks_interceptions": F.when(player_first_gw, None).otherwise(F.sum("clearances_blocks_interceptions").over(player_rolling_window)),
    "rolling_bonus": F.when(player_first_gw, None).otherwise(F.sum("bonus").over(player_rolling_window)),
    # Position-specific: saves only for GK
    "rolling_saves": F.when(player_first_gw, None).otherwise(F.sum(F.when(F.col("position_key") == 1, F.col("saves")).otherwise(None)).over(player_rolling_window)),
    "rolling_games_played": F.when(player_first_gw, None).otherwise(F.count("fixture_key").over(player_rolling_window)),
    # Rolling FPL rule-based points
    "rolling_minutes_points": F.when(player_first_gw, None).otherwise(F.sum("minutes_points").over(player_rolling_window)),
    "rolling_assist_points": F.when(player_first_gw, None).otherwise(F.sum("assist_points").over(player_rolling_window)),
    "rolling_goal_points": F.when(player_first_gw, None).otherwise(F.sum("goal_points").over(player_rolling_window)),
    "rolling_clean_sheet_points": F.when(player_first_gw, None).otherwise(F.sum("clean_sheet_points").over(player_rolling_window)),
    "rolling_defensive_contribution_points": F.when(player_first_gw, None).otherwise(F.sum("defensive_contribution_points").over(player_rolling_window)),
    "rolling_penalty_miss_points": F.when(player_first_gw, None).otherwise(F.sum("penalty_miss_points").over(player_rolling_window)),
    "rolling_goals_conceded_points": F.when(player_first_gw, None).otherwise(F.sum("goals_conceded_points").over(player_rolling_window)),
    "rolling_yellow_card_points": F.when(player_first_gw, None).otherwise(F.sum("yellow_card_points").over(player_rolling_window)),
    "rolling_red_card_points": F.when(player_first_gw, None).otherwise(F.sum("red_card_points").over(player_rolling_window)),
    "rolling_own_goal_points": F.when(player_first_gw, None).otherwise(F.sum("own_goal_points").over(player_rolling_window))
}

player_features_df = stats_df.withColumns(player_rolling_exprs)

# Calculate rolling averages to 3 decimal places, set to null for first gameweek
player_avg_exprs = {
    f"avg_{k[8:]}": F.when(
        player_first_gw,
        None
    ).otherwise(
        F.round(
            F.when(
                F.col("rolling_games_played") != 0,
                F.col(k) / F.col("rolling_games_played")
            ).otherwise(None),
            3
        )
    )
    for k in player_rolling_exprs if k != "rolling_games_played"
}
player_features_df = player_features_df.withColumns(player_avg_exprs)

# Join with team_features for contextual strength and match_points
pf = player_features_df.alias("pf")
tf = team_features_df.alias("tf")

team_strength_cols = [
    F.col("team_key").alias("_team_key"), F.col("season_key").alias("team_season_key"), F.col("gameweek_key").alias("team_gamweek_key"),
    "rolling_points", "rolling_team_expected_goals",
    "rolling_expected_goals_against", "rolling_goal_difference",
    "avg_team_expected_goals", "avg_team_expected_assists", "avg_team_expected_goal_involvements",
    "avg_expected_goals_against", "avg_expected_assists_against", "avg_expected_goal_involvements_against",
    "avg_goal_difference", "match_points"
]

tf = tf.select(*team_strength_cols)

pf = pf.join(
    tf,
    (pf["team_key"] == tf["_team_key"]) &
    (pf["season_key"] == tf["team_season_key"]) &
    (pf["gameweek_key"] == tf["team_gamweek_key"]),
    how="left"
).drop("_team_key", "team_season_key", "team_gamweek_key")

# Join opponent team strength features
tf_opp = team_features_df.alias("tf_opp")
opponent_team_strength_cols = [
    F.col("team_key").alias("_opponent_team_key"),
    F.col("season_key").alias("opponent_season_key"),
    F.col("gameweek_key").alias("opponent_gameweek_key"),
    F.col("rolling_points").alias("opponent_rolling_points"),
    F.col("rolling_team_expected_goals").alias("opponent_rolling_team_expected_goals"),
    F.col("rolling_expected_goals_against").alias("opponent_rolling_expected_goals_against"),
    F.col("rolling_goal_difference").alias("opponent_rolling_goal_difference"),
    F.col("avg_team_expected_goals").alias("opponent_avg_team_expected_goals"),
    F.col("avg_team_expected_assists").alias("opponent_avg_team_expected_assists"),
    F.col("avg_team_expected_goal_involvements").alias("opponent_avg_team_expected_goal_involvements"),
    F.col("avg_expected_goals_against").alias("opponent_avg_expected_goals_against"),
    F.col("avg_expected_assists_against").alias("opponent_avg_expected_assists_against"),
    F.col("avg_expected_goal_involvements_against").alias("opponent_avg_expected_goal_involvements_against"),
    F.col("avg_goal_difference").alias("opponent_avg_goal_difference")
]

tf_opp = team_features_df.select(*opponent_team_strength_cols)

pf = pf.join(
    tf_opp,
    (pf["opponent_team_key"] == tf_opp["_opponent_team_key"]) &
    (pf["season_key"] == tf_opp["opponent_season_key"]) &
    (pf["gameweek_key"] == tf_opp["opponent_gameweek_key"]),
    how="left"
).drop("_opponent_team_key", "opponent_season_key", "opponent_gameweek_key")

# For GK/DEF/MID, add team defensive rolling stats
player_features_df = pf.withColumns({
    "team_rolling_goals_conceded": F.when(F.col("position_key").isin([1,2,3]), F.col("rolling_expected_goals_against")).otherwise(None),
    "team_rolling_goal_difference": F.when(F.col("position_key").isin([1,2,3]), F.col("rolling_goal_difference")).otherwise(None)
})

# Add ratios of team share
player_features_df = player_features_df.withColumns({
    "player_share_of_team_xG": F.round(
        F.when(
            F.col("rolling_team_expected_goals") != 0,
            F.col("rolling_expected_goals") / F.col("rolling_team_expected_goals")
        ).otherwise(None),
        3
    ),
    "player_share_of_team_points": F.round(
        F.when(
            F.col("rolling_points") != 0,
            F.col("rolling_total_points") / F.col("rolling_points")
        ).otherwise(None),
        3
    )
})

# Select final columns, including raw stats from stats_df and match_points from team_features
raw_stats_cols = [
    "fixture_key", "player_id", "player_key", "player_season_key", "player_fixture_key", "team_key", "season_key", "gameweek_key", "position_key", "opponent_team_key", "was_home", "exp_stats_available", "def_con_available", "total_points"
]

final_cols = [
    *raw_stats_cols,
    # Rolling stats
    *list(player_rolling_exprs.keys()),
    # Rolling averages
    *list(player_avg_exprs.keys()),
    # Team/contextual features
    "rolling_points", "rolling_team_expected_goals", "rolling_expected_goals_against", "rolling_goal_difference",
    "avg_team_expected_goals", "avg_team_expected_assists", "avg_team_expected_goal_involvements",
    "avg_expected_goals_against", "avg_expected_assists_against", "avg_expected_goal_involvements_against",
    "avg_goal_difference",
    "match_points",
    "team_rolling_goals_conceded", "team_rolling_goal_difference",
    "player_share_of_team_xG", "player_share_of_team_points",
    # Opponent team strength features
    "opponent_rolling_points", "opponent_rolling_team_expected_goals",
    "opponent_rolling_expected_goals_against", "opponent_rolling_goal_difference",
    "opponent_avg_team_expected_goals", "opponent_avg_team_expected_assists", "opponent_avg_team_expected_goal_involvements",
    "opponent_avg_expected_goals_against", "opponent_avg_expected_assists_against", "opponent_avg_expected_goal_involvements_against",
    "opponent_avg_goal_difference"
]

player_features_df = player_features_df.select(*final_cols)

player_features_df = player_features_df.withColumns({
    "was_home": F.col("was_home").cast("int"),
    "exp_stats_available": F.col("exp_stats_available").cast("int"),
    "def_con_available": F.col("def_con_available").cast("int")
})

write_to_table(
    df=player_features_df,
    table_name=f"{feature_schema}.player_features",
    mode="overwrite"
)