In [0]:
from pyspark.sql import functions as F, types as T
from delta.tables import DeltaTable
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.types import *
from functools import reduce

#Functions


In [0]:
def write_to_table(
    df: DataFrame,
    table_name: str,
    mode: str = "overwrite",
    merge_schema: bool = True,
    partition_by: list[str] = None,
    path: str = None,
    save_as_table: bool = True
) -> None:
    """
    Write a DataFrame in Delta format, stamping every row with ``last_updated``.

    The output goes to the table ``table_name`` when ``save_as_table`` is True,
    otherwise to the location given by ``path``.

    Parameters:
    - df (DataFrame): Spark DataFrame to write.
    - table_name (str): Name passed to ``saveAsTable`` (only used when save_as_table=True).
    - mode (str): Save mode handed straight to the writer ('overwrite', 'append', ...).
    - merge_schema (bool): If True the ``mergeSchema`` option is set. If False and
      mode is 'overwrite', the ``overwriteSchema`` option is set instead.
    - partition_by (list[str], optional): Columns to partition the output by.
    - path (str, optional): Location passed to ``save`` (only used when save_as_table=False).
    - save_as_table (bool): Chooses ``saveAsTable(table_name)`` over ``save(path)``.

    Raises:
    - ValueError: If save_as_table is False and no path is given.
    """
    stamped_df = df.withColumn("last_updated", F.current_timestamp())
    delta_writer = stamped_df.write.format("delta").mode(mode)

    # At most one schema option is applied; mergeSchema wins when requested.
    schema_option = None
    if merge_schema:
        schema_option = "mergeSchema"
    elif mode == "overwrite":
        schema_option = "overwriteSchema"
    if schema_option is not None:
        delta_writer = delta_writer.option(schema_option, "true")

    if partition_by:
        delta_writer = delta_writer.partitionBy(*partition_by)

    # Nothing has been written yet at this point, so a missing destination
    # fails without touching storage.
    if not save_as_table and not path:
        raise ValueError("Either save_as_table must be True or a path must be provided.")

    if save_as_table:
        delta_writer.saveAsTable(table_name)
    else:
        delta_writer.save(path)

In [0]:
def normalise_season_keys(season_start: int) -> dict:
    """
    Derive the string identifiers used for a season from its starting year.

    Parameters:
    - season_start (int): Year in which the season starts, e.g. 2016.

    Returns:
    - dict: three string entries, in this order:
        - "season_key": full start year + last two digits of the next year ("201617")
        - "season_table_suffix": two-digit years joined by "_" ("16_17")
        - "season_short": two-digit years concatenated ("1617")
    """
    # Last two characters of the start year and of the following year.
    start_short, end_short = (
        str(year)[-2:] for year in (season_start, season_start + 1)
    )

    return dict(
        season_key=f"{season_start}{end_short}",
        season_table_suffix=f"{start_short}_{end_short}",
        season_short=f"{start_short}{end_short}",
    )


#Variables

In [0]:
# Values each notebook parameter is allowed to take.
valid_envs = {"dev", "test", "prod"}
valid_protocols = {"HIST", "INCR"}

# Read each parameter from its widget; if the lookup raises for any reason,
# fall back to the default value.
try:
    ENV = dbutils.widgets.get("ENV")
except Exception:
    ENV = "dev"

try:
    PROTOCOL = dbutils.widgets.get("PROTOCOL")
except Exception:
    PROTOCOL = "HIST"

# Report and exit the notebook when a parameter is outside its allowed set.
if ENV not in valid_envs:
    print(f"Invalid ENV: {ENV}. Must be one of {valid_envs}. Exiting notebook.")
    dbutils.notebook.exit("Invalid ENV")

if PROTOCOL not in valid_protocols:
    print(f"Invalid PROTOCOL: {PROTOCOL}. Must be one of {valid_protocols}. Exiting notebook.")
    dbutils.notebook.exit("Invalid PROTOCOL")

# Schema names are suffixed with the environment.
bronze_schema, silver_schema = f"fpl_bronze_{ENV}", f"fpl_silver_{ENV}"

# Season identifiers in "YY_YY" form.
CURRENT_SEASON = "25_26"
API_SEASONS = ["25_26"]

#Define Stats Schema



In [0]:
# Canonical column names and Spark types for per-gameweek player stats.
# transform_gameweek_stats_df iterates this mapping to add/cast columns and
# selects its keys in insertion order, so the order here is the output order.
# Each key must appear exactly once: a repeated key in a dict literal is
# silently collapsed, with the later value replacing the earlier one.
canonical_schema = {
    "element": IntegerType(),
    "assists": IntegerType(),
    "bonus": IntegerType(),
    "bps": IntegerType(),
    "clean_sheets": IntegerType(),
    "clearances_blocks_interceptions": IntegerType(),
    "creativity": DoubleType(),
    "defensive_contribution": IntegerType(),
    "expected_assists": DoubleType(),
    "expected_goal_involvements": DoubleType(),
    "expected_goals": DoubleType(),
    "expected_goals_conceded": DoubleType(),
    "fixture": IntegerType(),
    "goals_conceded": IntegerType(),
    "goals_scored": IntegerType(),
    "ict_index": DoubleType(),
    "influence": DoubleType(),
    "minutes": IntegerType(),
    "own_goals": IntegerType(),
    "penalties_missed": IntegerType(),
    "penalties_saved": IntegerType(),
    "recoveries": IntegerType(),
    "red_cards": IntegerType(),
    "saves": IntegerType(),
    "starts": IntegerType(),
    "tackles": IntegerType(),
    "threat": DoubleType(),
    "total_points": IntegerType(),
    "value": IntegerType(),
    "was_home": BooleanType(),
    "yellow_cards": IntegerType()
}

In [0]:
def transform_gameweek_stats_df(bronze_schema: str, silver_schema: str, season: int) -> DataFrame:
    """
    Build the conformed per-gameweek player stats DataFrame for one season.

    Reads ``{bronze_schema}.player_gameweek_stats_{YY_YY}``, conforms it to
    ``canonical_schema`` (columns missing from the source are added as typed
    nulls, columns present are cast), renames ``element`` to ``player_id``,
    then left-joins ``{silver_schema}.players`` and ``{silver_schema}.fixtures``
    to attach player and fixture attributes.

    Parameters:
    - bronze_schema (str): Schema holding the per-season raw stats tables.
    - silver_schema (str): Schema holding the ``players`` and ``fixtures`` tables.
    - season (int): Year in which the season starts, e.g. 2016.

    Returns:
    - DataFrame: the canonical stat columns plus ``exp_stats_available``,
      ``def_con_available``, ``season_key``, ``player_season_key``,
      ``fixture_key``, the player columns ``team_key`` / ``player_key`` /
      ``position_key``, and the derived ``opponent_team_key``, ``team_score``
      and ``opponent_score``. The three derived columns are NULL when
      ``was_home`` is NULL.
    """
    season_keys = normalise_season_keys(season)
    season_suffix = season_keys["season_table_suffix"]
    season_key = season_keys["season_key"]

    source_df = spark.table(f"{bronze_schema}.player_gameweek_stats_{season_suffix}")
    source_columns = set(source_df.columns)

    # Conform to the canonical schema in ONE projection. Chaining withColumn
    # once per column adds a projection to the plan for every call.
    canonical_columns = [
        (
            source_df[name].cast(dtype)
            if name in source_columns
            else F.lit(None).cast(dtype)
        ).alias(name)
        for name, dtype in canonical_schema.items()
    ]

    stats_df = source_df.select(
        *canonical_columns,
        # Availability flags record whether the source table carried these stats.
        F.lit("expected_goals" in source_columns).alias("exp_stats_available"),
        F.lit("defensive_contribution" in source_columns).alias("def_con_available"),
        F.lit(season_key).cast("int").alias("season_key"),
    ).withColumnRenamed("element", "player_id")

    # Player lookup key: season_key followed by the zero-padded player id.
    # NOTE(review): lpad(..., 3, "0") also truncates values longer than three
    # characters, so ids >= 1000 would collide. The width presumably has to
    # match how silver.players builds player_season_key; confirm before changing.
    stats_df = stats_df.withColumn(
        "player_season_key",
        F.concat(
            F.col("season_key").cast("string"),
            F.lpad(F.col("player_id").cast("string"), 3, "0"),
        ).cast("int"),
    )

    players_df = spark.table(f"{silver_schema}.players")
    with_players_df = stats_df.join(
        players_df.select("player_season_key", "team_key", "player_key", "position_key"),
        on="player_season_key",
        how="left",
    )

    # Fixture lookup key: season_key followed by the zero-padded fixture number
    # (left as a string, unlike player_season_key).
    with_players_df = with_players_df.withColumn(
        "fixture_key",
        F.concat(
            F.col("season_key").cast("string"),
            F.lpad(F.col("fixture").cast("string"), 3, "0"),
        ),
    )

    fixtures_df = spark.table(f"{silver_schema}.fixtures")
    with_fixtures_df = with_players_df.join(
        fixtures_df.select(
            "fixture_key", "home_team_key", "away_team_key", "home_team_score", "away_team_score"
        ),
        on="fixture_key",
        how="left",
    )

    is_home = F.col("was_home")

    def _pick(when_home: str, when_away: str):
        # Both branches are explicit so a NULL was_home yields NULL instead of
        # silently being treated as an away appearance.
        return F.when(is_home, F.col(when_home)).when(~is_home, F.col(when_away))

    return with_fixtures_df.withColumns(
        {
            "opponent_team_key": _pick("away_team_key", "home_team_key"),
            "team_score": _pick("home_team_score", "away_team_score"),
            "opponent_score": _pick("away_team_score", "home_team_score"),
        }
    ).drop(
        "home_team_key", "away_team_key", "home_team_score", "away_team_score"
    )

# Build one transformed DataFrame per season start year.
all_player_stats_df = []
for season in range(2016, 2026): # 2016 to 2025 inclusive (range stop is exclusive)

    df = transform_gameweek_stats_df(bronze_schema, silver_schema, season)
    all_player_stats_df.append(df)

# NOTE(review): this hands display() a Python list of DataFrames rather than a
# single DataFrame, and the following cell rebuilds the same list before
# unioning it. This looks like leftover debugging; confirm whether it is needed.
display(all_player_stats_df)

In [0]:
# Transform each season, identified by its start year (2016 through 2025).
all_player_stats_df = []
for season in range(2016, 2026):  # stop is exclusive, so 2025 is the last start year
    df = transform_gameweek_stats_df(bronze_schema, silver_schema, season)
    all_player_stats_df.append(df)

# Stack the per-season frames into a single frame, matching columns by name.
player_stats_df = reduce(
    lambda stacked, season_df: stacked.unionByName(season_df),
    all_player_stats_df,
)

display(player_stats_df)

# TODO: handle both HIST and INCR protocols
# TODO (next step): add running averages for ML
# TODO: add positions table


fixture_key,player_season_key,player_id,assists,bonus,bps,clean_sheets,clearances_blocks_interceptions,creativity,defensive_contribution,expected_assists,expected_goal_involvements,expected_goals,expected_goals_conceded,fixture,goals_conceded,goals_scored,ict_index,influence,minutes,own_goals,penalties_missed,penalties_saved,recoveries,red_cards,saves,starts,tackles,threat,total_points,value,was_home,yellow_cards,exp_stats_available,def_con_available,season_key,team_key,player_key,position_key,opponent_team_key,team_score,opponent_score
201617010,201617454,454,0,0,0,0,0,0.0,,,,,,10,0,0,0.0,0.0,0,0,0,0,0,0,0,,0,0.0,0,55,False,0,False,False,201617,21,55459,2,8,1,2
201617003,201617142,142,0,0,6,0,1,0.3,,,,,,3,0,0,0.9,8.2,15,0,0,0,1,0,0,,2,0.0,1,60,True,0,False,False,201617,11,17349,3,6,1,1
201617008,201617016,16,0,0,5,0,2,4.9,,,,,,8,3,0,3.0,2.2,60,0,0,0,2,0,0,,0,23.0,2,80,True,0,False,False,201617,3,41792,3,14,3,4
201617007,201617482,482,0,0,0,0,0,0.0,,,,,,7,0,0,0.0,0.0,0,0,0,0,0,0,0,,0,0.0,0,50,False,0,False,False,201617,57,121599,3,20,1,1
201617010,201617080,80,0,0,0,0,0,0.0,,,,,,10,0,0,0.0,0.0,0,0,0,0,0,0,0,,0,0.0,0,55,True,0,False,False,201617,8,118335,2,21,2,1
201617004,201617163,163,1,0,10,0,0,12.2,,,,,,4,1,0,5.7,14.4,90,0,0,0,1,0,0,,0,30.0,5,60,True,0,False,False,201617,88,59856,4,13,2,1
201617004,201617164,164,0,2,29,0,3,16.8,,,,,,4,1,1,10.7,45.2,90,0,0,0,6,0,0,,0,45.0,8,45,True,0,False,False,201617,88,111847,4,13,2,1
201617006,201617283,283,0,0,6,0,4,2.2,,,,,,6,1,0,1.4,3.2,90,0,0,0,2,0,0,,0,9.0,2,45,True,0,False,False,201617,25,59044,3,110,1,1
201617009,201617030,30,0,0,0,0,0,0.0,,,,,,9,0,0,0.0,0.0,0,0,0,0,0,0,0,,0,0.0,0,45,True,0,False,False,201617,91,15885,1,1,1,3
201617006,201617286,286,0,0,3,0,3,1.3,,,,,,6,1,0,0.3,2.0,69,0,0,0,4,0,0,,0,0.0,1,45,True,1,False,False,201617,25,80179,3,110,1,1
