In [0]:
from pyspark.sql import functions as F, types as T
from delta.tables import DeltaTable
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.types import *
from functools import reduce

# Functions


In [0]:
def write_to_table(
    df: DataFrame,
    table_name: str,
    mode: str = "overwrite",
    merge_schema: bool = True,
    partition_by: list[str] = None,
    path: str = None,
    save_as_table: bool = True
) -> None:
    """
    Writes a DataFrame in Delta format, adding a `last_updated` timestamp column.

    Parameters:
    - df (DataFrame): Spark DataFrame to write.
    - table_name (str): Table name passed to saveAsTable (used if save_as_table=True).
    - mode (str): Spark write mode, e.g. 'overwrite' or 'append'.
    - merge_schema (bool): If True, sets the `mergeSchema` option. If False and
      mode is 'overwrite', sets `overwriteSchema` instead.
    - partition_by (list[str], optional): Columns to partition the output by.
    - path (str, optional): Target path (used only if save_as_table=False).
    - save_as_table (bool): If True, writes with saveAsTable; otherwise saves to `path`.

    Raises:
    - ValueError: If save_as_table is False and no path is given.
    """
    stamped_df = df.withColumn("last_updated", F.current_timestamp())
    delta_writer = stamped_df.write.format("delta").mode(mode)

    # mergeSchema wins when requested; overwriteSchema is only used for
    # overwrites that have schema merging switched off.
    schema_option = None
    if merge_schema:
        schema_option = "mergeSchema"
    elif mode == "overwrite":
        schema_option = "overwriteSchema"

    if schema_option is not None:
        delta_writer = delta_writer.option(schema_option, "true")

    if partition_by:
        delta_writer = delta_writer.partitionBy(*partition_by)

    if save_as_table:
        delta_writer.saveAsTable(table_name)
        return

    if not path:
        raise ValueError("Either save_as_table must be True or a path must be provided.")

    delta_writer.save(path)

In [0]:
def merge_to_table(
    df: DataFrame,
    table_name: str,
    merge_condition: str,
    spark: SparkSession,
    partition_by: list[str] = None
) -> None:
    """
    Upserts a DataFrame into a Delta table, creating the table on first use.

    Parameters:
    - df (DataFrame): Incoming rows; a `last_updated` timestamp column is added.
    - table_name (str): Target Delta table name.
    - merge_condition (str): SQL match condition. The target is aliased as
      `target` and the incoming rows as `source`.
    - spark (SparkSession): Session used for the catalog lookup and the merge.
    - partition_by (list[str], optional): Partition columns, used only when the
      table has to be created.

    Matched rows are updated with all source columns; unmatched source rows are
    inserted.
    """
    incoming_df = df.withColumn("last_updated", F.current_timestamp())

    # First load: there is nothing to merge into, so create the table instead.
    if not spark.catalog.tableExists(table_name):
        write_to_table(
            df=incoming_df,
            table_name=table_name,
            partition_by=partition_by
        )
        return

    target_table = DeltaTable.forName(spark, table_name).alias("target")
    merge_builder = target_table.merge(
        source=incoming_df.alias("source"),
        condition=merge_condition
    )
    merge_builder.whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()

In [0]:
def normalise_season_keys(season_start: int) -> dict:
    """
    Derives the season identifier strings from a season's start year.

    Parameters:
    - season_start (int): Year the season starts in, e.g. 2016.

    Returns:
    - dict with three string entries:
        - "season_key": start year followed by the last two digits of the
          following year, e.g. "201617".
        - "season_table_suffix": two-digit start and end years joined by an
          underscore, e.g. "16_17".
        - "season_short": two-digit start and end years concatenated, e.g. "1617".
    """
    start_text = str(season_start)
    start_two_digits = start_text[-2:]
    end_two_digits = str(season_start + 1)[-2:]

    return dict(
        season_key=start_text + end_two_digits,
        season_table_suffix="_".join((start_two_digits, end_two_digits)),
        season_short=start_two_digits + end_two_digits,
    )


# Variables

In [0]:
def _read_widget(name: str, default: str) -> str:
    """Return the value of notebook widget `name`, or `default` if it cannot be read."""
    try:
        return dbutils.widgets.get(name)
    except Exception:
        # Best-effort: any failure to read the widget falls back to the default.
        return default


# Run parameters: target environment and load protocol
ENV = _read_widget("ENV", "dev")
PROTOCOL = _read_widget("PROTOCOL", "INCR")

# Accepted values for the run parameters
valid_envs = {"dev", "test", "prod"}
valid_protocols = {"HIST", "INCR"}

# Stop the notebook when ENV is not an accepted value
if ENV not in valid_envs:
    print(f"Invalid ENV: {ENV}. Must be one of {valid_envs}. Exiting notebook.")
    dbutils.notebook.exit("Invalid ENV")

# Stop the notebook when PROTOCOL is not an accepted value
if PROTOCOL not in valid_protocols:
    print(f"Invalid PROTOCOL: {PROTOCOL}. Must be one of {valid_protocols}. Exiting notebook.")
    dbutils.notebook.exit("Invalid PROTOCOL")

# Environment-specific schema names and the start year of the current season
bronze_schema = f"fpl_bronze_{ENV}"
silver_schema = f"fpl_silver_{ENV}"
CURRENT_SEASON_START = 2025

# Define Stats Schema



In [0]:
# Target column names and Spark types for the per-gameweek player stats.
# Each key must appear exactly once: a repeated key in a dict literal is
# silently collapsed, with the later value replacing the earlier one.
canonical_schema = {
    "element": IntegerType(),
    "assists": IntegerType(),
    "bonus": IntegerType(),
    "bps": IntegerType(),
    "clean_sheets": IntegerType(),
    "clearances_blocks_interceptions": IntegerType(),
    "creativity": DoubleType(),
    "defensive_contribution": IntegerType(),
    "expected_assists": DoubleType(),
    "expected_goal_involvements": DoubleType(),
    "expected_goals": DoubleType(),
    "expected_goals_conceded": DoubleType(),
    "fixture": IntegerType(),
    "goals_conceded": IntegerType(),
    "goals_scored": IntegerType(),
    "ict_index": DoubleType(),
    "influence": DoubleType(),
    "minutes": IntegerType(),
    "own_goals": IntegerType(),
    "penalties_missed": IntegerType(),
    "penalties_saved": IntegerType(),
    "recoveries": IntegerType(),
    "red_cards": IntegerType(),
    "saves": IntegerType(),
    "starts": IntegerType(),
    "tackles": IntegerType(),
    "threat": DoubleType(),
    "total_points": IntegerType(),
    "value": IntegerType(),
    "was_home": BooleanType(),
    "yellow_cards": IntegerType()
}

In [0]:
def transform_gameweek_stats_df(bronze_schema: str, silver_schema: str, season: int) -> DataFrame:
    """
    Builds the gameweek stats DataFrame for a single season.

    Reads the season's bronze `player_gameweek_stats_<suffix>` table, conforms it
    to `canonical_schema` (existing columns are cast, missing ones are added as
    nulls), then left-joins fixture and player attributes from the silver schema.

    Parameters:
    - bronze_schema (str): Schema containing the bronze player gameweek stats tables.
    - silver_schema (str): Schema containing the `fixtures` and `players` tables.
    - season (int): Season start year, e.g. 2016.

    Returns:
    - DataFrame: The canonical stat columns (with `element` renamed to `player_id`),
      the `exp_stats_available` / `def_con_available` flags, season, fixture and
      player keys, and team/opponent key and score columns. Both joins are left
      joins, so rows without a fixture or player match keep nulls in the joined
      columns.

    Relies on the notebook-level `spark` session, `canonical_schema` and
    `normalise_season_keys`.
    """
    season_keys = normalise_season_keys(season)
    season_suffix = season_keys["season_table_suffix"]
    season_key = season_keys["season_key"]

    raw_df = spark.table(f"{bronze_schema}.player_gameweek_stats_{season_suffix}")

    # Captured once so the availability flags and the null-fill decision both
    # reflect the columns of the bronze table itself.
    source_columns = set(raw_df.columns)

    # Build all canonical columns in one projection instead of calling
    # withColumn once per column: every withColumn call adds a projection to
    # the plan, which Spark documents as a performance risk when done in loops.
    canonical_columns = [
        (raw_df[col_name] if col_name in source_columns else F.lit(None))
        .cast(col_type)
        .alias(col_name)
        for col_name, col_type in canonical_schema.items()
    ]

    conformed_df = raw_df.select(
        *canonical_columns,
        F.lit("expected_goals" in source_columns).alias("exp_stats_available"),
        F.lit("defensive_contribution" in source_columns).alias("def_con_available"),
        F.lit(season_key).cast("int").alias("season_key"),
    ).withColumnRenamed("element", "player_id")

    # Fixture data: fixture_key is the season key followed by the zero-padded fixture id.
    # NOTE(review): F.lpad shortens values longer than the target length, so a
    # fixture or player id with more than 3 digits would be truncated here.
    # Confirm ids stay below 1000 and that the silver tables build keys the same way.
    keyed_df = conformed_df.withColumn(
        "fixture_key",
        F.concat(F.col("season_key").cast("string"), F.lpad(F.col("fixture").cast("string"), 3, "0"))
    )

    fixtures_df = spark.table(f"{silver_schema}.fixtures")

    fixtures_raw_df = keyed_df.join(
        fixtures_df.select(
            "fixture_key",
            "home_team_key",
            "away_team_key",
            "home_team_score",
            "away_team_score",
            "gameweek_key",
        ),
        on="fixture_key",
        how="left"
    )

    # Player data: player_season_key is the season key followed by the zero-padded player id.
    players_raw_df = fixtures_raw_df.withColumn(
        "player_season_key",
        F.concat(F.col("season_key").cast("string"), F.lpad(F.col("player_id").cast("string"), 3, "0")).cast("int")
    )

    # The key is aliased so it does not clash with the stats-side column of the same name.
    players_df = spark.table(f"{silver_schema}.players").select(
        F.col("player_season_key").alias("_player_season_key"),
        "team_key",
        "player_key",
        "position_key",
        "first_fixture_key",
        "last_fixture_key",
    )

    # Match on the player-season key, restricted to player rows whose
    # first/last fixture range contains the stats row's fixture.
    gameweek_stats_df = players_raw_df.join(
        players_df,
        on=(
            (players_raw_df["player_season_key"] == players_df["_player_season_key"]) &
            (players_raw_df["fixture_key"] >= players_df["first_fixture_key"]) &
            (players_raw_df["fixture_key"] <= players_df["last_fixture_key"])
        ),
        how="left"
    )

    # Resolve home/away columns into the player's own team vs. the opponent,
    # then drop the helper columns that were only needed for the joins.
    gameweek_stats_df = gameweek_stats_df.withColumns(
        {
            "opponent_team_key": F.when(F.col("was_home"), F.col("away_team_key")).otherwise(F.col("home_team_key")),
            "team_score": F.when(F.col("was_home"), F.col("home_team_score")).otherwise(F.col("away_team_score")),
            "opponent_score": F.when(F.col("was_home"), F.col("away_team_score")).otherwise(F.col("home_team_score")),
            "player_fixture_key": F.concat(F.col("player_key"), F.col("fixture_key")).cast("long")
        }
    ).drop(
        "home_team_key", "away_team_key", "home_team_score", "away_team_score",
        "first_fixture_key", "last_fixture_key", "_player_season_key"
    )

    return gameweek_stats_df

In [0]:
# Silver table that both load protocols write to
gameweek_stats_table = f"{silver_schema}.gameweek_stats"

all_player_stats_df = []

if PROTOCOL == "HIST":
    # Historical load: transform every season from 2016 up to and including
    # the current one, stack them by column name and overwrite the table.
    all_player_stats_df = [
        transform_gameweek_stats_df(bronze_schema, silver_schema, season_start)
        for season_start in range(2016, CURRENT_SEASON_START + 1)
    ]

    player_stats_df = reduce(
        DataFrame.unionByName,
        all_player_stats_df[1:],
        all_player_stats_df[0]
    )

    write_to_table(
        df=player_stats_df,
        table_name=gameweek_stats_table,
        mode="overwrite",
        merge_schema=False
    )

elif PROTOCOL == "INCR":
    # Incremental load: transform only the current season and upsert it,
    # matching rows on player_fixture_key.
    player_stats_df = transform_gameweek_stats_df(
        bronze_schema, silver_schema, CURRENT_SEASON_START
    )

    merge_to_table(
        df=player_stats_df,
        table_name=gameweek_stats_table,
        merge_condition="source.player_fixture_key = target.player_fixture_key",
        spark=spark
    )