In [0]:
from pyspark.sql import functions as F, types as T
from delta.tables import DeltaTable
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.types import *

# Functions

In [0]:
def write_to_table(
    df: DataFrame,
    table_name: str,
    mode: str = "overwrite",
    merge_schema: bool = True,
    partition_by: list[str] = None,
    path: str = None,
    save_as_table: bool = True
) -> None:
    """
    Write a DataFrame out in Delta format with a `last_updated` timestamp column.

    Parameters:
    - df (DataFrame): Data to persist.
    - table_name (str): Target table name; only used when save_as_table is True.
    - mode (str): Save mode handed to the Spark writer (e.g. 'overwrite', 'append').
    - merge_schema (bool): When True, sets the `mergeSchema` write option. When False
      and mode is 'overwrite', sets `overwriteSchema` instead.
    - partition_by (list[str], optional): Partition columns; skipped when None or empty.
    - path (str, optional): Destination path; only used when save_as_table is False.
    - save_as_table (bool): True -> saveAsTable(table_name); False -> save(path).

    Raises:
    - ValueError: If save_as_table is False and no path is supplied.
    """
    stamped = df.withColumn("last_updated", F.current_timestamp())
    delta_writer = stamped.write.format("delta").mode(mode)

    # At most one schema option is applied; mergeSchema takes precedence.
    schema_option = None
    if merge_schema:
        schema_option = "mergeSchema"
    elif mode == "overwrite":
        schema_option = "overwriteSchema"
    if schema_option is not None:
        delta_writer = delta_writer.option(schema_option, "true")

    if partition_by:
        delta_writer = delta_writer.partitionBy(*partition_by)

    # Destination: named table first, then path, otherwise nothing to write to.
    if save_as_table:
        delta_writer.saveAsTable(table_name)
        return
    if not path:
        raise ValueError("Either save_as_table must be True or a path must be provided.")
    delta_writer.save(path)

In [0]:
def merge_to_table(
    df: DataFrame,
    table_name: str,
    merge_condition: str,
    spark: SparkSession,
    partition_by: list[str] = None
) -> None:
    """
    Upsert a DataFrame into a Delta table, creating the table when it is missing.

    Parameters:
    - df (DataFrame): Incoming rows.
    - table_name (str): Name of the target Delta table.
    - merge_condition (str): SQL match condition. The existing table is aliased
      `target` and the incoming rows `source`, so the condition should use those names.
    - spark (SparkSession): Session used for the catalog lookup and table handle.
    - partition_by (list[str], optional): Partition columns, applied only when the
      table is created by this call.

    A `last_updated` timestamp column is added to the incoming rows before they are
    written or merged.
    """
    stamped = df.withColumn("last_updated", F.current_timestamp())

    # No target yet: fall back to a plain write, which creates the table.
    if not spark.catalog.tableExists(table_name):
        write_to_table(
            df=stamped,
            table_name=table_name,
            partition_by=partition_by
        )
        return

    target = DeltaTable.forName(spark, table_name).alias("target")
    upsert = target.merge(
        source=stamped.alias("source"),
        condition=merge_condition
    )
    # Matched rows are fully updated; unmatched source rows are inserted.
    upsert = upsert.whenMatchedUpdateAll().whenNotMatchedInsertAll()
    upsert.execute()

In [0]:
# Load and normalise players
def load_and_normalise_players(
    bronze_schema: str,
    season: str):
    """
    Read `{bronze_schema}.players_raw_{season}` and return only the expected columns.

    Any name in the module-level `expected_players_cols` that the table lacks is
    added as a null string column, so the result always has exactly those columns
    in that order. Uses the notebook-global `spark` session.
    """
    players = spark.table(f"{bronze_schema}.players_raw_{season}")

    absent = [name for name in expected_players_cols if name not in players.columns]
    for name in absent:
        players = players.withColumn(name, F.lit(None).cast("string"))

    return players.select(*expected_players_cols)

# Load and normalise stats
def load_and_normalise_stats(
    bronze_schema: str,
    season: str):
    """
    Read `{bronze_schema}.player_gameweek_stats_{season}` and return only the
    expected columns.

    Any name in the module-level `expected_stats_cols` that the table lacks is
    added as a null column, so the result always has exactly those columns in
    that order. Uses the notebook-global `spark` session.
    """
    stats = spark.table(f"{bronze_schema}.player_gameweek_stats_{season}")

    absent = [name for name in expected_stats_cols if name not in stats.columns]
    for name in absent:
        # NOTE(review): the filler is an untyped null (no cast), unlike the players
        # loader which casts to string — confirm downstream writes tolerate this.
        stats = stats.withColumn(name, F.lit(None))

    return stats.select(*expected_stats_cols)

# Extract fixtures from legacy source
def extract_unique_fixtures_historic(
    player_stats_df,
    players_df,
    season_key):
    """
    Rebuild one row per fixture for a legacy season from per-player gameweek stats.

    Parameters:
    - player_stats_df: Per-player stats with at least `element`, `fixture`,
      `round`, `was_home`, `team_h_score`, `team_a_score` and `kickoff_time`.
    - players_df: Player lookup with `id` and `team_code`; `id` is matched to
      the stats column `element`.
    - season_key: Season identifier used as the prefix of the generated keys.

    Returns:
    A DataFrame with columns fixture_key, season_key, gameweek_key,
    home_team_key, away_team_key, fixture_id, home_team_score,
    away_team_score, kickoff_time, round. Only fixtures that have both
    home-side and away-side player rows are returned.
    """
    stats_with_team = player_stats_df.join(
        players_df.withColumnRenamed("id", "element"),
        on="element",
        how="left"
    )

    # Reduce each side to a single row per fixture BEFORE joining them.
    # Joining the raw player rows on `fixture` pairs every home player with
    # every away player, and that cross product would then have to be shuffled
    # and de-duplicated. `ignorenulls=True` also stops a player without a
    # matching team_code from supplying a null team key when another player in
    # the same fixture has one.
    home_df = (
        stats_with_team
        .filter(F.col("was_home") == True)
        .groupBy("fixture")
        .agg(
            F.first("team_code", ignorenulls=True).alias("home_team_key"),
            F.first("team_h_score", ignorenulls=True).alias("home_team_score"),
            F.first("team_a_score", ignorenulls=True).alias("away_team_score"),
        )
    )

    away_df = (
        stats_with_team
        .filter(F.col("was_home") == False)
        .groupBy("fixture")
        .agg(F.first("team_code", ignorenulls=True).alias("away_team_key"))
    )

    base_info_df = player_stats_df.select(
        "fixture", "round", "kickoff_time"
    ).dropDuplicates(["fixture"])

    # Every input now has at most one row per fixture, so the joined result
    # does too and needs no further de-duplication.
    fixtures_df = home_df.join(
        away_df,
        on="fixture",
        how="inner"
    ).join(
        base_info_df,
        on="fixture",
        how="left"
    )

    # NOTE(review): lpad shortens values longer than the pad width, so these
    # keys assume rounds of at most 2 digits and fixture ids of at most
    # 3 digits — confirm against the data.
    fixtures_df = fixtures_df.withColumn(
            "season_key",
            F.lit(season_key).cast("int")
        ).withColumn(
            "fixture_id",
            F.col("fixture")
        ).withColumn(
            "gameweek_key",
            F.concat(F.lit(season_key), F.lpad(F.col("round").cast("string"), 2, "0")).cast("int")
        ).withColumn(
            "fixture_key",
            F.concat(F.lit(season_key), F.lpad(F.col("fixture_id").cast("string"), 3, "0")).cast("int")
        )

    return fixtures_df.select(
        "fixture_key", "season_key", "gameweek_key", "home_team_key", "away_team_key",
        "fixture_id", "home_team_score", "away_team_score", "kickoff_time", "round"
    )

# Extract fixtures from modern source
def extract_fixtures_from_api(
    bronze_schema: str,
    silver_schema: str,
    season: str,
    season_key: str):
    """
    Build the fixtures dataset for one season from the bronze fixtures table.

    Reads `{bronze_schema}.fixtures_{season}` and `{silver_schema}.teams`
    (filtered to `season_key`) through the notebook-global `spark` session,
    resolves the home and away team keys, derives scores from the
    `goals_scored` entries of the `stats` array, and adds the season,
    gameweek and fixture keys.

    Returns:
    A DataFrame with columns fixture_key, season_key, gameweek_key,
    home_team_key, away_team_key, fixture_id, home_team_score,
    away_team_score, kickoff_time, round. Finished fixtures with no recorded
    goals for a side get a score of 0 for that side.
    """
    raw_fixtures = spark.table(f"{bronze_schema}.fixtures_{season}")
    season_teams = spark.table(f"{silver_schema}.teams").filter(
        F.col("season_key") == F.lit(season_key)
    )

    # Two renamed views of the team lookup, one per side of the fixture.
    home_lookup = season_teams.select(
        F.col("team_id").alias("team_h"),
        F.col("team_key").alias("home_team_key"),
    )
    away_lookup = season_teams.select(
        F.col("team_id").alias("team_a"),
        F.col("team_key").alias("away_team_key"),
    )

    keyed_fixtures = raw_fixtures.join(home_lookup, on="team_h", how="left")
    keyed_fixtures = keyed_fixtures.join(away_lookup, on="team_a", how="left")

    # Goal entries are only read from fixtures flagged as finished.
    goal_stats = (
        keyed_fixtures
        .filter(F.col("finished") == True)
        .select(
            "id", "kickoff_time", "home_team_key", "away_team_key", "finished",
            F.explode("stats").alias("stat"),
        )
        .filter(F.col("stat.identifier") == "goals_scored")
    )

    # Sum the per-entry values on each side to get the score per fixture.
    home_goals = (
        goal_stats
        .select("id", F.explode("stat.h").alias("home_stat"))
        .groupBy("id")
        .agg(F.sum("home_stat.value").cast("int").alias("home_team_score"))
    )
    away_goals = (
        goal_stats
        .select("id", F.explode("stat.a").alias("away_stat"))
        .groupBy("id")
        .agg(F.sum("away_stat.value").cast("int").alias("away_team_score"))
    )

    # Left joins keep fixtures that have no goal entries (scores stay null here).
    with_scores = keyed_fixtures.join(home_goals, on="id", how="left")
    with_scores = with_scores.join(away_goals, on="id", how="left")

    season_prefix = F.lit(season_key)
    padded_round = F.lpad(F.col("event").cast("string"), 2, "0")
    padded_id = F.lpad(F.col("id").cast("string"), 3, "0")

    def _zero_when_finished(score_col):
        # A finished fixture without goal entries for a side scored 0 on that side.
        return F.when(
            F.col("finished") & F.col(score_col).isNull(), F.lit(0)
        ).otherwise(F.col(score_col))

    with_keys = with_scores.withColumn("season_key", season_prefix.cast("int"))
    with_keys = with_keys.withColumn("fixture_id", F.col("id"))
    with_keys = with_keys.withColumn("round", F.col("event"))
    with_keys = with_keys.withColumn(
        "gameweek_key", F.concat(season_prefix, padded_round).cast("int")
    )
    with_keys = with_keys.withColumn(
        "fixture_key", F.concat(season_prefix, padded_id).cast("int")
    )
    with_keys = with_keys.withColumn(
        "home_team_score", _zero_when_finished("home_team_score")
    )
    with_keys = with_keys.withColumn(
        "away_team_score", _zero_when_finished("away_team_score")
    )

    output_columns = [
        "fixture_key",
        "season_key",
        "gameweek_key",
        "home_team_key",
        "away_team_key",
        "fixture_id",
        "home_team_score",
        "away_team_score",
        "kickoff_time",
        "round",
    ]
    return with_keys.select(*output_columns)

# Variables

In [0]:
# Allowed values for the two notebook parameters.
valid_envs = {"dev", "test", "prod"}
valid_protocols = {"HIST", "INCR"}

# Read each parameter from its widget; if the lookup fails for any reason,
# fall back to the default for that parameter.
try:
    ENV = dbutils.widgets.get("ENV")
except Exception:
    ENV = "dev"

try:
    PROTOCOL = dbutils.widgets.get("PROTOCOL")
except Exception:
    PROTOCOL = "INCR"

# Reject an unknown environment before any schema name is derived from it.
if ENV not in valid_envs:
    print(f"Invalid ENV: {ENV}. Must be one of {valid_envs}. Exiting notebook.")
    dbutils.notebook.exit("Invalid ENV")

# Reject an unknown protocol.
if PROTOCOL not in valid_protocols:
    print(f"Invalid PROTOCOL: {PROTOCOL}. Must be one of {valid_protocols}. Exiting notebook.")
    dbutils.notebook.exit("Invalid PROTOCOL")

# Environment-specific schema names.
bronze_schema = f"fpl_bronze_{ENV}"
silver_schema = f"fpl_silver_{ENV}"

# Season processed by the INCR protocol, and the seasons read via the API path.
CURRENT_SEASON = "25_26"
API_SEASONS = ["25_26"]

# Transform fixtures and write to {silver_schema}.fixtures

The work is split into two ETL patterns: historic and API.

Under the HIST protocol, the historic seasons are loaded as well as the current season. Data for previous seasons is taken from the player gameweek stats, with team data coming from the players_raw datasets.

The historic method uses players_raw (player id and player team_code) together with player_gameweek_stats (the stats for each player in each gameweek) to reconstruct each full fixture.

The API method uses the fixtures data from the bootstrap-static API and {silver_schema}.teams to build the fixture schema.

The notebook can be run as HIST (get all fixtures) or INCR (only the current season).






In [0]:
# Columns the legacy-season loaders must return; columns missing from a
# season's bronze table are added as nulls by the loaders.
expected_players_cols = ["id", "team_code"]
expected_stats_cols = [
    "element", "round", "fixture", "opponent_team", "was_home",
    "team_a_score", "team_h_score", "kickoff_time"
]

# Maps the season suffix used in bronze table names to the season key used
# as the prefix of the generated fixture and gameweek keys.
season_keys = {
    "16_17": "201617",
    "17_18": "201718",
    "18_19": "201819",
    "19_20": "201920",
    "20_21": "202021",
    "21_22": "202122",
    "22_23": "202223",
    "23_24": "202324",
    "24_25": "202425",
    "25_26": "202526"
}

# Run HIST or INCR protocol
if PROTOCOL == "HIST":
    # Full rebuild: extract every season, union the results, overwrite the table.
    fixtures_all = []

    for season, season_key in season_keys.items():
        print(f"Processing season: {season}")

        if season in API_SEASONS:  # Extend this list for future seasons
            fixtures_df = extract_fixtures_from_api(
                bronze_schema = bronze_schema,
                silver_schema = silver_schema,
                season = season,
                season_key = season_key
            )
        else:
            players_df = load_and_normalise_players(bronze_schema, season)
            player_stats_df = load_and_normalise_stats(bronze_schema, season)

            fixtures_df = extract_unique_fixtures_historic(player_stats_df, players_df, season_key)

        fixtures_all.append(fixtures_df)

    final_fixtures_df = fixtures_all[0]
    for df in fixtures_all[1:]:
        final_fixtures_df = final_fixtures_df.unionByName(df)

    # merge_schema=False with mode "overwrite" makes write_to_table set the
    # overwriteSchema option.
    write_to_table(
        df = final_fixtures_df,
        table_name = f"{silver_schema}.fixtures",
        mode =  "overwrite",
        merge_schema = False
    )

elif PROTOCOL == "INCR":
    # Bind the season explicitly. `season` is otherwise only assigned by the
    # HIST loop above, so on a fresh kernel this branch would raise NameError
    # (or silently reuse a value left over from an earlier run).
    season = CURRENT_SEASON
    season_key = season_keys[season]

    print(f"Running INCR protocol for season: {season}")
    incr_df = extract_fixtures_from_api(
        bronze_schema = bronze_schema,
        silver_schema = silver_schema,
        season = season,
        season_key = season_key
    )

    # Upsert on fixture_key so re-runs update existing fixtures in place.
    merge_to_table(
        df = incr_df,
        table_name = f"{silver_schema}.fixtures",
        merge_condition = "target.fixture_key = source.fixture_key",
        spark = spark
    )

Running INCR protocol for season: 25_26
