In [0]:
from pyspark.sql import functions as F, types as T
from delta.tables import DeltaTable
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.types import *
from pyspark.sql import Window

In [0]:
def write_to_table(
    df: DataFrame,
    table_name: str,
    mode: str = "overwrite",
    merge_schema: bool = True,
    partition_by: list[str] = None,
    path: str = None,
    save_as_table: bool = True
) -> None:
    """
    Stamp a DataFrame with ``last_updated`` and write it out in Delta format.

    Parameters:
    - df (DataFrame): Spark DataFrame to persist. A ``last_updated`` column
      holding the current timestamp is added (or replaced) before writing.
    - table_name (str): Name handed to ``saveAsTable``; only used when
      save_as_table=True.
    - mode (str): Spark save mode passed to the writer ('overwrite', 'append', ...).
    - merge_schema (bool): When True the ``mergeSchema`` writer option is set.
      When False and mode is 'overwrite', ``overwriteSchema`` is set instead;
      in every other case no schema option is applied.
    - partition_by (list[str], optional): Partition columns; skipped when
      None or empty.
    - path (str, optional): Location handed to ``save``; only consulted when
      save_as_table=False.
    - save_as_table (bool): True writes via ``saveAsTable(table_name)``,
      False writes to ``path``.

    Raises:
    - ValueError: If save_as_table is False and no path is supplied.
    """
    stamped_df = df.withColumn("last_updated", F.current_timestamp())
    delta_writer = stamped_df.write.format("delta").mode(mode)

    # Pick at most one schema-handling option; mergeSchema takes precedence.
    schema_option = None
    if merge_schema:
        schema_option = "mergeSchema"
    elif mode == "overwrite":
        schema_option = "overwriteSchema"
    if schema_option is not None:
        delta_writer = delta_writer.option(schema_option, "true")

    if partition_by:
        delta_writer = delta_writer.partitionBy(*partition_by)

    # Table writes win over path writes; a path is only required without a table.
    if save_as_table:
        delta_writer.saveAsTable(table_name)
        return
    if not path:
        raise ValueError("Either save_as_table must be True or a path must be provided.")
    delta_writer.save(path)

In [0]:
def merge_to_table(
    df: DataFrame,
    table_name: str,
    merge_condition: str,
    spark: SparkSession,
    partition_by: list[str] = None,
    merge_schema: bool = True
) -> None:
    """
    Performs an upsert (merge) into a Delta table.

    Parameters:
    - df (DataFrame): Incoming DataFrame to merge. A ``last_updated`` column
      with the current timestamp is added before it is written or merged.
    - table_name (str): Target Delta table name.
    - merge_condition (str): SQL condition for matching rows. The existing
      table is aliased ``target`` and the incoming rows ``source``.
    - spark (SparkSession): Active Spark session.
    - partition_by (list[str], optional): Columns to partition by on initial write.
    - merge_schema (bool): Whether to merge schema on write. On the initial
      write this is forwarded to write_to_table. On the merge path it enables
      Delta schema evolution when the installed Delta Lake exposes
      ``DeltaMergeBuilder.withSchemaEvolution``; on older versions the merge
      runs without schema evolution.

    If the table does not exist, it will be created using write_to_table.
    Otherwise matched rows are fully updated and unmatched source rows are
    inserted.
    """
    if not spark.catalog.tableExists(table_name):
        # write_to_table adds last_updated itself, so the raw frame is passed
        # through instead of stamping it twice.
        write_to_table(
            df=df,
            table_name=table_name,
            partition_by=partition_by,
            merge_schema=merge_schema
        )
        return

    source_df = df.withColumn("last_updated", F.current_timestamp())

    merge_builder = (
        DeltaTable.forName(spark, table_name)
        .alias("target")
        .merge(
            source=source_df.alias("source"),
            condition=merge_condition
        )
    )

    # Previously merge_schema was ignored here, so new source columns were
    # silently dropped by updateAll/insertAll. The hasattr guard keeps older
    # Delta releases (without withSchemaEvolution) working as before.
    if merge_schema and hasattr(merge_builder, "withSchemaEvolution"):
        merge_builder = merge_builder.withSchemaEvolution()

    (
        merge_builder
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )


In [0]:
# Environment suffix shared by every schema name used in this notebook.
ENV = "dev"

# Schema names follow the pattern fpl_<layer>_<env>.
_SCHEMA_TEMPLATE = "fpl_{layer}_{env}"
silver_schema = _SCHEMA_TEMPLATE.format(layer="silver", env=ENV)
feature_schema = _SCHEMA_TEMPLATE.format(layer="feature", env=ENV)
gold_schema = _SCHEMA_TEMPLATE.format(layer="gold", env=ENV)

In [0]:
# Build the gold dim_player table from silver players, teams and positions.
# Refresh cadence noted by the original author: bi-seasonal.
position_df = spark.table(f"{silver_schema}.positions").select("position_key", "position_name")
team_df = spark.table(f"{silver_schema}.teams").select("team_key", "team_name_short").distinct()

# Full display name plus team/position attributes; inner joins drop players
# whose team_key or position_key has no match.
player_df = (
    spark.table(f"{silver_schema}.players")
    .withColumn(
        "player_name",
        F.concat_ws(" ", F.col("first_name"), F.col("second_name"))
    )
    .join(team_df, on="team_key", how="inner")
    .join(position_df, on="position_key", how="inner")
)

# A name is a duplicate when more than one distinct player_key shares it within
# a season. The window is deliberately unordered: with an orderBy the default
# frame ends at the current row, so a count becomes a running count and the
# first player of a clashing group is never flagged. Counting distinct keys
# (rather than rows) also stops a single player with several rows in the same
# season from being treated as a name clash.
window_spec = Window.partitionBy("player_name", "season_key")
player_df = player_df.withColumn(
    "is_duplicate",
    F.size(F.collect_set("player_key").over(window_spec)) > 1
)

# Disambiguate clashing names by appending the team short name.
player_df = player_df.withColumn(
    "player_name",
    F.when(
        F.col("is_duplicate"),
        F.concat_ws(" ", F.col("player_name"), F.col("team_name_short"))
    ).otherwise(F.col("player_name"))
)

# Final column set for the dimension (is_duplicate is a working column only).
player_df = player_df.select(
    "player_surrogate_key",
    "player_key",
    "player_season_spell_key",
    "season_key",
    "player_name",
    "player_season_key",
    "team_key",
    "team_name_short",
    "position_name",
    "initial_value",
    "current_value",
    "effective_from",
    "effective_to"
)

merge_to_table(
    df=player_df,
    table_name=f"{gold_schema}.dim_player",
    merge_condition="target.player_surrogate_key = source.player_surrogate_key",
    spark=spark
)

In [0]:
# Publish silver gameweeks as the gold gameweek dimension, upserting on
# gameweek_key. No columns are reshaped in this cell.
gameweek_df = spark.table(f"{silver_schema}.gameweeks")

merge_to_table(
    df=gameweek_df,
    table_name=f"{gold_schema}.dim_gameweek",
    merge_condition="target.gameweek_key = source.gameweek_key",
    spark=spark,
)

In [0]:
# Build the gold dim_fixture table: silver fixtures enriched with the short
# names of the home and away teams.
# Refresh cadence noted by the original author: weekly.
team_df = (
    spark.table(f"{silver_schema}.teams")
    .select("team_key", "team_name_short")
    .distinct()
)

# Team lookup projected twice, once per side, so each join adds one name column.
home_team_names = team_df.select(
    F.col("team_key").alias("home_team_key"),
    F.col("team_name_short").alias("home_team_name"),
)
away_team_names = team_df.select(
    F.col("team_key").alias("away_team_key"),
    F.col("team_name_short").alias("away_team_name"),
)

fixture_columns = [
    "fixture_key",
    "season_key",
    "gameweek_key",
    "home_team_key",
    "away_team_key",
    "home_team_score",
    "away_team_score",
    "kickoff_time",
    "gameweek",
]

# Left joins keep fixtures even when a team key has no match in teams.
fixture_df = (
    spark.table(f"{silver_schema}.fixtures")
    .select(*fixture_columns)
    .join(home_team_names, on="home_team_key", how="left")
    .join(away_team_names, on="away_team_key", how="left")
)

merge_to_table(
    df=fixture_df,
    table_name=f"{gold_schema}.dim_fixture",
    merge_condition="target.fixture_key = source.fixture_key",
    spark=spark,
)

In [0]:
# Build the gold fact_player_gameweek_stats table: per-player, per-fixture raw
# stats keyed by the player surrogate key that was effective in that gameweek.
# Refresh cadence noted by the original author: weekly.

players = spark.table(f"{silver_schema}.players").select(
    "player_surrogate_key",
    "player_key",
    "effective_from",
    "effective_to",
)

stat_columns = [
    "player_key",
    "season_key",
    "fixture_key",
    "gameweek_key",
    "team_key",
    "position_key",
    "opponent_team_key",
    "minutes",
    "assists",
    "goals_scored",
    "goals_conceded",
    "clean_sheets",
    "own_goals",
    "bps",
    "clearances_blocks_interceptions",
    "recoveries",
    "creativity",
    "defensive_contribution",
    "tackles",
    "saves",
    "yellow_cards",
    "red_cards",
    "expected_assists",
    "expected_goals",
    "expected_goal_involvements",
    "expected_goals_conceded",
    "penalties_saved",
    "penalties_missed",
    "ict_index",
    "influence",
    "value",
    "was_home",
]
gameweek_stats = spark.table(f"{silver_schema}.gameweek_stats").select(*stat_columns)

# Match each stats row to the player row whose effective_from..effective_to
# range (inclusive) contains the row's gameweek_key.
spell_covers_gameweek = (
    (gameweek_stats["player_key"] == players["player_key"])
    & (gameweek_stats["gameweek_key"] >= players["effective_from"])
    & (gameweek_stats["gameweek_key"] <= players["effective_to"])
)

# NOTE(review): because this is a left join, a stats row with no matching
# player row gets a NULL player_surrogate_key and hence a NULL
# player_gw_stat_key (concat yields NULL when any input is NULL). NULL keys
# never satisfy the merge condition below — confirm such rows cannot occur.
player_stats = (
    gameweek_stats
    .join(players, spell_covers_gameweek, "left")
    .drop("effective_from", "effective_to", players.player_key)
    .withColumn(
        "player_gw_stat_key",
        F.concat(F.col("player_surrogate_key"), F.col("fixture_key")),
    )
)

merge_to_table(
    df=player_stats,
    table_name=f"{gold_schema}.fact_player_gameweek_stats",
    merge_condition="target.player_gw_stat_key = source.player_gw_stat_key",
    spark=spark,
)

In [0]:
# Build the gold fact_player_fpl_points table: per-player, per-fixture points
# broken down by component, plus a total_points sum.
# Refresh cadence noted by the original author: weekly.

players = spark.table(f"{silver_schema}.players").select(
    "player_surrogate_key",
    "player_key",
    "effective_from",
    "effective_to",
)

# NOTE(review): "tackles" is selected alongside the *_points columns but is not
# part of total_points below — confirm it is meant to be carried through.
points_selection = [
    "player_key",
    "season_key",
    "fixture_key",
    "gameweek_key",
    "team_key",
    "position_key",
    "opponent_team_key",
    "minutes_points",
    "assist_points",
    "goal_points",
    "clean_sheet_points",
    "defensive_contribution_points",
    "goals_conceded_points",
    "own_goal_points",
    "penalty_miss_points",
    "penalty_saves_points",
    "tackles",
    "saves_points",
    "yellow_card_points",
    "red_card_points",
    F.col("bonus").alias("bonus_points"),
]
gameweek_points = spark.table(f"{silver_schema}.gameweek_stats").select(*points_selection)

# Match each points row to the player row whose effective_from..effective_to
# range (inclusive) contains the row's gameweek_key.
spell_covers_gameweek = (
    (gameweek_points["player_key"] == players["player_key"])
    & (gameweek_points["gameweek_key"] >= players["effective_from"])
    & (gameweek_points["gameweek_key"] <= players["effective_to"])
)


def _points_or_zero(column_name):
    """Return the named column with NULL replaced by 0."""
    return F.coalesce(F.col(column_name), F.lit(0))


# NOTE(review): with the left join, rows lacking a matching player row end up
# with a NULL player_gw_stat_key, which never satisfies the merge condition
# below — confirm such rows cannot occur.
player_points = (
    gameweek_points
    .join(players, spell_covers_gameweek, "left")
    .drop("effective_from", "effective_to", players.player_key)
    .withColumn(
        "player_gw_stat_key",
        F.concat(F.col("player_surrogate_key"), F.col("fixture_key")),
    )
    .withColumn(
        "total_points",
        _points_or_zero("minutes_points")
        + _points_or_zero("assist_points")
        + _points_or_zero("goal_points")
        + _points_or_zero("clean_sheet_points")
        + _points_or_zero("defensive_contribution_points")
        + _points_or_zero("goals_conceded_points")
        + _points_or_zero("own_goal_points")
        + _points_or_zero("penalty_miss_points")
        + _points_or_zero("penalty_saves_points")
        + _points_or_zero("saves_points")
        + _points_or_zero("yellow_card_points")
        + _points_or_zero("red_card_points")
        + _points_or_zero("bonus_points"),
    )
)

merge_to_table(
    df=player_points,
    table_name=f"{gold_schema}.fact_player_fpl_points",
    merge_condition="target.player_gw_stat_key = source.player_gw_stat_key",
    spark=spark,
)

In [0]:
# Build the gold fact_team_gameweek_stats table by rolling player-level
# gameweek stats up to one row per team and fixture.
# Refresh cadence noted by the original author: weekly.

team_fixture_keys = ["team_key", "fixture_key", "gameweek_key", "season_key"]

player_stats = spark.table(f"{silver_schema}.gameweek_stats").select(
    *team_fixture_keys,
    "minutes",
    "assists",
    "goals_scored",
    "goals_conceded",
    "clean_sheets",
    "own_goals",
    "bps",
    "clearances_blocks_interceptions",
    "recoveries",
    "creativity",
    "defensive_contribution",
    "tackles",
    "saves",
    "yellow_cards",
    "red_cards",
    "expected_assists",
    "expected_goals",
    "expected_goal_involvements",
    "expected_goals_conceded",
    "penalties_saved",
    "penalties_missed",
    "ict_index",
    "influence",
    "value",
    "opponent_team_key",
)

# How each measure is rolled up: plain sum, max across the team's players,
# or a sum rounded to 3 decimal places.
_AGG_BUILDERS = {
    "sum": lambda column_name: F.sum(column_name),
    "max": lambda column_name: F.max(column_name),
    "sum_round3": lambda column_name: F.round(F.sum(column_name), 3),
}

# (source column, aggregation, output column), in output order.
_AGG_PLAN = [
    ("minutes", "sum", "minutes"),
    ("assists", "sum", "assists"),
    ("goals_scored", "sum", "goals_scored_raw"),
    ("goals_conceded", "max", "goals_conceded"),
    ("clean_sheets", "max", "clean_sheets"),
    ("own_goals", "sum", "own_goals"),
    ("bps", "sum", "bps"),
    ("clearances_blocks_interceptions", "sum", "clearances_blocks_interceptions"),
    ("recoveries", "sum", "recoveries"),
    ("creativity", "sum_round3", "creativity"),
    ("defensive_contribution", "sum", "defensive_contribution"),
    ("tackles", "sum", "tackles"),
    ("saves", "sum", "saves"),
    ("yellow_cards", "sum", "yellow_cards"),
    ("red_cards", "sum", "red_cards"),
    ("expected_assists", "sum_round3", "expected_assists"),
    ("expected_goals", "sum_round3", "expected_goals"),
    ("expected_goal_involvements", "sum_round3", "expected_goal_involvements"),
    ("expected_goals_conceded", "sum_round3", "expected_goals_conceded"),
    ("penalties_saved", "sum", "penalties_saved"),
    ("penalties_missed", "sum", "penalties_missed"),
]

agg_exprs = [
    _AGG_BUILDERS[how](source_column).alias(output_column)
    for source_column, how, output_column in _AGG_PLAN
]

# Own goals by players whose opponent_team_key is a given team, re-keyed so
# they can be credited to that team for the same fixture.
own_goals_against_df = (
    player_stats
    .groupBy("opponent_team_key", "fixture_key", "gameweek_key", "season_key")
    .agg(F.sum("own_goals").alias("own_goals_against"))
    .withColumnRenamed("opponent_team_key", "team_key")
)

# goals_scored = goals by the team's own players + opposition own goals.
# NOTE(review): team_gw_stat_key concatenates team_key and fixture_key with no
# separator — confirm the key formats cannot make two different pairs collide.
team_gw_stats = (
    player_stats
    .groupBy(*team_fixture_keys)
    .agg(*agg_exprs)
    .join(own_goals_against_df, on=team_fixture_keys, how="left")
    .withColumn(
        "goals_scored",
        F.col("goals_scored_raw") + F.coalesce(F.col("own_goals_against"), F.lit(0)),
    )
    .drop("goals_scored_raw", "own_goals_against")
    .withColumn(
        "team_gw_stat_key",
        F.concat(F.col("team_key"), F.col("fixture_key")),
    )
)

merge_to_table(
    df=team_gw_stats,
    table_name=f"{gold_schema}.fact_team_gameweek_stats",
    merge_condition="target.team_gw_stat_key = source.team_gw_stat_key",
    spark=spark,
)

In [0]:
# Build the gold fact_fixture table with the same shape as dim_fixture.
# Author's note: a fact table is kept alongside the dimension so it can be
# extended later (match stats, scorers, attendance, etc.).
# Refresh cadence noted by the original author: weekly.
team_df = (
    spark.table(f"{silver_schema}.teams")
    .select("team_key", "team_name_short")
    .distinct()
)

# One renamed copy of the team lookup per side of the fixture.
home_team_names = team_df.select(
    F.col("team_key").alias("home_team_key"),
    F.col("team_name_short").alias("home_team_name"),
)
away_team_names = team_df.select(
    F.col("team_key").alias("away_team_key"),
    F.col("team_name_short").alias("away_team_name"),
)

fixture_columns = [
    "fixture_key",
    "season_key",
    "gameweek_key",
    "home_team_key",
    "away_team_key",
    "home_team_score",
    "away_team_score",
    "kickoff_time",
    "gameweek",
]

# Left joins keep fixtures even when a team key has no match in teams.
fixture_df = (
    spark.table(f"{silver_schema}.fixtures")
    .select(*fixture_columns)
    .join(home_team_names, on="home_team_key", how="left")
    .join(away_team_names, on="away_team_key", how="left")
)

merge_to_table(
    df=fixture_df,
    table_name=f"{gold_schema}.fact_fixture",
    merge_condition="target.fixture_key = source.fixture_key",
    spark=spark,
)

In [0]:
# Build the gold fact_fpl_point_prediction_performance table: predicted points
# per player and fixture, with the actual total_points attached when available.
players = spark.table(f"{silver_schema}.players").select(
    "player_surrogate_key",
    "player_key",
    "effective_from",
    "effective_to"
)

player_predicted_points = spark.table(f"{feature_schema}.player_points_inference").select(
    "player_key",
    "fixture_key",
    "predicted_total_points"
)

fixture_gw = spark.table(f"{silver_schema}.fixtures").select(
    "fixture_key",
    "gameweek_key"
)

# Attach the gameweek of each predicted fixture.
player_predicted_points = player_predicted_points.join(
    fixture_gw,
    on="fixture_key",
    how="left"
)

# Resolve the surrogate key from the player row whose effective_to is the
# gameweek immediately before the predicted one, then derive the merge key
# from the prediction side so it exists whether or not actual points do.
player_points = player_predicted_points.join(
    players,
    (player_predicted_points.player_key == players.player_key) &
    (player_predicted_points.gameweek_key == (players.effective_to + 1)),
    "left"
).drop(
    "effective_from",
    "effective_to",
    players.player_key
).withColumn(
    "player_gw_stat_key",
    F.concat(F.col("player_surrogate_key"), F.col("fixture_key"))
)

fpl_points = spark.table(f"{gold_schema}.fact_player_fpl_points").select(
    "player_gw_stat_key",
    "total_points"
)

# Previously player_gw_stat_key was taken from fpl_points (the right side of
# this left join), so predictions without actual points carried a NULL key,
# never matched in the merge and were inserted again on every run. Joining on
# the prediction-side key keeps it populated; the select preserves the
# original column order.
# NOTE(review): rows already written with a NULL player_gw_stat_key are not
# touched by this merge — check whether the target table needs a clean-up.
player_predicted_actual_points = player_points.join(
    fpl_points,
    on="player_gw_stat_key",
    how="left"
).select(
    "fixture_key",
    "player_key",
    "predicted_total_points",
    "gameweek_key",
    "player_surrogate_key",
    "player_gw_stat_key",
    "total_points"
)

merge_to_table(
    df = player_predicted_actual_points,
    table_name = f"{gold_schema}.fact_fpl_point_prediction_performance",
    merge_condition = "target.player_gw_stat_key = source.player_gw_stat_key",
    spark = spark
)