In [0]:
from pyspark.sql import functions as F, types as T
from delta.tables import DeltaTable
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.types import *
from pyspark.sql import Window

In [0]:
def write_to_table(
    df: DataFrame,
    table_name: str,
    mode: str = "overwrite",
    merge_schema: bool = True,
    partition_by: list[str] = None,
    path: str = None,
    save_as_table: bool = True
) -> None:
    """
    Write a DataFrame in Delta format, stamping every row with the write time.

    A `last_updated` column holding the current timestamp is set on the
    DataFrame before it is written.

    Parameters:
    - df (DataFrame): Spark DataFrame to write.
    - table_name (str): Target table name; only used when save_as_table=True.
    - mode (str): Spark write mode, e.g. 'overwrite', 'append', 'ignore', 'error'.
    - merge_schema (bool): If True, sets the `mergeSchema` writer option. If
      False and mode is 'overwrite', sets `overwriteSchema` instead.
    - partition_by (list[str], optional): Columns to partition the output by.
    - path (str, optional): Destination path; only used when save_as_table=False.
    - save_as_table (bool): If True, writes via saveAsTable(table_name);
      otherwise writes to `path`.

    Raises:
    - ValueError: If save_as_table is False and no path is given.
    """
    delta_writer = (
        df.withColumn("last_updated", F.current_timestamp())
        .write.format("delta")
        .mode(mode)
    )

    # At most one schema option is set: mergeSchema wins, and overwriteSchema
    # is only requested for overwrites when merging is switched off.
    if merge_schema:
        schema_option = "mergeSchema"
    elif mode == "overwrite":
        schema_option = "overwriteSchema"
    else:
        schema_option = None

    if schema_option is not None:
        delta_writer = delta_writer.option(schema_option, "true")

    if partition_by:
        delta_writer = delta_writer.partitionBy(*partition_by)

    # Destination: a named table takes priority over a path.
    if save_as_table:
        delta_writer.saveAsTable(table_name)
        return

    if not path:
        raise ValueError("Either save_as_table must be True or a path must be provided.")

    delta_writer.save(path)

In [0]:
def merge_to_table(
    df: DataFrame,
    table_name: str,
    merge_condition: str,
    spark: SparkSession,
    partition_by: list[str] = None
) -> None:
    """
    Upsert a DataFrame into a Delta table, creating the table on first use.

    Parameters:
    - df (DataFrame): Incoming rows to merge.
    - table_name (str): Name of the target Delta table.
    - merge_condition (str): SQL match condition; the existing table is aliased
      as `target` and the incoming rows as `source`.
    - spark (SparkSession): Active Spark session.
    - partition_by (list[str], optional): Partition columns, used only when the
      table has to be created.

    Every incoming row gets a `last_updated` column set to the current
    timestamp. If the table does not exist it is created through
    write_to_table; otherwise matched rows are fully updated and unmatched
    rows are inserted.
    """
    stamped = df.withColumn("last_updated", F.current_timestamp())

    # First load: there is nothing to merge into yet, so create the table
    # from this batch and stop.
    if not spark.catalog.tableExists(table_name):
        write_to_table(df=stamped, table_name=table_name, partition_by=partition_by)
        return

    target = DeltaTable.forName(spark, table_name).alias("target")
    upsert = target.merge(source=stamped.alias("source"), condition=merge_condition)
    upsert.whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()

In [0]:
# Environment name; it is the suffix of every schema name below.
ENV = "dev"

# Schema names all follow the pattern fpl_<layer>_<ENV>.
silver_schema, feature_schema, gold_schema = (
    f"fpl_{layer}_{ENV}" for layer in ("silver", "feature", "gold")
)

In [0]:

# Date range covered by the date dimension.
start_date = "2016-08-01"
end_date = "2029-08-01"

# Build the whole day sequence as one array in SQL, then explode it so that
# each calendar day becomes its own row in a `date` column.
calendar_days = spark.sql(
    f"SELECT sequence(to_date('{start_date}'), to_date('{end_date}'), interval 1 day) AS date_arr"
).selectExpr("explode(date_arr) AS date")

# Weekend rows are those whose short day name ("E" pattern) is Sat or Sun.
weekend_flag = F.when(
    F.date_format("date", "E").isin("Sat", "Sun"), F.lit(True)
).otherwise(F.lit(False))

# Calendar attributes derived from `date`; date_key (yyyyMMdd as an int) is
# the key the merge below matches on.
date_attributes = {
    "date_key": F.date_format("date", "yyyyMMdd").cast("int"),
    "year": F.year("date"),
    "month": F.month("date"),
    "day": F.dayofmonth("date"),
    "day_name": F.date_format("date", "EEEE"),
    "day_name_short": F.date_format("date", "E"),
    "month_name": F.date_format("date", "MMMM"),
    "month_name_short": F.date_format("date", "MMM"),
    "month_year": F.date_format("date", "yyyy-MM"),
    "month_id": F.date_format("date", "yyyyMM").cast("int"),
    "is_weekend": weekend_flag,
}

date_df = calendar_days.withColumns(date_attributes)

merge_to_table(
    df=date_df,
    table_name=f"{gold_schema}.dim_date",
    merge_condition="target.date_key = source.date_key",
    spark=spark,
)

In [0]:
# bi-seasonal

# TODO: convert spell key to int

In [0]:
# Load the silver teams table and upsert it into the gold team dimension,
# matching rows on team_season_key.
team_df = spark.table(f"{silver_schema}.teams")

# Render the source rows in the notebook before merging.
display(team_df)

merge_to_table(
    df=team_df,
    table_name=f"{gold_schema}.dim_team",
    merge_condition="target.team_season_key = source.team_season_key",
    spark=spark,
)

In [0]:
# Load the silver seasons table and upsert it into the gold season dimension,
# matching rows on season_key.
season_df = spark.table(f"{silver_schema}.seasons")

merge_to_table(
    df=season_df,
    table_name=f"{gold_schema}.dim_season",
    merge_condition="target.season_key = source.season_key",
    spark=spark,
)

In [0]:
# Load the silver gameweeks table and upsert it into the gold gameweek
# dimension, matching rows on gameweek_key.
gameweek_df = spark.table(f"{silver_schema}.gameweeks")

merge_to_table(
    df=gameweek_df,
    table_name=f"{gold_schema}.dim_gameweek",
    merge_condition="target.gameweek_key = source.gameweek_key",
    spark=spark,
)

In [0]:
#weekly

In [0]:
# Load the silver positions table and upsert it into the gold position
# dimension, matching rows on position_key.
position_df = spark.table(f"{silver_schema}.positions")

merge_to_table(
    df=position_df,
    table_name=f"{gold_schema}.dim_position",
    merge_condition="target.position_key = source.position_key",
    spark=spark,
)

In [0]:
#weekly

In [0]:
# TODO: add save points to the gameweek stats

In [0]:
#roll up player gameweek stats

In [0]:
# Similar to dim fixtures; has match stats

In [0]:
#merge predicted with fact player points (from silver)