In [0]:
import os
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.types import StructType
from pyspark.sql import functions as F

# Functions

In [0]:
def write_to_table(
    df: DataFrame,
    table_name: str,
    mode: str = "overwrite",
    merge_schema: bool = True,
    partition_by: list[str] = None,
    path: str = None,
    save_as_table: bool = True
) -> None:
    """
    Generalised Delta write helper for bronze layer.

    A ``last_updated`` column holding the current timestamp is added to the
    data before it is written.

    Parameters:
    - df (DataFrame): Spark DataFrame to write.
    - table_name (str): Name of the Delta table (used if save_as_table=True).
    - mode (str): Write mode ('overwrite', 'append', 'ignore', 'error', etc.).
    - merge_schema (bool): If True, sets the ``mergeSchema`` write option.
      If False and mode is 'overwrite', ``overwriteSchema`` is set instead.
    - partition_by (list[str] or str, optional): Column(s) to partition by.
      A single column name may be passed as a plain string.
    - path (str, optional): Path to save the Delta table (used if save_as_table=False).
    - save_as_table (bool): If True, saves as managed table; else saves to path.

    Raises:
    - ValueError: If save_as_table is False and no path is provided.
    """

    # Validate the destination first so a bad call fails before any Spark
    # plan is built.
    if not save_as_table and not path:
        raise ValueError("Either save_as_table must be True or a path must be provided.")

    # A bare string would otherwise be unpacked character by character when
    # splatted into partitionBy below.
    if isinstance(partition_by, str):
        partition_by = [partition_by]

    df_with_ts = df.withColumn("last_updated", F.current_timestamp())

    writer = df_with_ts.write.format("delta").mode(mode)

    # mergeSchema takes priority; overwriteSchema only applies to overwrites
    # where schema merging was explicitly turned off.
    if merge_schema:
        writer = writer.option("mergeSchema", "true")
    elif mode == "overwrite":
        writer = writer.option("overwriteSchema", "true")

    if partition_by:
        writer = writer.partitionBy(*partition_by)

    if save_as_table:
        writer.saveAsTable(table_name)
    else:
        writer.save(path)

In [0]:
def read_csv_to_df(
    spark: SparkSession,
    path: str,
    header: bool = True,
    infer_schema: bool = True,
    schema: StructType = None,
    delimiter: str = ",",
    encoding: str = "UTF-8",
    quote: str = '"',
    escape: str = "\\",
    null_value: str = None,
    date_format: str = None,
    timestamp_format: str = None
) -> DataFrame:
    """
    Generalised CSV reader for PySpark.

    Parameters:
    - spark (SparkSession): Active Spark session.
    - path (str): Path to the CSV file.
    - header (bool): Whether the CSV has a header row.
    - infer_schema (bool): Whether to infer schema automatically. Ignored
      when an explicit schema is supplied.
    - schema (StructType, optional): Explicit schema to apply. Takes priority
      over infer_schema whenever it is not None.
    - delimiter (str): Field delimiter (default: ',').
    - encoding (str): File encoding (default: 'UTF-8').
    - quote (str): Quote character (default: '"').
    - escape (str): Escape character (default: '\\').
    - null_value (str, optional): String to interpret as null. Any value
      other than None is passed through, including the empty string.
    - date_format (str, optional): Format for date columns.
    - timestamp_format (str, optional): Format for timestamp columns.

    Returns:
    - DataFrame: Loaded Spark DataFrame.
    """

    reader = (
        spark.read
        .option("header", str(header).lower())
        .option("delimiter", delimiter)
        .option("encoding", encoding)
        .option("quote", quote)
        .option("escape", escape)
    )

    # Compare against None rather than truthiness: an empty string is a
    # legitimate null marker and must not be silently dropped.
    if null_value is not None:
        reader = reader.option("nullValue", null_value)
    # An empty pattern carries no meaning, so falsy formats are skipped.
    if date_format:
        reader = reader.option("dateFormat", date_format)
    if timestamp_format:
        reader = reader.option("timestampFormat", timestamp_format)

    # StructType defines __len__, so an empty schema is falsy; test against
    # None so an explicitly supplied schema is always honoured.
    if schema is not None:
        return reader.schema(schema).csv(path)

    if infer_schema:
        reader = reader.option("inferSchema", "true")
    return reader.csv(path)

In [0]:
# Deployment environment; it is embedded in the bronze schema name below.
ENV = "dev"
bronze_schema = "fpl_bronze_" + ENV

# Folder that is scanned for per-season sub-folders of raw player data.
base_path = "/Volumes/workspace/fpl_raw/player_data"
# Table names are qualified with the bronze schema name.
output_base = bronze_schema

In [0]:
# Season folders whose start year is this or later are not ingested here.
FIRST_EXCLUDED_START_YEAR = 2025

# (CSV file prefix, bronze table prefix) for each dataset ingested per season:
# - all_player_stats -> historic player gameweek stats
# - players_raw      -> detailed data per player
SEASON_DATASETS = (
    ("all_player_stats", "player_gameweek_stats"),
    ("players_raw", "players_raw"),
)

# Sorted so seasons are processed in a deterministic order
# (os.listdir returns entries in arbitrary order).
for folder in sorted(os.listdir(base_path)):
    folder_path = os.path.join(base_path, folder)

    # Skip non-directories
    if not os.path.isdir(folder_path):
        continue

    # Extract season years; a folder name that is not "<start>_<end>" with a
    # numeric start year raises ValueError and is skipped.
    try:
        start_year, end_year = folder.split("_")
        start_year_int = int(start_year)
    except ValueError:
        print(f"Skipping folder: {folder}")
        continue

    # Skip folders from 2025_26 onwards here
    if start_year_int >= FIRST_EXCLUDED_START_YEAR:
        print(f"Skipping folder: {folder}")
        continue

    start_year_short = start_year[2:]

    for source_prefix, table_prefix in SEASON_DATASETS:
        csv_file = f"{source_prefix}_{start_year_short}_{end_year}.csv"
        csv_path = os.path.join(folder_path, csv_file)

        try:
            df = read_csv_to_df(
                spark=spark,
                path=csv_path,
                header=True,
                infer_schema=True,
                encoding="utf-8"
            )
        except Exception as e:
            # Best effort: a missing/unreadable file only skips this dataset,
            # the remaining datasets of the season are still ingested.
            print(f"Failed to read {csv_path}: {e}")
            continue

        delta_table_name = f"{output_base}.{table_prefix}_{start_year_short}_{end_year}"

        write_to_table(
            df=df,
            table_name=delta_table_name,
            mode="overwrite",
            merge_schema=False
        )

        print(f"Written Delta table: {table_prefix}_{start_year_short}_{end_year}")

Writing to: fpl_bronze_dev.player_gameweek_stats_16_17
Written Delta table: player_gameweek_stats_16_17
Writing to: fpl_bronze_dev.players_raw_16_17
Written Delta table: players_raw_16_17
Writing to: fpl_bronze_dev.player_gameweek_stats_17_18
Written Delta table: player_gameweek_stats_17_18
Writing to: fpl_bronze_dev.players_raw_17_18
Written Delta table: players_raw_17_18
Writing to: fpl_bronze_dev.player_gameweek_stats_18_19
Written Delta table: player_gameweek_stats_18_19
Writing to: fpl_bronze_dev.players_raw_18_19
Written Delta table: players_raw_18_19
Writing to: fpl_bronze_dev.player_gameweek_stats_19_20
Written Delta table: player_gameweek_stats_19_20
Writing to: fpl_bronze_dev.players_raw_19_20
Written Delta table: players_raw_19_20
Writing to: fpl_bronze_dev.player_gameweek_stats_20_21
Written Delta table: player_gameweek_stats_20_21
Writing to: fpl_bronze_dev.players_raw_20_21
Written Delta table: players_raw_20_21
Writing to: fpl_bronze_dev.player_gameweek_stats_21_22
Writt