In [0]:
from typing import Optional

from pyspark.sql import DataFrame, SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType

# Functions

In [0]:
def write_to_table(
    df: DataFrame,
    table_name: str,
    mode: str = "overwrite",
    merge_schema: bool = True,
    partition_by: Optional[list[str]] = None,
    path: Optional[str] = None,
    save_as_table: bool = True
) -> None:
    """
    Generalised Delta write helper.

    A ``last_updated`` column holding the current timestamp is added to (or
    replaces the existing column of that name in) the DataFrame before it is
    written.

    Parameters:
    - df (DataFrame): Spark DataFrame to write.
    - table_name (str): Name of the Delta table (used if save_as_table=True).
    - mode (str): Write mode ('overwrite', 'append', 'ignore', 'error', etc.).
    - merge_schema (bool): If True, sets the ``mergeSchema`` option. If False
      and the mode is overwrite, sets ``overwriteSchema`` instead.
    - partition_by (list[str], optional): Columns to partition by. A single
      column name given as a plain string is also accepted.
    - path (str, optional): Path to save the Delta data (used only if
      save_as_table=False; ignored otherwise).
    - save_as_table (bool): If True, writes via saveAsTable(table_name);
      otherwise writes to ``path``.

    Raises:
    - ValueError: If save_as_table is False and no path is provided.
    """
    # Validate the destination first so a bad call fails before any Spark
    # plan or writer is built.
    if not save_as_table and not path:
        raise ValueError("Either save_as_table must be True or a path must be provided.")

    # A bare string would be star-unpacked into individual characters below,
    # so normalise it to a one-element list.
    partition_cols = [partition_by] if isinstance(partition_by, str) else partition_by

    df_with_ts = df.withColumn("last_updated", F.current_timestamp())

    writer = df_with_ts.write.format("delta").mode(mode)

    if merge_schema:
        writer = writer.option("mergeSchema", "true")
    elif isinstance(mode, str) and mode.lower() == "overwrite":
        # Spark accepts mode strings case-insensitively, so compare the same way.
        writer = writer.option("overwriteSchema", "true")

    if partition_cols:
        writer = writer.partitionBy(*partition_cols)

    if save_as_table:
        writer.saveAsTable(table_name)
    else:
        writer.save(path)

In [0]:
def read_csv_to_df(
    spark: SparkSession,
    path: str,
    header: bool = True,
    infer_schema: bool = True,
    schema: Optional[StructType] = None,
    delimiter: str = ",",
    encoding: str = "UTF-8",
    quote: str = '"',
    escape: str = "\\",
    null_value: Optional[str] = None,
    date_format: Optional[str] = None,
    timestamp_format: Optional[str] = None
) -> DataFrame:
    """
    Generalised CSV reader for PySpark.

    Parameters:
    - spark (SparkSession): Active Spark session.
    - path (str): Path to the CSV file.
    - header (bool): Whether the CSV has a header row.
    - infer_schema (bool): Whether to infer the schema automatically.
      Ignored when ``schema`` is provided.
    - schema (StructType, optional): Explicit schema to apply. Takes
      precedence over ``infer_schema``.
    - delimiter (str): Field delimiter (default: ',').
    - encoding (str): File encoding (default: 'UTF-8').
    - quote (str): Quote character (default: '"').
    - escape (str): Escape character (default: a single backslash).
    - null_value (str, optional): String to interpret as null.
    - date_format (str, optional): Format for date columns.
    - timestamp_format (str, optional): Format for timestamp columns.

    Returns:
    - DataFrame: Loaded Spark DataFrame.
    """
    reader = (
        spark.read
        .option("header", str(header).lower())
        .option("delimiter", delimiter)
        .option("encoding", encoding)
        .option("quote", quote)
        .option("escape", escape)
    )

    # Compare against None: an empty string is a legitimate null marker and
    # must not be mistaken for "not provided".
    if null_value is not None:
        reader = reader.option("nullValue", null_value)
    # An empty format pattern is never meaningful, so truthiness is fine here.
    if date_format:
        reader = reader.option("dateFormat", date_format)
    if timestamp_format:
        reader = reader.option("timestampFormat", timestamp_format)

    # StructType defines __len__, so a truthiness test would treat an
    # explicitly supplied empty schema as missing; test identity instead.
    if schema is not None:
        return reader.schema(schema).csv(path)

    if infer_schema:
        reader = reader.option("inferSchema", "true")
    return reader.csv(path)

In [0]:
# Schema names used to qualify the bronze and silver table names.
# NOTE(review): the "_dev" suffix presumably marks a development environment — confirm.
BRONZE_SCHEMA = "fpl_bronze_dev"
SILVER_SCHEMA = "fpl_silver_dev"

# Location of the raw teams CSV that the ingestion cell reads.
teams_csv_path = "/Volumes/workspace/fpl_raw/player_data/fpl_teams.csv"

In [0]:
# Load the teams CSV with a header row, letting Spark infer column types.
teams_df = read_csv_to_df(spark=spark, path=teams_csv_path, header=True, infer_schema=True)

# Bronze: write the data as read, overwriting the existing table.
write_to_table(
    df=teams_df,
    table_name=f"{BRONZE_SCHEMA}.teams",
    mode="overwrite",
    merge_schema=False,
)

# Silver: cast every column to its expected type, keeping the column names.
silver_teams_df = teams_df.select(
    *(
        F.col(column_name).cast(target_type).alias(column_name)
        for column_name, target_type in (
            ("season_key", "int"),
            ("team_code", "int"),
            ("team_id", "int"),
            ("team_name", "string"),
            ("team_name_short", "string"),
            ("is_promoted", "boolean"),
            ("is_relegated", "boolean"),
        )
    )
)

write_to_table(
    df=silver_teams_df,
    table_name=f"{SILVER_SCHEMA}.teams",
    mode="overwrite",
    merge_schema=False,
)