# Ingest JSON from Raw to Bronze

Ingest bootstrap-static and fixtures from manually uploaded JSON files

Write the results to bronze-layer Delta tables in the environment-specific `fpl_bronze_{ENV}` schema.

In [0]:
import pyspark.sql.functions as F
from delta.tables import DeltaTable
from pyspark.sql import DataFrame, SparkSession

In [0]:
def read_latest_raw_json(
        base_path: str, 
        filename: str, 
        spark: SparkSession, 
        utils) -> DataFrame:
    """
    Reads a raw JSON file from the latest folder under a base path.

    "Latest" is the entry whose name sorts last lexicographically, so folder
    names are expected to sort in chronological order (e.g. zero-padded
    gameweek folders such as gw_07).

    Parameters:
    - base_path (str): Base path where folders are stored.
    - filename (str): Name of the JSON file to read.
    - spark (SparkSession): Active Spark session.
    - utils: utility object exposing fs.ls (databricks or fabric).

    Returns:
    - DataFrame: Parsed Spark DataFrame from the latest folder.

    Raises:
    - FileNotFoundError: If base_path contains no entries.
    """
    folders = utils.fs.ls(base_path)
    if not folders:
        raise FileNotFoundError(
            f"No folders found under {base_path}; cannot locate {filename}."
        )

    # max() picks the same entry as a descending sort followed by [0],
    # without sorting the whole listing.
    latest_folder = max(folders, key=lambda entry: entry.name).path
    print(f"Loading folder: {latest_folder}")

    # Listed folder paths can already end with "/", so strip it before
    # joining to avoid a "//" in the file path.
    file_path = f"{latest_folder.rstrip('/')}/{filename}"
    return spark.read.option("multiline", "true").json(file_path)

def ingest_entity(
    entity_config: dict,
    bronze_schema: str,
    protocol: str,
    season: str,
    spark: SparkSession
) -> None:
    """
    Ingests a single entity into the bronze layer.

    With the global protocol 'HIST' every entity is fully rewritten. With
    'INCR' only entities whose own protocol is also 'INCR' are merged; all
    other entities are skipped.

    Parameters:
    - entity_config (dict): Configuration for the entity.
    - bronze_schema (str): Target schema name.
    - protocol (str): Global ingestion protocol ('HIST' or 'INCR').
    - season (str): current season suffix for tables
    - spark (SparkSession): Active Spark session.

    The entity_config must include:
    - name (str): Entity name.
    - df (DataFrame): Source DataFrame.
    - protocol (str): Entity-specific protocol.
    Optional keys:
    - path (str): Column to select or explode.
    - explode (bool): Whether to explode the path.
    - alias (str): Alias for exploded column (defaults to "item").
    - merge_key (str): Key to use for merge condition (required for merges).

    Raises:
    - ValueError: If a merge is requested but merge_key is missing.
    """
    name = entity_config["name"] + "_" + season
    table_name = f"{bronze_schema}.{name}"
    df = entity_config["df"]
    path = entity_config.get("path")
    explode = entity_config.get("explode", False)
    # The alias only names the intermediate exploded column before it is
    # flattened with ".*", so a generic fallback is safe.
    alias = entity_config.get("alias") or "item"
    merge_key = entity_config.get("merge_key")
    entity_protocol = entity_config["protocol"]

    is_full_load = protocol == "HIST"
    is_merge = protocol == "INCR" and entity_protocol == "INCR"

    if not (is_full_load or is_merge):
        # e.g. a static (HIST) entity during an incremental run.
        print(
            f"[SKIP] {name} not ingested "
            f"(protocol={protocol}, entity protocol={entity_protocol})."
        )
        return

    if is_merge and not merge_key:
        # Without a key the condition would become "target.None = source.None".
        raise ValueError(
            f"Entity '{entity_config['name']}' uses protocol 'INCR' "
            "but has no 'merge_key' configured."
        )

    # Extract and transform
    if explode and path:
        entity_df = df.select(F.explode(path).alias(alias)).select(f"{alias}.*")
    elif path:
        entity_df = df.select(path)
    else:
        entity_df = df

    # NOTE: schema-drift detection (detect_schema_drift) is not invoked here.

    # Write or merge
    if is_full_load:
        write_to_table(
            df=entity_df,
            table_name=table_name
        )
        print(f"[HIST] {name} written to {bronze_schema}.{name}.")
    else:
        merge_to_table(
            df=entity_df,
            table_name=table_name,
            merge_condition=f"target.{merge_key} = source.{merge_key}",
            spark=spark
        )
        print(f"[INCR] {name} merged to {bronze_schema}.{name}.")


In [0]:
def write_to_table(
    df: DataFrame,
    table_name: str,
    mode: str = "overwrite",
    merge_schema: bool = True,
    partition_by: list[str] = None,
    path: str = None,
    save_as_table: bool = True
) -> None:
    """
    Delta write helper for the bronze layer.

    A "last_updated" column holding the current timestamp is added to the
    data before it is written.

    Parameters:
    - df (DataFrame): Spark DataFrame to write.
    - table_name (str): Delta table name (used when save_as_table=True).
    - mode (str): Write mode ('overwrite', 'append', 'ignore', 'error', etc.).
    - merge_schema (bool): If True, set the mergeSchema option. If False and
      mode is 'overwrite', set overwriteSchema instead.
    - partition_by (list[str], optional): Columns to partition by.
    - path (str, optional): Destination path (used when save_as_table=False).
    - save_as_table (bool): If True, write with saveAsTable; else save to path.

    Raises:
    - ValueError: If save_as_table is False and no path is given.
    """
    stamped = df.withColumn("last_updated", F.current_timestamp())
    delta_writer = stamped.write.format("delta").mode(mode)

    # At most one schema option applies; mergeSchema takes precedence.
    schema_option = None
    if merge_schema:
        schema_option = "mergeSchema"
    elif mode == "overwrite":
        schema_option = "overwriteSchema"
    if schema_option is not None:
        delta_writer = delta_writer.option(schema_option, "true")

    if partition_by:
        delta_writer = delta_writer.partitionBy(*partition_by)

    # Destination: a named table wins over a path.
    if save_as_table:
        delta_writer.saveAsTable(table_name)
        return
    if not path:
        raise ValueError("Either save_as_table must be True or a path must be provided.")
    delta_writer.save(path)

def detect_schema_drift(new_df: DataFrame, table_name: str, spark: SparkSession) -> bool:
    """
    Detects schema drift between a new DataFrame and an existing Delta table.

    Only top-level column names are compared (types are not), and the
    "last_updated" audit column is ignored on both sides.

    Parameters:
    - new_df (DataFrame): The new DataFrame to compare.
    - table_name (str): The name of the existing Delta table.
    - spark (SparkSession): The active Spark session.

    Returns:
    - bool: True if schema drift is detected, False otherwise (including
      when the table does not exist yet).
    """
    # Check existence explicitly so that real failures (permissions, bad
    # input, ...) surface instead of being reported as "no existing table".
    if not spark.catalog.tableExists(table_name):
        print(f"ℹ No existing table found for {table_name}. Assuming first write.")
        return False

    audit_column = "last_updated"
    existing_fields = {
        field.name
        for field in spark.table(table_name).schema.fields
        if field.name != audit_column
    }
    new_fields = {
        field.name
        for field in new_df.schema.fields
        if field.name != audit_column
    }

    added = new_fields - existing_fields
    removed = existing_fields - new_fields

    if not (added or removed):
        return False

    # Sorted so the log output is stable between runs.
    print(f"Schema drift detected in {table_name}")
    if added:
        print(f"Added fields: {sorted(added)}")
    if removed:
        print(f"Removed fields: {sorted(removed)}")
    return True

def merge_to_table(
    df: DataFrame,
    table_name: str,
    merge_condition: str,
    spark: SparkSession,
    partition_by: list[str] = None
) -> None:
    """
    Performs an upsert (merge) into a Delta table.

    Matched rows are updated with all source columns, unmatched source rows
    are inserted. Merged rows carry a "last_updated" column set to the
    current timestamp.

    Parameters:
    - df (DataFrame): Incoming DataFrame to merge.
    - table_name (str): Target Delta table name.
    - merge_condition (str): SQL condition for matching rows; the table is
      aliased as "target" and the incoming data as "source".
    - spark (SparkSession): Active Spark session.
    - partition_by (list[str], optional): Columns to partition by on initial write.

    If the table does not exist, it will be created using write_to_table.
    """
    if not spark.catalog.tableExists(table_name):
        # First load: write_to_table adds last_updated itself, so pass the
        # raw DataFrame rather than stamping the column twice.
        write_to_table(
            df=df,
            table_name=table_name,
            partition_by=partition_by
        )
        return

    df_with_ts = df.withColumn("last_updated", F.current_timestamp())
    delta_table = DeltaTable.forName(spark, table_name)
    (
        delta_table.alias("target")
        .merge(
            source=df_with_ts.alias("source"),
            condition=merge_condition
        )
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )

In [0]:
def _widget_or_default(widget_name: str, fallback: str) -> str:
    """Return the notebook widget's value, or the fallback if it cannot be read."""
    try:
        return dbutils.widgets.get(widget_name)
    except Exception:
        # Any lookup failure falls back to the default value.
        return fallback


ENV = _widget_or_default("ENV", "dev")
PROTOCOL = _widget_or_default("PROTOCOL", "HIST")

# Allowed values for the two run parameters.
valid_envs = {"dev", "test", "prod"}
valid_protocols = {"HIST", "INCR"}

# Stop the notebook if the environment is not recognised.
if not (ENV in valid_envs):
    print(f"Invalid ENV: {ENV}. Must be one of {valid_envs}. Exiting notebook.")
    dbutils.notebook.exit("Invalid ENV")

# Stop the notebook if the protocol is not recognised.
if not (PROTOCOL in valid_protocols):
    print(f"Invalid PROTOCOL: {PROTOCOL}. Must be one of {valid_protocols}. Exiting notebook.")
    dbutils.notebook.exit("Invalid PROTOCOL")

In [0]:
# Season identifiers: the full form names the raw folder; the short form
# drops the first two characters ("25_26") and is used as the table suffix.
CURRENT_SEASON = "2025_26"
CURRENT_SEASON_SHORT = CURRENT_SEASON[2:]

# Raw JSON location for the season and the environment-specific target schema.
BASE_RAW_JSON_PATH = f"/Volumes/workspace/fpl_raw/player_data/{CURRENT_SEASON}"
BRONZE_SCHEMA = f"fpl_bronze_{ENV}"

# Ingest Raw JSON files

bootstrap-static is the core dataset. Its top-level keys are:

- events: Basic information about every Gameweek, such as average score, highest score, top-scoring player, most-captained player, etc. Loaded incrementally.
- game_settings: The game settings and rules. 
- phases: Phases of FPL season. 
- teams: Basic information of current Premier League clubs.
- total_players: Total FPL players.
- elements: Information about all Premier League players, including points, status, value, match stats (goals, assists, etc.), ICT index, etc. Loaded incrementally.
- element_types: Basic information about player’s position (GK, DEF, MID, FWD).
- chips: All chips available in FPL.
- game_config: scoring and game setup rules.


fixtures contains all data about fixtures for the season. It needs to be incrementally loaded as fixtures change often due to clashes/TV viewing changes.

Data is ingested weekly (after the end of each gameweek). Tables are written as `{table_name}_{season}`, e.g. `events_25_26`, so that future seasons can be ingested into their own tables.

In [0]:
# Both files are read from the latest folder under the same base path,
# so the common arguments are defined once.
_raw_reader_kwargs = {
    "base_path": BASE_RAW_JSON_PATH,
    "spark": spark,
    "utils": dbutils,
}

bootstrap_static_df = read_latest_raw_json(
    filename="bootstrap_static.json", **_raw_reader_kwargs
)

fixtures_df = read_latest_raw_json(
    filename="fixtures.json", **_raw_reader_kwargs
)

Loading folder: dbfs:/Volumes/workspace/fpl_raw/player_data/2025_26/gw_07/
Loading folder: dbfs:/Volumes/workspace/fpl_raw/player_data/2025_26/gw_07/


In [0]:
# Print the inferred schema of the bootstrap-static data for inspection.
bootstrap_static_df.printSchema()

root
 |-- chips: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- chip_type: string (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- number: long (nullable = true)
 |    |    |-- overrides: struct (nullable = true)
 |    |    |    |-- element_types: array (nullable = true)
 |    |    |    |    |-- element: string (containsNull = true)
 |    |    |    |-- pick_multiplier: string (nullable = true)
 |    |    |    |-- rules: struct (nullable = true)
 |    |    |    |    |-- squad_squadsize: long (nullable = true)
 |    |    |-- start_event: long (nullable = true)
 |    |    |-- stop_event: long (nullable = true)
 |-- element_stats: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- label: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- element_types: array (nullable = true)
 |    |-- element: struct (containsNull = tr

In [0]:
# Print the inferred schema of the fixtures data for inspection.
fixtures_df.printSchema()

root
 |-- code: long (nullable = true)
 |-- event: long (nullable = true)
 |-- finished: boolean (nullable = true)
 |-- finished_provisional: boolean (nullable = true)
 |-- id: long (nullable = true)
 |-- kickoff_time: string (nullable = true)
 |-- minutes: long (nullable = true)
 |-- provisional_start_time: boolean (nullable = true)
 |-- pulse_id: long (nullable = true)
 |-- started: boolean (nullable = true)
 |-- stats: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- a: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- element: long (nullable = true)
 |    |    |    |    |-- value: long (nullable = true)
 |    |    |-- h: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- element: long (nullable = true)
 |    |    |    |    |-- value: long (nullable = true)
 |    |    |-- identifier: string (nullable = true)
 |-- team_a: long (nullabl

In [0]:
# One entry per bronze table. Keys are those read by ingest_entity:
#   name      - table name (the season suffix is appended at ingest time)
#   df        - source DataFrame
#   path      - column to select or explode (None = use the DataFrame as is)
#   explode   - whether to explode the column at `path` into one row per item
#   alias     - alias for the exploded column
#   protocol  - entity-level protocol ("HIST" or "INCR")
#   merge_key - key column used to build the merge condition for "INCR" entities
ENTITY_CONFIG = [
    # --- Entities only written on a full ("HIST") run ---
    {
        "name": "chips",
        "df": bootstrap_static_df,
        "path": "chips",
        "explode": True,
        "alias": "chip",
        "protocol": "HIST",
    },
    {
        "name": "element_stats",
        "df": bootstrap_static_df,
        "path": "element_stats",
        "explode": True,
        "alias": "stat",
        "protocol": "HIST",
    },
    {
        "name": "element_types",
        "df": bootstrap_static_df,
        "path": "element_types",
        "explode": True,
        "alias": "type",
        "protocol": "HIST",
    },
    {
        "name": "game_config_scoring",
        "df": bootstrap_static_df,
        "path": "game_config.scoring",
        "explode": False,
        "protocol": "HIST",
    },
    {
        "name": "game_config_rules",
        "df": bootstrap_static_df,
        "path": "game_config.rules",
        "explode": False,
        "protocol": "HIST",
    },
    {
        "name": "phases",
        "df": bootstrap_static_df,
        "path": "phases",
        "explode": True,
        "alias": "phase",
        "protocol": "HIST",
    },
    {
        "name": "teams",
        "df": bootstrap_static_df,
        "path": "teams",
        "explode": True,
        "alias": "team",
        "protocol": "HIST",
    },
    # --- Entities merged on "id" during incremental ("INCR") runs ---
    {
        "name": "elements",
        "df": bootstrap_static_df,
        "path": "elements",
        "explode": True,
        "alias": "player",
        "protocol": "INCR",
        "merge_key": "id",
    },
    {
        "name": "events",
        "df": bootstrap_static_df,
        "path": "events",
        "explode": True,
        "alias": "event",
        "protocol": "INCR",
        "merge_key": "id",
    },
    {
        "name": "fixtures",
        "df": fixtures_df,
        "path": None,
        "explode": False,
        "protocol": "INCR",
        "merge_key": "id",
    },
]

In [0]:
# Ingest every configured entity in order, using the run-level settings.
for entity in ENTITY_CONFIG:
    ingest_entity(entity, BRONZE_SCHEMA, PROTOCOL, CURRENT_SEASON_SHORT, spark)

[HIST] chips_25_26 written to fpl_bronze_dev.chips_25_26.
[HIST] element_stats_25_26 written to fpl_bronze_dev.element_stats_25_26.
[HIST] element_types_25_26 written to fpl_bronze_dev.element_types_25_26.
[HIST] game_config_scoring_25_26 written to fpl_bronze_dev.game_config_scoring_25_26.
[HIST] game_config_rules_25_26 written to fpl_bronze_dev.game_config_rules_25_26.
[HIST] phases_25_26 written to fpl_bronze_dev.phases_25_26.
[HIST] teams_25_26 written to fpl_bronze_dev.teams_25_26.
[HIST] elements_25_26 written to fpl_bronze_dev.elements_25_26.
[HIST] events_25_26 written to fpl_bronze_dev.events_25_26.
[HIST] fixtures_25_26 written to fpl_bronze_dev.fixtures_25_26.
