# Ingest JSON from Raw to Bronze

Ingest bootstrap-static and fixtures from manually uploaded JSON files

Write the results to bronze-layer Delta tables in the `fpl_bronze_<ENV>` schema.

In [0]:
import pyspark.sql.functions as F
from delta.tables import DeltaTable

In [0]:
# Target environment; used as the suffix of the bronze schema name.
ENV = "test"
# Schema that receives the bronze tables written by this notebook.
BRONZE_SCHEMA = f"fpl_bronze_{ENV}"
# Root folder containing the raw JSON snapshot sub-folders; the newest one is read.
BASE_RAW_JSON_PATH = "/Volumes/workspace/fpl_raw/raw_json/"
# Run-level load protocol passed to ingest_entity: "HIST" overwrites every
# entity's table, "INCR" merges only the entities whose own protocol is "INCR".
PROTOCOL = "INCR"

# Functions to Modularise

In [0]:
def merge_to_table(
    df,
    table_name,
    merge_condition,
    spark,
    partition_by=None
):
    """
    Upsert a DataFrame into a Delta table, creating the table on first use.

    Parameters:
    - df: Spark DataFrame holding the incoming rows.
    - table_name: Name of the Delta table to merge into.
    - merge_condition: SQL condition matching "target" rows to "source" rows.
    - spark: Active SparkSession, used for the catalog lookup and table handle.
    - partition_by: Optional list of partition columns; only used when the
      table does not exist yet and is created via write_to_table.
    """
    # Stamp every incoming row with the load time before it reaches the table.
    stamped = df.withColumn("last_updated", F.current_timestamp())

    table_exists = spark.catalog.tableExists(table_name)
    if not table_exists:
        # Nothing to merge into yet: the first load is a plain write.
        write_to_table(df=stamped, table_name=table_name, partition_by=partition_by)
        return

    target = DeltaTable.forName(spark, table_name).alias("target")
    upsert = target.merge(source=stamped.alias("source"), condition=merge_condition)
    # Matched rows are updated in full, unmatched rows are inserted.
    upsert.whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()

In [0]:
def read_latest_raw_json(base_path, filename):
    """
    Read a JSON file from the most recent snapshot folder under base_path.

    The "latest" folder is the entry whose name sorts last lexicographically,
    so folder names are expected to sort chronologically (e.g. 2025_26_gw_07).
    Relies on the notebook-provided globals `dbutils` and `spark`.

    Parameters:
    - base_path: Folder whose children are the snapshot folders.
    - filename: Name of the JSON file to read inside the latest folder.

    Returns:
    - Spark DataFrame parsed from the file with the multiline JSON option.

    Raises:
    - FileNotFoundError: if base_path contains no entries.
    """
    entries = dbutils.fs.ls(base_path)
    if not entries:
        # Fail with a clear message instead of an IndexError on an empty listing.
        raise FileNotFoundError(f"No snapshot folders found under {base_path}")

    latest_folder = max(entries, key=lambda entry: entry.name).path
    print("Loading folder: ", latest_folder)

    # Listed folder paths already end with "/"; strip it so the joined path
    # does not contain a double slash.
    folder = latest_folder.rstrip("/")
    return spark.read.option("multiline", "true").json(f"{folder}/{filename}")

In [0]:
def write_to_table(
        df,
        table_name,
        mode="overwrite",
        merge_schema=True,
        partition_by=None,
        path=None,
        save_as_table=True
    ):
    """
    Write a DataFrame to Delta, either as a named table or to a path.

    A "last_updated" column holding the current timestamp is added (or
    replaced) on every write.

    Parameters:
    - df: Spark DataFrame to write.
    - table_name: Name of the Delta table (used if save_as_table=True).
    - mode: Write mode passed to the writer ('overwrite', 'append', ...).
    - merge_schema: If True, sets the "mergeSchema" option. If False and
      mode is 'overwrite', sets "overwriteSchema" instead.
    - partition_by: Optional list of columns to partition by.
    - path: Target path (used only if save_as_table=False).
    - save_as_table: If True, writes with saveAsTable; otherwise saves to path.

    Raises:
    - ValueError: if save_as_table is False and no path is given.
    """
    stamped = df.withColumn("last_updated", F.current_timestamp())
    writer = stamped.write.format("delta").mode(mode)

    # At most one schema-handling option applies; mergeSchema takes precedence.
    schema_option = None
    if merge_schema:
        schema_option = "mergeSchema"
    elif mode == "overwrite":
        schema_option = "overwriteSchema"
    if schema_option is not None:
        writer = writer.option(schema_option, "true")

    if partition_by:
        writer = writer.partitionBy(*partition_by)

    if save_as_table:
        writer.saveAsTable(table_name)
        return

    if not path:
        raise ValueError("Either save_as_table must be True or a path must be provided.")
    writer.save(path)

In [0]:
def detect_schema_drift(new_df, table_name, spark):
    """
    Report column-name differences between new_df and an existing table.

    Only top-level column names are compared (not types); the
    "last_updated" audit column is ignored on both sides. Findings are
    printed, not raised.

    Parameters:
    - new_df: Spark DataFrame about to be written.
    - table_name: Name of the table to compare against.
    - spark: Active SparkSession used to look up and read the table.

    Returns:
    - True if columns were added or removed, False if the column sets match
      or the table does not exist yet.

    Errors raised while reading an existing table propagate to the caller
    instead of being reported as a missing table.
    """
    # Explicit existence check (same call merge_to_table uses) rather than
    # treating every exception as "table not found".
    if not spark.catalog.tableExists(table_name):
        print(f"ℹNo existing table found for {table_name}. Assuming first write.")
        return False

    def _column_names(frame):
        # Column names of a DataFrame, excluding the audit timestamp.
        return {field.name for field in frame.schema.fields if field.name != "last_updated"}

    existing_fields = _column_names(spark.table(table_name))
    new_fields = _column_names(new_df)

    added = new_fields - existing_fields
    removed = existing_fields - new_fields

    if not (added or removed):
        return False

    print(f"Schema drift detected in {table_name}")
    if added:
        print(f"Added fields: {added}")
    if removed:
        print(f"Removed fields: {removed}")
    return True

In [0]:
def ingest_entity(
    entity_config,
    bronze_schema,
    protocol,
    spark
):
    """
    Extract one entity from its source DataFrame and load it into bronze.

    Parameters:
    - entity_config: Dict describing the entity. Required keys: "name",
      "df", "protocol". Optional keys: "path", "explode", "alias",
      "merge_key".
    - bronze_schema: Schema that holds the target table "<schema>.<name>".
    - protocol: Run-level protocol. "HIST" overwrites the table for every
      entity. "INCR" merges only entities whose own protocol is "INCR";
      all other combinations write nothing.
    - spark: Active SparkSession.

    Raises:
    - KeyError: if a required config key is missing.
    - ValueError: if "explode" is set with a path but no "alias", or if the
      entity is about to be merged and has no "merge_key".
    """
    name = entity_config["name"]
    df = entity_config["df"]
    path = entity_config.get("path")
    explode = entity_config.get("explode", False)
    alias = entity_config.get("alias")
    merge_key = entity_config.get("merge_key")
    entity_protocol = entity_config["protocol"]

    table_name = f"{bronze_schema}.{name}"
    will_merge = protocol == "INCR" and entity_protocol == "INCR"

    # Validate before any Spark work so a bad config fails with a clear message
    # instead of a merge condition like "target.None = source.None".
    if explode and path and not alias:
        raise ValueError(f"Entity '{name}' sets explode=True but has no 'alias'.")
    if will_merge and not merge_key:
        raise ValueError(f"Entity '{name}' is incremental but has no 'merge_key'.")

    # Extract and transform
    if explode and path:
        # One row per array element, with the element's fields as columns.
        entity_df = df.select(F.explode(path).alias(alias)).select(f"{alias}.*")
    elif path:
        # NOTE(review): this selects the path as-is, so a struct path stays a
        # single struct column - confirm that is intended rather than "<path>.*".
        entity_df = df.select(path)
    else:
        entity_df = df

    # Detect schema drift (advisory only: the result is printed, not acted on)
    detect_schema_drift(
        new_df=entity_df,
        table_name=table_name,
        spark=spark
    )

    # Write or merge - only historic if protocol is HIST
    if protocol == "HIST":
        write_to_table(
            df=entity_df,
            table_name=table_name
        )
        print(f"[HIST] {name} written to {bronze_schema}.{name}.")
    elif will_merge:
        merge_to_table(
            df=entity_df,
            table_name=table_name,
            merge_condition=f"target.{merge_key} = source.{merge_key}",
            spark=spark
        )
        print(f"[INCR] {name} merged to {bronze_schema}.{name}.")

    

# Ingest Raw JSON files

bootstrap-static is the core dataset. Its top-level keys are:

- events: Basic information of every Gameweek such as average score, highest score, top scoring player, most captained, etc. Loaded incrementally.
- game_settings: The game settings and rules. 
- phases: Phases of FPL season. 
- teams: Basic information of current Premier League clubs.
- total_players: Total FPL players.
- elements: Information of all Premier League players including points, status, value, match stats (goals, assists, etc.), ICT index, etc. Loaded incrementally.
- element_types: Basic information about player’s position (GK, DEF, MID, FWD).
- chips: All chips available in FPL.
- game_config: scoring and game setup rules.


fixtures contains all data about the season's fixtures. It needs to be loaded incrementally because fixtures are often rescheduled due to clashes or TV broadcast changes.

In [0]:
# Each call independently resolves the newest snapshot folder under
# BASE_RAW_JSON_PATH and reads the named file from it.
bootstrap_static_df = read_latest_raw_json(BASE_RAW_JSON_PATH, "bootstrap_static.json")
fixtures_df = read_latest_raw_json(BASE_RAW_JSON_PATH, "fixtures.json")

Loading folder:  dbfs:/Volumes/workspace/fpl_raw/raw_json/2025_26_gw_07/
Loading folder:  dbfs:/Volumes/workspace/fpl_raw/raw_json/2025_26_gw_07/


In [0]:
# Print the schema Spark inferred for the bootstrap-static JSON.
bootstrap_static_df.printSchema()

root
 |-- chips: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- chip_type: string (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- number: long (nullable = true)
 |    |    |-- overrides: struct (nullable = true)
 |    |    |    |-- element_types: array (nullable = true)
 |    |    |    |    |-- element: string (containsNull = true)
 |    |    |    |-- pick_multiplier: string (nullable = true)
 |    |    |    |-- rules: struct (nullable = true)
 |    |    |    |    |-- squad_squadsize: long (nullable = true)
 |    |    |-- start_event: long (nullable = true)
 |    |    |-- stop_event: long (nullable = true)
 |-- element_stats: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- label: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- element_types: array (nullable = true)
 |    |-- element: struct (containsNull = tr

In [0]:
# Print the schema Spark inferred for the fixtures JSON.
fixtures_df.printSchema()



In [0]:
# One dict per bronze table, consumed by ingest_entity. Keys:
#   name      - target table name inside the bronze schema
#   df        - source DataFrame to extract the entity from
#   path      - column/field to select from df (None: use df as-is)
#   explode   - True to explode the array at `path` into one row per element
#   alias     - temporary column name for the exploded element (explode only)
#   protocol  - "HIST" (overwritten on HIST runs only) or "INCR" (merged on INCR runs)
#   merge_key - column used to match target and source rows when merging
ENTITY_CONFIG = [
    # Entities that are only written on a HIST run.
    {
        "name": "chips",
        "df": bootstrap_static_df,
        "path": "chips",
        "explode": True,
        "alias": "chip",
        "protocol": "HIST"
    },
    {
        "name": "element_stats",
        "df": bootstrap_static_df,
        "path": "element_stats",
        "explode": True,
        "alias": "stat",
        "protocol": "HIST"
    },
    {
        "name": "element_types",
        "df": bootstrap_static_df,
        "path": "element_types",
        "explode": True,
        "alias": "type",
        "protocol": "HIST"
    },
    # Nested fields selected directly, without exploding.
    {
        "name": "game_config_scoring",
        "df": bootstrap_static_df,
        "path": "game_config.scoring",
        "explode": False,
        "protocol": "HIST"
    },
    {
        "name": "game_config_rules",
        "df": bootstrap_static_df,
        "path": "game_config.rules",
        "explode": False,
        "protocol": "HIST"
    },
    {
        "name": "phases",
        "df": bootstrap_static_df,
        "path": "phases",
        "explode": True,
        "alias": "phase",
        "protocol": "HIST"
    },
    {
        "name": "teams",
        "df": bootstrap_static_df,
        "path": "teams",
        "explode": True,
        "alias": "team",
        "protocol": "HIST"
    },
    # Incremental entities: merged on "id" during INCR runs.
    {
        "name": "elements",
        "df": bootstrap_static_df,
        "path": "elements",
        "explode": True,
        "alias": "player",
        "protocol": "INCR",
        "merge_key": "id"
    },
    {
        "name": "events",
        "df": bootstrap_static_df,
        "path": "events",
        "explode": True,
        "alias": "event",
        "protocol": "INCR",
        "merge_key": "id"
    },
    # Fixtures come from their own file and are used without selecting a path.
    {
        "name": "fixtures",
        "df": fixtures_df,
        "path": None,
        "explode": False,
        "protocol": "INCR",
        "merge_key": "id"
    }

]

In [0]:
# Load every configured entity in order; PROTOCOL decides whether each one is
# overwritten, merged, or left untouched.
for entity_cfg in ENTITY_CONFIG:
    ingest_entity(entity_cfg, BRONZE_SCHEMA, PROTOCOL, spark)

[INCR] elements merged to fpl_bronze_test.elements.
[INCR] events merged to fpl_bronze_test.events.
[INCR] fixtures merged to fpl_bronze_test.fixtures.


# Next Steps:

Modularise the functions into a .py module.

Add unit tests for the modularised functions.

Set up CI/CD to deploy to the test environment.

Set up CI/CD to deploy to the prod environment.