# Ingest JSON from Raw to Bronze

Ingest the bootstrap-static and fixtures data from manually uploaded JSON files.

Write the results to bronze-layer Delta tables in the `fpl_bronze_{ENV}` schema.

In [0]:
import pyspark.sql.functions as F
from delta.tables import DeltaTable

In [0]:
# Environment name; used as the suffix of the bronze schema.
ENV = "test"
# Schema that receives every bronze table written by this notebook.
BRONZE_SCHEMA = f"fpl_bronze_{ENV}"
# Folder that is listed to find the latest raw JSON sub-folder.
BASE_RAW_JSON_PATH = "/Volumes/workspace/fpl_raw/raw_json/"
# Load protocol: "HIST" overwrites every bronze table; "INCR" merges elements,
# events and fixtures on id and leaves the other tables untouched.
PROTOCOL = "INCR"

# Fail fast on a typo: the load cells branch on PROTOCOL with if/elif and no
# else, so an unrecognised value would otherwise skip every write silently.
if PROTOCOL not in ("HIST", "INCR"):
    raise ValueError(f"Unsupported PROTOCOL {PROTOCOL!r}; expected 'HIST' or 'INCR'.")

# Functions to Modularise

In [0]:
def merge_to_table(
    df,
    table_name,
    merge_condition,
    spark,
    partition_by=None
):
    """
    Upsert a DataFrame into a Delta table, creating the table if it is missing.

    Parameters:
    - df: Spark DataFrame holding the incoming rows.
    - table_name: Name of the target Delta table.
    - merge_condition: SQL condition used to match rows; the existing table is
      aliased "target" and the incoming rows "source".
    - spark: SparkSession used for the catalog lookup and the Delta handle.
    - partition_by: Optional partition columns, used only when the table is
      created by the first write.
    """
    stamped_df = df.withColumn("last_updated", F.current_timestamp())

    if spark.catalog.tableExists(table_name):
        target = DeltaTable.forName(spark, table_name).alias("target")
        merge_builder = target.merge(
            source=stamped_df.alias("source"),
            condition=merge_condition,
        )
        # Matched rows take every source column; unmatched source rows are inserted.
        merge_builder.whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()
        return

    # No table yet: the first load is a plain write rather than a merge.
    write_to_table(df=stamped_df, table_name=table_name, partition_by=partition_by)

In [0]:
def read_latest_raw_json(base_path, filename):
    """
    Read a JSON file from the latest sub-folder of base_path.

    "Latest" is the sub-folder whose name sorts last, so folder names are
    expected to sort chronologically. Plain files directly under base_path are
    ignored. Uses the notebook-provided `dbutils` and `spark` globals.

    Parameters:
    - base_path: Folder whose sub-folders hold the raw JSON uploads.
    - filename: Name of the JSON file to read inside the latest sub-folder.

    Returns the file as a Spark DataFrame, read with the multiline option.

    Raises FileNotFoundError if base_path contains no sub-folders.
    """
    folders = [entry for entry in dbutils.fs.ls(base_path) if entry.isDir()]
    if not folders:
        raise FileNotFoundError(f"No raw JSON folders found under {base_path}")

    latest_folder = max(folders, key=lambda entry: entry.name).path
    print("Loading folder: ", latest_folder)

    # Listed directory paths can end with "/" (see the recorded output); strip
    # it so the joined path does not contain "//".
    folder_root = latest_folder.rstrip("/")
    return spark.read.option("multiline", "true").json(f"{folder_root}/{filename}")

In [0]:
def write_to_table(
        df,
        table_name,
        mode="overwrite",
        merge_schema=True,
        partition_by=None,
        path=None,
        save_as_table=True
    ):
    """
    Generalised Delta write helper for bronze layer.

    A "last_updated" column holding the current timestamp is added (or
    replaced) before the write.

    Parameters:
    - df: Spark DataFrame to write.
    - table_name: Name of the Delta table (used if save_as_table=True).
    - mode: Write mode ('overwrite', 'append', 'ignore', 'error', etc.).
    - merge_schema: Whether to merge schema on write. When False and mode is
      'overwrite', the table schema is overwritten instead.
    - partition_by: Optional column name, or list of column names, to
      partition by.
    - path: Optional path to save the Delta table (used if save_as_table=False).
    - save_as_table: If True, saves as managed table; else saves to path.

    Raises ValueError if save_as_table is False and no path is given.
    """
    # Validate the destination first so a bad call fails before any write
    # plan is built.
    if not save_as_table and not path:
        raise ValueError("Either save_as_table must be True or a path must be provided.")

    # A bare string would be unpacked character by character by
    # partitionBy(*partition_by); treat it as a single column name.
    if isinstance(partition_by, str):
        partition_by = [partition_by]

    df_with_ts = df.withColumn("last_updated", F.current_timestamp())

    writer = df_with_ts.write.format("delta").mode(mode)

    if merge_schema:
        writer = writer.option("mergeSchema", "true")
    elif mode == "overwrite":
        writer = writer.option("overwriteSchema", "true")

    if partition_by:
        writer = writer.partitionBy(*partition_by)

    if save_as_table:
        writer.saveAsTable(table_name)
    else:
        writer.save(path)

In [0]:
def detect_schema_drift(new_df, table_name, spark):
    """
    Report column-name differences between new_df and an existing table.

    Only column names are compared, not data types. "last_updated" is ignored
    on both sides because the write helpers add it themselves.

    Parameters:
    - new_df: Spark DataFrame about to be written.
    - table_name: Name of the existing table to compare against.
    - spark: SparkSession used to look up and read the table.

    Returns True if columns were added or removed, False if the column names
    match or the table does not exist yet.

    Errors raised while reading an existing table propagate to the caller
    rather than being reported as a missing table.
    """
    if not spark.catalog.tableExists(table_name):
        print(f"ℹNo existing table found for {table_name}. Assuming first write.")
        return False

    existing_fields = {
        field.name
        for field in spark.table(table_name).schema.fields
        if field.name != "last_updated"
    }
    new_fields = {
        field.name
        for field in new_df.schema.fields
        if field.name != "last_updated"
    }

    added = new_fields - existing_fields
    removed = existing_fields - new_fields

    if not (added or removed):
        return False

    print(f"Schema drift detected in {table_name}")
    if added:
        print(f"Added fields: {added}")
    if removed:
        print(f"Removed fields: {removed}")
    return True

# Bootstrap-static

bootstrap-static is the core dataset. Its top-level fields include:

- events: Basic information of every Gameweek such as average score, highest score, top scoring player, most captained, etc.
- game_settings: The game settings and rules. 
- phases: Phases of FPL season. 
- teams: Basic information of current Premier League clubs.
- total_players: Total FPL players.
- elements: Information of all Premier League players including points, status, value, match stats (goals, assists, etc.), ICT index, etc.
- element_types: Basic information about player’s position (GK, DEF, MID, FWD).
- chips: All chips available in FPL.
- game_config: scoring and game setup rules.


In [0]:
# Load bootstrap_static.json from the latest raw JSON folder.
bootstrap_static_df = read_latest_raw_json(
    base_path=BASE_RAW_JSON_PATH,
    filename="bootstrap_static.json",
)

Loading folder:  dbfs:/Volumes/workspace/fpl_raw/raw_json/2025_26_gw_07/


In [0]:
# Print the schema Spark inferred for the bootstrap-static JSON.
bootstrap_static_df.printSchema()

root
 |-- chips: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- chip_type: string (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- number: long (nullable = true)
 |    |    |-- overrides: struct (nullable = true)
 |    |    |    |-- element_types: array (nullable = true)
 |    |    |    |    |-- element: string (containsNull = true)
 |    |    |    |-- pick_multiplier: string (nullable = true)
 |    |    |    |-- rules: struct (nullable = true)
 |    |    |    |    |-- squad_squadsize: long (nullable = true)
 |    |    |-- start_event: long (nullable = true)
 |    |    |-- stop_event: long (nullable = true)
 |-- element_stats: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- label: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- element_types: array (nullable = true)
 |    |-- element: struct (containsNull = tr

In [0]:
if PROTOCOL == "HIST":
    # Full load only: each table below is checked for column drift and then
    # written with write_to_table's default overwrite mode.

    # Chips
    bronze_table = f"{BRONZE_SCHEMA}.chips"
    chips_df = bootstrap_static_df.select(F.explode("chips").alias("chip")).select("chip.*")
    detect_schema_drift(new_df=chips_df, table_name=bronze_table, spark=spark)
    write_to_table(df=chips_df, table_name=bronze_table)

    # Element Stats
    bronze_table = f"{BRONZE_SCHEMA}.element_stats"
    element_stats_df = bootstrap_static_df.select(F.explode("element_stats").alias("stat")).select("stat.*")
    detect_schema_drift(new_df=element_stats_df, table_name=bronze_table, spark=spark)
    write_to_table(df=element_stats_df, table_name=bronze_table)

    # Element Types
    bronze_table = f"{BRONZE_SCHEMA}.element_types"
    element_types_df = bootstrap_static_df.select(F.explode("element_types").alias("type")).select("type.*")
    detect_schema_drift(new_df=element_types_df, table_name=bronze_table, spark=spark)
    write_to_table(df=element_types_df, table_name=bronze_table)

    # Game Config - Scoring (the nested field is selected as-is, not flattened)
    bronze_table = f"{BRONZE_SCHEMA}.game_config_scoring"
    scoring_df = bootstrap_static_df.select("game_config.scoring")
    detect_schema_drift(new_df=scoring_df, table_name=bronze_table, spark=spark)
    write_to_table(df=scoring_df, table_name=bronze_table)

    # Game Config - Rules (the nested field is selected as-is, not flattened)
    bronze_table = f"{BRONZE_SCHEMA}.game_config_rules"
    rules_df = bootstrap_static_df.select("game_config.rules")
    detect_schema_drift(new_df=rules_df, table_name=bronze_table, spark=spark)
    write_to_table(df=rules_df, table_name=bronze_table)

    # Phases
    bronze_table = f"{BRONZE_SCHEMA}.phases"
    phases_df = bootstrap_static_df.select(F.explode("phases").alias("phase")).select("phase.*")
    detect_schema_drift(new_df=phases_df, table_name=bronze_table, spark=spark)
    write_to_table(df=phases_df, table_name=bronze_table)

    # Teams
    bronze_table = f"{BRONZE_SCHEMA}.teams"
    teams_df = bootstrap_static_df.select(F.explode("teams").alias("team")).select("team.*")
    detect_schema_drift(new_df=teams_df, table_name=bronze_table, spark=spark)
    write_to_table(df=teams_df, table_name=bronze_table)



# Elements: one row per entry of the "elements" array; overwritten on a HIST
# run, merged on id on an INCR run.
elements_table = f"{BRONZE_SCHEMA}.elements"
elements_df = bootstrap_static_df.select(F.explode("elements").alias("player")).select("player.*")
detect_schema_drift(new_df=elements_df, table_name=elements_table, spark=spark)

if PROTOCOL == "HIST":
    write_to_table(df=elements_df, table_name=elements_table)
elif PROTOCOL == "INCR":
    merge_to_table(
        df=elements_df,
        table_name=elements_table,
        merge_condition="target.id = source.id",
        spark=spark,
    )


# Events: one row per entry of the "events" array; overwritten on a HIST run,
# merged on id on an INCR run.
events_table = f"{BRONZE_SCHEMA}.events"
events_df = bootstrap_static_df.select(F.explode("events").alias("event")).select("event.*")
detect_schema_drift(new_df=events_df, table_name=events_table, spark=spark)

if PROTOCOL == "HIST":
    write_to_table(df=events_df, table_name=events_table)
elif PROTOCOL == "INCR":
    merge_to_table(
        df=events_df,
        table_name=events_table,
        merge_condition="target.id = source.id",
        spark=spark,
    )

# Fixtures

Fixtures contains all data about fixtures for the season.

Fixtures need to be loaded incrementally because they change often, for example due to scheduling clashes or TV broadcast changes.

In [0]:
# Load fixtures.json from the latest raw JSON folder.
fixtures_df = read_latest_raw_json(
    base_path=BASE_RAW_JSON_PATH,
    filename="fixtures.json",
)

Loading folder:  dbfs:/Volumes/workspace/fpl_raw/raw_json/2025_26_gw_07/


In [0]:
# Print the schema Spark inferred for the fixtures JSON.
fixtures_df.printSchema()



In [0]:
# Fixtures: check for column drift, then overwrite on a HIST run or merge on
# id on an INCR run.
fixtures_table = f"{BRONZE_SCHEMA}.fixtures"
detect_schema_drift(new_df=fixtures_df, table_name=fixtures_table, spark=spark)

if PROTOCOL == "HIST":
    write_to_table(df=fixtures_df, table_name=fixtures_table)
elif PROTOCOL == "INCR":
    merge_to_table(
        df=fixtures_df,
        table_name=fixtures_table,
        merge_condition="target.id = source.id",
        spark=spark,
    )

Schema drift detected in fpl_bronze_test.fixtures
Removed fields: {'last_updated'}


# Next Steps:

modularise functions to .py

unit tests on modularised functions

ci/cd to test 

ci/cd to prod