# Ingest JSON from Raw to Bronze

Ingest bootstrap-static and fixtures from manually uploaded JSON files

Write to bronze-layer Delta tables in the `fpl_bronze_{env}` schema.

In [0]:
import pyspark.sql.functions as F

In [0]:
# Environment suffix; every bronze table in this notebook is written to the
# schema derived from it.
env = "test"
bronze_schema = "fpl_bronze_" + env

# Functions to Modularise

In [0]:
def write_to_table(
        df,
        table_name,
        mode="overwrite",
        merge_schema=True,
        partition_by=None,
        path=None,
        save_as_table=True
    ):
    """
    Generalised Delta write helper for bronze layer.

    Adds a `last_updated` column holding the current timestamp to `df`, then
    writes the result in Delta format.

    Parameters:
    - df: Spark DataFrame to write.
    - table_name: Name of the Delta table (used if save_as_table=True).
    - mode: Write mode ('overwrite', 'append', 'ignore', 'error', etc.).
    - merge_schema: If True, set the `mergeSchema` write option. If False and
      mode is 'overwrite', set the `overwriteSchema` option instead.
    - partition_by: Optional column name, or list/tuple of column names, to
      partition by.
    - path: Optional path to save the Delta table (used if save_as_table=False).
    - save_as_table: If True, writes with saveAsTable(table_name); else saves
      to `path`.

    Raises:
    - ValueError: if save_as_table is False and no path is provided. This is
      checked before any DataFrame operation is attempted.
    """

    # Fail fast: reject an unusable destination before building the write plan.
    if not save_as_table and not path:
        raise ValueError("Either save_as_table must be True or a path must be provided.")

    # A bare string would otherwise be unpacked character by character by
    # `*partition_by` below.
    if isinstance(partition_by, str):
        partition_by = [partition_by]

    df_with_ts = df.withColumn("last_updated", F.current_timestamp())

    writer = df_with_ts.write.format("delta").mode(mode)

    if merge_schema:
        writer = writer.option("mergeSchema", "true")
    elif mode == "overwrite":
        writer = writer.option("overwriteSchema", "true")

    if partition_by:
        writer = writer.partitionBy(*partition_by)

    if save_as_table:
        writer.saveAsTable(table_name)
    else:
        # `path` is guaranteed to be set by the validation above.
        writer.save(path)

In [0]:
def detect_schema_drift(new_df, table_name, spark):
    """
    Compare the top-level column names of `new_df` with those of an existing table.

    Only column names are compared; changes to column types or to fields
    nested inside a column are not detected. Differences are printed.

    Parameters:
    - new_df: Spark DataFrame about to be written.
    - table_name: Name of the existing table to compare against.
    - spark: SparkSession used to look up the table.

    Returns:
    - True if columns were added or removed relative to the existing table.
    - False if the column names match, or if the existing table could not be
      read (treated as a first write).
    """
    # Read the incoming schema outside the guarded section so that a problem
    # with `new_df` is not misreported as "no existing table".
    new_fields = {field.name for field in new_df.schema.fields}

    try:
        # The schema access stays inside the try: table resolution may be
        # deferred until the schema is requested rather than at spark.table().
        existing_fields = {
            field.name for field in spark.table(table_name).schema.fields
        }
    except Exception as exc:
        print(f"ℹNo existing table found for {table_name}. Assuming first write.")
        # Surface the underlying error so failures other than a missing table
        # (e.g. permissions) remain visible in the notebook output.
        print(f"Table lookup error: {type(exc).__name__}: {exc}")
        return False

    added = new_fields - existing_fields
    removed = existing_fields - new_fields

    if not (added or removed):
        return False

    print(f"Schema drift detected in {table_name}")
    if added:
        print(f"Added fields: {added}")
    if removed:
        print(f"Removed fields: {removed}")
    return True

# Bootstrap-static

bootstrap-static is the core dataset. Its top-level fields are:

- events: Basic information of every Gameweek such as average score, highest score, top scoring player, most captained, etc.
- game_settings: The game settings and rules. 
- phases: Phases of FPL season. 
- teams: Basic information of current Premier League clubs.
- total_players: Total FPL players.
- elements: Information of all Premier League players including points, status, value, match stats (goals, assists, etc.), ICT index, etc.
- element_types: Basic information about players' positions (GK, DEF, MID, FWD).
- chips: All chips available in FPL.
- game_config: scoring and game setup rules.


In [0]:
# Load the raw bootstrap-static file. The multiline option is for JSON
# documents that span several lines rather than one record per line.
bootstrap_static_df = (
    spark.read
    .option("multiline", "true")
    .json("/Volumes/workspace/fpl_raw/raw_json/bootstrap_static.json")
)

In [0]:
# Print the inferred schema to see which top-level fields can be extracted below.
bootstrap_static_df.printSchema()

root
 |-- chips: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- chip_type: string (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- number: long (nullable = true)
 |    |    |-- overrides: struct (nullable = true)
 |    |    |    |-- element_types: array (nullable = true)
 |    |    |    |    |-- element: string (containsNull = true)
 |    |    |    |-- pick_multiplier: string (nullable = true)
 |    |    |    |-- rules: struct (nullable = true)
 |    |    |    |    |-- squad_squadsize: long (nullable = true)
 |    |    |-- start_event: long (nullable = true)
 |    |    |-- stop_event: long (nullable = true)
 |-- element_stats: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- label: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- element_types: array (nullable = true)
 |    |-- element: struct (containsNull = tr

In [0]:
def _explode_records(source_df, array_column):
    """Return one row per element of `array_column`, expanded with `.*` into columns."""
    exploded = source_df.select(F.explode(array_column).alias("record"))
    return exploded.select("record.*")


def _check_and_write(df, short_name):
    """Report schema drift for `<bronze_schema>.<short_name>`, then write `df` to it.

    The write uses write_to_table's defaults (overwrite, merge schema,
    no partitioning, saved as a table).
    """
    target = f"{bronze_schema}.{short_name}"
    detect_schema_drift(new_df=df, table_name=target, spark=spark)
    write_to_table(df=df, table_name=target)


# Chips
chips_df = _explode_records(bootstrap_static_df, "chips")
_check_and_write(chips_df, "chips")

# Element Types
element_types_df = _explode_records(bootstrap_static_df, "element_types")
_check_and_write(element_types_df, "element_types")

# Element Stats
element_stats_df = _explode_records(bootstrap_static_df, "element_stats")
_check_and_write(element_stats_df, "element_stats")

# Elements
elements_df = _explode_records(bootstrap_static_df, "elements")
_check_and_write(elements_df, "elements")

# Events
events_df = _explode_records(bootstrap_static_df, "events")
_check_and_write(events_df, "events")

# Game Config - Scoring
# NOTE(review): unlike the exploded tables, this selects the nested field
# itself with no `.*` expansion — confirm a single `scoring` column is intended.
scoring_df = bootstrap_static_df.select("game_config.scoring")
_check_and_write(scoring_df, "game_config_scoring")

# Game Config - Rules
# NOTE(review): same shape as scoring above — confirm a single `rules` column
# is intended.
rules_df = bootstrap_static_df.select("game_config.rules")
_check_and_write(rules_df, "game_config_rules")

# Phases
phases_df = _explode_records(bootstrap_static_df, "phases")
_check_and_write(phases_df, "phases")

# Teams
teams_df = _explode_records(bootstrap_static_df, "teams")
_check_and_write(teams_df, "teams")

# Fixtures

Fixtures contains all data about fixtures for the season.

Fixtures need to be loaded incrementally because they change often due to scheduling clashes and TV broadcast changes.

In [0]:
# Load the raw fixtures file. The multiline option is for JSON documents that
# span several lines rather than one record per line.
fixtures_df = (
    spark.read
    .option("multiline", "true")
    .json("/Volumes/workspace/fpl_raw/raw_json/fixtures.json")
)

In [0]:
# Print the inferred schema of the fixtures data for inspection.
fixtures_df.printSchema()

root
 |-- code: long (nullable = true)
 |-- event: long (nullable = true)
 |-- finished: boolean (nullable = true)
 |-- finished_provisional: boolean (nullable = true)
 |-- id: long (nullable = true)
 |-- kickoff_time: string (nullable = true)
 |-- minutes: long (nullable = true)
 |-- provisional_start_time: boolean (nullable = true)
 |-- pulse_id: long (nullable = true)
 |-- started: boolean (nullable = true)
 |-- stats: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- a: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- element: long (nullable = true)
 |    |    |    |    |-- value: long (nullable = true)
 |    |    |-- h: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- element: long (nullable = true)
 |    |    |    |    |-- value: long (nullable = true)
 |    |    |-- identifier: string (nullable = true)
 |-- team_a: long (nullabl

In [0]:
# Build the target name once so the drift check and the write cannot diverge.
fixtures_table_name = f"{bronze_schema}.fixtures"

# Report any column additions/removals, then write with the helper's defaults.
detect_schema_drift(fixtures_df, fixtures_table_name, spark)
write_to_table(fixtures_df, fixtures_table_name)

# Next Steps:

Incremental load

In [0]:
# Preview the elements table from the configured bronze schema, so the preview
# follows `env` instead of always reading the test schema.
display(spark.table(f"{bronze_schema}.elements"))

assists,birth_date,bonus,bps,can_select,can_transact,chance_of_playing_next_round,chance_of_playing_this_round,clean_sheets,clean_sheets_per_90,clearances_blocks_interceptions,code,corners_and_indirect_freekicks_order,corners_and_indirect_freekicks_text,cost_change_event,cost_change_event_fall,cost_change_start,cost_change_start_fall,creativity,creativity_rank,creativity_rank_type,defensive_contribution,defensive_contribution_per_90,direct_freekicks_order,direct_freekicks_text,dreamteam_count,element_type,ep_next,ep_this,event_points,expected_assists,expected_assists_per_90,expected_goal_involvements,expected_goal_involvements_per_90,expected_goals,expected_goals_conceded,expected_goals_conceded_per_90,expected_goals_per_90,first_name,form,form_rank,form_rank_type,goals_conceded,goals_conceded_per_90,goals_scored,has_temporary_code,ict_index,ict_index_rank,ict_index_rank_type,id,in_dreamteam,influence,influence_rank,influence_rank_type,minutes,news,news_added,now_cost,now_cost_rank,now_cost_rank_type,opta_code,own_goals,penalties_missed,penalties_order,penalties_saved,penalties_text,photo,points_per_game,points_per_game_rank,points_per_game_rank_type,recoveries,red_cards,region,removed,saves,saves_per_90,second_name,selected_by_percent,selected_rank,selected_rank_type,special,squad_number,starts,starts_per_90,status,tackles,team,team_code,team_join_date,threat,threat_rank,threat_rank_type,total_points,transfers_in,transfers_in_event,transfers_out,transfers_out_event,value_form,value_season,web_name,yellow_cards,last_updated
0,1995-09-15,3,146,True,True,,,4,0.57,4,154561,,,1,-1,2,-2,10.0,321,8,0,0.0,,,1,1,4.5,4.5,6,0.04,0.01,0.04,0.01,0.0,4.4,0.63,0.0,David,4.0,66,3,3,0.43,0,False,14.5,194,11,1,False,135.8,67,11,630,,,57,121,1,p154561,0,0,,0,,154561.jpg,4.9,42,5,61,0,200.0,False,15,2.14,Raya Martín,27.9,11,2,False,,7,1.0,a,0,1,3,2024-07-04,0.0,726,85,34,1580890,124315,690533,41375,0.7,6.0,Raya,1,2025-10-15T17:43:28.589Z
0,1994-10-03,0,0,True,True,,,0,0.0,0,109745,,,0,0,-2,2,0.0,517,61,0,0.0,,,0,1,0.5,0.5,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Kepa,0.0,503,72,0,0.0,0,False,0.0,534,75,2,False,0.0,532,75,0,,,43,593,35,p109745,0,0,,0,,109745.jpg,0.0,536,75,0,0,200.0,False,0,0.0,Arrizabalaga Revuelta,0.5,234,32,False,,0,0.0,a,0,1,3,2025-07-01,0.0,485,50,0,6695,300,50403,2445,0.0,0.0,Arrizabalaga,0,2025-10-15T17:43:28.589Z
0,2002-04-13,0,0,False,True,0.0,0.0,0,0.0,0,463748,,,0,0,0,0,0.0,476,43,0,0.0,,,0,1,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Karl,0.0,462,55,0,0.0,0,False,0.0,494,58,3,False,0.0,492,58,0,Has joined Werder Bremen on loan for the rest of the season.,2025-08-26T13:44:02.357864Z,40,655,66,p463748,0,0,,0,,463748.jpg,0.0,497,58,0,0,67.0,False,0,0.0,Hein,0.3,297,43,False,,0,0.0,u,0,1,3,2020-10-28,0.0,443,32,0,5545,0,33851,1495,0.0,0.0,Hein,0,2025-10-15T17:43:28.589Z
0,2006-03-13,0,0,True,True,,,0,0.0,0,551221,,,0,0,0,0,0.0,500,56,0,0.0,,,0,1,0.5,0.5,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Tommy,0.0,488,68,0,0.0,0,False,0.0,518,71,4,False,0.0,516,71,0,,,40,687,79,p551221,0,0,,0,,551221.jpg,0.0,521,71,0,0,241.0,False,0,0.0,Setford,0.2,349,53,False,,0,0.0,a,0,1,3,2024-07-24,0.0,468,45,0,13936,579,12022,1129,0.0,0.0,Setford,0,2025-10-15T17:43:28.589Z
0,1997-12-19,5,169,True,True,100.0,75.0,4,0.57,56,226597,,,1,-1,3,-3,13.1,298,101,66,9.43,,,1,2,8.7,6.5,9,0.09,0.01,0.57,0.08,0.48,4.4,0.63,0.07,Gabriel,8.2,3,1,3,0.43,1,False,27.0,74,18,5,True,200.6,11,4,630,,2025-10-03T14:00:08.473762Z,63,71,1,p226597,0,0,,0,,226597.jpg,6.7,6,2,13,0,30.0,False,0,0.0,dos Santos Magalhães,29.8,10,3,False,,7,1.0,a,10,1,3,2020-09-01,56.0,111,18,47,1998484,248237,746646,49814,1.3,7.5,Gabriel,0,2025-10-15T17:43:28.589Z
0,2001-03-24,1,109,True,True,100.0,100.0,3,0.66,32,462424,,,0,0,0,0,19.2,263,87,40,8.8,,,0,2,2.7,2.7,6,0.12,0.03,0.25,0.06,0.13,3.27,0.72,0.03,William,2.2,172,62,1,0.22,0,False,13.6,208,74,6,False,108.4,98,47,409,,2025-09-01T10:30:07.602868Z,60,86,4,p462424,0,0,,0,,462424.jpg,4.2,70,32,20,0,73.0,False,0,0.0,Saliba,12.7,29,11,False,,5,1.1,a,8,1,3,2019-07-25,8.0,302,102,25,640685,31850,1041504,29406,0.4,4.2,Saliba,0,2025-10-15T17:43:28.589Z
2,2002-05-19,3,129,True,True,,,4,0.69,18,466075,,,0,0,2,-2,66.4,104,22,29,4.98,,,2,2,4.0,4.0,6,0.15,0.03,1.99,0.35,1.84,3.44,0.59,0.32,Riccardo,3.5,96,40,3,0.52,1,False,28.9,64,14,7,False,105.4,106,51,524,,,57,128,11,p466075,0,0,,0,,466075.jpg,6.0,15,8,24,0,106.0,False,0,0.0,Calafiori,14.8,23,9,False,,7,1.2,a,11,1,3,2024-07-29,118.0,39,3,42,1732597,45705,473934,42084,0.6,7.4,Calafiori,3,2025-10-15T17:43:28.589Z
2,2001-06-17,7,178,True,True,,,3,0.52,20,445122,,,1,-1,4,-4,123.1,40,7,45,7.77,,,2,2,6.0,6.0,11,0.41,0.07,2.3,0.4,1.89,3.3,0.57,0.33,Jurriën,5.5,19,8,3,0.52,2,False,49.8,8,1,8,True,191.0,16,5,521,,,59,103,5,p445122,0,0,,0,,445122.jpg,6.9,5,1,18,0,152.0,False,0,0.0,Timber,18.0,19,8,False,,6,1.04,a,25,1,3,2023-07-14,183.0,9,1,48,2002178,272788,419866,20281,0.9,8.1,J.Timber,2,2025-10-15T17:43:28.589Z
0,2000-02-15,0,0,False,True,0.0,0.0,0,0.0,0,440854,,,0,0,-1,1,0.0,717,244,0,0.0,,,0,2,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Jakub,0.0,709,243,0,0.0,0,False,0.0,719,244,9,False,0.0,718,244,0,has joined Porto on loan for the rest of the season.,2025-09-02T13:37:25.929594Z,54,184,23,p440854,0,0,,0,,440854.jpg,0.0,719,244,0,0,172.0,False,0,0.0,Kiwior,0.1,555,202,False,,0,0.0,u,0,1,3,2023-01-23,0.0,713,242,0,1119,0,10020,221,0.0,0.0,Kiwior,0,2025-10-15T17:43:28.589Z
0,2006-09-26,0,13,True,True,,,0,0.0,3,499169,,,-1,1,-3,3,6.9,331,112,4,4.34,,,0,2,1.3,1.3,1,0.0,0.0,0.0,0.0,0.0,0.93,1.01,0.0,Myles,0.8,308,107,0,0.0,0,False,1.7,389,138,10,False,7.6,380,141,83,,,52,230,32,p499169,0,0,,0,,499169.jpg,0.8,413,138,6,0,241.0,False,0,0.0,Lewis-Skelly,1.8,141,54,False,,0,0.0,a,1,1,3,2022-12-04,4.0,340,122,4,41888,2144,316545,10872,0.2,0.8,Lewis-Skelly,1,2025-10-15T17:43:28.589Z
