# Ingest JSON from Raw to Bronze

Ingest `bootstrap-static` and `fixtures` from manually uploaded JSON files.

Write the results to bronze-layer Delta tables in the `fpl_bronze` schema.

In [0]:
import pyspark.sql.functions as F
from datetime import datetime, timezone

# Bootstrap-static

`bootstrap-static` is the core dataset. Its top-level keys are:

- events: Basic information about every Gameweek, such as average score, highest score, top-scoring player, most-captained player, etc.
- game_settings: The game settings and rules.
- phases: Phases of the FPL season.
- teams: Basic information about the current Premier League clubs.
- total_players: Total number of FPL players.
- elements: Information about all Premier League players, including points, status, value, match stats (goals, assists, etc.), ICT index, etc.
- element_stats: The label and name of each player statistic.
- element_types: Basic information about player positions (GK, DEF, MID, FWD).
- chips: All chips available in FPL.
- game_config: Scoring and game setup rules.


In [0]:
# Load the manually uploaded bootstrap-static payload. Multiline mode is needed
# because the file is a single JSON document rather than one record per line.
bootstrap_static_df = (
    spark.read
    .option("multiline", "true")
    .json("/Volumes/workspace/fpl_raw/raw_json/bootstrap_static.json")
)

In [0]:
# Print the inferred schema as a tree to inspect the nested arrays/structs
# before they are flattened into bronze tables.
bootstrap_static_df.printSchema()

root
 |-- chips: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- chip_type: string (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- number: long (nullable = true)
 |    |    |-- overrides: struct (nullable = true)
 |    |    |    |-- element_types: array (nullable = true)
 |    |    |    |    |-- element: string (containsNull = true)
 |    |    |    |-- pick_multiplier: string (nullable = true)
 |    |    |    |-- rules: struct (nullable = true)
 |    |    |    |    |-- squad_squadsize: long (nullable = true)
 |    |    |-- start_event: long (nullable = true)
 |    |    |-- stop_event: long (nullable = true)
 |-- element_stats: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- label: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- element_types: array (nullable = true)
 |    |-- element: struct (containsNull = tr

In [0]:
# Stamp every table written from this bootstrap-static load with the same UTC time.
# Defined here so the cell does not rely on a variable left over from an earlier
# kernel session (the notebook must survive Restart & Run All).
ingestion_ts = datetime.now(timezone.utc)


def _with_ingestion_ts(source_df):
    """Return ``source_df`` with an ``ingestion_ts`` column holding this load's timestamp."""
    return source_df.withColumn("ingestion_ts", F.lit(ingestion_ts))


def _explode_section(array_col):
    """Return one row per element of a bootstrap-static array column.

    The struct fields of each element become top-level columns, and an
    ``ingestion_ts`` column is appended.
    """
    exploded = bootstrap_static_df.select(F.explode(array_col).alias("item"))
    return _with_ingestion_ts(exploded.select("item.*"))


def _select_section(nested_col):
    """Return a single nested field of bootstrap-static as one column plus ``ingestion_ts``."""
    return _with_ingestion_ts(bootstrap_static_df.select(nested_col))


def _overwrite_bronze_table(table_df, table_name):
    """Replace the Delta table ``table_name`` (data and schema) with ``table_df``."""
    (
        table_df.write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(table_name)
    )


# NOTE: the source frame is `bootstrap_static_df` (read above); the previous
# version referenced an undefined `df`.

# Chips
chips_df = _explode_section("chips")
_overwrite_bronze_table(chips_df, "fpl_bronze.chips")

# Element Types
element_types_df = _explode_section("element_types")
_overwrite_bronze_table(element_types_df, "fpl_bronze.element_types")

# Element Stats
element_stats_df = _explode_section("element_stats")
_overwrite_bronze_table(element_stats_df, "fpl_bronze.element_stats")

# Elements
elements_df = _explode_section("elements")
_overwrite_bronze_table(elements_df, "fpl_bronze.elements")

# Events
events_df = _explode_section("events")
_overwrite_bronze_table(events_df, "fpl_bronze.events")

# Game Config - Scoring (kept as a single struct column named `scoring`)
scoring_df = _select_section("game_config.scoring")
_overwrite_bronze_table(scoring_df, "fpl_bronze.game_config_scoring")

# Game Config - Rules (kept as a single struct column named `rules`)
rules_df = _select_section("game_config.rules")
_overwrite_bronze_table(rules_df, "fpl_bronze.game_config_rules")

# Phases
phases_df = _explode_section("phases")
_overwrite_bronze_table(phases_df, "fpl_bronze.phases")

# Teams
teams_df = _explode_section("teams")
_overwrite_bronze_table(teams_df, "fpl_bronze.teams")

# Fixtures

`fixtures` contains all fixture data for the season.

It should eventually be loaded incrementally, because fixtures are often rescheduled (clashes, TV scheduling changes). For now, the bronze table is fully overwritten on each run.

In [0]:
# Load the manually uploaded fixtures payload. Multiline mode is needed because
# the file is a single JSON document rather than one record per line.
fixtures_df = (
    spark.read
    .option("multiline", "true")
    .json("/Volumes/workspace/fpl_raw/raw_json/fixtures.json")
)

In [0]:
# Print the inferred schema of the fixtures JSON as a tree.
fixtures_df.printSchema()

root
 |-- code: long (nullable = true)
 |-- event: long (nullable = true)
 |-- finished: boolean (nullable = true)
 |-- finished_provisional: boolean (nullable = true)
 |-- id: long (nullable = true)
 |-- kickoff_time: string (nullable = true)
 |-- minutes: long (nullable = true)
 |-- provisional_start_time: boolean (nullable = true)
 |-- pulse_id: long (nullable = true)
 |-- started: boolean (nullable = true)
 |-- stats: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- a: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- element: long (nullable = true)
 |    |    |    |    |-- value: long (nullable = true)
 |    |    |-- h: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- element: long (nullable = true)
 |    |    |    |    |-- value: long (nullable = true)
 |    |    |-- identifier: string (nullable = true)
 |-- team_a: long (nullabl

In [0]:
# UTC timestamp for this fixtures load. Set here so the cell does not depend on
# a variable left over from an earlier kernel session.
ingestion_ts = datetime.now(timezone.utc)

# Build the bronze frame from the fixtures data read above. The previous version
# used `df`, which is not the fixtures frame (and is undefined on a fresh kernel).
# Re-running this cell is safe: withColumn replaces an existing `ingestion_ts`.
fixtures_df = fixtures_df.withColumn("ingestion_ts", F.lit(ingestion_ts))

# Full overwrite of data and schema; incremental loading is listed as a next step.
(
    fixtures_df.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("fpl_bronze.fixtures")
)


# Next Steps:

- Handle schema drift.

- Implement incremental loading.