In [0]:
%python
import importlib.util
if importlib.util.find_spec("nba_api") is None:
    %pip install nba_api

In [0]:
from find_games import get_games_by_date
from etl_pipeline.nba_api_connector.get_game import Game
from pyspark.sql import functions as F

# --- Run configuration -------------------------------------------------------
# catalog / source_schema / volume_name are combined by later cells into the
# /Volumes/<catalog>/<schema>/<volume>/... output paths.
catalog = "nba"
source_schema = "source"
volume_name = "games"
# Date whose games are fetched (string, passed straight to get_games_by_date).
date = "2025-12-01"

# The API key is read from the workspace secret scope rather than hard-coded.
API_KEY = dbutils.secrets.get(scope="nba_secrets", key="balldontlie_api_key")

# Fetch the games for `date`, then load them into Spark with an explicit
# all-string schema so no type inference happens on the API payload.
games_schema = "game_id: string, away_team: string, home_team: string, date: string"
games_raw = get_games_by_date(date, API_KEY)
games = spark.createDataFrame(games_raw, games_schema)

In [0]:
# spark.sql(f"CREATE VOLUME IF NOT EXISTS {catalog}.{source_schema}.{volume_name}")
# volume_path = f"/Volumes/{catalog}/{source_schema}/{volume_name}/games.parquet"
# games.write.mode("overwrite").format("parquet").partitionBy("date").save(volume_path)

In [0]:
from pyspark.sql import functions as F

# Rewrite game_id: cast to int, add a fixed offset, cast back to string and
# prefix with "00".
# NOTE(review): presumably this converts the games API's ids into the id format
# expected by `Game`; the offset's origin is not documented here -- confirm.
GAME_ID_OFFSET = 4052978
shifted_game_id = (F.col("game_id").cast("int") + F.lit(GAME_ID_OFFSET)).cast("string")
games_fixed = games.withColumn("game_id", F.concat(F.lit("00"), shifted_game_id))

# Build a Game object per id on the driver. Any failure while constructing a
# Game is reported and that game is skipped (best effort), so
# game_full_results only holds the games that loaded successfully.
game_full_results = []
for id_row in games_fixed.select("game_id").collect():
    game_id = str(id_row["game_id"])
    try:
        game = Game(game_id)
    except Exception as e:
        print(f"Boxscore not available for game_id {game_id}: {e}")
    else:
        game_full_results.append(game)

In [0]:
# Collect one info record per successfully loaded game and persist them as a
# parquet dataset partitioned by date_day.
game_infos = [game.game_info() for game in game_full_results]
if not game_infos:
    # spark.createDataFrame cannot infer a schema from an empty list; fail with
    # an actionable message instead of an opaque schema-inference error.
    raise ValueError(
        f"No boxscores were loaded for {date}; nothing to write to game_boxscore.parquet"
    )

# date_day is the record's `date` shifted back by one day, formatted yyyy-MM-dd.
# NOTE(review): the one-day shift is presumably a timezone correction -- confirm.
game_results = spark.createDataFrame(game_infos).withColumn(
    "date_day",
    F.date_format(F.date_sub(F.col("date"), 1), "yyyy-MM-dd"),
)

volume_path = f"/Volumes/{catalog}/{source_schema}/{volume_name}/game_boxscore.parquet"
# With the default (static) overwrite mode, mode("overwrite") replaces the whole
# dataset, deleting every date_day partition written by earlier runs. Dynamic
# partition overwrite only replaces the partitions present in this DataFrame,
# so runs for different dates accumulate and re-runs for a date stay idempotent.
(
    game_results.write
    .mode("overwrite")
    .option("partitionOverwriteMode", "dynamic")
    .format("parquet")
    .partitionBy("date_day")
    .save(volume_path)
)

In [0]:
# Gather the officials of every loaded game and persist them to the
# game_officials parquet dataset.
all_officials = []
for game in game_full_results:
    all_officials.extend(game.get_officials())

volume_path = f"/Volumes/{catalog}/{source_schema}/{volume_name}/game_officials.parquet"
if all_officials:
    officials_df = spark.createDataFrame(all_officials)
    # Write officials_df: previously the boxscore DataFrame (game_results) was
    # written to this path, so the officials data was never persisted.
    officials_writer = officials_df.write.mode("overwrite").format("parquet")
    # The columns returned by get_officials() are not visible here, so only
    # partition by date_day when the column actually exists; in that case use
    # dynamic partition overwrite so other days' partitions are preserved.
    # NOTE(review): without a date_day column each run replaces the whole
    # officials dataset -- confirm whether a date column should be added.
    if "date_day" in officials_df.columns:
        officials_writer = (
            officials_writer
            .option("partitionOverwriteMode", "dynamic")
            .partitionBy("date_day")
        )
    officials_writer.save(volume_path)
else:
    # spark.createDataFrame cannot infer a schema from an empty list, so skip
    # the write instead of crashing when no officials were returned.
    print(f"No officials returned for the loaded games; skipping write to {volume_path}")

In [0]:
from pyspark.sql.types import StructType, StructField, StringType

# Long-format team statistics: one row per (team, game, stat). Every column is
# a nullable string, including the stat value and the home flag.
stat_columns = ("team_id", "against_team_id", "Stat_Type", "Stat_Value", "Home", "game_id")
schema = StructType([StructField(column, StringType(), True) for column in stat_columns])

all_stats = []
for game in game_full_results:
    home_stats, away_stats = game.get_team_stats()
    # (team, opponent, home flag, stats mapping) -- home rows first, then away.
    sides = (
        (game.home_team_id, game.away_team_id, 'True', home_stats),
        (game.away_team_id, game.home_team_id, 'False', away_stats),
    )
    for team_id, opponent_id, is_home, team_stats in sides:
        for stat_name, stat_value in team_stats.items():
            all_stats.append(
                [team_id, opponent_id, stat_name, str(stat_value), is_home, game.game_id]
            )

# An explicit schema is supplied, so this also works when all_stats is empty.
game_stats_df = spark.createDataFrame(all_stats, schema=schema)
display(game_stats_df)