// Notebook cell In [2]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{broadcast, split, lit}

// Load the two raw CSV exports. Both files have a header row, and column
// types are inferred by Spark instead of being declared up front.
// NOTE: despite the "Bucketed" suffix, these DataFrames are the plain CSV
// data; the bucketed Iceberg tables are created from them further down.
val csvReadOptions = Map("header" -> "true", "inferSchema" -> "true")

val matchesBucketed =
  spark.read.options(csvReadOptions).csv("/home/iceberg/data/matches.csv")

val matchDetailsBucketed =
  spark.read.options(csvReadOptions).csv("/home/iceberg/data/match_details.csv")

// Recreate the Iceberg table for matches from scratch: dropping it first
// means every run of this script starts from an empty table.
spark.sql("DROP TABLE IF EXISTS bootcamp.matches_bucketed")

// Partition layout:
//   * days(completion_date) - one partition per calendar day. Partitioning on
//     the raw TIMESTAMP value (identity transform) would create a separate
//     partition for every distinct instant, multiplied by the 16 buckets,
//     which leads to a very large number of tiny partitions and files.
//     Equality / range filters on completion_date still prune by day.
//   * bucket(16, match_id)  - 16 hash buckets on the join key, the same
//     bucket transform that bootcamp.match_details_bucketed uses.
val bucketedDDL = """
CREATE TABLE IF NOT EXISTS bootcamp.matches_bucketed (
  match_id STRING,
  is_team_game BOOLEAN,
  playlist_id STRING,
  completion_date TIMESTAMP
)
USING iceberg
PARTITIONED BY (days(completion_date), bucket(16, match_id))
"""
spark.sql(bucketedDDL)

// Load the table: keep only the columns declared in the DDL above and append
// them from the CSV-backed DataFrame.
matchesBucketed
  .select("match_id", "is_team_game", "playlist_id", "completion_date")
  .writeTo("bootcamp.matches_bucketed")
  .append()

// Rebuild the Iceberg table holding per-player match details. The table is
// dropped first so each run starts from an empty table.
spark.sql("DROP TABLE IF EXISTS bootcamp.match_details_bucketed")

// DDL for the details table: partitioned only by a 16-way bucket transform
// on match_id (the column used as the join key later in this script).
val bucketedDetailsDDL = """
CREATE TABLE IF NOT EXISTS bootcamp.match_details_bucketed (
  match_id STRING,
  player_gamertag STRING,
  player_total_kills INTEGER,
  player_total_deaths INTEGER
)
USING iceberg
PARTITIONED BY (bucket(16, match_id))
"""
spark.sql(bucketedDetailsDDL)

// Project the CSV data down to the four declared columns, then append the
// projection to the freshly created table.
val matchDetailRows = matchDetailsBucketed.select(
  "match_id",
  "player_gamertag",
  "player_total_kills",
  "player_total_deaths"
)
matchDetailRows.writeTo("bootcamp.match_details_bucketed").append()

// Turn off size-based automatic broadcast joins (-1 disables the threshold)
// so the plans printed later are not silently turned into broadcast joins.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1")

// Expose the raw CSV DataFrames to SQL as temp views. They are only needed
// for the comparison join against the un-bucketed data further down.
Seq(
  "matches" -> matchesBucketed,
  "match_details" -> matchDetailsBucketed
).foreach { case (viewName, csvFrame) => csvFrame.createOrReplaceTempView(viewName) }

// Plan 1: join of the two Iceberg tables on match_id. Note the aliases:
// `mdb` is match_details_bucketed and `md` is matches_bucketed; the WHERE
// clause restricts the matches side to completion_date = 2016-01-01.
val icebergJoin = spark.sql("""
  SELECT * 
  FROM bootcamp.match_details_bucketed mdb 
  JOIN bootcamp.matches_bucketed md 
    ON mdb.match_id = md.match_id
  WHERE md.completion_date = DATE('2016-01-01')
""")
icebergJoin.explain()

// Plan 2: the same join key, but over the temp views backed by the raw CSV
// DataFrames (and without the completion_date filter), for comparison.
val csvViewJoin = spark.sql("""
  SELECT * 
  FROM match_details mdb 
  JOIN matches md 
    ON mdb.match_id = md.match_id
""")
csvViewJoin.explain()

// Optional: force a broadcast join with an explicit hint. An explicit
// broadcast() hint is honoured even though autoBroadcastJoinThreshold is -1;
// that setting only disables the *automatic*, size-based choice.
// NOTE(review): the hint is placed on the match_details side; if that is the
// larger of the two inputs, broadcasting `matches` instead may be cheaper.
val broadcastJoinDF = {
  val matchesSide = matchesBucketed.as("m")
  val detailsSide = broadcast(matchDetailsBucketed).as("md")

  // "ds" is the text before the first space of completion_date.
  val completionDay = split($"m.completion_date", " ").getItem(0).as("ds")

  matchesSide
    .join(detailsSide, $"m.match_id" === $"md.match_id")
    .select($"md.*", completionDay)
}

// Print the physical plan chosen for the hinted join.
broadcastJoinDF.explain()


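// Cell output: SyntaxError: invalid syntax (3165037098.py, line 2)
// The ".py" file name shows the cell was executed by a Python kernel: line 1
// happens to be valid Python, but the brace import on line 2 is not. Run this
// cell with a Scala/Spark kernel instead.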