In [0]:
from pyspark.sql import SparkSession


# Obtain a Spark session registered under this application name
# (getOrCreate returns an already-running session when one exists).
spark = (
    SparkSession.builder
    .appName("Bucketing vs Partitioning")
    .getOrCreate()
)

# Location of the source CSV (dbfs: scheme).
filePath = "dbfs:/FileStore/tables/Files/names.csv"

# The first CSV row supplies the column names; inferSchema asks Spark to
# derive column types from the data instead of reading everything as string.
df = spark.read.csv(filePath, header=True, inferSchema=True)
df.printSchema()

# Target directory for the partitioned copy of the dataset.
partitioned_path = "/tmp/names_partitioned"

# Write the DataFrame as Parquet, replacing any previous output, split into
# one sub-directory per distinct place_of_birth value.
# NOTE(review): place_of_birth looks like a free-text column; if it has many
# distinct values this produces a very large number of small directories —
# confirm the cardinality is acceptable.
df.write.parquet(
    partitioned_path,
    mode="overwrite",
    partitionBy="place_of_birth",
)

# Show what was written at the top level of the output directory.
print("Partitioned table files:")
partition_listing = dbutils.fs.ls(partitioned_path)
display(partition_listing)

# NOTE: "spark.sql.catalogImplementation" is a *static* SQL config: it is fixed
# when the session is created, and calling spark.conf.set() on it raises
# AnalysisException ("Cannot modify the value of a static config"), which
# would abort the script before the bucketed write. It is therefore not set
# here; saveAsTable() below uses whichever catalog the session was started
# with. If Hive support is required, enable it where the session/cluster is
# configured (e.g. SparkSession.builder.enableHiveSupport()).

# NOTE(review): this legacy flag is reported as removed in Spark 3.0 —
# confirm the target runtime still accepts it.
spark.conf.set("spark.sql.legacy.allowCreatingManagedTableUsingNonemptyLocation", "true")

# Name of the managed table that receives the bucketed copy of the data.
bucketed_table = "names_bucketed"

# Bucketing is only supported through saveAsTable (not a plain path write):
# rows are hashed on place_of_birth into 4 buckets and sorted by name within
# each bucket. mode("overwrite") replaces an existing table of the same name.
(
    df.write.mode("overwrite")
    .format("parquet")
    .bucketBy(4, "place_of_birth")
    .sortBy("name")
    .saveAsTable(bucketed_table)
)

# Look up where the table's files are stored: DESCRIBE FORMATTED returns
# (col_name, data_type, comment) rows, and the storage path is carried in the
# data_type column of the row whose col_name is 'Location'.
location_row = (
    spark.sql(f"DESCRIBE FORMATTED {bucketed_table}")
    .filter("col_name = 'Location'")
    .select("data_type")
    .first()  # None when no row matches, instead of an IndexError on [0]
)
if location_row is None:
    # Fail with an actionable message rather than a bare IndexError.
    raise RuntimeError(
        f"DESCRIBE FORMATTED {bucketed_table} returned no 'Location' row; "
        "cannot determine where the bucketed table files are stored"
    )
bucketed_table_path = location_row[0]

# Show the files backing the bucketed table.
print("Bucketed table files:")
display(dbutils.fs.ls(bucketed_table_path))


root
 |-- imdb_name_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- birth_name: string (nullable = true)
 |-- height: integer (nullable = true)
 |-- bio: string (nullable = true)
 |-- birth_details: string (nullable = true)
 |-- date_of_birth: string (nullable = true)
 |-- place_of_birth: string (nullable = true)
 |-- death_details: string (nullable = true)
 |-- date_of_death: string (nullable = true)
 |-- place_of_death: string (nullable = true)
 |-- reason_of_death: string (nullable = true)
 |-- spouses_string: string (nullable = true)
 |-- spouses: integer (nullable = true)
 |-- divorces: integer (nullable = true)
 |-- spouses_with_children: integer (nullable = true)
 |-- children: integer (nullable = true)

