In [0]:
# Load the raw Zip time-series CSV: the first line is the header and column
# types are inferred by Spark.
df_original = (
    spark.read
         .option("header", "true")
         .option("inferSchema", "true")
         .csv("dbfs:/Volumes/zillow/zillow_medallion/raw/Zip_time_series.csv")
)

# Show the frame that was just loaded. The previous `display(df)` referenced a
# name that is only assigned in a later cell, so it failed on a fresh kernel.
display(df_original)


In [0]:
# Row count of the raw frame; `.count()` is a Spark action, so this runs a job.
entry_count = df_original.count()
print(f"Number of entries in df: {entry_count}")

In [0]:
from pyspark.sql import functions as F, Window

# Source volume and the directory that receives the four split files.
RAW = "/Volumes/zillow/zillow_medallion/raw"
OUT_DIR = "/Volumes/zillow/zillow_medallion/raw-converted_format/incremental/zip_time_series"

zip_src = f"{RAW}/Zip_time_series.csv"

# Start from an empty output directory: drop it recursively, then recreate it.
dbutils.fs.rm("dbfs:" + OUT_DIR, True)
dbutils.fs.mkdirs("dbfs:" + OUT_DIR)

df = spark.read.options(header="true", inferSchema="true").csv(zip_src)

# Tag every row with `_part` in 1..4, cycling 1, 2, 3, 4, 1, ... in the order
# given by a row number taken over `monotonically_increasing_id()`.
# The helper columns `_mid` and `_rn` are dropped again; only `_part` remains.
# NOTE(review): the window has no partitionBy, so the row numbering is global
# over the whole frame -- confirm this is acceptable for the input size.
w = Window.orderBy(F.col("_mid"))
df2 = (
    df.withColumn("_mid", F.monotonically_increasing_id())
      .withColumn("_rn", F.row_number().over(w))
      .withColumn("_part", ((F.col("_rn") - 1) % 4) + 1)
      .drop("_mid", "_rn")
)

def _write_single_csv(part_num: int, final_name: str):
    """Write one slice of ``df2`` into ``OUT_DIR`` as a single named CSV file.

    The rows of ``df2`` whose ``_part`` equals ``part_num`` are written
    (coalesced to one partition, with a header) into a temporary directory
    under ``OUT_DIR``. The ``part-*.csv`` file found there is then moved to
    ``OUT_DIR/final_name``. The temporary directory is removed afterwards,
    including when the write or the move fails.

    Args:
        part_num: Value of the ``_part`` column selecting the rows to write.
        final_name: File name of the resulting CSV inside ``OUT_DIR``.

    Raises:
        FileNotFoundError: If no ``part-*.csv`` file is present in the
            temporary directory after the write.
    """
    tmp_dir = f"{OUT_DIR}/_tmp_part_{part_num}"
    final_path = f"{OUT_DIR}/{final_name}"

    # Best-effort removal of a previous output with the same name. Failure is
    # deliberately ignored, but only for ordinary errors: KeyboardInterrupt
    # and SystemExit must still propagate.
    try:
        dbutils.fs.rm("dbfs:" + final_path, True)
    except Exception:
        pass

    try:
        (df2.filter(F.col("_part") == part_num)
            .drop("_part")
            .coalesce(1)
            .write.mode("overwrite")
            .option("header", "true")
            .csv(tmp_dir))

        # Locate the part file Spark produced and move it to its final name.
        files = dbutils.fs.ls("dbfs:" + tmp_dir)
        part_file = next(
            (f.path for f in files
             if f.name.startswith("part-") and f.name.endswith(".csv")),
            None,
        )
        if part_file is None:
            raise FileNotFoundError(
                f"No part-*.csv file found in {tmp_dir} for part {part_num}"
            )
        dbutils.fs.mv(part_file, "dbfs:" + final_path, True)
    finally:
        # Always remove the temp folder so a failed run leaves no
        # `_tmp_part_*` directory behind in OUT_DIR.
        dbutils.fs.rm("dbfs:" + tmp_dir, True)

    print("Wrote:", final_path)

# Write parts 1..4, each to its own file, in ascending part order.
for _part_no, _file_name in enumerate(
    ("zip_part_1.csv", "zip_part_2.csv", "zip_part_3.csv", "zip_part_4.csv"),
    start=1,
):
    _write_single_csv(_part_no, _file_name)

# List the output directory to show what was produced.
display(dbutils.fs.ls("dbfs:" + OUT_DIR))


In [0]:
# Read the first split file back (header row, inferred types) and report its
# row count.
df = (
    spark.read
         .options(header="true", inferSchema="true")
         .csv("dbfs:/Volumes/zillow/zillow_medallion/raw-converted_format/incremental/zip_time_series/zip_part_1.csv")
)
entry_count = df.count()
print("Number of entries in df:", entry_count)
