In [0]:
# Load the raw state-level time series CSV from the Unity Catalog volume.
# The first line of the file is treated as the header and Spark infers the
# column types from the data.
csv_path = "dbfs:/Volumes/zillow/zillow_medallion/raw/State_time_series.csv"
csv_reader = spark.read.options(header="true", inferSchema="true")
df = csv_reader.csv(csv_path)

# Render the DataFrame with the notebook's rich table viewer.
display(df)


In [0]:
# Row count of the source CSV as parsed by Spark (count() is an action, so
# this line actually executes the read).
entry_count = df.count()
label = "Number of entries in df:"
print(label, entry_count)

In [0]:
from pyspark.sql import functions as F

# Convert State_time_series.csv into a flat, line-oriented XML file:
# one "<row><col>value</col>...</row>" fragment per CSV row, written to OUT_FILE.
# The file intentionally has no XML declaration or root element; it is a
# sequence of <row> fragments, one per line, which later cells read back with
# spark.read.text and count line by line.
RAW = "/Volumes/zillow/zillow_medallion/raw"
OUT_FILE = "/Volumes/zillow/zillow_medallion/raw-converted_format/xml/State_time_series.xml"
OUT_DIR = "/Volumes/zillow/zillow_medallion/raw-converted_format/xml"
TMP_DIR = "/Volumes/zillow/zillow_medallion/raw-converted_format/xml/_tmp_state_xml"

src = f"{RAW}/State_time_series.csv"


def _xml_text(column_name):
    """Return the column as a string Column that is safe to embed as XML text.

    NULL becomes the empty string. The characters '&', '<' and '>' are replaced
    by their XML entities, and CR/LF are replaced by numeric character
    references so that every <row> stays on a single physical line.
    """
    text = F.coalesce(F.col(column_name).cast("string"), F.lit(""))
    # '&' must be handled first; otherwise the '&' of the entities inserted by
    # the later replacements would be escaped a second time.
    replacements = (
        ("&", "&amp;"),
        ("<", "&lt;"),
        (">", "&gt;"),
        ("\r", "&#13;"),
        ("\n", "&#10;"),
    )
    for raw_char, entity in replacements:
        text = F.regexp_replace(text, raw_char, entity)
    return text


# Clean output. Removal of leftovers is best-effort: a failure is reported
# instead of being silently swallowed, and the write below uses
# mode("overwrite") anyway.
dbutils.fs.mkdirs("dbfs:" + OUT_DIR)
for stale_path in (OUT_FILE, TMP_DIR):
    try:
        dbutils.fs.rm("dbfs:" + stale_path, True)
    except Exception as exc:
        print(f"Warning: could not remove {stale_path}: {exc}")

df = (spark.read.option("header", "true").option("inferSchema", "true").csv(src))

# Row-wise flat XML as text lines: <row><col>..</col>...</row>
# NOTE(review): column names are used verbatim as element names; this assumes
# every CSV header is a valid XML name (no spaces, no leading digit) - confirm.
row_body = F.concat_ws("", *[
    F.concat(F.lit(f"<{c}>"), _xml_text(c), F.lit(f"</{c}>"))
    for c in df.columns
])
xml_df = df.select(
    F.concat(F.lit("<row>"), row_body, F.lit("</row>")).alias("value")
)

# write.text writes a folder; we rename the single part file to .xml
xml_df.coalesce(1).write.mode("overwrite").text(TMP_DIR)

files = dbutils.fs.ls("dbfs:" + TMP_DIR)
part_files = [
    f.path for f in files
    if f.name.startswith("part-") and f.name.endswith(".txt")
]
if not part_files:
    # Fail with an actionable message instead of an IndexError on [0].
    raise FileNotFoundError(
        f"No part-*.txt file was written to {TMP_DIR}; cannot create {OUT_FILE}"
    )
part_file = part_files[0]

dbutils.fs.mv(part_file, "dbfs:" + OUT_FILE, True)
dbutils.fs.rm("dbfs:" + TMP_DIR, True)

print("Wrote XML:", OUT_FILE)
display(dbutils.fs.ls("dbfs:" + OUT_DIR))


In [0]:
# Read the generated XML file back as plain text: spark.read.text yields one
# DataFrame row (column "value") per line of the file.
xml_path = "dbfs:/Volumes/zillow/zillow_medallion/raw-converted_format/xml/State_time_series.xml"
df_xml = spark.read.text(xml_path)
display(df_xml)

In [0]:
# Number of text lines in the XML file. Note that this rebinds entry_count,
# replacing any value stored under that name earlier in the notebook.
entry_count = df_xml.count()
label = "Number of entries in df_xml:"
print(label, entry_count)