In [70]:
from pyspark.sql.functions import col, floor

# Root folder that holds one Parquet dataset per sub-directory
base_path = "abfss://rwedatalakestorage@datalakerwe.dfs.core.windows.net/gold/ml_ready/"

# List all parquet folders.
# The filesystem is resolved from the path itself: FileSystem.get(conf) hands
# back the cluster's *default* filesystem, which is only the right one when
# base_path happens to live on it.
path = spark._jvm.org.apache.hadoop.fs.Path(base_path)
fs = path.getFileSystem(spark._jsc.hadoopConfiguration())
status = fs.listStatus(path)
# Sorted so that the union order -- and with it the column order, which
# unionByName takes from the first DataFrame -- does not depend on the order
# in which the storage layer happens to list the directories.
paths = sorted(str(entry.getPath()) for entry in status if entry.isDirectory())

# Fail with an explicit message instead of an IndexError on dfs[0] below
if not paths:
    raise FileNotFoundError(f"No Parquet folders found under {base_path}")

# Read each folder individually
dfs = []
for p in paths:
    df = spark.read.parquet(p)
    # Cast serious_count to double to fix Parquet INT64/logicalType mismatch
    if "serious_count" in df.columns:
        df = df.withColumn("serious_count", col("serious_count").cast("double"))
    dfs.append(df)

# Union all DataFrames (columns are matched by name, not by position)
df_ml_ready = dfs[0]
for df in dfs[1:]:
    df_ml_ready = df_ml_ready.unionByName(df)

# ------------------------------
# Normalize schema
# ------------------------------
def normalize_schema(df, prefer="bigint"):
    """Harmonise the numeric column types of *df* towards ``prefer``.

    Args:
        df: Spark DataFrame to normalise.
        prefer: ``"bigint"`` casts every ``double`` column that contains no
            fractional value to ``bigint`` (nulls do not count as fractional);
            ``"double"`` casts every ``int`` / ``bigint`` column to
            ``double``. Any other value leaves ``df`` untouched.

    Returns:
        ``df`` itself when no column needs casting, otherwise a new DataFrame
        with the same column names in the same order.
    """
    fields = df.schema.fields
    cast_to = {}  # column name -> target type

    if prefer == "bigint":
        double_cols = [f.name for f in fields if f.dataType.simpleString() == "double"]
        if double_cols:
            # Only this path needs the extra functions.
            from pyspark.sql.functions import count, when

            # Count the fractional values of *all* double columns in a single
            # pass over the data, rather than launching one job per column.
            fractional_counts = df.agg(
                *[
                    count(when(col(name) != floor(col(name)), 1)).alias(f"_frac_{i}")
                    for i, name in enumerate(double_cols)
                ]
            ).first()
            cast_to = {
                name: "bigint"
                for i, name in enumerate(double_cols)
                if fractional_counts[i] == 0
            }
    elif prefer == "double":
        cast_to = {
            f.name: "double"
            for f in fields
            if f.dataType.simpleString() in ("int", "bigint")
        }

    if not cast_to:
        return df

    # One select instead of a withColumn per column keeps the query plan flat.
    return df.select(
        [
            col(f.name).cast(cast_to[f.name]).alias(f.name)
            if f.name in cast_to
            else col(f.name)
            for f in fields
        ]
    )

# Collapse whole-number double columns back to bigint before exporting
df_ml_ready = normalize_schema(df_ml_ready, prefer="bigint")

# ------------------------------
# Export as single CSV
# ------------------------------
output_path = "abfss://rwedatalakestorage@datalakerwe.dfs.core.windows.net/gold/exports/ml_ready_csv/"

# coalesce(1) funnels everything into one partition so Spark writes a single
# part file; an existing export at output_path is replaced.
df_ml_ready.coalesce(1).write.csv(output_path, mode="overwrite", header="true")

print(f"✅ CSV exported to: {output_path}")


StatementMeta(openfda, 4, 10, Finished, Available, Finished)

✅ CSV exported to: abfss://rwedatalakestorage@datalakerwe.dfs.core.windows.net/gold/exports/ml_ready_csv/
