In [0]:
# imports
from pyspark.sql import functions as F
from pyspark.sql import types as T

In [0]:
# Load the survey export (JSON) from the abfss:// location below and show the inferred schema.
# NOTE(review): spark.read.json parses JSON Lines by default; if this export is a single
# multi-line JSON document the reader needs multiLine=True -- confirm against the file.
survey_path = "abfss://edaauthoring@stdncuscomlzwork.dfs.core.windows.net/Qualtrics/Dow Direct Relationship Survey.json"
df = spark.read.json(survey_path)
df.printSchema()

# Alternative file location (workspace path), kept for reference:
#df = spark.read.json(f"/Workspace/Users/ub00049@dow.com/Qualtrics_Source_Files/data/DowDirectRelationshipSurvey.json")
#df.printSchema()

In [0]:
# Do not use this approach: it returns 'responses' as a struct rather than an array, and explode() cannot be called on a struct.

# df_new = df.withColumn(
#     "responses", 
#     F.struct(
#         F.col("responses.responseId").alias("responseId"),
#         F.col("responses.values").alias("values"),
#         F.col("responses.labels").alias("labels")
#     )
# )
# df_new.printSchema()

In [0]:
# Reduce every element of the 'responses' array to responseId, values and labels;
# displayedFields and displayedLabels are dropped.
# The SQL transform() keeps 'responses' as an array (not a struct), so explode()
# can be applied to it in a later step.
keep_fields_sql = """
        transform(
            responses,
            r -> struct(
                r.responseId as responseId,
                r.values as values,
                r.labels as labels
            )
        )
    """
df = df.withColumn("responses", F.expr(keep_fields_sql))
df.printSchema()

In [0]:
# Turn each element of the 'responses' array into its own row, then lift
# responseId, values and labels to top-level columns. 'values' and 'labels'
# remain nested structs at this point.
df_flattened = (
    df
    .select(F.explode("responses").alias("response"))
    .select("response.responseId", "response.values", "response.labels")
)

df_flattened.printSchema()

In [0]:
# Promote every field of the 'values' struct to its own top-level column,
# keeping responseId and the still-nested 'labels' struct alongside.
value_fields = df_flattened.schema["values"].dataType.fieldNames()
value_columns = [F.col(f"values.{name}").alias(name) for name in value_fields]
df_values = df_flattened.select("responseId", "labels", *value_columns)
df_values.printSchema()
display(df_values)

In [0]:
# Keep responseId plus every column whose name starts with "QID".
qid_names = [name for name in df_values.columns if name.startswith("QID")]
qid_cols = df_values.select("responseId", *qid_names)
qid_cols.printSchema()

In [0]:
# Cast every column (responseId included) to StringType.
# Presumably this gives the later melt() a single common value type -- confirm.
string_columns = [F.col(name).cast(T.StringType()) for name in qid_cols.columns]
qid_cols = qid_cols.select(string_columns)
qid_cols.printSchema()

In [0]:
# Validation step: list the non-null values of QID146.
qid146 = F.col("QID146")
display(qid_cols.select(qid146).where(qid146.isNotNull()))


In [0]:
# Unpivot the QID columns into long format: the column name goes to 'qid' and
# its value to 'qid_response', with responseId kept as the identifier.
qid_value_names = [name for name in qid_cols.columns if name != "responseId"]
df_qid_melted = qid_cols.melt(
    ids="responseId",
    values=qid_value_names,
    variableColumnName="qid",
    valueColumnName="qid_response",
)
display(df_qid_melted)

In [0]:
# Promote every field of the 'labels' struct to a top-level column next to responseId.
label_fields = df_flattened.schema["labels"].dataType.fieldNames()
df_labels = df_flattened.select(
    F.col("responseId"),
    *(F.col(f"labels.{name}").alias(name) for name in label_fields),
)

In [0]:
# Cast every label column (responseId included) to StringType.
label_string_columns = [F.col(name).cast(T.StringType()) for name in df_labels.columns]
df_labels = df_labels.select(label_string_columns)
df_labels.printSchema()

In [0]:
# Unpivot the label columns into long format: the column name goes to 'qid_label'
# and its value to 'qid_response_label', with responseId kept as the identifier.
label_value_names = [name for name in df_labels.columns if name != "responseId"]
df_labels_melted = df_labels.melt(
    ids="responseId",
    values=label_value_names,
    variableColumnName="qid_label",
    valueColumnName="qid_response_label",
)
display(df_labels_melted)

In [0]:
# Pair every melted answer with its melted label, matching on (responseId, qid).
# The label side's key column is renamed to 'qid' so the join can be expressed on
# shared column names. In a full outer join this yields one coalesced
# responseId/qid pair, so rows that exist only on the label side keep their keys
# instead of coming back with null responseId and qid.
# Output columns: responseId, qid, qid_response, qid_response_label.
df_melted = df_qid_melted.join(
    df_labels_melted.withColumnRenamed("qid_label", "qid"),
    on=["responseId", "qid"],
    how="outer",
).select("responseId", "qid", "qid_response", "qid_response_label")
display(df_melted)

In [0]:
# Attach the long-format answers/labels to the per-response columns, then remove
# the wide QID* columns and the nested 'labels' struct from the result.
df_final = df_values.join(df_melted, on="responseId", how="outer")
cols_to_drop = [name for name in df_final.columns if name.startswith("QID")]
df_final = df_final.drop("labels", *cols_to_drop)
df_final.printSchema()

In [0]:
# preview the joined long-format table
display(df_final)

In [0]:
# Spot check: show qid, answer and label for the rows where qid is QID36_TEXT.
qid36_text_rows = (
    df_final
    .select("qid", "qid_response", "qid_response_label")
    .where(F.col("qid") == "QID36_TEXT")
)
display(qid36_text_rows)

# wide table below

Converting the labels fields

In [0]:
# Start the label table from responseId and the nested 'labels' struct.
df_labels = df_flattened.select("responseId", "labels")
df_labels.printSchema()

In [0]:
# Flatten the 'labels' struct into one top-level column per field, next to responseId.
# The schema and columns are read from df_flattened (not from df_labels itself) so
# this cell is safe to re-run: once flattened, df_labels no longer has a 'labels'
# column to take the field names from.
label_fields = df_flattened.schema["labels"].dataType.fieldNames()
df_labels = df_flattened.select(
    "responseId",
    *[F.col(f"labels.{name}").alias(name) for name in label_fields]
)
df_labels.printSchema()

In [0]:
# Convert the array-typed label columns to comma-separated strings.

# names of the columns whose data type is an array
array_columns = [f.name for f in df_labels.schema.fields if isinstance(f.dataType, T.ArrayType)]

# array_join turns each array column into a single "a,b,c" string; every other
# column is passed through unchanged and the column order is preserved.
# All columns are rebuilt in one select() instead of calling withColumn() once per
# column: each withColumn() adds another projection to the plan, which Spark's
# documentation warns against doing in a loop.
array_column_set = set(array_columns)
df_labels = df_labels.select(
    [
        F.array_join(F.col(name), ",").alias(name) if name in array_column_set else F.col(name)
        for name in df_labels.columns
    ]
)

df_labels.printSchema()

In [0]:
# Check one of the array columns (QID146) to confirm the conversion to string kept its data.
qid146 = F.col("QID146")
df_labels.select(qid146).where(qid146.isNotNull()).show(truncate=False)

In [0]:
# Append "_label" to every label column name; responseId keeps its name so it
# can still be used as the join key.
suffixed_columns = [
    F.col(name).alias(name if name == "responseId" else name + "_label")
    for name in df_labels.columns
]
df_labels_renamed = df_labels.select(suffixed_columns)
df_labels_renamed.printSchema()

In [0]:
# Build the wide table: left-join the *_label columns onto the value columns by
# responseId, remove the nested 'labels' struct, and order the columns by name.
df_final = df_values.join(df_labels_renamed, "responseId", "left").drop("labels")
df_final = df_final.select(sorted(df_final.columns))
df_final.printSchema()

In [0]:
# preview the wide table (values joined with their *_label columns)
display(df_final)