In [1]:
# imports
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql import SparkSession
from IPython.core.display import HTML
# Stop the notebook from wrapping <pre> text so wide show() tables stay aligned.
display(HTML("<style>pre { white-space: pre !important; }</style>"))

# Configure the driver memory on the builder, then get (or create) the session.
spark_builder = SparkSession.builder.config("spark.driver.memory", "8g")
spark = spark_builder.getOrCreate()
spark


In [64]:
# read the data
# df = spark.read.json("abfss://edaauthoring@stdncuscomlzwork.dfs.core.windows.net/Qualtrics/Dow Direct Relationship Survey.json")
# df.printSchema()

# Local copy of the survey export; the schema is inferred from the JSON itself.
survey_json_path = "./data/SV_3TPNSvgX6GtUpuJ-Unzip/DowDirectRelationshipSurvey.json"
df = spark.read.json(survey_json_path)
df.printSchema()

# alternative file location: 
#df = spark.read.json(f"/Workspace/Users/ub00049@dow.com/Qualtrics_Source_Files/data/DowDirectRelationshipSurvey.json")
#df.printSchema()

root
 |-- responses: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- displayedFields: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- displayedValues: struct (nullable = true)
 |    |    |    |-- QID10_1: array (nullable = true)
 |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |-- QID10_2: array (nullable = true)
 |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |-- QID10_3: array (nullable = true)
 |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |-- QID10_4: array (nullable = true)
 |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |-- QID10_5: array (nullable = true)
 |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |-- QID12: array (nullable = true)
 |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |-- QID128: array (nullable = true)
 |

In [65]:
# Rebuild every element of the 'responses' array as a struct holding only
# responseId, values and labels; all other fields of the element are dropped.
# transform() maps over the array, so 'responses' remains an array of structs
# and can be passed to explode() in a later step.
trimmed_responses = F.expr("""
        transform(
            responses,
            r -> struct(
                r.responseId as responseId,
                r.values as values,
                r.labels as labels
            )
        )
    """)
df = df.withColumn("responses", trimmed_responses)
# df.printSchema()

In [66]:
# One row per survey response: explode the 'responses' array into a 'response'
# struct column, then lift responseId, values and labels to the top level.
# 'values' and 'labels' are kept as nested structs here.
exploded_responses = df.withColumn("response", F.explode("responses"))
response_fields = ["response.responseId", "response.values", "response.labels"]
df_flattened = exploded_responses.select(*response_fields)

# df_flattened.printSchema()

In [67]:
# Promote every field of the 'values' struct to its own top-level column,
# keeping responseId and the still-nested 'labels' struct alongside them.
value_keys = df_flattened.schema["values"].dataType.fieldNames()
value_columns = [F.col(f"values.{key}").alias(key) for key in value_keys]
df_values = df_flattened.select("responseId", "labels", *value_columns)
# df_values.printSchema()
# df_values.show(truncate=False)

In [75]:
# Keep responseId plus every column whose name starts with "QID".
qid_names = [name for name in df_values.columns if name.startswith("QID")]
df_qid_cols = df_values.select("responseId", *qid_names)
df_qid_cols.printSchema()

root
 |-- responseId: string (nullable = true)
 |-- QID101: long (nullable = true)
 |-- QID103_1: long (nullable = true)
 |-- QID103_2: long (nullable = true)
 |-- QID103_3: long (nullable = true)
 |-- QID103_4: long (nullable = true)
 |-- QID103_5: long (nullable = true)
 |-- QID103_6: long (nullable = true)
 |-- QID103_7: long (nullable = true)
 |-- QID103_8: long (nullable = true)
 |-- QID103_9: long (nullable = true)
 |-- QID105_TEXT: string (nullable = true)
 |-- QID106: long (nullable = true)
 |-- QID107: long (nullable = true)
 |-- QID108: long (nullable = true)
 |-- QID10_1: long (nullable = true)
 |-- QID10_2: long (nullable = true)
 |-- QID10_3: long (nullable = true)
 |-- QID10_4: long (nullable = true)
 |-- QID10_5: long (nullable = true)
 |-- QID10_6: long (nullable = true)
 |-- QID119: long (nullable = true)
 |-- QID11_TEXT: string (nullable = true)
 |-- QID11_TEXT_TRANSLATEDen93ihhyq: string (nullable = true)
 |-- QID12: long (nullable = true)
 |-- QID120_1: long (nullab

In [77]:
# Cast every column (responseId included) to string so that melt() receives
# value columns of a single type.
string_type = T.StringType()
df_qid_cols = df_qid_cols.select(
    [F.col(name).cast(string_type) for name in df_qid_cols.columns]
)
# df_qid_cols.printSchema()

In [None]:
# Validation step: show the non-null QID146 answers after the string cast.
# The frame holding the QID columns is df_qid_cols; `qid_cols` is never defined
# in this notebook, so referencing it raises NameError.
display(
    df_qid_cols
    .select(F.col("QID146"))
    .filter(F.col("QID146").isNotNull())
)


In [78]:
# Unpivot the QID columns into long form: one row per (responseId, qid) pair,
# with the answer stored in 'qid_response'.
qid_value_columns = [name for name in df_qid_cols.columns if name != "responseId"]
df_qid_melted = df_qid_cols.melt(
    ids="responseId",
    values=qid_value_columns,
    variableColumnName="qid",
    valueColumnName="qid_response",
)
# df_qid_melted.show()

In [79]:
# Promote every field of the 'labels' struct to its own top-level column,
# keyed by responseId.
label_keys = df_flattened.schema["labels"].dataType.fieldNames()
label_columns = [F.col(f"labels.{key}").alias(key) for key in label_keys]
df_labels = df_flattened.select(F.col("responseId"), *label_columns)

In [80]:
# Cast every label column (responseId included) to string so that melt()
# receives value columns of a single type.
label_string_type = T.StringType()
df_labels = df_labels.select(
    [F.col(name).cast(label_string_type) for name in df_labels.columns]
)
# df_labels.printSchema()

In [81]:
# Unpivot the label columns into long form: one row per (responseId, qid_label)
# pair, with the label text stored in 'qid_response_label'.
label_value_columns = [name for name in df_labels.columns if name != "responseId"]
df_labels_melted = df_labels.melt(
    ids="responseId",
    values=label_value_columns,
    variableColumnName="qid_label",
    valueColumnName="qid_response_label",
)
# df_labels_melted.show()

In [88]:
# Validate the number of distinct responseIds after melting.
# The joined frame df_melted is only created by a later cell, so referencing it
# here raises NameError on a fresh Restart & Run All; the melted QID frame
# already exists at this point and carries the same responseId key.
df_qid_melted.select(F.countDistinct("responseId")).show()

+--------------------------+
|count(DISTINCT responseId)|
+--------------------------+
|                     40454|
+--------------------------+



In [83]:
# Count the distinct responseIds in the wide values frame, as a reference
# figure for the melted frames.
distinct_response_ids = F.countDistinct("responseId")
df_values.select(distinct_response_ids).show()

+--------------------------+
|count(DISTINCT responseId)|
+--------------------------+
|                     40454|
+--------------------------+



In [84]:
# Join the melted QID frame with the melted labels frame into a single frame
# holding responseId, qid, qid_response and qid_response_label. Rows match when
# both the question id and the responseId agree.
# NOTE(review): this is a full outer join, but the select takes responseId, qid
# and qid_response from df_qid_melted only, so rows present only in
# df_labels_melted come out with those columns null — confirm this is intended.
same_question = df_qid_melted["qid"] == df_labels_melted["qid_label"]
same_response = df_qid_melted["responseId"] == df_labels_melted["responseId"]
joined_answers = df_qid_melted.join(
    df_labels_melted,
    same_question & same_response,
    "outer",
)
df_melted = joined_answers.select(
    df_qid_melted["*"],
    df_labels_melted["qid_response_label"],
)
# df_melted.show()

In [85]:
# Attach the long-form (questionId, response, responseLabel) rows to the wide
# values frame by responseId, then drop the wide QID columns and the nested
# 'labels' struct, which the long-form columns now replace.
on_response_id = df_values["responseId"] == df_melted["responseId"]
df_final = df_values.join(df_melted, on_response_id, how="inner").select(
    df_values["*"],
    df_melted["qid"].alias("questionId"),
    df_melted["qid_response"].alias("response"),
    df_melted["qid_response_label"].alias("responseLabel"),
)
cols_to_drop = [name for name in df_final.columns if name.startswith("QID")]
df_final = df_final.drop(*cols_to_drop, "labels")
# df_final.printSchema()

In [86]:
# Preview the first 200 rows of the final frame.
# display(df_final)
df_final.show(n=200)

+-----------------+--------------------+------------------------------------------+--------------------------------------------+-------+-------------+--------------------+------------+-------------+-----------+---------------+--------------+------------------+--------------------+--------------------+------------+-------+------------+----------+--------------------+--------------------+----+---------+-------------+------------+----------------+--------------------+--------------------+--------------------+----------+-------+--------------------+-----------+-----------------+--------------------+--------------------+--------+---------------+--------------+------------+-------------+------+----------------------------+----+-----------------+-------------------+--------+--------------------+--------+--------------+----------------+-----------------+--------+--------------------+------------------+-----------------+--------------------+--------------------+------+------------+-------------

In [None]:
# Spot-check the free-text answers for QID36_TEXT in the final frame.
# df_final exposes the melted columns under the aliases questionId / response /
# responseLabel; the pre-alias names qid / qid_response / qid_response_label do
# not exist on it and cannot be resolved.
display(
    df_final
    .select(F.col("questionId"), F.col("response"), F.col("responseLabel"))
    .filter(F.col("questionId") == "QID36_TEXT")
)