In [1]:
from pyspark.sql import SparkSession
import pandas as pd
import json
from pyspark.sql import functions as F
from pyspark.sql import types as T

# Build a Spark session (or pick up the one already running) with 8 GB requested
# for the driver, then echo the session object as the cell output.
spark = SparkSession.builder.config("spark.driver.memory", "8g").getOrCreate()
spark

In [2]:
# Load the raw survey export and print the schema Spark inferred for it
survey_json_path = "./data/SV_3TPNSvgX6GtUpuJ-Unzip/DowDirectRelationshipSurvey.json"
df = spark.read.json(survey_json_path)
df.printSchema()

root
 |-- responses: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- displayedFields: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- displayedValues: struct (nullable = true)
 |    |    |    |-- QID10_1: array (nullable = true)
 |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |-- QID10_2: array (nullable = true)
 |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |-- QID10_3: array (nullable = true)
 |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |-- QID10_4: array (nullable = true)
 |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |-- QID10_5: array (nullable = true)
 |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |-- QID12: array (nullable = true)
 |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |-- QID128: array (nullable = true)
 |

In [3]:
# Rebuild every element of the 'responses' array as a struct holding only the
# three fields used further down: responseId, values and labels.
responses_trimmed = F.expr("""
        transform(
            responses,
            r -> struct(
                r.responseId as responseId,
                r.values as values,
                r.labels as labels
            )
        )
    """)
df = df.withColumn("responses", responses_trimmed)

# Print the schema to confirm that only those fields remain
df.printSchema()


root
 |-- responses: array (nullable = true)
 |    |-- element: struct (containsNull = false)
 |    |    |-- responseId: string (nullable = true)
 |    |    |-- values: struct (nullable = true)
 |    |    |    |-- AccountName: string (nullable = true)
 |    |    |    |-- AccountName_5e02b1554bfe4b0dac61f801SenPol: long (nullable = true)
 |    |    |    |-- AccountName_5e02b1554bfe4b0dac61f801SenScore: long (nullable = true)
 |    |    |    |-- BPCode: string (nullable = true)
 |    |    |    |-- COVIDComments: string (nullable = true)
 |    |    |    |-- CRMLink: string (nullable = true)
 |    |    |    |-- CRMOwner: string (nullable = true)
 |    |    |    |-- CRMOwnerEmail: string (nullable = true)
 |    |    |    |-- ContactArea: string (nullable = true)
 |    |    |    |-- ContactAreaCode: string (nullable = true)
 |    |    |    |-- ContactCountry: string (nullable = true)
 |    |    |    |-- ContactCountryCode: string (nullable = true)
 |    |    |    |-- ContactEmail: string (nu

In [4]:
# Promote responseId, values and labels to top-level columns.
# explode() emits one row per element of 'responses'; the select() then lifts
# responseId out of the element struct, while 'values' and 'labels' are carried
# over as nested structs.
df_flattened = (
    df
    .withColumn("response", F.explode("responses"))
    .select(
        "response.responseId",
        "response.values",
        "response.labels",
    )
)

# Inspect the resulting schema, then preview the rows
df_flattened.printSchema()
df_flattened.show()


root
 |-- responseId: string (nullable = true)
 |-- values: struct (nullable = true)
 |    |-- AccountName: string (nullable = true)
 |    |-- AccountName_5e02b1554bfe4b0dac61f801SenPol: long (nullable = true)
 |    |-- AccountName_5e02b1554bfe4b0dac61f801SenScore: long (nullable = true)
 |    |-- BPCode: string (nullable = true)
 |    |-- COVIDComments: string (nullable = true)
 |    |-- CRMLink: string (nullable = true)
 |    |-- CRMOwner: string (nullable = true)
 |    |-- CRMOwnerEmail: string (nullable = true)
 |    |-- ContactArea: string (nullable = true)
 |    |-- ContactAreaCode: string (nullable = true)
 |    |-- ContactCountry: string (nullable = true)
 |    |-- ContactCountryCode: string (nullable = true)
 |    |-- ContactEmail: string (nullable = true)
 |    |-- ContactName: string (nullable = true)
 |    |-- CustomerType: string (nullable = true)
 |    |-- Digital: string (nullable = true)
 |    |-- Distinction: string (nullable = true)
 |    |-- GlobalCode: string (nulla

In [5]:
# Turn every field of the 'values' struct into its own top-level column,
# keeping responseId as the row key.
# "values.*" lets Spark expand the struct from its schema (same fields, same
# order, same names). Building "values.<field>" strings by hand re-parses each
# field name as a dotted path, which breaks for names that contain a dot.
df_values = df_flattened.select("responseId", "values.*")
df_values.printSchema()

root
 |-- responseId: string (nullable = true)
 |-- AccountName: string (nullable = true)
 |-- AccountName_5e02b1554bfe4b0dac61f801SenPol: long (nullable = true)
 |-- AccountName_5e02b1554bfe4b0dac61f801SenScore: long (nullable = true)
 |-- BPCode: string (nullable = true)
 |-- COVIDComments: string (nullable = true)
 |-- CRMLink: string (nullable = true)
 |-- CRMOwner: string (nullable = true)
 |-- CRMOwnerEmail: string (nullable = true)
 |-- ContactArea: string (nullable = true)
 |-- ContactAreaCode: string (nullable = true)
 |-- ContactCountry: string (nullable = true)
 |-- ContactCountryCode: string (nullable = true)
 |-- ContactEmail: string (nullable = true)
 |-- ContactName: string (nullable = true)
 |-- CustomerType: string (nullable = true)
 |-- Digital: string (nullable = true)
 |-- Distinction: string (nullable = true)
 |-- GlobalCode: string (nullable = true)
 |-- GlobalName: string (nullable = true)
 |-- Guide: string (nullable = true)
 |-- LST: string (nullable = true)
 |

In [None]:
# Collect the flattened 'values' columns to the driver as a pandas DataFrame and render it.
# NOTE(review): toPandas() brings every row of df_values into driver memory —
# confirm the frame is small enough, or preview a bounded subset instead.
df_values_pandas = df_values.toPandas()
display(df_values_pandas)

# transforming the labels column

In [6]:
# Keep only the row key and the nested 'labels' struct, then print the schema
df_labels = df_flattened.select("responseId", "labels")
df_labels.printSchema()

root
 |-- responseId: string (nullable = true)
 |-- labels: struct (nullable = true)
 |    |-- QID101: string (nullable = true)
 |    |-- QID103_1: string (nullable = true)
 |    |-- QID103_2: string (nullable = true)
 |    |-- QID103_3: string (nullable = true)
 |    |-- QID103_4: string (nullable = true)
 |    |-- QID103_5: string (nullable = true)
 |    |-- QID103_6: string (nullable = true)
 |    |-- QID103_7: string (nullable = true)
 |    |-- QID103_8: string (nullable = true)
 |    |-- QID103_9: string (nullable = true)
 |    |-- QID106: string (nullable = true)
 |    |-- QID107: string (nullable = true)
 |    |-- QID108: string (nullable = true)
 |    |-- QID10_1: string (nullable = true)
 |    |-- QID10_2: string (nullable = true)
 |    |-- QID10_3: string (nullable = true)
 |    |-- QID10_4: string (nullable = true)
 |    |-- QID10_5: string (nullable = true)
 |    |-- QID10_6: string (nullable = true)
 |    |-- QID119: string (nullable = true)
 |    |-- QID12: string (nullab

In [7]:
# Flatten the 'labels' struct: one top-level column per label field, plus responseId.
# The frame is derived from df_flattened rather than from df_labels itself, so
# this cell can be re-run safely: once df_labels has been overwritten with the
# flat version it no longer has a 'labels' column to read the field list from.
# "labels.*" expands the struct fields in schema order under their own names.
df_labels = df_flattened.select("responseId", "labels.*")
df_labels.printSchema()

root
 |-- responseId: string (nullable = true)
 |-- QID101: string (nullable = true)
 |-- QID103_1: string (nullable = true)
 |-- QID103_2: string (nullable = true)
 |-- QID103_3: string (nullable = true)
 |-- QID103_4: string (nullable = true)
 |-- QID103_5: string (nullable = true)
 |-- QID103_6: string (nullable = true)
 |-- QID103_7: string (nullable = true)
 |-- QID103_8: string (nullable = true)
 |-- QID103_9: string (nullable = true)
 |-- QID106: string (nullable = true)
 |-- QID107: string (nullable = true)
 |-- QID108: string (nullable = true)
 |-- QID10_1: string (nullable = true)
 |-- QID10_2: string (nullable = true)
 |-- QID10_3: string (nullable = true)
 |-- QID10_4: string (nullable = true)
 |-- QID10_5: string (nullable = true)
 |-- QID10_6: string (nullable = true)
 |-- QID119: string (nullable = true)
 |-- QID12: string (nullable = true)
 |-- QID120_1: string (nullable = true)
 |-- QID120_2: string (nullable = true)
 |-- QID120_3: string (nullable = true)
 |-- QID120_

In [None]:
# Show the distinct value combinations held by the array-typed label columns
array_label_names = [
    field.name
    for field in df_labels.schema.fields
    if isinstance(field.dataType, T.ArrayType)
]
df_arrays = df_labels.select(*array_label_names).dropDuplicates()
display(df_arrays.toPandas())

In [8]:
# Convert the array-typed label columns to comma-separated strings.

# Names of the columns whose Spark type is an array
array_columns = [f.name for f in df_labels.schema.fields if isinstance(f.dataType, T.ArrayType)]

# array_join collapses each array into one string with "," between elements.
# All columns are rebuilt in a single select(), in their original order and
# under their original names. Calling withColumn() once per column in a loop
# adds one projection per call, which the PySpark documentation warns can
# produce very large query plans.
_array_column_names = set(array_columns)
df_labels = df_labels.select(
    *[
        F.array_join(F.col(name), ",").alias(name) if name in _array_column_names else F.col(name)
        for name in df_labels.columns
    ]
)

df_labels.printSchema()

root
 |-- responseId: string (nullable = true)
 |-- QID101: string (nullable = true)
 |-- QID103_1: string (nullable = true)
 |-- QID103_2: string (nullable = true)
 |-- QID103_3: string (nullable = true)
 |-- QID103_4: string (nullable = true)
 |-- QID103_5: string (nullable = true)
 |-- QID103_6: string (nullable = true)
 |-- QID103_7: string (nullable = true)
 |-- QID103_8: string (nullable = true)
 |-- QID103_9: string (nullable = true)
 |-- QID106: string (nullable = true)
 |-- QID107: string (nullable = true)
 |-- QID108: string (nullable = true)
 |-- QID10_1: string (nullable = true)
 |-- QID10_2: string (nullable = true)
 |-- QID10_3: string (nullable = true)
 |-- QID10_4: string (nullable = true)
 |-- QID10_5: string (nullable = true)
 |-- QID10_6: string (nullable = true)
 |-- QID119: string (nullable = true)
 |-- QID12: string (nullable = true)
 |-- QID120_1: string (nullable = true)
 |-- QID120_2: string (nullable = true)
 |-- QID120_3: string (nullable = true)
 |-- QID120_

In [None]:
# Spot-check one converted column: list its non-null values untruncated to
# confirm the array-to-string conversion kept the data
qid146_is_present = F.col("QID146").isNotNull()
df_labels.select("QID146").filter(qid146_is_present).show(truncate=False)

In [15]:
# Append "_label" to every label column name; responseId keeps its plain name
# because it is the key used to join back onto the values frame.
df_labels_renamed = df_labels.select(
    *[
        F.col(name).alias(name if name == "responseId" else name + "_label")
        for name in df_labels.columns
    ]
)
df_labels_renamed.printSchema()

root
 |-- responseId: string (nullable = true)
 |-- QID101_label: string (nullable = true)
 |-- QID103_1_label: string (nullable = true)
 |-- QID103_2_label: string (nullable = true)
 |-- QID103_3_label: string (nullable = true)
 |-- QID103_4_label: string (nullable = true)
 |-- QID103_5_label: string (nullable = true)
 |-- QID103_6_label: string (nullable = true)
 |-- QID103_7_label: string (nullable = true)
 |-- QID103_8_label: string (nullable = true)
 |-- QID103_9_label: string (nullable = true)
 |-- QID106_label: string (nullable = true)
 |-- QID107_label: string (nullable = true)
 |-- QID108_label: string (nullable = true)
 |-- QID10_1_label: string (nullable = true)
 |-- QID10_2_label: string (nullable = true)
 |-- QID10_3_label: string (nullable = true)
 |-- QID10_4_label: string (nullable = true)
 |-- QID10_5_label: string (nullable = true)
 |-- QID10_6_label: string (nullable = true)
 |-- QID119_label: string (nullable = true)
 |-- QID12_label: string (nullable = true)
 |-- Q

In [16]:
# Left-join the suffixed label columns onto the value columns by responseId,
# then order the columns by name (Python's default, case-sensitive string sort).
values_with_labels = df_values.join(df_labels_renamed, on="responseId", how="left")
df_final = values_with_labels.select(*sorted(values_with_labels.columns))
df_final.printSchema()

root
 |-- AccountName: string (nullable = true)
 |-- AccountName_5e02b1554bfe4b0dac61f801SenPol: long (nullable = true)
 |-- AccountName_5e02b1554bfe4b0dac61f801SenScore: long (nullable = true)
 |-- BPCode: string (nullable = true)
 |-- COVIDComments: string (nullable = true)
 |-- CRMLink: string (nullable = true)
 |-- CRMOwner: string (nullable = true)
 |-- CRMOwnerEmail: string (nullable = true)
 |-- ContactArea: string (nullable = true)
 |-- ContactAreaCode: string (nullable = true)
 |-- ContactCountry: string (nullable = true)
 |-- ContactCountryCode: string (nullable = true)
 |-- ContactEmail: string (nullable = true)
 |-- ContactName: string (nullable = true)
 |-- CustomerType: string (nullable = true)
 |-- Digital: string (nullable = true)
 |-- Distinction: string (nullable = true)
 |-- GlobalCode: string (nullable = true)
 |-- GlobalName: string (nullable = true)
 |-- Guide: string (nullable = true)
 |-- LST: string (nullable = true)
 |-- OwnerArea: string (nullable = true)
 |-

In [18]:
# Render the joined result as a pandas DataFrame.
# NOTE(review): toPandas() collects every row to the driver, and the rendered
# table includes recipient names and e-mail addresses — clear or redact this
# output before sharing the notebook.
display(df_final.toPandas())

Unnamed: 0,AccountName,AccountName_5e02b1554bfe4b0dac61f801SenPol,AccountName_5e02b1554bfe4b0dac61f801SenScore,BPCode,COVIDComments,CRMLink,CRMOwner,CRMOwnerEmail,ContactArea,ContactAreaCode,...,progress,recipientEmail,recipientFirstName,recipientLastName,recordedDate,responseId,startDate,status,status_label,userLanguage
0,XINA OPERATIONS AND MAINTENANC,,,2436605,,https://dowchemical.crm.dynamics.com/main.aspx...,Kevin Pillay,,EMEAI,1,...,100,christiaan.vanrooyen@abengoa.com,Christiaan Rudolph,van Rooyen,2021-07-24T11:17:26.019Z,R_003olKHEbIQa2fI,2021-07-24T11:17:25.993Z,4,Imported,EN
1,BENVIC EUROPE SAS,,,1982215,,,Sven Petri,,EMEAI,1,...,100,didier.woerther@benvic.com,Didier,,2021-07-24T11:17:14.910Z,R_008TKF0DV29r6Mm,2021-07-24T11:17:14.905Z,4,Imported,FR
2,ECOPLAST PLASTIK AMB,,,1725203,,,Dicle Gunbay,,EMEAI,1,...,100,nurselkizilkan@ecoplast-film.com,Nursel,,2021-07-24T11:17:36.965Z,R_00wicD2aXNMUV9k,2021-07-24T11:17:36.949Z,4,Imported,TR
3,ENTEGRE HARC A S,,,1677275,,,Kaya Akyuz,,EMEAI,1,...,100,mustafa.kirmizi@entegreharc.com.tr,Mustafa,Kirmizi,2021-07-24T11:17:45.947Z,R_01ayXljDgiXbAIS,2021-07-24T11:17:45.905Z,4,Imported,TR
4,ADAMIETZ SP Z O O,,,2217778,,,,,EMEAI,1,...,3,katarzyna.pluciennik@arpanel.pl,Katarzyna,,2021-07-24T11:17:47.761Z,R_021bboTA4DY9Tn0,2021-07-24T11:17:47.751Z,4,Imported,PL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40449,TEKNOS,,,1504454,,https://dowchemical.crm.dynamics.com/main.aspx...,Zafar Iqbal,ZIqbal@dow.com,EMEAI,1,...,100,fredrik.calenius@teknos.fi,Fredrik,Calenius,2023-05-12T07:01:40.229Z,R_yqJKSkzBwcsbEqJ,2023-05-12T06:58:42Z,0,IP Address,EN
40450,MICROPLAST - MEDELLIN,,,341912,,https://dowchemical.crm.dynamics.com/main.aspx...,Sandy Tavares,sftavares@dow.com,LAA,3,...,100,jgiraldo@microplast.com.co,James,Giraldo,2022-05-20T14:55:20.088Z,R_ysVTohnHumYvcyd,2022-05-20T14:43:10Z,0,IP Address,ES
40451,INGERSOLL-RAND INDUSTRIAL IRELAND,,,2463853,,https://dowchemical.crm.dynamics.com/main.aspx...,Vincenzo Pennetta,VPennetta@dow.com,EMEAI,1,...,100,junmei_zhu@eu.irco.com,Junmei,Zhu,2022-05-26T12:54:39.555Z,R_z72GF0PEHdwduq5,2022-05-26T12:44:41Z,0,IP Address,EN
40452,YONG FUN TRADING,,,859188,,https://dowchemical.crm.dynamics.com/main.aspx...,Carvin TENG,c.c.teng@dow.com,APAC,2,...,100,lovemin0607@yahoo.com.tw,Ryan,Tsai,2022-11-08T07:39:14.281Z,R_zVE8IrrL0u19Rh7,2022-11-08T07:36:47Z,0,IP Address,ZH-T
