In [0]:
from pyspark.sql.session import SparkSession

In [0]:
import os
from pyspark.sql import SparkSession
# Obtain the notebook's Spark session via getOrCreate() under the app name "sss".
spark = (
    SparkSession.builder
    .appName("sss")
    .getOrCreate()
)

In [0]:
# Sample rows as (name, knownLanguages, properties). The rows include null
# array elements, empty strings, a null array/map and an empty map, which the
# explode examples further down run against.
arrayData = [
    ("James", ["Java", "Scala"], {"hair": "black", "eye": "brown"}),
    ("Michael", ["Spark", "Java", None], {"hair": "brown", "eye": None}),
    ("Robert", ["CSharp", ""], {"hair": "red", "eye": ""}),
    ("Washington", None, None),
    ("Jefferson", ["1", "2"], {}),
]

# Only column names are given here; the column types come from schema inference.
df = spark.createDataFrame(arrayData, ["name", "knownLanguages", "properties"])
df.printSchema()
df.show(truncate=False)

root
 |-- name: string (nullable = true)
 |-- knownLanguages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)

+----------+-------------------+-----------------------------+
|name      |knownLanguages     |properties                   |
+----------+-------------------+-----------------------------+
|James     |[Java, Scala]      |{eye -> brown, hair -> black}|
|Michael   |[Spark, Java, null]|{eye -> null, hair -> brown} |
|Robert    |[CSharp, ]         |{eye -> , hair -> red}       |
|Washington|null               |null                         |
|Jefferson |[1, 2]             |{}                           |
+----------+-------------------+-----------------------------+



In [0]:
from pyspark.sql.functions import explode
# explode() emits one output row per array element, in a column with the
# default name `col`. The row whose array is null produces no output rows.
df2 = df.select("name", explode("knownLanguages"))
df2.printSchema()
df2.show()

root
 |-- name: string (nullable = true)
 |-- col: string (nullable = true)

+---------+------+
|     name|   col|
+---------+------+
|    James|  Java|
|    James| Scala|
|  Michael| Spark|
|  Michael|  Java|
|  Michael|  null|
|   Robert|CSharp|
|   Robert|      |
|Jefferson|     1|
|Jefferson|     2|
+---------+------+



In [0]:

from pyspark.sql.functions import explode
# Exploding a map column yields one row per entry, split into `key` and
# `value` columns. Rows whose map is null or empty produce no output rows.
df3 = df.select("name", explode("properties"))
df3.printSchema()
df3.show()

root
 |-- name: string (nullable = true)
 |-- key: string (nullable = false)
 |-- value: string (nullable = true)

+-------+----+-----+
|   name| key|value|
+-------+----+-----+
|  James| eye|brown|
|  James|hair|black|
|Michael| eye| null|
|Michael|hair|brown|
| Robert| eye|     |
| Robert|hair|  red|
+-------+----+-----+



Example: flattening a complex (nested) JSON document with `explode`

In [0]:
from pyspark.sql.functions import explode, col

In [0]:
# Nested JSON sample used by the following cells: a top-level `persons` array
# where each person has `name`, `age` and a `cars` array, and each car has a
# `name` plus a `models` array of strings.
source_json = """
{
    "persons": [
        {
            "name": "John",
            "age": 30,
            "cars": [
                {
                    "name": "Ford",
                    "models": [
                        "Fiesta",
                        "Focus",
                        "Mustang"
                    ]
                },
                {
                    "name": "BMW",
                    "models": [
                        "320",
                        "X3",
                        "X5"
                    ]
                }
            ]
        },
        {
            "name": "Peter",
            "age": 46,
            "cars": [
                {
                    "name": "Huyndai",
                    "models": [
                        "i10",
                        "i30"
                    ]
                },
                {
                    "name": "Mercedes",
                    "models": [
                        "E320",
                        "E63 AMG"
                    ]
                }
            ]
        }
    ]
}
"""

In [0]:
# Write the sample JSON text to /tmp/source.json so it can be read back with spark.read.
# NOTE(review): the trailing True is presumably the overwrite flag — confirm against the dbutils.fs.put docs.
dbutils.fs.put("/tmp/source.json", source_json, True)

Wrote 1074 bytes.
Out[4]: True

In [0]:
# The sample document spans many lines, so the reader is put in multi-line mode
# instead of the default one-JSON-record-per-line mode.
# NOTE: this rebinds `df`, replacing the DataFrame built from `arrayData` earlier.
df = spark.read.json("/tmp/source.json", multiLine=True)

In [0]:
# Print the loaded rows without truncating the nested `persons` column,
# then the schema Spark inferred for the nested document.
df.show(truncate=False)
df.printSchema()

+-----------------------------------------------------------------------------------------------------------------------------------------+
|persons                                                                                                                                  |
+-----------------------------------------------------------------------------------------------------------------------------------------+
|[{30, [{[Fiesta, Focus, Mustang], Ford}, {[320, X3, X5], BMW}], John}, {46, [{[i10, i30], Huyndai}, {[E320, E63 AMG], Mercedes}], Peter}]|
+-----------------------------------------------------------------------------------------------------------------------------------------+

root
 |-- persons: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- age: long (nullable = true)
 |    |    |-- cars: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- models: array (nullable = true)
 |   

In [0]:
# One output row per element of the `persons` array; the alias keeps the
# exploded struct under the column name `persons`.
persons = df.select(explode(col("persons")).alias("persons"))
persons.show(truncate=False)

+--------------------------------------------------------------------+
|persons                                                             |
+--------------------------------------------------------------------+
|{30, [{[Fiesta, Focus, Mustang], Ford}, {[320, X3, X5], BMW}], John}|
|{46, [{[i10, i30], Huyndai}, {[E320, E63 AMG], Mercedes}], Peter}   |
+--------------------------------------------------------------------+



In [0]:
# Flatten one level: one row per car, carrying the owner's name and age along.
# The last column reads `name` from the struct that explode() produces under
# the alias `persons_cars_brands` in this same select.
persons_cars = persons.select(
    col("persons.name").alias("persons_name"),
    col("persons.age").alias("persons_age"),
    explode(col("persons.cars")).alias("persons_cars_brands"),
    col("persons_cars_brands.name").alias("persons_cars_brand"),
)

In [0]:
# show() with default arguments: long values in the struct column are cut off with "...".
persons_cars.show()

+------------+-----------+--------------------+------------------+
|persons_name|persons_age| persons_cars_brands|persons_cars_brand|
+------------+-----------+--------------------+------------------+
|        John|         30|{[Fiesta, Focus, ...|              Ford|
|        John|         30|{[320, X3, X5], BMW}|               BMW|
|       Peter|         46|{[i10, i30], Huyn...|           Huyndai|
|       Peter|         46|{[E320, E63 AMG],...|          Mercedes|
+------------+-----------+--------------------+------------------+



In [0]:
# Flatten the second level: one row per car model. The intermediate struct
# column `persons_cars_brands` is only read here and is not kept in the result.
persons_cars_models = persons_cars.select(
    "persons_name",
    "persons_age",
    "persons_cars_brand",
    explode("persons_cars_brands.models").alias("persons_cars_model"),
)

In [0]:
# Render the fully flattened result: one row per person / brand / model combination.
display(persons_cars_models)

persons_name,persons_age,persons_cars_brand,persons_cars_model
John,30,Ford,Fiesta
John,30,Ford,Focus
John,30,Ford,Mustang
John,30,BMW,320
John,30,BMW,X3
John,30,BMW,X5
Peter,46,Huyndai,i10
Peter,46,Huyndai,i30
Peter,46,Mercedes,E320
Peter,46,Mercedes,E63 AMG


Next example: complex JSON where a `data` struct contains an `awards` array

In [0]:
# Reference only: this literal is not assigned to anything.
# NOTE(review): presumably it mirrors the contents of the test.json file loaded
# in the next cell — verify against the uploaded file.
[{
    "data": {
        "emp_id": "12345",
        "emp_name": "Mohan",
        "awards": [
            {
                "award_type": "Internal",
                "award_name": "Best_emp_of_the_year",
                "year": "2000"
            },
            {
                "award_type": "External",
                "award_name": "Best_presenter",
                "year": "2001"
            }
        ]
    }
}]

In [0]:
# Load the uploaded test.json in multi-line mode.
# NOTE(review): the path points at a user-specific upload folder, so this cell
# only runs in a workspace where that file exists — consider making it configurable.
df1 = spark.read.json(
    "dbfs:/FileStore/shared_uploads/sinha.ashish.4.u@gmail.com/test.json",
    multiLine=True,
)

In [0]:
# Print the loaded rows without truncating the nested `data` column, then the inferred schema.
df1.show(truncate= False)
df1.printSchema()

+------------------------------------------------------------------------------------------+
|data                                                                                      |
+------------------------------------------------------------------------------------------+
|{[{Best_emp_of_the_year, Internal, 2000}, {Best_presenter, External, 2001}], 12345, Mohan}|
+------------------------------------------------------------------------------------------+

root
 |-- data: struct (nullable = true)
 |    |-- awards: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- award_name: string (nullable = true)
 |    |    |    |-- award_type: string (nullable = true)
 |    |    |    |-- year: string (nullable = true)
 |    |-- emp_id: string (nullable = true)
 |    |-- emp_name: string (nullable = true)



In [0]:
# Lift the scalar employee fields out of the `data` struct and emit one row
# per element of the `awards` array.
df_person = df1.select(
    col("data.emp_name").alias("name"),
    col("data.emp_id").alias("id"),
    explode(col("data.awards")).alias("awards"),
)

In [0]:
# Print one row per award; each award is still a single struct column at this stage.
df_person.show(truncate = False)

+-----+-----+--------------------------------------+
|name |id   |awards                                |
+-----+-----+--------------------------------------+
|Mohan|12345|{Best_emp_of_the_year, Internal, 2000}|
|Mohan|12345|{Best_presenter, External, 2001}      |
+-----+-----+--------------------------------------+



In [0]:
# Expand each award struct into top-level columns. A nested field selected by
# its dotted path is named after the leaf only (award_name, award_type, year).
df_awards = df_person.select(
    "name",
    "id",
    "awards.award_name",
    "awards.award_type",
    "awards.year",
)

In [0]:
# Print the final flat table: one row per award with all fields as plain columns.
df_awards.show()

+-----+-----+--------------------+----------+----+
| name|   id|          award_name|award_type|year|
+-----+-----+--------------------+----------+----+
|Mohan|12345|Best_emp_of_the_year|  Internal|2000|
|Mohan|12345|      Best_presenter|  External|2001|
+-----+-----+--------------------+----------+----+

