<a href="https://colab.research.google.com/github/balakumar-dataengineer/testrepo/blob/master/Pyspark_To_json_function.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from pyspark.sql import SparkSession

# Build the notebook's Spark session: application name 'Pyspark', master
# 'local'. getOrCreate() hands back an existing session when one is already
# active rather than constructing a second one.
spark = (
    SparkSession.builder
    .appName('Pyspark')
    .master('local')
    .getOrCreate()
)

# Echo the session object as a quick confirmation that it exists.
print(spark)

<pyspark.sql.session.SparkSession object at 0x796710217bd0>


In [7]:
from pyspark.sql.types import StringType,StructType,StructField,IntegerType
from pyspark.sql.functions import to_json

# Two sample rows shaped as (id, name, skills). The skills are given
# positionally and are matched to skill1/skill2/skill3 by the explicit
# schema defined below.
data = [
    (1, 'bala', ['sql', 'python', 'pyspark']),
    (2, 'kumar', ['postgressql', 'pandas', 'apachebeam']),
]

# Explicit schema: `skills` is a nested struct of three string fields, so the
# column is typed as a struct instead of being inferred from the Python values.
schema = StructType([
    StructField('id', IntegerType(), nullable=True),
    StructField('name', StringType(), nullable=True),
    StructField(
        'skills',
        StructType([
            StructField(f'skill{n}', StringType(), nullable=True)
            for n in (1, 2, 3)
        ]),
    ),
])

# Materialize the DataFrame, then print both its rows and its schema tree.
df = spark.createDataFrame(data=data, schema=schema)
df.show()
df.printSchema()

+---+-----+--------------------+
| id| name|              skills|
+---+-----+--------------------+
|  1| bala|{sql, python, pys...|
|  2|kumar|{postgressql, pan...|
+---+-----+--------------------+

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- skills: struct (nullable = true)
 |    |-- skill1: string (nullable = true)
 |    |-- skill2: string (nullable = true)
 |    |-- skill3: string (nullable = true)



In [8]:
# Add `skillsDetails`, the JSON-string rendering of the `skills` column, and
# print the result; truncate=False keeps long cell values from being cut off.
(
    df
    .withColumn('skillsDetails', to_json('skills'))
    .show(truncate=False)
)

+---+-----+---------------------------------+----------------------------------------------------------------+
|id |name |skills                           |skillsDetails                                                   |
+---+-----+---------------------------------+----------------------------------------------------------------+
|1  |bala |{sql, python, pyspark}           |{"skill1":"sql","skill2":"python","skill3":"pyspark"}           |
|2  |kumar|{postgressql, pandas, apachebeam}|{"skill1":"postgressql","skill2":"pandas","skill3":"apachebeam"}|
+---+-----+---------------------------------+----------------------------------------------------------------+



In [11]:
# Same two people as before, but each row's skills are now a Python dict.
data1 = [
    (
        1,
        'bala',
        {'skill1': 'sql', 'skill2': 'python', 'skill3': 'pyspark'},
    ),
    (
        2,
        'kumar',
        {'skill1': 'postgressql', 'skill2': 'pandas', 'skill3': 'apachebeam'},
    ),
]

# Only column names are supplied here, so the column types are inferred from
# the data (the printed schema reports `id` as long and `skills` as a
# string -> string map).
schema = ['id', 'name', 'skills']

# NOTE: this rebinds `df`, replacing the struct-based DataFrame built earlier.
df = spark.createDataFrame(data=data1, schema=schema)
df.show()
df.printSchema()

+---+-----+--------------------+
| id| name|              skills|
+---+-----+--------------------+
|  1| bala|{skill3 -> pyspar...|
|  2|kumar|{skill3 -> apache...|
+---+-----+--------------------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- skills: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



In [14]:
# Keep the three original columns and append `skillsjs`, the JSON-string
# rendering of the `skills` map column.
df1 = df.select(
    'id',
    'name',
    'skills',
    to_json(df.skills).alias('skillsjs'),
)

# Print the full (untruncated) rows, then the resulting schema.
df1.show(truncate=False)
df1.printSchema()

+---+-----+---------------------------------------------------------------+----------------------------------------------------------------+
|id |name |skills                                                         |skillsjs                                                        |
+---+-----+---------------------------------------------------------------+----------------------------------------------------------------+
|1  |bala |{skill3 -> pyspark, skill1 -> sql, skill2 -> python}           |{"skill3":"pyspark","skill1":"sql","skill2":"python"}           |
|2  |kumar|{skill3 -> apachebeam, skill1 -> postgressql, skill2 -> pandas}|{"skill3":"apachebeam","skill1":"postgressql","skill2":"pandas"}|
+---+-----+---------------------------------------------------------------+----------------------------------------------------------------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- skills: map (nullable = true)
 |    |-- key: string
 |    |-- value: string