# Dataframe from JSON


In [1]:
import pyspark
import pyspark.sql.functions as F
import os
import pandas as pd
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, MapType
from pyspark.sql.functions import udf

## The pyarrow library provides a considerable performance improvement

In [None]:
# Build (or reuse) the session connected to the Spark master at spark://127.0.0.1:7077.
# Arrow-based pandas conversion is explicitly switched off for this session.
spark = (
    SparkSession.builder
    .config("spark.sql.execution.arrow.pyspark.enabled", "false")
    .appName('test')
    .master("spark://127.0.0.1:7077")
    .getOrCreate()
)

Contents of the JSON file `test.js`:

```json
[
    {
        "name": "Andre",
        "id": 1,
        "sdictlist":[{"docid":"DOC001", "name":"bla001.txt"}, {"docid":"DOC002", "name":"bla002.txt"}],
        "sarraylist":["a","b","c"]
    },

    {
        "name": "Noé",
        "id": 1,
        "sdictlist":[{"docid":"DOC003", "name":"bla003.txt"}, {"docid":"DOC004", "name":"bla004.txt"}],
        "sarraylist":["a","b","c"]
     
    }
]

```

The easiest way to read a local file is to import it using pandas and then convert it into a Spark DataFrame object.



In [10]:
# Load test.js into a pandas DataFrame first
jdf = pd.read_json('test.js')
# Turn the pandas DataFrame into a Spark DataFrame
sdf = spark.createDataFrame(jdf)
# Print the rows without truncating long cell values
sdf.show(truncate=False)

  for column, series in pdf.iteritems():


+-----+---+------------------------------------------------------------------------------+----------+
|name |id |sdictlist                                                                     |sarraylist|
+-----+---+------------------------------------------------------------------------------+----------+
|Andre|1  |[{name -> bla001.txt, docid -> DOC001}, {name -> bla002.txt, docid -> DOC002}]|[a, b, c] |
|Noé  |1  |[{name -> bla003.txt, docid -> DOC003}, {name -> bla004.txt, docid -> DOC004}]|[a, b, c] |
+-----+---+------------------------------------------------------------------------------+----------+



User Defined Functions (UDFs) are a way to parse information from a column. In this case, the docs inside the JSON file are available as a list of objects, which PySpark parses and conveniently converts into Python data structures: a list of dictionaries, which is easier to manipulate. Brilliant!

In [4]:
@udf
def extract_doc_udf(data_list):
    """Join the 'name' value of every document entry into one string.

    The function is registered with ``udf`` without an explicit return
    type, so the result is exposed as a string column.

    Args:
        data_list: The value of one ``sdictlist`` cell: an iterable of
            dict-like entries exposing ``items()``, or ``None`` when the
            cell is null.

    Returns:
        The collected names joined with ', ' in input order, or ``None``
        when ``data_list`` itself is ``None``.
    """
    # A null cell reaches the UDF as None; propagate null instead of
    # raising TypeError on iteration and failing the whole job.
    if data_list is None:
        return None

    names = [
        value
        for entry in data_list
        if entry is not None  # a null array element carries no name
        for key, value in entry.items()
        if key == 'name' and value is not None  # join() cannot handle None
    ]
    return ', '.join(names)


In [5]:
# Apply 'extract_doc_udf' to the 'sdictlist' column and store the result in a new column called 'udf_res'
dfu = sdf.withColumn('udf_res', extract_doc_udf(F.col('sdictlist')))
# Print the rows without truncating long cell values
dfu.show(truncate=False)



+-----+---+------------------------------------------------------------------------------+----------+----------------------+
|name |id |sdictlist                                                                     |sarraylist|udf_res               |
+-----+---+------------------------------------------------------------------------------+----------+----------------------+
|Andre|1  |[{name -> bla001.txt, docid -> DOC001}, {name -> bla002.txt, docid -> DOC002}]|[a, b, c] |bla001.txt, bla002.txt|
|Noé  |1  |[{name -> bla003.txt, docid -> DOC003}, {name -> bla004.txt, docid -> DOC004}]|[a, b, c] |bla003.txt, bla004.txt|
+-----+---+------------------------------------------------------------------------------+----------+----------------------+



                                                                                

Alternatively, it's possible to use a plain Python function that receives the DataFrame row as a parameter. To do that, it is necessary to use the RDD API instead of a UDF and then convert the result back into a DataFrame object. This approach is useful when you need to parse several fields of a row in an iterative way. Note, however, that performance will drop considerably depending on the amount of data.

In [7]:
def extract_doc_rdd(row):
    """Return a copy of ``row`` extended with a ``doc_names`` field.

    Args:
        row: A ``Row`` (anything exposing ``asDict()``). Its ``sdictlist``
            field, when present, is an iterable of dict-like entries or
            ``None`` for a null cell.

    Returns:
        A new ``Row`` holding every original field. When ``sdictlist`` is
        present, ``doc_names`` is added: the ``name`` values joined with
        ', ' in input order, or ``None`` when ``sdictlist`` is null. A row
        without an ``sdictlist`` field is rebuilt with no extra field.
    """
    fields = row.asDict()

    if 'sdictlist' in fields:
        entries = fields['sdictlist']
        if entries is None:
            # Null cell: iterating None would raise TypeError. Keep the
            # field present so every output row carries the same columns.
            fields['doc_names'] = None
        else:
            names = [
                value
                for entry in entries
                if entry is not None  # a null array element carries no name
                for key, value in entry.items()
                if key == 'name' and value is not None  # join() cannot handle None
            ]
            fields['doc_names'] = ', '.join(names)

    return Row(**fields)
    


In [None]:
# Run 'extract_doc_rdd' on every row through the map method of the DataFrame's RDD
rdd = sdf.rdd.map(extract_doc_rdd)
# Convert the mapped RDD of Row objects back into a DataFrame
edf = rdd.toDF()
# Print the rows without truncating long cell values
edf.show(truncate=False)