In [126]:
from IPython.display   import display, HTML

import findspark
findspark.init()

import pyspark
from pyspark.sql           import SparkSession
from pyspark.sql.functions import col, explode, udf
from pyspark.sql.types     import Row, LongType, StringType, ArrayType


from datetime          import datetime

In [94]:
# Build the Spark session (or reuse one that already exists) under the
# application name 'json 01'; the cells below go through this `spark` handle.
spark = (
    SparkSession.builder
    .appName( 'json 01' )
    .getOrCreate()
)

In [95]:
# Read the JSON file through the generic reader: format( 'json' ) + load().
# multiLine=True allows a single JSON record to span several lines of the file.
df_01 = (
    spark.read
    .format( 'json' )
    .load( '/home/art/data/tiny/j01.json', multiLine=True )
)

In [96]:
# Print the loaded rows as a text table (long cell values are cut off with "...").
df_01.show()

+------+--------------------+--------------------+---------+-------+
|animal|       date_of_birth|                food| fruit_id|   name|
+------+--------------------+--------------------+---------+-------+
|   cat|20221-05-20 08:00:00|[salmon, meat, cr...|[1, 2, 3]|Patitas|
+------+--------------------+--------------------+---------+-------+



In [97]:
# Read the file with the reader's json() shorthand instead of
# format( 'json' ).load(); df_01 is reassigned with the result.
df_01 = (
    spark.read
    .json( '/home/art/data/tiny/j01.json', multiLine=True )
)

In [98]:
# Print the re-loaded frame; the table matches the one produced by the
# format( 'json' ).load() read.
df_01.show()

+------+--------------------+--------------------+---------+-------+
|animal|       date_of_birth|                food| fruit_id|   name|
+------+--------------------+--------------------+---------+-------+
|   cat|20221-05-20 08:00:00|[salmon, meat, cr...|[1, 2, 3]|Patitas|
+------+--------------------+--------------------+---------+-------+



In [99]:
# A JSON document kept as plain text: it is stored in an ordinary string
# column ('value') of a one-row DataFrame, not parsed into fields.
jsonString = '{"Zipcode":704, "ZipCodeType":"STANDARD", "City":"PARC PARQUE", "State":"PR"}'

df = spark.createDataFrame(
    [ (1, jsonString) ],
    [ "id", "value" ],
)

# truncate=False prints the whole JSON text instead of shortening it.
df.show( truncate=False )

+---+-----------------------------------------------------------------------------+
|id |value                                                                        |
+---+-----------------------------------------------------------------------------+
|1  |{"Zipcode":704, "ZipCodeType":"STANDARD", "City":"PARC PARQUE", "State":"PR"}|
+---+-----------------------------------------------------------------------------+



## Reading an array of JSON objects

In [100]:
# Load events.json; multiLine=True is needed because the records are not
# laid out one per line. The next cell prints the resulting rows.
df_events = (
    spark.read
    .json( '/home/art/data/tiny/events.json', multiLine=True )
)


In [101]:
# Print the events; fruit_id and fruit_name are array columns.
df_events.show()

+-----+--------------------+------------+--------------------+----+
|event|     event_timestamp|    fruit_id|          fruit_name|user|
+-----+--------------------+------------+--------------------+----+
|login|20223-01-20 00:00:00|   [1, 2, 3]|[apple, banana, o...| Ana|
|click|20223-01-20 00:01:00|[4, 5, 6, 7]|[pineapple, melon...|Elsa|
| swap|20223-01-20 00:02:00|      [1, 2]|[strawberry, blue...|Olaf|
+-----+--------------------+------------+--------------------+----+



### register dataframe to use SQL queries

In [106]:
# Make df_events queryable from spark.sql() under the view name 'df_events'.
df_events.createOrReplaceTempView( 'df_events' )

In [107]:


# Keep every column and add first_id = element 0 of each row's fruit_id array.
# NOTE: this reassigns the notebook-level name `df`.
df = spark.sql( ''' select *, fruit_id[0] as first_id
     from df_events

'''

)

In [108]:
# Print the query result, including the new first_id column.
df.show()

+-----+--------------------+------------+--------------------+----+--------+
|event|     event_timestamp|    fruit_id|          fruit_name|user|first_id|
+-----+--------------------+------------+--------------------+----+--------+
|login|20223-01-20 00:00:00|   [1, 2, 3]|[apple, banana, o...| Ana|       1|
|click|20223-01-20 00:01:00|[4, 5, 6, 7]|[pineapple, melon...|Elsa|       4|
| swap|20223-01-20 00:02:00|      [1, 2]|[strawberry, blue...|Olaf|       1|
+-----+--------------------+------------+--------------------+----+--------+



### Create a simple UDF that returns a string

In [102]:
def my_stars( s ):
    """Return the text upper-cased and wrapped in star markers.

    Args:
        s: text to decorate, or None.

    Returns:
        '*** ' + s.upper() + ' ***' for a string input, e.g.
        'kitty' -> '*** KITTY ***'. Returns None when s is None: a null
        column value reaches a Python UDF as None, and calling .upper()
        on it would raise AttributeError.
    """
    if s is None:
        return None
    return '*** ' + s.upper() + ' ***'

# Quick check of the plain Python function on a sample string.
s1 = 'kitty'

s2 = my_stars( s1 )
print( s2 )



*** KITTY ***


In [105]:
# Register my_stars so SQL queries can call it as my_stars_udf( ... ).
# The return type is written out (StringType is also the default for a plain
# Python function) so this call reads like the other UDF registrations.
spark.udf.register( 'my_stars_udf', my_stars, StringType() )


<function __main__.my_stars(s)>

In [110]:
# Call the registered UDF from SQL: vip_user is my_stars_udf applied to user.
# NOTE: this reassigns the notebook-level name `df`.
df = spark.sql(
'''
select *, my_stars_udf( user ) as vip_user
from df_events
'''
)

df.show()


+-----+--------------------+------------+--------------------+----+------------+
|event|     event_timestamp|    fruit_id|          fruit_name|user|    vip_user|
+-----+--------------------+------------+--------------------+----+------------+
|login|20223-01-20 00:00:00|   [1, 2, 3]|[apple, banana, o...| Ana| *** ANA ***|
|click|20223-01-20 00:01:00|[4, 5, 6, 7]|[pineapple, melon...|Elsa|*** ELSA ***|
| swap|20223-01-20 00:02:00|      [1, 2]|[strawberry, blue...|Olaf|*** OLAF ***|
+-----+--------------------+------------+--------------------+----+------------+



### Create a UDF that returns an array of strings

In [111]:
def my_list( ):
    """Return a new list holding the three sample strings 'a', 'b' and 'c'."""
    letters = list( 'abc' )
    return letters


# Quick check of the plain Python function.
a = my_list()
print( a )


['a', 'b', 'c']


In [112]:
# Register my_list for SQL as 'my_list_udf'; the declared return type is an
# array of strings.
spark.udf.register( 'my_list_udf', my_list, ArrayType( StringType() ) )


<function __main__.my_list()>

In [115]:
# Call the zero-argument UDF once per row. No alias is given, so the new
# column is named after the expression: my_list_udf().
# NOTE: this reassigns the notebook-level name `df`.
df = spark.sql(
'''
select *, my_list_udf() 
from df_events
'''
)

df.show()

+-----+--------------------+------------+--------------------+----+-------------+
|event|     event_timestamp|    fruit_id|          fruit_name|user|my_list_udf()|
+-----+--------------------+------------+--------------------+----+-------------+
|login|20223-01-20 00:00:00|   [1, 2, 3]|[apple, banana, o...| Ana|    [a, b, c]|
|click|20223-01-20 00:01:00|[4, 5, 6, 7]|[pineapple, melon...|Elsa|    [a, b, c]|
| swap|20223-01-20 00:02:00|      [1, 2]|[strawberry, blue...|Olaf|    [a, b, c]|
+-----+--------------------+------------+--------------------+----+-------------+



### create a UDF to zip the fruits and ids in one list

This UDF returns a list of lists: each inner list pairs one fruit id with its fruit name.

In [147]:
def zip_fruits( a1, a2 ):
    """Pair up the elements of two sequences position by position.

    Args:
        a1: first sequence (the fruit ids in this notebook), or None.
        a2: second sequence (the fruit names in this notebook), or None.

    Returns:
        A list of two-element lists [ a1[i], a2[i] ]; elements keep their
        original types, e.g. [ [1, 'apple'], [2, 'banana'] ].
        If the inputs differ in length, pairing stops at the shorter one
        instead of raising IndexError.
        Returns None when either input is None, so a null array passed to
        the UDF yields a null result rather than an error.
    """
    if a1 is None or a2 is None:
        return None
    # zip() walks both inputs together and stops at the shorter one.
    return [ [ left, right ] for left, right in zip( a1, a2 ) ]


# Quick check of the plain Python function: ids first, names second.
a1 = [1, 2, 3]
a2 = [ 'apple', 'banana', 'orange' ]
a = zip_fruits( a1, a2 )

# Show the container type, then one pair per line.
print( type( a ) )
for i in a:
    print( i )



<class 'list'>
[1, 'apple']
[2, 'banana']
[3, 'orange']


### register UDF to use it in SQL queries

In [148]:
# Register zip_fruits for SQL as 'zip_fruits_udf'; the declared return type
# is an array of arrays of strings.
spark.udf.register( 'zip_fruits_udf', zip_fruits, ArrayType( ArrayType( StringType() ) ) ) 

23/06/09 11:13:29 WARN SimpleFunctionRegistry: The function zip_fruits_udf replaced a previously registered function.


<function __main__.zip_fruits(a1, a2)>

In [149]:
# Combine each row's fruit_id and fruit_name arrays into one 'fruits' column
# of [id, name] pairs.
# NOTE: this reassigns the notebook-level name `df`; later cells read df.fruits.
df = spark.sql(
'''
select *, zip_fruits_udf( fruit_id, fruit_name ) as fruits
from df_events
'''
)

df.show()


+-----+--------------------+------------+--------------------+----+--------------------+
|event|     event_timestamp|    fruit_id|          fruit_name|user|              fruits|
+-----+--------------------+------------+--------------------+----+--------------------+
|login|20223-01-20 00:00:00|   [1, 2, 3]|[apple, banana, o...| Ana|[[1, apple], [2, ...|
|click|20223-01-20 00:01:00|[4, 5, 6, 7]|[pineapple, melon...|Elsa|[[4, pineapple], ...|
| swap|20223-01-20 00:02:00|      [1, 2]|[strawberry, blue...|Olaf|[[1, strawberry],...|
+-----+--------------------+------------+--------------------+----+--------------------+



### Explode (flatmap) the fruits column

In [150]:
# Keep only the event, event_timestamp, user and fruits columns.
df2 = df.select( 'event', 'event_timestamp', 'user', 'fruits') 
    
df2.show()

+-----+--------------------+----+--------------------+
|event|     event_timestamp|user|              fruits|
+-----+--------------------+----+--------------------+
|login|20223-01-20 00:00:00| Ana|[[1, apple], [2, ...|
|click|20223-01-20 00:01:00|Elsa|[[4, pineapple], ...|
| swap|20223-01-20 00:02:00|Olaf|[[1, strawberry],...|
+-----+--------------------+----+--------------------+



In [151]:
# explode() produces one output row per element of the fruits array.
# No alias is given, so the generated column carries the default name 'col'.
df2 = df.select(
    df[ 'user' ],
    explode( df[ 'fruits' ] ),
)

df2.show()




+----+---------------+
|user|            col|
+----+---------------+
| Ana|     [1, apple]|
| Ana|    [2, banana]|
| Ana|    [3, orange]|
|Elsa| [4, pineapple]|
|Elsa|     [5, melon]|
|Elsa|[6, watermelon]|
|Elsa|    [7, papaya]|
|Olaf|[1, strawberry]|
|Olaf| [2, blueberry]|
+----+---------------+



### explode in SQL

In [157]:
# Register df as the SQL view 'df'; the query in the next cell reads its
# fruits column.
df.createOrReplaceTempView( 'df' )

In [163]:
# SQL form of explode: one output row per element of fruits, with the
# exploded value aliased back to the name 'fruits'.
df3 = spark.sql(
'''
select event, user, explode( fruits ) as fruits
from df
'''
)
df3.show()

+-----+----+---------------+
|event|user|         fruits|
+-----+----+---------------+
|login| Ana|     [1, apple]|
|login| Ana|    [2, banana]|
|login| Ana|    [3, orange]|
|click|Elsa| [4, pineapple]|
|click|Elsa|     [5, melon]|
|click|Elsa|[6, watermelon]|
|click|Elsa|    [7, papaya]|
| swap|Olaf|[1, strawberry]|
| swap|Olaf| [2, blueberry]|
+-----+----+---------------+

