In [1]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used by every cell in this notebook.
spark = SparkSession.builder.appName("s8a-dataframes-arrays").getOrCreate()
# Some columns share the same name and differ only in letter case
# (the original note cites "good for kids"), so names must be resolved
# case-sensitively.
spark.conf.set("spark.sql.caseSensitive", "true")

# The JSON reader always infers the schema when none is supplied;
# "inferSchema" is a CSV-reader option with no effect on JSON, so it is omitted.
df = spark.read.json("yelp_academic_dataset_business.json")

# Print the inferred (nested) schema.
df.printSchema()

root
 |-- attributes: struct (nullable = true)
 |    |-- Accepts Credit Cards: string (nullable = true)
 |    |-- Accepts Insurance: boolean (nullable = true)
 |    |-- Ages Allowed: string (nullable = true)
 |    |-- Alcohol: string (nullable = true)
 |    |-- Ambience: struct (nullable = true)
 |    |    |-- casual: boolean (nullable = true)
 |    |    |-- classy: boolean (nullable = true)
 |    |    |-- divey: boolean (nullable = true)
 |    |    |-- hipster: boolean (nullable = true)
 |    |    |-- intimate: boolean (nullable = true)
 |    |    |-- none: boolean (nullable = true)
 |    |    |-- romantic: boolean (nullable = true)
 |    |    |-- touristy: boolean (nullable = true)
 |    |    |-- trendy: boolean (nullable = true)
 |    |    |-- upscale: boolean (nullable = true)
 |    |-- Attire: string (nullable = true)
 |    |-- BYOB: boolean (nullable = true)
 |    |-- BYOB/Corkage: string (nullable = true)
 |    |-- By Appointment Only: boolean (nullable = true)
 |    |-- Caters:

In [2]:
# Number of rows in the business DataFrame loaded above.
df.count()

15585

In [3]:
# Import only the functions this notebook uses: a wildcard import of
# pyspark.sql.functions shadows Python built-ins such as sum, min, max and round.
from pyspark.sql.functions import (
    array_contains,
    col,
    explode,
    from_json,
    size,
    sort_array,
    to_json,
)

# For each business show: the nested hours.Sunday field, the number of
# categories, the categories sorted in ascending order, and whether
# "Restaurants" is one of them.
df.select(
    "name",
    "hours.Sunday",
    size("categories").alias("totalCategorias"),
    sort_array("categories").alias("categorias"),
    array_contains("categories", "Restaurants").alias("Restaurantes"),
).show(10, truncate=False)

+-------------------------------+--------------+---------------+---------------------------------------------------------------------------------+------------+
|name                           |Sunday        |totalCategorias|categorias                                                                       |Restaurantes|
+-------------------------------+--------------+---------------+---------------------------------------------------------------------------------+------------+
|Turf Paradise Race Course      |{18:00, 11:00}|4              |[Active Life, Arts & Entertainment, Horse Racing, Stadiums & Arenas]             |false       |
|Sam's Club Members Only        |null          |5              |[Automotive, Department Stores, Fashion, Shopping, Tires]                        |false       |
|Forever 21                     |{18:00, 11:00}|5              |[Accessories, Fashion, Men's Clothing, Shopping, Women's Clothing]               |false       |
|Loving Hands Pet Care          |{19:00,

In [4]:
# explode turns the categories array into one row per category;
# the generated column is left unnamed, so Spark labels it "col".
(
    df
    .select(col("name"), explode(col("categories")))
    .show(n=10, truncate=False)
)

+-------------------------+--------------------+
|name                     |col                 |
+-------------------------+--------------------+
|Turf Paradise Race Course|Active Life         |
|Turf Paradise Race Course|Arts & Entertainment|
|Turf Paradise Race Course|Stadiums & Arenas   |
|Turf Paradise Race Course|Horse Racing        |
|Sam's Club Members Only  |Tires               |
|Sam's Club Members Only  |Automotive          |
|Sam's Club Members Only  |Fashion             |
|Sam's Club Members Only  |Shopping            |
|Sam's Club Members Only  |Department Stores   |
|Forever 21               |Women's Clothing    |
+-------------------------+--------------------+
only showing top 10 rows



In [5]:
# Working with JSON
# Sample input: a one-element list holding a JSON document as plain text.
tareas = [
    '{"dia": "Lunes", '
    '"tareas": ["Corregir ejercicios", "Ir a nadar", "Comprar pan"]}'
]
tareas

['{"dia": "Lunes", "tareas": ["Corregir ejercicios", "Ir a nadar", "Comprar pan"]}']

In [6]:
# Distribute the JSON strings as an RDD, then wrap that RDD in a DataFrame.
tareasRDD = spark.sparkContext.parallelize(tareas)
# "string" is the schema (a type name), not a column name: the resulting
# DataFrame has a single string column that Spark names "value".
tareasStrDF = spark.createDataFrame(tareasRDD, schema="string")
tareasStrDF.printSchema()

root
 |-- value: string (nullable = true)



In [7]:
tareasStrDF.show()

+--------------------+
|               value|
+--------------------+
|{"dia": "Lunes", ...|
+--------------------+



In [8]:
from pyspark.sql.types import StructType, StructField, StringType, ArrayType

# from_json needs to be told the shape of the JSON text: a string field "dia"
# and an array-of-strings field "tareas".
# NOTE: the nullable/containsNull flags are declared False here, yet the
# printSchema output of the parsed DataFrame reports every field as nullable.
esquemaTareas = (
    StructType()
    .add("dia", StringType(), nullable=False)
    .add("tareas", ArrayType(StringType(), containsNull=False), nullable=False)
)

In [9]:
# Parse the JSON text held in the "value" column, guided by esquemaTareas.
todosDF = tareasStrDF.select(
    from_json(tareasStrDF["value"], esquemaTareas).alias("datos")
)
# "datos" is a struct column with two fields: dia and tareas.
todosDF.printSchema()

root
 |-- datos: struct (nullable = true)
 |    |-- dia: string (nullable = true)
 |    |-- tareas: array (nullable = true)
 |    |    |-- element: string (containsNull = true)



In [10]:
    # Display the parsed struct column without truncating its contents.
    # NOTE(review): this cell is indented by four spaces; it presumably runs only
    # because the notebook front end dedents cell input — consider removing the indent.
    todosDF.show(truncate=False)

+-------------------------------------------------------+
|datos                                                  |
+-------------------------------------------------------+
|{Lunes, [Corregir ejercicios, Ir a nadar, Comprar pan]}|
+-------------------------------------------------------+



In [11]:
# Three ways of reading values out of the "datos" struct column:
#   - Column.getItem with a field name,
#   - a dotted path string,
#   - getItem followed by [0] to take the first element of the array.
todosDF.select(
    col("datos").getItem("dia"),
    "datos.tareas",
    todosDF["datos"].getItem("tareas")[0].alias("tarea1"),
).show(truncate=False)

+---------+----------------------------------------------+-------------------+
|datos.dia|tareas                                        |tarea1             |
+---------+----------------------------------------------+-------------------+
|Lunes    |[Corregir ejercicios, Ir a nadar, Comprar pan]|Corregir ejercicios|
+---------+----------------------------------------------+-------------------+



In [14]:
# Serialise the "datos" struct back into a JSON string.
(
    todosDF
    .select(to_json(col("datos")))
    .show(truncate=False)
)

+---------------------------------------------------------------------------+
|to_json(datos)                                                             |
+---------------------------------------------------------------------------+
|{"dia":"Lunes","tareas":["Corregir ejercicios","Ir a nadar","Comprar pan"]}|
+---------------------------------------------------------------------------+

