Authors ages

In [1]:
// Create a DataFrame using a SparkSession

import org.apache.spark.sql.functions.avg
import org.apache.spark.sql.SparkSession

val spark = SparkSession
    .builder
    .appName("AuthorsAges")
    .getOrCreate()

val dataDF = spark.createDataFrame(Seq(("Brooke", 20), ("Brooke", 25),
 ("Denny", 31), ("Jules", 30), ("TD", 35))).toDF("name", "age")

val avgDF = dataDF.groupBy("name").agg(avg("age"))

avgDF.show()

Intitializing Scala interpreter ...

Spark Web UI available at http://L2203100.bosonit.local:4043
SparkContext available as 'sc' (version = 3.0.3, master = local[*], app id = local-1651070667389)
SparkSession available as 'spark'


+------+--------+
|  name|avg(age)|
+------+--------+
|Brooke|    22.5|
| Jules|    30.0|
|    TD|    35.0|
| Denny|    31.0|
+------+--------+



import org.apache.spark.sql.functions.avg
import org.apache.spark.sql.SparkSession
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@55d059b1
dataDF: org.apache.spark.sql.DataFrame = [name: string, age: int]
avgDF: org.apache.spark.sql.DataFrame = [name: string, avg(age): double]


In [7]:
// Defino el schema

import org.apache.spark.sql.types._
val schema = StructType(Array(StructField("author", StringType, false),
 StructField("title", StringType, false),
 StructField("pages", IntegerType, false)))

                         
// Usando DDL (Data Definition Language)
// val schema = "author STRING, title STRING, pages INT"

import org.apache.spark.sql.types._
schema: org.apache.spark.sql.types.StructType = StructType(StructField(author,StringType,false), StructField(title,StringType,false), StructField(pages,IntegerType,false))


Blog

In [9]:
// Read data from a JSON file

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types._

val spark = SparkSession
 .builder
 .appName("Example-3_7")
 .getOrCreate()
    
val jsonFile = ("C:/Users/alice.marchi/Downloads/LearningSparkV2-master/chapter3/scala/data/blogs.json")

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types._
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@55d059b1
jsonFile: String = C:/Users/alice.marchi/Downloads/LearningSparkV2-master/chapter3/scala/data/blogs.json


In [10]:
// Defino el schema

val schema = StructType(Array(StructField("Id", IntegerType, false),
 StructField("First", StringType, false),
 StructField("Last", StringType, false),
 StructField("Url", StringType, false),
 StructField("Published", StringType, false),
 StructField("Hits", IntegerType, false),
 StructField("Campaigns", ArrayType(StringType), false)))

// Creo un DataFrame leyendo el file json y usando un schema predefinido

val blogsDF = spark.read.schema(schema).json(jsonFile)

blogsDF.show(false)
println(blogsDF.printSchema)
println(blogsDF.schema)

+---+---------+-------+-----------------+---------+-----+----------------------------+
|Id |First    |Last   |Url              |Published|Hits |Campaigns                   |
+---+---------+-------+-----------------+---------+-----+----------------------------+
|1  |Jules    |Damji  |https://tinyurl.1|1/4/2016 |4535 |[twitter, LinkedIn]         |
|2  |Brooke   |Wenig  |https://tinyurl.2|5/5/2018 |8908 |[twitter, LinkedIn]         |
|3  |Denny    |Lee    |https://tinyurl.3|6/7/2019 |7659 |[web, twitter, FB, LinkedIn]|
|4  |Tathagata|Das    |https://tinyurl.4|5/12/2018|10568|[twitter, FB]               |
|5  |Matei    |Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB, LinkedIn]|
|6  |Reynold  |Xin    |https://tinyurl.6|3/2/2015 |25568|[twitter, LinkedIn]         |
+---+---------+-------+-----------------+---------+-----+----------------------------+

root
 |-- Id: integer (nullable = true)
 |-- First: string (nullable = true)
 |-- Last: string (nullable = true)
 |-- Url: string

schema: org.apache.spark.sql.types.StructType = StructType(StructField(Id,IntegerType,false), StructField(First,StringType,false), StructField(Last,StringType,false), StructField(Url,StringType,false), StructField(Published,StringType,false), StructField(Hits,IntegerType,false), StructField(Campaigns,ArrayType(StringType,true),false))
blogsDF: org.apache.spark.sql.DataFrame = [Id: int, First: string ... 5 more fields]


Leer el CSV del ejemplo del cap2 y obtener la estructura del schema dado por defecto.

In [11]:
// Subo el fichero MnMs

val csv = ("C:/Users/alice.marchi/Downloads/LearningSparkV2-master/chapter2/scala/data/mnm_dataset.csv")

csv: String = C:/Users/alice.marchi/Downloads/LearningSparkV2-master/chapter2/scala/data/mnm_dataset.csv


In [15]:
// Obtengo la estructura del schema dado por defecto

val csvDF = spark.read.format("csv")
 .option("header", "true")
 .option("inferSchema", "true")
 .load(csv)

println(csvDF.schema)
println(csvDF.printSchema)

StructType(StructField(State,StringType,true), StructField(Color,StringType,true), StructField(Count,IntegerType,true))
root
 |-- State: string (nullable = true)
 |-- Color: string (nullable = true)
 |-- Count: integer (nullable = true)

()


csvDF: org.apache.spark.sql.DataFrame = [State: string, Color: string ... 1 more field]


El último parámetro booleano es "nullable", si el campo puede ser nulo o no. 

In [16]:
// Ejemplo blog

import org.apache.spark.sql.functions._
blogsDF.columns

import org.apache.spark.sql.functions._
res6: Array[String] = Array(Id, First, Last, Url, Published, Hits, Campaigns)


In [17]:
// Accedo a una columna con col
blogsDF.col("Id")

res7: org.apache.spark.sql.Column = Id


In [19]:
// Uso col para calcular un valor

blogsDF.select(col("Hits") * 2).show(2)

+----------+
|(Hits * 2)|
+----------+
|      9070|
|     17816|
+----------+
only showing top 2 rows



In [20]:
// Añado una nueva columna llamada Big Hitters, se basa en una expresión condicional

blogsDF.withColumn("Big Hitters", (expr("Hits > 10000"))).show()

+---+---------+-------+-----------------+---------+-----+--------------------+-----------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|Big Hitters|
+---+---------+-------+-----------------+---------+-----+--------------------+-----------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|      false|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|      false|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|      false|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|       true|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|       true|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|       true|
+---+---------+-------+-----------------+---------+-----+--------------------+-----------+



In [21]:
// Puedo concatenar 3 columnas para crear una columna

blogsDF
 .withColumn("AuthorsId", (concat(expr("First"), expr("Last"), expr("Id"))))
 .select(col("AuthorsId"))
 .show(4)

+-------------+
|    AuthorsId|
+-------------+
|  JulesDamji1|
| BrookeWenig2|
|    DennyLee3|
|TathagataDas4|
+-------------+
only showing top 4 rows



In [22]:
//Los 3 me devuelven el mismo resultado, expr es lo mismo que col 

blogsDF.select(expr("Hits")).show(2)
blogsDF.select(col("Hits")).show(2)
blogsDF.select("Hits").show(2)

+----+
|Hits|
+----+
|4535|
|8908|
+----+
only showing top 2 rows

+----+
|Hits|
+----+
|4535|
|8908|
+----+
only showing top 2 rows

+----+
|Hits|
+----+
|4535|
|8908|
+----+
only showing top 2 rows



In [23]:
// col("Id") devuleve la columna y es lo mismo que usar $ antes del nombre de la columna, que es una 
// función de Spark que devuelve Id en una columna

blogsDF.sort(col("Id").desc).show()
blogsDF.sort($"Id".desc).show()

+---+---------+-------+-----------------+---------+-----+--------------------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|
+---+---------+-------+-----------------+---------+-----+--------------------+
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|
+---+---------+-------+-----------------+---------+-----+--------------------+

+---+---------+-------+-----------------+---------+-----+--------------------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|
+---+---------+-------+-----------------+---------+

In [24]:
// A row in Spark is a generic Row object, containing one or more columns. 

import org.apache.spark.sql.Row

val blogRow = Row(6, "Reynold", "Xin", "https://tinyurl.6", 255568, "3/2/2015", Array("twitter", "LinkedIn"))

blogRow(1)

import org.apache.spark.sql.Row
blogRow: org.apache.spark.sql.Row = [6,Reynold,Xin,https://tinyurl.6,255568,3/2/2015,[Ljava.lang.String;@458fd97a]
res14: Any = Reynold


In [25]:
// Creo un DataFrame usando un objeto Row

val rows = Seq(("Matei Zaharia", "CA"), ("Reynold Xin", "CA"))
val authorsDF = rows.toDF("Author", "State")
authorsDF.show()

+-------------+-----+
|       Author|State|
+-------------+-----+
|Matei Zaharia|   CA|
|  Reynold Xin|   CA|
+-------------+-----+



rows: Seq[(String, String)] = List((Matei Zaharia,CA), (Reynold Xin,CA))
authorsDF: org.apache.spark.sql.DataFrame = [Author: string, State: string]


Fire calls example

In [5]:
// Para importar un fichero csv con DataFrameReader, primero defino el schema
import org.apache.spark.sql.types.{StructType, StructField, StringType, IntegerType, FloatType, BooleanType};

val fireSchema = StructType(Array(StructField("CallNumber", IntegerType, true),
     StructField("UnitID", StringType, true),
     StructField("IncidentNumber", IntegerType, true),
     StructField("CallType", StringType, true),
     StructField("CallDate", StringType, true), 
     StructField("WatchDate", StringType, true),
     StructField("CallFinalDisposition", StringType, true),
     StructField("AvailableDtTm", StringType, true),
     StructField("Address", StringType, true), 
     StructField("City", StringType, true), 
     StructField("Zipcode", IntegerType, true), 
     StructField("Battalion", StringType, true), 
     StructField("StationArea", StringType, true), 
     StructField("Box", StringType, true), 
     StructField("OriginalPriority", StringType, true), 
     StructField("Priority", StringType, true), 
     StructField("FinalPriority", IntegerType, true), 
     StructField("ALSUnit", BooleanType, true), 
     StructField("CallTypeGroup", StringType, true),
     StructField("NumAlarms", IntegerType, true),
     StructField("UnitType", StringType, true),
     StructField("UnitSequenceInCallDispatch", IntegerType, true),
     StructField("FirePreventionDistrict", StringType, true),
     StructField("SupervisorDistrict", StringType, true),
     StructField("Neighborhood", StringType, true),                             
     StructField("Location", StringType, true),
     StructField("RowID", StringType, true),
     StructField("Delay", FloatType, true)))

import org.apache.spark.sql.types.{StructType, StructField, StringType, IntegerType, FloatType, BooleanType}
fireSchema: org.apache.spark.sql.types.StructType = StructType(StructField(CallNumber,IntegerType,true), StructField(UnitID,StringType,true), StructField(IncidentNumber,IntegerType,true), StructField(CallType,StringType,true), StructField(CallDate,StringType,true), StructField(WatchDate,StringType,true), StructField(CallFinalDisposition,StringType,true), StructField(AvailableDtTm,StringType,true), StructField(Address,StringType,true), StructField(City,StringType,true), StructField(Zipcode,IntegerType,true), StructField(Battalion,StringType,true), StructField(StationArea,StringType,true), StructField(Box,StringType,true), StructField(OriginalPriority,StringType,true), StructField...


In [7]:
val sfFireFile="C:/Users/alice.marchi/Downloads/LearningSparkV2-master/chapter3/data/sf-fire-calls.csv"
val fireDF = spark.read.schema(fireSchema)
 .option("header", "true")
 .csv(sfFireFile)

sfFireFile: String = C:/Users/alice.marchi/Downloads/LearningSparkV2-master/chapter3/data/sf-fire-calls.csv
fireDF: org.apache.spark.sql.DataFrame = [CallNumber: int, UnitID: string ... 26 more fields]


Guardar los datos en: parquet, json, csv, avro.

In [10]:
val parquetPath = "/tmp/output/fire_parquet"
fireDF.write.format("parquet").save(parquetPath)

parquetPath: String = /tmp/output/fire_parquet1


In [8]:
val parquet_table = "tbl_fire"
fireDF.write.format("parquet").saveAsTable(parquet_table)

parquet_table: String = tbl_fire


In [25]:
fireDF.write.format("json").save("/tmp/output/fire_json")

In [26]:
fireDF.write.format("csv").save("/tmp/output/fire_csv/test-fire.csv")

In [27]:
fireDF.write.format("avro").save("/tmp/output/fire_avro")

org.apache.spark.sql.AnalysisException:  Failed to find data source: avro. Avro is built-in but external data source module since Spark 2.4. Please deploy the application as per the deployment section of "Apache Avro Data Source Guide".;

In [28]:
// ¿Cómo obtener el número de particiones de un DataFrame?
fireDF.rdd.getNumPartitions

res16: Int = 8


In [30]:
// modificar el número de particiones de un DataFrame
val new_fireDF = fireDF.repartition(1)

new_fireDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [CallNumber: int, UnitID: string ... 26 more fields]


In [31]:
new_fireDF.rdd.getNumPartitions

res17: Int = 1


In [32]:
//  Llevar a cabo el ejemplo modificando el número de particiones a 1 y revisar de nuevo el/los ficheros guardados. 

new_fireDF.write.format("json").save("/tmp/output/new_fire_json")

In [5]:
val fewFireDF = fireDF
 .select("IncidentNumber", "AvailableDtTm", "CallType")
 .where($"CallType" =!= "Medical Incident") 
fewFireDF.show(5, false)

+--------------+----------------------+--------------+
|IncidentNumber|AvailableDtTm         |CallType      |
+--------------+----------------------+--------------+
|2003235       |01/11/2002 01:51:44 AM|Structure Fire|
|2003250       |01/11/2002 04:16:46 AM|Vehicle Fire  |
|2003259       |01/11/2002 06:01:58 AM|Alarms        |
|2003279       |01/11/2002 08:03:26 AM|Structure Fire|
|2003301       |01/11/2002 09:46:44 AM|Alarms        |
+--------------+----------------------+--------------+
only showing top 5 rows



fewFireDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [IncidentNumber: int, AvailableDtTm: string ... 1 more field]


In [6]:
fireDF
 .select("CallType")
 .where(col("CallType").isNotNull)
 .agg(countDistinct('CallType) as 'DistinctCallTypes)
 .show()

+-----------------+
|DistinctCallTypes|
+-----------------+
|               30|
+-----------------+



In [12]:
val newFireDF = fireDF.withColumnRenamed("Delay", "ResponseDelayedinMins")
newFireDF
 .select("ResponseDelayedinMins")
 .where($"ResponseDelayedinMins" > 5)
 .show(5, false)

+---------------------+
|ResponseDelayedinMins|
+---------------------+
|5.35                 |
|6.25                 |
|5.2                  |
|5.6                  |
|7.25                 |
+---------------------+
only showing top 5 rows



newFireDF: org.apache.spark.sql.DataFrame = [CallNumber: int, UnitID: string ... 26 more fields]


In [13]:
val fireTsDF = newFireDF
 .withColumn("IncidentDate", to_timestamp(col("CallDate"), "MM/dd/yyyy"))
 .drop("CallDate")
 .withColumn("OnWatchDate", to_timestamp(col("WatchDate"), "MM/dd/yyyy"))
 .drop("WatchDate")
 .withColumn("AvailableDtTS", to_timestamp(col("AvailableDtTm"),
 "MM/dd/yyyy hh:mm:ss a"))
 .drop("AvailableDtTm")

fireTsDF
 .select("IncidentDate", "OnWatchDate", "AvailableDtTS")
 .show(5, false)

+-------------------+-------------------+-------------------+
|IncidentDate       |OnWatchDate        |AvailableDtTS      |
+-------------------+-------------------+-------------------+
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 01:51:44|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 03:01:18|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 02:39:50|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 04:16:46|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 06:01:58|
+-------------------+-------------------+-------------------+
only showing top 5 rows



fireTsDF: org.apache.spark.sql.DataFrame = [CallNumber: int, UnitID: string ... 26 more fields]


In [14]:
fireTsDF
 .select(year($"IncidentDate"))
 .distinct()
 .orderBy(year($"IncidentDate"))
 .show()

+------------------+
|year(IncidentDate)|
+------------------+
|              2000|
|              2001|
|              2002|
|              2003|
|              2004|
|              2005|
|              2006|
|              2007|
|              2008|
|              2009|
|              2010|
|              2011|
|              2012|
|              2013|
|              2014|
|              2015|
|              2016|
|              2017|
|              2018|
+------------------+



In [15]:
fireTsDF
 .select("CallType")
 .where(col("CallType").isNotNull)
 .groupBy("CallType")
 .count()
 .orderBy(desc("count"))
 .show(10, false)

+-------------------------------+------+
|CallType                       |count |
+-------------------------------+------+
|Medical Incident               |113794|
|Structure Fire                 |23319 |
|Alarms                         |19406 |
|Traffic Collision              |7013  |
|Citizen Assist / Service Call  |2524  |
|Other                          |2166  |
|Outside Fire                   |2094  |
|Vehicle Fire                   |854   |
|Gas Leak (Natural and LP Gases)|764   |
|Water Rescue                   |755   |
+-------------------------------+------+
only showing top 10 rows



In [16]:
import org.apache.spark.sql.{functions => F}
fireTsDF
 .select(F.sum("NumAlarms"), F.avg("ResponseDelayedinMins"),
 F.min("ResponseDelayedinMins"), F.max("ResponseDelayedinMins"))
 .show()

+--------------+--------------------------+--------------------------+--------------------------+
|sum(NumAlarms)|avg(ResponseDelayedinMins)|min(ResponseDelayedinMins)|max(ResponseDelayedinMins)|
+--------------+--------------------------+--------------------------+--------------------------+
|        176170|         3.892364154521585|               0.016666668|                   1844.55|
+--------------+--------------------------+--------------------------+--------------------------+



import org.apache.spark.sql.{functions=>F}


DATASET

In [2]:
// Row is a generic object type in Spark, holding a collection of mixed types that can be accessed using an index. 

import org.apache.spark.sql.Row
val row = Row(350, true, "Learning Spark 2E", null)

row.getInt(0)

import org.apache.spark.sql.Row
row: org.apache.spark.sql.Row = [350,true,Learning Spark 2E,null]
res0: String = Learning Spark 2E


In [3]:
row.getBoolean(1)

res1: Boolean = true


In [4]:
row.getString(2)

res2: String = Learning Spark 2E


CREATING DATASET 

As with creating DataFrames from data sources, when creating a Dataset you have to know the schema. In other words, you need to know the data types. Although with JSON and CSV data it’s possible to infer the schema, for large data sets this is resource-intensive (expensive). When creating a Dataset in Scala, the easiest way to specify the schema for the resulting Dataset is to use a case class.

In [33]:
// Define a Scala case class 

case class DeviceIoTData (battery_level: Long, c02_level: Long,
cca2: String, cca3: String, cn: String, device_id: Long,
device_name: String, humidity: Long, ip: String, latitude: Double,
lcd: String, longitude: Double, scale:String, temp: Long,
timestamp: Long)


defined class DeviceIoTData


In [34]:
case class DeviceTempByCountry(temp: Long, device_name: String, device_id: Long, cca3: String)

defined class DeviceTempByCountry


In [35]:
// Once defined we can use it to read our file and convert the returned Dataset[Row] into Dataset[DeviceIoTData]

val ds = spark.read
.json("C:/Users/alice.marchi/Downloads/LearningSparkV2-master/databricks-datasets/learning-spark-v2/iot-devices/iot_devices.json")
.as[DeviceIoTData]

ds.show(5, false)

+-------------+---------+----+----+-------------+---------+---------------------+--------+-------------+--------+------+---------+-------+----+-------------+
|battery_level|c02_level|cca2|cca3|cn           |device_id|device_name          |humidity|ip           |latitude|lcd   |longitude|scale  |temp|timestamp    |
+-------------+---------+----+----+-------------+---------+---------------------+--------+-------------+--------+------+---------+-------+----+-------------+
|8            |868      |US  |USA |United States|1        |meter-gauge-1xbYRYcj |51      |68.161.225.1 |38.0    |green |-97.0    |Celsius|34  |1458444054093|
|7            |1473     |NO  |NOR |Norway       |2        |sensor-pad-2n2Pea    |70      |213.161.254.1|62.47   |red   |6.15     |Celsius|11  |1458444054119|
|2            |1556     |IT  |ITA |Italy        |3        |device-mac-36TWSKiT  |44      |88.36.5.1    |42.83   |red   |12.83    |Celsius|19  |1458444054120|
|6            |1080     |US  |USA |United States|4  

ds: org.apache.spark.sql.Dataset[DeviceIoTData] = [battery_level: bigint, c02_level: bigint ... 13 more fields]


In [36]:
ds.printSchema

root
 |-- battery_level: long (nullable = true)
 |-- c02_level: long (nullable = true)
 |-- cca2: string (nullable = true)
 |-- cca3: string (nullable = true)
 |-- cn: string (nullable = true)
 |-- device_id: long (nullable = true)
 |-- device_name: string (nullable = true)
 |-- humidity: long (nullable = true)
 |-- ip: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- lcd: string (nullable = true)
 |-- longitude: double (nullable = true)
 |-- scale: string (nullable = true)
 |-- temp: long (nullable = true)
 |-- timestamp: long (nullable = true)



In [37]:
val filterTempDS = ds.filter(d => {d.temp > 30 && d.humidity > 70})

filterTempDS: org.apache.spark.sql.Dataset[DeviceIoTData] = [battery_level: bigint, c02_level: bigint ... 13 more fields]
