In [1]:
import spark.implicits._
case class Flight(DEST_COUNTRY_NAME: String,
                  ORIGIN_COUNTRY_NAME: String,
                  count: BigInt)
val flightsDF = spark.read
  .parquet("../data/flight-data/parquet/2010-summary.parquet/")
val flights = flightsDF.as[Flight]


Intitializing Scala interpreter ...

Spark Web UI available at http://4b3176cd1786:4042
SparkContext available as 'sc' (version = 2.4.7, master = local[*], app id = local-1613919624390)
SparkSession available as 'spark'


import spark.implicits._
defined class Flight
flightsDF: org.apache.spark.sql.DataFrame = [DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]
flights: org.apache.spark.sql.Dataset[Flight] = [DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]


In [2]:
flights
  .filter(flight_row => flight_row.ORIGIN_COUNTRY_NAME != "Canada")
  .map(flight_row => flight_row)
  .take(5)

org.apache.spark.SparkException:  Job aborted due to stage failure: Task 0 in stage 1.0 failed 1 times, most recent failure: Lost task 0.0 in stage 1.0 (TID 1, localhost, executor driver): java.lang.ClassCastException: $iw cannot be cast to $iw

In [6]:
val staticDataFrame = spark.read.format("csv")
  .option("header", "true")
  .option("inferSchema", "true")
  .load("../data/retail-data/by-day/*.csv")

staticDataFrame: org.apache.spark.sql.DataFrame = [InvoiceNo: string, StockCode: string ... 6 more fields]


In [7]:
staticDataFrame.createOrReplaceTempView("retail_data")

In [8]:
val staticSchema = staticDataFrame.schema

staticSchema: org.apache.spark.sql.types.StructType = StructType(StructField(InvoiceNo,StringType,true), StructField(StockCode,StringType,true), StructField(Description,StringType,true), StructField(Quantity,IntegerType,true), StructField(InvoiceDate,StringType,true), StructField(UnitPrice,DoubleType,true), StructField(CustomerID,DoubleType,true), StructField(Country,StringType,true))


In [9]:
import org.apache.spark.sql.functions.{window, column, desc, col}
staticDataFrame
  .selectExpr(
    "CustomerId",
    "(UnitPrice * Quantity) as total_cost",
    "InvoiceDate")
  .groupBy(
    col("CustomerId"), window(col("InvoiceDate"), "1 day"))
  .sum("total_cost")
  .show(5)

+----------+--------------------+-----------------+
|CustomerId|              window|  sum(total_cost)|
+----------+--------------------+-----------------+
|   16057.0|[2011-12-05 00:00...|            -37.6|
|   14126.0|[2011-11-29 00:00...|643.6300000000001|
|   13500.0|[2011-11-16 00:00...|497.9700000000001|
|   17160.0|[2011-11-08 00:00...|516.8499999999999|
|   15608.0|[2011-11-11 00:00...|            122.4|
+----------+--------------------+-----------------+
only showing top 5 rows



import org.apache.spark.sql.functions.{window, column, desc, col}


In [10]:
spark.conf.set("spark.sql.shuffle.partitions", "5")

In [11]:
val streamingDataFrame = spark.readStream
    .schema(staticSchema)
    .option("maxFilesPerTrigger", 1)
    .format("csv")
    .option("header", "true")
    .load("../data/retail-data/by-day/*.csv")

streamingDataFrame: org.apache.spark.sql.DataFrame = [InvoiceNo: string, StockCode: string ... 6 more fields]


In [12]:
streamingDataFrame.isStreaming // returns true

res5: Boolean = true


In [13]:
val purchaseByCustomerPerHour = streamingDataFrame
  .selectExpr(
    "CustomerId",
    "(UnitPrice * Quantity) as total_cost",
    "InvoiceDate")
  .groupBy(
    col("CustomerId"), window(col("InvoiceDate"), "1 day"))
  .sum("total_cost")

purchaseByCustomerPerHour: org.apache.spark.sql.DataFrame = [CustomerId: double, window: struct<start: timestamp, end: timestamp> ... 1 more field]


In [14]:
purchaseByCustomerPerHour.writeStream
    .format("memory") // memory = store in-memory table
    .queryName("customer_purchases") // the name of the in-memory table
    .outputMode("complete") // complete = all the counts should be in the table
    .start()

res6: org.apache.spark.sql.streaming.StreamingQuery = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@39dc11d4


In [15]:
spark.sql("""
  SELECT *
  FROM customer_purchases
  ORDER BY `sum(total_cost)` DESC
  """)
  .show(5)

+----------+--------------------+------------------+
|CustomerId|              window|   sum(total_cost)|
+----------+--------------------+------------------+
|      null|[2010-12-01 00:00...|12584.299999999988|
|   13777.0|[2010-12-01 00:00...|           6585.16|
|   16029.0|[2010-12-01 00:00...|           3702.12|
|   16210.0|[2010-12-01 00:00...|2474.7399999999993|
|   12433.0|[2010-12-01 00:00...|1919.1400000000008|
+----------+--------------------+------------------+
only showing top 5 rows



In [16]:
staticDataFrame.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



In [17]:
import org.apache.spark.sql.functions.date_format
val preppedDataFrame = staticDataFrame
  .na.fill(0)
  .withColumn("day_of_week", date_format($"InvoiceDate", "EEEE"))
  .coalesce(5)

import org.apache.spark.sql.functions.date_format
preppedDataFrame: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [InvoiceNo: string, StockCode: string ... 7 more fields]


In [18]:
val trainDataFrame = preppedDataFrame
  .where("InvoiceDate < '2011-07-01'")
val testDataFrame = preppedDataFrame
  .where("InvoiceDate >= '2011-07-01'")

trainDataFrame: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [InvoiceNo: string, StockCode: string ... 7 more fields]
testDataFrame: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [InvoiceNo: string, StockCode: string ... 7 more fields]


In [20]:
println(trainDataFrame.count())
println(testDataFrame.count())

245903
296006


In [21]:
import org.apache.spark.ml.feature.StringIndexer
val indexer = new StringIndexer()
  .setInputCol("day_of_week")
  .setOutputCol("day_of_week_index")

import org.apache.spark.ml.feature.StringIndexer
indexer: org.apache.spark.ml.feature.StringIndexer = strIdx_1fc60da9ccf7


In [22]:
import org.apache.spark.ml.feature.OneHotEncoder
val encoder = new OneHotEncoder()
  .setInputCol("day_of_week_index")
  .setOutputCol("day_of_week_encoded")

import org.apache.spark.ml.feature.OneHotEncoder
encoder: org.apache.spark.ml.feature.OneHotEncoder = oneHotEncoder_422b856040bd


In [23]:
import org.apache.spark.ml.feature.VectorAssembler

val vectorAssembler = new VectorAssembler()
  .setInputCols(Array("UnitPrice", "Quantity", "day_of_week_encoded"))
  .setOutputCol("features")

import org.apache.spark.ml.feature.VectorAssembler
vectorAssembler: org.apache.spark.ml.feature.VectorAssembler = VectorAssembler: uid=vecAssembler_1aa1138d4195, handleInvalid=error, numInputCols=3


In [24]:
import org.apache.spark.ml.Pipeline

val transformationPipeline = new Pipeline()
  .setStages(Array(indexer, encoder, vectorAssembler))

import org.apache.spark.ml.Pipeline
transformationPipeline: org.apache.spark.ml.Pipeline = pipeline_b0881df25a03


In [25]:
val fittedPipeline = transformationPipeline.fit(trainDataFrame)

fittedPipeline: org.apache.spark.ml.PipelineModel = pipeline_b0881df25a03


In [26]:
val transformedTraining = fittedPipeline.transform(trainDataFrame)

transformedTraining: org.apache.spark.sql.DataFrame = [InvoiceNo: string, StockCode: string ... 10 more fields]


In [27]:
transformedTraining.cache()

res11: transformedTraining.type = [InvoiceNo: string, StockCode: string ... 10 more fields]


In [28]:
import org.apache.spark.ml.clustering.KMeans
val kmeans = new KMeans()
  .setK(20)
  .setSeed(1L)

import org.apache.spark.ml.clustering.KMeans
kmeans: org.apache.spark.ml.clustering.KMeans = kmeans_f981ae110c05


In [29]:
val kmModel = kmeans.fit(transformedTraining)

kmModel: org.apache.spark.ml.clustering.KMeansModel = KMeansModel: uid=kmeans_f981ae110c05, k=20, distanceMeasure=euclidean, numFeatures=7


In [30]:
kmModel.computeCost(transformedTraining)

<console>: 40: error: value computeCost is not a member of org.apache.spark.ml.clustering.KMeansModel

In [31]:
val transformedTest = fittedPipeline.transform(testDataFrame)

transformedTest: org.apache.spark.sql.DataFrame = [InvoiceNo: string, StockCode: string ... 10 more fields]


In [32]:
kmModel.computeCost(transformedTest)

<console>: 40: error: value computeCost is not a member of org.apache.spark.ml.clustering.KMeansModel

In [33]:
spark.sparkContext.parallelize(Seq(1, 2, 3)).toDF()

res14: org.apache.spark.sql.DataFrame = [value: int]
