# Structured Streaming

In [1]:
from spark_init import start_spark
spark = start_spark()
spark.range(10).show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
+---+



In [2]:
staticDataFrame = spark.read\
    .option("header", "true")\
    .option("inferSchema", "true")\
    .csv("../data/retail-data/by-day/*.csv")

staticSchema = staticDataFrame.schema


In [3]:
streamingDataFrame = spark.readStream\
    .schema(staticSchema)\
    .option("maxFilesPerTrigger", 1)\
    .format("csv")\
    .option("header", "true")\
    .load("../data/retail-data/by-day/*.csv")


In [4]:
from pyspark.sql.functions import window

purchaseByCustomerPerHour = streamingDataFrame\
    .selectExpr("CustomerID",
                ("UnitPrice * Quantity AS total_cost"),
                "InvoiceDate")\
    .groupBy(
        "CustomerID",
        window("InvoiceDate", "1 hour"))\
    .sum("total_cost")\

purchaseByCustomerPerHour.writeStream\
    .format("memory")\
    .queryName("customer_purchases")\
    .outputMode("complete")\
    .start()\
    

<pyspark.sql.streaming.query.StreamingQuery at 0x1b46ce88e90>

In [5]:
spark.sql("""
    SELECT CAST(CustomerID AS INT) AS CustomerID,
           window,
           `sum(total_cost)`
    FROM customer_purchases
    WHERE CustomerID IS NOT NULL
    ORDER BY `sum(total_cost)` DESC
    """)\
.show(5)


+----------+------+---------------+
|CustomerID|window|sum(total_cost)|
+----------+------+---------------+
+----------+------+---------------+



In [6]:
purchaseByCustomerPerHour.writeStream\
    .format("console")\
    .queryName("customer_purchases_4")\
    .outputMode("complete")\
    .start()

<pyspark.sql.streaming.query.StreamingQuery at 0x1b46ce706d0>

# Machine Learning

In [7]:
staticDataFrame.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



In [8]:
from pyspark.sql.functions import date_format, col

preppedDataFrame = staticDataFrame\
    .na.fill(0)\
    .withColumn("day_of_week", date_format(col("InvoiceDate"), "EEEE"))\
    .coalesce(5)

In [9]:
trainDataFrame = preppedDataFrame\
.where("InvoiceDate < '2011-07-01'")
testDataFrame = preppedDataFrame\
.where("InvoiceDate >= '2011-07-01'")

In [10]:
trainDataFrame.count()



245903

In [11]:
testDataFrame.count()

296006

In [12]:
from pyspark.ml.feature import StringIndexer
indexer = StringIndexer()\
.setInputCol("day_of_week")\
.setOutputCol("day_of_week_index")

In [13]:
from pyspark.ml.feature import OneHotEncoder
encoder = OneHotEncoder()\
.setInputCol("day_of_week_index")\
.setOutputCol("day_of_week_encoded")

In [14]:
from pyspark.ml.feature import VectorAssembler

vectorAssembler = VectorAssembler()\
.setInputCols(["UnitPrice", "Quantity", "day_of_week_encoded"])\
.setOutputCol("features")

In [15]:
from pyspark.ml import Pipeline
transformationPipeline = Pipeline()\
.setStages([indexer, encoder, vectorAssembler])

In [16]:
fittedPipeline = transformationPipeline.fit(trainDataFrame)

In [17]:
transformedTraining = fittedPipeline.transform(trainDataFrame)

In [18]:
transformedTraining.cache()

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: timestamp, UnitPrice: double, CustomerID: double, Country: string, day_of_week: string, day_of_week_index: double, day_of_week_encoded: vector, features: vector]

In [19]:
from pyspark.ml.clustering import KMeans
kmeans = KMeans()\
.setK(20)\
.setSeed(1)

In [20]:
kmModel = kmeans.fit(transformedTraining)

In [None]:
# transformedTest = fittedPipeline.transform(testDataFrame)
# kmModel.computeCost(transformedTest)