In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import desc, max

In [3]:
# Obtain the SparkSession for this notebook (reusing an existing one if
# present), registered under the application name 'SDG_Chapter03'.
spark = (
    SparkSession.builder
    .appName('SDG_Chapter03')
    .getOrCreate()
)

In [4]:
# Batch-read every per-day retail CSV into one DataFrame. The first line of
# each file is treated as a header and column types are inferred from the data.
staticDataFrame = (
    spark.read
    .format('csv')
    .options(header='true', inferSchema='true')
    .load('/home/jagadeesh/git/Spark-The-Definitive-Guide/data/retail-data/by-day/*.csv')
)

In [5]:
# Keep the inferred schema so it can be supplied explicitly to other readers.
staticSchema = staticDataFrame.schema
# Expose the batch data to Spark SQL under the view name 'retail_data'.
staticDataFrame.createOrReplaceTempView('retail_data')

In [6]:
# Bare expression: the notebook echoes the DataFrame repr (column names and types).
staticDataFrame

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: timestamp, UnitPrice: double, CustomerID: double, Country: string]

In [7]:
# Bare expression: echo the StructType captured from the batch DataFrame.
staticSchema

StructType(List(StructField(InvoiceNo,StringType,true),StructField(StockCode,StringType,true),StructField(Description,StringType,true),StructField(Quantity,IntegerType,true),StructField(InvoiceDate,TimestampType,true),StructField(UnitPrice,DoubleType,true),StructField(CustomerID,DoubleType,true),StructField(Country,StringType,true)))

In [8]:
# Print the leading rows of the batch data as a text table for a quick sanity check.
staticDataFrame.show()

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   580538|    23084|  RABBIT NIGHT LIGHT|      48|2011-12-05 08:38:00|     1.79|   14075.0|United Kingdom|
|   580538|    23077| DOUGHNUT LIP GLOSS |      20|2011-12-05 08:38:00|     1.25|   14075.0|United Kingdom|
|   580538|    22906|12 MESSAGE CARDS ...|      24|2011-12-05 08:38:00|     1.65|   14075.0|United Kingdom|
|   580538|    21914|BLUE HARMONICA IN...|      24|2011-12-05 08:38:00|     1.25|   14075.0|United Kingdom|
|   580538|    22467|   GUMBALL COAT RACK|       6|2011-12-05 08:38:00|     2.55|   14075.0|United Kingdom|
|   580538|    21544|SKULLS  WATER TRA...|      48|2011-12-05 08:38:00|     0.85|   14075.0|United Kingdom|
|   580538|    23126|FELTCRA

In [10]:
# Type name of the schema object; evaluates to 'struct' for this StructType.
staticSchema.typeName()

'struct'

In [11]:
# Print the schema as a tree: one line per column with its type and nullability.
staticDataFrame.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



In [12]:
from pyspark.sql.functions import window, column, desc, col

In [13]:
# Batch version of the aggregation: total spend (UnitPrice * Quantity) per
# customer within each 1-day window of InvoiceDate; preview five result rows.
(
    staticDataFrame
    .selectExpr('CustomerId', '(UnitPrice * Quantity) as total_cost', 'InvoiceDate')
    .groupBy(col('CustomerId'), window(col('InvoiceDate'), '1 day'))
    .sum('total_cost')
    .show(5)
)

+----------+--------------------+-----------------+
|CustomerId|              window|  sum(total_cost)|
+----------+--------------------+-----------------+
|   16057.0|[2011-12-05 05:30...|            -37.6|
|   14126.0|[2011-11-29 05:30...|643.6300000000001|
|   13500.0|[2011-11-16 05:30...|497.9700000000001|
|   17160.0|[2011-11-08 05:30...|516.8499999999999|
|   15608.0|[2011-11-11 05:30...|            122.4|
+----------+--------------------+-----------------+
only showing top 5 rows



In [14]:
# Set the number of shuffle partitions used by Spark SQL to 5 for this session.
# NOTE(review): presumably lowered because the notebook runs on a small local
# setup - confirm before reusing this value on a cluster.
spark.conf.set('spark.sql.shuffle.partitions', '5')

In [15]:
# Streaming read over the same per-day CSV files. The schema captured from the
# batch read is supplied explicitly, and maxFilesPerTrigger=1 limits each
# trigger to a single file.
streamingDataFrame = (
    spark.readStream
    .format('csv')
    .schema(staticSchema)
    .options(maxFilesPerTrigger=1, header='true')
    .load('/home/jagadeesh/git/Spark-The-Definitive-Guide/data/retail-data/by-day/*.csv')
)

In [16]:
# Confirm the DataFrame is backed by a streaming source (evaluates to True here).
streamingDataFrame.isStreaming

True

In [18]:
# Streaming counterpart of the batch aggregation: total spend per customer per
# 1-day window of InvoiceDate. Nothing runs until a streaming sink is started.
# NOTE: despite the "PerHour" in the name, the window below is '1 day'; the
# name is kept because other cells refer to it.
purchaseByCustomerPerHour = (
    streamingDataFrame
    .selectExpr('CustomerId', '(UnitPrice * Quantity) as total_cost', 'InvoiceDate')
    .groupBy(col("CustomerId"), window(col('InvoiceDate'), '1 day'))
    .sum('total_cost')
)

In [19]:
# Start the streaming aggregation, writing results to an in-memory table that
# Spark SQL can query under the name 'customer_purchases'. 'complete' output
# mode rewrites the full aggregated result on every trigger.
#
# start() returns a StreamingQuery handle while the stream keeps running in
# the background. Keep that handle so the query can later be stopped or
# waited on, rather than discarding it.
customerPurchasesQuery = (
    purchaseByCustomerPerHour.writeStream
    .format('memory')
    .queryName('customer_purchases')
    .outputMode('complete')
    .start()
)
# Echo the handle so the cell still displays the StreamingQuery repr.
customerPurchasesQuery

<pyspark.sql.streaming.StreamingQuery at 0x7fac73093a20>

In [20]:
# Query the in-memory 'customer_purchases' table for the five largest totals.
# NOTE(review): the table is fed by the running streaming query, so the rows
# shown presumably reflect only what has been processed so far - re-run to refresh.
topPurchasesSql = """
 SELECT *
 FROM customer_purchases
 ORDER BY `sum(total_cost)` DESC
 """
spark.sql(topPurchasesSql).show(5)

+----------+--------------------+------------------+
|CustomerId|              window|   sum(total_cost)|
+----------+--------------------+------------------+
|      null|[2011-03-29 05:30...| 33521.39999999998|
|      null|[2010-12-21 05:30...|31347.479999999938|
|      null|[2010-12-10 05:30...|25399.560000000012|
|      null|[2010-12-06 05:30...|23395.099999999904|
|   15749.0|[2011-01-11 05:30...|           22998.4|
+----------+--------------------+------------------+
only showing top 5 rows

