In [1]:
// Requiring all the Imports
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.SparkContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions.Window._
import org.apache.spark.sql.types._



In [2]:
// creating a spark Session
val spark=SparkSession.builder().master("local[2]").appName("Streaming Word Count")
.config("spark.streaming.stopgracefullyOnShutdown",true)
.config("spark.sql.shuffle.partitions",3)
.config("spark.sql.streaming.schemaInterface",true)
.getOrCreate()

spark = org.apache.spark.sql.SparkSession@395d9942


org.apache.spark.sql.SparkSession@395d9942

In [3]:
// reading the data from socket
val orderdf=spark.readStream.format("socket")
.option("host","localhost")
.option("port","1334")
.load()

orderdf = [value: string]


[value: string]

In [4]:
// derfing the schema of  order_table
val order_schema=StructType(List(
StructField("order_id",IntegerType),
StructField("order_date",TimestampType),
StructField("order_customerid",IntegerType),
StructField("order_status",StringType),
StructField("amount",IntegerType)
))

order_schema = StructType(StructField(order_id,IntegerType,true), StructField(order_date,TimestampType,true), StructField(order_customerid,IntegerType,true), StructField(order_status,StringType,true), StructField(amount,IntegerType,true))


StructType(StructField(order_id,IntegerType,true), StructField(order_date,TimestampType,true), StructField(order_customerid,IntegerType,true), StructField(order_status,StringType,true), StructField(amount,IntegerType,true))

In [5]:
// giviing schema to the order_table
val orderdfnew=orderdf.select(from_json(col("value"),order_schema).alias("value"))
// giving the schema to the table as it is under the table
val refined_df=orderdfnew.select("value.*")
refined_df.printSchema()


root
 |-- order_id: integer (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- order_customerid: integer (nullable = true)
 |-- order_status: string (nullable = true)
 |-- amount: integer (nullable = true)



orderdfnew = [value: struct<order_id: int, order_date: timestamp ... 3 more fields>]
refined_df = [order_id: int, order_date: timestamp ... 3 more fields]


[order_id: int, order_date: timestamp ... 3 more fields]

In [6]:
// as we can not maintain the state for all the transaction otherwise it will consume more than enough memory and may give OOM Error.
val watermark_df=refined_df.withWatermark("order_date","15 minute")

watermark_df = [order_id: int, order_date: timestamp ... 3 more fields]


[order_id: int, order_date: timestamp ... 3 more fields]

In [7]:
val windowdf=watermark_df.groupBy(window(col("order_date"),"15 minute")).agg(sum("amount").alias("Total Invoice"))
""" input --root 
 //|-- window: struct (nullable = false)
 //|    |-- start: timestamp (nullable = true)
 //|    |-- end: timestamp (nullable = true)
 //|-- Total Invoice: long (nullable = true)
 """


windowdf = [window: struct<start: timestamp, end: timestamp>, Total Invoice: bigint]


" input --root
 //|-- window: struct (nullable = false)
 //|    |-- start: timestamp (nullable = true)
 //|    |-- end: timestamp (nullable = true)
 //|-- Total Invoice: long (nullable = true)
 "


In [8]:
// colllecting the final_output
val final_df=windowdf.select("window.start","window.end","Total Invoice")


final_df = [start: timestamp, end: timestamp ... 1 more field]


[start: timestamp, end: timestamp ... 1 more field]

In [None]:
// collecting the ouput of the stream on the terminal
val outputdf=final_df.writeStream.format("console").outputMode("update")
.option("checkpointLocation","order_5")
.trigger(Trigger.ProcessingTime("15 Seconds"))
.start()
outputdf.awaitTermination()

-------------------------------------------
Batch: 0
-------------------------------------------
+-----+---+-------------+
|start|end|Total Invoice|
+-----+---+-------------+
+-----+---+-------------+

-------------------------------------------
Batch: 1
-------------------------------------------
+-------------------+-------------------+-------------+
|              start|                end|Total Invoice|
+-------------------+-------------------+-------------+
|2020-03-02 11:00:00|2020-03-02 11:15:00|          500|
+-------------------+-------------------+-------------+

-------------------------------------------
Batch: 2
-------------------------------------------
+-----+---+-------------+
|start|end|Total Invoice|
+-----+---+-------------+
+-----+---+-------------+

-------------------------------------------
Batch: 3
-------------------------------------------
+-------------------+-------------------+-------------+
|              start|                end|Total Invoice|
+--------

In [None]:
{"order_id":5852,"order_date":"2020-03-02 11:48:00","order_customer_id":9344,"order_status":"COMPLETE", "amount": 400}

{"order_id":5852,"order_date":"2020-03-02 11:14:00","order_customer_id":9344,"order_status":"COMPLETE", "amount": 400}

{"order_id":5852,"order_date":"2020-03-02 11:16:00","order_customer_id":9344,"order_status":"COMPLETE", "amount": 600}