In [1]:
from pyspark.streaming import StreamingContext

# Create a StreamingContext with batch interval of 5 seconds
ssc = StreamingContext(sc, 5)

# Create a DStream that will connect to localhost at port 9999
# Start the Netcat server before running this cell: nc -lk 9999
lines = ssc.socketTextStream('localhost', 9999)

# Split each line into words
words = lines.flatMap(lambda line: line.split(" "))

# Count each word in each batch
pairs = words.map(lambda word: (word, 1))
wordCounts = pairs.reduceByKey(lambda x, y: x + y)

# Print the first ten elements of each RDD generated in these DStreams to the console
lines.pprint()
wordCounts.pprint()

ssc.start()  # Start the computation
try:
    print("Start")
    # Block for at most 20 seconds; this is a timeout, not a wait for natural termination
    ssc.awaitTermination(20)
finally:
    # Always stop the StreamingContext (without stopping the SparkContext), even when
    # the wait is interrupted; a context left running prevents later cells from
    # starting their own StreamingContext.
    ssc.stop(stopSparkContext=False)

print("Finished")

Start


[Stage 0:>                                                          (0 + 1) / 1]

-------------------------------------------
Time: 2024-05-19 22:18:40
-------------------------------------------



                                                                                

-------------------------------------------
Time: 2024-05-19 22:18:40
-------------------------------------------



[Stage 0:>                                                          (0 + 1) / 1]

-------------------------------------------
Time: 2024-05-19 22:18:45
-------------------------------------------



                                                                                

-------------------------------------------
Time: 2024-05-19 22:18:45
-------------------------------------------



[Stage 0:>                                                          (0 + 1) / 1]

24/05/19 22:18:47 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/19 22:18:47 WARN BlockManager: Block input-0-1716128327400 replicated to only 0 peer(s) instead of 1 peers
24/05/19 22:18:49 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/19 22:18:49 WARN BlockManager: Block input-0-1716128328800 replicated to only 0 peer(s) instead of 1 peers


                                                                                

-------------------------------------------
Time: 2024-05-19 22:18:50
-------------------------------------------
ak p
ap k

-------------------------------------------
Time: 2024-05-19 22:18:50
-------------------------------------------
('p', 1)
('ap', 1)
('ak', 1)
('k', 1)



[Stage 0:>                                                          (0 + 1) / 1]

-------------------------------------------
Time: 2024-05-19 22:18:55
-------------------------------------------



                                                                                

-------------------------------------------
Time: 2024-05-19 22:18:55
-------------------------------------------



[Stage 0:>                                                          (0 + 1) / 1]

24/05/19 22:18:58 WARN SocketReceiver: Error receiving data
java.net.SocketException: Socket closed
	at java.net.SocketInputStream.socketRead0(Native Method)
	at java.net.SocketInputStream.socketRead(SocketInputStream.java:116)
	at java.net.SocketInputStream.read(SocketInputStream.java:171)
	at java.net.SocketInputStream.read(SocketInputStream.java:141)
	at sun.nio.cs.StreamDecoder.readBytes(StreamDecoder.java:284)
	at sun.nio.cs.StreamDecoder.implRead(StreamDecoder.java:326)
	at sun.nio.cs.StreamDecoder.read(StreamDecoder.java:178)
	at java.io.InputStreamReader.read(InputStreamReader.java:184)
	at java.io.BufferedReader.fill(BufferedReader.java:161)
	at java.io.BufferedReader.readLine(BufferedReader.java:324)
	at java.io.BufferedReader.readLine(BufferedReader.java:389)
	at org.apache.spark.streaming.dstream.SocketReceiver$$anon$2.getNext(SocketInputDStream.scala:121)
	at org.apache.spark.streaming.dstream.SocketReceiver$$anon$2.getNext(SocketInputDStream.scala:119)
	at org.apache.spar

Exception in thread "receiver-supervisor-future-0" java.lang.Error: java.lang.InterruptedException: sleep interrupted
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1155)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)
Caused by: java.lang.InterruptedException: sleep interrupted
	at java.lang.Thread.sleep(Native Method)
	at org.apache.spark.streaming.receiver.ReceiverSupervisor.$anonfun$restartReceiver$1(ReceiverSupervisor.scala:196)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at scala.concurrent.Future$.$anonfun$apply$1(Future.scala:659)
	at scala.util.Success.$anonfun$map$1(Try.scala:255)
	at scala.util.Success.map(Try.scala:213)
	at scala.concurrent.Future.$anonfun$map$1(Future.scala:292)
	at scala.concurrent.impl.Promise.liftedTree1$1(Promise.scala:33)
	at scala.concurrent.impl.Promise.$anonfun$transform$1(Promise.scala:33)
	at scala.concurren

In [3]:
from pyspark.streaming import StreamingContext

# Load the input text that will be replayed as a stream
rdd = sc.textFile('../data/adj_noun_pairs.txt', 8)

# Randomly split the rdd into 5 parts of roughly equal size (equal weights, seed 123);
# each part is consumed as one batch
rddQueue = rdd.randomSplit([1,1,1,1,1], 123)

# Create a StreamingContext with batch interval of 5 seconds
ssc = StreamingContext(sc, 5)

# Feed the rdd queue to a DStream
lines = ssc.queueStream(rddQueue)

# Do word-counting as before
words = lines.flatMap(lambda line: line.split(" "))
pairs = words.map(lambda word: (word, 1))
wordCounts = pairs.reduceByKey(lambda x, y: x + y)

# Use transform() to access any rdd transformations not directly available in SparkStreaming
# (here: sort each batch by count, descending)
topWords = wordCounts.transform(lambda rdd: rdd.sortBy(lambda x: x[1], False))
topWords.pprint()

ssc.start()  # Start the computation
try:
    ssc.awaitTermination(25)  # Block for at most 25 seconds
finally:
    # Stop the StreamingContext but keep the SparkContext, even if the wait is interrupted
    ssc.stop(False)
print("Finished")

                                                                                

-------------------------------------------
Time: 2024-04-15 19:57:30
-------------------------------------------
('other', 15486)
('first', 10815)
('many', 9773)
('new', 6272)
('system', 5063)
('american', 4744)
('several', 4545)
('century', 4492)
('same', 4394)
('=', 4313)
...



                                                                                

-------------------------------------------
Time: 2024-04-15 19:57:35
-------------------------------------------
('other', 15319)
('first', 10709)
('many', 9575)
('new', 6242)
('system', 5111)
('american', 4777)
('century', 4538)
('several', 4515)
('same', 4453)
('=', 4361)
...



                                                                                

-------------------------------------------
Time: 2024-04-15 19:57:40
-------------------------------------------
('other', 15346)
('first', 10517)
('many', 9706)
('new', 6218)
('system', 5266)
('american', 4940)
('several', 4615)
('=', 4451)
('century', 4438)
('same', 4270)
...



                                                                                

-------------------------------------------
Time: 2024-04-15 19:57:45
-------------------------------------------
('other', 15196)
('first', 10617)
('many', 9848)
('new', 6289)
('system', 5093)
('american', 4824)
('several', 4519)
('century', 4493)
('=', 4332)
('same', 4260)
...



                                                                                

-------------------------------------------
Time: 2024-04-15 19:57:50
-------------------------------------------
('other', 15091)
('first', 10463)
('many', 9703)
('new', 6175)
('system', 5102)
('american', 4829)
('several', 4523)
('century', 4520)
('=', 4369)
('same', 4325)
...

Finished


In [4]:
# Find the most positive words in each 5-second batch of streaming data
# URL for data: https://cse.hkust.edu.hk/msbd5003/data/AFINN-111.txt

from pyspark.streaming import StreamingContext

def parse_line(l):
    """Parse one tab-separated line into a (first field, float of second field) tuple.

    Fields beyond the second are ignored. A line without a tab raises
    IndexError, and a non-numeric second field raises ValueError.
    """
    fields = l.split("\t")
    key, raw_score = fields[0], fields[1]
    return (key, float(raw_score))

# Static RDD of pairs produced by parse_line; cached because it is joined with every batch
word_sentiments = sc.textFile("../data/AFINN-111.txt") \
                    .map(parse_line).cache()

ssc = StreamingContext(sc, 5)
rdd = sc.textFile('../data/adj_noun_pairs.txt', 8)
# Five random parts with equal weights (seed 123); each part is replayed as one batch
rddQueue = rdd.randomSplit([1,1,1,1,1], 123)
lines = ssc.queueStream(rddQueue)

# Per-batch word counts
word_counts = lines.flatMap(lambda line: line.split(" ")) \
                   .map(lambda word: (word, 1)) \
                   .reduceByKey(lambda a, b: a + b)

# Determine the words with the highest sentiment values by joining the streaming RDD
# with the static RDD inside the transform() method and then multiplying
# the frequency of the words by its sentiment value.
# After the join each element is (word, (sentiment, count)); the map re-keys it as
# (sentiment * count, word) so that sortByKey(False) orders by that product, descending.
happiest_words = word_counts.transform(lambda rdd: word_sentiments.join(rdd)) \
                            .map(lambda t:
                                 (t[1][0] * t[1][1], t[0])) \
                            .transform(lambda rdd: rdd.sortByKey(False))

happiest_words.pprint()

ssc.start()
try:
    ssc.awaitTermination(25)  # Block for at most 25 seconds
finally:
    # Stop the StreamingContext but keep the SparkContext, even if the wait is interrupted
    ssc.stop(False)
print("Finished")

                                                                                

-------------------------------------------
Time: 2024-04-15 20:02:50
-------------------------------------------
(7890.0, 'great')
(6180.0, 'popular')
(5544.0, 'best')
(4662.0, 'good')
(4242.0, 'important')
(2340.0, 'strong')
(2322.0, 'greater')
(2058.0, 'successful')
(1850.0, 'novel')
(1790.0, 'natural')
...



                                                                                

-------------------------------------------
Time: 2024-04-15 20:02:55
-------------------------------------------
(7578.0, 'great')
(6126.0, 'popular')
(5604.0, 'best')
(4749.0, 'good')
(4172.0, 'important')
(2253.0, 'greater')
(2252.0, 'strong')
(2001.0, 'successful')
(1948.0, 'novel')
(1804.0, 'natural')
...



                                                                                

-------------------------------------------
Time: 2024-04-15 20:03:00
-------------------------------------------
(7893.0, 'great')
(6009.0, 'popular')
(5574.0, 'best')
(4677.0, 'good')
(4298.0, 'important')
(2326.0, 'strong')
(2172.0, 'greater')
(1944.0, 'successful')
(1868.0, 'novel')
(1761.0, 'natural')
...



                                                                                

-------------------------------------------
Time: 2024-04-15 20:03:05
-------------------------------------------
(7776.0, 'great')
(5925.0, 'popular')
(5538.0, 'best')
(4671.0, 'good')
(4156.0, 'important')
(2362.0, 'strong')
(2205.0, 'greater')
(1932.0, 'successful')
(1892.0, 'novel')
(1803.0, 'natural')
...



                                                                                

-------------------------------------------
Time: 2024-04-15 20:03:10
-------------------------------------------
(7566.0, 'great')
(5916.0, 'popular')
(5334.0, 'best')
(4548.0, 'good')
(4268.0, 'important')
(2415.0, 'greater')
(2314.0, 'strong')
(2109.0, 'successful')
(2018.0, 'novel')
(1821.0, 'natural')
...

Finished


In [7]:
from pyspark.streaming import StreamingContext

# Stateful word count

# StreamingContext with a batch interval of 5 seconds
ssc = StreamingContext(sc, 5)
# Provide a checkpointing directory.  Required for stateful transformations
# (updateStateByKey is applied to this stream further down in the cell)
ssc.checkpoint("checkpoint")

# Read the input text and randomly split it into 10 parts with equal weights (seed 123);
# the resulting list of RDDs is fed to the stream as a queue
rdd = sc.textFile('../data/adj_noun_pairs.txt', 8)
rddQueue = rdd.randomSplit([1]*10, 123)
lines = ssc.queueStream(rddQueue)

def updateFunc(newValues, runningCount):
    """Return the updated running count for one key.

    newValues    -- iterable of counts seen for the key in the current batch
    runningCount -- the key's previous state, or None when there is none yet

    The result is the previous count (0 when None) plus all new values.
    """
    previous = 0 if runningCount is None else runningCount
    return sum(newValues, previous)

# Per-batch word counts folded into a per-word running total via updateFunc
running_counts = (
    lines
    .flatMap(lambda text: text.split(" "))
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
    .updateStateByKey(updateFunc)
)

# Order each batch's running totals by count, largest first
counts_sorted = running_counts.transform(
    lambda batch: batch.sortBy(lambda pair: pair[1], False)
)

def printResults(rdd):
    """Print a summary of one batch's (word, count) RDD.

    Prints the number of elements, the first five elements, and the count
    stored for the word 'refinery'. When 'refinery' is not present, 0 is
    printed instead of raising IndexError on the empty lookup result.
    """
    # Three actions run on the same RDD below; cache it so it is not recomputed each time
    rdd.cache()
    try:
        print("Total distinct words: ", rdd.count())
        print(rdd.take(5))
        refinery_counts = rdd.lookup('refinery')  # [] when the key is absent
        print('refinery:', refinery_counts[0] if refinery_counts else 0)
    finally:
        rdd.unpersist()

# Call printResults on the sorted running counts of every batch
counts_sorted.foreachRDD(printResults)

ssc.start()
try:
    ssc.awaitTermination(50)  # Block for at most 50 seconds
finally:
    # Stop the StreamingContext but keep the SparkContext, even if the wait is interrupted;
    # a context left running prevents later cells from starting their own.
    ssc.stop(False)
print("Finished")

                                                                                

Total distinct words:  51430
[('other', 7782), ('first', 5404), ('many', 4878), ('new', 3219), ('system', 2539)]
refinery: 1


                                                                                

Total distinct words:  76917
[('other', 15486), ('first', 10815), ('many', 9773), ('new', 6272), ('system', 5063)]
refinery: 6


                                                                                

Total distinct words:  97338
[('other', 23129), ('first', 16145), ('many', 14534), ('new', 9363), ('system', 7636)]
refinery: 11


                                                                                

Total distinct words:  114786
[('other', 30805), ('first', 21524), ('many', 19348), ('new', 12514), ('system', 10174)]
refinery: 16


                                                                                

Total distinct words:  130393
[('other', 38452), ('first', 26792), ('many', 24239), ('new', 15618), ('system', 12777)]
refinery: 19


                                                                                

Total distinct words:  144898
[('other', 46151), ('first', 32041), ('many', 29054), ('new', 18732), ('system', 15440)]
refinery: 23


                                                                                

Total distinct words:  158013
[('other', 53728), ('first', 37395), ('many', 33933), ('new', 21787), ('system', 17946)]
refinery: 26


                                                                                

Total distinct words:  170610
[('other', 61347), ('first', 42658), ('many', 38902), ('new', 25021), ('system', 20533)]
refinery: 29


                                                                                

Total distinct words:  182330
[('other', 68982), ('first', 47856), ('many', 43780), ('new', 28095), ('system', 23031)]
refinery: 37


                                                                                

Total distinct words:  193450
[('other', 76438), ('first', 53121), ('many', 48605), ('new', 31196), ('system', 25635)]
refinery: 40
Finished


In [8]:
# MG algorithm for approximate word count

from pyspark.streaming import StreamingContext

# k: printResults compares the number of distinct words against k and, when it is
#    exceeded, takes the count found at sorted position k as the next threshold
k = 10000
# threshold: amount updateFunc subtracts from every word's count in the next batch
threshold = 0
# total_decrement: running sum of all thresholds chosen so far (updated in printResults)
total_decrement = 0

# StreamingContext with a batch interval of 5 seconds
ssc = StreamingContext(sc, 5)
# Provide a checkpointing directory.  Required for stateful transformations
ssc.checkpoint("checkpoint")

# Read the input text and randomly split it into 10 parts with equal weights (seed 123);
# the resulting list of RDDs is fed to the stream as a queue
rdd = sc.textFile('../data/adj_noun_pairs.txt', 8)
rddQueue = rdd.randomSplit([1]*10, 123)
lines = ssc.queueStream(rddQueue)

def updateFunc(newValues, runningCount):
    """Return the decremented running count for one key, or None to drop the key.

    newValues    -- iterable of counts seen for the key in the current batch
    runningCount -- the key's previous state, or None when there is none yet

    The previous count (0 when None) and the new values are summed, then the
    module-level `threshold` is subtracted. A result that is not strictly
    positive is returned as None.
    """
    previous = 0 if runningCount is None else runningCount
    decremented = sum(newValues, previous) - threshold
    if decremented > 0:
        return decremented
    return None

# Per-batch word counts folded into the per-word state maintained by updateFunc
tokens = lambda text: text.split(" ")
running_counts = (
    lines
    .flatMap(tokens)
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
    .updateStateByKey(updateFunc)
)

# Order each batch's state by count, largest first
counts_sorted = running_counts.transform(
    lambda batch: batch.sortBy(lambda pair: pair[1], False)
)

def printResults(rdd):
    """Print a summary of one batch's sorted (word, count) RDD and pick the next threshold.

    Prints the number of elements, the first five elements as
    (word, count, count + total_decrement), and the same two numbers for the
    word 'refinery' (0 is used when it is absent).
    NOTE(review): count + total_decrement is presumably the Misra-Gries upper
    bound for the true count, with the stored count as the lower bound.

    Side effects on module-level state:
      threshold       -- set to the count at position k of the RDD when it has
                         more than k elements, otherwise reset to 0
      total_decrement -- increased by the new threshold
    """
    global threshold, total_decrement
    # Several actions run on the same RDD below; cache it so it is not recomputed each time
    rdd.cache()
    try:
        distinct_words = rdd.count()
        print("Total distinct words: ", distinct_words)
        print(rdd.map(lambda x: (x[0], x[1], x[1]+total_decrement)).take(5))
        lower_bound = rdd.lookup('refinery')
        if len(lower_bound) > 0:
            lower_bound = lower_bound[0]
        else:
            lower_bound = 0
        print('refinery:', lower_bound, ',', lower_bound + total_decrement)
        if distinct_words > k:
            # Index the elements by position and read the count stored at position k
            threshold = rdd.zipWithIndex().map(lambda x: (x[1], x[0])).lookup(k)[0][1]
        else:
            # Was misspelled 'threhold', so the previous threshold was silently kept
            threshold = 0
        print("Next threshold = ", threshold)
        total_decrement += threshold
    finally:
        rdd.unpersist()

# Call printResults on the sorted state of every batch
counts_sorted.foreachRDD(printResults)

ssc.start()
try:
    ssc.awaitTermination(50)  # Block for at most 50 seconds
finally:
    # Stop the StreamingContext but keep the SparkContext, even if the wait is interrupted;
    # a context left running prevents later cells from starting their own.
    ssc.stop(False)
print("Finished")

                                                                                

Total distinct words:  51430
[('other', 7782, 7782), ('first', 5404, 5404), ('many', 4878, 4878), ('new', 3219, 3219), ('system', 2539, 2539)]
refinery: 1 , 1
Next threshold =  5


                                                                                

Total distinct words:  13971
[('other', 15481, 15486), ('first', 10810, 10815), ('many', 9768, 9773), ('new', 6267, 6272), ('system', 5058, 5063)]
refinery: 1 , 6
Next threshold =  5


                                                                                

Total distinct words:  12164
[('other', 23119, 23129), ('first', 16135, 16145), ('many', 14524, 14534), ('new', 9353, 9363), ('system', 7626, 7636)]
refinery: 1 , 11
Next threshold =  4


                                                                                

Total distinct words:  12317
[('other', 30791, 30805), ('first', 21510, 21524), ('many', 19334, 19348), ('new', 12500, 12514), ('system', 10160, 10174)]
refinery: 2 , 16
Next threshold =  5


                                                                                

Total distinct words:  11650
[('other', 38433, 38452), ('first', 26773, 26792), ('many', 24220, 24239), ('new', 15599, 15618), ('system', 12758, 12777)]
refinery: 0 , 19
Next threshold =  5


                                                                                

Total distinct words:  11396
[('other', 46127, 46151), ('first', 32017, 32041), ('many', 29030, 29054), ('new', 18708, 18732), ('system', 15416, 15440)]
refinery: 0 , 24
Next threshold =  4


                                                                                

Total distinct words:  11963
[('other', 53700, 53728), ('first', 37367, 37395), ('many', 33905, 33933), ('new', 21759, 21787), ('system', 17918, 17946)]
refinery: 0 , 28
Next threshold =  5


                                                                                

Total distinct words:  11415
[('other', 61314, 61347), ('first', 42625, 42658), ('many', 38869, 38902), ('new', 24988, 25021), ('system', 20500, 20533)]
refinery: 0 , 33
Next threshold =  5


                                                                                

Total distinct words:  11314
[('other', 68944, 68982), ('first', 47818, 47856), ('many', 43742, 43780), ('new', 28057, 28095), ('system', 22993, 23031)]
refinery: 3 , 41
Next threshold =  5


                                                                                

Total distinct words:  11204
[('other', 76395, 76438), ('first', 53078, 53121), ('many', 48562, 48605), ('new', 31153, 31196), ('system', 25592, 25635)]
refinery: 1 , 44
Next threshold =  5
Finished


In [10]:
from pyspark.streaming import StreamingContext

# Create a queue of RDDs: batch i holds five copies of the value i
rddQueue = []
for i in range(5):
    rdd = sc.parallelize([i] * 5)
    rddQueue.append(rdd)

# Create a StreamingContext with batch interval of 3 seconds
ssc = StreamingContext(sc, 3)

ssc.checkpoint("checkpoint")

# Feed the rdd queue to a DStream
nums = ssc.queueStream(rddQueue)

# Compute the maximum over a sliding window of 9 seconds, every 3 seconds.
# No inverse function is passed (None), so each window is reduced from scratch.
# NOTE: despite its name, slidingSum holds the windowed maximum in this variant.
slidingSum = nums.reduceByWindow(lambda x, y: max(x,y), None, 9, 3)
# Alternative: windowed sum, with an inverse function to subtract values leaving the window
#slidingSum = nums.reduceByWindow(lambda x, y: x + y, lambda x, y: x - y, 9, 3)

slidingSum.pprint()

ssc.start()  # Start the computation
try:
    ssc.awaitTermination(24)  # Block for at most 24 seconds
finally:
    # Stop the StreamingContext but keep the SparkContext, even if the wait is interrupted
    ssc.stop(False)
print("Finished")

-------------------------------------------
Time: 2024-04-15 21:27:54
-------------------------------------------
0

-------------------------------------------
Time: 2024-04-15 21:27:57
-------------------------------------------
1

-------------------------------------------
Time: 2024-04-15 21:28:00
-------------------------------------------
2

-------------------------------------------
Time: 2024-04-15 21:28:03
-------------------------------------------
3

-------------------------------------------
Time: 2024-04-15 21:28:06
-------------------------------------------
4

-------------------------------------------
Time: 2024-04-15 21:28:09
-------------------------------------------
4

-------------------------------------------
Time: 2024-04-15 21:28:12
-------------------------------------------
4

-------------------------------------------
Time: 2024-04-15 21:28:15
-------------------------------------------

Finished


In [1]:
# Word count using structured streaming: Complete mode vs update mode

# Import only the functions used here: a star import of pyspark.sql.functions shadows
# Python builtins such as sum() and max() for every cell run afterwards in this kernel.
from pyspark.sql.functions import explode, split

# Streaming DataFrame of text lines read from a socket (start the server with: nc -lk 9999)
lines = (
    spark
    .readStream
    .format('socket')
    .option('host', 'localhost')
    .option('port', '9999')
    .load()
)

# Split the lines into words
# split() splits each line into an array, and explode() turns the array into multiple rows
words = lines.select(explode(split(lines.value, ' ')).alias('word'))

word_counts = words.groupBy('word').count()

# Start running the query
query = (
    word_counts
    .writeStream
    .outputMode('complete')
    .format('console')
    .option('truncate', 'false')
    .trigger(processingTime='5 seconds')
    .start()
)

# try update mode; append mode not supported

try:
    query.awaitTermination(25)  # Block for at most 25 seconds
finally:
    # Always stop the query, even if the wait is interrupted, so it does not keep
    # running in the background
    query.stop()
print("Finished")

24/04/29 22:08:48 WARN TextSocketSourceProvider: The socket source should not be used for production applications! It does not support recovery.
24/04/29 22:08:50 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-b7e17fa6-4901-4f16-b270-bd3f8f2c2252. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/04/29 22:08:50 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.




24/04/29 22:09:15 ERROR WriteToDataSourceV2Exec: Data source write support org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@3925c7a4 is aborting.
24/04/29 22:09:15 ERROR WriteToDataSourceV2Exec: Data source write support org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@3925c7a4 aborted.
24/04/29 22:09:15 WARN Shell: Interrupted while joining on: Thread[Thread-1809,5,main]
java.lang.InterruptedException
	at java.lang.Object.wait(Native Method)
	at java.lang.Thread.join(Thread.java:1257)
	at java.lang.Thread.join(Thread.java:1331)
	at org.apache.hadoop.util.Shell.joinThread(Shell.java:1042)
	at org.apache.hadoop.util.Shell.runCommand(Shell.java:1002)
	at org.apache.hadoop.util.Shell.run(Shell.java:900)
	at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:1212)
	at org.apache.hadoop.util.Shell.execCommand(Shell.java:1306)
	at org.apache.hadoop.util.Shell.execCommand(Shell.java:1288)
	at org.apache.hadoop.fs.RawLocalFileSystem.setPermissi



24/04/29 22:09:15 WARN TaskSetManager: Lost task 159.0 in stage 1.0 (TID 165) (192.168.1.60 executor driver): TaskKilled (Stage cancelled)
24/04/29 22:09:16 WARN TaskSetManager: Lost task 160.0 in stage 1.0 (TID 166) (192.168.1.60 executor driver): TaskKilled (Stage cancelled)
24/04/29 22:09:16 WARN TaskSetManager: Lost task 158.0 in stage 1.0 (TID 164) (192.168.1.60 executor driver): TaskKilled (Stage cancelled)
Finished




In [4]:
# Append mode with selection condition
# Note: complete mode not supported if no aggregation

# Import only the functions used here: a star import of pyspark.sql.functions shadows
# Python builtins such as sum() and max() for every cell run afterwards in this kernel.
from pyspark.sql.functions import explode, length, split

# Streaming DataFrame of text lines read from a socket (start the server with: nc -lk 9999)
lines = (
    spark
    .readStream
    .format('socket')
    .option('host', 'localhost')
    .option('port', '9999')
    .load()
)

# Split the lines into words
# split() splits each line into an array, and explode() turns the array into multiple rows
words = lines.select(explode(split(lines.value, ' ')).alias('word'))

# Keep only words of at least 3 characters
long_words = words.filter(length(words['word'])>=3)

# Start running the query
query = (
    long_words
    .writeStream
    .outputMode('append')
    .format('console')
    .option('truncate', 'false')
    .trigger(processingTime='5 seconds')
    .start()
)

try:
    query.awaitTermination(25)  # Block for at most 25 seconds
finally:
    # Always stop the query, even if the wait is interrupted (e.g. KeyboardInterrupt),
    # so it does not keep running in the background
    query.stop()
print("Finished")

24/04/29 22:11:07 WARN TextSocketSourceProvider: The socket source should not be used for production applications! It does not support recovery.
24/04/29 22:11:07 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-ddb10c12-977c-4b27-9232-c48780e8f753. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/04/29 22:11:07 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
-------------------------------------------
Batch: 0
-------------------------------------------
+----+
|word|
+----+
+----+

-------------------------------------------
Batch: 1
-------------------------------------------
+--------+
|word    |
+--------+
|aeasfaef|
+--------+

-------------------------------------------
Bat

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/home/bigbenchung/Conda/lib/python3.9/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/bigbenchung/Conda/lib/python3.9/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/home/bigbenchung/Conda/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [3]:
# Import only the functions used here: a star import of pyspark.sql.functions shadows
# Python builtins such as sum() and max() for every cell run afterwards in this kernel.
from pyspark.sql.functions import explode, split, window

# Streaming DataFrame of text lines read from a socket, with a timestamp column
# added by the source (includeTimestamp)
lines = (
    spark
    .readStream
    .format('socket')
    .option('host', 'localhost')
    .option('port', '9999')
    .option('includeTimestamp', 'true')
    .load()
)

# Split the lines into words, retaining timestamps
# split() splits each line into an array, and explode() turns the array into multiple rows
words = lines.select(explode(split(lines.value, ' ')).alias('word'),
                     lines.timestamp)

# Count each word per 10-second window, with windows sliding every 5 seconds
windowedCounts = words.groupBy(
    window(words.timestamp, "10 seconds", "5 seconds"),
    words.word)\
    .count()

# Start running the query
query = (
    windowedCounts
    .writeStream
    .outputMode('update')
    .format('console')
    .option('truncate', 'false')
    .trigger(processingTime='5 seconds')
    .start()
)

try:
    query.awaitTermination(25)  # Block for at most 25 seconds
finally:
    # Always stop the query, even if the wait is interrupted, so it does not keep
    # running in the background
    query.stop()
print("Finished")

24/04/29 21:53:30 WARN TextSocketSourceProvider: The socket source should not be used for production applications! It does not support recovery.
24/04/29 21:53:30 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-5c032ec6-235b-4868-b523-70c4fa3153dd. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/04/29 21:53:30 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.




24/04/29 21:53:56 ERROR WriteToDataSourceV2Exec: Data source write support org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@732abb5d is aborting.
24/04/29 21:53:56 ERROR WriteToDataSourceV2Exec: Data source write support org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@732abb5d aborted.
24/04/29 21:53:56 WARN Shell: Interrupted while joining on: Thread[Thread-4307,5,]
java.lang.InterruptedException
	at java.lang.Object.wait(Native Method)
	at java.lang.Thread.join(Thread.java:1257)
	at java.lang.Thread.join(Thread.java:1331)
	at org.apache.hadoop.util.Shell.joinThread(Shell.java:1042)
	at org.apache.hadoop.util.Shell.runCommand(Shell.java:1002)
	at org.apache.hadoop.util.Shell.run(Shell.java:900)
	at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:1212)
	at org.apache.hadoop.util.Shell.execCommand(Shell.java:1306)
	at org.apache.hadoop.util.Shell.execCommand(Shell.java:1288)
	at org.apache.hadoop.fs.FileUtil.readLink(FileUtil.java:21

24/04/29 21:53:56 ERROR Utils: Aborting task
java.lang.IllegalStateException: Error committing version 1 into HDFSStateStore[id=(op=0,part=185),dir=file:/tmp/temporary-5c032ec6-235b-4868-b523-70c4fa3153dd/state/0/185]
	at org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider$HDFSBackedStateStore.commit(HDFSBackedStateStoreProvider.scala:148)
	at org.apache.spark.sql.execution.streaming.state.StreamingAggregationStateManagerBaseImpl.commit(StreamingAggregationStateManager.scala:89)
	at org.apache.spark.sql.execution.streaming.StateStoreSaveExec$$anon$2.$anonfun$close$3(statefulOperators.scala:481)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.util.Utils$.timeTakenMs(Utils.scala:642)
	at org.apache.spark.sql.execution.streaming.StateStoreWriter.timeTakenMs(statefulOperators.scala:142)
	at org.apache.spark.sql.execution.streaming.StateStoreWriter.timeTakenMs$(statefulOperators.scala:142)
	at org.apache.spark.sql.execu

In [20]:
# Rate source (for testing) - Generates data at the specified number of rows per second, 
# each output row contains a timestamp and value. Where timestamp is a Timestamp type 
# containing the time of message dispatch, and value is of Long type containing the message count, 
# starting from 0 as the first row. This source is intended for testing and benchmarking.

from pyspark.sql.functions import *

# Streaming DataFrame produced by the built-in 'rate' test source
streamingDf = (
    spark
    .readStream
    .format('rate')
    .load()
)

# Start running the query
query = (
    streamingDf
    .writeStream
    .outputMode('append')
    .format('console')
    .option('truncate', 'false')
    .trigger(processingTime='5 seconds')
    .start()
)

try:
    query.awaitTermination(25)  # Block for at most 25 seconds
finally:
    # Always stop the query, even if the wait is interrupted, so it does not keep
    # running in the background
    query.stop()
print("Finished")

-------------------------------------------
Batch: 0
-------------------------------------------
+---------+-----+
|timestamp|value|
+---------+-----+
+---------+-----+

-------------------------------------------
Batch: 1
-------------------------------------------
+-----------------------+-----+
|timestamp              |value|
+-----------------------+-----+
|2024-04-15 21:57:48.456|0    |
+-----------------------+-----+

-------------------------------------------
Batch: 2
-------------------------------------------
+-----------------------+-----+
|timestamp              |value|
+-----------------------+-----+
|2024-04-15 21:57:49.456|1    |
|2024-04-15 21:57:50.456|2    |
|2024-04-15 21:57:51.456|3    |
|2024-04-15 21:57:52.456|4    |
|2024-04-15 21:57:53.456|5    |
+-----------------------+-----+

-------------------------------------------
Batch: 3
-------------------------------------------
+-----------------------+-----+
|timestamp              |value|
+-----------------------+

In [27]:
# Stream–Static Joins

from pyspark.sql.functions import *

# Small static lookup table: id -> fruit name
staticDf = spark.createDataFrame(
    [(1, 'apple'), (2, 'orange'), (10, 'banana'), (12, 'apple')],
    ['id', 'name'],
)

# Streaming side: the built-in 'rate' test source
streamingDf = spark.readStream.format('rate').load()

# Join the stream to the static table on value == id and keep a running count
# per name. Stream-static joins also support leftOuter, but not rightOuter.
query = (
    streamingDf
    .join(staticDf, streamingDf['value'] == staticDf['id'])
    .groupBy('name')
    .count()
    .writeStream
    .outputMode('complete')
    .format('console')
    .option('truncate', 'false')
    .trigger(processingTime='5 seconds')
    .start()
)

# Wait up to 25 seconds, then shut the query down
query.awaitTermination(25)
query.stop()
print("Finished")

                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+----+-----+
|name|count|
+----+-----+
+----+-----+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+------+-----+
|name  |count|
+------+-----+
|orange|1    |
|apple |1    |
+------+-----+



                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+------+-----+
|name  |count|
+------+-----+
|orange|1    |
|apple |1    |
+------+-----+



                                                                                

-------------------------------------------
Batch: 3
-------------------------------------------
+------+-----+
|name  |count|
+------+-----+
|orange|1    |
|apple |2    |
|banana|1    |
+------+-----+



                                                                                

-------------------------------------------
Batch: 4
-------------------------------------------
+------+-----+
|name  |count|
+------+-----+
|orange|1    |
|apple |2    |
|banana|1    |
+------+-----+

Finished


24/04/15 22:02:25 ERROR TorrentBroadcast: Store broadcast broadcast_594 fail, remove all pieces of the broadcast


In [28]:
# Stream-stream Joins
spark.conf.set('spark.sql.shuffle.partitions', 4) 

from pyspark.sql.functions import *

# Left side: 'rate' test source with default options
streamingDf = spark.readStream.format('rate').load()

# Right side: a second 'rate' source configured for 2 rows per second
streamingDf2 = (
    spark.readStream
    .format('rate')
    .option('rowsPerSecond', 2)
    .load()
)

# Equi-join the two streams on 'value' and print each micro-batch of matches
query = (
    streamingDf
    .join(streamingDf2, 'value')
    .writeStream
    .outputMode('append')
    .format('console')
    .option('truncate', 'false')
    .trigger(processingTime='5 seconds')
    .start()
)

# Wait up to 25 seconds, then shut the query down
query.awaitTermination(25)
query.stop()
print("Finished")

-------------------------------------------
Batch: 0
-------------------------------------------
+-----+---------+---------+
|value|timestamp|timestamp|
+-----+---------+---------+
+-----+---------+---------+

-------------------------------------------
Batch: 1
-------------------------------------------
+-----+-----------------------+-----------------------+
|value|timestamp              |timestamp              |
+-----+-----------------------+-----------------------+
|2    |2024-04-15 22:03:33.939|2024-04-15 22:03:32.967|
|0    |2024-04-15 22:03:31.939|2024-04-15 22:03:31.967|
|1    |2024-04-15 22:03:32.939|2024-04-15 22:03:32.467|
+-----+-----------------------+-----------------------+

-------------------------------------------
Batch: 2
-------------------------------------------
+-----+-----------------------+-----------------------+
|value|timestamp              |timestamp              |
+-----+-----------------------+-----------------------+
|4    |2024-04-15 22:03:35.939|2024

In [28]:
# Stream–stream Joins with watermarking
spark.conf.set('spark.sql.shuffle.partitions', 4) 

from pyspark.sql.functions import *

# Left side: 'rate' test source with default options
streamingDf = spark.readStream.format('rate').load()

# Right side: a second 'rate' source configured for 2 rows per second
streamingDf2 = (
    spark.readStream
    .format('rate')
    .option('rowsPerSecond', 2)
    .load()
)

# Put a 3-second watermark on the 'timestamp' column of each side, then
# equi-join the two streams on 'value' and print each micro-batch of matches
query = (
    streamingDf.withWatermark('timestamp', '3 seconds')
    .join(streamingDf2.withWatermark('timestamp', '3 seconds'), 'value')
    .writeStream
    .outputMode('append')
    .format('console')
    .option('truncate', 'false')
    .trigger(processingTime='5 seconds')
    .start()
)

# Wait up to 50 seconds, then shut the query down
query.awaitTermination(50)
query.stop()
print("Finished")

#The current watermark is computed by looking at the MAX(eventTime) seen across all of the partitions in 
#the query minus a user specified delayThreshold. Due to the cost of coordinating this value across partitions,
#the actual watermark used is only guaranteed to be at least delayThreshold behind the actual event time. 
#In some cases we may still process records that arrive more than delayThreshold late.

-------------------------------------------
Batch: 0
-------------------------------------------
+-----+---------+---------+
|value|timestamp|timestamp|
+-----+---------+---------+
+-----+---------+---------+

-------------------------------------------
Batch: 1
-------------------------------------------
+-----+-----------------------+-----------------------+
|value|timestamp              |timestamp              |
+-----+-----------------------+-----------------------+
|2    |2024-04-15 17:16:03.386|2024-04-15 17:16:01.414|
|0    |2024-04-15 17:16:01.386|2024-04-15 17:16:01.413|
|1    |2024-04-15 17:16:02.386|2024-04-15 17:16:01.414|
+-----+-----------------------+-----------------------+

-------------------------------------------
Batch: 2
-------------------------------------------
+-----+-----------------------+-----------------------+
|value|timestamp              |timestamp              |
+-----+-----------------------+-----------------------+
|4    |2024-04-15 17:16:05.386|2024

In [None]:
from pyspark.streaming import StreamingContext

# Socket word count with DStreams: count the words received in each 5-second batch.
# Relies on `sc` (the SparkContext) already existing in the notebook session.

# Create a StreamingContext with batch interval of 5 seconds
ssc = StreamingContext(sc, 5)

# Create a DStream that will connect to localhost at port 9999
# Start a Netcat server first so there is something to connect to: nc -lk 9999
lines = ssc.socketTextStream('localhost', 9999)

# Split each line into words
words = lines.flatMap(lambda line: line.split(" "))

# Count each word in each batch
pairs = words.map(lambda word: (word, 1))
wordCounts = pairs.reduceByKey(lambda x, y: x + y)

# Print the first ten elements of each RDD generated in these DStreams to the console
# (both the raw lines and the per-batch word counts)
lines.pprint()
wordCounts.pprint()

ssc.start()  # Start the computation
print("Start")
ssc.awaitTermination(20)  # Wait up to 20 seconds for the computation to terminate
ssc.stop(stopSparkContext=False)  # Stop the StreamingContext without stopping the SparkContext

print("Finished")

from pyspark.streaming import StreamingContext

# Create a queue of RDDs from a text file (read with a minimum of 8 partitions)
rdd = sc.textFile('../data/adj_noun_pairs.txt', 8)

# Randomly split the rdd into 5 parts with equal weights (seed 123);
# the parts are only approximately the same size
rddQueue = rdd.randomSplit([1,1,1,1,1], 123)

# Create a StreamingContext with batch interval of 5 seconds
ssc = StreamingContext(sc, 5)

# Feed the rdd queue to a DStream
lines = ssc.queueStream(rddQueue)

# Do word-counting as before
words = lines.flatMap(lambda line: line.split(" "))
pairs = words.map(lambda word: (word, 1))
wordCounts = pairs.reduceByKey(lambda x, y: x + y)

# Use transform() to access any rdd transformations not directly available in SparkStreaming
# Here: sort each batch's (word, count) pairs by count, largest first
topWords = wordCounts.transform(lambda rdd: rdd.sortBy(lambda x: x[1], False))
topWords.pprint()

ssc.start()  # Start the computation
ssc.awaitTermination(25)  # Wait up to 25 seconds for the computation to terminate
ssc.stop(False)  # False = stopSparkContext: keep the SparkContext alive
print("Finished")

# Find the most positive words in windows of 5 seconds from streaming data
# URL for data: https://cse.hkust.edu.hk/msbd5003/data/AFINN-111.txt

from pyspark.streaming import StreamingContext

def parse_line(l):
    """Turn one tab-separated line into a (word, score) pair.

    The first field is returned unchanged; the second is converted to float.
    """
    fields = l.split("\t")
    word = fields[0]
    score = float(fields[1])
    return (word, score)

# Static pair RDD of (word, sentiment score), parsed once and cached because it is
# joined against every batch below
word_sentiments = sc.textFile("../data/AFINN-111.txt") \
                    .map(parse_line).cache()

# 5-second batches fed from a queue of 5 random splits of the input file (seed 123)
ssc = StreamingContext(sc, 5)
rdd = sc.textFile('../data/adj_noun_pairs.txt', 8)
rddQueue = rdd.randomSplit([1,1,1,1,1], 123)
lines = ssc.queueStream(rddQueue)

# Per-batch word frequencies: (word, count)
word_counts = lines.flatMap(lambda line: line.split(" ")) \
                   .map(lambda word: (word, 1)) \
                   .reduceByKey(lambda a, b: a + b)

# Determine the words with the highest sentiment values by joining the streaming RDD
# with the static RDD inside the transform() method and then multiplying
# the frequency of the words by its sentiment value.
# After the join each element is (word, (sentiment, count)); the map turns it into
# (sentiment * count, word) so that sortByKey(False) orders by that product, descending.
happiest_words = word_counts.transform(lambda rdd: word_sentiments.join(rdd)) \
                            .map(lambda t:
                                 (t[1][0] * t[1][1], t[0])) \
                            .transform(lambda rdd: rdd.sortByKey(False))

# Print the leading elements of each batch's result
happiest_words.pprint()

ssc.start()
ssc.awaitTermination(25)  # Wait up to 25 seconds
ssc.stop(False)  # False = stopSparkContext: keep the SparkContext alive
print("Finished")

from pyspark.streaming import StreamingContext

# Stateful word count

# StreamingContext with a batch interval of 5 seconds
ssc = StreamingContext(sc, 5)
# Provide a checkpointing directory.  Required for stateful transformations
# (updateStateByKey is used further down in this cell)
ssc.checkpoint("checkpoint")

# Input: the text file randomly split into 10 equally weighted parts (seed 123),
# fed to the stream as a queue of RDDs
rdd = sc.textFile('../data/adj_noun_pairs.txt', 8)
rddQueue = rdd.randomSplit([1]*10, 123)
lines = ssc.queueStream(rddQueue)

def updateFunc(newValues, runningCount):
    """Fold this batch's values for a key into its running total.

    Args:
        newValues: iterable of the new counts for the key in the current batch.
        runningCount: total accumulated so far, or None if there is none yet.

    Returns:
        The previous total (0 when None) plus the sum of ``newValues``.
    """
    previous_total = 0 if runningCount is None else runningCount
    return sum(newValues, previous_total)

# Per-batch word counts, folded into a running total per word by updateFunc
running_counts = lines.flatMap(lambda line: line.split(" "))\
                      .map(lambda word: (word, 1))\
                      .reduceByKey(lambda a, b: a + b) \
                      .updateStateByKey(updateFunc)

# Order each batch's running totals by count, largest first
counts_sorted = running_counts.transform(lambda rdd: rdd.sortBy(lambda x: x[1], False))

def printResults(rdd):
    """Print a short summary of one batch of running word counts.

    Prints the number of elements in ``rdd``, its first five elements, and the
    running count stored under the key 'refinery' (0 if the key is absent).

    Args:
        rdd: pair RDD of (word, count) passed in by ``foreachRDD``.
    """
    print("Total distinct words: ", rdd.count())
    print(rdd.take(5))
    # lookup() returns a list of the values stored under the key. It is empty
    # until 'refinery' has shown up in the stream, so fall back to 0 rather
    # than indexing blindly and raising IndexError.
    matches = rdd.lookup('refinery')
    print('refinery:', matches[0] if matches else 0)

# Call printResults on the RDD of every batch
counts_sorted.foreachRDD(printResults)

ssc.start()
ssc.awaitTermination(50)  # Wait up to 50 seconds
ssc.stop(False)  # False = stopSparkContext: keep the SparkContext alive
print("Finished")

# MG algorithm for approximate word count

from pyspark.streaming import StreamingContext

# Globals shared by updateFunc and printResults below:
k = 10000            # printResults computes a new threshold once a batch holds more than k words
threshold = 0        # amount updateFunc subtracts from every running count
total_decrement = 0  # sum of all thresholds so far; printResults adds it to a count for its second printed figure

ssc = StreamingContext(sc, 5)
# Provide a checkpointing directory.  Required for stateful transformations
ssc.checkpoint("checkpoint")

# Input: the text file randomly split into 10 equally weighted parts (seed 123),
# fed to the stream as a queue of RDDs
rdd = sc.textFile('../data/adj_noun_pairs.txt', 8)
rddQueue = rdd.randomSplit([1]*10, 123)
lines = ssc.queueStream(rddQueue)

def updateFunc(newValues, runningCount):
    """Update a key's running count and apply the current decrement.

    Adds the new values to the previous running count (0 when None) and then
    subtracts the module-level ``threshold``.

    Returns:
        The decremented count if it is still positive, otherwise None.
    """
    base = runningCount if runningCount is not None else 0
    remaining = sum(newValues, base) - threshold
    if remaining > 0:
        return remaining
    return None

# Per-batch word counts, folded into the per-word state by updateFunc
# (which also subtracts the current threshold from each count)
running_counts = lines.flatMap(lambda line: line.split(" "))\
                      .map(lambda word: (word, 1))\
                      .reduceByKey(lambda a, b: a + b) \
                      .updateStateByKey(updateFunc)

# Order each batch's counts by value, largest first (printResults relies on this order)
counts_sorted = running_counts.transform(lambda rdd: rdd.sortBy(lambda x: x[1], False))

def printResults(rdd):
    """Report one batch of counts and choose the decrement for the next batch.

    Prints the number of elements in ``rdd``, the first five entries as
    (word, count, count + total_decrement), and the same two figures for the
    word 'refinery' (count 0 if it is absent).

    It then updates the module-level state:
      * ``threshold`` becomes the count of the entry at position ``k``
        (0-based) when there are more than ``k`` entries, otherwise 0.
      * ``total_decrement`` grows by the new ``threshold``.

    Args:
        rdd: pair RDD of (word, count); the position lookup assumes it is
            already sorted by count, largest first.
    """
    global threshold, total_decrement
    rdd.cache()  # the RDD is traversed several times below
    try:
        distinct_words = rdd.count()
        print("Total distinct words: ", distinct_words)
        print(rdd.map(lambda x: (x[0], x[1], x[1] + total_decrement)).take(5))

        # lookup() returns a list of values; empty means the word is not tracked
        matches = rdd.lookup('refinery')
        lower_bound = matches[0] if matches else 0
        print('refinery:', lower_bound, ',', lower_bound + total_decrement)

        if distinct_words > k:
            # Key every entry by its position and take the count found at position k
            threshold = rdd.zipWithIndex().map(lambda x: (x[1], x[0])).lookup(k)[0][1]
        else:
            # Reset explicitly so a previous batch's threshold is not applied again
            threshold = 0
        print("Next threshold = ", threshold)
        total_decrement += threshold
    finally:
        rdd.unpersist()

# Call printResults on the RDD of every batch; it also sets the next threshold
counts_sorted.foreachRDD(printResults)

ssc.start()
ssc.awaitTermination(50)  # Wait up to 50 seconds
ssc.stop(False)  # False = stopSparkContext: keep the SparkContext alive
print("Finished")

from pyspark.streaming import StreamingContext

# Create a queue of RDDs: five RDDs, the i-th one holding five copies of i
rddQueue = []
for i in range(5):
    rdd = sc.parallelize([i, i, i, i, i])
    rddQueue += [rdd]

# Create a StreamingContext with batch interval of 3 seconds
ssc = StreamingContext(sc, 3)

# Checkpoint directory for the streaming context
# NOTE(review): presumably required by the inverse-function variant below — confirm
ssc.checkpoint("checkpoint")

# Feed the rdd queue to a DStream
nums = ssc.queueStream(rddQueue)

# Reduce over a sliding window of 9 seconds, re-evaluated every 3 seconds.
# The active line computes the MAXIMUM of the window (despite the name slidingSum)
# and passes None as the inverse function.
slidingSum = nums.reduceByWindow(lambda x, y: max(x,y), None, 9, 3)
# Alternative: a true sliding sum, passing subtraction as the inverse function
#slidingSum = nums.reduceByWindow(lambda x, y: x + y, lambda x, y: x - y, 9, 3)

slidingSum.pprint()

ssc.start()  # Start the computation
ssc.awaitTermination(24)  # Wait up to 24 seconds for the computation to terminate
ssc.stop(False)  # False = stopSparkContext: keep the SparkContext alive
print("Finished")

# Word count using structured streaming: Complete mode vs update mode

# Import only what this cell uses. A wildcard import from pyspark.sql.functions
# rebinds built-ins such as sum and max to Spark column functions for the whole
# notebook, which breaks the DStream cells that call them when they are re-run.
from pyspark.sql.functions import explode, split

# Streaming DataFrame of text lines read from a socket on localhost:9999
# (start a Netcat server first: nc -lk 9999)
lines = (
    spark.readStream
    .format('socket')
    .option('host', 'localhost')
    .option('port', '9999')
    .load()
)

# Split the lines into words:
# split() splits each line into an array, and explode() turns the array into multiple rows
words = lines.select(explode(split(lines.value, ' ')).alias('word'))

# Running count per word over everything received so far
word_counts = words.groupBy('word').count()

# Start running the query: print the complete result table every 5 seconds
query = (
    word_counts.writeStream
    .outputMode('complete')
    .format('console')
    .option('truncate', 'false')
    .trigger(processingTime='5 seconds')
    .start()
)

# Try 'update' mode as well; 'append' mode is not supported for this aggregation

# Wait up to 25 seconds, then shut the query down
query.awaitTermination(25)
query.stop()
print("Finished")

# Append mode with selection condition
# Note: complete mode not supported if no aggregation

# Import only what this cell uses. A wildcard import from pyspark.sql.functions
# rebinds built-ins such as sum and max to Spark column functions for the whole
# notebook, which breaks the DStream cells that call them when they are re-run.
from pyspark.sql.functions import explode, length, split

# Streaming DataFrame of text lines read from a socket on localhost:9999
# (start a Netcat server first: nc -lk 9999)
lines = (
    spark.readStream
    .format('socket')
    .option('host', 'localhost')
    .option('port', '9999')
    .load()
)

# Split the lines into words:
# split() splits each line into an array, and explode() turns the array into multiple rows
words = lines.select(explode(split(lines.value, ' ')).alias('word'))

# Keep only words of at least 3 characters
long_words = words.filter(length(words['word']) >= 3)

# Start running the query: print the newly arrived rows every 5 seconds
query = (
    long_words.writeStream
    .outputMode('append')
    .format('console')
    .option('truncate', 'false')
    .trigger(processingTime='5 seconds')
    .start()
)

# Wait up to 25 seconds, then shut the query down
query.awaitTermination(25)
query.stop()
print("Finished")

# Import only what this cell uses. A wildcard import from pyspark.sql.functions
# rebinds built-ins such as sum and max to Spark column functions for the whole
# notebook, which breaks the DStream cells that call them when they are re-run.
from pyspark.sql.functions import explode, split, window

# Streaming DataFrame of text lines read from a socket on localhost:9999;
# includeTimestamp adds a 'timestamp' column next to 'value'
lines = (
    spark.readStream
    .format('socket')
    .option('host', 'localhost')
    .option('port', '9999')
    .option('includeTimestamp', 'true')
    .load()
)

# Split the lines into words, retaining timestamps
# split() splits each line into an array, and explode() turns the array into multiple rows
words = lines.select(
    explode(split(lines.value, ' ')).alias('word'),
    lines.timestamp,
)

# Count each word per 10-second window, with windows sliding every 5 seconds
windowedCounts = (
    words
    .groupBy(window(words.timestamp, "10 seconds", "5 seconds"), words.word)
    .count()
)

# Start running the query: print the rows that changed every 5 seconds
query = (
    windowedCounts.writeStream
    .outputMode('update')
    .format('console')
    .option('truncate', 'false')
    .trigger(processingTime='5 seconds')
    .start()
)

# Wait up to 25 seconds, then shut the query down
query.awaitTermination(25)
query.stop()
print("Finished")

# Rate source (for testing) - Generates data at the specified number of rows per second, 
# each output row contains a timestamp and value. Where timestamp is a Timestamp type 
# containing the time of message dispatch, and value is of Long type containing the message count, 
# starting from 0 as the first row. This source is intended for testing and benchmarking.

from pyspark.sql.functions import *

# Streaming DataFrame backed by the built-in 'rate' test source (default options)
streamingDf = spark.readStream.format('rate').load()

# Echo every micro-batch to the console, untruncated, on a 5-second trigger
query = (
    streamingDf.writeStream
    .outputMode('append')
    .format('console')
    .option('truncate', 'false')
    .trigger(processingTime='5 seconds')
    .start()
)

# Wait up to 25 seconds, then shut the query down
query.awaitTermination(25)
query.stop()
print("Finished")

# Stream–Static Joins

from pyspark.sql.functions import *

# Small static lookup table: id -> fruit name
staticDf = spark.createDataFrame(
    [(1, 'apple'), (2, 'orange'), (10, 'banana'), (12, 'apple')],
    ['id', 'name'],
)

# Streaming side: the built-in 'rate' test source
streamingDf = spark.readStream.format('rate').load()

# Join the stream to the static table on value == id and keep a running count
# per name. Stream-static joins also support leftOuter, but not rightOuter.
query = (
    streamingDf
    .join(staticDf, streamingDf['value'] == staticDf['id'])
    .groupBy('name')
    .count()
    .writeStream
    .outputMode('complete')
    .format('console')
    .option('truncate', 'false')
    .trigger(processingTime='5 seconds')
    .start()
)

# Wait up to 25 seconds, then shut the query down
query.awaitTermination(25)
query.stop()
print("Finished")

# Stream-stream Joins
spark.conf.set('spark.sql.shuffle.partitions', 4) 

from pyspark.sql.functions import *

# Left side: 'rate' test source with default options
streamingDf = spark.readStream.format('rate').load()

# Right side: a second 'rate' source configured for 2 rows per second
streamingDf2 = (
    spark.readStream
    .format('rate')
    .option('rowsPerSecond', 2)
    .load()
)

# Equi-join the two streams on 'value' and print each micro-batch of matches
query = (
    streamingDf
    .join(streamingDf2, 'value')
    .writeStream
    .outputMode('append')
    .format('console')
    .option('truncate', 'false')
    .trigger(processingTime='5 seconds')
    .start()
)

# Wait up to 25 seconds, then shut the query down
query.awaitTermination(25)
query.stop()
print("Finished")

# Stream–stream Joins with watermarking
spark.conf.set('spark.sql.shuffle.partitions', 4) 

from pyspark.sql.functions import *

# Left side: 'rate' test source with default options
streamingDf = spark.readStream.format('rate').load()

# Right side: a second 'rate' source configured for 2 rows per second
streamingDf2 = (
    spark.readStream
    .format('rate')
    .option('rowsPerSecond', 2)
    .load()
)

# Put a 3-second watermark on the 'timestamp' column of each side, then
# equi-join the two streams on 'value' and print each micro-batch of matches
query = (
    streamingDf.withWatermark('timestamp', '3 seconds')
    .join(streamingDf2.withWatermark('timestamp', '3 seconds'), 'value')
    .writeStream
    .outputMode('append')
    .format('console')
    .option('truncate', 'false')
    .trigger(processingTime='5 seconds')
    .start()
)

# Wait up to 50 seconds, then shut the query down
query.awaitTermination(50)
query.stop()
print("Finished")

#The current watermark is computed by looking at the MAX(eventTime) seen across all of the partitions in 
#the query minus a user specified delayThreshold. Due to the cost of coordinating this value across partitions,
#the actual watermark used is only guaranteed to be at least delayThreshold behind the actual event time. 
#In some cases we may still process records that arrive more than delayThreshold late.