## RDD

In [8]:
from pyspark.sql.functions import *

In [20]:
rdd = sc.parallelize([("b",3), ("b",4), ("a",5)])

In [29]:
rdd.reduceByKey(lambda x, y: x if x >= y else y).collect()

[('b', 4), ('a', 5)]

## GraphFrame

In [1]:
sc.addPyFile("/home/bigbenchung/spark-3.3.2-bin-hadoop3/jars/graphframes-0.8.2-spark3.0-s_2.12.jar")

In [2]:
from graphframes import *
from pyspark.sql.functions import *

In [17]:
# Vertics DataFrame
v = spark.createDataFrame([
  ("a", "Alice"),
  ("b", "Bob"),
  ("c", "Charlie"),
  ("d", "David"),
  ("e", "Esther"),
  ("f", "Fanny"),
  ("g", "Gabby")
], ["id", "name"])

# Edges DataFrame
e = spark.createDataFrame([
  ("a", "b", "friend"),
  ("b", "a", "friend"),
  ("b", "c", "friend"),
  ("c", "b", "friend"),
  ("a", "c", "friend"),
  ("c", "a", "friend"),
  ("a", "d", "friend"),
  ("b", "c", "friend"),
  ("c", "b", "friend"),
  ("d", "e", "friend"),
  ("e", "d", "friend"),
  ("b", "a", "follow"),
  ("c", "b", "follow"),
  ("f", "c", "follow"),
  ("e", "f", "follow"),
  ("e", "d", "friend"),
  ("d", "a", "friend"),
  ("a", "e", "friend"),
  ("g", "e", "follow")
], ["src", "dst", "relationship"])

# Create a GraphFrame
g = GraphFrame(v, e)

g.vertices.show()
g.edges.show()

+---+-------+
| id|   name|
+---+-------+
|  a|  Alice|
|  b|    Bob|
|  c|Charlie|
|  d|  David|
|  e| Esther|
|  f|  Fanny|
|  g|  Gabby|
+---+-------+

+---+---+------------+
|src|dst|relationship|
+---+---+------------+
|  a|  b|      friend|
|  b|  a|      friend|
|  b|  c|      friend|
|  c|  b|      friend|
|  a|  c|      friend|
|  c|  a|      friend|
|  a|  d|      friend|
|  b|  c|      friend|
|  c|  b|      friend|
|  d|  e|      friend|
|  e|  d|      friend|
|  b|  a|      follow|
|  c|  b|      follow|
|  f|  c|      follow|
|  e|  f|      follow|
|  e|  d|      friend|
|  d|  a|      friend|
|  a|  e|      friend|
|  g|  e|      follow|
+---+---+------------+



In [28]:
result = g.find("(a)-[e1]->(b);(b)-[e2]->(c);(c)-[e3]->(a)") \
        .filter('e1.relationship = "friend" and e2.relationship = "friend" and e3.relationship = "friend"') \
        .filter('a.id < b.id').filter('b.id < c.id') \
        .select("a.name", "b.name", "c.name")

In [29]:
result.show()

+-----+----+-------+
| name|name|   name|
+-----+----+-------+
|Alice| Bob|Charlie|
|Alice| Bob|Charlie|
+-----+----+-------+



## Streaming

In [1]:
from pyspark.streaming import StreamingContext

In [2]:
# Create a StreamingContext with batch interval of 5 seconds
ssc = StreamingContext(sc, 5)
ssc.checkpoint("checkpoint")

In [3]:
# Create a DStream that will connect to localhost at port 9999
# Start Netcat server: nc -lk 9999 
# lines = ssc.socketTextStream('localhost', 9999)

rdd = sc.textFile("data/adj_noun_pairs.txt", 8).map(lambda l: tuple(l.split())).filter(lambda p: len(p)==2)
rddQueue = rdd.randomSplit([1]*10, 123)
lines = ssc.queueStream(rddQueue)

In [7]:
def updateFunc(newValues, runningCount):
    if runningCount is None:
        runningCount = 0
    return sum(newValues, runningCount)
    # add the new values with the previous running count to get the new count
    
running_counts = lines.map(lambda word: (word, 1)).reduceByKey(lambda x, y: x + y).updateStateByKey(updateFunc)

counts_sorted = running_counts.transform(lambda rdd: rdd.sortBy(lambda x: x[1], False))

def printResult(rdd):
    print(rdd.take(10))
    
counts_sorted.foreachRDD(printResult)

ssc.start()  # Start the computation
print("Start")
ssc.awaitTermination(50)  # Wait for the computation to terminate
ssc.stop(False)
print("Finished")

24/05/20 18:14:01 WARN QueueInputDStream: queueStream doesn't support checkpointing
Start


                                                                                

24/05/20 18:14:09 WARN QueueInputDStream: queueStream doesn't support checkpointing
[(('external', 'link'), 836), (('19th', 'century'), 327), (('same', 'time'), 280), (('20th', 'century'), 266), (('first', 'time'), 264), (('other', 'hand'), 236), (('large', 'number'), 227), (('civil', 'war'), 211), (('political', 'party'), 201), (('recent', 'year'), 189)]


                                                                                

24/05/20 18:14:14 WARN QueueInputDStream: queueStream doesn't support checkpointing
24/05/20 18:14:14 WARN QueueInputDStream: queueStream doesn't support checkpointing
[(('external', 'link'), 1622), (('19th', 'century'), 608), (('same', 'time'), 555), (('20th', 'century'), 544), (('first', 'time'), 532), (('other', 'hand'), 426), (('large', 'number'), 419), (('civil', 'war'), 412), (('recent', 'year'), 404), (('political', 'party'), 391)]


                                                                                

24/05/20 18:14:19 WARN QueueInputDStream: queueStream doesn't support checkpointing
24/05/20 18:14:19 WARN QueueInputDStream: queueStream doesn't support checkpointing
[(('external', 'link'), 2427), (('19th', 'century'), 897), (('20th', 'century'), 834), (('same', 'time'), 830), (('first', 'time'), 799), (('civil', 'war'), 654), (('large', 'number'), 630), (('other', 'hand'), 629), (('political', 'party'), 571), (('recent', 'year'), 564)]
24/05/20 18:14:19 WARN QueueInputDStream: queueStream doesn't support checkpointing


                                                                                

24/05/20 18:14:24 WARN QueueInputDStream: queueStream doesn't support checkpointing
[(('external', 'link'), 3248), (('19th', 'century'), 1187), (('20th', 'century'), 1128), (('same', 'time'), 1115), (('first', 'time'), 1059), (('civil', 'war'), 909), (('large', 'number'), 852), (('other', 'hand'), 836), (('political', 'party'), 762), (('recent', 'year'), 756)]


                                                                                

[(('external', 'link'), 4075), (('19th', 'century'), 1457), (('20th', 'century'), 1388), (('same', 'time'), 1359), (('first', 'time'), 1310), (('civil', 'war'), 1137), (('large', 'number'), 1065), (('other', 'hand'), 1036), (('political', 'party'), 953), (('recent', 'year'), 944)]


                                                                                

24/05/20 18:14:36 WARN QueueInputDStream: queueStream doesn't support checkpointing
24/05/20 18:14:36 WARN QueueInputDStream: queueStream doesn't support checkpointing


                                                                                

[(('external', 'link'), 4890), (('19th', 'century'), 1741), (('20th', 'century'), 1676), (('same', 'time'), 1665), (('first', 'time'), 1558), (('civil', 'war'), 1352), (('large', 'number'), 1282), (('other', 'hand'), 1228), (('political', 'party'), 1149), (('recent', 'year'), 1129)]


                                                                                

24/05/20 18:14:47 WARN QueueInputDStream: queueStream doesn't support checkpointing
24/05/20 18:14:47 WARN QueueInputDStream: queueStream doesn't support checkpointing


                                                                                

[(('external', 'link'), 5727), (('19th', 'century'), 2027), (('20th', 'century'), 1950), (('same', 'time'), 1937), (('first', 'time'), 1826), (('civil', 'war'), 1561), (('large', 'number'), 1498), (('other', 'hand'), 1443), (('political', 'party'), 1337), (('other', 'country'), 1301)]


[Stage 55:>                                                       (0 + 12) / 12]

24/05/20 18:14:51 WARN BatchedWriteAheadLog: BatchedWriteAheadLog Writer queue interrupted.




24/05/20 18:14:55 ERROR JobScheduler: Error generating jobs for time 1716200085000 ms
org.apache.spark.SparkException: An exception was raised by Python:
Traceback (most recent call last):
  File "/home/bigbenchung/spark-3.3.2-bin-hadoop3/python/pyspark/streaming/util.py", line 71, in call
    r = self.func(t, *rdds)
  File "/home/bigbenchung/spark-3.3.2-bin-hadoop3/python/pyspark/streaming/dstream.py", line 410, in func
    return oldfunc(rdd)  # type: ignore[arg-type, call-arg]
  File "/tmp/ipykernel_21948/1068134567.py", line 9, in <lambda>
    counts_sorted = running_counts.transform(lambda rdd: rdd.sortBy(lambda x: x[1], False))
  File "/home/bigbenchung/spark-3.3.2-bin-hadoop3/python/pyspark/rdd.py", line 1037, in sortBy
    self.keyBy(keyfunc)  # type: ignore[type-var]
  File "/home/bigbenchung/spark-3.3.2-bin-hadoop3/python/pyspark/rdd.py", line 995, in sortByKey
    rddSize = self.count()
  File "/home/bigbenchung/spark-3.3.2-bin-hadoop3/python/pyspark/rdd.py", line 1521, in c

                                                                                

[(('external', 'link'), 6503), (('19th', 'century'), 2328), (('20th', 'century'), 2238), (('same', 'time'), 2221), (('first', 'time'), 2096), (('civil', 'war'), 1775), (('large', 'number'), 1688), (('other', 'hand'), 1654), (('political', 'party'), 1537), (('other', 'country'), 1463)]
Finished




In [4]:
def updateFunc(newValues, runningCount):
    if runningCount is None:
        runningCount = 0
    return sum(newValues, runningCount)
    # add the new values with the previous running count to get the new count
    
running_counts = lines.map(lambda word: (word, 1)).updateStateByKey(updateFunc)

counts_sorted = running_counts.map(lambda x: (x[0][1], x)) \
                .reduceByKey(lambda x,y: x if x[1] > y[1] else y) \
                .map(lambda x: x[1]) \
                .transform(lambda rdd: rdd.sortBy(lambda x: x[1], False))

def printResult(rdd):
    print(rdd.take(10))
    
counts_sorted.foreachRDD(printResult)

ssc.start()  # Start the computation
print("Start")
ssc.awaitTermination(50)  # Wait for the computation to terminate
ssc.stop(False)
print("Finished")

24/05/20 18:32:47 WARN QueueInputDStream: queueStream doesn't support checkpointing
Start


                                                                                

[(('external', 'link'), 836), (('19th', 'century'), 327), (('same', 'time'), 280), (('other', 'hand'), 236), (('large', 'number'), 227), (('civil', 'war'), 211), (('political', 'party'), 201), (('recent', 'year'), 189), (('other', 'country'), 179), (('many', 'people'), 174)]


                                                                                

24/05/20 18:32:59 WARN QueueInputDStream: queueStream doesn't support checkpointing
24/05/20 18:32:59 WARN QueueInputDStream: queueStream doesn't support checkpointing
24/05/20 18:32:59 WARN QueueInputDStream: queueStream doesn't support checkpointing
[(('external', 'link'), 1622), (('19th', 'century'), 608), (('same', 'time'), 555), (('other', 'hand'), 426), (('large', 'number'), 419), (('civil', 'war'), 412), (('recent', 'year'), 404), (('political', 'party'), 391), (('other', 'country'), 360), (('many', 'people'), 333)]
24/05/20 18:32:59 WARN QueueInputDStream: queueStream doesn't support checkpointing


                                                                                

24/05/20 18:33:03 WARN QueueInputDStream: queueStream doesn't support checkpointing
[(('external', 'link'), 2427), (('19th', 'century'), 897), (('same', 'time'), 830), (('civil', 'war'), 654), (('large', 'number'), 630), (('other', 'hand'), 629), (('political', 'party'), 571), (('recent', 'year'), 564), (('other', 'country'), 549), (('many', 'people'), 500)]
24/05/20 18:33:03 WARN QueueInputDStream: queueStream doesn't support checkpointing


                                                                                

24/05/20 18:33:09 WARN QueueInputDStream: queueStream doesn't support checkpointing
[(('external', 'link'), 3248), (('19th', 'century'), 1187), (('same', 'time'), 1115), (('civil', 'war'), 909), (('large', 'number'), 852), (('other', 'hand'), 836), (('political', 'party'), 762), (('recent', 'year'), 756), (('other', 'country'), 741), (('many', 'people'), 653)]
24/05/20 18:33:09 WARN QueueInputDStream: queueStream doesn't support checkpointing


                                                                                

24/05/20 18:33:14 WARN QueueInputDStream: queueStream doesn't support checkpointing
[(('external', 'link'), 4075), (('19th', 'century'), 1457), (('same', 'time'), 1359), (('civil', 'war'), 1137), (('large', 'number'), 1065), (('other', 'hand'), 1036), (('political', 'party'), 953), (('recent', 'year'), 944), (('other', 'country'), 921), (('many', 'people'), 811)]
24/05/20 18:33:14 WARN QueueInputDStream: queueStream doesn't support checkpointing


                                                                                

[(('external', 'link'), 4890), (('19th', 'century'), 1741), (('same', 'time'), 1665), (('civil', 'war'), 1352), (('large', 'number'), 1282), (('other', 'hand'), 1228), (('political', 'party'), 1149), (('recent', 'year'), 1129), (('other', 'country'), 1095), (('many', 'people'), 1015)]


                                                                                

24/05/20 18:33:25 WARN QueueInputDStream: queueStream doesn't support checkpointing
[(('external', 'link'), 5727), (('19th', 'century'), 2027), (('same', 'time'), 1937), (('civil', 'war'), 1561), (('large', 'number'), 1498), (('other', 'hand'), 1443), (('political', 'party'), 1337), (('other', 'country'), 1301), (('recent', 'year'), 1283), (('many', 'people'), 1207)]


                                                                                

24/05/20 18:33:32 WARN QueueInputDStream: queueStream doesn't support checkpointing
24/05/20 18:33:32 WARN QueueInputDStream: queueStream doesn't support checkpointing
[(('external', 'link'), 6503), (('19th', 'century'), 2328), (('same', 'time'), 2221), (('civil', 'war'), 1775), (('large', 'number'), 1688), (('other', 'hand'), 1654), (('political', 'party'), 1537), (('other', 'country'), 1463), (('recent', 'year'), 1452), (('many', 'people'), 1399)]


[Stage 88:====>                                                   (1 + 11) / 12]

24/05/20 18:33:37 WARN BatchedWriteAheadLog: BatchedWriteAheadLog Writer queue interrupted.




24/05/20 18:33:38 WARN QueueInputDStream: queueStream doesn't support checkpointing
[(('external', 'link'), 7340), (('19th', 'century'), 2595), (('same', 'time'), 2482), (('civil', 'war'), 2005), (('large', 'number'), 1895), (('other', 'hand'), 1838), (('political', 'party'), 1712), (('other', 'country'), 1662), (('recent', 'year'), 1640), (('many', 'people'), 1564)]
Finished
