## RDD

In [8]:
from pyspark.sql.functions import *

In [20]:
rdd = sc.parallelize([("b",3), ("b",4), ("a",5)])

In [29]:
rdd.reduceByKey(lambda x, y: x if x >= y else y).collect()

[('b', 4), ('a', 5)]

## DataFrame

In [1]:
from pyspark.sql.functions import *

In [2]:
dfCustomer = spark.read.csv('data/Customer.csv', header=True, inferSchema=True)
dfDetail = spark.read.csv('data/SalesOrderDetail.csv', header=True, inferSchema=True)
dfHeader = spark.read.csv('data/SalesOrderHeader.csv', header=True, inferSchema=True)

In the example relational database used in the lectures, each customer is served by a salesperson, which is recorded in the "SalesPerson" column in the "Customer" table (see the database schema at the header of each file).

Use DataFrame API to generate a report of each salesperson's performance, which is the total net price of his/her customer's orders.  Note that an order consists of multiple order details, and the net price of an order detail is UnitPrice * OrderQty * (1 - UnitPriceDiscount).

All salespeople should be reported, even if they have 0 order.

In [2]:
dfCustomer.printSchema()

root
 |-- CustomerID: integer (nullable = true)
 |-- NameStyle: integer (nullable = true)
 |-- Title: string (nullable = true)
 |-- FirstName: string (nullable = true)
 |-- MiddleName: string (nullable = true)
 |-- LastName: string (nullable = true)
 |-- Suffix: string (nullable = true)
 |-- CompanyName: string (nullable = true)
 |-- SalesPerson: string (nullable = true)
 |-- EmailAddress: string (nullable = true)
 |-- Phone: string (nullable = true)
 |-- PasswordHash: string (nullable = true)
 |-- PasswordSalt: string (nullable = true)
 |-- rowguid: string (nullable = true)
 |-- ModifiedDate: timestamp (nullable = true)



In [3]:
dfHeader.printSchema()

root
 |-- SalesOrderID: integer (nullable = true)
 |-- RevisionNumber: integer (nullable = true)
 |-- OrderDate: timestamp (nullable = true)
 |-- DueDate: timestamp (nullable = true)
 |-- ShipDate: timestamp (nullable = true)
 |-- Status: integer (nullable = true)
 |-- OnlineOrderFlag: integer (nullable = true)
 |-- SalesOrderNumber: string (nullable = true)
 |-- PurchaseOrderNumber: string (nullable = true)
 |-- AccountNumber: string (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- ShipToAddressID: integer (nullable = true)
 |-- BillToAddressID: integer (nullable = true)
 |-- ShipMethod: string (nullable = true)
 |-- CreditCardApprovalCode: string (nullable = true)
 |-- SubTotal: double (nullable = true)
 |-- TaxAmt: double (nullable = true)
 |-- Freight: double (nullable = true)
 |-- TotalDue: double (nullable = true)
 |-- Comment: string (nullable = true)
 |-- rowguid: string (nullable = true)
 |-- ModifiedDate: timestamp (nullable = true)



In [4]:
dfDetail.printSchema()

root
 |-- SalesOrderID: integer (nullable = true)
 |-- SalesOrderDetailID: integer (nullable = true)
 |-- OrderQty: integer (nullable = true)
 |-- ProductID: integer (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- UnitPriceDiscount: double (nullable = true)
 |-- LineTotal: double (nullable = true)
 |-- rowguid: string (nullable = true)
 |-- ModifiedDate: timestamp (nullable = true)



In [5]:
dfCustomer.select("SalesPerson", "CustomerID") \
    .join(dfHeader.select("SalesOrderID", "CustomerID"), "CustomerID", "left_outer") \
    .join(dfDetail.select("SalesOrderID", "OrderQty", "UnitPrice", "UnitPriceDiscount"), "SalesOrderID", "left_outer") \
    .na.fill(value=0,subset=["OrderQty", "UnitPrice", "UnitPriceDiscount"]) \
    .withColumn('NetPrice', col("OrderQty") * col("UnitPrice") * (1 - col("UnitPriceDiscount"))) \
    .groupBy("SalesPerson").agg(sum("NetPrice").alias("TotalNetPrice")) \
    .show()

+--------------------+------------------+
|         SalesPerson|     TotalNetPrice|
+--------------------+------------------+
|adventure-works\l...|170064.83797000017|
|adventure-works\p...|               0.0|
|adventure-works\d...|               0.0|
|adventure-works\shu0|112646.20642400002|
|adventure-works\m...|               0.0|
|adventure-works\j...|               0.0|
|adventure-works\j...|               0.0|
|adventure-works\jae0| 426024.1086639994|
|adventure-works\g...|               0.0|
+--------------------+------------------+



## GraphFrame

In [1]:
sc.addPyFile("/home/bigbenchung/spark-3.3.2-bin-hadoop3/jars/graphframes-0.8.2-spark3.0-s_2.12.jar")

In [2]:
from graphframes import *
from pyspark.sql.functions import *

Given a directed graph without self-loops, write a Spark program using GraphFrame and DataFrame to find all pairs (x, y) such that:

1. x and y are different nodes in the graph. 
2. There exists a unique path of length 3 from x to y, i.e., x->z->w->y, where z and w cannot be equal to x or y.

Sample Input: Given in the code below
Expected Output: 

x y

a d<br>
b e<br>
b f<br>

Explanation:
1. There is only one path of length 3 from a to d, which is a->b->c->d.
2. There is only one path of length 3 from b to e, which is b->c->d->e.
3. There is only one path of length 3 from b to f, which is b->c->d->f.
4. Note that (c, g) is not included in the result because there are 2 paths of length 3 from c to g, which are c->d->e->g and c->d->f->g.

In [5]:
from graphframes import * 
from pyspark.sql.functions import * 

v = spark.createDataFrame([ 
    ("a", "node_a"), 
    ("b", "node_b"),
    ("c", "node_c"),
    ("d", "node_d"),
    ("e", "node_e"),
    ("f", "node_f"),
    ("g", "node_g")
], ["id", "name"])  

e = spark.createDataFrame([
    ("a", "b"),
    ("b", "c"),
    ("c", "d"),
    ("d", "c"),
    ("d", "e"),
    ("d", "f"),
    ("e", "g"),
    ("f", "g")
], ["src", "dst"]) 

g = GraphFrame(v, e)

In [20]:
result = g.find("(a)-[e1]->(b);(b)-[e2]->(c);(c)-[e3]->(d)") \
        .filter("b != a AND b != d AND c != a AND c != d") \
        .select(col("a.id").alias("x"), col("d.id").alias("y")) \
        .groupBy("x", "y").count() \
        .filter("count = 1") \
        .select("x", "y")
result.show()

+---+---+
|  x|  y|
+---+---+
|  b|  e|
|  a|  d|
|  b|  f|
+---+---+



## Streaming

Grade(<u>cid</u>, <u>sid</u>, score)

Columns with underscores are primary keys. The records of Grade are in a streaming fashion, with each record representing a student with student ID sid achieving a score for course ID cid, where the score can be one of 'A', 'B', 'C', or 'F'. 

Given a DStream representing the Grade table, write a Spark Streaming program to dynamically maintain the number of students achieving 'A', 'B', 'C', and 'F' for each course. After each RDD, print the current state(order doesn't matter).

Example:
RDD at time 1
('103', '1', 'B')

RDD at time 2
('101', '2', 'A')

RDD at time 3
('101', '1', 'A')
('101', '3', 'B')
('102', '2', 'F')

Expected output:
After RDD at time 1
('103', {'B': 1})

After RDD at time 2
('101', {'A': 1})
('103', {'B': 1})

After RDD at time 3
('102', {'F': 1})
('101', {'A': 2, 'B': 1})
('103', {'B': 1})

Explanation:
After time 1, we know that there is 1 student in Course 103 who received a grade 'B'.
After time 2, there is 1 student in Course 101 who received a grade 'A', and 1 student in Course 103 who received a grade 'B'.
After time 3, there are 2 students in Course 101 who received a grade 'A', and 1 student who received a grade 'B'. In Course 103, there is 1 student who received a grade 'B'. In Course 102, there is 1 student who received a grade 'F'.

In [9]:
from pyspark.streaming import StreamingContext

ssc = StreamingContext(sc, 5)
# Provide a checkpointing directory. Required for stateful transformations
ssc.checkpoint("checkpoint")

numPartitions = 8
rdd = sc.textFile('data/grade.txt', numPartitions)
rddQueue = rdd.randomSplit([1]*3, 123)
lines = ssc.queueStream(rddQueue)
grade = lines.map(lambda l: tuple(l.strip().split(',')))

In [8]:
def updateFunc(score, cumulativeGrades):
    
    if cumulativeGrades is None:
        cumulativeGrades = list()
    
    return cumulativeGrades + score

running_tracker = grade.map(lambda x: (x[0], x[2])).updateStateByKey(updateFunc)

def printResult(rdd):
    print(rdd.collect())
    
running_tracker.foreachRDD(printResult)

ssc.start()
ssc.awaitTermination(30)
ssc.stop(False)
print("Finished")

24/05/20 21:20:03 WARN QueueInputDStream: queueStream doesn't support checkpointing
24/05/20 21:20:05 WARN QueueInputDStream: queueStream doesn't support checkpointing
[]
24/05/20 21:20:05 WARN QueueInputDStream: queueStream doesn't support checkpointing
24/05/20 21:20:10 WARN QueueInputDStream: queueStream doesn't support checkpointing
[('103', ['B']), ('101', ['A'])]
24/05/20 21:20:10 WARN QueueInputDStream: queueStream doesn't support checkpointing
24/05/20 21:20:15 WARN QueueInputDStream: queueStream doesn't support checkpointing
[('102', ['F']), ('103', ['B']), ('101', ['A', 'A', 'B'])]
24/05/20 21:20:15 WARN QueueInputDStream: queueStream doesn't support checkpointing
24/05/20 21:20:20 WARN QueueInputDStream: queueStream doesn't support checkpointing
[('102', ['F']), ('103', ['B']), ('101', ['A', 'A', 'B'])]
24/05/20 21:20:20 WARN QueueInputDStream: queueStream doesn't support checkpointing
24/05/20 21:20:25 WARN QueueInputDStream: queueStream doesn't support checkpointing
[('102

In [10]:
def updateFunc(scores, cumulativeGrades):
    
    if cumulativeGrades is None:
        cumulativeGrades = dict()
    
    for score in scores:
        if score not in cumulativeGrades.keys():
            cumulativeGrades[score] = 1
        else:
            cumulativeGrades[score] += 1
    
    return cumulativeGrades

running_tracker = grade.map(lambda x: (x[0], x[2])).updateStateByKey(updateFunc)

def printResult(rdd):
    print(rdd.collect())
    
running_tracker.foreachRDD(printResult)

ssc.start()
ssc.awaitTermination(30)
ssc.stop(False)
print("Finished")

24/05/20 21:21:24 WARN QueueInputDStream: queueStream doesn't support checkpointing
24/05/20 21:21:25 WARN QueueInputDStream: queueStream doesn't support checkpointing
[]
24/05/20 21:21:25 WARN QueueInputDStream: queueStream doesn't support checkpointing
24/05/20 21:21:30 WARN QueueInputDStream: queueStream doesn't support checkpointing
[('103', {'B': 1}), ('101', {'A': 1})]
24/05/20 21:21:30 WARN QueueInputDStream: queueStream doesn't support checkpointing
24/05/20 21:21:35 WARN QueueInputDStream: queueStream doesn't support checkpointing
[('102', {'F': 1}), ('103', {'B': 1}), ('101', {'A': 2, 'B': 1})]
24/05/20 21:21:35 WARN QueueInputDStream: queueStream doesn't support checkpointing
24/05/20 21:21:40 WARN QueueInputDStream: queueStream doesn't support checkpointing
[('102', {'F': 1}), ('103', {'B': 1}), ('101', {'A': 2, 'B': 1})]
24/05/20 21:21:40 WARN QueueInputDStream: queueStream doesn't support checkpointing
24/05/20 21:21:45 WARN QueueInputDStream: queueStream doesn't support 

In [20]:
# Create a StreamingContext with batch interval of 5 seconds
ssc = StreamingContext(sc, 5)
ssc.checkpoint("checkpoint")

24/05/20 21:09:55 WARN QueueInputDStream: queueStream doesn't support checkpointing
24/05/20 21:09:55 WARN BlockManager: Putting block rdd_366_0 failed due to exception org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/bigbenchung/spark-3.3.2-bin-hadoop3/python/lib/pyspark.zip/pyspark/worker.py", line 686, in main
    process()
  File "/home/bigbenchung/spark-3.3.2-bin-hadoop3/python/lib/pyspark.zip/pyspark/worker.py", line 678, in process
    serializer.dump_stream(out_iter, outfile)
  File "/home/bigbenchung/spark-3.3.2-bin-hadoop3/python/lib/pyspark.zip/pyspark/serializers.py", line 273, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/home/bigbenchung/spark-3.3.2-bin-hadoop3/python/lib/pyspark.zip/pyspark/util.py", line 81, in wrapper
    return f(*args, **kwargs)
  File "/home/bigbenchung/spark-3.3.2-bin-hadoop3/python/pyspark/rdd.py", line 2714, in map_values_fn
    return kv[0], f(kv[1])
  File "/home/bigbenchun



24/05/20 21:09:56 ERROR JobScheduler: Error running job streaming job 1716210595000 ms.0
org.apache.spark.SparkException: An exception was raised by Python:
Traceback (most recent call last):
  File "/home/bigbenchung/spark-3.3.2-bin-hadoop3/python/pyspark/streaming/util.py", line 71, in call
    r = self.func(t, *rdds)
  File "/home/bigbenchung/spark-3.3.2-bin-hadoop3/python/pyspark/streaming/dstream.py", line 236, in func
    return old_func(rdd)  # type: ignore[call-arg, arg-type]
  File "/tmp/ipykernel_31597/3653572892.py", line 18, in printResult
    print(rdd.collect())
  File "/home/bigbenchung/spark-3.3.2-bin-hadoop3/python/pyspark/rdd.py", line 1197, in collect
    sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
  File "/home/bigbenchung/spark-3.3.2-bin-hadoop3/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1321, in __call__
    return_value = get_return_value(
  File "/home/bigbenchung/spark-3.3.2-bin-hadoop3/python/pyspark/sql/utils.py", l

24/05/20 21:10:00 WARN QueueInputDStream: queueStream doesn't support checkpointing
24/05/20 21:10:00 WARN BlockManager: Putting block rdd_354_1 failed due to exception org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/bigbenchung/spark-3.3.2-bin-hadoop3/python/lib/pyspark.zip/pyspark/worker.py", line 686, in main
    process()
  File "/home/bigbenchung/spark-3.3.2-bin-hadoop3/python/lib/pyspark.zip/pyspark/worker.py", line 678, in process
    serializer.dump_stream(out_iter, outfile)
  File "/home/bigbenchung/spark-3.3.2-bin-hadoop3/python/lib/pyspark.zip/pyspark/serializers.py", line 273, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/home/bigbenchung/spark-3.3.2-bin-hadoop3/python/lib/pyspark.zip/pyspark/util.py", line 81, in wrapper
    return f(*args, **kwargs)
  File "/home/bigbenchung/spark-3.3.2-bin-hadoop3/python/pyspark/rdd.py", line 2714, in map_values_fn
    return kv[0], f(kv[1])
  File "/home/bigbenchun

24/05/20 21:10:00 ERROR JobScheduler: Error running job streaming job 1716210600000 ms.0
org.apache.spark.SparkException: An exception was raised by Python:
Traceback (most recent call last):
  File "/home/bigbenchung/spark-3.3.2-bin-hadoop3/python/pyspark/streaming/util.py", line 71, in call
    r = self.func(t, *rdds)
  File "/home/bigbenchung/spark-3.3.2-bin-hadoop3/python/pyspark/streaming/dstream.py", line 236, in func
    return old_func(rdd)  # type: ignore[call-arg, arg-type]
  File "/tmp/ipykernel_31597/3653572892.py", line 18, in printResult
    print(rdd.collect())
  File "/home/bigbenchung/spark-3.3.2-bin-hadoop3/python/pyspark/rdd.py", line 1197, in collect
    sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
  File "/home/bigbenchung/spark-3.3.2-bin-hadoop3/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1321, in __call__
    return_value = get_return_value(
  File "/home/bigbenchung/spark-3.3.2-bin-hadoop3/python/pyspark/sql/utils.py", l

In [3]:
# Create a DStream that will connect to localhost at port 9999
# Start Netcat server: nc -lk 9999 
# lines = ssc.socketTextStream('localhost', 9999)

rdd = sc.textFile("data/adj_noun_pairs.txt", 8).map(lambda l: tuple(l.split())).filter(lambda p: len(p)==2)
rddQueue = rdd.randomSplit([1]*10, 123)
lines = ssc.queueStream(rddQueue)

In [7]:
def updateFunc(newValues, runningCount):
    if runningCount is None:
        runningCount = 0
    return sum(newValues, runningCount)
    # add the new values with the previous running count to get the new count
    
running_counts = lines.map(lambda word: (word, 1)).reduceByKey(lambda x, y: x + y).updateStateByKey(updateFunc)

counts_sorted = running_counts.transform(lambda rdd: rdd.sortBy(lambda x: x[1], False))

def printResult(rdd):
    print(rdd.take(10))
    
counts_sorted.foreachRDD(printResult)

ssc.start()  # Start the computation
print("Start")
ssc.awaitTermination(50)  # Wait for the computation to terminate
ssc.stop(False)
print("Finished")

24/05/20 18:14:01 WARN QueueInputDStream: queueStream doesn't support checkpointing
Start


                                                                                

24/05/20 18:14:09 WARN QueueInputDStream: queueStream doesn't support checkpointing
[(('external', 'link'), 836), (('19th', 'century'), 327), (('same', 'time'), 280), (('20th', 'century'), 266), (('first', 'time'), 264), (('other', 'hand'), 236), (('large', 'number'), 227), (('civil', 'war'), 211), (('political', 'party'), 201), (('recent', 'year'), 189)]


                                                                                

24/05/20 18:14:14 WARN QueueInputDStream: queueStream doesn't support checkpointing
24/05/20 18:14:14 WARN QueueInputDStream: queueStream doesn't support checkpointing
[(('external', 'link'), 1622), (('19th', 'century'), 608), (('same', 'time'), 555), (('20th', 'century'), 544), (('first', 'time'), 532), (('other', 'hand'), 426), (('large', 'number'), 419), (('civil', 'war'), 412), (('recent', 'year'), 404), (('political', 'party'), 391)]


                                                                                

24/05/20 18:14:19 WARN QueueInputDStream: queueStream doesn't support checkpointing
24/05/20 18:14:19 WARN QueueInputDStream: queueStream doesn't support checkpointing
[(('external', 'link'), 2427), (('19th', 'century'), 897), (('20th', 'century'), 834), (('same', 'time'), 830), (('first', 'time'), 799), (('civil', 'war'), 654), (('large', 'number'), 630), (('other', 'hand'), 629), (('political', 'party'), 571), (('recent', 'year'), 564)]
24/05/20 18:14:19 WARN QueueInputDStream: queueStream doesn't support checkpointing


                                                                                

24/05/20 18:14:24 WARN QueueInputDStream: queueStream doesn't support checkpointing
[(('external', 'link'), 3248), (('19th', 'century'), 1187), (('20th', 'century'), 1128), (('same', 'time'), 1115), (('first', 'time'), 1059), (('civil', 'war'), 909), (('large', 'number'), 852), (('other', 'hand'), 836), (('political', 'party'), 762), (('recent', 'year'), 756)]


                                                                                

[(('external', 'link'), 4075), (('19th', 'century'), 1457), (('20th', 'century'), 1388), (('same', 'time'), 1359), (('first', 'time'), 1310), (('civil', 'war'), 1137), (('large', 'number'), 1065), (('other', 'hand'), 1036), (('political', 'party'), 953), (('recent', 'year'), 944)]


                                                                                

24/05/20 18:14:36 WARN QueueInputDStream: queueStream doesn't support checkpointing
24/05/20 18:14:36 WARN QueueInputDStream: queueStream doesn't support checkpointing


                                                                                

[(('external', 'link'), 4890), (('19th', 'century'), 1741), (('20th', 'century'), 1676), (('same', 'time'), 1665), (('first', 'time'), 1558), (('civil', 'war'), 1352), (('large', 'number'), 1282), (('other', 'hand'), 1228), (('political', 'party'), 1149), (('recent', 'year'), 1129)]


                                                                                

24/05/20 18:14:47 WARN QueueInputDStream: queueStream doesn't support checkpointing
24/05/20 18:14:47 WARN QueueInputDStream: queueStream doesn't support checkpointing


                                                                                

[(('external', 'link'), 5727), (('19th', 'century'), 2027), (('20th', 'century'), 1950), (('same', 'time'), 1937), (('first', 'time'), 1826), (('civil', 'war'), 1561), (('large', 'number'), 1498), (('other', 'hand'), 1443), (('political', 'party'), 1337), (('other', 'country'), 1301)]


[Stage 55:>                                                       (0 + 12) / 12]

24/05/20 18:14:51 WARN BatchedWriteAheadLog: BatchedWriteAheadLog Writer queue interrupted.




24/05/20 18:14:55 ERROR JobScheduler: Error generating jobs for time 1716200085000 ms
org.apache.spark.SparkException: An exception was raised by Python:
Traceback (most recent call last):
  File "/home/bigbenchung/spark-3.3.2-bin-hadoop3/python/pyspark/streaming/util.py", line 71, in call
    r = self.func(t, *rdds)
  File "/home/bigbenchung/spark-3.3.2-bin-hadoop3/python/pyspark/streaming/dstream.py", line 410, in func
    return oldfunc(rdd)  # type: ignore[arg-type, call-arg]
  File "/tmp/ipykernel_21948/1068134567.py", line 9, in <lambda>
    counts_sorted = running_counts.transform(lambda rdd: rdd.sortBy(lambda x: x[1], False))
  File "/home/bigbenchung/spark-3.3.2-bin-hadoop3/python/pyspark/rdd.py", line 1037, in sortBy
    self.keyBy(keyfunc)  # type: ignore[type-var]
  File "/home/bigbenchung/spark-3.3.2-bin-hadoop3/python/pyspark/rdd.py", line 995, in sortByKey
    rddSize = self.count()
  File "/home/bigbenchung/spark-3.3.2-bin-hadoop3/python/pyspark/rdd.py", line 1521, in c

                                                                                

[(('external', 'link'), 6503), (('19th', 'century'), 2328), (('20th', 'century'), 2238), (('same', 'time'), 2221), (('first', 'time'), 2096), (('civil', 'war'), 1775), (('large', 'number'), 1688), (('other', 'hand'), 1654), (('political', 'party'), 1537), (('other', 'country'), 1463)]
Finished




In [4]:
def updateFunc(newValues, runningCount):
    if runningCount is None:
        runningCount = 0
    return sum(newValues, runningCount)
    # add the new values with the previous running count to get the new count
    
running_counts = lines.map(lambda word: (word, 1)).updateStateByKey(updateFunc)

counts_sorted = running_counts.map(lambda x: (x[0][1], x)) \
                .reduceByKey(lambda x,y: x if x[1] > y[1] else y) \
                .map(lambda x: x[1]) \
                .transform(lambda rdd: rdd.sortBy(lambda x: x[1], False))

def printResult(rdd):
    print(rdd.take(10))
    
counts_sorted.foreachRDD(printResult)

ssc.start()  # Start the computation
print("Start")
ssc.awaitTermination(50)  # Wait for the computation to terminate
ssc.stop(False)
print("Finished")

24/05/20 18:32:47 WARN QueueInputDStream: queueStream doesn't support checkpointing
Start


                                                                                

[(('external', 'link'), 836), (('19th', 'century'), 327), (('same', 'time'), 280), (('other', 'hand'), 236), (('large', 'number'), 227), (('civil', 'war'), 211), (('political', 'party'), 201), (('recent', 'year'), 189), (('other', 'country'), 179), (('many', 'people'), 174)]


                                                                                

24/05/20 18:32:59 WARN QueueInputDStream: queueStream doesn't support checkpointing
24/05/20 18:32:59 WARN QueueInputDStream: queueStream doesn't support checkpointing
24/05/20 18:32:59 WARN QueueInputDStream: queueStream doesn't support checkpointing
[(('external', 'link'), 1622), (('19th', 'century'), 608), (('same', 'time'), 555), (('other', 'hand'), 426), (('large', 'number'), 419), (('civil', 'war'), 412), (('recent', 'year'), 404), (('political', 'party'), 391), (('other', 'country'), 360), (('many', 'people'), 333)]
24/05/20 18:32:59 WARN QueueInputDStream: queueStream doesn't support checkpointing


                                                                                

24/05/20 18:33:03 WARN QueueInputDStream: queueStream doesn't support checkpointing
[(('external', 'link'), 2427), (('19th', 'century'), 897), (('same', 'time'), 830), (('civil', 'war'), 654), (('large', 'number'), 630), (('other', 'hand'), 629), (('political', 'party'), 571), (('recent', 'year'), 564), (('other', 'country'), 549), (('many', 'people'), 500)]
24/05/20 18:33:03 WARN QueueInputDStream: queueStream doesn't support checkpointing


                                                                                

24/05/20 18:33:09 WARN QueueInputDStream: queueStream doesn't support checkpointing
[(('external', 'link'), 3248), (('19th', 'century'), 1187), (('same', 'time'), 1115), (('civil', 'war'), 909), (('large', 'number'), 852), (('other', 'hand'), 836), (('political', 'party'), 762), (('recent', 'year'), 756), (('other', 'country'), 741), (('many', 'people'), 653)]
24/05/20 18:33:09 WARN QueueInputDStream: queueStream doesn't support checkpointing


                                                                                

24/05/20 18:33:14 WARN QueueInputDStream: queueStream doesn't support checkpointing
[(('external', 'link'), 4075), (('19th', 'century'), 1457), (('same', 'time'), 1359), (('civil', 'war'), 1137), (('large', 'number'), 1065), (('other', 'hand'), 1036), (('political', 'party'), 953), (('recent', 'year'), 944), (('other', 'country'), 921), (('many', 'people'), 811)]
24/05/20 18:33:14 WARN QueueInputDStream: queueStream doesn't support checkpointing


                                                                                

[(('external', 'link'), 4890), (('19th', 'century'), 1741), (('same', 'time'), 1665), (('civil', 'war'), 1352), (('large', 'number'), 1282), (('other', 'hand'), 1228), (('political', 'party'), 1149), (('recent', 'year'), 1129), (('other', 'country'), 1095), (('many', 'people'), 1015)]


                                                                                

24/05/20 18:33:25 WARN QueueInputDStream: queueStream doesn't support checkpointing
[(('external', 'link'), 5727), (('19th', 'century'), 2027), (('same', 'time'), 1937), (('civil', 'war'), 1561), (('large', 'number'), 1498), (('other', 'hand'), 1443), (('political', 'party'), 1337), (('other', 'country'), 1301), (('recent', 'year'), 1283), (('many', 'people'), 1207)]


                                                                                

24/05/20 18:33:32 WARN QueueInputDStream: queueStream doesn't support checkpointing
24/05/20 18:33:32 WARN QueueInputDStream: queueStream doesn't support checkpointing
[(('external', 'link'), 6503), (('19th', 'century'), 2328), (('same', 'time'), 2221), (('civil', 'war'), 1775), (('large', 'number'), 1688), (('other', 'hand'), 1654), (('political', 'party'), 1537), (('other', 'country'), 1463), (('recent', 'year'), 1452), (('many', 'people'), 1399)]


[Stage 88:====>                                                   (1 + 11) / 12]

24/05/20 18:33:37 WARN BatchedWriteAheadLog: BatchedWriteAheadLog Writer queue interrupted.




24/05/20 18:33:38 WARN QueueInputDStream: queueStream doesn't support checkpointing
[(('external', 'link'), 7340), (('19th', 'century'), 2595), (('same', 'time'), 2482), (('civil', 'war'), 2005), (('large', 'number'), 1895), (('other', 'hand'), 1838), (('political', 'party'), 1712), (('other', 'country'), 1662), (('recent', 'year'), 1640), (('many', 'people'), 1564)]
Finished
