# Jupyter Notebook with Kafka producer and Spark Streaming Processor
<img src="work/notebook.png">

In [1]:
import sys, os, json

# to get the appropriate version of spark
import findspark
findspark.init()
#    Spark
from pyspark import SparkContext
#    Spark Streaming
from pyspark.streaming import StreamingContext
#    Kafka
from pyspark.streaming.kafka import KafkaUtils

topic_name= 'alerts'

In [2]:
## create a spark context 
## create a streaming context using that spark context 
### Connect to kafk 

# Important when using in jupyter notebook, since we are not submitting the job via command line
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.0.2 pyspark-shell'
# os.environ['PYSPARK_SUBMIT_ARGS'] ='--packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.2.0 pyspark-shell'

In [3]:
# Create SPARK Context
sc = SparkContext(appName="PythonSparkStreamingReadKafka")
sc.setLogLevel("WARN")
## Create Streaming context , with 30 second interval 
ssc = StreamingContext(sc,  30) 

## Connect to KAFKA
### consumer group id = spark-streaming 
### zookeeper quorum = localhost: 2181 
### topic: 'alerts', use 1 cluster for this topic, so {alerts: 1}

#  After a context is defined, you have to do the following.
# Define the input sources by creating input DStreams. here KafkaStream is a Dtream type object

<img src="work/streaming-dstream.png">

In [4]:
# Direct structured stream, new 
# from pyspark.sql import SparkSession
# spark = SparkSession.builder.appName("pySparkDirectStream").getOrCreate()
# # Subscribe to 1 topic
# df = spark.readStream.format("kafka")\
# .option("zookeeper.connect", "localhost:2181")\
# .option("kafka.bootstrap.servers", "localhost:9092")\
# .option("subscribe", "ingest")\
# .option("failOnDataLoss", "false")\
# .option("startingOffsets", "latest")\
# .option("subscribe", topic_name)\
# .load()

# df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

# df.cache() # Cache data for faster reuse
# df = df.dropna() # drop rows with missing values

# print("Schema from files:", df.printSchema())
# print("stream: ", df)


<img src="work/structured-streaming.png">

In [5]:
# Stream receiver 
#  Listens to the kafka topic that we just created, so it is getting stream of alerts json data 
kafkaStream = KafkaUtils.createStream(ssc, 'localhost:2181', 'spark-streaming', {topic_name: 1})
parsed = kafkaStream.map(lambda v: json.loads(v[1]))

parsed.count().map(lambda x: 'Alerts in this batch: %s' % x).pprint()

# Get source
source_dstream = parsed.map(lambda alert: alert['src'])

# Count each value and number of occurences 
#  Each batch is 30 seconds of data from kafka 
print("Source counts this batch:")
count_source_values_this_batch = source_dstream.countByValue()\
    .transform(lambda rdd:rdd )\
    .map(lambda x:"\tValue %s\tCount %s" % (x[0],x[1]))
    
# Print alert source counts to stdout
count_source_values_this_batch.pprint()   

# Get status
status_dstream = parsed.map(lambda alert: alert['status'])

# Count each value and number of occurences 
print("Status counts this batch:")

count_status_values_this_batch = status_dstream.countByValue()\
    .transform(lambda rdd:rdd)\
    .map(lambda x:"\tValue %s\tCount %s" % (x[0],x[1]))      
    
count_status_values_this_batch.pprint()          

Source counts this batch:
Status counts this batch:


In [6]:
ssc.start()
ssc.awaitTermination()

-------------------------------------------
Time: 2018-10-03 11:13:00
-------------------------------------------

-------------------------------------------
Time: 2018-10-03 11:13:00
-------------------------------------------

-------------------------------------------
Time: 2018-10-03 11:13:00
-------------------------------------------

-------------------------------------------
Time: 2018-10-03 11:13:30
-------------------------------------------

-------------------------------------------
Time: 2018-10-03 11:13:30
-------------------------------------------

-------------------------------------------
Time: 2018-10-03 11:13:30
-------------------------------------------

-------------------------------------------
Time: 2018-10-03 11:14:00
-------------------------------------------

-------------------------------------------
Time: 2018-10-03 11:14:00
-------------------------------------------

-------------------------------------------
Time: 2018-10-03 11:14:00
----------

----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 49681)
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/opt/apache-spark/libexec/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1159, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/opt/apache-spark/libexec/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 985, in send_command
    response = connection.send_command(command)
  File "/usr/local/opt/apache-spark/libexec/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1164, in send_command
    "Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving
ERROR:root:Exception while sending command.
T

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:49659)
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2961, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-6-18f3db416f1c>", line 2, in <module>
    ssc.awaitTermination()
  File "/usr/local/opt/apache-spark/libexec/python/pyspark/streaming/context.py", line 206, in awaitTermination
    self._jssc.awaitTermination()
  File "/usr/local/opt/apache-spark/libexec/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/local/opt/apache-spark/libexec/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 336, in get_return_value
    format(target_id, ".", name))
py4j.protocol.Py4JError: An error occurred while calling o26.awaitTermination

During handl

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:49659)
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2961, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-6-18f3db416f1c>", line 2, in <module>
    ssc.awaitTermination()
  File "/usr/local/opt/apache-spark/libexec/python/pyspark/streaming/context.py", line 206, in awaitTermination
    self._jssc.awaitTermination()
  File "/usr/local/opt/apache-spark/libexec/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/local/opt/apache-spark/libexec/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 336, in get_return_value
    format(target_id, ".", name))
py4j.protocol.Py4JError: An error occurred while calling o26.awaitTermination

During handl

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:49659)
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2961, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-6-18f3db416f1c>", line 2, in <module>
    ssc.awaitTermination()
  File "/usr/local/opt/apache-spark/libexec/python/pyspark/streaming/context.py", line 206, in awaitTermination
    self._jssc.awaitTermination()
  File "/usr/local/opt/apache-spark/libexec/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/local/opt/apache-spark/libexec/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 336, in get_return_value
    format(target_id, ".", name))
py4j.protocol.Py4JError: An error occurred while calling o26.awaitTermination

During handl

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:49659)
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2961, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-6-18f3db416f1c>", line 2, in <module>
    ssc.awaitTermination()
  File "/usr/local/opt/apache-spark/libexec/python/pyspark/streaming/context.py", line 206, in awaitTermination
    self._jssc.awaitTermination()
  File "/usr/local/opt/apache-spark/libexec/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/local/opt/apache-spark/libexec/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 336, in get_return_value
    format(target_id, ".", name))
py4j.protocol.Py4JError: An error occurred while calling o26.awaitTermination

During handl

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:49659)
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2961, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-6-18f3db416f1c>", line 2, in <module>
    ssc.awaitTermination()
  File "/usr/local/opt/apache-spark/libexec/python/pyspark/streaming/context.py", line 206, in awaitTermination
    self._jssc.awaitTermination()
  File "/usr/local/opt/apache-spark/libexec/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/local/opt/apache-spark/libexec/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 336, in get_return_value
    format(target_id, ".", name))
py4j.protocol.Py4JError: An error occurred while calling o26.awaitTermination

During handl

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:49659)
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2961, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-6-18f3db416f1c>", line 2, in <module>
    ssc.awaitTermination()
  File "/usr/local/opt/apache-spark/libexec/python/pyspark/streaming/context.py", line 206, in awaitTermination
    self._jssc.awaitTermination()
  File "/usr/local/opt/apache-spark/libexec/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/local/opt/apache-spark/libexec/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 336, in get_return_value
    format(target_id, ".", name))
py4j.protocol.Py4JError: An error occurred while calling o26.awaitTermination

During handl

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:49659)
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2961, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-6-18f3db416f1c>", line 2, in <module>
    ssc.awaitTermination()
  File "/usr/local/opt/apache-spark/libexec/python/pyspark/streaming/context.py", line 206, in awaitTermination
    self._jssc.awaitTermination()
  File "/usr/local/opt/apache-spark/libexec/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/local/opt/apache-spark/libexec/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 336, in get_return_value
    format(target_id, ".", name))
py4j.protocol.Py4JError: An error occurred while calling o26.awaitTermination

During handl

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:49659)
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2961, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-6-18f3db416f1c>", line 2, in <module>
    ssc.awaitTermination()
  File "/usr/local/opt/apache-spark/libexec/python/pyspark/streaming/context.py", line 206, in awaitTermination
    self._jssc.awaitTermination()
  File "/usr/local/opt/apache-spark/libexec/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/local/opt/apache-spark/libexec/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 336, in get_return_value
    format(target_id, ".", name))
py4j.protocol.Py4JError: An error occurred while calling o26.awaitTermination

During handl

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:49659)
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2961, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-6-18f3db416f1c>", line 2, in <module>
    ssc.awaitTermination()
  File "/usr/local/opt/apache-spark/libexec/python/pyspark/streaming/context.py", line 206, in awaitTermination
    self._jssc.awaitTermination()
  File "/usr/local/opt/apache-spark/libexec/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/local/opt/apache-spark/libexec/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 336, in get_return_value
    format(target_id, ".", name))
py4j.protocol.Py4JError: An error occurred while calling o26.awaitTermination

During handl

Py4JError: An error occurred while calling o26.awaitTermination

In [None]:
print('done')