# 8.1 Spark Streaming

## Apache Kafka

### Kafka Producer

In [3]:
import time
import json
from kafka import KafkaProducer


# Connect to the local broker. Every message value is turned into UTF-8
# encoded JSON before it is handed to Kafka.
producer = KafkaProducer(
    value_serializer=lambda payload: json.dumps(payload).encode('utf-8'),
    bootstrap_servers=['localhost:9092'],
)

# Publish one timestamped test message to the `dsp` topic. The call returns
# a future, which is what the cell displays as its output.
data = dict(hello='world', time=time.time())
producer.send('dsp', data)

<kafka.producer.future.FutureRecordMetadata at 0x7f017418acf8>

### Kafka Consumer

In [None]:
import json
from kafka import KafkaConsumer


# Subscribe to the `dsp` topic on the local broker; each message value is
# decoded from UTF-8 and parsed as JSON.
consumer = KafkaConsumer(
    'dsp',
    value_deserializer=lambda raw: json.loads(raw.decode('utf-8')),
    bootstrap_servers=['localhost:9092'],
)

# Print the decoded value of every record the consumer yields.
for record in consumer:
    print(record.value)

## Sklearn Streaming

In [None]:
import os
import json

# The broker host is read from the environment rather than hard-coded.
external_ip = os.getenv('kafka.bootstrap.servers')
if not external_ip:
    # os.getenv returns None when the variable is unset; without this check
    # the f-string below would silently build the address "None:9092".
    raise RuntimeError(
        "Environment variable 'kafka.bootstrap.servers' is not set; "
        "cannot determine the Kafka broker address."
    )

# Stream the `dsp` topic starting from the earliest offset and add a column
# holding the JSON-decoded form of each message value.
df = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", f"{external_ip}:9092")
    .option("subscribe", "dsp")
    .option("startingOffsets", "earliest")
    .load()
    .withColumn("value_deserialized",
                udf(lambda x: json.loads(x.decode('utf-8')))("value"))
)

display(df)

### Model Producer

In [1]:

from kafka import KafkaProducer
from json import dumps
import time
import uuid

# Producer that JSON-encodes every message value as UTF-8 bytes.
producer = KafkaProducer(
    value_serializer=lambda payload: dumps(payload).encode('utf-8'),
    bootstrap_servers=['54.166.148.190:9092'],
)

# One record: ten flags G1..G10 (only G1 set to 1) plus a generated user ID.
data = {f'G{i}': 0 for i in range(1, 11)}
data['G1'] = 1
data['User_ID'] = str(uuid.uuid1())

# Send the record to the `dsp` topic, then block on the returned future;
# its result is what the cell displays.
result = producer.send('dsp', data)
result.get()


RecordMetadata(topic='dsp', partition=0, topic_partition=TopicPartition(topic='dsp', partition=0), offset=109, timestamp=1576709681368, checksum=None, serialized_key_size=-1, serialized_value_size=142, serialized_header_size=-1)

## Streaming Pipeline 

In [None]:

from pyspark.sql.types import StringType
import json 
import pandas as pd
from sklearn.linear_model import LogisticRegression

# Build a logistic regression model: the first ten columns of the games
# dataset are the features and the `label` column is the target.
gamesDF = pd.read_csv("https://github.com/bgweber/Twitch/raw/master/Recommendations/games-expand.csv")
model = LogisticRegression()
model.fit(gamesDF.iloc[:, :10], gamesDF['label'])

# Read from Kafka: open a streaming DataFrame subscribed to the `dsp` topic.
df = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", "54.166.148.190:9092")
    .option("subscribe", "dsp")
    .load()
)

# define the UDF for scoring users 
def score(row):
    """Score one JSON-encoded user record with the fitted model.

    Args:
        row: JSON string describing a single user; it must contain a
            'User_ID' key, and its first ten keys are used as features.

    Returns:
        A JSON string with the record's 'User_ID' and its 'pred' value.
    """
    record = json.loads(row)
    # Lay the record out as a one-row frame whose columns are the JSON keys.
    frame = pd.DataFrame.from_dict(record, orient = "index").transpose()
    probabilities = model.predict_proba(frame.iloc[:, 0:10])
    # NOTE(review): [0][0] is the first column of predict_proba for the
    # single row, i.e. the first class's probability - confirm this is the
    # class the downstream consumer expects.
    pred = probabilities[0][0]
    return str(json.dumps({'User_ID': record['User_ID'], 'pred': pred}))
    
# Select the value field as a string and replace it with the UDF's output.
score_udf = udf(score, StringType())
df = (
    df.selectExpr("CAST(value AS STRING)")
    .select(score_udf("value").alias("value"))
)

# Write results to Kafka: stream the scored records to the `preds` topic.
query = (
    df.writeStream.format("kafka")
    .option("kafka.bootstrap.servers", "54.166.148.190:9092")
    .option("topic", "preds")
    .option("checkpointLocation", "/temp")
    .start()
)


## Model Consumer

In [None]:
from kafka import KafkaConsumer
from json import loads

# Subscribe to the `preds` topic and parse every message value as JSON.
consumer = KafkaConsumer(
    'preds',
    value_deserializer=lambda raw: loads(raw),
    bootstrap_servers=['54.166.148.190:9092'],
)

# Print each parsed prediction as it is yielded by the consumer.
for record in consumer:
    print(record.value)

# Dataflow Streaming

## PubSub Consumer

In [None]:
import time
from google.cloud import pubsub_v1

# Create the subscriber client and build the path of the `dsp` subscription
# in the `gameanalytics-199018` project.
subscriber = pubsub_v1.SubscriberClient()
subscription_path = subscriber.subscription_path(
    "gameanalytics-199018",
    "dsp",
)

def callback(message):
    """Print the payload of a received message, then acknowledge it.

    Args:
        message: object exposing a `data` attribute and an `ack()` method.
    """
    payload = message.data
    print(payload)
    message.ack()

# Start the background streaming pull and keep the future it returns: that
# future is the handle through which a failure of the pull is reported.
streaming_pull_future = subscriber.subscribe(subscription_path, callback=callback)

# Block on the future instead of sleeping in a loop, so an error in the
# streaming pull is raised in this cell rather than being silently ignored
# while the cell sleeps forever.
try:
    streaming_pull_future.result()
except KeyboardInterrupt:
    # Interrupting the cell stops the background pull.
    streaming_pull_future.cancel()

## PubSub Producer

In [None]:
from google.cloud import pubsub_v1

# Create the publisher client and build the path of the `natality` topic in
# the `gameanalytics-199018` project.
publisher = pubsub_v1.PublisherClient()
topic_path = publisher.topic_path(
    "gameanalytics-199018",
    "natality",
)

# Encode the text as UTF-8 bytes and publish it to the topic.
data = "Hello World!".encode('utf-8')
publisher.publish(topic_path, data=data)

## Streaming Pipeline

In [None]:
import apache_beam as beam
import argparse
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.io.gcp.bigquery import parse_table_schema_from_json
import json

class ApplyDoFn(beam.DoFn):
    """Score incoming records with a pickled model stored in Cloud Storage.

    Each input element is UTF-8 encoded JSON. For every element the DoFn
    emits one dict with the record's 'guid', the predicted 'weight', and the
    record's 'time' converted to a string.
    """

    def __init__(self):
        # The model is loaded on the first processed element, not here.
        self._model = None
        # Modules are imported here and kept on the instance; process() only
        # uses these references.
        # NOTE(review): presumably done so the imports are available on the
        # workers that run this DoFn - confirm.
        from google.cloud import storage
        import pandas as pd
        import pickle as pkl
        import json as js
        self._storage = storage
        self._pkl = pkl
        self._pd = pd
        self._json = js

    def _load_model(self):
        """Download and unpickle the model from the `dsp_model_store` bucket.

        Raises:
            RuntimeError: if the model object does not exist in the bucket.
        """
        bucket = self._storage.Client().get_bucket('dsp_model_store')
        blob = bucket.get_blob('natality/sklearn-linear')
        if blob is None:
            # get_blob returns None for a missing object; fail with a clear
            # message instead of an AttributeError on None.
            raise RuntimeError(
                "Model object 'natality/sklearn-linear' was not found in "
                "bucket 'dsp_model_store'."
            )
        # NOTE(security): unpickling executes arbitrary code from the blob;
        # this is only safe while the bucket's contents are trusted.
        return self._pkl.loads(blob.download_as_string())

    def process(self, element):
        """Score one JSON-encoded record.

        Args:
            element: UTF-8 encoded JSON bytes holding at least 'guid' and
                'time', plus the feature values.

        Returns:
            A one-item list containing the output dict for this record.
        """
        if self._model is None:
            self._model = self._load_model()

        record = self._json.loads(element.decode('utf-8'))
        # One-row frame whose columns are the JSON keys; missing values
        # become 0.
        new_x = self._pd.DataFrame.from_dict(
            record, orient = "index").transpose().fillna(0)
        # Columns 1..7 (by position) are passed to the model as features.
        weight = self._model.predict(new_x.iloc[:, 1:8])[0]
        return [{'guid': record['guid'], 'weight': weight,
                 'time': str(record['time'])}]
             
class PublishDoFn(beam.DoFn):
    """Store each scored record in Cloud Datastore as a `natality-guid` entity."""

    def __init__(self):
        from google.cloud import datastore
        self._ds = datastore
        # The client is created on the first processed element and reused.
        self._client = None

    def process(self, element):
        """Write one record to Datastore, keyed by its 'guid'.

        Args:
            element: dict with 'guid', 'weight' and 'time' entries.
        """
        # Create the client once per DoFn instance rather than once per
        # element; constructing a new client for every record is wasted work.
        if self._client is None:
            self._client = self._ds.Client()
        client = self._client

        key = client.key('natality-guid', element['guid'])
        entity = self._ds.Entity(key)
        entity['weight'] = element['weight']
        entity['time'] = element['time']
        client.put(entity)
            
# Set up pipeline parameters: arguments the (empty) parser does not
# recognize are handed to Beam as pipeline options.
parser = argparse.ArgumentParser()
known_args, pipeline_args = parser.parse_known_args()
pipeline_options = PipelineOptions(pipeline_args)

# Define the topic the pipeline reads from.
topic = "projects/{project}/topics/{topic}".format(
    project = "gameanalytics-199018", topic = "natality")

# Table schema for the scored records.
# NOTE(review): `schema` is not used by any step in this cell - confirm
# whether a later cell needs it.
schema_fields = [
    { 'name': 'guid', 'type': 'STRING'},
    { 'name': 'weight', 'type': 'FLOAT64'},
    { 'name': 'time', 'type': 'STRING'},
]
schema = parse_table_schema_from_json(json.dumps({'fields': schema_fields}))

# Define the pipeline steps: read from Pub/Sub, score, write to Datastore.
p = beam.Pipeline(options=pipeline_options)
lines = p | 'Read PubSub' >> beam.io.ReadFromPubSub(topic=topic)
scored = lines | 'apply' >> beam.ParDo(ApplyDoFn())
scored | 'Create entities' >> beam.ParDo(PublishDoFn())

# Run the pipeline and block until it finishes.
result = p.run()
result.wait_until_finish()

## Streaming Producer

In [2]:

import json
from google.cloud import pubsub_v1
import time 

# Create the publisher client and build the path of the `natality` topic.
publisher = pubsub_v1.PublisherClient()
topic_path = publisher.topic_path("gameanalytics-199018", "natality")

# One sample record, stamped with the current time and a fixed GUID.
record = {
    'year': 2001,
    'plurality': 1,
    'apgar_5min': 99,
    'mother_age': 33,
    'father_age': 40,
    'gestation_weeks': 38,
    'ever_born': 8,
    'mother_married': 1,
    'weight': 6.8122838958,
    'time': str(time.time()),
    'guid': 'b281c5e8-85b2-4cbd-a2d8-e501ca816363',
}

# Serialize the record to UTF-8 encoded JSON and publish it.
data = json.dumps(record).encode('utf-8')

publisher.publish(topic_path, data=data)

<google.cloud.pubsub_v1.publisher.futures.Future at 0x7f0f866bebd0>