## This notebook is part of Hadoop and Spark training delivered by IT-DB group
### SPARK Streaming Hands-On Lab
_by Prasanth Kothuri_

### Hands-On 1 - Stream processing using Spark Streaming and Kafka
*This demonstrates processing unbounded data from a Kafka topic and performing simple string manipulations and aggregations*

#### Import the required modules

In [1]:
import os
import json
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

#### Make spark streaming kafka module available to Spark executors

In [2]:
# Ask spark-submit to fetch the Kafka 0.8 connector (Scala 2.11 build,
# version 2.1.0) via --packages. This is set before the SparkContext is created.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0 '
    'pyspark-shell'
)

#### Create SparkContext

In [3]:
# Local-mode Spark using all available cores ("local[*]"), with 4g requested
# for the driver.
conf = (
    SparkConf()
    .setMaster("local[*]")
    .set("spark.driver.memory", "4g")
)
sc = SparkContext(conf=conf)

#### Create streaming context

In [4]:
# Micro-batches are cut every 60 seconds (see the one-minute spacing of the
# "Time:" headers in the output below).
ssc = StreamingContext(sparkContext=sc, batchDuration=60)

#### Hook up to the Kafka topic

In [5]:
# Receiver-based Kafka stream: ZooKeeper endpoint, consumer group id, and a
# dict mapping each topic name to the number of partitions to consume.
kafkaStream = KafkaUtils.createStream(
    ssc,
    zkQuorum='sstreaming:2181',
    groupId='spark-streaming-pkothuri',
    topics={'twitter_json': 1},
)

#### Parse the messages into json

In [6]:
# Each record is indexed as a pair; element [1] carries the message text,
# which is decoded from JSON into a Python dict.
tweets_json = kafkaStream.map(lambda record: json.loads(record[1]))

#### Number of tweets in each batch

In [7]:
# Count the tweets of every batch and print the total as a readable line.
tweets_json \
    .count() \
    .map(lambda total: 'Number of tweets in this batch: %s' % total) \
    .pprint()

#### json returned from kafka
```json
{
  "payload": {
    "id": 935610640634449900,
    "created_at": "2017-11-28T20:45:34.000+0000",
    "user": {
      "id": 290028823,
      "name": "TDB68",
      "screen_name": "tbell1968",
      "location": "Dallas",
      "verified": false,
      "friends_count": 3031,
      "followers_count": 2755,
      "statuses_count": 60606
    },
    "text": "RT @paulkrugman: At this point everyone should know that dynamic scoring won't make much difference; JCT estimate likely to look a l… ",
    "lang": "en",
    "is_retweet": true,
    "entities": {
      "hashtags": [],
      "media": [],
      "urls": [],
      "user_mentions": [
        {
          "id": 17006157,
          "name": "Paul Krugman",
          "screen_name": "paulkrugman"
        }
      ]
    }
  }
}
```

#### Count tweets by location

In [8]:
# Pull the user's location out of each tweet and tally occurrences per batch,
# producing (location, count) pairs.
location_counts = tweets_json \
    .map(lambda tweet: tweet['payload']['user']['location']) \
    .countByValue()

In [9]:
# Sort the (location, count) pairs by descending count, then keep the first
# five. take() returns a plain list, so it is turned back into an RDD with
# sc.parallelize() for the transform() result.
top_locations = (
    location_counts
    .transform(lambda rdd: rdd.sortBy(lambda pair: -pair[1]))
    .transform(lambda rdd: sc.parallelize(rdd.take(5)))
)

In [10]:
# Register the top locations for printing; output appears only once the
# streaming context has been started.
top_locations.pprint()

#### High frequency words in the tweets

In [11]:
# Split each tweet's text on single spaces, count every token within the
# batch, and print the tokens ordered by descending count.
(
    tweets_json
    .flatMap(lambda tweet: tweet['payload']['text'].split(" "))
    .countByValue()
    .transform(lambda rdd: rdd.sortBy(lambda pair: -pair[1]))
    .pprint()
)

#### Start the streaming context

In [None]:
# Start receiving and processing, then keep this cell blocked for up to 180
# seconds so the batch output is shown.
ssc.start()
ssc.awaitTermination(180)

-------------------------------------------
Time: 2017-11-29 07:01:00
-------------------------------------------
Number of tweets in this batch: 1720

-------------------------------------------
Time: 2017-11-29 07:01:00
-------------------------------------------
(None, 548)
(u'Los Angeles, CA', 23)
(u'United States', 16)
(u'Worldwide', 14)
(u'Sydney, New South Wales', 13)

-------------------------------------------
Time: 2017-11-29 07:01:00
-------------------------------------------
(u'RT', 1000)
(u'', 733)
(u'the', 443)
(u'to', 324)
(u'a', 258)
(u'of', 231)
(u'and', 206)
(u'in', 205)
(u'-', 200)
(u'is', 188)
...

-------------------------------------------
Time: 2017-11-29 07:02:00
-------------------------------------------
Number of tweets in this batch: 2423

-------------------------------------------
Time: 2017-11-29 07:02:00
-------------------------------------------
(None, 829)
(u'CN', 23)
(u'United States', 19)
(u'India', 17)
(u'Gurgaon', 15)

---------------------------

#### stop the streaming context

In [None]:
# Shut down the streaming context started above.
ssc.stop()

### Hands-On 2 - Stream processing using Spark Streaming and Kafka
*This demonstrates spark streaming window operations*

#### Restart the kernel to clear all the variables; we are going to create streaming context again
In the top menu, Kernel -> Restart

#### Import the required modules

In [None]:
import os
import json
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

#### Make spark streaming kafka module available to Spark executors

In [None]:
# Ask spark-submit to fetch the Kafka 0.8 connector (Scala 2.11 build,
# version 2.1.0) via --packages. This is set before any SparkContext exists.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0 '
    'pyspark-shell'
)

#### Window functionality stream processing code

In [None]:
def createStreamingContext():
    """Build the streaming pipeline used by the window-operations lab.

    Creates a local SparkContext and a StreamingContext with a 10-second
    batch interval, attaches a Kafka stream on the ``twitter_json`` topic and
    registers four printed outputs:

    * number of tweets in the current batch,
    * number of tweets in a one-minute window sliding every 10 seconds,
    * hashtag counts for the current batch (top 5 printed),
    * hashtag counts over the same one-minute sliding window (top 5 printed).

    The windowed operations need a checkpoint directory, so this function is
    meant to be handed to ``StreamingContext.getOrCreate``.

    Returns:
        StreamingContext: the configured context; it has not been started.
    """
    window_seconds = 60   # length of the rolling window
    slide_seconds = 10    # how often the window is re-evaluated (= batch interval)

    conf = SparkConf().setMaster("local[*]").set("spark.driver.memory", "2g").set("spark.executor.memory", "2g")
    sc = SparkContext(conf = conf)
    ssc = StreamingContext(sc, slide_seconds)

    # Hook up to the Kafka topic:
    # (streaming context, ZooKeeper endpoint, consumer group id,
    #  dict of topic name -> number of partitions to consume)
    kafkaStream = KafkaUtils.createStream(ssc, 'sstreaming:2181', 'spark-streaming-pkothuri', {'twitter_json':1})

    # Extract and parse the tweets (element [1] of each record is the message)
    tweets_json = kafkaStream.map(lambda x: json.loads(x[1]))

    # Number of tweets in this batch (counted on the raw stream, no parsing needed)
    count_this_batch = kafkaStream.count().map(lambda x:('Tweets this batch: %s' % x))

    # One minute rolling counts
    count_windowed = kafkaStream.countByWindow(window_seconds, slide_seconds) \
                                .map(lambda x:('Tweets total (One minute rolling count): %s' % x))

    # Hashtags: every space-separated token of the tweet text starting with '#'
    hashTags = tweets_json.map(lambda tweet: tweet['payload']['text']) \
                            .flatMap(lambda x : x.split(" ")) \
                            .filter(lambda x: x.startswith("#"))

    # Hashtag counts for the current batch, most frequent first
    count_hashTags = hashTags.countByValue() \
                                .transform(lambda rdd: rdd.sortBy(lambda x: -x[1])) \
                                .map(lambda x:"hashTag counts this batch:\tValue %s\tCount %s" % (x[0],x[1]))

    # Hashtag counts over the rolling one-minute window, most frequent first.
    # countByValueAndWindow is required here: plain countByValue() would only
    # repeat the per-batch counts under a "rolling" label.
    count_hashTags_windowed = hashTags.countByValueAndWindow(window_seconds, slide_seconds) \
                                .transform(lambda rdd: rdd.sortBy(lambda x: -x[1])) \
                                .map(lambda x:"hashTag counts one minute rolling:\tValue %s\tCount %s" % (x[0],x[1]))

    # Write totals to stdout
    count_this_batch.pprint()
    count_windowed.pprint()
    count_hashTags.pprint(5)
    count_hashTags_windowed.pprint(5)

    return ssc

#### Start the streaming context

In [None]:
# checkpoint directory required for windowing functionality
# Windowed operations need a checkpoint directory. getOrCreate() is given the
# directory together with the factory function that builds the context.
ssc = StreamingContext.getOrCreate('/tmp/checkpoint001', createStreamingContext)
ssc.start()
# No timeout is passed here, unlike Hands-On 1.
ssc.awaitTermination()

In [None]:
# Shut down the streaming context started in the previous cell.
ssc.stop()