# PySpark: Streaming Tweets (RDD version)

https://www.toptal.com/apache/apache-spark-streaming-twitter

In [1]:
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [2]:
# Build (or reuse) a session against the standalone master on localhost and
# keep a handle on its underlying SparkContext.
spark = (
    SparkSession.builder
    .master('spark://localhost:7077')
    .appName('StreamingBCO')
    .getOrCreate()
)
sc = spark.sparkContext

In [3]:
from pyspark import SparkConf,SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import Row,SQLContext
import sys
import requests
from operator import add

# Restrict Spark logging to the ERROR level.
sc.setLogLevel("ERROR")
# Streaming context on top of the existing SparkContext, with 2-second batches.
ssc = StreamingContext(sc, batchDuration=2)
# Checkpoint directory (needed by the stateful updateStateByKey used later).
ssc.checkpoint("checkpoint_TwitterApp")
# Text lines are read from a TCP socket on localhost:9009.
dataStream = ssc.socketTextStream(hostname="localhost", port=9009)

In [4]:
def send_df_to_dashboard(df):
    """POST the hashtags and counts held in ``df`` to the dashboard endpoint.

    Args:
        df: DataFrame exposing ``hashtag`` and ``hashtag_count`` columns
            (``process_rdd`` passes the top-10 query result).

    Raises:
        requests.exceptions.RequestException: if the POST fails or does not
            complete within the timeout.
    """
    # Collect once so labels and counts come from the same evaluation of the
    # query, instead of running it a second time for the second column.
    rows = df.select("hashtag", "hashtag_count").collect()
    top_tags = [str(row.hashtag) for row in rows]
    tags_count = [row.hashtag_count for row in rows]
    # The endpoint receives both lists as their Python string representation.
    url = 'http://localhost:5001/updateData'
    request_data = {'label': str(top_tags), 'data': str(tags_count)}
    # Bound the wait so an unresponsive dashboard cannot block the batch forever.
    requests.post(url, data=request_data, timeout=5)

In [5]:
def aggregate_tags_count(new_values, total_sum):
    """Return the running total after adding this batch's values.

    Args:
        new_values: iterable of counts that arrived for one key.
        total_sum: previous total for that key; a falsy value (such as
            ``None`` when there is no previous state) counts as 0.
    """
    previous_total = total_sum if total_sum else 0
    batch_total = sum(new_values)
    return batch_total + previous_total

def get_sql_context_instance(spark_context):
    """Return the SQLContext cached in module globals, creating it on first use.

    Args:
        spark_context: SparkContext used to build the SQLContext the first
            time; ignored once an instance is already cached.
    """
    module_globals = globals()
    cache_key = 'sqlContextSingletonInstance'
    if cache_key in module_globals:
        return module_globals[cache_key]
    module_globals[cache_key] = SQLContext(spark_context)
    return module_globals[cache_key]


def process_rdd(time, rdd):
    """Print the top 10 hashtags of one batch and forward them to the dashboard.

    Args:
        time: batch timestamp; printed as a header line.
        rdd: RDD whose elements are indexable pairs ``(hashtag, count)``.

    Any ``Exception`` raised while processing is reported on stdout and not
    re-raised. ``KeyboardInterrupt`` / ``SystemExit`` are deliberately NOT
    caught so the notebook cell can still be interrupted.
    """
    print("----------- %s -----------" % str(time))
    try:
        # A schema cannot be inferred from an empty RDD (createDataFrame
        # raises), so skip batches that carry no data yet.
        if rdd.isEmpty():
            return
        # Get spark sql singleton context from the current context
        sql_context = get_sql_context_instance(rdd.context)
        # convert the RDD to Row RDD
        row_rdd = rdd.map(lambda w: Row(hashtag=w[0], hashtag_count=w[1]))
        # create a DF from the Row RDD
        hashtags_df = sql_context.createDataFrame(row_rdd)
        # Register the dataframe as a temp view (registerTempTable is deprecated)
        hashtags_df.createOrReplaceTempView("hashtags")
        # get the top 10 hashtags from the table using SQL and print them
        hashtag_counts_df = sql_context.sql("select hashtag, hashtag_count from hashtags order by hashtag_count desc limit 10")
        hashtag_counts_df.show()
        # call this method to prepare top 10 hashtags DF and send them
        send_df_to_dashboard(hashtag_counts_df)
    except Exception as e:
        # Report both the exception type and its message; the type alone
        # rarely says what actually went wrong.
        print("process_rdd Error: %s: %s" % (type(e).__name__, e))

In [6]:
# def process_rdd(time, rdd):
#     print("----------- %s -----------" % str(time))
#     # Get spark sql singleton context from the current context
#     sql_context = get_sql_context_instance(rdd.context)
#     # convert the RDD to Row RDD
#     row_rdd = rdd.map(lambda w: Row(hashtag=w[0], hashtag_count=w[1]))
#     # create a DF from the Row RDD
#     hashtags_df = sql_context.createDataFrame(row_rdd)
#     # Register the dataframe as table
#     hashtags_df.registerTempTable("hashtags")
#     # get the top 10 hashtags from the table using SQL and print them
#     hashtag_counts_df = sql_context.sql("select hashtag, hashtag_count from hashtags order by hashtag_count desc limit 10")
#     hashtag_counts_df.show()
#     # call this method to prepare top 10 hashtags DF and send them
#     send_df_to_dashboard(hashtag_counts_df)

In [None]:
# Break every incoming line into space-separated tokens.
words = dataStream.flatMap(lambda tweet: tweet.split(" "))
# Keep the tokens that contain '#' and pair each one with a count of 1.
hashtags = (
    words
    .filter(lambda token: '#' in token)
    .map(lambda tag: (tag, 1))
)
# Fold each batch's counts into the stored per-hashtag total
# (earlier per-batch variant: hashtags.reduceByKey(add)).
tags_totals = hashtags.updateStateByKey(aggregate_tags_count)
# Run process_rdd on the RDD produced for every batch interval.
tags_totals.foreachRDD(process_rdd)
# Launch the streaming computation ...
ssc.start()
# ... and block this cell until it terminates.
ssc.awaitTermination()

----------- 2020-11-27 09:43:50 -----------
+---------------+-------------+
|        hashtag|hashtag_count|
+---------------+-------------+
|       #INDvAUS|           25|
|#TheMandalorian|           23|
|       #ForSale|           23|
|       #AUSvIND|           17|
|          #Flat|           17|
|   #Residential|           17|
|     #Apartment|           17|
|      #Property|           15|
|       #ForRent|           14|
|    #AhsokaTano|           13|
+---------------+-------------+

----------- 2020-11-27 09:43:52 -----------
+---------------+-------------+
|        hashtag|hashtag_count|
+---------------+-------------+
|       #INDvAUS|           25|
|#TheMandalorian|           23|
|       #ForSale|           23|
|       #AUSvIND|           17|
|          #Flat|           17|
|   #Residential|           17|
|     #Apartment|           17|
|      #Property|           15|
|       #ForRent|           14|
|    #AhsokaTano|           13|
+---------------+-------------+

----------- 20

In [None]:
# Shut down the SparkContext obtained from the session at the top of the notebook.
sc.stop()

```
----------- 2020-11-25 21:40:18 -----------
+--------------------+-------------+
|             hashtag|hashtag_count|
+--------------------+-------------+
|            #COVID19|            4|
|         #earthquake|            3|
|        #FlynnPardon|            3|
|         #LifeGoesOn|            2|
|   #Thanksgiving2020|            2|
|              #Labor|            2|
|       #WITHAPURPOSE|            2|
|                   #|            2|
|#OhTheWeatherOuts...|            2|
|             #pdx911|            2|
+--------------------+-------------+
```