# Tweets streaming - Consumer
Read the stream into a DataFrame, add timestamp, sentiment and price, and save the aggregated window.

In [0]:
# !pip install textblob

In [0]:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SparkSession
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import functions as F
from textblob import TextBlob
import re


In [0]:
# Functions to split the timestamp off the tweet message
@udf
def split_start(text):
    """Return the message part of a record, without its trailing timestamp.

    Args:
        text: Record string whose tail holds the timestamp. May be None
            when Spark passes a null value.

    Returns:
        ``text`` without its last 27 characters, or None for a None input.
    """
    # Spark hands nulls to Python UDFs as None; slicing None would raise a
    # TypeError and fail the query, so pass the null through instead.
    if text is None:
        return None
    return text[:-27]
@udf
def split_back(text):
    """Return the trailing timestamp part of a record.

    Args:
        text: Record string whose tail holds the timestamp. May be None
            when Spark passes a null value.

    Returns:
        The last 21 characters of ``text`` (the whole string if it is
        shorter), or None for a None input.
    """
    # Spark hands nulls to Python UDFs as None; slicing None would raise a
    # TypeError and fail the query, so pass the null through instead.
    if text is None:
        return None
    return text[-21:]


In [0]:
# Additional cleaning and splitting of the incoming text
def preprocessing(lines):
    """Split the raw stream text into records and clean each one.

    Args:
        lines: DataFrame with a string column ``value`` holding the raw text.

    Returns:
        DataFrame with the columns ``word`` (cleaned record), ``tweet_txt``
        (record without its tail) and ``timestamp`` (tail of the record).
    """
    # Records are separated by the literal marker "t_end"; explode yields
    # one row per fragment.
    words = lines.select(explode(split(lines.value, "t_end")).alias("word"))
    # Turn empty fragments into nulls so they are dropped with the null rows.
    words = words.na.replace('', None)
    words = words.na.drop()

    # Patterns removed from every record, applied in this order. Raw strings
    # keep regex escapes such as \S and \w from being read as (invalid)
    # Python string escapes.
    # NOTE(review): 'RT' is unanchored, so it also matches inside words -
    # confirm whether a word-boundary pattern is intended.
    cleanup_patterns = (r'http\S+', r'@\w+', '#', 'RT', ':')
    for pattern in cleanup_patterns:
        words = words.withColumn('word', F.regexp_replace('word', pattern, ''))

    # Both parts are sliced from the already cleaned text, i.e. after ':'
    # has been stripped from the whole record.
    words = words.withColumn("tweet_txt", split_start('word'))
    words = words.withColumn("timestamp", split_back('word'))
    return words
  

In [0]:
# Apply Textblob Sentiment Analysis
def polarity_detection(text):
    """Return the TextBlob sentiment polarity of ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity
def subjectivity_detection(text):
    """Return the TextBlob sentiment subjectivity of ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity
def text_classification(words):
    """Add ``polarity`` and ``subjectivity`` columns computed from ``tweet_txt``.

    Both UDFs are declared with ``StringType()``, so the new columns are
    string-typed; cast them before doing numeric aggregation.

    Args:
        words: DataFrame containing a ``tweet_txt`` column.

    Returns:
        The DataFrame with the two sentiment columns appended.
    """
    # (output column, scoring function) pairs, applied in this order
    scorers = (
        ("polarity", polarity_detection),
        ("subjectivity", subjectivity_detection),
    )
    for column_name, scorer in scorers:
        scorer_udf = udf(scorer, StringType())
        words = words.withColumn(column_name, scorer_udf("tweet_txt"))
    return words
  

In [0]:
# Create (or reuse) the Spark session for this notebook
spark = (
    SparkSession.builder
    .appName("TwitterSentimentAnalysis")
    .getOrCreate()
)

# Read the tweet data as a stream from the socket at localhost:9997
lines = (
    spark.readStream
    .format("socket")
    .option("host", "localhost")
    .option("port", 9997)
    .load()
)

# Preprocess the data, then classify the text to add polarity and subjectivity
words = text_classification(preprocessing(lines))

# up to here all is ok and can be viewed with display():
# display(words)


In [0]:
# ...but writing the stream to the filesystem does not work:

# Minimal version of the code that writes the stream to parquet:
# start a streaming query with a parquet sink and a checkpoint directory.
(
    words.writeStream
    .format("parquet")
    .option("path", "dbfs:/FileStore/original")
    .option("checkpointLocation", "./check")
    .start()
)


In [0]:
'''
# Alternative version we tried
words = words.repartition(1)
words.writeStream\
        .queryName("all_tweets")\
        .outputMode("append")\
        .format("parquet")\
        .option("path", "dbfs:/FileStore/original")\
        .option("checkpointLocation", "./check")\
        .trigger(processingTime='30 seconds') \
        .start()
'''

In [0]:
'''
Previous attempt to write the aggregated stream - not relevant at the moment, as we do not have an aggregated window yet
# debugging aggregated stream: write to memory
words \
    .writeStream \
    .queryName("aggregates") \
    .outputMode("complete") \
    .format("memory") \
    .start()

spark.sql("select * from aggregates").show()   # interactively query in-memory table
'''


In [0]:
# %fs rm -r dbfs:/FileStore/original