# Tweets streaming - Consumer
Read Stream into dataframe, add timestamp, sentiment and price, save aggregated window

In [0]:
#!pip install textblob
#!pip install pycountry

In [0]:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SparkSession
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import functions as F
from textblob import TextBlob
import re
import pycountry


In [0]:
# Functions to split the timestamp off the tweet message
@udf
def split_start(text):
    """Return the tweet body, i.e. ``text`` without its last 27 characters.

    In the sample output of this notebook the dropped tail is the separator
    ``' stop '`` (6 characters) followed by a 21-character timestamp.
    """
    tail_length = 27
    return text[:-tail_length]
@udf
def split_back(text):
    """Return the last 21 characters of ``text``.

    In the sample output of this notebook those characters are the timestamp
    appended to each tweet (colons already stripped by ``preprocessing``).
    """
    timestamp_length = 21
    return text[-timestamp_length:]


In [0]:
# Additional cleaning and splitting of the incoming text
# Ordered (pattern, replacement) pairs that `preprocessing` applies to the
# "word" column, one after the other. Patterns are raw strings so the
# backslashes reach the regex engine untouched.
TWEET_CLEANING_RULES = (
    (r'http\S+', ''),   # links
    (r'@\w+', ''),      # @mentions
    (r'#', ''),         # hashtag marker (the tag word itself is kept)
    (r'\bRT\b', ''),    # retweet marker, whole word only: 'SMART' stays intact
    (r':', ''),         # colons, including the ones inside the trailing timestamp
)


def preprocessing(lines):
    """Split the raw socket lines into single messages and clean them.

    Args:
        lines: streaming DataFrame whose ``value`` column holds the raw text.

    Returns:
        DataFrame with three columns:
        ``word``      - the cleaned message (text, separator and timestamp),
        ``tweet_txt`` - ``split_start`` applied to ``word``,
        ``timestamp`` - ``split_back`` applied to ``word``.
    """
    # One output row per fragment between "t_end" markers.
    words = lines.select(explode(split(lines.value, "t_end")).alias("word"))
    # Empty fragments become nulls so they are dropped together with real nulls.
    words = words.na.replace('', None).na.drop()
    for pattern, replacement in TWEET_CLEANING_RULES:
        words = words.withColumn('word', F.regexp_replace('word', pattern, replacement))
    # The slicing UDFs rely on the fixed-length tail left after ':' was removed.
    words = words.withColumn("tweet_txt", split_start('word'))
    words = words.withColumn("timestamp", split_back('word'))
    return words
  

In [0]:
# Apply Textblob Sentiment Analysis
def polarity_detection(text):
    """Return the polarity component of TextBlob's sentiment for ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity
def subjectivity_detection(text):
    """Return the subjectivity component of TextBlob's sentiment for ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity
def language_detection(text):
    """Return the language name detected for ``text``.

    The ISO code reported by ``TextBlob(text).detect_language()`` is looked up
    in ``pycountry`` via its ``alpha_2`` code and the entry's ``name`` is returned.

    Returns:
        The language name, or the string ``'no language detected'`` when
        detection or lookup fails or the code is unknown.

    NOTE(review): ``detect_language`` is believed to be deprecated/removed in
    recent TextBlob releases - confirm the installed version; if the method is
    missing, every row silently falls back to ``'no language detected'``.
    """
    fallback = 'no language detected'
    try:
        iso_code = TextBlob(text).detect_language()
        language = pycountry.languages.get(alpha_2=iso_code)
        # An unknown code may come back as None instead of raising.
        return language.name if language is not None else fallback
    except Exception:
        # Best-effort by design, but KeyboardInterrupt/SystemExit are not
        # swallowed, so the job can still be cancelled.
        return fallback
def text_classification(words):
    """Add sentiment and language columns computed from ``tweet_txt``.

    Args:
        words: DataFrame with a ``tweet_txt`` column.

    Returns:
        The DataFrame with three extra columns:
        ``polarity`` (double), ``subjectivity`` (double), ``language`` (string).

    The two score UDFs are declared as ``DoubleType`` because the wrapped
    functions return Python floats; declaring them as strings would store the
    scores as text and block numeric aggregation.
    NOTE(review): a parquet directory written earlier with string-typed score
    columns will not match this schema - confirm before reusing the old path.
    """
    # polarity detection
    polarity_detection_udf = udf(polarity_detection, DoubleType())
    words = words.withColumn("polarity", polarity_detection_udf("tweet_txt"))
    # subjectivity detection
    subjectivity_detection_udf = udf(subjectivity_detection, DoubleType())
    words = words.withColumn("subjectivity", subjectivity_detection_udf("tweet_txt"))
    # language detection
    language_detection_udf = udf(language_detection, StringType())
    words = words.withColumn("language", language_detection_udf("tweet_txt"))
    return words
  

In [0]:
# Spark session for this notebook (getOrCreate reuses a running session)
spark = (
    SparkSession.builder
    .appName("TwitterSentimentAnalysis")
    .getOrCreate()
)

# Streaming source: raw tweet text arriving on the local socket, port 9997
lines = (
    spark.readStream
    .format("socket")
    .option("host", "localhost")
    .option("port", 9997)
    .load()
)

# Clean the raw lines, then attach polarity, subjectivity and language columns
words = text_classification(preprocessing(lines))

# Everything up to this point works; the result can be inspected with:
# display(words)


In [0]:
# Preview the classified stream; the rows listed below this cell are its captured output.
# NOTE(review): `display` is not defined in this notebook - presumably the Databricks built-in.
display(words)

word,tweet_txt,timestamp,polarity,subjectivity,language
"If you dont own any bitcoin, you wont truly understand it. stop 2021-05-30T095935.691","If you dont own any bitcoin, you wont truly understand it.",2021-05-30T095935.691,0.6,1.0,English
"We are presenting Lamborghini! While the scammers buy lambas for themselves, we give the lamba to the safehamsterarmy stop 2021-05-30T095937.760","We are presenting Lamborghini! While the scammers buy lambas for themselves, we give the lamba to the safehamsterarmy",2021-05-30T095937.760,0.0,0.0,English
entrepreneurs DigitalMarketing EmailMarketing InfluencerMarketing contentmarketing affiliatemarketing Check my stop 2021-05-30T095940.134,entrepreneurs DigitalMarketing EmailMarketing InfluencerMarketing contentmarketing affiliatemarketing Check my,2021-05-30T095940.134,0.0,0.0,English
Safebite Airdrop Task 500 SBITE (~$50) Referral 10 SBITE (~$1) Airdrop Link & Informatio stop 2021-05-30T095935.859,Safebite Airdrop Task 500 SBITE (~$50) Referral 10 SBITE (~$1) Airdrop Link & Informatio,2021-05-30T095935.859,0.0,0.0,English
stop 2021-05-30T095938.398,,2021-05-30T095938.398,0.0,0.0,no language detected
"24,521,824,423 ELON up for grabs in Dogelon Mars Airdrop! 12 hours to go! CoinMarketCap Crypto Cryptocurrency stop 2021-05-30T095940.683","24,521,824,423 ELON up for grabs in Dogelon Mars Airdrop! 12 hours to go! CoinMarketCap Crypto Cryptocurrency",2021-05-30T095940.683,0.0,0.0,English
The Bitcoin Bubble Is Popping. Heres How To Play It. by stop 2021-05-30T095936.091,The Bitcoin Bubble Is Popping. Heres How To Play It. by,2021-05-30T095936.091,0.0,0.0,English
Good and Great project Amazing and success project stop 2021-05-30T095938.476,Good and Great project Amazing and success project,2021-05-30T095938.476,0.6,0.5625,English
. Bitcoin 10 Bitcoin stop 2021-05-30T095940.794,. Bitcoin 10 Bitcoin,2021-05-30T095940.794,0.0,0.0,English
20 doge giveaway Join our server and share your own meme and participate Join us and share your dogeco stop 2021-05-30T095936.287,20 doge giveaway Join our server and share your own meme and participate Join us and share your dogeco,2021-05-30T095936.287,0.6,1.0,English


In [0]:
# but the stream does not want to be loaded into the filesystem...

# Minimal parquet sink for the classified stream.
# NOTE(review): the checkpoint location is a relative path - confirm where it
# resolves on the cluster.
(
    words.writeStream
    .format("parquet")
    .option("path", "dbfs:/FileStore/original")
    .option("checkpointLocation", "./check")
    .start()
)


In [0]:
# Disabled experiment: the cell body is a bare string literal, so none of it runs.
# It sketches an alternative parquet sink: one partition, a query named
# "all_tweets", explicit append mode and a 30-second processing-time trigger.
'''
# Alternative version we tried
words = words.repartition(1)
words.writeStream\
        .queryName("all_tweets")\
        .outputMode("append")\
        .format("parquet")\
        .option("path", "dbfs:/FileStore/original")\
        .option("checkpointLocation", "./check")\
        .trigger(processingTime='30 seconds') \
        .start()
'''

In [0]:
# Disabled experiment: the cell body is a bare string literal, so none of it runs.
# It sketches writing the stream to an in-memory table named "aggregates" in
# complete output mode and querying it with spark.sql; per the note inside,
# it was set aside because no aggregated window exists yet.
'''
Previous attempt to write the aggregated stream - not relevant at the time as we do not have an aggragated window
# debugging aggregated stream: write to memory
words \
    .writeStream \
    .queryName("aggregates") \
    .outputMode("complete") \
    .format("memory") \
    .start()

spark.sql("select * from aggregates").show()   # interactively query in-memory table
'''


In [0]:
# %fs rm -r dbfs:/FileStore/original