# Tweets streaming - Consumer
Read the stream into a DataFrame, add timestamp, sentiment and price, and save the aggregated window.

In [0]:
#!pip install textblob
#!pip install pycountry

In [0]:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SparkSession
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import functions as F
from textblob import TextBlob
import re
import pycountry
from datetime import datetime

In [0]:
# Functions to split the timestamp off the tweet message
@udf
def split_start(text):
  """Return ``text`` without its last 27 characters.

  Used on the cleaned ``word`` column to keep the tweet message and cut off
  the trailing part (presumably the timestamp suffix appended by the
  producer - confirm the exact suffix length against the producer).
  """
  return text[:-27]
@udf
def split_back(text):
  """Return the 17-character slice ``text[-21:-4]`` holding the timestamp string."""
  # The slice ends 4 characters before the end of the text because the stamp
  # is read without milliseconds.
  stamp_window = slice(-21, -4)
  return text[stamp_window]


In [0]:
# Additional cleaning and splitting of the incoming text
def preprocessing(lines):
    """Split the raw socket stream into cleaned tweets with a timestamp string.

    This is a DataFrame-to-DataFrame transformation, so it must NOT be wrapped
    in ``@udf``: it is called once with the streaming DataFrame itself, not
    per row with column values.

    Args:
        lines: DataFrame with a string column ``value`` (as produced by the
            socket source).

    Returns:
        DataFrame with the columns ``word`` (cleaned text), ``tweet_txt``
        (result of ``split_start``) and ``stamp`` (result of ``split_back``).
    """
    # One row per tweet: the producer separates tweets with the marker "t_end".
    words = lines.select(explode(split(lines.value, "t_end")).alias("word"))

    # Turn empty fragments into nulls and drop them.
    words = words.na.replace('', None).na.drop()

    # Strip URLs, @mentions, hash signs, the literal "RT" and colons, in that order.
    # Removing ':' also removes the colons inside the trailing timestamp, which
    # is why the stamp is later parsed with a colon-free time layout.
    # NOTE(review): 'RT' is removed anywhere in the text, not only as a
    # standalone retweet marker - confirm this is intended.
    for pattern in (r'http\S+', r'@\w+', '#', 'RT', ':'):
        words = words.withColumn('word', F.regexp_replace('word', pattern, ''))

    words = words.withColumn('tweet_txt', split_start('word'))
    words = words.withColumn('stamp', split_back('word'))
    return words
  

In [0]:
# Apply Textblob Sentiment Analysis
def polarity_detection(text):
    """Return the polarity score TextBlob assigns to ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity
def subjectivity_detection(text):
    """Return the subjectivity score TextBlob assigns to ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity
def language_detection(text):
    """Return the English name of the language detected in ``text``.

    The ISO code reported by ``TextBlob(text).detect_language()`` is looked up
    in ``pycountry`` via its ``alpha_2`` code and the language's ``name`` is
    returned.

    Returns:
        The language name, or the string ``'no language detected'`` when
        detection or the lookup fails for any ordinary reason.
    """
    # NOTE(review): ``detect_language`` appears to have been deprecated and later
    # removed in newer TextBlob releases - confirm against the installed version.
    try:
        iso_code = TextBlob(text).detect_language()
        language = pycountry.languages.get(alpha_2=iso_code)
        # ``language`` may be None for an unknown code; the resulting
        # AttributeError is handled by the fallback below.
        return language.name
    except Exception:
        # Deliberately best-effort, but only for ordinary errors: a bare
        # ``except:`` would also swallow KeyboardInterrupt and SystemExit.
        return 'no language detected'
def text_classification(words):
    """Add sentiment and language columns computed from ``tweet_txt``.

    Args:
        words: DataFrame containing a ``tweet_txt`` string column.

    Returns:
        The DataFrame with three extra columns: ``polarity`` (float),
        ``subjectivity`` (float) and ``language`` (string).
    """
    # The numeric UDFs are declared with their real return type so the scores
    # arrive as floats directly, instead of being declared as strings and
    # cast back to float afterwards.
    polarity_detection_udf = udf(polarity_detection, FloatType())
    subjectivity_detection_udf = udf(subjectivity_detection, FloatType())
    language_detection_udf = udf(language_detection, StringType())

    return (
        words
        .withColumn("polarity", polarity_detection_udf("tweet_txt"))
        .withColumn("subjectivity", subjectivity_detection_udf("tweet_txt"))
        .withColumn("language", language_detection_udf("tweet_txt"))
    )
  

In [0]:
# Function to fetch the current Bitcoin price
@udf
def get_actual_crypto_price(crypto):
  """Scrape the current price of ``crypto`` from the coinmarketcap.com front page.

  The page embeds its listing data as JSON inside the ``__NEXT_DATA__`` script
  tag; the listing whose ``name`` matches ``crypto`` (case-insensitive) is
  looked up and the ``price`` of its third quote entry is returned.

  Args:
    crypto: Currency name, e.g. ``'bitcoin'``.

  Returns:
    The price found in the listing, or ``None`` when the input is null, the
    data script tag is missing, or no listing matches. Because the function is
    registered with a bare ``@udf``, Spark exposes the value with the
    decorator's default return type (string, per the PySpark documentation).
  """
  # Imported inside the function because the notebook's import cell does not
  # provide these names; a function-scope import also resolves on the workers.
  import json
  import requests
  from bs4 import BeautifulSoup

  if crypto is None:
    return None

  base_url = 'https://coinmarketcap.com'
  # A timeout keeps a stalled HTTP request from blocking the stream forever.
  response = requests.get(base_url, timeout=10)
  soup = BeautifulSoup(response.content, 'html.parser')
  data = soup.find('script', id="__NEXT_DATA__", type="application/json")
  if data is None:
    return None

  coin_data = json.loads(data.contents[0])
  listings = coin_data['props']['initialState']['cryptocurrency']['listingLatest']['data']

  wanted = crypto.lower()
  actual_price = None
  for listing in listings:
    if listing['name'].lower() == wanted:
      # NOTE(review): index 2 presumably selects a specific quote currency -
      # confirm against the page payload.
      actual_price = listing['quotes'][2]['price']
  # As before, the last matching listing wins; None replaces the former
  # KeyError when nothing matched.
  return actual_price

In [0]:
# Create Spark session
spark = SparkSession.builder.appName("TwitterSentimentAnalysis").getOrCreate()

# Read the tweet data from socket
lines = spark.readStream.format("socket") \
        .option("host", "localhost") \
        .option("port", 9997) \
        .load()

# Preprocess the data
words = preprocessing(lines)

# Re-format timestamp from string to TimestampType.
# The layout has no colons in the time part because preprocessing strips ':'
# from the text. It is kept under its own name so that the builtin `format`
# is no longer shadowed.
STAMP_FORMAT = '%Y-%m-%dT%H%M%S'


def _parse_stamp(stamp):
    """Parse a stamp string into a datetime; return None if it is missing or malformed.

    Returning None (a null timestamp) instead of raising keeps one bad row
    from failing the whole streaming query.
    """
    if stamp is None:
        return None
    try:
        return datetime.strptime(stamp, STAMP_FORMAT)
    except ValueError:
        return None


time_udf = udf(_parse_stamp, TimestampType())
words = words.withColumn('t_stamp', time_udf('stamp'))

# Add the text classification columns (polarity, subjectivity, language)
words = text_classification(words)

# Add a column with the currency label 'bitcoin'
words = words.withColumn("crypto", lit('bitcoin'))


In [0]:
# Create a 6-second window (duration and slide are both 6 seconds) as the basis
# for all subsequent analysis
windowedStream = words.groupBy(F.window('t_stamp', '6 seconds', '6 seconds'))

# Per window: tweet count, mean subjectivity, mean polarity, the bitcoin price
# and the latest timestamp seen in the window
aggregationsStream = windowedStream.agg(
    F.count('tweet_txt').alias('count_tweets'),
    F.avg('subjectivity').alias('sub_avg'),
    F.avg('polarity').alias('pol_avg'),
    get_actual_crypto_price(F.first(F.col('crypto'))).alias('window_price'),
    F.max('t_stamp').alias('t_stamp'),
)


In [0]:
# Show the aggregated streaming DataFrame in the notebook output.
# NOTE(review): `display` is neither defined nor imported in this notebook -
# presumably the built-in provided by the notebook runtime (e.g. Databricks); confirm.
#display(words)
display(aggregationsStream)


window,count_tweets,sub_avg,pol_avg
"List(2021-06-01T20:44:12.000+0000, 2021-06-01T20:44:18.000+0000)",21,0.3112433886244183,0.1777943136791388
"List(2021-06-01T20:43:00.000+0000, 2021-06-01T20:43:06.000+0000)",13,0.2153846122897588,0.1397435917304112
"List(2021-06-01T20:43:30.000+0000, 2021-06-01T20:43:36.000+0000)",15,0.266801347831885,0.0880555561433235
"List(2021-06-01T20:43:36.000+0000, 2021-06-01T20:43:42.000+0000)",17,0.3401960822589257,0.0691176462261115
"List(2021-06-01T20:42:00.000+0000, 2021-06-01T20:42:06.000+0000)",25,0.2020678198337555,0.034510822892189
"List(2021-06-01T20:41:36.000+0000, 2021-06-01T20:41:42.000+0000)",30,0.2202777780592441,0.0352714657783508
"List(2021-06-01T20:41:00.000+0000, 2021-06-01T20:41:06.000+0000)",24,0.2414162472511331,0.0543276518583297
"List(2021-06-01T20:41:48.000+0000, 2021-06-01T20:41:54.000+0000)",20,0.3223106075078249,0.113712123222649
"List(2021-06-01T20:42:48.000+0000, 2021-06-01T20:42:54.000+0000)",14,0.2360389621130057,0.0694805202739579
"List(2021-06-01T20:42:12.000+0000, 2021-06-01T20:42:18.000+0000)",26,0.2026279273514564,0.1153762916532846


In [0]:
# Loading the words to parquet
# this needs to be adapted to aggregated stream / see below
'''
# Minimal Version of code to write to parquet
words.writeStream\
        .format("parquet")\
        .option("path", "dbfs:/FileStore/original")\
        .option("checkpointLocation", "./check")\
        .start()
'''

In [0]:
'''
# Alternative version we tried
words = words.repartition(1)
words.writeStream\
        .queryName("all_tweets")\
        .outputMode("append")\
        .format("parquet")\
        .option("path", "dbfs:/FileStore/original")\
        .option("checkpointLocation", "./check")\
        .trigger(processingTime='30 seconds') \
        .start()
'''

In [0]:
'''
Previous attempt to write the aggregated stream - not relevant at the time as we do not have an aggragated window
# debugging aggregated stream: write to memory
words \
    .writeStream \
    .queryName("aggregates") \
    .outputMode("complete") \
    .format("memory") \
    .start()

spark.sql("select * from aggregates").show()   # interactively query in-memory table
'''


In [0]:
'''
# Testing of UDF to extract datetime from string
df = spark.createDataFrame(
    [('2021-06-01T201633', ), ('2021-06-01T201635', )], 
    ["stamp"]
    )
df.show(truncate=False)

format = '%Y-%m-%dT%H%M%S'
time_udf = udf (lambda x: datetime.strptime(x, format), TimestampType())
df_new = df.withColumn('t_stamp', time_udf('stamp'))
df_new.show(truncate=False)
'''

In [0]:
# %fs rm -r dbfs:/FileStore/original