# Tweets streaming - Consumer
Reads the tweet stream into a DataFrame, adds a timestamp, sentiment and price, and saves the aggregated window.

In [0]:
!pip install textblob
!pip install pycountry
!pip install bs4

In [0]:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SparkSession
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import functions as F
from textblob import TextBlob
import re
import pycountry
from datetime import datetime
import requests
import pandas as pd
import json
import time
from bs4 import BeautifulSoup

In [0]:
# Functions to split the timestamp off the tweet message
@udf
def split_start(text):
    """Return the tweet message without its trailing marker/timestamp suffix.

    The last 27 characters of ``text`` are cut off. In the sample output of this
    notebook that suffix is the `` stop `` marker followed by a 21-character
    timestamp such as ``2021-06-09T202922.087``.

    Args:
        text: One cleaned tweet fragment (a string ending in the suffix above).

    Returns:
        Everything in ``text`` before the final 27 characters.
    """
    suffix_len = 27
    return text[:-suffix_len]
@udf
def split_back(text):
    """Return the timestamp at the end of a tweet fragment, without milliseconds.

    Args:
        text: One cleaned tweet fragment ending in a 21-character timestamp
            (e.g. ``2021-06-09T202922.087`` in the sample output).

    Returns:
        The 17 characters from offset -21 up to (not including) offset -4,
        e.g. ``2021-06-09T202922``.
    """
    # Stop 4 characters before the end so the ".mmm" millisecond part is left out.
    return text[-21:-4]


In [0]:
# Additional cleaning and splitting of the incoming text
def preprocessing(lines):
    """Split the raw socket stream into single tweets and clean their text.

    Steps, in order:
      1. Split every incoming ``value`` on the ``t_end`` delimiter and explode the
         pieces into one row each (column ``word``).
      2. Drop empty pieces.
      3. Strip URLs, ``@mentions``, ``#`` signs, the stand-alone retweet marker
         ``RT`` and colons from ``word``.
      4. Derive ``tweet_txt`` (message) and ``stamp`` (timestamp) from ``word``
         via ``split_start`` / ``split_back``.

    Args:
        lines: DataFrame with a string column ``value``.

    Returns:
        DataFrame with the columns ``word``, ``tweet_txt`` and ``stamp``.
    """
    words = lines.select(explode(split(lines.value, "t_end")).alias("word"))

    # Turn empty fragments into nulls so that a single drop removes them.
    words = words.na.replace('', None).na.drop()

    # (pattern, replacement) pairs, applied top to bottom. Raw strings keep the
    # backslashes intact without relying on Python's invalid-escape fallback.
    substitutions = (
        (r'http\S+', ''),   # URLs
        (r'@\w+', ''),      # @mentions
        ('#', ''),          # hashtag signs (the tag text itself is kept)
        (r'\bRT\b', ''),    # retweet marker as a whole word only, so "SMART" stays intact
        (':', ''),          # colons everywhere, including those in the trailing timestamp
    )
    for pattern, replacement in substitutions:
        words = words.withColumn('word', F.regexp_replace('word', pattern, replacement))

    words = words.withColumn('tweet_txt', split_start('word'))
    words = words.withColumn('stamp', split_back('word'))
    return words
  

In [0]:
# Apply Textblob Sentiment Analysis
def polarity_detection(text):
    """Return the TextBlob sentiment polarity of ``text``.

    Args:
        text: The tweet message to score.

    Returns:
        The ``polarity`` attribute of ``TextBlob(text).sentiment``.
    """
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity
def language_detection(text):
    """Return the English name of the language ``text`` is written in.

    The ISO code reported by ``TextBlob(text).detect_language()`` is looked up in
    ``pycountry`` by its ``alpha_2`` code. Detection is best effort: any ordinary
    error raised while detecting or looking up the language, as well as a code
    that ``pycountry`` does not know, yields the sentinel string instead of
    failing the row.

    Args:
        text: The tweet message to inspect.

    Returns:
        The language name (e.g. ``"English"``), or ``'no language detected'``.
    """
    fallback = 'no language detected'
    try:
        iso_code = TextBlob(text).detect_language()
        language = pycountry.languages.get(alpha_2=iso_code)
    except Exception:
        # Deliberately broad, but no longer a bare `except:` -- KeyboardInterrupt and
        # SystemExit must still be able to stop the worker.
        # NOTE(review): if the installed TextBlob has no `detect_language`, every row
        # ends up here and the later "English" filter drops everything -- confirm.
        return fallback
    # A lookup miss may come back as None; treat it like a failed detection.
    return getattr(language, 'name', fallback)
# Define sentiment score
def getTextAnalysis(polarity):
    """Translate a polarity value into a sentiment label.

    Args:
        polarity: Anything ``float()`` accepts (a number or a numeric string).

    Returns:
        ``"Negative"`` when the value is below zero, ``"Neutral"`` when it equals
        zero, and ``"Positive"`` in every other case.
    """
    score = float(polarity)
    if score < 0:
        return "Negative"
    if score == 0:
        return "Neutral"
    return "Positive"
def text_classification(words):
    """Append the ``polarity``, ``language`` and ``score`` columns to ``words``.

    Each column is produced by wrapping a plain Python function in a Spark UDF
    declared with ``StringType()``:

      * ``polarity`` -- ``polarity_detection`` applied to ``tweet_txt``
      * ``language`` -- ``language_detection`` applied to ``tweet_txt``
      * ``score``    -- ``getTextAnalysis`` applied to ``polarity``

    Args:
        words: DataFrame containing a ``tweet_txt`` column.

    Returns:
        The DataFrame with the three additional columns.
    """
    # (new column, python function, input column) -- order matters because
    # "score" reads the "polarity" column created by the first entry.
    stages = (
        ("polarity", polarity_detection, "tweet_txt"),
        ("language", language_detection, "tweet_txt"),
        ("score", getTextAnalysis, "polarity"),
    )
    for target_col, python_func, source_col in stages:
        as_udf = udf(python_func, StringType())
        words = words.withColumn(target_col, as_udf(source_col))
    return words
  

In [0]:
# Obtain (or reuse) the Spark session for this notebook
spark = (
    SparkSession.builder
    .appName("TwitterSentimentAnalysis")
    .getOrCreate()
)

# Open a streaming source on the local socket that delivers the tweets
lines = (
    spark.readStream
    .format("socket")
    .option("host", "localhost")
    .option("port", 9997)
    .load()
)

# Clean/split the raw text, then add the polarity, language and score columns
words = text_classification(preprocessing(lines))

# Keep only the tweets detected as English
words = words.filter(words.language == "English")


In [0]:
# Show the `words` stream in the cell output.
# NOTE(review): `display` is neither defined nor imported in the visible cells --
# presumably the notebook platform's built-in; confirm.
display(words)
# Alternative view, disabled; `aggregationsStream` is not defined in the visible cells.
#display(aggregationsStream)

word,tweet_txt,stamp,polarity,language,score
You tell me Safemoon ElSalvador Bitcoin stop 2021-06-09T202922.087,You tell me Safemoon ElSalvador Bitcoin,2021-06-09T202922,0.0,English,Neutral
El Salvador just dug a new geo-thermal well capable of generating 95MW from a volcano 0% emissions! Theyre already d stop 2021-06-09T202922.884,El Salvador just dug a new geo-thermal well capable of generating 95MW from a volcano 0% emissions! Theyre already d,2021-06-09T202922,0.1931818181818181,English,Positive
"Our engineers just informed me that they dug a new well, that will provide approximately 95MW of 100% clean, 0 emissions g stop 2021-06-09T202923.473","Our engineers just informed me that they dug a new well, that will provide approximately 95MW of 100% clean, 0 emissions g",2021-06-09T202923,0.0343434343434343,English,Positive
U.K. payments provider Bottlepay offers Europeans low-cost euro transfers and bitcoin payments. stop 2021-06-09T202924.178,U.K. payments provider Bottlepay offers Europeans low-cost euro transfers and bitcoin payments.,2021-06-09T202924,0.1,English,Positive
Now on sale. Bitcoin SAFEMOONARMY stop 2021-06-09T202926.096,Now on sale. Bitcoin SAFEMOONARMY,2021-06-09T202926,0.0,English,Neutral
"Ive just instructed the president of (our state-owned geothermal electric company), to put up a plan to offer fa stop 2021-06-09T202927.433","Ive just instructed the president of (our state-owned geothermal electric company), to put up a plan to offer fa",2021-06-09T202927,0.0,English,Neutral
Volcano-Powered Bitcoin Mining Goes From Twitter Idea to State Policy in ElSalvador stop 2021-06-09T202928.746,Volcano-Powered Bitcoin Mining Goes From Twitter Idea to State Policy in ElSalvador,2021-06-09T202928,0.0,English,Neutral
"You cant be neutral in TwitterBanNigeria. To be neutral is to side with Buhari, the bully with a deflated ego. Take a sta stop 2021-06-09T202929.314","You cant be neutral in TwitterBanNigeria. To be neutral is to side with Buhari, the bully with a deflated ego. Take a sta",2021-06-09T202929,0.0,English,Neutral
nfts nft digitalart art cryptoart l raredigitalart ethereu artoftheday cryptoartist blockchain stop 2021-06-09T202929.957,nfts nft digitalart art cryptoart l raredigitalart ethereu artoftheday cryptoartist blockchain,2021-06-09T202929,0.0,English,Neutral
"Ive just instructed the president of (our state-owned geothermal electric company), to put up a plan to offer fa stop 2021-06-09T202930.605","Ive just instructed the president of (our state-owned geothermal electric company), to put up a plan to offer fa",2021-06-09T202930,0.0,English,Neutral


In [0]:
#%fs rm -r dbfs:/FileStore/original

In [0]:
#%fs ls dbfs:/FileStore/original/tweet_sparksql/