# Tweets streaming - Consumer
Read the stream into a DataFrame, add timestamp, sentiment and price, and save the aggregated window

In [0]:
!pip install textblob
!pip install pycountry
!pip install tensorflow
!pip install keras
!pip install gensim

In [0]:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SparkSession
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import functions as F
from textblob import TextBlob
import re
import pycountry
from datetime import datetime
import requests
import pandas as pd
import time
import tqdm

In [0]:
# Functions to split the timestamp away from the tweet message
@udf
def split_start(text):
    """Return ``text`` without its last 27 characters.

    Used to obtain the tweet message part of a cleaned record.
    NOTE(review): presumably the dropped tail is the timestamp appended by
    the producer -- confirm the offset against the producer's format.
    """
    return text[:-27]
@udf
def split_back(text):
    """Return the 17 characters of ``text`` between offsets -21 and -4.

    Used to obtain the timestamp part of a cleaned record. The slice stops
    4 characters before the end so the timestamp is read without
    milliseconds.
    NOTE(review): the offsets depend on the producer's timestamp format and
    on the colon removal done during preprocessing -- confirm.
    """
    timestamp_part = text[-21:-4]
    return timestamp_part


In [0]:
# Additional cleaning and splitting of the incoming text
def preprocessing(lines):
    """Split the raw stream into records and strip Twitter markup.

    Every row's ``value`` is split on the literal marker ``t_end`` (one
    output row per piece) and empty pieces are dropped. URLs, @-mentions,
    ``#`` signs, the standalone retweet marker ``RT`` and colons are then
    removed, and the cleaned string is cut into a text part and a
    timestamp part.

    Args:
        lines: DataFrame with a string column ``value`` (here the socket
            stream).

    Returns:
        DataFrame with the columns ``word`` (cleaned record),
        ``tweet_txt`` (result of ``split_start``) and ``stamp`` (result of
        ``split_back``).
    """
    words = lines.select(explode(split(lines.value, "t_end")).alias("word"))
    # Turn empty pieces into nulls so that they can be dropped in one step.
    words = words.na.replace('', None).na.drop()

    # Applied in this order. Raw strings keep the regex backslashes intact
    # without relying on Python's handling of unknown escape sequences.
    cleanup_patterns = (
        r'http\S+',  # URLs
        r'@\w+',     # @-mentions
        '#',         # hashtag sign (the tag text itself is kept)
        r'\bRT\b',   # retweet marker as a whole word only, so words that
                     # merely contain "RT" (e.g. "PARTY") stay intact
        ':',         # colons -- this also strips them from the timestamp;
                     # presumably split_back's offsets rely on that
    )
    for pattern in cleanup_patterns:
        words = words.withColumn('word', F.regexp_replace('word', pattern, ''))

    words = words.withColumn('tweet_txt', split_start('word'))
    words = words.withColumn('stamp', split_back('word'))
    return words
  

In [0]:
# Apply Textblob Sentiment Analysis
def polarity_detection(text):
    """Return the TextBlob sentiment polarity computed for ``text``."""
    blob = TextBlob(text)
    return blob.sentiment.polarity
def language_detection(text):
    """Return the language name detected for ``text``.

    The two-letter code reported by TextBlob is resolved to a language name
    through pycountry. Detection is best-effort: any failure (detection
    error, unknown code, unusable input) yields the fallback label instead
    of raising, so a single bad record does not abort the caller.

    NOTE(review): ``TextBlob.detect_language`` was deprecated and later
    removed from TextBlob -- confirm the installed version still provides
    it. If it does not, every record gets the fallback label and a
    downstream filter on "English" will drop everything.

    Args:
        text: Text to analyse.

    Returns:
        The language name, or ``'no language detected'`` when it cannot be
        determined.
    """
    fallback = 'no language detected'
    try:
        iso_code = TextBlob(text).detect_language()
        language = pycountry.languages.get(alpha_2=iso_code)
    except Exception:
        # Deliberately broad (best-effort), but unlike a bare ``except`` it
        # lets KeyboardInterrupt / SystemExit propagate.
        return fallback
    # The lookup may yield no match; treat that like a failed detection.
    name = getattr(language, "name", None)
    return name if name is not None else fallback
# Define sentiment score
def getTextAnalysis(polarity):
    """Map a polarity value to a sentiment label.

    Args:
        polarity: Anything ``float()`` accepts (number or numeric string).

    Returns:
        ``"Negative"`` below zero, ``"Neutral"`` at exactly zero,
        ``"Positive"`` in every other case.
    """
    value = float(polarity)
    if value < 0:
        return "Negative"
    return "Neutral" if value == 0 else "Positive"
def text_classification(words):
    """Add the ``polarity``, ``language`` and ``score`` columns.

    ``polarity`` and ``language`` are computed from ``tweet_txt``; ``score``
    is derived from ``polarity``. All three UDFs are declared with a string
    return type, so ``polarity`` is a string column and ``getTextAnalysis``
    converts it back with ``float()``.

    Args:
        words: DataFrame containing a ``tweet_txt`` column.

    Returns:
        The DataFrame with the three columns appended.
    """
    to_polarity = udf(polarity_detection, StringType())
    to_language = udf(language_detection, StringType())
    to_score = udf(getTextAnalysis, StringType())
    return (
        words.withColumn("polarity", to_polarity("tweet_txt"))
        .withColumn("language", to_language("tweet_txt"))
        .withColumn("score", to_score("polarity"))
    )
  

In [0]:
# Create (or reuse) the Spark session for this notebook
spark = SparkSession.builder.appName("TwitterSentimentAnalysis").getOrCreate()

# Read the tweet data from the local socket
lines = (
    spark.readStream
    .format("socket")
    .option("host", "localhost")
    .option("port", 9997)
    .load()
)

# Clean the raw text and split it into tweet text and timestamp
words = preprocessing(lines)

# Add the polarity, language and score columns
words = text_classification(words)

# Keep only the tweets detected as English
words = words.filter(words.language == "English")


In [0]:
# Render the streaming DataFrame in the notebook output.
# NOTE(review): `display` is not imported in this file; presumably it is the
# helper provided by the notebook environment -- confirm.
display(words)

In [0]:
# Start a streaming query that writes the classified tweets to Parquet files
(
    words.writeStream
    .format("parquet")
    .option("checkpointLocation", "dbfs:/FileStore/checkpoint/")
    .option("path", "dbfs:/FileStore/project/")
    .start()
)

In [0]:
# Read the Parquet files written by the streaming query back as a batch
# (non-streaming) DataFrame.
parqDF = spark.read.parquet("dbfs:/FileStore/project/")

In [0]:
# Register the Parquet data as a temporary SQL view and query every row of it.
parqDF.createOrReplaceTempView("ParquetTable")
parkSQL = spark.sql("select * from ParquetTable")
#parkSQL.collect()
# show() is called without arguments, so it prints only the first rows
# (Spark's default limit), not every entry.
parkSQL.show()

In [0]:
#%fs rm -r dbfs:/FileStore/project/export

In [0]:
#%fs ls dbfs:/FileStore/original/tweet_sparksql/

In [0]:
# Prepare the CSV export: target directory and the two columns to export
# (tweet text and sentiment label).
path = "dbfs:/FileStore/project/export"
export_csv = parkSQL.select("tweet_txt","score")
#export_csv.write.csv(path)

In [0]:
# Export with a header row, using ";" as the field separator
(
    export_csv.write
    .option("header", True)
    .option("delimiter", ";")
    .csv(path)
)

In [0]:
# Export again through a single partition (coalesce(1)) so that the output
# ends up in one part file. The target directory already exists once the
# header/";" export cell has run, and the writer's default save mode raises
# in that case, so the previous export is overwritten explicitly.
export_csv.coalesce(1).write.mode("overwrite").csv(path)

In [0]:
# Read the exported CSV back with the reader's default options and print it.
# NOTE(review): no header/delimiter options are passed here, so files written
# by the header=True / ";" export cell would presumably come back as a single
# column with the header line as a data row -- confirm which export is meant.
df = spark.read.csv(path)
df.show()