# Tweets abnehmen und Sentimentanalyse anwenden

Die Tweets aus dem Producer werden eingelesen und als Parquet abgelegt.
Darauf wird via *textblob* eine Sentimentanalyse gemacht und jedem Tweet ein Rating zugewiesen.

In [0]:
# !pip install textblob

In [0]:
# Libraries einlesen

from pyspark.sql import SparkSession
from pyspark.sql.functions import *   # including udf
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
from pyspark.sql.types import *   
from pyspark.sql import functions as F
from textblob import TextBlob
import random


In [0]:
# Test only: make the incoming socket stream visible in the cell output.

# Streaming context on top of the SparkContext `sc` (presumably supplied by
# the notebook environment), with a batch interval of 5.
ssc = StreamingContext(sc, batchDuration=5)
stream = ssc.socketTextStream(hostname="localhost", port=9997)

# Placeholder: the stream-to-DataFrame conversion and the writeStream call
# would be plugged in here.
stream.pprint()

# Run the computation for a bounded time; according to the original note the
# output only reaches the consumer after the timeout (in seconds) has passed.
try:
    ssc.start()
    ssc.awaitTerminationOrTimeout(200)
finally:
    # Shut down the streaming context without stopping the SparkContext.
    ssc.stop(stopSparkContext=False)


In [0]:
# Definition Tokenizer und Text-Preprocessing

def preprocessing(lines):
    """Split the raw socket text into individual entries and strip noise.

    Args:
        lines: DataFrame with a string column ``value``; every value is split
            on the marker " _t_end_ " (presumably the tweet delimiter emitted
            by the producer).

    Returns:
        DataFrame with a single column ``word`` holding one cleaned entry
        per row.
    """
    # One row per entry; empty fragments become nulls and are dropped.
    words = lines.select(explode(split(lines.value, " _t_end_ ")).alias("word"))
    words = words.na.replace('', None)
    words = words.na.drop()

    # Patterns are removed in this order. Raw strings keep the backslashes
    # intact for the regex engine instead of relying on Python passing
    # unknown escapes through.
    cleanup_patterns = (
        r'http\S+',   # URLs
        r'@\w+',      # user mentions
        r'#',         # hash sign only, the tag text itself is kept
        r'\bRT\b',    # retweet marker as a standalone token, so words that
                      # merely contain "RT" (e.g. "SMART") stay untouched
        r':',         # colons
    )
    for pattern in cleanup_patterns:
        words = words.withColumn('word', F.regexp_replace('word', pattern, ''))
    return words


In [0]:
# Definition der Textklassifikation via Textblob

def polarity_detection(text):
    """Return the TextBlob polarity score of *text*.

    Args:
        text: Text to score, or ``None``.

    Returns:
        ``TextBlob(text).sentiment.polarity``, or ``None`` when *text* is
        ``None``.
    """
    # A SQL NULL reaches a Python UDF as None, and TextBlob refuses
    # non-string input; pass the missing value through instead of failing
    # the whole streaming query on a single bad row.
    if text is None:
        return None
    return TextBlob(text).sentiment.polarity
  
def subjectivity_detection(text):
    """Return the TextBlob subjectivity score of *text*.

    Args:
        text: Text to score, or ``None``.

    Returns:
        ``TextBlob(text).sentiment.subjectivity``, or ``None`` when *text*
        is ``None``.
    """
    # A SQL NULL reaches a Python UDF as None, and TextBlob refuses
    # non-string input; pass the missing value through instead of failing
    # the whole streaming query on a single bad row.
    if text is None:
        return None
    return TextBlob(text).sentiment.subjectivity


In [0]:
# Nur für Tests: Erzeugen eines Dummy-Preises für Bitcoin

def bitcoint_price():
    """Return a dummy Bitcoin price for testing.

    Returns:
        An even integer: 30000 plus a random offset drawn from
        ``range(100, 1000, 2)``, i.e. a value from 30100 to 30998.
    """
    offset = random.randrange(100, 1000, 2)
    return 30000 + offset


In [0]:
# Add sentiment und Preis

def add_newcolumns(words):
    """Attach sentiment scores and a dummy price to every row.

    Args:
        words: DataFrame with a string column ``word``.

    Returns:
        The input DataFrame extended by the columns ``polarity``,
        ``subjectivity`` and ``price``.
    """
    # NOTE(review): all three UDFs declare StringType, so the new columns are
    # string-typed. Switching to numeric types would change the written
    # schema -- confirm with downstream readers before doing so.

    # Polarity detection
    polarity_detection_udf = udf(polarity_detection, StringType())
    words = words.withColumn("polarity", polarity_detection_udf("word"))

    # Subjectivity detection
    subjectivity_detection_udf = udf(subjectivity_detection, StringType())
    words = words.withColumn("subjectivity", subjectivity_detection_udf("word"))

    # Add the price. bitcoint_price() is random, so the UDF is flagged as
    # nondeterministic; otherwise Spark assumes determinism and its optimizer
    # may eliminate or duplicate invocations.
    add_price_udf = udf(bitcoint_price, StringType()).asNondeterministic()
    words = words.withColumn("price", add_price_udf())
    return words
  

In [0]:
# Load the tweets and apply the TextBlob analysis

# Create (or reuse) the Spark session
spark = (
    SparkSession.builder
    .appName("TwitterBitcoin")
    .getOrCreate()
)

# Read the tweet data from the socket
lines = (
    spark.readStream
    .format("socket")
    .option("host", "localhost")
    .option("port", 9997)
    .load()
)

# Split the lines into words (one row per space-separated token)
words = lines.select(
    explode(split(lines.value, " ")).alias("word")
)

# Preprocess the data
# words = preprocessing(lines)

# apply text classification
# words = add_newcolumns(words)


In [0]:
# Bare expression: presumably left as the cell's last statement so the
# notebook echoes the streaming DataFrame (its column names and types).
words

In [0]:
# Append the contents of `words` to Parquet files under the given path,
# triggering a micro-batch every 5 seconds.
query = (
    words.writeStream
    .outputMode("append")
    .format("parquet")
    .option("path", "dbfs:/FileStore/bd_project")
    .option("checkpointLocation", "./check")
    .trigger(processingTime='5 seconds')
    .start()
)

# Block this cell until the streaming query terminates.
query.awaitTermination()

In [0]:
# Write the tweets (with sentiment and price, when those columns have been
# added to `words`) to Parquet files; the trigger fires every 60 seconds.
# NOTE(review): output path and checkpoint location are the same as in the
# unnamed writeStream cell -- confirm the two are never run side by side.

# words = words.repartition(1)
query = (
    words.writeStream
    .queryName("all_tweets")
    .outputMode("append")
    .format("parquet")
    .option("path", "dbfs:/FileStore/bd_project")
    .option("checkpointLocation", "./check")
    .trigger(processingTime='60 seconds')
    .start()
)

# Block this cell until the streaming query terminates.
query.awaitTermination()
