# Tweets abnehmen und Sentiment analyse anwenden

Die Tweets aus dem Producer werden eingelesen und als PArquet abgelegt.
Darauf wird via *textblob* eine Sentiment analyse gemacht und jedem Tweet ein Rating zugewiesen.

In [0]:
# !pip install textblob

In [0]:
# Libraries einlesen

from pyspark.sql import SparkSession
from pyspark.sql.functions import *   # including udf
from pyspark.sql.functions import desc
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
from pyspark.sql.types import *   
from pyspark.sql import functions as F
from textblob import TextBlob
from datetime import datetime
import random


In [0]:
# Definition der Textklassification via Textblob

@udf
def polarity_detection(text):
    return TextBlob(text).sentiment.polarity

@udf
def subjectivity_detection(text):
    return TextBlob(text).sentiment.subjectivity


In [0]:
# nur für Test: erzeugen eines Dummy Preises für Bitcoint

@udf
def bitcoint_price():
    x = 30000 + random.randrange(100, 1000, 2) 
    return x


In [0]:
# Erstellen eines local/private StreamingContext (SparkContext 'sc' besteht in databricks bereits)

ssc = StreamingContext(sc, 2)   # batch interval = 2
stream = ssc.socketTextStream("localhost", 9997)


'''# Ausgabe des stream in die Konsole für 3 Minuten, danach Abbruch
stream.pprint()
try:
  ssc.start()                             
  ssc.awaitTerminationOrTimeout(180)  # Ausgabe im consumer erst nach timeout möglich (sekunden)
finally:
  ssc.stop(False)'''


In [0]:
# Funktionen um Dstream in Dataframe mit 5-Sekunden-Fenster zu transformieren

# Dstream abnehmen
lines = spark \
        .readStream.format("socket") \
        .option("host", "localhost") \
        .option("port", 9997) \
        .load()

# Aufsplitten des Streams in Haupttweet und Re-Tweets
structuredStream = lines \
        .select(split(lines.value, "_t_end_")[0].alias("text") \
                , split(lines.value, "_t_end_")[1].alias("rt_text")
                , split(lines.value, "_t_end_")[2].alias("rt_text2")
               ) \

# Timestamp einfügen
now = datetime.now()
structuredStream = structuredStream.withColumn("timestamp", lit(str(now)[:19]))

# Sentiment einfügen
structuredStream = structuredStream.withColumn("subjectivity", subjectivity_detection("text").cast('float'))
structuredStream = structuredStream.withColumn("polarity", polarity_detection("text").cast('float'))

# Preis Einfügen
structuredStream = structuredStream.withColumn("price", bitcoint_price().cast('float'))

# Erstellen eines 6 Sekunden-Fensters (als Basis für alle Analysen)
windowedStream = structuredStream \
        .groupBy(window("timestamp", "6 seconds", "6 seconds"))

# Aggregationsfunktion
aggregationsStream = windowedStream \
        .agg(count('timestamp').alias("count_tweets") \
           , avg('subjectivity').alias('sub_avg') \
           , avg('polarity').alias('pol_avg') \
           , avg('price').alias('price_avg') \
           )


window,count_tweets,sub_avg,pol_avg,price_avg
"List(2021-05-24T09:28:05.000+0000, 2021-05-24T09:28:10.000+0000)",1442,0.0724579363224327,0.0292780689375885,30553.332871012484


In [0]:
# Anzeige des aggregierten Streams 
display(aggregationsStream.sort(desc("window.start")))


ab hier ist aller experimentell

In [0]:
# Erweitertes preprocessing 
# könnten wir oben bei der erstellung des lines einsetzen
# putzt zusätzlich noch spezial character raus

def preprocessing(lines):
    words = lines.select(explode(split(lines.value, " _t_end_ ")).alias("word"))
    words = words.na.replace('', None)
    words = words.na.drop()
    words = words.withColumn('word', F.regexp_replace('word', r'http\S+', ''))
    words = words.withColumn('word', F.regexp_replace('word', '@\w+', ''))
    words = words.withColumn('word', F.regexp_replace('word', '#', ''))
    words = words.withColumn('word', F.regexp_replace('word', 'RT', ''))
    words = words.withColumn('word', F.regexp_replace('word', ':', ''))
    return words


In [0]:
# Sink der Daten in ein Parquet file
# dieser SCH.. kommt nicht zum laufen...

query = aggregationsStream \
    .writeStream \
    .queryName("bc_table") \
    .outputMode("append") \
    .format("parquet") \
    .option("path", "dbfs:/FileStore/bd_project") \
    .option("checkpointLocation", "./check") \
    .trigger(processingTime='30 seconds') \
    .start()

query.awaitTermination()

File System functions

In [0]:
%fs ls dbfs:/FileStore/bd_project/

path,name,size
dbfs:/FileStore/bd_project/_spark_metadata/,_spark_metadata/,0


In [0]:
# Directory anzeigen
dbutils.fs.ls("dbfs:/dbfs")

In [0]:
# Folder erstellen
dbutils.fs.mkdirs("dbfs/FileStore/bd_project/test")

In [0]:
# Folder loeschen
dbutils.fs.rm("dbfs:/dbfs")

In [0]:
%fs rm -r local_disk0

In [0]:
%fs ls dbfs:/local_disk0/tmp

path,name,size
dbfs:/local_disk0/tmp/temporary-5546d96c-2644-48f9-8e65-7dbaa6d47ad1/,temporary-5546d96c-2644-48f9-8e65-7dbaa6d47ad1/,0
dbfs:/local_disk0/tmp/temporary-5c61b8a4-c638-4436-81fd-9d865132f3a3/,temporary-5c61b8a4-c638-4436-81fd-9d865132f3a3/,0
dbfs:/local_disk0/tmp/temporary-62083902-41cd-425b-9396-275d5ddf7d17/,temporary-62083902-41cd-425b-9396-275d5ddf7d17/,0
dbfs:/local_disk0/tmp/temporary-a05181ac-72b4-4ecf-b59f-1e65c1dfdddb/,temporary-a05181ac-72b4-4ecf-b59f-1e65c1dfdddb/,0
dbfs:/local_disk0/tmp/temporary-ae243fbf-3f38-4688-af45-e2d43e8db440/,temporary-ae243fbf-3f38-4688-af45-e2d43e8db440/,0
dbfs:/local_disk0/tmp/temporary-b3feb3aa-3441-4f1c-9c10-33062c7dafaa/,temporary-b3feb3aa-3441-4f1c-9c10-33062c7dafaa/,0
dbfs:/local_disk0/tmp/temporary-b679164d-4349-4d1e-a514-fb304bb1925f/,temporary-b679164d-4349-4d1e-a514-fb304bb1925f/,0
dbfs:/local_disk0/tmp/temporary-ef8c0713-44eb-49aa-83c5-53c44e9fbfa8/,temporary-ef8c0713-44eb-49aa-83c5-53c44e9fbfa8/,0
