In [1]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [2]:
# Location of the curated COVID-19 tweet data set (parquet files) that every
# later cell reads from.
dirPath = "/mnt/root/COVID19_TWEETS/CURATED/"

In [3]:
# Batch (non-streaming) read of the curated parquet files into a DataFrame.
tweets = spark.read.format("parquet").load(dirPath)

In [4]:
# Rough size check: how many tweets have user_location exactly equal to 'US'.
(
    tweets
    .filter(col("user_location") == "US")
    .count()
)

In [5]:
# Number of tweets whose text is the empty string.
(
    tweets
    .filter(col("text") == "")
    .count()
)

In [6]:
# Keep only tweets with non-empty text. Rows whose text is NULL are dropped as
# well, because a comparison against NULL is never true in a Spark filter.
tweets = tweets.filter(col("text") != "")

In [7]:
# Sanity check: number of tweets whose text is NULL.
(
    tweets
    .filter(col("text").isNull())
    .count()
)

In [8]:
# Pull a 25-row sample to the driver as a pandas DataFrame for rich display.
(
    tweets
    .limit(25)
    .toPandas()
)

# Leveraging Spark ML:
Data wrangling, feature extraction and normalization:

In [10]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder



# Example of feature extraction: split the tweet text into words, then hash
# those words into the "features" column the estimator consumes.
tokenizer = Tokenizer().setInputCol("text").setOutputCol("words")
hashingTF = HashingTF().setInputCol(tokenizer.getOutputCol()).setOutputCol("features")

# Build a linear regression model whose label is whether the tweet was
# retweeted or not; this helps us understand if more people feel the urge to
# share the message.
# NOTE(review): is_retweet looks like a binary label and LogisticRegression is
# already imported above; confirm that a regression (rather than a classifier)
# is really what is wanted here.
lr = LinearRegression(labelCol="is_retweet", featuresCol="features")

# Construct the Spark ML pipeline: tokenizer -> hashingTF -> lr.
pipeline = Pipeline().setStages([tokenizer, hashingTF, lr])

# Fit all stages on the batch data, then score that same data with the fitted
# pipeline (this is in-sample output, not a held-out evaluation).
model = pipeline.fit(tweets)
prediction = model.transform(tweets)


In [11]:
# Show the actual label next to the model output for every scored tweet.
selected = prediction.select("text", "is_retweet", "prediction")

# toLocalIterator() hands rows to the driver one partition at a time instead of
# materialising the whole result in driver memory the way collect() does.
# The loop variables are deliberately NOT called `prediction`: unpacking into
# that name would replace the `prediction` DataFrame with a float, and this
# cell would then fail when it is run a second time.
for tweet_text, is_retweet, predicted in selected.toLocalIterator():
    print("(%s) --> is_retweet=%s, prediction=%f" % (tweet_text, str(is_retweet), predicted))

In [13]:
from pyspark.sql.functions import split
# Classify real time data:

# Event Hubs connection configuration. The connection string is fetched through
# dbutils.secrets so that it is never written in the notebook itself.
# NOTE(review): newer releases of the Event Hubs Spark connector expect the
# connection string to be encrypted (EventHubsUtils.encrypt) - confirm against
# the connector version installed on the cluster.
ehConf = {
    "eventhubs.connectionString": dbutils.secrets.get(
        scope="mle2ebigdatakv", key="mirrortweetstreamingkey"
    )
}

# Named `raw_events` rather than `input` so the Python builtin input() is not
# shadowed for the rest of the notebook.
raw_events = spark.readStream.format("eventhubs").options(**ehConf).load()

# Cast the "body" column to a string so it can be split.
casted = raw_events.withColumn("body", raw_events["body"].cast("string"))

# The model only needs a "text" column; it is taken as everything before the
# first comma of the body.
# NOTE(review): assumes the tweet text is the first comma-separated field and
# contains no commas itself - verify against the producer's message format.
input_text = casted.withColumn("text", split("body", ",")[0]).select("text")


In [14]:
# Score the live stream with the fitted pipeline and keep only the tweet text
# and the model output for inspection.
prediction_stream = model.transform(input_text)

selected = prediction_stream.select("text", "prediction")

# We print the scored micro-batches to the console to get a feel of the model.
# The console sink does the printing: a streaming DataFrame cannot be iterated
# or collected on the driver, so there is no Python-side print loop.
# Note the Python literal False (the Scala-style `false` is a NameError).
preview_query = (
    selected.writeStream
    .outputMode("append")
    .format("console")
    .option("truncate", False)
    .start()
)

# Bounded preview: awaitTermination() without a timeout blocks forever, which
# would make everything below unreachable. Wait a fixed number of seconds,
# then stop the query so the cell can finish.
PREVIEW_SECONDS = 60
preview_query.awaitTermination(PREVIEW_SECONDS)
preview_query.stop()

# for tracking the model we use mlflow, read more about it here: https://bit.ly/3jUqHUi

# If we are happy with the results, save the fitted pipeline to the file system.
# In PySpark `write` is a method, so it must be called: model.write().save(...).
# The save fails if the target path already exists; use
# model.write().overwrite().save(...) only if replacing V1 is intended.
model.write().save("/mnt/root/COVID19_TWEETS/ML-Models/V1")
