# Data Wrangling on Streaming data
Stream data from Azure Event Hubs into Azure Databricks.
Leverage Spark functionality to transform, clean, and normalize the data
to prepare it for machine learning modeling, tuning, and classification.

In [2]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Event Hubs connection configuration.
# For more details -> https://bit.ly/2Zw4qED
# The connection string is fetched from a Databricks secret scope instead of
# being hard-coded in the notebook.
_connection_string = dbutils.secrets.get(
  scope="mle2ebigdatakv",
  key="twitterstreamingkey",
)
ehConf = {'eventhubs.connectionString': _connection_string}

In [3]:
# Open the Event Hubs source as a streaming DataFrame.
# Bound as inputStream rather than `input` so the Python builtin of that
# name is not shadowed.
inputStream = spark.readStream.format("eventhubs").options(**ehConf).load()

# Same stream with the event "body" column cast to a string.
casted = inputStream.withColumn("body", inputStream["body"].cast("string"))


In [4]:
# when using azure databricks, use this call to visualize the data
#display(inputStream)


# Schema of the parsed event body.
# Fields are listed in the same order as the comma-separated values of the
# body, because the values are mapped to field names by position.
# Every field is nullable.
expectedSchema = (
  StructType()
  .add("user_name", StringType(), True)
  .add("user_location", StringType(), True)
  .add("user_description", StringType(), True)
  .add("user_created", StringType(), True)
  .add("user_followers", FloatType(), True)
  .add("user_friends", FloatType(), True)
  .add("user_favourites", FloatType(), True)
  .add("user_verified", BooleanType(), True)
  .add("date", StringType(), True)
  .add("text", StringType(), True)
  .add("hashtags", StringType(), True)
  .add("source", StringType(), True)
  .add("is_retweet", BooleanType(), True)
)

# Split the event body into an array of raw string values.
# `casted` is the stream whose "body" column has already been cast to string.
# NOTE(review): a plain split on ',' misaligns the columns whenever a value
# (for example the tweet text) itself contains a comma - confirm the format
# the producer writes before relying on this.
body_values = split(casted.body, ',')

# Map the array to one column per schema field, by position. Only the schema
# columns are selected, so no helper columns are left over to drop.
comments_stream = casted.select(
  *[
    body_values.getItem(position).alias(field.name)
    for position, field in enumerate(expectedSchema)
  ]
)

# Set data types
# Every column is cast to the type declared for it in expectedSchema, so the
# schema stays the single source of truth and a type name cannot be mistyped
# in a hand-written cast.
for field in expectedSchema:
  comments_stream = comments_stream.withColumn(
    field.name, comments_stream[field.name].cast(field.dataType)
  )


# Write the processed streaming data to storage as Parquet so the data
# science team can explore it and build ML models.
# The output is partitioned by user_location, for faster queries based on
# geographic location. A micro-batch is triggered every 30 seconds and rows
# are appended uncompressed.
(
  comments_stream.writeStream
  .format("parquet")
  .outputMode("append")
  .partitionBy("user_location")
  .trigger(processingTime="30 seconds")
  .options(
    compression="none",
    checkpointLocation="/mnt/stream/_checkpoints/covid19twitter",
  )
  .start("/mnt/root/COVID19_TWEETS/CURATED/")
)