In [None]:
# import findspark
import findspark
# initialize findspark with spark directory
# (raw string: "\P", "\S" and "\s" are invalid escape sequences in a normal string literal)
findspark.init(r"C:\Program Files\Spark\spark-3.3.1-bin-hadoop3")
#findspark.init("/Users/wouterdewitte/spark/")
# import pyspark
import pyspark
# create spark context; getOrCreate reuses an already running context, so this cell
# can be re-run without "Cannot run multiple SparkContexts at once"
sc = pyspark.SparkContext.getOrCreate()
# create spark session
spark = pyspark.sql.SparkSession(sc)

In [None]:
# import packages
import os 
import pickle
import re
from datetime import datetime
import requests
import pytz
import emojis
import pandas as pd
import numpy as np
import ast
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.functions import array_contains
import matplotlib.pyplot as plt 
import emojis

import pandas as pd
import numpy as np
import ast

import pyspark.sql.functions as F
from pyspark.sql.types import *

from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, CrossValidatorModel
import csv
import pandas as pd
import os
import json
import plotly.express as px
from pandas.tseries.holiday import nearest_workday, \
    AbstractHolidayCalendar, Holiday, \
    USMartinLutherKingJr, USPresidentsDay, GoodFriday, \
    USMemorialDay, USLaborDay, USThanksgivingDay
from sklearn.metrics import roc_curve, auc
from sklearn.datasets import make_classification

## General

In this notebook we will build a model that predicts whether the trend of a certain topic goes up or down on a given day, based on Twitter data from that day.

## 1. Import Data

### 1.1 Google Trends

In [None]:
# read the daily Google Trends export (semicolon-separated, header row, column types inferred)
trend_reader = spark.read.options(header=True, inferSchema=True, sep=';')
trend = trend_reader.csv(".././../data/Google_trends/daily_trends.csv")

In [None]:
# display the trend DataFrame object as the cell output
trend 

In [None]:
from pyspark.sql.window import Window

# preview only: the shifted column is shown, `trend` itself is not modified
# (single unpartitioned window ordered by date; offset -1 with default value 0)
date_ordered_window = Window().partitionBy().orderBy(col("date"))
shifted_dependent = lag("dependent_vegan", -1, 0).over(date_ordered_window)
trend.withColumn("dependent_vegan", shifted_dependent).show()

In [None]:
# create SQL view: makes the trend data queryable through spark.sql as "trendSQL"
trend.createOrReplaceTempView("trendSQL")

The binary variable indicates if the trend goes up or down.

### 1.2 Twitter

In [None]:
# define data dir
data_dir = "../../data/Topic/"

# get all twitter files: one full path per entry of the data directory
tweet_files = []
for obs in os.listdir(data_dir):
    tweet_files.append(os.path.join(data_dir, obs))

In [None]:
# import twitter data 
#twitter_df = spark.read.json(tweet_files)

In [None]:
list_hashtags = ["vegan"]

data_dir = ".././../data/Topic/"
tweet_files = [os.path.join(data_dir, obs) for obs in os.listdir(data_dir)]
# keep only the files whose path contains the first hashtag of the list
files_hashtags = [path for path in tweet_files if list_hashtags[0] in path]
# read the selected files with the "multiline" JSON option enabled
json_reader = spark.read.option("multiline","true")
twitter_df = json_reader.json(files_hashtags)
twitter_df.count()

In [None]:
# select interesting features
# (`lang` is kept as well: the text-cleaning section filters on it to retain English tweets,
#  which fails if the column is dropped here)
twitter_df = twitter_df.select(F.col('user.name'),
                                F.col('user.screen_name'),
                                F.col('user.followers_count'),
                                F.col('user.following'),
                                F.col('user.statuses_count'),
                                F.col('user.listed_count'),
                                F.col('created_at'),
                                F.col('full_text'),
                                F.col('lang'),
                                F.col('entities.hashtags'),
                                F.col('favorite_count'),
                                F.col('retweet_count'),
                                F.col('user.friends_count'))

## 2. Data Preprocessing

### 2.1 Check time period

In [None]:
# function to convert Twitter date string format
def getDate(date):
    """Reformat a Twitter ``created_at`` string as ``YYYY-MM-DD HH:MM:SS``.

    Args:
        date: string such as ``"Mon Oct 03 12:34:56 +0000 2022"`` or None.

    Returns:
        The reformatted string, or None when ``date`` is None.
    """
    # nothing to convert for missing dates
    if date is None:
        return None

    # the "+0000" offset is matched literally; the parsed value is then tagged as UTC
    parsed = datetime.strptime(date, '%a %b %d %H:%M:%S +0000 %Y')
    utc_stamp = parsed.replace(tzinfo=pytz.UTC)
    return str(utc_stamp.strftime("%Y-%m-%d %H:%M:%S"))

# UDF declaration: wraps getDate so it can be applied to a Spark column (string result)
date_udf = F.udf(getDate, StringType())

# apply udf: reformat `created_at` and store the result as timestamp column `post_created_at`
twitter_df = twitter_df.withColumn('post_created_at', F.to_utc_timestamp(date_udf("created_at"), "UTC"))

In [None]:
# get first post (aggregate expression: minimum of post_created_at, named "earliest")
first_post = F.min('post_created_at').alias('earliest')
# get latest post (aggregate expression: maximum of post_created_at, named "latest")
latest_post = F.max('post_created_at').alias('latest')
# show tweet period in our dataset (one row with both bounds)
twitter_df.select(first_post, latest_post).show()

### 2.2 Remove retweets and duplicates

In [None]:
# drop all retweets from dataset, i.e. tweets whose text starts with "RT"
is_retweet = F.col("full_text").startswith("RT")
no_retweets_df = twitter_df.filter(~is_retweet)

In [None]:
# sort no_retweets_df on post date in descending order (most recent tweets on top)
no_retweets_sorted_df = no_retweets_df.orderBy(F.col("post_created_at").desc())

In [None]:
# number of observations before dropping duplicates (retweets already removed)
no_retweets_sorted_df.count()

In [None]:
# drop duplicates based on tweet text and the profile it was posted from,
# explicitly keeping the most recent copy of each (full_text, screen_name) pair.
# drop_duplicates() on a sorted DataFrame does not guarantee which duplicate survives,
# so the rows are ranked per pair instead and only rank 1 is kept.
from pyspark.sql.window import Window

most_recent_first = Window.partitionBy("full_text", "screen_name") \
                          .orderBy(F.col("post_created_at").desc())
final_no_duplicates_df = (
    no_retweets_sorted_df
    .withColumn("_duplicate_rank", F.row_number().over(most_recent_first))
    .filter(F.col("_duplicate_rank") == 1)
    .drop("_duplicate_rank")
)

In [None]:
# number of observations after dropping duplicates (compare with the count before)
final_no_duplicates_df.count()

In [None]:
# rename dataframe: `final_twitter_df` is the name used by all following sections
final_twitter_df = final_no_duplicates_df

## 3. Independent Variables

For our independent variables we need to design a pipeline that transforms the data into the desired aggregated metrics per day.

### 3.0 Feature Engineering

#### Define Functions

In [None]:
# define function to count hashtags
def get_hashtags(tokenized_text):
    """Count the tokens that contain a "#" character.

    Args:
        tokenized_text: iterable of string tokens.

    Returns:
        Number of tokens containing at least one "#" (a token with several
        "#" characters is counted once).
    """
    tagged_tokens = [token for token in tokenized_text if "#" in token]
    return len(tagged_tokens)

In [None]:
# define function to count mentions
def get_mentions(tokenized_text):
    """Count the tokens that contain an "@" character.

    Args:
        tokenized_text: iterable of string tokens.

    Returns:
        Number of tokens containing at least one "@".
    """
    has_mention = ["@" in token for token in tokenized_text]
    return has_mention.count(True)

In [None]:
# define function to count exclamation marks
def get_exclamation_marks(tokenized_text):
    """Count the tokens that contain a "!" character.

    Args:
        tokenized_text: iterable of string tokens.

    Returns:
        Number of tokens containing at least one "!" (so "wow!!!" counts once).
    """
    total = 0
    for token in tokenized_text:
        # True adds 1, False adds 0
        total += int("!" in token)
    return total

In [None]:
# define function to count number of emojis used
def emoji_counter(text):
    """Return the number of emojis in ``text`` as reported by ``emojis.count``."""
    return emojis.count(text)

In [None]:
# define function to calculate engagement rate
def engagement_rate(favorite_count, retweet_count, followers_count):
    """Engagement rate of a tweet: (likes + retweets) / followers.

    Args:
        favorite_count: number of likes of the tweet.
        retweet_count: number of retweets of the tweet.
        followers_count: number of followers of the posting account.

    Returns:
        The rate as a float; 0.0 when the account has no followers; None when
        any input is missing.
    """
    # missing counts would raise a TypeError in the arithmetic below
    if favorite_count is None or retweet_count is None or followers_count is None:
        return None

    # always return a float: this function is registered as a DoubleType UDF,
    # and the original int 0 does not match that declared return type
    if followers_count == 0:
        return 0.0

    return float(favorite_count + retweet_count) / followers_count

In [None]:
# register functions as udf
# the four counters are declared with an integer result, the engagement rate with a double result
get_hashtags_UDF = F.udf(get_hashtags, IntegerType())
get_mentions_UDF = F.udf(get_mentions, IntegerType())
get_exclamation_marks_UDF = F.udf(get_exclamation_marks, IntegerType())
emoji_counter_UDF = F.udf(emoji_counter, IntegerType())
engagement_rate_UDF = F.udf(engagement_rate, DoubleType())

In [None]:
# apply functions to create new features
# (the token-based counts work on `text_tokenized`, the tweet text split on single spaces)
final_twitter_df = (
    final_twitter_df
    .withColumn("emoji_count", emoji_counter_UDF("full_text"))
    .withColumn("text_tokenized", F.split("full_text", " "))
    .withColumn("num_words", F.size("text_tokenized"))
    .withColumn("num_hashtags", get_hashtags_UDF("text_tokenized"))
    .withColumn("num_mentions", get_mentions_UDF("text_tokenized"))
    .withColumn("num_exclamation_marks", get_exclamation_marks_UDF("text_tokenized"))
    .withColumn("engagement_rate", engagement_rate_UDF("favorite_count", "retweet_count", "followers_count"))
)

#### Text Cleaning

In [None]:
# filter for english tweets (NOTE: for the assignment you can translate non-english tweets using an API)
is_english = F.col("lang") == "en"
final_twitter_df = final_twitter_df.where(is_english)

In [None]:
# check number of observations left after the language filter
final_twitter_df.count()

In [None]:
# define function to clean text
def clean_text(string):
    """Normalise a tweet text for sentiment analysis.

    Steps: lower-case, remove URLs, replace emojis by their names, remove
    digits and punctuation, and drop words of a single character.

    Args:
        string: raw tweet text, or None.

    Returns:
        The cleaned text, or None when ``string`` is None.
    """
    # a null tweet text reaches the UDF as None; pass it through instead of
    # failing on .lower()
    if string is None:
        return None

    # characters to strip
    NUMBERS = '0123456789'
    PUNCT = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

    # convert text to lower case
    cleaned_string = string.lower()

    # remove URLS
    cleaned_string = re.sub(r'http\S+', ' ', cleaned_string)

    # replace emojis by words; decode() yields ":name_of_emoji:" aliases, so the
    # ":" and "_" delimiters are turned into spaces to get plain words
    cleaned_string = emojis.decode(cleaned_string)
    cleaned_string = cleaned_string.replace(":"," ").replace("_"," ")
    cleaned_string = ' '.join(cleaned_string.split())

    # remove numbers and punctuation in one pass
    cleaned_string = cleaned_string.translate(str.maketrans('', '', NUMBERS + PUNCT))

    # remove words consisting of one character (or less)
    cleaned_string = ' '.join([w for w in cleaned_string.split() if len(w) > 1])

    # return
    return(cleaned_string)

In [None]:
# convert to udf: clean_text applied to a Spark column, returning a string
clean_text_udf = F.udf(clean_text, StringType())

In [None]:
# clean string: add the cleaned version of `full_text` as column `cleaned_text`
final_twitter_df = final_twitter_df.withColumn("cleaned_text", clean_text_udf(F.col("full_text")))

In [None]:
# check: compare raw and cleaned text for five tweets
# (max_colwidth=None so pandas does not truncate the text columns)
pd.set_option('display.max_colwidth', None)
final_twitter_df.select("full_text", "cleaned_text").limit(5).toPandas()

#### Sentiment

VADER sentimental analysis relies on a dictionary that maps lexical features to emotion intensities known as sentiment scores. The sentiment score of a text can be obtained by summing up the intensity of each word in the text.

In [None]:
#using the vaderSentiment package 
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from pyspark.sql.functions import udf
from pyspark.sql.types import *

In [None]:
# define the function to extract the sentiment
# analyzer shared by all calls of get_sentiment; created lazily on first use
_VADER_ANALYZER = None

def get_sentiment(sentence):
    """Return the VADER positive-sentiment score of a sentence.

    Args:
        sentence: text to score, or None.

    Returns:
        The ``"pos"`` entry of ``polarity_scores(sentence)``, or None when
        ``sentence`` is None.
    """
    global _VADER_ANALYZER

    # a null text reaches the UDF as None: propagate it instead of failing
    if sentence is None:
        return None

    # initialize sentiment analyzer once and reuse it; the original built a new
    # SentimentIntensityAnalyzer for every single row
    if _VADER_ANALYZER is None:
        _VADER_ANALYZER = SentimentIntensityAnalyzer()

    # get sentiment dict
    sentiment_dict = _VADER_ANALYZER.polarity_scores(sentence)

    # return positive sentiment score
    return sentiment_dict["pos"]

# register get_sentiment as a Spark UDF with a double result
get_sentiment_udf = udf(get_sentiment, DoubleType())

In [None]:
# score the cleaned text and store the positive VADER score as `sentiment_vader`
final_twitter_df = final_twitter_df.withColumn("sentiment_vader", get_sentiment_udf(F.col("cleaned_text")))

In [None]:
# inspect the cleaned text next to its VADER score
final_twitter_df.select("cleaned_text", "sentiment_vader").show()

#### TextBlob

TextBlob returns polarity and subjectivity of a sentence. 

**Polarity** lies between [-1,1],  -1 defines a negative sentiment and 1 defines a positive sentiment.  

**Subjectivity** quantifies the amount of personal opinion and factual information contained in the text. Subjectivity lies between [0,1]. The higher subjectivity means that the text contains personal opinion rather than factual information. 

In [None]:
#use polarity and subjectivity from TextBlob 
#https://textblob.readthedocs.io/en/dev/
from textblob import TextBlob

# define function to get polarity score of text document
def get_polarity(row):
    """Return the TextBlob polarity (first element of ``sentiment``) of a text."""
    polarity, _subjectivity = TextBlob(row).sentiment
    return polarity
# define function to get subjectivity score of text document
def get_subjectivity(row):
    """Return the TextBlob subjectivity (second element of ``sentiment``) of a text."""
    _polarity, subjectivity = TextBlob(row).sentiment
    return subjectivity
# register both TextBlob helpers as Spark UDFs with a double result
get_polarity_udf = F.udf(get_polarity, DoubleType())
get_subjectivity_udf = F.udf(get_subjectivity, DoubleType())

# score the cleaned text: adds the columns `polarity` and `subjectivity`
final_twitter_df = final_twitter_df.withColumn('polarity', get_polarity_udf(F.col('cleaned_text')))\
        .withColumn('subjectivity', get_subjectivity_udf(F.col('cleaned_text')))

# compare the three sentiment measures side by side
final_twitter_df.select("cleaned_text", "sentiment_vader", "polarity", "subjectivity").show()

In [None]:
# NOTE(review): this repeats the withColumn calls of the previous cell with the same UDFs and
# the same input column, so `polarity` and `subjectivity` are simply overwritten — the cell looks redundant
final_twitter_df = final_twitter_df.withColumn('polarity', get_polarity_udf(F.col('cleaned_text')))\
        .withColumn('subjectivity', get_subjectivity_udf(F.col('cleaned_text')))

In [None]:
# show the text with its VADER, polarity and subjectivity scores
final_twitter_df.select("cleaned_text", "sentiment_vader", "polarity", "subjectivity").show()

In [None]:
# create SQL view: all daily aggregates below query the tweets through "twitterSQL"
final_twitter_df.createOrReplaceTempView("twitterSQL")

### 3.1 Volume of tweets 

In [None]:
# select the relevant data: number of tweets per day
# NOTE(review): 'Y-M-dd' is not the ISO pattern 'yyyy-MM-dd'. Under the LEGACY parser policy set in the
# "show" cell, 'Y' is presumably the week-based year and 'M' an unpadded month, so dates around New Year
# may get the wrong year and the string ORDER BY is not chronological — confirm. If the pattern is changed,
# change it in every daily query at once: the basetable joins these tables on the formatted `date` string.
volume = spark.sql("SELECT DATE_FORMAT(post_created_at, 'Y-M-dd') as date, COUNT(*) as volume \
                                    FROM twitterSQL \
                                    GROUP BY DATE_FORMAT(post_created_at, 'Y-M-dd') \
                                    ORDER BY DATE_FORMAT(post_created_at, 'Y-M-dd')")

In [None]:
# show (up to 100 days)
# the LEGACY time parser policy is set before the lazily evaluated query actually runs;
# NOTE(review): presumably required because the query's 'Y-M-dd' pattern uses the week-based 'Y' letter — confirm
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
volume.show(100)

In [None]:
# create SQL view: daily tweet volume, queryable as "volumeSQL"
volume.createOrReplaceTempView("volumeSQL")

### 3.2 Average likes

We exclude tweets with 0 likes.

In [None]:
# select the relevant data: average number of likes per day, over tweets with at least one like
# NOTE(review): same non-ISO 'Y-M-dd' date pattern as the other daily queries — keep them in sync if it is changed
avg_likes = spark.sql("SELECT DATE_FORMAT(post_created_at, 'Y-M-dd') as date, AVG(favorite_count) as avg_likes \
                           FROM twitterSQL \
                           WHERE favorite_count > 0 \
                           GROUP BY DATE_FORMAT(post_created_at, 'Y-M-dd') \
                           ORDER BY DATE_FORMAT(post_created_at, 'Y-M-dd')")

In [None]:
# show; the LEGACY parser policy is (re)set first so the query's date pattern is accepted
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
avg_likes.show()

In [None]:
# create SQL view: daily average likes, queryable as "avg_likesSQL"
avg_likes.createOrReplaceTempView("avg_likesSQL")

### 3.3 Average Retweets

We exclude tweets with 0 retweets.

In [None]:
# select the relevant data: average number of retweets per day, over tweets with at least one retweet
# NOTE(review): same non-ISO 'Y-M-dd' date pattern as the other daily queries — keep them in sync if it is changed
avg_retweets = spark.sql("SELECT DATE_FORMAT(post_created_at, 'Y-M-dd') as date, AVG(retweet_count) as avg_retweets \
                          FROM twitterSQL \
                          WHERE retweet_count > 0 \
                          GROUP BY DATE_FORMAT(post_created_at, 'Y-M-dd') \
                          ORDER BY DATE_FORMAT(post_created_at, 'Y-M-dd')")

In [None]:
# show; the LEGACY parser policy is (re)set first so the query's date pattern is accepted
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
avg_retweets.show()

In [None]:
# create SQL view: daily average retweets, queryable as "avg_retweetsSQL"
avg_retweets.createOrReplaceTempView("avg_retweetsSQL")

### 3.4 Average Engagement rate

We define the engagement rate of a tweet as the sum of likes and retweets divided by the number of followers of the account that sent out the tweet. For our purpose we will take the average engagement rate per day. We exclude accounts that have no followers and we only take into account tweets that are liked and retweeted at least once.

In [None]:
# select the relevant data: average engagement rate per day
# The WHERE clause implements the definition given in the text of this section: accounts without
# followers are excluded and only tweets that were both liked and retweeted at least once are used
# (the original query had no filter at all).
avg_engagement_rate = spark.sql("SELECT DATE_FORMAT(post_created_at, 'Y-M-dd') as date, AVG(engagement_rate) as avg_engagement_rate \
                                     FROM twitterSQL \
                                     WHERE followers_count > 0 AND favorite_count > 0 AND retweet_count > 0 \
                                     GROUP BY DATE_FORMAT(post_created_at, 'Y-M-dd') \
                                     ORDER BY DATE_FORMAT(post_created_at, 'Y-M-dd')")

In [None]:
# show; the LEGACY parser policy is (re)set first so the query's date pattern is accepted
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
avg_engagement_rate.show()

In [None]:
# create SQL view: daily average engagement rate, queryable as "avg_engagement_rateSQL"
avg_engagement_rate.createOrReplaceTempView("avg_engagement_rateSQL")

### 3.5 Number of influencers

We will calculate how many influencers actively tweeted a certain day. We define an influencer as someone with:
- followers > 1000 
- engagement_rate > 0.20 
- weekly tweet frequency > 5

In [None]:
def get_influencers(follower_count_tresh, eng_rate_tresh, freq_week_tresh, data):
    """Select the accounts that qualify as influencers.

    An account qualifies when its follower count, its average engagement rate
    and its average number of distinct tweets per week all exceed the given
    thresholds.

    Args:
        follower_count_tresh: minimum (exclusive) number of followers.
        eng_rate_tresh: minimum (exclusive) average engagement rate.
        freq_week_tresh: minimum (exclusive) average distinct tweets per week.
        data: tweet DataFrame with the columns screen_name, followers_count,
            favorite_count, retweet_count, post_created_at and full_text.

    Returns:
        DataFrame with one row per influencer and the columns screen_name,
        eng_rate, followers_count and freq. The result is also printed.
    """
    #df
    df = data

    # get all users with their amount of followers
    # (first() takes one of the user's follower counts; which one is not specified)
    influencers = df.groupBy("screen_name") \
                    .agg(F.first("followers_count").alias("followers_count"))

    # average engagement rate for each user
    eng_rate = df.withColumn('eng_rate', ((df['favorite_count'] + df['retweet_count'])/df['followers_count']))

    eng_rate_user = eng_rate.groupBy("screen_name") \
                            .agg(F.avg("eng_rate").alias("eng_rate"))

    # distinct tweets per user and calendar week
    freq_week = df.withColumn("year", F.year(df["post_created_at"])) \
                  .withColumn("week", F.weekofyear("post_created_at"))

    # alias the aggregate directly: the original renamed a column called "count(full_text)",
    # which is not the name Spark generates for countDistinct, so the rename was a silent
    # no-op and the following select of "freq" failed
    freq_week = freq_week.groupBy('screen_name', 'year', 'week') \
                         .agg(F.countDistinct("full_text").alias("freq"))

    # average weekly frequency per user
    freq_week = freq_week.groupBy("screen_name").agg(F.avg("freq").alias("freq"))

    # put the data together
    data_joined = eng_rate_user.join(influencers, "screen_name").join(freq_week, "screen_name")

    # filter the data: all three thresholds must be exceeded
    data_joined = data_joined.filter((data_joined.followers_count > follower_count_tresh) & (data_joined.eng_rate > eng_rate_tresh) & (data_joined.freq > freq_week_tresh))

    # show the data
    data_joined.show()
    return data_joined

In [None]:
# thresholds: followers > 1000, engagement rate > 0.002, weekly tweet frequency > 2
# NOTE(review): the text of this section defines an influencer with engagement_rate > 0.20 and
# weekly frequency > 5; the values passed here differ — confirm which definition is intended
influencers = get_influencers(1000, 0.002, 2, final_twitter_df)

In [None]:
# create SQL view: the selected influencer accounts, queryable as "influencersSQL"
influencers.createOrReplaceTempView("influencersSQL")

In [None]:
# select the relevant data: number of influencers that tweeted on each day
# COUNT(DISTINCT ...) so that an influencer posting several tweets on one day is counted once;
# the original COUNT(b.screen_name) counted influencer tweets instead of influencers.
number_of_influencers = spark.sql(" SELECT DATE_FORMAT(a.post_created_at, 'Y-M-dd') as date, COUNT(DISTINCT b.screen_name) as influencers \
                                    FROM twitterSQL a \
                                    RIGHT OUTER JOIN influencersSQL b ON a.screen_name = b.screen_name\
                                    GROUP BY DATE_FORMAT(post_created_at, 'Y-M-dd') \
                                    ORDER BY DATE_FORMAT(post_created_at, 'Y-M-dd')")

In [None]:
# show; the LEGACY parser policy is (re)set first so the query's date pattern is accepted
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
number_of_influencers.show()

In [None]:
# create SQL view: daily influencer counts, queryable as "number_of_influencersSQL"
number_of_influencers.createOrReplaceTempView("number_of_influencersSQL")

### 3.6 Average Followers 

In [None]:
# select the relevant data: average follower count per day, over accounts with at least one follower
# NOTE(review): same non-ISO 'Y-M-dd' date pattern as the other daily queries — keep them in sync if it is changed
avg_followers = spark.sql("SELECT DATE_FORMAT(post_created_at, 'Y-M-dd') as date, AVG(followers_count) as avg_followers \
                          FROM twitterSQL \
                          WHERE followers_count > 0 \
                          GROUP BY DATE_FORMAT(post_created_at, 'Y-M-dd') \
                          ORDER BY DATE_FORMAT(post_created_at, 'Y-M-dd')")

In [None]:
# show; the LEGACY parser policy is (re)set first so the query's date pattern is accepted
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
avg_followers.show()

In [None]:
# create SQL view: daily average followers, queryable as "followersSQL"
# (the original registered number_of_influencers under this name — a copy-paste slip)
avg_followers.createOrReplaceTempView("followersSQL")

### 3.7 Average Emoji Counts

In [None]:
# select the relevant data: average number of emojis per tweet, per day
# NOTE(review): same non-ISO 'Y-M-dd' date pattern as the other daily queries — keep them in sync if it is changed
avg_emoji = spark.sql("SELECT DATE_FORMAT(post_created_at, 'Y-M-dd') as date, AVG(emoji_count) as avg_emojis \
                          FROM twitterSQL \
                          GROUP BY DATE_FORMAT(post_created_at, 'Y-M-dd') \
                          ORDER BY DATE_FORMAT(post_created_at, 'Y-M-dd')")

In [None]:
# show; the LEGACY parser policy is (re)set first so the query's date pattern is accepted
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
avg_emoji.show()

In [None]:
# create SQL view: daily average emoji count, queryable as "emojiSQL"
# (the original registered number_of_influencers under this name — a copy-paste slip)
avg_emoji.createOrReplaceTempView("emojiSQL")

### 3.8 Average Number of Words

In [None]:
# select the relevant data: average number of words per tweet, per day
# NOTE(review): same non-ISO 'Y-M-dd' date pattern as the other daily queries — keep them in sync if it is changed
avg_words = spark.sql("SELECT DATE_FORMAT(post_created_at, 'Y-M-dd') as date, AVG(num_words) as avg_words \
                          FROM twitterSQL \
                          GROUP BY DATE_FORMAT(post_created_at, 'Y-M-dd') \
                          ORDER BY DATE_FORMAT(post_created_at, 'Y-M-dd')")

In [None]:
# show; the LEGACY parser policy is (re)set first so the query's date pattern is accepted
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
avg_words.show()

In [None]:
# create SQL view: daily average word count, queryable as "wordsSQL"
avg_words.createOrReplaceTempView("wordsSQL")

### 3.9 Average Number of Hashtags

In [None]:
# select the relevant data: average number of hashtag tokens per tweet, per day
# NOTE(review): same non-ISO 'Y-M-dd' date pattern as the other daily queries — keep them in sync if it is changed
avg_hashtags = spark.sql("SELECT DATE_FORMAT(post_created_at, 'Y-M-dd') as date, AVG(num_hashtags) as avg_hashtags \
                          FROM twitterSQL \
                          GROUP BY DATE_FORMAT(post_created_at, 'Y-M-dd') \
                          ORDER BY DATE_FORMAT(post_created_at, 'Y-M-dd')")

In [None]:
# show; the LEGACY parser policy is (re)set first so the query's date pattern is accepted
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
avg_hashtags.show()

In [None]:
# create SQL view: daily average hashtag count, queryable as "hashtagsSQL"
avg_hashtags.createOrReplaceTempView("hashtagsSQL")

### 3.10 Average Number of Mentions

In [None]:
# select the relevant data: average number of mention tokens per tweet, per day
# NOTE(review): same non-ISO 'Y-M-dd' date pattern as the other daily queries — keep them in sync if it is changed
avg_mentions = spark.sql("SELECT DATE_FORMAT(post_created_at, 'Y-M-dd') as date, AVG(num_mentions) as avg_mentions \
                          FROM twitterSQL \
                          GROUP BY DATE_FORMAT(post_created_at, 'Y-M-dd') \
                          ORDER BY DATE_FORMAT(post_created_at, 'Y-M-dd')")

In [None]:
# show; the LEGACY parser policy is (re)set first so the query's date pattern is accepted
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
avg_mentions.show()

In [None]:
# create SQL view: daily average mention count, queryable as "mentionsSQL"
avg_mentions.createOrReplaceTempView("mentionsSQL")

### 3.11 Average Number of Exclamation Marks

In [None]:
# select the relevant data: average number of tokens with an exclamation mark per tweet, per day
# NOTE(review): same non-ISO 'Y-M-dd' date pattern as the other daily queries — keep them in sync if it is changed
avg_marks = spark.sql("SELECT DATE_FORMAT(post_created_at, 'Y-M-dd') as date, AVG(num_exclamation_marks) as avg_exclamation_marks \
                          FROM twitterSQL \
                          GROUP BY DATE_FORMAT(post_created_at, 'Y-M-dd') \
                          ORDER BY DATE_FORMAT(post_created_at, 'Y-M-dd')")

In [None]:
# show; the LEGACY parser policy is (re)set first so the query's date pattern is accepted
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
avg_marks.show()

In [None]:
# create SQL view: daily average exclamation-mark count, queryable as "marksSQL"
avg_marks.createOrReplaceTempView("marksSQL")

### 3.12 Sentiment 

In [None]:
# select the relevant data: average positive VADER score per day
# NOTE(review): same non-ISO 'Y-M-dd' date pattern as the other daily queries — keep them in sync if it is changed
avg_sentiment = spark.sql("SELECT DATE_FORMAT(post_created_at, 'Y-M-dd') as date, AVG(sentiment_vader) as avg_sentiment \
                          FROM twitterSQL \
                          GROUP BY DATE_FORMAT(post_created_at, 'Y-M-dd') \
                          ORDER BY DATE_FORMAT(post_created_at, 'Y-M-dd')")

In [None]:
# show; the LEGACY parser policy is (re)set first so the query's date pattern is accepted
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
avg_sentiment.show()

In [None]:
# create SQL view: daily average VADER score, queryable as "sentimentSQL"
avg_sentiment.createOrReplaceTempView("sentimentSQL")

### 3.13 Polarity

In [None]:
# select the relevant data: average TextBlob polarity per day
# NOTE(review): same non-ISO 'Y-M-dd' date pattern as the other daily queries — keep them in sync if it is changed
avg_polarity = spark.sql("SELECT DATE_FORMAT(post_created_at, 'Y-M-dd') as date, AVG(polarity) as avg_polarity \
                          FROM twitterSQL \
                          GROUP BY DATE_FORMAT(post_created_at, 'Y-M-dd') \
                          ORDER BY DATE_FORMAT(post_created_at, 'Y-M-dd')")

In [None]:
# show; the LEGACY parser policy is (re)set first so the query's date pattern is accepted
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
avg_polarity.show()

In [None]:
# create SQL view: daily average polarity, queryable as "polaritySQL"
avg_polarity.createOrReplaceTempView("polaritySQL")

### 3.14 Subjectivity

In [None]:
# select the relevant data: average TextBlob subjectivity per day
# NOTE(review): same non-ISO 'Y-M-dd' date pattern as the other daily queries — keep them in sync if it is changed
avg_subjectivity = spark.sql("SELECT DATE_FORMAT(post_created_at, 'Y-M-dd') as date, AVG(subjectivity) as avg_subjectivity \
                          FROM twitterSQL \
                          GROUP BY DATE_FORMAT(post_created_at, 'Y-M-dd') \
                          ORDER BY DATE_FORMAT(post_created_at, 'Y-M-dd')")

In [None]:
# show; the LEGACY parser policy is (re)set first so the query's date pattern is accepted
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
avg_subjectivity.show()

In [None]:
# create SQL view: daily average subjectivity, queryable as "subjectivitySQL"
avg_subjectivity.createOrReplaceTempView("subjectivitySQL")

## 4. Basetable

In [None]:
# create basetable: one row per day with all daily Twitter metrics plus the trend data
# daily metric tables, in the order in which they are joined onto the volume table
daily_metric_tables = [
    avg_likes,
    avg_retweets,
    avg_engagement_rate,
    number_of_influencers,
    avg_followers,
    avg_emoji,
    avg_words,
    avg_hashtags,
    avg_mentions,
    avg_marks,
    avg_sentiment,
    avg_polarity,
    avg_subjectivity,
]

# inner joins: only days present in every metric table are kept
basetable = volume
for daily_metric in daily_metric_tables:
    basetable = basetable.join(daily_metric, "date", how="inner")

# left join: days without trend data are kept
basetable = basetable.join(trend, "date", how="left")

In [None]:
# take a look at the basetable (this triggers all the joins above)
basetable.show()

In [None]:
# collect the basetable to the driver; from here on `basetable` is a pandas DataFrame
basetable = basetable.toPandas()

In [None]:
# export basetable as a .json file (one JSON record per line, non-ASCII characters kept as-is)
basetable.to_json("./../../data/basetable_vegan_trend_prediction.json", orient="records", force_ascii=False, lines=True)

#### Read table in

In [None]:
# read in the saved basetable (.json) as a Spark DataFrame; the schema is inferred from the file
basetable_df = spark.read.json("./../../data/basetable_vegan_trend_prediction.json")

In [None]:
# check that the basetable was read back correctly
basetable_df.show()

Look at the total number of observations in the basetable 

In [None]:
# number of rows in the basetable
basetable_df.count()

## 5. split train and test set 

In [None]:
# first rows of the basetable, rendered as a pandas table
basetable_df.toPandas().head()

Look at the total number of observations in the basetable 

In [None]:
# number of rows in the basetable (same count as shown after reading the file)
basetable_df.count()

We cannot use the randomsplit function, because we have time series data, so we have to use another approach
https://towardsdatascience.com/time-series-from-scratch-train-test-splits-and-evaluation-metrics-4fd654de1b37#:~:text=Train%2Ftest%20splits%20in%20time%20series%20In%20machine%20learning%2C,dataset%20for%20testing%20and%20everything%20else%20for%20training.

First we look at the amount of observations that will be assigned to the training set 

In [None]:
# 70% of the rows (rounded down) go to the training set
nr_train = int(basetable_df.count()*0.7)
nr_train

convert the final basetable to a pandas dataset 

In [None]:
basetable_pd = basetable_df.toPandas()
# The train/test split below is positional (first 70% of the rows = train), so the rows must be in
# chronological order. Neither the joins that built the basetable nor the JSON round trip guarantee
# any row order, therefore sort explicitly. The dates are parsed for the sort key because the
# formatted date strings do not sort chronologically as plain text.
basetable_pd = basetable_pd.sort_values("date", key=pd.to_datetime, kind="stable") \
                           .reset_index(drop=True)
basetable_pd.head()

Removing date

In [None]:
# the date itself is not used as a feature
basetable_pd = basetable_pd.drop(['date'], axis='columns')

Split the dataframe into train and test 

In [None]:
# positional split: the first nr_train rows form the training set, the remaining rows the test set
split_position = nr_train
train_pd = basetable_pd.iloc[:split_position]
test_pd = basetable_pd.iloc[split_position:]

In [None]:
# first rows of the training set
train_pd.head()

Convert the pandas dataframe back to a spark dataframe

In [None]:
# Spark version of the training set, used to fit the ML pipelines
train = spark.createDataFrame(train_pd)
train.show()

In [None]:
# Spark version of the test set, used to evaluate the fitted pipelines
test = spark.createDataFrame(test_pd)
test.show()

In [None]:
# get number of observations in each set
print("Number of observations train: %s" %train.count())
print("Number of observations test: %s" %test.count())

In [None]:
# get distribution of dependent variable within each set (row count per value of `dependent`)
train.groupBy("dependent").count().show()
test.groupBy("dependent").count().show()

# 6. Modelling

## 6.1 Pipelines

In [None]:
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml.feature import Tokenizer, StopWordsRemover, Word2Vec
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, GBTClassifier, RandomForestClassifier

In [None]:
# define indexer (IDX)
# stringOrderType="alphabetAsc" fixes the mapping from `dependent` to `label` by value order
# (e.g. 0 -> 0.0, 1 -> 1.0). With the default frequency-based ordering the most frequent class in
# the training data gets index 0, so the meaning of label 1 (and of probability[1], recall and the
# confusion matrix labels) would depend on the class balance.
IDX = StringIndexer(inputCol="dependent", outputCol="label", stringOrderType="alphabetAsc")

In [None]:
# check column names and types before defining the feature columns
train.printSchema()

In [None]:
# define all the numeric features (these are standard-scaled before modelling)
# NOTE(review): the daily `influencers` count computed in section 3.5 is not in this list — confirm that it is left out on purpose
numFeatureCols = ["volume", "avg_likes", "avg_retweets", "avg_engagement_rate", "avg_followers", "avg_emojis", "avg_words", "avg_hashtags", "avg_mentions", "avg_exclamation_marks", "avg_sentiment"]

# define vector assembler: packs the numeric features into the vector column "numeric_features"
VA_num = VectorAssembler(inputCols=numFeatureCols, outputCol="numeric_features")

In [None]:
# define the standard scaler (SS): scales "numeric_features" into "scaled_numeric_features"
SS = StandardScaler(inputCol="numeric_features", outputCol="scaled_numeric_features")

In [None]:
# for random forest we don't scale the features:
# assemble the raw numeric features followed by polarity and subjectivity into "featuresrf"
rf_feature_cols = numFeatureCols + ["avg_polarity", "avg_subjectivity"]
VA_RF = VectorAssembler(inputCols=rf_feature_cols, outputCol="featuresrf")

In [None]:
# define vector assembler (VA_all): scaled numeric features plus the unscaled polarity and
# subjectivity averages, combined into the final "features" vector
VA_all = VectorAssembler(inputCols=["scaled_numeric_features", "avg_polarity", "avg_subjectivity"], outputCol="features")

In [None]:
# define logistic regression model (trained on the scaled "features" vector)
LR = LogisticRegression(featuresCol = "features", labelCol = "label")

In [None]:
# define decision tree model
# (no weightCol: no "weight" column is created anywhere in this notebook, so referencing one
#  makes fit() fail on the training data)
DT = DecisionTreeClassifier(featuresCol = "features", labelCol = "label")

In [None]:
# define gradient-boosted trees model
# (no weightCol: no "weight" column is created anywhere in this notebook, so referencing one
#  makes fit() fail on the training data)
GBT = GBTClassifier(featuresCol = "features", labelCol = "label")

In [None]:
# define random forest model: 200 trees of depth <= 10, every feature considered at each split
# (no weightCol: no "weight" column is created anywhere in this notebook, so referencing one
#  makes fit() fail on the training data)
RF = RandomForestClassifier(featuresCol = "featuresrf", labelCol = "label", numTrees= 200, featureSubsetStrategy= 'all', maxDepth= 10)

## 6.2 Modelling

In [None]:
# define pipeline model and fit on training data
# stages: label indexing -> numeric assembly -> scaling -> final assembly -> logistic regression
lr_stages = [IDX, VA_num, SS, VA_all, LR]
LR_pipeline = Pipeline(stages=lr_stages).fit(train)

In [None]:
# get predictions on test set (logistic regression pipeline)
LR_preds = LR_pipeline.transform(test)

In [None]:
# define pipeline model and fit on training data
# stages: label indexing -> numeric assembly -> scaling -> final assembly -> decision tree
dt_stages = [IDX, VA_num, SS, VA_all, DT]
DT_pipeline = Pipeline(stages=dt_stages).fit(train)

In [None]:
# get predictions on test set (decision tree pipeline)
DT_preds = DT_pipeline.transform(test)

In [None]:
# define pipeline model and fit on training data
# stages: label indexing -> numeric assembly -> scaling -> final assembly -> gradient-boosted trees
gbt_stages = [IDX, VA_num, SS, VA_all, GBT]
GBT_pipeline = Pipeline(stages=gbt_stages).fit(train)

In [None]:
# get predictions on test set (gradient-boosted trees pipeline)
GBT_preds = GBT_pipeline.transform(test)

In [None]:
# define pipeline model and fit on training data
# stages: label indexing -> unscaled feature assembly -> random forest
rf_stages = [IDX, VA_RF, RF]
RF_pipeline = Pipeline(stages=rf_stages).fit(train)

In [None]:
# get predictions on test set (random forest pipeline)
RF_preds = RF_pipeline.transform(test)

# 7. Model Evaluation

In [None]:
# import packages
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# define evaluator (for AUC); the metric is selected per call in the cells below
evaluator_auc = BinaryClassificationEvaluator()

# define evaluator (for other metrics: F1, accuracy, recall)
evaluator_mc = MulticlassClassificationEvaluator()

In [None]:
# get metrics for LR model
# NOTE(review): "recallByLabel" is computed for the evaluator's metricLabel, which is not set here —
# presumably the default label 0.0; confirm that this is the class of interest
lr_f1 = evaluator_mc.evaluate(LR_preds, {evaluator_mc.metricName: "f1"})
lr_accuracy = evaluator_mc.evaluate(LR_preds, {evaluator_mc.metricName: "accuracy"})
lr_recall = evaluator_mc.evaluate(LR_preds, {evaluator_mc.metricName: "recallByLabel"})
lr_auc = evaluator_auc.evaluate(LR_preds, {evaluator_auc.metricName: 'areaUnderROC'})

In [None]:
# get metrics for DT model (same four metrics as for the LR model)
dt_f1 = evaluator_mc.evaluate(DT_preds, {evaluator_mc.metricName: "f1"})
dt_accuracy = evaluator_mc.evaluate(DT_preds, {evaluator_mc.metricName: "accuracy"})
dt_recall = evaluator_mc.evaluate(DT_preds, {evaluator_mc.metricName: "recallByLabel"})
dt_auc = evaluator_auc.evaluate(DT_preds, {evaluator_auc.metricName: 'areaUnderROC'})

In [None]:
# get metrics for GBT model (same four metrics as for the LR model)
gbt_f1 = evaluator_mc.evaluate(GBT_preds, {evaluator_mc.metricName: "f1"})
gbt_accuracy = evaluator_mc.evaluate(GBT_preds, {evaluator_mc.metricName: "accuracy"})
gbt_recall = evaluator_mc.evaluate(GBT_preds, {evaluator_mc.metricName: "recallByLabel"})
gbt_auc = evaluator_auc.evaluate(GBT_preds, {evaluator_auc.metricName: 'areaUnderROC'})

In [None]:
# get metrics for RF model (same four metrics as for the LR model)
rf_f1 = evaluator_mc.evaluate(RF_preds, {evaluator_mc.metricName: "f1"})
rf_accuracy = evaluator_mc.evaluate(RF_preds, {evaluator_mc.metricName: "accuracy"})
rf_recall = evaluator_mc.evaluate(RF_preds, {evaluator_mc.metricName: "recallByLabel"})
rf_auc = evaluator_auc.evaluate(RF_preds, {evaluator_auc.metricName: 'areaUnderROC'})

In [None]:
# compare the four models on the test-set metrics computed above
print("LOGISTIC REGRESSION:")
print('  F1       : %g' % lr_f1)
print('  ACCURACY : %g' % lr_accuracy)
print('  RECALL   : %g' % lr_recall)
print('  AUC      : %g' % lr_auc)
print("------------------")
print("SINGLE DECISION TREE:")
print('  F1       : %g' % dt_f1)
print('  ACCURACY : %g' % dt_accuracy)
print('  RECALL   : %g' % dt_recall)
print('  AUC      : %g' % dt_auc)
print("------------------")
print("GRADIENT-BOOSTED TREES:")
print('  F1       : %g' % gbt_f1)
print('  ACCURACY : %g' % gbt_accuracy)
print('  RECALL   : %g' % gbt_recall)
print('  AUC      : %g' % gbt_auc)
print("------------------")
print("RANDOM FOREST:")
print('  F1       : %g' % rf_f1)
print('  ACCURACY : %g' % rf_accuracy)
print('  RECALL   : %g' % rf_recall)
print('  AUC      : %g' % rf_auc)

# 8 Plot model evaluation

## 8.1 Confusion Matrix 

We plot the confusion matrix for the model with the highest AUC, which is the random forest model.

In [None]:
# import confusion matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# get predictions and labels of the random forest as a pandas frame
# (`label` is cast to float; rows are ordered by predicted class)
preds_and_labels = RF_preds.select(['prediction','label']) \
                                  .withColumn('label', F.col('label').cast(FloatType())) \
                                  .orderBy('prediction') \
                                  .toPandas()

# get confusion matrix (true labels first, predictions second; classes 0 and 1)
cm = confusion_matrix(preds_and_labels["label"], preds_and_labels["prediction"], labels=[0, 1])
# get confusion matrix figure
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
# plot figure
disp.plot()
plt.show()

Calculate sensitivity and specificity

In [None]:
from sklearn.metrics import roc_curve, auc, recall_score
# sensitivity = recall of the positive class (1), specificity = recall of the negative class (0).
# recall_score is called directly: no `metrics` module is imported in this notebook, so the
# original `metrics.recall_score(...)` raised a NameError.
recall_sensitivity = recall_score(preds_and_labels["label"], preds_and_labels["prediction"], pos_label=1)
recall_specificity = recall_score(preds_and_labels["label"], preds_and_labels["prediction"], pos_label=0)
recall_sensitivity, recall_specificity

## 8.2 Model interpretation

We plot the feature importance for the model with the highest AUC, which is the random forest model.

In [None]:
# inspect feature importance of the fitted random forest (last stage of the pipeline)
feature_importance = RF_pipeline.stages[-1].featureImportances.toArray()
# define all the features, in the same order as they were assembled for the random forest
all_feature_names = numFeatureCols + ["avg_polarity", "avg_subjectivity"]
# one bar per feature, labelled with the feature name
bar_positions = range(len(feature_importance))
importance_fig, importance_ax = plt.subplots(figsize=(20, 5))
importance_ax.bar(x=bar_positions, height=feature_importance)
importance_ax.set_xticks(bar_positions)
importance_ax.set_xticklabels(all_feature_names, rotation=90)
plt.show()

## 8.3 ROC Curve

We will plot the ROC curve for the model with the highest AUC, this is the random forest model.

We will use Plotly to plot the ROC curve, so we will have to perform some data type transformations.

In [None]:
# convert the rf predictions to a pandas dataframe (needed for sklearn's roc_curve and Plotly)
RF_preds_pd = RF_preds.toPandas()
RF_preds_pd.head()

In [None]:
# this is a function that will select the probabilities for value 1
def select1(column):
    """Return the element at index 1 of ``column``.

    Applied to a probability vector, this is the probability of class 1.
    """
    return column[1]

In [None]:
# we apply this function to our probabilities: one class-1 probability per test row
class1_probabilities = RF_preds_pd['probability'].apply(select1)
Yscore = class1_probabilities.to_numpy()


In [None]:
# Yscore contains the predicted probabilities of class 1, one per test observation
Yscore

In [None]:
# check the type of Yscore (result of to_numpy()) before passing it to roc_curve
type(Yscore)

In [None]:
# Y is the real value of our dependent.
# It is taken from the prediction frame itself (indexed `label` column) rather than from a second
# collect of `test`: this keeps Y row-aligned with Yscore, and guarantees that class 1 in Y is the
# same class as the one whose probability is stored in Yscore (probability[1]).
Y = RF_preds_pd['label'].to_numpy()
Y

In [None]:
# check the type of Y (result of to_numpy()) before passing it to roc_curve
type(Y)

In [None]:
# we use the roc_curve function to compute the false-positive rate, the true-positive rate and the
# corresponding thresholds from the true values (Y) and the class-1 scores (Yscore)
fpr, tpr, thresholds = roc_curve(Y, Yscore)

In [None]:
# area under the ROC curve, shown in the plot title
roc_auc = auc(fpr, tpr)
roc_axis_labels = dict(x='False Positive Rate', y='True Positive Rate')

# ROC curve as a filled area chart
fig = px.area(
    x=fpr,
    y=tpr,
    title=f'ROC Curve (AUC={roc_auc:.4f})',
    labels=roc_axis_labels,
    width=700,
    height=500,
)

# dashed diagonal from (0, 0) to (1, 1) as reference line
fig.add_shape(type='line', line=dict(dash='dash'), x0=0, x1=1, y0=0, y1=1)

# same scale on both axes, x axis constrained to its domain
fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')
fig.show()