In [19]:
# import findspark
import findspark
# initialize findspark with spark directory
findspark.init("C:\BigData\BigData\spark-3.1.2-bin-hadoop3.2")
# import pyspark
import pyspark
# create spark context
sc = pyspark.SparkContext()
# create spark session 
spark = pyspark.sql.SparkSession(sc)

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=pyspark-shell, master=local[*]) created by __init__ at C:\Users\vikto\AppData\Local\Temp\ipykernel_20996\2064577265.py:8 

In [438]:
# check
spark

In [480]:
# import packages
import os 
import pickle

import re
import datetime
from datetime import date
import requests

import pytz
import emojis

import pandas as pd
import numpy as np

import ast

import pyspark.sql.functions as F
from pyspark.sql.types import *

from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import StopWordsRemover
import emojis 
import tweepy
import csv
import time
import pandas as pd
import os
import json
from pandas.tseries.holiday import nearest_workday, \
    AbstractHolidayCalendar, Holiday, \
    USMartinLutherKingJr, USPresidentsDay, GoodFriday, \
    USMemorialDay, USLaborDay, USThanksgivingDay


# 1. Import Data & Exploration

In [440]:
# import the data 
list_brands = ["healthyfood",
               "healthylifestyle",
               "_vegan_",
               "keto",
               "ketodiet",
               "ketolifestyle",
               "veganism",
               "vegetarian"]
from re import search



data_dir = ".././../data/Topic_vegan/"
tweet_files = [os.path.join(data_dir, obs) for obs in os.listdir(data_dir)]



files_brand = [file for file in tweet_files if (file.find(list_brands[2]) != -1)]
files_brand               
               
df_json = spark.read.option("multiline","true").json(files_brand)  
df_json.count()

1595676

In [443]:
# check the schema of our json dataframe
df_json.printSchema()

root
 |-- contributors: string (nullable = true)
 |-- coordinates: struct (nullable = true)
 |    |-- coordinates: array (nullable = true)
 |    |    |-- element: double (containsNull = true)
 |    |-- type: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- display_text_range: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- entities: struct (nullable = true)
 |    |-- hashtags: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- indices: array (nullable = true)
 |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |-- text: string (nullable = true)
 |    |-- media: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- display_url: string (nullable = true)
 |    |    |    |-- expanded_url: string (nullable = true)
 |    |    |    |-- id: long (nullable = true)
 |    |    |    |-- id_str: string (nullable = true)
 |    |    |    |

In [444]:
# select interesting features
df = df_json.select(F.col("user.name"),
                                F.col("user.screen_name"),
                                F.col("created_at"), 
                                F.col("full_text"),
                                F.col("entities.hashtags"),
                                F.col("lang"),
                                F.col("favorite_count"),
                                F.col("retweet_count"),
                                F.col("user.followers_count"),
                                F.col("user.friends_count"),
                                F.col("user.favourites_count"),
                                F.col("entities.urls"),
                                F.col("entities.symbols"))

# 2. Preprocess Data

## 2.1 Check Time Period 

In [448]:
# https://developer.twitter.com/en/docs/twitter-ads-api/timezones
# function to convert Twitter date string format
def getDate(date):
    if date is not None:
        return str(datetime.strptime(date,'%a %b %d %H:%M:%S +0000 %Y').replace(tzinfo=pytz.UTC).strftime("%Y-%m-%d %H:%M:%S"))
    else:
        return None

# UDF declaration
date_udf = F.udf(getDate, StringType())

# apply udf
df = df.withColumn('post_created_at', F.to_utc_timestamp(date_udf("created_at"), "UTC"))

In [449]:
# show
df.select("created_at", "post_created_at").show(5, truncate=False)

+------------------------------+-------------------+
|created_at                    |post_created_at    |
+------------------------------+-------------------+
|Tue Sep 13 22:32:32 +0000 2022|2022-09-13 22:32:32|
|Tue Sep 13 22:32:26 +0000 2022|2022-09-13 22:32:26|
|Tue Sep 13 22:32:26 +0000 2022|2022-09-13 22:32:26|
|Tue Sep 13 22:32:16 +0000 2022|2022-09-13 22:32:16|
|Tue Sep 13 22:32:09 +0000 2022|2022-09-13 22:32:09|
+------------------------------+-------------------+
only showing top 5 rows



In [450]:
#drop duplicates and retweets 
df = df.filter(~F.col("full_text").startswith("RT"))\
                        .drop_duplicates()
#sorting such when dropping later we only keep the most recent post 
df = df.sort("post_created_at", ascending=False)
#removing spam accounts 
df = df.drop_duplicates(["full_text", "screen_name"])

#df.printSchema()
#df.count() #1340938

## 2.2 Feature Engineering

In [451]:
# define function to count hashtags
def get_hashtags(tokenized_text):
    counter = 0
    for word in tokenized_text:
        if "#" in word:
            counter += 1
    return(counter)

In [452]:
# define function to count mentions
def get_mentions(tokenized_text):
    counter = 0
    for word in tokenized_text:
        if "@" in word:
            counter += 1
    return(counter)

In [453]:
# define function to count exclamation marks
def get_exclamation_marks(tokenized_text):
    counter = 0
    for word in tokenized_text:
        if "!" in word:
            counter += 1
    return(counter)

In [454]:
# define function to count number of emojis used
def emoji_counter(text):
    nr_emojis = emojis.count(text)
    return(nr_emojis)

In [455]:
# define function to calculate engagement rate
def engagement_rate(favorite_count, retweet_count, followers_count):
    if(followers_count == 0):
        eng_rate = 0
    else:
        eng_rate = (favorite_count + retweet_count)/followers_count
    
    return eng_rate

In [456]:
# register functions as udf
get_hashtags_UDF = F.udf(get_hashtags, IntegerType())
get_mentions_UDF = F.udf(get_mentions, IntegerType())
get_exclamation_marks_UDF = F.udf(get_exclamation_marks, IntegerType())
emoji_counter_UDF = F.udf(emoji_counter, IntegerType())
engagement_rate_UDF = F.udf(engagement_rate, DoubleType())

In [457]:
# apply functions to create new features
df = df.withColumn("emoji_count", emoji_counter_UDF("full_text")) \
        .withColumn("text_tokenized", F.split("full_text", " ")) \
        .withColumn("num_words", F.size("text_tokenized")) \
        .withColumn("num_hashtags", get_hashtags_UDF("text_tokenized")) \
        .withColumn("num_mentions", get_mentions_UDF("text_tokenized")) \
        .withColumn("num_exclamation_marks", get_exclamation_marks_UDF("text_tokenized")) \
        .withColumn("engagement_rate", engagement_rate_UDF("favorite_count", "retweet_count", "followers_count"))

# show
df.select("full_text", "emoji_count", "num_words", "num_hashtags", "num_mentions", "num_exclamation_marks", "engagement_rate").toPandas().tail(5)

Unnamed: 0,full_text,emoji_count,num_words,num_hashtags,num_mentions,num_exclamation_marks,engagement_rate
592246,Check out Maddie Payton's video! #TikTok https://t.co/g46yyGy77U #growth #growthmindset #believe #retweet #socialmedia #marketing #contentcreator #influencer #wcw #vegan #tbt #disney #Disneytiktok #affiliatemarketing #trending #trends #fyp #marketing #lifestyle #youtube #hashtag,0,28,22,0,1,0.0
592247,????????? https://t.co/rIWy3nd1tP,0,2,0,0,0,0.0
592248,@YourHostEdge I have sensory issues related to texture and taste that would make it exceedingly difficult to completely switch my diet.\n\nI respect other people’s decisions to go vegan but I don’t see why I should have to go thru it when it could literally cause me pain.,0,47,0,1,0,0.0
592249,My Road To Being A Vegan | Alan Cox | TEDxLambethSalon\nhttps://t.co/VPfV8rnQjx,0,11,0,0,0,0.0
592250,AD|PR - Looking for new beauty products? Read my favourite vegan beauty products\n👇Read More👇\n\nhttps://t.co/YcECxU1IcU #thegirlgang #lbloggers #blogginggals #GRLPWR @sotonbloggers #bloggerstribe #influencerrt #theclqrt #bloggershutrt #teacupclub #bloggerssparkle #cosybloggerclub https://t.co/dRfI2eVvdc,2,27,11,1,0,9.1e-05


# 3. Text Cleaning 

In [458]:
# filter for english tweets (NOTE: for the assignment you can translate non-english tweets using an API)
df = df.filter(F.col("lang") == "en")

In [85]:
# check number of observations
df.count()

439169

In [459]:
# define function to clean text
def clean_text(string):
    
    # define numbers
    NUMBERS = '0123456789'
    PUNCT = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    
    # convert text to lower case
    cleaned_string = string.lower()
    
    # remove URLS
    cleaned_string = re.sub(r'http\S+', ' ', cleaned_string)
    
    # replace emojis by words
    cleaned_string = emoji.demojize(cleaned_string)
    cleaned_string = cleaned_string.replace(":"," ").replace("_"," ")
    cleaned_string = ' '.join(cleaned_string.split())
    
    # remove numbers
    cleaned_string = "".join([char for char in cleaned_string if char not in NUMBERS])
    
    # remove punctuation
    cleaned_string = "".join([char for char in cleaned_string if char not in PUNCT])
    
    # remove words conisting of one character (or less)
    cleaned_string = ' '.join([w for w in cleaned_string.split() if len(w) > 1])
    
    # return
    return(cleaned_string)

In [460]:
# convert to udf
clean_text_udf = F.udf(clean_text, StringType())

In [461]:
# clean string
df = df.withColumn("cleaned_text", clean_text_udf(F.col("full_text")))

In [128]:
# check
pd.set_option('display.max_colwidth', None)
df.select("full_text", "cleaned_text").limit(5).toPandas()

Unnamed: 0,full_text,cleaned_text
0,"""A lot of people say they can't eat vegan, but I'm just like them – and if I can do it they can do it."" Nichole Lewis who is a 26-year-old mom lost 120 pounds and improved her #mentalhealth by eating a plant-based diet. Here are her weight loss secrets: https://t.co/4RAT8rwqTO",lot of people say they cant eat vegan but im just like them and if can do it they can do it nichole lewis who is yearold mom lost pounds and improved her mentalhealth by eating plantbased diet here are her weight loss secrets
1,"""By now, your [vegan/vegetarian] lifestyle may be so easy to maintain that you don't even have to think about it...All of this can change when you study abroad."" Check out Hannah's tips on staying vegan while abroad: https://t.co/fIy45ntvTD",by now your veganvegetarian lifestyle may be so easy to maintain that you dont even have to think about itall of this can change when you study abroad check out hannahs tips on staying vegan while abroad
2,"""Don't stay in the sun""\n""Use sunscreen""\n""Red meat is bad for you""\n""Vegan diet is healthy""\n""Seed oil is healthy""\n""Inflation is a good thing""\n""#Bitcoin is bad for the enviroment""\n\nWhat other things does the media portray as positive while being extremely damaging?",dont stay in the sun use sunscreen red meat is bad for you vegan diet is healthy seed oil is healthy inflation is good thing bitcoin is bad for the enviroment what other things does the media portray as positive while being extremely damaging
3,"""Heroes sacrifice for causes; they do things that others hide from. I may not be some great hero, but I won't hide from this"" - Brandon Mull.\n#Antispeciesism \n#Animalliberation \n#Vegan\n#Love https://t.co/1hK7FXsgRo",heroes sacrifice for causes they do things that others hide from may not be some great hero but wont hide from this brandon mull antispeciesism animalliberation vegan love
4,"""Last year over a million people left the same suicide note... SHOPPING LIST: Butter, Eggs, Milk, Cheese, Beef, Chicken, and Bacon."" - Physicians Committee for Responsible Medicine\n#vegan",last year over million people left the same suicide note shopping list butter eggs milk cheese beef chicken and bacon physicians committee for responsible medicine vegan


# 4. Sentiment 

## 4.1 The Vader Package

VADER sentimental analysis relies on a dictionary that maps lexical features to emotion intensities known as sentiment scores. The sentiment score of a text can be obtained by summing up the intensity of each word in the text.

In [462]:
#using the vaderSentiment package 
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from pyspark.sql.functions import udf
from pyspark.sql.types import *

In [463]:
# define the function to extract the sentiment
def get_sentiment(sentence):

    # initialize sentiment analyzer
    sid_obj = SentimentIntensityAnalyzer()

    # get sentiment dict
    sentiment_dict = sid_obj.polarity_scores(sentence)
    
    # get positive sentiment score
    pos_sentiment = sentiment_dict["pos"]
    
    # return positive sentiment score
    return(pos_sentiment)

get_sentiment_udf = udf(get_sentiment, DoubleType())

In [464]:
df = df.withColumn("sentiment_vader", get_sentiment_udf(F.col("cleaned_text")))

In [153]:
df.select("cleaned_text", "sentiment_vader").show()

+--------------------+---------------+
|        cleaned_text|sentiment_vader|
+--------------------+---------------+
|lot of people say...|          0.139|
|by now your vegan...|          0.094|
|dont stay in the ...|            0.2|
|heroes sacrifice ...|          0.321|
|last year over mi...|          0.075|
|monday mint motiv...|          0.098|
|no nut november m...|          0.238|
|sir why did you s...|          0.263|
|there are many go...|          0.114|
|vegan diets provi...|          0.314|
|we were given veg...|           0.16|
|why are vegans so...|          0.164|
|would you ever go...|            0.0|
|you doing anythin...|          0.172|
|defeating russia ...|            0.0|
|sustainabledecisi...|          0.061|
|arfeatures cai st...|            0.0|
|bgan punks are no...|            0.0|
|cbdoil for anxiet...|          0.372|
|didyouknow as lon...|            0.0|
+--------------------+---------------+
only showing top 20 rows



## 4.2 TextBlob Package 

TextBlob returns polarity and subjectivity of a sentence. 

**Polarity** lies between [-1,1],  -1 defines a negative sentiment and 1 defines a positive sentiment.  

**Subjectivity** quantifies the amount of personal opinion and factual information contained in the text. Subjectivity lies between [0,1]. The higher subjectivity means that the text contains personal opinion rather than factual information. 

In [465]:
#use polarity and subjectivity from TextBlob 
#https://textblob.readthedocs.io/en/dev/
from textblob import TextBlob

In [466]:
# define function to get polarity score of text document
def get_polarity(row):
    textBlob_review = TextBlob(row)
    return textBlob_review.sentiment[0]
# define function to get subjectivity score of text document
def get_subjectivity(row):
    textBlob_review = TextBlob(row)
    return textBlob_review.sentiment[1]
get_polarity_udf = F.udf(get_polarity, DoubleType())
get_subjectivity_udf = F.udf(get_subjectivity, DoubleType())

In [467]:
df = df.withColumn('polarity', get_polarity_udf(F.col('cleaned_text')))\
        .withColumn('subjectivity', get_subjectivity_udf(F.col('cleaned_text')))

In [157]:
df.select("cleaned_text", "sentiment_vader", "polarity", "subjectivity").show()

+--------------------+---------------+--------------------+-------------------+
|        cleaned_text|sentiment_vader|            polarity|       subjectivity|
+--------------------+---------------+--------------------+-------------------+
|lot of people say...|          0.139|                 0.0|                0.0|
|by now your vegan...|          0.094| 0.43333333333333335| 0.8333333333333334|
|dont stay in the ...|            0.2|0.030808080808080826| 0.5393097643097643|
|heroes sacrifice ...|          0.321|                0.65|              0.675|
|last year over mi...|          0.075|-0.07999999999999999| 0.3383333333333333|
|monday mint motiv...|          0.098|                 0.0|                0.0|
|no nut november m...|          0.238|            0.221875|0.47187500000000004|
|sir why did you s...|          0.263|  0.6166666666666667| 0.4666666666666666|
|there are many go...|          0.114|  0.2785714285714286|               0.55|
|vegan diets provi...|          0.314|  

# 5. Basetable 

In [468]:
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- screen_name: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- full_text: string (nullable = true)
 |-- hashtags: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- indices: array (nullable = true)
 |    |    |    |-- element: long (containsNull = true)
 |    |    |-- text: string (nullable = true)
 |-- lang: string (nullable = true)
 |-- favorite_count: long (nullable = true)
 |-- retweet_count: long (nullable = true)
 |-- followers_count: long (nullable = true)
 |-- friends_count: long (nullable = true)
 |-- favourites_count: long (nullable = true)
 |-- urls: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- display_url: string (nullable = true)
 |    |    |-- expanded_url: string (nullable = true)
 |    |    |-- indices: array (nullable = true)
 |    |    |    |-- element: long (containsNull = true)
 |    |    |-- url: string (nullable = true)


## 5.1 Adjust dates to opening hours stock exchange

The class below contains all the official holidays where the stockmarket is closed, this is needed to be able to assign the tweets that are posted on a holiday, to the next day

In [475]:
class USTradingHolidaysCalendar(AbstractHolidayCalendar):
    rules = [
        Holiday(
            'NewYearsDay',
            month=1,
            day=1,
            observance=nearest_workday
        ),
        USMartinLutherKingJr,
        USPresidentsDay,
        GoodFriday,
        USMemorialDay,
        Holiday(
            'Juneteenth National Independence Day',
            month=6,
            day=19,
            start_date='2021-06-18',
            observance=nearest_workday,
        ),
        Holiday(
            'USIndependenceDay',
            month=7,
            day=4,
            observance=nearest_workday
        ),
        USLaborDay,
        USThanksgivingDay,
        Holiday(
            'Christmas',
            month=12,
            day=25,
            observance=nearest_workday
        ),
    ]

This function calls the US holidays class and checks whether the tweet was posted on an holiday on which the stock market was closed. If this is the case the tweet is assigned to the next day that's not a holiday or weekend

In [482]:
def check_holidays(tweet_date):
    # get the holidays for the periods when the tweets where placed 
    cal = USTradingHolidaysCalendar()
    holidays = cal.holidays(start='2021-10-11', end='2022-10-12')
    tweet_date = pd.to_datetime(tweet_date)
    # if the tweet is placed on a holiday, the tweet will be assigned to the next day that's not in the weekend
    if(tweet_date in holidays):
        tweet_date = tweet_date + datetime.timedelta(days = 1)
        if(tweet_date.isoweekday() == 6):
            tweet_date = tweet_date + datetime.timedelta(days = 2)
        elif(tweet_date.isoweekday() == 6):
            tweet_date = tweet_date + datetime.timedelta(days = 1)
    return tweet_date

This function takes the opening hours of the stock exchange into account for the tweets. When a tweet is posted before the opening hours of the stock market, the tweet is assigned to the day iself. When a tweet is posted the monday, tuesday, wednesday or thursday after the opening hours it is assigned to the next day. When a tweet is posted on friday after the openingshours it is assigned to monday. As well as if the tweet was posted on a saturday or sunday. 

The case when a tweet is posted after the opening hours on friday is a special case in our code. When it goes throug the first if else it will be assigned to the next day, so to saturday. Than when it goes to the second if it is assigned to the monday

In [477]:
def opening_hours(tweet_date):
    tweet_date = pd.to_datetime(tweet_date)
    open_NYSE = tweet_date.replace(hour=14, minute=30, second=0, microsecond=0)
    close_NYSE = tweet_date.replace(hour=21, minute=0, second=0, microsecond=0)
    # if the tweet is posted before the opening hours of the stock market, it is assigned to the next day
    if (tweet_date < open_NYSE):
        dependent_date = tweet_date.date()
    # else when the tweet is posted after the opening hours, it is assigned to the next day
    else:
        dependent_date = tweet_date.date()
        dependent_date = dependent_date + datetime.timedelta(days = 1)
    # if the tweet is posted on a saturday, it is assigned to the next monday
    if (dependent_date.isoweekday() == 6):
        dependent_date = dependent_date + datetime.timedelta(days = 2)
    # if the tweet is posted on a sunday, it is assigned to the next monday 
    if (dependent_date.isoweekday() == 7):
        dependent_date = dependent_date + datetime.timedelta(days = 1)
    return dependent_date

Now we will apply the functions on the dataset

In [483]:
opening_hours_udf = udf(opening_hours, DateType())
check_holidays_udf = udf(check_holidays, DateType())
df = df.withColumn('date_with_holidays', opening_hours_udf(F.col('created_at')))
df = df.withColumn('date', check_holidays_udf(F.col('date_with_holidays')))
df.select('date', 'created_at').show()

+----------+--------------------+
|      date|          created_at|
+----------+--------------------+
|2021-11-03|Tue Nov 02 18:33:...|
|2022-03-11|Thu Mar 10 14:30:...|
|2022-04-27|Tue Apr 26 18:00:...|
|2022-06-21|Tue Jun 21 11:34:...|
|2022-05-02|Sat Apr 30 18:58:...|
|2021-12-14|Mon Dec 13 18:22:...|
|2021-11-03|Tue Nov 02 20:54:...|
|2022-05-02|Fri Apr 29 18:40:...|
|2022-01-12|Tue Jan 11 20:05:...|
|2022-04-27|Wed Apr 27 07:09:...|
|2022-08-01|Sun Jul 31 16:54:...|
|2022-08-11|Wed Aug 10 16:53:...|
|2022-08-03|Tue Aug 02 17:00:...|
|2021-11-02|Mon Nov 01 21:57:...|
|2022-03-09|Tue Mar 08 18:19:...|
|2022-03-31|Thu Mar 31 12:30:...|
|2022-06-29|Wed Jun 29 12:30:...|
|2022-01-03|Thu Dec 30 21:56:...|
|2022-03-14|Fri Mar 11 16:15:...|
|2022-05-17|Mon May 16 15:25:...|
+----------+--------------------+
only showing top 20 rows



## 5.2 Dependent Variable

Import the stock information

In [485]:
stock_information = pd.read_excel("./../../data/stock_information.xlsx")

Here we create the dependent, which is the percentage of change compared to yesterday

In [486]:
stock_information["close_yesterday"] = stock_information["close"].shift(1)
#stock_information = stock_information[stock_information.date != "2021-10-11"]
stock_information["dependent"] = (stock_information["close"] - stock_information["close_yesterday"])/stock_information["close_yesterday"]
display(stock_information)

Unnamed: 0,symbol,date,open,high,low,close,volume,adjusted,close_yesterday,dependent
0,VEGN,2021-10-11,40.020000,40.161999,39.759998,39.759998,5000,39.524410,,
1,VEGN,2021-10-12,39.840000,39.840000,39.669998,39.724998,4900,39.489620,39.759998,-0.000880
2,VEGN,2021-10-13,40.070000,40.099998,39.810001,39.924000,16000,39.687439,39.724998,0.005009
3,VEGN,2021-10-14,40.340000,40.765999,40.340000,40.765999,3100,40.524448,39.924000,0.021090
4,VEGN,2021-10-15,41.060001,41.150002,41.060001,41.122002,1700,40.878345,40.765999,0.008733
...,...,...,...,...,...,...,...,...,...,...
248,VEGN,2022-10-05,31.469999,31.628000,31.120001,31.628000,1800,31.628000,31.719999,-0.002900
249,VEGN,2022-10-06,31.559999,31.600000,31.309999,31.309999,1200,31.309999,31.628000,-0.010054
250,VEGN,2022-10-07,30.299999,30.299999,30.180000,30.239000,1300,30.239000,31.309999,-0.034206
251,VEGN,2022-10-10,30.350000,30.350000,29.629999,29.830000,4500,29.830000,30.239000,-0.013526


This is a function that creates a binary variable. The variable will be 0 if the etf decreased or stayed on the same level, and will be 1 if the etf increased

In [487]:
def make_dependent(x):
    if x > 0:
        x = 1
    else:
        x = 0
    return x

Apply the function on the data

In [488]:
stock_information["dependent"] = stock_information["dependent"].apply(lambda x: make_dependent(x))
display(stock_information)

Unnamed: 0,symbol,date,open,high,low,close,volume,adjusted,close_yesterday,dependent
0,VEGN,2021-10-11,40.020000,40.161999,39.759998,39.759998,5000,39.524410,,0
1,VEGN,2021-10-12,39.840000,39.840000,39.669998,39.724998,4900,39.489620,39.759998,0
2,VEGN,2021-10-13,40.070000,40.099998,39.810001,39.924000,16000,39.687439,39.724998,1
3,VEGN,2021-10-14,40.340000,40.765999,40.340000,40.765999,3100,40.524448,39.924000,1
4,VEGN,2021-10-15,41.060001,41.150002,41.060001,41.122002,1700,40.878345,40.765999,1
...,...,...,...,...,...,...,...,...,...,...
248,VEGN,2022-10-05,31.469999,31.628000,31.120001,31.628000,1800,31.628000,31.719999,0
249,VEGN,2022-10-06,31.559999,31.600000,31.309999,31.309999,1200,31.309999,31.628000,0
250,VEGN,2022-10-07,30.299999,30.299999,30.180000,30.239000,1300,30.239000,31.309999,0
251,VEGN,2022-10-10,30.350000,30.350000,29.629999,29.830000,4500,29.830000,30.239000,0


In [489]:
basetable_dep = stock_information[['date', 'dependent']]
basetable_dep.head()

Unnamed: 0,date,dependent
0,2021-10-11,0
1,2021-10-12,0
2,2021-10-13,1
3,2021-10-14,1
4,2021-10-15,1


## 5.3 Select Features and Group By Name

In [385]:
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- screen_name: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- full_text: string (nullable = true)
 |-- hashtags: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- indices: array (nullable = true)
 |    |    |    |-- element: long (containsNull = true)
 |    |    |-- text: string (nullable = true)
 |-- lang: string (nullable = true)
 |-- favorite_count: long (nullable = true)
 |-- retweet_count: long (nullable = true)
 |-- followers_count: long (nullable = true)
 |-- friends_count: long (nullable = true)
 |-- favourites_count: long (nullable = true)
 |-- urls: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- display_url: string (nullable = true)
 |    |    |-- expanded_url: string (nullable = true)
 |    |    |-- indices: array (nullable = true)
 |    |    |    |-- element: long (containsNull = true)
 |    |    |-- url: string (nullable = true)


Features:
- number_tweets
- favorite_count
- retweet_count
- followers_count
- engagement_rate
- emoji_count
- num_words
- num_hashtags
- num_mentions
- num_exclamation_marks
- sentiment_vader
- polarity
- subjectivity

We will group the features per date, so we can merge the dataset with the dependent. For the first future we look at how many tweets are posted on one day, the second one looks at the total number of favorites for all tweets on a day, the third one does the same for the number of retweets , the fourth one does the same for the number of followers and the fifth one does the same for the engagement. For all the other features the average is used as group function. This is because the first five features are more related to the network an the amount of people that are being reached on a certain day. While the other feature are more related to the tweets and what's expressed in these tweets

In [190]:
number_of_tweets = df.groupBy('date').agg(F.countDistinct("full_text"))\
                    .withColumnRenamed("count(full_text)", "number_tweets") 
number_of_tweets.show()

+----------+-------------+
|      date|number_tweets|
+----------+-------------+
|2022-07-31|         3788|
|2022-03-28|         1952|
|2022-08-02|         8160|
|2022-06-22|         1060|
|2022-05-26|          434|
|2022-03-16|         1174|
|2022-07-07|         1050|
|2022-03-15|         1204|
|2022-03-12|          502|
|2022-08-20|          490|
|2022-08-11|          545|
|2022-05-01|          485|
|2022-01-09|          631|
|2022-01-15|          584|
|2022-04-27|         9026|
|2021-11-01|        20951|
|2022-09-07|         1013|
|2022-09-06|            4|
|2022-02-01|          684|
|2022-08-15|         1773|
+----------+-------------+
only showing top 20 rows



In [191]:
number_of_favorites = df.groupBy('date').agg(F.sum("favorite_count"))\
                        .withColumnRenamed("sum(favorite_count)", "number_of_favorites") 
number_of_favorites.show()

+----------+-------------------+
|      date|number_of_favorites|
+----------+-------------------+
|2022-07-31|              30457|
|2022-03-28|              18533|
|2022-08-02|              72537|
|2022-06-22|               9382|
|2022-05-26|               3965|
|2022-03-16|               8857|
|2022-07-07|               7177|
|2022-03-15|               8564|
|2022-03-12|               6269|
|2022-08-11|               1440|
|2022-08-20|               6142|
|2022-05-01|               9455|
|2022-01-09|               4773|
|2022-01-15|               4988|
|2022-04-27|              70083|
|2021-11-01|             159869|
|2022-09-06|                  5|
|2022-09-07|               6094|
|2022-02-01|               8620|
|2022-07-05|               7466|
+----------+-------------------+
only showing top 20 rows



In [192]:
number_of_retweets = df.groupBy('date').agg(F.sum("retweet_count"))\
                        .withColumnRenamed("sum(retweet_count)", "number_of_retweets") 
number_of_retweets.show()

+----------+------------------+
|      date|number_of_retweets|
+----------+------------------+
|2022-07-31|              3678|
|2022-03-28|              6313|
|2022-08-02|              8321|
|2022-06-22|              3070|
|2022-05-26|              1285|
|2022-03-16|              2973|
|2022-07-07|              2390|
|2022-03-15|              2426|
|2022-03-12|              2584|
|2022-08-11|               378|
|2022-08-20|              2001|
|2022-05-01|              1667|
|2022-01-09|              1454|
|2022-01-15|              1310|
|2022-04-27|             10022|
|2021-11-01|             26899|
|2022-09-06|                 1|
|2022-09-07|              1453|
|2022-02-01|              1224|
|2022-07-05|              2827|
+----------+------------------+
only showing top 20 rows



In [193]:
number_of_followers = df.groupBy('date').agg(F.sum("followers_count"))\
                        .withColumnRenamed("sum(followers_count)", "number_of_followers") 
number_of_followers.show()

+----------+-------------------+
|      date|number_of_followers|
+----------+-------------------+
|2022-07-31|           32478823|
|2022-03-28|           19476231|
|2022-08-02|           89026921|
|2022-06-22|            4919891|
|2022-05-26|            1658594|
|2022-03-16|            9743592|
|2022-07-07|            5866302|
|2022-03-15|            9292780|
|2022-03-12|            2403831|
|2022-08-11|            2114590|
|2022-08-20|           13507377|
|2022-05-01|            1792821|
|2022-01-09|            9529304|
|2022-01-15|            3845498|
|2022-04-27|           91430365|
|2021-11-01|          259167877|
|2022-09-06|               5837|
|2022-09-07|            6471303|
|2022-02-01|            4189796|
|2022-07-05|            4556302|
+----------+-------------------+
only showing top 20 rows



In [194]:
avg_emojis = df.groupBy('date').agg(F.avg("emoji_count"))\
                        .withColumnRenamed("avg(emoji_count)", "avg_emojis") 
avg_emojis.show()

+----------+-------------------+
|      date|         avg_emojis|
+----------+-------------------+
|2022-07-31|0.44051362683438156|
|2022-03-28| 0.8217213114754098|
|2022-08-02| 0.4500916310323763|
|2022-06-22| 0.6773584905660377|
|2022-05-26| 0.8387096774193549|
|2022-03-16| 0.7529812606473595|
|2022-07-07| 0.8533333333333334|
|2022-03-15| 0.8156146179401993|
|2022-03-12| 0.8784860557768924|
|2022-08-11| 0.7779816513761468|
|2022-08-20| 0.8306122448979592|
|2022-05-01| 0.8639175257731959|
|2022-01-09| 0.7448494453248812|
|2022-01-15|  1.018835616438356|
|2022-04-27| 0.4835639180962922|
|2021-11-01| 0.6724033393964833|
|2022-09-06|                0.0|
|2022-09-07| 0.8104639684106614|
|2022-02-01| 0.8771929824561403|
|2022-07-05| 0.6882168925964547|
+----------+-------------------+
only showing top 20 rows



In [195]:
avg_words = df.groupBy('date').agg(F.avg("num_words"))\
                        .withColumnRenamed("avg(num_words)", "avg_words") 
avg_words.show()

+----------+------------------+
|      date|         avg_words|
+----------+------------------+
|2022-07-31|24.518343815513628|
|2022-03-28| 24.45952868852459|
|2022-08-02|23.463042150274894|
|2022-06-22|  25.0311320754717|
|2022-05-26|25.179723502304146|
|2022-03-16|25.563032367972742|
|2022-07-07|             25.92|
|2022-03-15|24.631229235880397|
|2022-03-12| 26.40239043824701|
|2022-08-11| 26.34862385321101|
|2022-08-20|              25.3|
|2022-05-01| 23.51958762886598|
|2022-01-09|25.893819334389857|
|2022-01-15|26.241438356164384|
|2022-04-27|23.320088544548977|
|2021-11-01|24.398022480294763|
|2022-09-06|             22.25|
|2022-09-07|24.975320829220138|
|2022-02-01|25.263157894736842|
|2022-07-05| 25.08237747653806|
+----------+------------------+
only showing top 20 rows



In [196]:
avg_hashtags = df.groupBy('date').agg(F.avg("num_hashtags"))\
                        .withColumnRenamed("avg(num_hashtags)", "avg_hashtags") 
avg_hashtags.show()

+----------+------------------+
|      date|      avg_hashtags|
+----------+------------------+
|2022-07-31|1.2348008385744236|
|2022-03-28| 5.600409836065574|
|2022-08-02|1.0891875381795968|
|2022-06-22| 6.488679245283019|
|2022-05-26| 5.912442396313364|
|2022-03-16| 6.349233390119251|
|2022-07-07| 6.746666666666667|
|2022-03-15|  6.01578073089701|
|2022-03-12| 6.145418326693227|
|2022-08-11|  6.01651376146789|
|2022-08-20| 5.783673469387755|
|2022-05-01| 6.092783505154639|
|2022-01-09| 6.375594294770206|
|2022-01-15| 6.193493150684931|
|2022-04-27| 1.053901494189264|
|2021-11-01|1.3282496152231706|
|2022-09-06|              3.75|
|2022-09-07| 6.250740375123396|
|2022-02-01| 6.220760233918129|
|2022-07-05| 6.692387904066736|
+----------+------------------+
only showing top 20 rows



In [197]:
avg_num_mentions = df.groupBy('date').agg(F.avg("num_mentions"))\
                        .withColumnRenamed("avg(num_mentions)", "avg_num_mentions") 
avg_num_mentions.show()

+----------+-------------------+
|      date|   avg_num_mentions|
+----------+-------------------+
|2022-07-31|  0.995020964360587|
|2022-03-28|0.48360655737704916|
|2022-08-02| 0.8927306047648137|
|2022-06-22|0.45943396226415095|
|2022-05-26| 0.6820276497695853|
|2022-03-16|0.44122657580919933|
|2022-07-07| 0.4438095238095238|
|2022-03-15| 0.4642857142857143|
|2022-03-12| 0.5836653386454184|
|2022-08-11| 0.5394495412844037|
|2022-08-20|0.40408163265306124|
|2022-05-01|0.41649484536082476|
|2022-01-09| 0.5641838351822503|
|2022-01-15|  0.577054794520548|
|2022-04-27|  1.169120088544549|
|2021-11-01|  1.268364348677767|
|2022-09-06|               1.25|
|2022-09-07|  0.420533070088845|
|2022-02-01| 0.4605263157894737|
|2022-07-05|0.38686131386861317|
+----------+-------------------+
only showing top 20 rows



In [198]:
avg_exclamation_marks = df.groupBy('date').agg(F.avg("num_exclamation_marks"))\
                        .withColumnRenamed("avg(num_exclamation_marks)", "avg_exclamation_marks") 
avg_exclamation_marks.show()

+----------+---------------------+
|      date|avg_exclamation_marks|
+----------+---------------------+
|2022-07-31|  0.20335429769392033|
|2022-03-28|    0.367827868852459|
|2022-08-02|   0.2255345143555284|
|2022-06-22|   0.3849056603773585|
|2022-05-26|   0.4009216589861751|
|2022-03-16|   0.3475298126064736|
|2022-07-07|   0.4180952380952381|
|2022-03-15|   0.3795681063122924|
|2022-03-12|   0.3784860557768924|
|2022-08-11|   0.3779816513761468|
|2022-08-20|  0.29591836734693877|
|2022-05-01|  0.31752577319587627|
|2022-01-09|   0.3502377179080824|
|2022-01-15|   0.4246575342465753|
|2022-04-27|   0.2338682899833979|
|2021-11-01|   0.2843150972435987|
|2022-09-06|                 0.25|
|2022-09-07|   0.3889437314906219|
|2022-02-01|  0.41812865497076024|
|2022-07-05|  0.34515119916579773|
+----------+---------------------+
only showing top 20 rows



In [490]:
sum_engagement_rate = df.groupBy('date').agg(F.sum("engagement_rate"))\
                        .withColumnRenamed("sum(engagement_rate)", "sum_engagement_rate") 
sum_engagement_rate.show()

+----------+-------------------+
|      date|sum_engagement_rate|
+----------+-------------------+
|2022-03-28|  87.06650419514085|
|2022-08-02|  277.0856944553172|
|2022-06-22| 28.198783295628985|
|2022-05-26| 10.662179096184378|
|2022-03-16|  32.51330032521655|
|2022-07-07| 39.186089454593926|
|2022-03-15|  30.20753151903739|
|2022-08-11|   4.71883874340456|
|2022-04-27|  474.4810378905992|
|2021-11-01|  927.1408131470066|
|2022-09-06|  74.41003248934167|
|2022-09-07| 44.357923469565186|
|2022-08-15| 185.90433526333348|
|2022-02-01|  34.44340133515017|
|2022-07-05|  700.8401400910454|
|2022-06-02|  44.90900665243554|
|2022-01-18| 49.242409735958674|
|2021-12-14| 31.289705347890898|
|2022-08-03| 436.83936893284164|
|2021-12-27|   66.5395287645296|
+----------+-------------------+
only showing top 20 rows



In [200]:
avg_sentiment_vader = df.groupBy('date').agg(F.avg("sentiment_vader"))\
                        .withColumnRenamed("avg(sentiment_vader)", "sentiment_vader") 
avg_sentiment_vader.show()

+----------+-------------------+
|      date|    sentiment_vader|
+----------+-------------------+
|2022-07-31|0.13780817610062898|
|2022-03-28|0.16072489754098357|
|2022-08-02|0.13525448992058645|
|2022-06-22|0.17772735849056595|
|2022-05-26|0.15763133640552993|
|2022-03-16|0.15785689948892667|
|2022-07-07|0.16888666666666677|
|2022-03-15|0.15796511627906978|
|2022-03-12|0.15765139442231077|
|2022-08-11|0.15588073394495408|
|2022-08-20|0.15766734693877565|
|2022-05-01|0.17410515463917528|
|2022-01-09| 0.1649524564183834|
|2022-01-15| 0.1764417808219178|
|2022-04-27| 0.1384954067515219|
|2021-11-01|0.14668564899025227|
|2022-09-06|              0.156|
|2022-09-07|0.15967028627838115|
|2022-02-01| 0.1637543859649122|
|2022-07-05|0.15292179353493224|
+----------+-------------------+
only showing top 20 rows



In [202]:
avg_polarity = df.groupBy('date').agg(F.avg("polarity"))\
                        .withColumnRenamed("avg(polarity)", "avg_polarity") 
avg_polarity.show()

+----------+-------------------+
|      date|       avg_polarity|
+----------+-------------------+
|2022-07-31|0.10821469260733596|
|2022-03-28| 0.1884670041360661|
|2022-08-02|0.12154470211661908|
|2022-06-22|0.21354227349158275|
|2022-05-26| 0.2113358760871926|
|2022-03-16|0.16750349272398937|
|2022-07-07|0.21019660344197633|
|2022-03-15|0.19116037771071578|
|2022-03-12|0.17529737841312965|
|2022-08-11|0.18246015815527605|
|2022-08-20| 0.1783665045682031|
|2022-05-01| 0.1964158457848812|
|2022-01-09|0.20437169155938453|
|2022-01-15|0.21327515936783936|
|2022-04-27|0.12120248559707147|
|2021-11-01|0.14879245544541778|
|2022-09-06| 0.3366161616161616|
|2022-09-07|0.17173510747532394|
|2022-02-01|0.19933466293829522|
|2022-07-05|0.18611731228741135|
+----------+-------------------+
only showing top 20 rows



In [203]:
avg_subjectivity = df.groupBy('date').agg(F.avg("subjectivity"))\
                        .withColumnRenamed("avg(subjectivity)", "avg_subjectivity") 
avg_subjectivity.show()

+----------+-------------------+
|      date|   avg_subjectivity|
+----------+-------------------+
|2022-07-31|0.40911586530328414|
|2022-03-28| 0.4198464149429641|
|2022-08-02| 0.4137001408511641|
|2022-06-22|0.42817074511168196|
|2022-05-26|0.41634818733646123|
|2022-03-16| 0.4012224654591205|
|2022-07-07|0.43063931001151085|
|2022-03-15|0.42308740290588975|
|2022-03-12| 0.4224295324674166|
|2022-08-11| 0.4137568749451123|
|2022-08-20|0.40342674553039726|
|2022-05-01|0.42877414921303014|
|2022-01-09|0.43466282528429423|
|2022-01-15|  0.459322670527722|
|2022-04-27| 0.4149151988682309|
|2021-11-01| 0.4245268991661895|
|2022-09-06| 0.4527777777777777|
|2022-09-07|0.42109212841197036|
|2022-02-01| 0.4248360371811769|
|2022-07-05| 0.4149568439805666|
+----------+-------------------+
only showing top 20 rows



Join al the features

In [208]:
# create basetable
basetable_ind = number_of_tweets.join(number_of_favorites, "date", how="inner") \
                    .join(number_of_retweets, "date", how="inner") \
                    .join(number_of_followers, "date", how="inner") \
                    .join(avg_emojis, "date", how="inner") \
                    .join(avg_words, "date", how="inner") \
                    .join(avg_hashtags, "date", how="inner") \
                    .join(avg_num_mentions, "date", how="inner") \
                    .join(avg_exclamation_marks, "date", how="inner") \
                    .join(sum_engagement_rate, "date", how="inner") \
                    .join(avg_sentiment_vader, "date", how="inner") \
                    .join(avg_polarity, "date", how="inner") \
                    .join(avg_subjectivity, "date", how="inner") 
basetable_ind_pd = basetable_ind.toPandas()

In [210]:
basetable_ind_pd.head()

Unnamed: 0,date,number_tweets,number_of_favorites,number_of_retweets,number_of_followers,avg_emojis,avg_words,avg_hashtags,avg_num_mentions,avg_exclamation_marks,avg_engagement_rate,polarity,avg_polarity,avg_subjectivity
0,2022-03-28,1952,18533,6313,19476231,0.821721,24.459529,5.60041,0.483607,0.367828,0.027125,0.188467,0.188467,0.419846
1,2022-07-31,3788,30457,3678,32478823,0.440514,24.518344,1.234801,0.995021,0.203354,0.039405,0.108215,0.108215,0.409116
2,2022-08-02,8160,72537,8321,89026921,0.450092,23.463042,1.089188,0.892731,0.225535,0.03425,0.121545,0.121545,0.4137
3,2022-05-26,434,3965,1285,1658594,0.83871,25.179724,5.912442,0.682028,0.400922,0.024796,0.211336,0.211336,0.416348
4,2022-06-22,1060,9382,3070,4919891,0.677358,25.031132,6.488679,0.459434,0.384906,0.026959,0.213542,0.213542,0.428171


In [213]:
type(basetable_ind_pd['date'][0])

datetime.date

In [491]:
type(basetable_dep['date'][0])

pandas._libs.tslibs.timestamps.Timestamp

Cast the date of the ind basetable to the same type as the dependent basetable, to be able to merge the 2 tabels 

In [224]:
basetable_ind_pd['date'] = pd.to_datetime(basetable_ind_pd['date'])
type(basetable_ind_pd['date'][0])

pandas._libs.tslibs.timestamps.Timestamp

Inner merge the tables based on date

In [386]:
basetable = basetable_ind_pd.merge(basetable_dep, how='inner', on= 'date')

Sort the rows by date

In [388]:
basetable = basetable.sort_values(by= 'date').reset_index(drop=True)
basetable.head(10)

Unnamed: 0,date,number_tweets,number_of_favorites,number_of_retweets,number_of_followers,avg_emojis,avg_words,avg_hashtags,avg_num_mentions,avg_exclamation_marks,avg_engagement_rate,polarity,avg_polarity,avg_subjectivity,dependent
0,2021-10-29,5051,25854,4143,64744182,0.574555,22.847176,1.131913,1.070158,0.268321,0.024244,0.122752,0.122752,0.418258,0
1,2021-11-01,20951,159869,26899,259167877,0.672403,24.398022,1.32825,1.268364,0.284315,0.035796,0.148792,0.148792,0.424527,0
2,2021-11-02,14545,97592,13709,219563281,0.729405,24.434049,1.205981,1.280541,0.296865,0.020413,0.166109,0.166109,0.446198,0
3,2021-11-03,6839,27859,3310,99421032,0.640164,23.440514,1.071324,1.07907,0.24715,0.014389,0.130799,0.130799,0.413306,0
4,2021-12-08,6504,51875,7360,50409471,0.507839,22.838918,1.079926,0.893944,0.262527,0.038302,0.132347,0.132347,0.403964,0
5,2021-12-09,8261,74991,11566,151400994,0.505686,22.778853,1.10271,0.838979,0.242197,0.030169,0.124922,0.124922,0.403552,0
6,2021-12-10,8928,101689,12753,108593320,0.515789,23.013774,0.969093,0.95991,0.241433,0.045711,0.120858,0.120858,0.39748,0
7,2021-12-13,11987,75780,10441,145399078,0.500792,23.917521,1.079977,1.106413,0.242682,0.022381,0.121128,0.121128,0.410183,0
8,2021-12-14,1090,7012,2235,9008986,0.877064,24.938532,5.782569,0.477064,0.407339,0.02908,0.210724,0.210724,0.423161,0
9,2021-12-15,1171,6329,2065,12445838,0.756618,25.267293,6.204099,0.549103,0.373185,0.029278,0.195109,0.195109,0.447951,1


Drop the date column becuase this is not a feature

In [389]:
basetable.drop('date', axis=1, inplace=True)
basetable.head()

Unnamed: 0,number_tweets,number_of_favorites,number_of_retweets,number_of_followers,avg_emojis,avg_words,avg_hashtags,avg_num_mentions,avg_exclamation_marks,avg_engagement_rate,polarity,avg_polarity,avg_subjectivity,dependent
0,5051,25854,4143,64744182,0.574555,22.847176,1.131913,1.070158,0.268321,0.024244,0.122752,0.122752,0.418258,0
1,20951,159869,26899,259167877,0.672403,24.398022,1.32825,1.268364,0.284315,0.035796,0.148792,0.148792,0.424527,0
2,14545,97592,13709,219563281,0.729405,24.434049,1.205981,1.280541,0.296865,0.020413,0.166109,0.166109,0.446198,0
3,6839,27859,3310,99421032,0.640164,23.440514,1.071324,1.07907,0.24715,0.014389,0.130799,0.130799,0.413306,0
4,6504,51875,7360,50409471,0.507839,22.838918,1.079926,0.893944,0.262527,0.038302,0.132347,0.132347,0.403964,0


In [394]:
# store basetable as a .parquet file
basetable.to_parquet("./../../data/basetableBinary.parquet")

In [390]:
# export basetable as a .json file
basetable.to_json("./../../data/basetableBinary.json", orient="records", force_ascii=False, lines=True)

## 5.4 split train and test set 

In [391]:
# import pyspark ml packages
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler

In [395]:
# read in the saved basetable (.parquet)
basetable_df = spark.read.parquet("./../../data/basetableBinary.parquet")

In [396]:
# read in the saved basetable (.json)
basetable_df = spark.read.json("./../../data/basetableBinary.json")

In [398]:
basetable_df.toPandas().head()

Unnamed: 0,avg_emojis,avg_engagement_rate,avg_exclamation_marks,avg_hashtags,avg_num_mentions,avg_polarity,avg_subjectivity,avg_words,dependent,number_of_favorites,number_of_followers,number_of_retweets,number_tweets,polarity
0,0.574555,0.024244,0.268321,1.131913,1.070158,0.122752,0.418258,22.847176,0,25854,64744182,4143,5051,0.122752
1,0.672403,0.035796,0.284315,1.32825,1.268364,0.148792,0.424527,24.398022,0,159869,259167877,26899,20951,0.148792
2,0.729405,0.020413,0.296865,1.205981,1.280541,0.166109,0.446198,24.434049,0,97592,219563281,13709,14545,0.166109
3,0.640164,0.014389,0.24715,1.071324,1.07907,0.130799,0.413306,23.440514,0,27859,99421032,3310,6839,0.130799
4,0.507839,0.038302,0.262527,1.079926,0.893944,0.132347,0.403964,22.838918,0,51875,50409471,7360,6504,0.132347


In [399]:
# initialize stages of preprocessing pipeline
stagesPreprocessingPipeline = []

In [400]:
# define all the independent features
featureCols = [col for col in basetable_df.columns if col not in ["dependent"]]

In [401]:
# define vector assembler
assembler = VectorAssembler(inputCols = featureCols, outputCol = "features")

# add to pipeline stages
stagesPreprocessingPipeline.append(assembler)

In [402]:
# define the pipeline model and fit on data
pipelinePreprocessing = Pipeline().setStages(stagesPreprocessingPipeline).fit(basetable_df)

# transform data by applying pipeline on data
preparedData = pipelinePreprocessing.transform(basetable_df)

In [403]:
# create final basetable
final_basetable = preparedData.select(["dependent", "features"])

In [404]:
final_basetable.toPandas().head(20)

Unnamed: 0,dependent,features
0,0,"[0.5745554036, 0.0242435491, 0.268321282, 1.1319132304, 1.0701582959, 0.1227518592, 0.4182581273, 22.8471760797, 25854.0, 64744182.0, 4143.0, 5051.0, 0.1227518592]"
1,0,"[0.6724033394, 0.0357964676, 0.2843150972, 1.3282496152, 1.2683643487, 0.1487924554, 0.4245268992, 24.3980224803, 159869.0, 259167877.0, 26899.0, 20951.0, 0.1487924554]"
2,0,"[0.729405309, 0.0204128162, 0.2968653543, 1.2059812058, 1.2805405035, 0.1661089637, 0.4461981659, 24.4340489746, 97592.0, 219563281.0, 13709.0, 14545.0, 0.1661089637]"
3,0,"[0.6401636948, 0.0143888502, 0.2471499562, 1.0713241742, 1.0790704472, 0.1307991584, 0.4133064143, 23.4405144695, 27859.0, 99421032.0, 3310.0, 6839.0, 0.1307991584]"
4,0,"[0.5078389179, 0.038301671, 0.2625268982, 1.0799262219, 0.8939440516, 0.1323473782, 0.4039641874, 22.8389179219, 51875.0, 50409471.0, 7360.0, 6504.0, 0.1323473782]"
5,0,"[0.5056859424, 0.0301692101, 0.2421969514, 1.102709896, 0.8389789499, 0.1249220867, 0.4035524285, 22.7788531333, 74991.0, 151400994.0, 11566.0, 8261.0, 0.1249220867]"
6,0,"[0.5157894737, 0.0457112219, 0.2414333707, 0.9690929451, 0.9599104143, 0.1208582149, 0.3974802908, 23.0137737962, 101689.0, 108593320.0, 12753.0, 8928.0, 0.1208582149]"
7,0,"[0.5007922609, 0.0223809361, 0.2426820115, 1.0799766492, 1.1064131432, 0.1211284944, 0.410183019, 23.9175214744, 75780.0, 145399078.0, 10441.0, 11987.0, 0.1211284944]"
8,0,"[0.8770642202, 0.0290796518, 0.4073394495, 5.7825688073, 0.4770642202, 0.2107243447, 0.4231609832, 24.9385321101, 7012.0, 9008986.0, 2235.0, 1090.0, 0.2107243447]"
9,1,"[0.756618275, 0.0292775224, 0.3731853117, 6.2040990606, 0.5491033305, 0.1951091555, 0.4479512406, 25.267292912, 6329.0, 12445838.0, 2065.0, 1171.0, 0.1951091555]"


Look at the total number of observations in the basetable 

In [492]:
final_basetable.count()

140

We cannot use the randomsplit function, because we have time series data, so we have to use another approach

First we look at the amount of observations that will be assigned to the training set 

In [498]:
nr_train = int(final_basetable.count()*0.7)
nr_train

98

convert the final basetable to a pandas dataset 

In [496]:
final_basetable_pd = final_basetable.toPandas()
final_basetable_pd.head()

Unnamed: 0,dependent,features
0,0,"[0.5745554036, 0.0242435491, 0.268321282, 1.1319132304, 1.0701582959, 0.1227518592, 0.4182581273, 22.8471760797, 25854.0, 64744182.0, 4143.0, 5051.0, 0.1227518592]"
1,0,"[0.6724033394, 0.0357964676, 0.2843150972, 1.3282496152, 1.2683643487, 0.1487924554, 0.4245268992, 24.3980224803, 159869.0, 259167877.0, 26899.0, 20951.0, 0.1487924554]"
2,0,"[0.729405309, 0.0204128162, 0.2968653543, 1.2059812058, 1.2805405035, 0.1661089637, 0.4461981659, 24.4340489746, 97592.0, 219563281.0, 13709.0, 14545.0, 0.1661089637]"
3,0,"[0.6401636948, 0.0143888502, 0.2471499562, 1.0713241742, 1.0790704472, 0.1307991584, 0.4133064143, 23.4405144695, 27859.0, 99421032.0, 3310.0, 6839.0, 0.1307991584]"
4,0,"[0.5078389179, 0.038301671, 0.2625268982, 1.0799262219, 0.8939440516, 0.1323473782, 0.4039641874, 22.8389179219, 51875.0, 50409471.0, 7360.0, 6504.0, 0.1323473782]"


Split the dataframe into train and test 

In [499]:
train_pd = final_basetable_pd.iloc[:nr_train,:]
test_pd = final_basetable_pd.iloc[nr_train:,:]

In [500]:
train_pd.head()

Unnamed: 0,dependent,features
0,0,"[0.5745554036, 0.0242435491, 0.268321282, 1.1319132304, 1.0701582959, 0.1227518592, 0.4182581273, 22.8471760797, 25854.0, 64744182.0, 4143.0, 5051.0, 0.1227518592]"
1,0,"[0.6724033394, 0.0357964676, 0.2843150972, 1.3282496152, 1.2683643487, 0.1487924554, 0.4245268992, 24.3980224803, 159869.0, 259167877.0, 26899.0, 20951.0, 0.1487924554]"
2,0,"[0.729405309, 0.0204128162, 0.2968653543, 1.2059812058, 1.2805405035, 0.1661089637, 0.4461981659, 24.4340489746, 97592.0, 219563281.0, 13709.0, 14545.0, 0.1661089637]"
3,0,"[0.6401636948, 0.0143888502, 0.2471499562, 1.0713241742, 1.0790704472, 0.1307991584, 0.4133064143, 23.4405144695, 27859.0, 99421032.0, 3310.0, 6839.0, 0.1307991584]"
4,0,"[0.5078389179, 0.038301671, 0.2625268982, 1.0799262219, 0.8939440516, 0.1323473782, 0.4039641874, 22.8389179219, 51875.0, 50409471.0, 7360.0, 6504.0, 0.1323473782]"


Convert the pandas dataframe back to a spark dataframe

In [501]:
train = spark.createDataFrame(train_pd)
train.show()

+---------+--------------------+
|dependent|            features|
+---------+--------------------+
|        0|[0.5745554036,0.0...|
|        0|[0.6724033394,0.0...|
|        0|[0.729405309,0.02...|
|        0|[0.6401636948,0.0...|
|        0|[0.5078389179,0.0...|
|        0|[0.5056859424,0.0...|
|        0|[0.5157894737,0.0...|
|        0|[0.5007922609,0.0...|
|        0|[0.8770642202,0.0...|
|        1|[0.756618275,0.02...|
|        0|[0.7603174603,0.0...|
|        1|[0.8245614035,0.0...|
|        0|[0.8223255814,0.0...|
|        0|[0.6553432413,0.0...|
|        0|[0.7981481481,0.0...|
|        0|[0.6796036334,0.0...|
|        0|[0.8171232877,0.0...|
|        0|[0.7317073171,0.0...|
|        1|[0.7147719537,0.0...|
|        0|[0.7939142462,0.0...|
+---------+--------------------+
only showing top 20 rows



In [411]:
test = spark.createDataFrame(test_pd)
test.show()

+---------+--------------------+
|dependent|            features|
+---------+--------------------+
|        0|[0.7341568206,0.0...|
|        0|[0.8484848485,0.0...|
|        0|[0.7815699659,0.0...|
|        0|[0.8095238095,0.0...|
|        1|[0.7463697967,0.0...|
|        0|[0.797188755,0.02...|
|        1|[0.7693050193,0.0...|
|        0|[0.6455331412,0.0...|
|        0|[0.791011236,0.02...|
|        0|[0.4495638984,0.0...|
|        0|[0.450091631,0.03...|
|        1|[0.4618828797,0.0...|
|        0|[0.470718232,0.03...|
|        0|[0.4377774959,0.0...|
|        0|[0.5906882591,0.0...|
|        0|[0.6623634558,0.0...|
|        1|[0.7664921466,0.0...|
|        0|[0.7779816514,0.0...|
|        1|[0.72,0.016098006...|
|        0|[0.858028169,0.07...|
+---------+--------------------+
only showing top 20 rows



In [412]:
# get number in observations in each set
print("Number of observations train: %s" %train.count())
print("Number of observations test: %s" %test.count())

Number of observations train: 98
Number of observations test: 42


In [413]:
# get distribution of dependent variable within each set
train.groupBy("dependent").count().show()
test.groupBy("dependent").count().show()

+---------+-----+
|dependent|count|
+---------+-----+
|        0|   72|
|        1|   26|
+---------+-----+

+---------+-----+
|dependent|count|
+---------+-----+
|        0|   35|
|        1|    7|
+---------+-----+



# 6. Modelling

In [414]:
# import models
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, GBTClassifier, RandomForestClassifier

In [415]:
# define the standardizer
standardizer = StandardScaler(inputCol="features", outputCol="featuresNorm")

In [416]:
# define indexer (IDX)
IDX = StringIndexer(inputCol="dependent", outputCol="label")

In [417]:
# define logistic regression model
LR = LogisticRegression(featuresCol = "featuresNorm", labelCol = "label")

# fit model on train set
LR_model = Pipeline().setStages([standardizer, IDX, LR]).fit(train)

In [418]:
# define decision tree model
DT = DecisionTreeClassifier(featuresCol = "featuresNorm", labelCol = "label")

# fit model on train set
DT_model = Pipeline().setStages([standardizer, IDX, DT]).fit(train)

In [420]:
# define decision tree model
GBT = GBTClassifier(featuresCol = "featuresNorm", labelCol = "label")

# fit model on train set
GBT_model = Pipeline().setStages([standardizer, IDX, GBT]).fit(train)

In [428]:
# define random forest model
RF = RandomForestClassifier(featuresCol = "featuresNorm", labelCol = "label")

# fit model on train set
RF_model = Pipeline().setStages([standardizer, IDX, RF]).fit(train)

# 7. Model Evaluation

In [429]:
# get predictions on test set
LR_preds_BN = LR_model.transform(test)
DT_preds_BN = DT_model.transform(test)
GBT_preds_BN = GBT_model.transform(test)
RF_preds_BN = RF_model.transform(test)

In [422]:
# import packages
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [423]:
# define evaluator (for AUC)
evaluator_auc = BinaryClassificationEvaluator()

In [424]:
# get metrics for LR model
lr_f1_BN = evaluator_mc.evaluate(LR_preds_BN, {evaluator_mc.metricName: "f1"})
lr_accuracy_BN = evaluator_mc.evaluate(LR_preds_BN, {evaluator_mc.metricName: "accuracy"})
lr_recall_BN = evaluator_mc.evaluate(LR_preds_BN, {evaluator_mc.metricName: "recallByLabel"})
lr_auc_BN = evaluator_auc.evaluate(LR_preds_BN, {evaluator_auc.metricName: 'areaUnderROC'})

In [425]:
# get metrics for DT model
dt_f1_BN = evaluator_mc.evaluate(DT_preds_BN, {evaluator_mc.metricName: "f1"})
dt_accuracy_BN = evaluator_mc.evaluate(DT_preds_BN, {evaluator_mc.metricName: "accuracy"})
dt_recall_BN = evaluator_mc.evaluate(DT_preds_BN, {evaluator_mc.metricName: "recallByLabel"})
dt_auc_BN = evaluator_auc.evaluate(DT_preds_BN, {evaluator_auc.metricName: 'areaUnderROC'})

In [426]:
# get metrics for GBT model
gbt_f1_BN = evaluator_mc.evaluate(GBT_preds_BN, {evaluator_mc.metricName: "f1"})
gbt_accuracy_BN = evaluator_mc.evaluate(GBT_preds_BN, {evaluator_mc.metricName: "accuracy"})
gbt_recall_BN = evaluator_mc.evaluate(GBT_preds_BN, {evaluator_mc.metricName: "recallByLabel"})
gbt_auc_BN = evaluator_auc.evaluate(GBT_preds_BN, {evaluator_auc.metricName: 'areaUnderROC'})

In [430]:
# get metrics for RF model
rf_f1_BN = evaluator_mc.evaluate(RF_preds_BN, {evaluator_mc.metricName: "f1"})
rf_accuracy_BN = evaluator_mc.evaluate(RF_preds_BN, {evaluator_mc.metricName: "accuracy"})
rf_recall_BN = evaluator_mc.evaluate(RF_preds_BN, {evaluator_mc.metricName: "recallByLabel"})
rf_auc_BN = evaluator_auc.evaluate(RF_preds_BN, {evaluator_auc.metricName: 'areaUnderROC'})

In [431]:
# check which of both algorithms is the best:
print("LOGISTIC REGRESSION:")
print('  F1       : %g' % lr_f1_BN)
print('  ACCURACY : %g' % lr_accuracy_BN)
print('  RECALL   : %g' % lr_recall_BN)
print('  AUC      : %g' % lr_auc_BN)
print("------------------")
print("SINGLE DECISION TREE:")
print('  F1       : %g' % dt_f1_BN)
print('  ACCURACY : %g' % dt_accuracy_BN)
print('  RECALL   : %g' % dt_recall_BN)
print('  AUC      : %g' % dt_auc_BN)
print("------------------")
print("GRADIENT-BOOSTED TREES:")
print('  F1       : %g' % gbt_f1_BN)
print('  ACCURACY : %g' % gbt_accuracy_BN)
print('  RECALL   : %g' % gbt_recall_BN)
print('  AUC      : %g' % gbt_auc_BN)
print("------------------")
print("RANDOM FOREST:")
print('  F1       : %g' % rf_f1_BN)
print('  ACCURACY : %g' % rf_accuracy_BN)
print('  RECALL   : %g' % rf_recall_BN)
print('  AUC      : %g' % rf_auc_BN)

LOGISTIC REGRESSION:
  F1       : 0.776577
  ACCURACY : 0.809524
  RECALL   : 0.942857
  AUC      : 0.571429
------------------
SINGLE DECISION TREE:
  F1       : 0.710857
  ACCURACY : 0.690476
  RECALL   : 0.771429
  AUC      : 0.628571
------------------
GRADIENT-BOOSTED TREES:
  F1       : 0.686386
  ACCURACY : 0.642857
  RECALL   : 0.657143
  AUC      : 0.622449
------------------
RANDOM FOREST:
  F1       : 0.714286
  ACCURACY : 0.714286
  RECALL   : 0.828571
  AUC      : 0.710204
