In [1]:
# import findspark
import findspark
# initialize findspark with spark directory
findspark.init("C:\Program Files\Spark\spark-3.3.1-bin-hadoop3")
#findspark.init("/Users/wouterdewitte/spark/")
# import pyspark
import pyspark
# create spark context
sc = pyspark.SparkContext()
# create spark session 
spark = pyspark.sql.SparkSession(sc)

In [2]:
# import packages
import os 
import pickle
import re
from datetime import datetime
import requests
import pytz
import emojis
import pandas as pd
import numpy as np
import ast
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.functions import array_contains
import matplotlib.pyplot as plt 

## General

In this notebook we will buid a model that predicts if the trend of a certain topic goes up or down on a certain day based on Twitter data of that day.

## 1. Import Data

### 1.1 Google Trends

In [3]:
# read trend data 
trend = spark.read.csv(".././../data/Google_trends/daily_trends.csv", header=True, inferSchema=True, sep=';')

In [4]:
trend

DataFrame[date: timestamp, dependent_vegan: int]

In [5]:
from pyspark.sql.window import Window

w = Window().partitionBy().orderBy(col("date"))
trend.withColumn("dependent_vegan", lag("dependent_vegan", -1, 0).over(w)).show()

+-------------------+---------------+
|               date|dependent_vegan|
+-------------------+---------------+
|2021-10-04 00:00:00|              1|
|2021-10-05 00:00:00|              1|
|2021-10-06 00:00:00|              1|
|2021-10-07 00:00:00|              1|
|2021-10-08 00:00:00|              1|
|2021-10-09 00:00:00|              0|
|2021-10-10 00:00:00|              0|
|2021-10-11 00:00:00|              0|
|2021-10-12 00:00:00|              0|
|2021-10-13 00:00:00|              0|
|2021-10-14 00:00:00|              0|
|2021-10-15 00:00:00|              1|
|2021-10-16 00:00:00|              1|
|2021-10-17 00:00:00|              0|
|2021-10-18 00:00:00|              1|
|2021-10-19 00:00:00|              0|
|2021-10-20 00:00:00|              0|
|2021-10-21 00:00:00|              1|
|2021-10-22 00:00:00|              1|
|2021-10-23 00:00:00|              1|
+-------------------+---------------+
only showing top 20 rows



In [6]:
# create SQL view
trend.createOrReplaceTempView("trendSQL")

The binary variable indicates if the trend goes up or down.

### 1.2 Twitter

In [7]:
# define data dir
data_dir = "../../data/Topic/"

# get all twitter files
tweet_files = [os.path.join(data_dir, obs) for obs in os.listdir(data_dir)] 

In [8]:
# import twitter data 
#twitter_df = spark.read.json(tweet_files)

In [9]:
list_hashtags = ["vegan"]

data_dir = ".././../data/Topic/"
tweet_files = [os.path.join(data_dir, obs) for obs in os.listdir(data_dir)]
files_hashtags = [file for file in tweet_files if (file.find(list_hashtags[0]) != -1)]             
twitter_df = spark.read.option("multiline","true").json(files_hashtags) 
twitter_df.count()

1827680

In [10]:
# select interesting features
twitter_df = twitter_df.select(F.col('user.name'),
                                F.col('user.screen_name'),
                                F.col('user.followers_count'),
                                F.col('user.following'),
                                F.col('user.statuses_count'),
                                F.col('user.listed_count'),
                                F.col('created_at'),
                                F.col('full_text'),
                                F.col('entities.hashtags'),
                                F.col('favorite_count'),
                                F.col('retweet_count'),
                                F.col('user.friends_count'))

## 2. Data Preprocessing

### 2.1 Check time period

In [11]:
# function to convert Twitter date string format
def getDate(date):
    if date is not None:
        return str(datetime.strptime(date,'%a %b %d %H:%M:%S +0000 %Y').replace(tzinfo=pytz.UTC).strftime("%Y-%m-%d %H:%M:%S"))
    else:
        return None

# UDF declaration
date_udf = F.udf(getDate, StringType())

# apply udf
twitter_df = twitter_df.withColumn('post_created_at', F.to_utc_timestamp(date_udf("created_at"), "UTC"))

In [12]:
# get first post
first_post = F.min('post_created_at').alias('earliest')
# get latest post
latest_post = F.max('post_created_at').alias('latest')
# show tweet period in our dataset
twitter_df.select(first_post, latest_post).show()

+-------------------+-------------------+
|           earliest|             latest|
+-------------------+-------------------+
|2021-10-25 07:19:40|2022-10-11 23:17:33|
+-------------------+-------------------+



### 2.2 Remove retweets and duplicates

In [13]:
# drop all retweets from dataset
no_retweets_df = twitter_df.filter(~F.col("full_text").startswith("RT"))

In [14]:
# first sort no_retweets_df based on date in chronological order (most recent ones on top)
no_retweets_sorted_df = no_retweets_df.sort("post_created_at", ascending=False)

In [15]:
# number of observations before dropping duplicates
no_retweets_sorted_df.count()

745916

In [16]:
# drop duplicates based on tweet text and the profile it was posted from
final_no_duplicates_df = no_retweets_sorted_df.drop_duplicates(["full_text", "screen_name"])

In [17]:
# number of observations after dropping duplicates
final_no_duplicates_df.count()

693932

In [18]:
# rename dataframe
final_twitter_df = final_no_duplicates_df

## 3. Independent Variables

For our independent variables we need to design a pipeline that transforms the data into the desired aggregated metrics per day.

### 3.0 Feature Engineering

#### Define Functions

In [19]:
# define function to count hashtags
def get_hashtags(tokenized_text):
    counter = 0
    for word in tokenized_text:
        if "#" in word:
            counter += 1
    return(counter)

In [20]:
# define function to count mentions
def get_mentions(tokenized_text):
    counter = 0
    for word in tokenized_text:
        if "@" in word:
            counter += 1
    return(counter)

In [21]:
# define function to count exclamation marks
def get_exclamation_marks(tokenized_text):
    counter = 0
    for word in tokenized_text:
        if "!" in word:
            counter += 1
    return(counter)

In [22]:
# define function to count number of emojis used
def emoji_counter(text):
    nr_emojis = emojis.count(text)
    return(nr_emojis)

In [23]:
# define function to calculate engagement rate
def engagement_rate(favorite_count, retweet_count, followers_count):
    if(followers_count == 0):
        eng_rate = 0
    else:
        eng_rate = (favorite_count + retweet_count)/followers_count
    
    return eng_rate

In [24]:
# register functions as udf
get_hashtags_UDF = F.udf(get_hashtags, IntegerType())
get_mentions_UDF = F.udf(get_mentions, IntegerType())
get_exclamation_marks_UDF = F.udf(get_exclamation_marks, IntegerType())
emoji_counter_UDF = F.udf(emoji_counter, IntegerType())
engagement_rate_UDF = F.udf(engagement_rate, DoubleType())

In [25]:
# apply functions to create new features
final_twitter_df = final_twitter_df.withColumn("emoji_count", emoji_counter_UDF("full_text")) \
        .withColumn("text_tokenized", F.split("full_text", " ")) \
        .withColumn("num_words", F.size("text_tokenized")) \
        .withColumn("num_hashtags", get_hashtags_UDF("text_tokenized")) \
        .withColumn("num_mentions", get_mentions_UDF("text_tokenized")) \
        .withColumn("num_exclamation_marks", get_exclamation_marks_UDF("text_tokenized")) \
        .withColumn("engagement_rate", engagement_rate_UDF("favorite_count", "retweet_count", "followers_count"))

#### Text Cleaning

In [26]:
# filter for english tweets (NOTE: for the assignment you can translate non-english tweets using an API)
final_twitter_df = final_twitter_df.filter(F.col("lang") == "en")

In [27]:
# check number of observations
final_twitter_df.count()

469429

In [28]:
# define function to clean text
def clean_text(string):
    
    # define numbers
    NUMBERS = '0123456789'
    PUNCT = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    
    # convert text to lower case
    cleaned_string = string.lower()
    
    # remove URLS
    cleaned_string = re.sub(r'http\S+', ' ', cleaned_string)
    
    # replace emojis by words
    cleaned_string = emojis.decode(cleaned_string)
    cleaned_string = cleaned_string.replace(":"," ").replace("_"," ")
    cleaned_string = ' '.join(cleaned_string.split())
    
    # remove numbers
    cleaned_string = "".join([char for char in cleaned_string if char not in NUMBERS])
    
    # remove punctuation
    cleaned_string = "".join([char for char in cleaned_string if char not in PUNCT])
    
    # remove words conisting of one character (or less)
    cleaned_string = ' '.join([w for w in cleaned_string.split() if len(w) > 1])
    
    # return
    return(cleaned_string)

In [29]:
# convert to udf
clean_text_udf = F.udf(clean_text, StringType())

In [30]:
# clean string
final_twitter_df = final_twitter_df.withColumn("cleaned_text", clean_text_udf(F.col("full_text")))

In [31]:
# check
pd.set_option('display.max_colwidth', None)
final_twitter_df.select("full_text", "cleaned_text").limit(5).toPandas()

Unnamed: 0,full_text,cleaned_text
0,! We will be open 10am-9pm today and tomorrow!! Much love you all!!\n\n#veganuary #veganuary2022 #vegan #veganfood #veganlife #vkind #vegas #lasvegas #vegoutvegas #plantbased #veganfood #food #crueltyfree #healthy #organic #veganmarket #foodie #govegan #vegansofig #love #veganism,we will be open ampm today and tomorrow much love you all veganuary veganuary vegan veganfood veganlife vkind vegas lasvegas vegoutvegas plantbased veganfood food crueltyfree healthy organic veganmarket foodie govegan vegansofig love veganism
1,!! Daily Updates !!\n\nA taste of Vegan Food today on #TheMorningWave with @sam_lehoko and @Lauri_Leah on #ChefsTable joined by the team from Kaylee's Eatery.\n\n#MyMusicMyMix #Music #Food #Vegan #Green #Wednesday #Radio https://t.co/UUNQwkDWxt,daily updates taste of vegan food today on themorningwave with sam lehoko and lauri leah on chefstable joined by the team from kaylees eatery mymusicmymix music food vegan green wednesday radio
2,!love !iq !waddup Building time on the Sketch SMP! Emote Art Raffle at 5 Subs! https://t.co/aKZoWy32dW @CozyIslandLIVE @Streambeanzttv #twitch #TwitchStreamers #twitchstreamer #twitchtv #twitchaffiliate #Livestream #live #LGBTQ #LGBTQIA #Vegan #Minecraft #minecraftsmp #cozy #lgbt,love iq waddup building time on the sketch smp emote art raffle at subs cozyislandlive streambeanzttv twitch twitchstreamers twitchstreamer twitchtv twitchaffiliate livestream live lgbtq lgbtqia vegan minecraft minecraftsmp cozy lgbt
3,""" DID YOU KNOW ?""\n\n#senoritacosmetics #senoritacosmetics_ #skincareroutine #veganskincare #vegan #skincare #skincaretips #gogreen #govegan #facts #funfact #funfactory #funfactsoftheday https://t.co/eodzSAXqH8",did you know senoritacosmetics senoritacosmetics skincareroutine veganskincare vegan skincare skincaretips gogreen govegan facts funfact funfactory funfactsoftheday
4,""" Nature's Interconnection "" \n\nCommission for @Stefanostattoos\n\nTattoo on @apound_of_flesh\n\n#inari #inariokami #fox #foxes #redhead #mädchen #vegan #veganism #woman #women #nature #peru #peruvian #tatuador #tatuaje #tatuajes #тату #plants #plantbased #dorsettstyle #веган https://t.co/2G7IiLiWWW",natures interconnection commission for stefanostattoos tattoo on apound of flesh inari inariokami fox foxes redhead mädchen vegan veganism woman women nature peru peruvian tatuador tatuaje tatuajes тату plants plantbased dorsettstyle веган


#### Sentiment

VADER sentimental analysis relies on a dictionary that maps lexical features to emotion intensities known as sentiment scores. The sentiment score of a text can be obtained by summing up the intensity of each word in the text.

In [33]:
#using the vaderSentiment package 
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from pyspark.sql.functions import udf
from pyspark.sql.types import *

In [34]:
# define the function to extract the sentiment
def get_sentiment(sentence):

    # initialize sentiment analyzer
    sid_obj = SentimentIntensityAnalyzer()

    # get sentiment dict
    sentiment_dict = sid_obj.polarity_scores(sentence)
    
    # get positive sentiment score
    pos_sentiment = sentiment_dict["pos"]
    
    # return positive sentiment score
    return(pos_sentiment)

get_sentiment_udf = udf(get_sentiment, DoubleType())

In [35]:
final_twitter_df = final_twitter_df.withColumn("sentiment_vader", get_sentiment_udf(F.col("cleaned_text")))

In [36]:
final_twitter_df.select("cleaned_text", "sentiment_vader").show()

+--------------------+---------------+
|        cleaned_text|sentiment_vader|
+--------------------+---------------+
|we will be open a...|           0.27|
|daily updates tas...|            0.0|
|love iq waddup bu...|          0.127|
|did you know seno...|            0.0|
|natures interconn...|            0.0|
|scarlet punica gr...|            0.0|
|dairy milk no lon...|            0.0|
|greenwashing is r...|            0.0|
|greenwashing is r...|            0.0|
|newyorkcity schoo...|          0.128|
|vegan food is dis...|            0.0|
|vegan food’s abil...|          0.064|
|my bad cholestero...|          0.054|
|vegan festival ho...|          0.132|
|dont be too hard ...|          0.396|
|steve brexit stev...|          0.077|
|it becomes questi...|          0.211|
|or vegan ones bra...|            0.0|
|vita russia anima...|          0.068|
|of surveyed swine...|            0.0|
+--------------------+---------------+
only showing top 20 rows



#### TextBlob

TextBlob returns polarity and subjectivity of a sentence. 

**Polarity** lies between [-1,1],  -1 defines a negative sentiment and 1 defines a positive sentiment.  

**Subjectivity** quantifies the amount of personal opinion and factual information contained in the text. Subjectivity lies between [0,1]. The higher subjectivity means that the text contains personal opinion rather than factual information. 

In [38]:
#use polarity and subjectivity from TextBlob 
#https://textblob.readthedocs.io/en/dev/
from textblob import TextBlob

# define function to get polarity score of text document
def get_polarity(row):
    textBlob_review = TextBlob(row)
    return textBlob_review.sentiment[0]
# define function to get subjectivity score of text document
def get_subjectivity(row):
    textBlob_review = TextBlob(row)
    return textBlob_review.sentiment[1]
get_polarity_udf = F.udf(get_polarity, DoubleType())
get_subjectivity_udf = F.udf(get_subjectivity, DoubleType())

final_twitter_df = final_twitter_df.withColumn('polarity', get_polarity_udf(F.col('cleaned_text')))\
        .withColumn('subjectivity', get_subjectivity_udf(F.col('cleaned_text')))

final_twitter_df.select("cleaned_text", "sentiment_vader", "polarity", "subjectivity").show()

+--------------------+---------------+--------------------+-------------------+
|        cleaned_text|sentiment_vader|            polarity|       subjectivity|
+--------------------+---------------+--------------------+-------------------+
|we will be open a...|           0.27|               0.375|               0.55|
|daily updates tas...|            0.0|                -0.1|               0.15|
|love iq waddup bu...|          0.127| 0.14545454545454548| 0.6166666666666667|
|did you know seno...|            0.0|                 0.0|                0.0|
|natures interconn...|            0.0|                 0.0|                0.0|
|scarlet punica gr...|            0.0|                 0.0|                0.0|
|dairy milk no lon...|            0.0|                 0.0|                0.0|
|greenwashing is r...|            0.0|                 0.0|                0.0|
|greenwashing is r...|            0.0|                 0.0|                0.0|
|newyorkcity schoo...|          0.128| 0

In [39]:
final_twitter_df = final_twitter_df.withColumn('polarity', get_polarity_udf(F.col('cleaned_text')))\
        .withColumn('subjectivity', get_subjectivity_udf(F.col('cleaned_text')))

In [40]:
final_twitter_df.select("cleaned_text", "sentiment_vader", "polarity", "subjectivity").show()

+--------------------+---------------+--------------------+-------------------+
|        cleaned_text|sentiment_vader|            polarity|       subjectivity|
+--------------------+---------------+--------------------+-------------------+
|we will be open a...|           0.27|               0.375|               0.55|
|daily updates tas...|            0.0|                -0.1|               0.15|
|love iq waddup bu...|          0.127| 0.14545454545454548| 0.6166666666666667|
|did you know seno...|            0.0|                 0.0|                0.0|
|natures interconn...|            0.0|                 0.0|                0.0|
|scarlet punica gr...|            0.0|                 0.0|                0.0|
|dairy milk no lon...|            0.0|                 0.0|                0.0|
|greenwashing is r...|            0.0|                 0.0|                0.0|
|greenwashing is r...|            0.0|                 0.0|                0.0|
|newyorkcity schoo...|          0.128| 0

In [41]:
# create SQL view
final_twitter_df.createOrReplaceTempView("twitterSQL")

### 3.1 Volume of tweets 

In [42]:
# select the relevant data
volume = spark.sql("SELECT DATE_FORMAT(post_created_at, 'Y-M-dd') as date, COUNT(*) as volume \
                                    FROM twitterSQL \
                                    GROUP BY DATE_FORMAT(post_created_at, 'Y-M-dd') \
                                    ORDER BY DATE_FORMAT(post_created_at, 'Y-M-dd')")

In [43]:
# show 
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
volume.show(100)

+----------+------+
|      date|volume|
+----------+------+
|2021-10-25|     2|
|2021-10-27|   396|
|2021-10-28|  1007|
|2021-10-29|  9701|
|2021-10-30|  8811|
|2021-10-31|  8602|
|2021-11-01| 17663|
|2021-11-02| 12826|
|2021-11-03|  1549|
|2021-11-04|   966|
|2021-11-05|   186|
|2021-11-06|     5|
|2021-12-05|    12|
|2021-12-06|    36|
|2021-12-07|  2273|
|2021-12-08|  8557|
|2021-12-09|  8547|
|2021-12-10|  9438|
|2021-12-11|  8224|
|2021-12-12|  6558|
|2021-12-13|  1135|
|2021-12-14|  1154|
|2021-12-15|  1190|
|2021-12-16|    89|
|2021-12-25|   495|
| 2022-1-01|  1425|
| 2022-1-02|   707|
| 2022-1-08|  1022|
| 2022-1-09|  1226|
| 2022-1-10|  1433|
| 2022-1-11|  1464|
| 2022-1-12|  1561|
| 2022-1-13|  1358|
| 2022-1-14|  1580|
| 2022-1-15|  1185|
| 2022-1-16|   167|
| 2022-1-20|  1113|
| 2022-1-21|  1904|
| 2022-1-22|  1297|
| 2022-1-23|  1611|
| 2022-1-24|  1819|
| 2022-1-25|  1337|
| 2022-1-26|  1819|
| 2022-1-27|  1490|
| 2022-1-31|    71|
|2022-10-09|   730|
|2022-10-10|   972|


In [44]:
# create SQL view
volume.createOrReplaceTempView("volumeSQL")

### 3.2 Average likes

We exclude tweets with 0 likes.

In [45]:
# select the relevant data
avg_likes = spark.sql("SELECT DATE_FORMAT(post_created_at, 'Y-M-dd') as date, AVG(favorite_count) as avg_likes \
                           FROM twitterSQL \
                           WHERE favorite_count > 0 \
                           GROUP BY DATE_FORMAT(post_created_at, 'Y-M-dd') \
                           ORDER BY DATE_FORMAT(post_created_at, 'Y-M-dd')")

In [46]:
# show 
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
avg_likes.show()

+----------+------------------+
|      date|         avg_likes|
+----------+------------------+
|2021-10-25|               5.0|
|2021-10-27|14.878923766816143|
|2021-10-28| 8.787234042553191|
|2021-10-29|13.020252694165737|
|2021-10-30|12.171084337349397|
|2021-10-31|11.718002081165453|
|2021-11-01| 12.87780347983506|
|2021-11-02| 8.506925207756233|
|2021-11-03| 9.510228640192539|
|2021-11-04| 11.72790294627383|
|2021-11-05| 3.090909090909091|
|2021-11-06|               1.0|
|2021-12-05| 4.857142857142857|
|2021-12-06|               5.0|
|2021-12-07| 15.16415770609319|
|2021-12-08|15.566714786553929|
|2021-12-09|13.000201653559186|
|2021-12-10|18.798192771084338|
|2021-12-11|17.180092787853226|
|2021-12-12| 8.526501766784452|
+----------+------------------+
only showing top 20 rows



In [47]:
# create SQL view
avg_likes.createOrReplaceTempView("avg_likesSQL")

### 3.3 Average Retweets

We exclude tweets with 0 retweets.

In [48]:
# select the relevant data
avg_retweets = spark.sql("SELECT DATE_FORMAT(post_created_at, 'Y-M-dd') as date, AVG(retweet_count) as avg_retweets \
                          FROM twitterSQL \
                          WHERE retweet_count > 0 \
                          GROUP BY DATE_FORMAT(post_created_at, 'Y-M-dd') \
                          ORDER BY DATE_FORMAT(post_created_at, 'Y-M-dd')")

In [49]:
# show 
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
avg_retweets.show()

+----------+------------------+
|      date|      avg_retweets|
+----------+------------------+
|2021-10-27| 7.695121951219512|
|2021-10-28| 3.169230769230769|
|2021-10-29| 7.280495759947815|
|2021-10-30| 5.383431085043989|
|2021-10-31| 6.656727272727273|
|2021-11-01|  6.63281027104137|
|2021-11-02| 4.268883878241263|
|2021-11-03| 3.362934362934363|
|2021-11-04| 3.473170731707317|
|2021-11-05|1.5476190476190477|
|2021-11-06|               1.5|
|2021-12-05|               1.0|
|2021-12-06|             3.875|
|2021-12-07| 7.544502617801047|
|2021-12-08| 7.130705394190872|
|2021-12-09|7.3532934131736525|
|2021-12-10| 8.534136546184738|
|2021-12-11|           10.0768|
|2021-12-12| 5.430648769574944|
|2021-12-13| 5.383954154727793|
+----------+------------------+
only showing top 20 rows



In [50]:
# create SQL view
avg_retweets.createOrReplaceTempView("avg_retweetsSQL")

### 3.4 Average Engagement rate

We define engagement rate of a tweet as the sum of likes and retweets divided by the amount of followers of the account that sent out the tweet. For our purpose we will take the avergage engagement rate per day. We exclude accounts who have no followers and we only take tweets into account which are liked and retweeted at least once.

In [51]:
# select the relevant data
avg_engagement_rate = spark.sql("SELECT DATE_FORMAT(post_created_at, 'Y-M-dd') as date, AVG(engagement_rate) as avg_engagement_rate \
                                     FROM twitterSQL \
                                     GROUP BY DATE_FORMAT(post_created_at, 'Y-M-dd') \
                                     ORDER BY DATE_FORMAT(post_created_at, 'Y-M-dd')")

In [52]:
# show
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
avg_engagement_rate.show()

+----------+--------------------+
|      date| avg_engagement_rate|
+----------+--------------------+
|2021-10-25|0.003094059405940594|
|2021-10-27| 0.02134529191518882|
|2021-10-28|0.024249144556081494|
|2021-10-29|  0.0223016464141139|
|2021-10-30|0.026574548037982985|
|2021-10-31| 0.04700267284774964|
|2021-11-01| 0.02698780408562503|
|2021-11-02|0.017589389867347455|
|2021-11-03|0.030342320208788704|
|2021-11-04|0.026801852188641857|
|2021-11-05|0.011570938136072207|
|2021-11-06|0.001165479244563...|
|2021-12-05|0.003262968586207...|
|2021-12-06|0.015643164611561896|
|2021-12-07|0.022181606174828457|
|2021-12-08| 0.03823743337493997|
|2021-12-09|0.026620944322647062|
|2021-12-10| 0.04562157050472956|
|2021-12-11|0.030933989747992788|
|2021-12-12|0.018746239331627782|
+----------+--------------------+
only showing top 20 rows



In [53]:
# create SQL view
avg_engagement_rate.createOrReplaceTempView("avg_engagement_rateSQL")

### 3.5 Number of influencers

We will calculate how many influencers actively tweeted a certain day. We define an influencer as someone with:
- followers > 1000 
- engagement_rate > 0.20 
- weekly tweet frequency > 5

In [54]:
def get_influencers(follower_count_tresh, eng_rate_tresh, freq_week_tresh, data):

    #df
    df = data
    
    # get all users with their amount of followers
    influencers = df.groupBy("screen_name") \
                    .agg(first("followers_count").alias("followers_count"))

    # average engagement rate for each user
    eng_rate = df.withColumn('eng_rate', ((df['favorite_count'] + df['retweet_count'])/df['followers_count']))

    eng_rate_user = eng_rate.groupBy("screen_name") \
                            .agg(avg("eng_rate").alias("eng_rate"))

    # average freq_weekly per user
    freq_week = df.withColumn("year", year(df["post_created_at"]))
    freq_week = freq_week.withColumn('week', weekofyear('post_created_at'))

    freq_week = freq_week.groupBy('screen_name', 'year', 'week').agg(countDistinct("full_text"))\
                    .withColumnRenamed("count(full_text)", "freq") \
                        .sort('screen_name', 'year', 'week', ascending = True)
    freq_week = freq_week.select('screen_name', 'freq')

    freq_week = freq_week.groupby("screen_name").agg(avg(freq_week.freq).alias('freq'))

    # put the data together
    data_joined = eng_rate_user.join(influencers, "screen_name").join(freq_week, "screen_name")

    # filter the data
    data_joined = data_joined.filter((data_joined.followers_count > follower_count_tresh) & (data_joined.eng_rate > eng_rate_tresh) & (data_joined.freq > freq_week_tresh))
    
    # show the data
    data_joined.show()
    return data_joined

In [55]:
influencers = get_influencers(1000, 0.002, 2, final_twitter_df)

+---------------+--------------------+---------------+------------------+
|    screen_name|            eng_rate|followers_count|              freq|
+---------------+--------------------+---------------+------------------+
|        AQUAB23|0.022003034901365705|           1318|               3.0|
|AlsJane_therapy|0.008247976142192238|           6226|               2.5|
|AmazingArbuckle|0.003063373540111...|           3482|               3.0|
|   AmeliaLynn70|0.014513189093212512|           2234|2.3333333333333335|
|AstridAlderleaf|0.002847876012638...|           6320|               4.0|
|   BDAWOSBranch|0.002719854941069...|           1103|               3.0|
|    BlogofVegan|0.003641420110945388|           9257| 4.653846153846154|
|   BrianKateman|0.004763913172491486|           1542|               3.5|
|   CathyGreen67|0.003029875597498...|           1161|3.1666666666666665|
|   ChubbieVegan|0.003564221783895219|           2012|2.4411764705882355|
|      CloseUpPR|0.002352610723569...|

In [56]:
# create SQL view
influencers.createOrReplaceTempView("influencersSQL")

In [57]:
# select the relevant data
number_of_influencers = spark.sql(" SELECT DATE_FORMAT(a.post_created_at, 'Y-M-dd') as date, COUNT(b.screen_name) as influencers \
                                    FROM twitterSQL a \
                                    RIGHT OUTER JOIN influencersSQL b ON a.screen_name = b.screen_name\
                                    GROUP BY DATE_FORMAT(post_created_at, 'Y-M-dd') \
                                    ORDER BY DATE_FORMAT(post_created_at, 'Y-M-dd')")

In [58]:
# show
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
number_of_influencers.show()

+----------+-----------+
|      date|influencers|
+----------+-----------+
|2021-10-27|         32|
|2021-10-28|         66|
|2021-10-29|        533|
|2021-10-30|        400|
|2021-10-31|        433|
|2021-11-01|        895|
|2021-11-02|        638|
|2021-11-03|        120|
|2021-11-04|         92|
|2021-11-05|          9|
|2021-12-05|          3|
|2021-12-06|          2|
|2021-12-07|        188|
|2021-12-08|        394|
|2021-12-09|        452|
|2021-12-10|        547|
|2021-12-11|        575|
|2021-12-12|        325|
|2021-12-13|         82|
|2021-12-14|         77|
+----------+-----------+
only showing top 20 rows



In [59]:
# create SQL view
number_of_influencers.createOrReplaceTempView("number_of_influencersSQL")

### 3.6 Average Followers 

In [60]:
# select the relevant data
avg_followers = spark.sql("SELECT DATE_FORMAT(post_created_at, 'Y-M-dd') as date, AVG(followers_count) as avg_followers \
                          FROM twitterSQL \
                          WHERE followers_count > 0 \
                          GROUP BY DATE_FORMAT(post_created_at, 'Y-M-dd') \
                          ORDER BY DATE_FORMAT(post_created_at, 'Y-M-dd')")

In [61]:
# show
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
avg_followers.show()

+----------+------------------+
|      date|     avg_followers|
+----------+------------------+
|2021-10-25|            1037.0|
|2021-10-27|3704.7251308900522|
|2021-10-28|11143.559559559559|
|2021-10-29|15421.784556225317|
|2021-10-30|13092.364626318202|
|2021-10-31| 8639.303048134636|
|2021-11-01|14902.429470672389|
|2021-11-02|12490.079042670446|
|2021-11-03|7811.1375488917865|
|2021-11-04| 5468.707240293809|
|2021-11-05| 4627.354838709677|
|2021-11-06|             880.2|
|2021-12-05|1248.4545454545455|
|2021-12-06|14294.638888888889|
|2021-12-07|  6173.18774966711|
|2021-12-08| 18653.36134057116|
|2021-12-09| 9285.449952896844|
|2021-12-10|12855.178205128204|
|2021-12-11| 11750.13307144611|
|2021-12-12|10717.924249422633|
+----------+------------------+
only showing top 20 rows



In [62]:
# create SQL view
number_of_influencers.createOrReplaceTempView("followersSQL")

### 3.7 Average Emoji Counts

In [63]:
# select the relevant data
avg_emoji = spark.sql("SELECT DATE_FORMAT(post_created_at, 'Y-M-dd') as date, AVG(emoji_count) as avg_emojis \
                          FROM twitterSQL \
                          GROUP BY DATE_FORMAT(post_created_at, 'Y-M-dd') \
                          ORDER BY DATE_FORMAT(post_created_at, 'Y-M-dd')")

In [64]:
# show
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
avg_emoji.show()

+----------+-------------------+
|      date|         avg_emojis|
+----------+-------------------+
|2021-10-25|                0.0|
|2021-10-27| 0.4393939393939394|
|2021-10-28|0.40913604766633566|
|2021-10-29| 0.6046799299041336|
|2021-10-30| 0.6746112813528544|
|2021-10-31| 0.6129969774471054|
|2021-11-01| 0.7203193115552284|
|2021-11-02| 0.5962108217682832|
|2021-11-03| 0.4280180761781795|
|2021-11-04|0.36231884057971014|
|2021-11-05| 0.4731182795698925|
|2021-11-06|                0.2|
|2021-12-05| 1.3333333333333333|
|2021-12-06| 1.1111111111111112|
|2021-12-07| 0.4747030356357237|
|2021-12-08| 0.5062521911885006|
|2021-12-09| 0.5205335205335205|
|2021-12-10|0.49502013138376777|
|2021-12-11| 0.5052285992217899|
|2021-12-12| 0.4501372369624886|
+----------+-------------------+
only showing top 20 rows



In [65]:
# create SQL view
number_of_influencers.createOrReplaceTempView("emojiSQL")

### 3.8 Avergae Number of Words

In [66]:
# select the relevant data
avg_words = spark.sql("SELECT DATE_FORMAT(post_created_at, 'Y-M-dd') as date, AVG(num_words) as avg_words \
                          FROM twitterSQL \
                          GROUP BY DATE_FORMAT(post_created_at, 'Y-M-dd') \
                          ORDER BY DATE_FORMAT(post_created_at, 'Y-M-dd')")

In [67]:
# show
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
avg_words.show()

+----------+------------------+
|      date|         avg_words|
+----------+------------------+
|2021-10-25|              15.0|
|2021-10-27|29.348484848484848|
|2021-10-28|25.028798411122146|
|2021-10-29|23.752808988764045|
|2021-10-30|23.280444898422427|
|2021-10-31| 23.88246919321088|
|2021-11-01| 24.81679216441148|
|2021-11-02|24.399968813347886|
|2021-11-03|25.956100710135573|
|2021-11-04|29.697722567287784|
|2021-11-05|26.892473118279568|
|2021-11-06|              15.6|
|2021-12-05|26.583333333333332|
|2021-12-06|23.666666666666668|
|2021-12-07|22.808622965244172|
|2021-12-08| 22.82587355381559|
|2021-12-09|23.022698022698023|
|2021-12-10|23.552235643144734|
|2021-12-11|23.504012645914397|
|2021-12-12|23.552912473315036|
+----------+------------------+
only showing top 20 rows



In [68]:
# create SQL view
avg_words.createOrReplaceTempView("wordsSQL")

### 3.9 Avergae Number of Hashtags

In [69]:
# select the relevant data
avg_hashtags = spark.sql("SELECT DATE_FORMAT(post_created_at, 'Y-M-dd') as date, AVG(num_hashtags) as avg_hashtags \
                          FROM twitterSQL \
                          GROUP BY DATE_FORMAT(post_created_at, 'Y-M-dd') \
                          ORDER BY DATE_FORMAT(post_created_at, 'Y-M-dd')")

In [70]:
# show
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
avg_hashtags.show()

+----------+------------------+
|      date|      avg_hashtags|
+----------+------------------+
|2021-10-25|               0.0|
|2021-10-27|1.6767676767676767|
|2021-10-28|1.3475670307845085|
|2021-10-29|1.0503040923616123|
|2021-10-30| 1.139371240494836|
|2021-10-31|1.1068356196233433|
|2021-11-01|1.4346939930929061|
|2021-11-02|1.1361297364727896|
|2021-11-03|1.5061329890251776|
|2021-11-04|1.5838509316770186|
|2021-11-05| 3.575268817204301|
|2021-11-06|               5.6|
|2021-12-05|             11.75|
|2021-12-06| 6.833333333333333|
|2021-12-07|0.9001319841619005|
|2021-12-08| 1.120603015075377|
|2021-12-09|1.0933660933660934|
|2021-12-10|0.9635515999152363|
|2021-12-11|0.8852140077821011|
|2021-12-12| 1.313815187557182|
+----------+------------------+
only showing top 20 rows



In [71]:
# create SQL view
avg_hashtags.createOrReplaceTempView("hashtagsSQL")

### 3.10 Average Number of Mentions

In [72]:
# select the relevant data
avg_mentions = spark.sql("SELECT DATE_FORMAT(post_created_at, 'Y-M-dd') as date, AVG(num_mentions) as avg_mentions \
                          FROM twitterSQL \
                          GROUP BY DATE_FORMAT(post_created_at, 'Y-M-dd') \
                          ORDER BY DATE_FORMAT(post_created_at, 'Y-M-dd')")

In [73]:
# show
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
avg_mentions.show()

+----------+------------------+
|      date|      avg_mentions|
+----------+------------------+
|2021-10-25|               0.5|
|2021-10-27|1.0934343434343434|
|2021-10-28|1.0903674280039721|
|2021-10-29|   1.1762704875786|
|2021-10-30|1.3434343434343434|
|2021-10-31|1.2687747035573123|
|2021-11-01|1.0906980694106323|
|2021-11-02|1.3746296585061593|
|2021-11-03|1.2246610716591348|
|2021-11-04|1.2536231884057971|
|2021-11-05|0.8118279569892473|
|2021-11-06|               0.0|
|2021-12-05|0.3333333333333333|
|2021-12-06|0.7777777777777778|
|2021-12-07|0.9661240651121865|
|2021-12-08|0.8418838377936193|
|2021-12-09|0.9275769275769276|
|2021-12-10| 1.134668361941089|
|2021-12-11|1.0730787937743191|
|2021-12-12|1.0767002134797194|
+----------+------------------+
only showing top 20 rows



In [74]:
# create SQL view
avg_mentions.createOrReplaceTempView("mentionsSQL")

### 3.11 Average Number of Exclamation Marks

In [75]:
# select the relevant data
avg_marks = spark.sql("SELECT DATE_FORMAT(post_created_at, 'Y-M-dd') as date, AVG(num_exclamation_marks) as avg_exclamation_marks \
                          FROM twitterSQL \
                          GROUP BY DATE_FORMAT(post_created_at, 'Y-M-dd') \
                          ORDER BY DATE_FORMAT(post_created_at, 'Y-M-dd')")

In [76]:
# show
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
avg_marks.show()

+----------+---------------------+
|      date|avg_exclamation_marks|
+----------+---------------------+
|2021-10-25|                  0.0|
|2021-10-27|  0.20959595959595959|
|2021-10-28|   0.2025819265143992|
|2021-10-29|   0.2621379239253685|
|2021-10-30|  0.23357167177391897|
|2021-10-31|   0.2342478493373634|
|2021-11-01|   0.3399196059559531|
|2021-11-02|  0.23023545922345237|
|2021-11-03|   0.2091672046481601|
|2021-11-04|  0.16252587991718426|
|2021-11-05|  0.41935483870967744|
|2021-11-06|                  0.0|
|2021-12-05|   0.3333333333333333|
|2021-12-06|   0.3055555555555556|
|2021-12-07|  0.29916410030796303|
|2021-12-08|  0.24856842351291342|
|2021-12-09|  0.24254124254124254|
|2021-12-10|  0.23151091332909515|
|2021-12-11|              0.21875|
|2021-12-12|  0.23132052455016774|
+----------+---------------------+
only showing top 20 rows



In [77]:
# create SQL view
avg_marks.createOrReplaceTempView("marksSQL")

### 3.12 Sentiment 

In [78]:
# select the relevant data
avg_sentiment = spark.sql("SELECT DATE_FORMAT(post_created_at, 'Y-M-dd') as date, AVG(sentiment_vader) as avg_sentiment \
                          FROM twitterSQL \
                          GROUP BY DATE_FORMAT(post_created_at, 'Y-M-dd') \
                          ORDER BY DATE_FORMAT(post_created_at, 'Y-M-dd')")

In [79]:
# show
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
avg_sentiment.show()

+----------+-------------------+
|      date|      avg_sentiment|
+----------+-------------------+
|2021-10-25|0.21450000000000002|
|2021-10-27| 0.1265151515151515|
|2021-10-28|0.12339424031777559|
|2021-10-29|0.13889227914647964|
|2021-10-30| 0.1416517988877539|
|2021-10-31|0.14321367123924658|
|2021-11-01|0.16307739342127617|
|2021-11-02|0.14077943240293153|
|2021-11-03|0.13228986442866364|
|2021-11-04|0.11481987577639752|
|2021-11-05|0.14730107526881722|
|2021-11-06|             0.1476|
|2021-12-05|            0.15025|
|2021-12-06|0.21541666666666665|
|2021-12-07|0.15144830620325564|
|2021-12-08|0.14207093607572754|
|2021-12-09|0.14165531765531755|
|2021-12-10|0.13792424242424242|
|2021-12-11|0.13936016536964976|
|2021-12-12|0.14111802378774016|
+----------+-------------------+
only showing top 20 rows



In [80]:
# create SQL view
avg_sentiment.createOrReplaceTempView("sentimentSQL")

### 3.13 Polarity

In [81]:
# select the relevant data
avg_polarity = spark.sql("SELECT DATE_FORMAT(post_created_at, 'Y-M-dd') as date, AVG(polarity) as avg_polarity \
                          FROM twitterSQL \
                          GROUP BY DATE_FORMAT(post_created_at, 'Y-M-dd') \
                          ORDER BY DATE_FORMAT(post_created_at, 'Y-M-dd')")

In [82]:
# show
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
avg_polarity.show()

+----------+-------------------+
|      date|       avg_polarity|
+----------+-------------------+
|2021-10-25|0.32500000000000007|
|2021-10-27|0.12702171565641085|
|2021-10-28| 0.1141894718000903|
|2021-10-29|0.12819660932414867|
|2021-10-30| 0.1351922168225551|
|2021-10-31|0.13300623006534804|
|2021-11-01|0.18316873996993763|
|2021-11-02| 0.1344894138598013|
|2021-11-03|0.11277198890931094|
|2021-11-04|0.10145347439457912|
|2021-11-05| 0.2000849611982004|
|2021-11-06|0.17398989898989897|
|2021-12-05|0.14224537037037038|
|2021-12-06| 0.2354082350262906|
|2021-12-07|0.14625957594418673|
|2021-12-08|0.13326438421092746|
|2021-12-09| 0.1309096075876976|
|2021-12-10|0.12113829041011834|
|2021-12-11|0.12830378170827264|
|2021-12-12|0.12405803675085038|
+----------+-------------------+
only showing top 20 rows



In [83]:
# create SQL view
avg_polarity.createOrReplaceTempView("polaritySQL")

### 3.13 Subjectivity

In [84]:
# select the relevant data
avg_subjectivity = spark.sql("SELECT DATE_FORMAT(post_created_at, 'Y-M-dd') as date, AVG(subjectivity) as avg_subjectivity \
                          FROM twitterSQL \
                          GROUP BY DATE_FORMAT(post_created_at, 'Y-M-dd') \
                          ORDER BY DATE_FORMAT(post_created_at, 'Y-M-dd')")

In [85]:
# show
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
avg_subjectivity.show()

+----------+-------------------+
|      date|   avg_subjectivity|
+----------+-------------------+
|2021-10-25|              0.575|
|2021-10-27| 0.4607615970022174|
|2021-10-28|0.40723402379121265|
|2021-10-29|0.41847483528656193|
|2021-10-30| 0.4174534735960923|
|2021-10-31|0.41968838608618286|
|2021-11-01| 0.4484449299283815|
|2021-11-02|0.42274322635546135|
|2021-11-03|0.42525202685569896|
|2021-11-04| 0.4268351742889971|
|2021-11-05|0.48653377742826687|
|2021-11-06| 0.3757070707070707|
|2021-12-05|0.28032407407407406|
|2021-12-06|  0.419161550897662|
|2021-12-07| 0.4064326072683204|
|2021-12-08| 0.4032351293825101|
|2021-12-09|0.40076446549362343|
|2021-12-10| 0.4005971880182179|
|2021-12-11|0.40747466387117864|
|2021-12-12| 0.4147956652341835|
+----------+-------------------+
only showing top 20 rows



In [86]:
# create SQL view
avg_subjectivity.createOrReplaceTempView("subjectivitySQL")

## 4. Basetable

In [125]:
# create basetable
basetable = spark.sql("SELECT DATE_FORMAT(a.date, 'Y-M-dd') as date, a.dependent_vegan, b.tweet_volume, COALESCE(c.avg_likes,0) as avg_likes \
                       COALESCE(d.avg_retweets,0) as avg_retweets, \
                       COALESCE(e.avg_engagement_rate,0) as avg_engagement_rate, COALESCE(f.influencers,0) as influencers \
                    FROM trendSQL a \
                    INNER JOIN tweet_volumeSQL b ON DATE_FORMAT(a.date, 'Y-M-dd') = b.date \
                    LEFT OUTER JOIN avg_likesSQL c ON b.date = c.date \
                    LEFT OUTER JOIN avg_retweetsSQL d ON c.date = d.date \
                    LEFT OUTER JOIN avg_engagement_rateSQL e ON d.date = e.date \
                    LEFT OUTER JOIN number_of_influencersSQL f ON e.date = f.date \
                    LEFT OUTER JOIN followersSQL g ON f.date = g.date \ 
                    LEFT OUTER JOIN emojiSQL h ON g.date = h.date \
                    LEFT OUTER JOIN wordsSQL i ON h.date = i.date \
                    LEFT OUTER JOIN hashtagsSQL j ON i.date = j.date \
                    LEFT OUTER JOIN mentionsSQL k ON j.date = k.date \
                    LEFT OUTER JOIN marksSQL l ON k.date = l.date \
                    LEFt OUTER JOIN sentimentSQL m ON l.date = m.date \
                    LEFT OUTER JOIN polaritySQL n ON m.date = n.date \
                    LEFT OUTER JOIN subjectivitySQL o ON n.date = o.date \
                    ORDER BY DATE_FORMAT(post_created_at, 'Y-M-dd')" )

SyntaxError: unterminated string literal (detected at line 11) (365076258.py, line 2)

In [122]:
# show
basetable.show(50)

NameError: name 'basetable' is not defined

In [None]:
# Dropping the date column as this is not a feature 
basetable.drop('date', axis=1, inplace=True)
basetable.head()

In [None]:
# store basetable as a .parquet file
basetable.to_parquet("./../../data/basetable_vegan_trend_prediction.parquet")

In [None]:
# export basetable as a .json file
basetable.to_json("./../../data/basetable_vegan_trend_prediction.json", orient="records", force_ascii=False, lines=True)

## 5. Modelling

In [None]:
# read in the saved basetable (.parquet)
#basetable_df = spark.read.parquet("./../../data/basetable_vegan_trend_prediction.parquet")

In [None]:
# read in the saved basetable (.json)
#basetable_df = spark.read.json("./../../data/basetable_vegan_trend_prediction.json")

In [None]:
#basetable_df.toPandas().head()

In [None]:
# import the required functions
from pyspark.ml.feature import Binarizer, StringIndexer, VectorIndexer, VectorAssembler, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.sql.types import DoubleType

In [None]:
# define string indexer to index price 
SI = StringIndexer(inputCol = 'dependent_vegan', outputCol = 'label')

# define vector assembler for numeric variables
numColumns = ['volume','avg_likes','avg_retweets','avg_engagement_rate','influencers' \
             ,'avg_followers', 'avg_emojis', 'avg_words', 'avg_hashtags', 'avg_mentions' \
             ,'avg_exclamation_marks', 'avg_sentiment', 'avg_polarity','avg_subjectivity']
VAnum = VectorAssembler(inputCols=numColumns, outputCol="numFeatures")

In [None]:
# define pipeline stages
stages = [SI, VAnum]
# define pipeline and fit on data
preprocessingPipeline = Pipeline().setStages(stages).fit(basetable)
# apply pipeline on data
basetable_df = preprocessingPipeline.transform(basetable)

In [None]:
# select features and labels
basetable = basetable.select(["numFeatures", "label"])

In [None]:
# check
basetable.show(5)

**Logistic Regression**
- Split the data in a train and test set (70/30).
- Build one pipeline that:
  - standardizes the numerical variables
  - applies a logistic regression to the data
  - check the performance using the AUC.

We cannot use the randomsplit function, because we have time series data, so we have to use another approach

First we look at the amount of observations that will be assigned to the training set 

In [None]:
nr_train = int(basetable.count()*0.7)
nr_train

convert the final basetable to a pandas dataset 

In [None]:
basetable_pd = basetable.toPandas()
basetable_pd.head()

Split the dataframe into train and test 

In [None]:
train_pd = basetable_pd.iloc[:nr_train,:]
test_pd = basetable_pd.iloc[nr_train:,:]
train = spark.createDataFrame(train_pd)
test = spark.createDataFrame(test_pd)

In [None]:
# check number of observations in train and test set
print(train.count())
print(test.count())

In [None]:
# inspect distribution of label in train and test set
basetable.groupBy("label").count().show()
train.groupBy("label").count().show()
test.groupBy("label").count().show()

In [None]:
# import required features
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [None]:
# define scaler
SS = StandardScaler(inputCol = 'numFeatures', outputCol = 'scaledNumFeatures', withStd = True, withMean = False)

# define vector assembler
VA = VectorAssembler(inputCols = ['scaledNumFeatures'], outputCol = 'features')

# define logistic regression model
LR = LogisticRegression(labelCol = 'label', featuresCol = 'features', maxIter = 10)

In [None]:
# define pipeline stages
stages = [SS, VA, LR]
# create pipeline and fit on training set
lrModelPipeline = Pipeline().setStages(stages).fit(train)
# apply pipeline on test set to get predictions
predictions = lrModelPipeline.transform(test)

In [None]:
# inspect predictions
predictions.show(5)

In [None]:
# define evaluator
evaluator = BinaryClassificationEvaluator()
# get evaluation metric
lrAUC = evaluator.evaluate(predictions, {evaluator.metricName: 'areaUnderROC'})
# inspect model performance
print('AUC lr: %f' %(lrAUC))