In [2]:
# import findspark
import findspark
# initialize findspark with spark directory
#findspark.init("C:\Program Files\Spark\spark-3.3.1-bin-hadoop3")
findspark.init("/Users/wouterdewitte/spark/")
# import pyspark
import pyspark
# create spark context
sc = pyspark.SparkContext()
# create spark session 
spark = pyspark.sql.SparkSession(sc)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/12/07 16:01:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# import packages
import os 
import pickle
import re
from datetime import datetime
import requests
import pytz
import emojis
import pandas as pd
import numpy as np
import ast
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.functions import array_contains
import matplotlib.pyplot as plt 

## General

In this notebook we will buid a model that predicts if the trend of a certain topic goes up or down on a certain day based on Twitter data of that day.

## 1. Import Data

### 1.1 Google Trends

In [5]:
# read trend data 
trend = spark.read.csv(".././../data/Google_trends/daily_trends.csv", header=True, inferSchema=True, sep=';')

In [45]:
trend

DataFrame[date: timestamp, dependent_vegan: int]

In [54]:
from pyspark.sql.window import Window

w = Window().partitionBy().orderBy(col("date"))
trend.withColumn("dependent_vegan", lag("dependent_vegan", -1, 0).over(w)).show()

22/12/07 16:43:27 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/12/07 16:43:27 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/12/07 16:43:27 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/12/07 16:43:27 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/12/07 16:43:27 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
+-------------------+---------------+
|               date|dependent_vegan|
+-------------------+---------------+
|2021-10-04 00:00:00|              1|
|2021-10-05 00:00:00|  

In [55]:
# create SQL view
trend.createOrReplaceTempView("trendSQL")

The binary variable indicates if the trend goes up or down.

### 1.2 Twitter

In [56]:
# define data dir
data_dir = "../../data/Topic/"

# get all twitter files
tweet_files = [os.path.join(data_dir, obs) for obs in os.listdir(data_dir)] 

In [57]:
# import twitter data 
#twitter_df = spark.read.json(tweet_files)

In [58]:
list_hashtags = ["vegan",
               "veganism",
               "vegetarian",
                "veganfood",
                "vegano",
                "veganrecipes",
                "vegansofig",
                "vegansofinstagram"]

data_dir = ".././../data/Topic/"
tweet_files = [os.path.join(data_dir, obs) for obs in os.listdir(data_dir)]
files_hashtags = [file for file in tweet_files if (file.find(list_hashtags[0]) != -1)]             
twitter_df = spark.read.option("multiline","true").json(files_hashtags) 
twitter_df.count()

                                                                                

1827680

In [59]:
# select interesting features
twitter_df = twitter_df.select(F.col('user.name'),
                                F.col('user.screen_name'),
                                F.col('user.followers_count'),
                                F.col('user.following'),
                                F.col('user.statuses_count'),
                                F.col('user.listed_count'),
                                F.col('created_at'),
                                F.col('full_text'),
                                F.col('entities.hashtags'),
                                F.col('favorite_count'),
                                F.col('retweet_count'),
                                F.col('user.friends_count'))

## 2. Data Preprocessing

#### 2.1 Check time period

In [60]:
# function to convert Twitter date string format
def getDate(date):
    if date is not None:
        return str(datetime.strptime(date,'%a %b %d %H:%M:%S +0000 %Y').replace(tzinfo=pytz.UTC).strftime("%Y-%m-%d %H:%M:%S"))
    else:
        return None

# UDF declaration
date_udf = F.udf(getDate, StringType())

# apply udf
twitter_df = twitter_df.withColumn('post_created_at', F.to_utc_timestamp(date_udf("created_at"), "UTC"))

In [61]:
# get first post
first_post = F.min('post_created_at').alias('earliest')
# get latest post
latest_post = F.max('post_created_at').alias('latest')
# show tweet period in our dataset
twitter_df.select(first_post, latest_post).show()



+-------------------+-------------------+
|           earliest|             latest|
+-------------------+-------------------+
|2021-10-25 07:19:40|2022-10-11 23:17:33|
+-------------------+-------------------+



                                                                                

#### 2.2 Remove retweets and duplicates

In [62]:
# drop all retweets from dataset
no_retweets_df = twitter_df.filter(~F.col("full_text").startswith("RT"))

In [63]:
# first sort no_retweets_df based on date in chronological order (most recent ones on top)
no_retweets_sorted_df = no_retweets_df.sort("post_created_at", ascending=False)

In [64]:
# number of observations before dropping duplicates
no_retweets_sorted_df.count()

                                                                                

745916

In [65]:
# drop duplicates based on tweet text and the profile it was posted from
final_no_duplicates_df = no_retweets_sorted_df.drop_duplicates(["full_text", "screen_name"])

In [66]:
# number of observations after dropping duplicates
final_no_duplicates_df.count()

                                                                                

693932

In [67]:
# rename dataframe
final_twitter_df = final_no_duplicates_df

## 3. Independent Variables

For our independent variables we need to design a pipeline that transforms the data into the desired aggregated metrics per day.

In [68]:
# create SQL view
final_twitter_df.createOrReplaceTempView("twitterSQL")

### 3.1 Volume of tweets 

In [69]:
# select the relevant data
tweet_volume = spark.sql("SELECT DATE_FORMAT(post_created_at, 'Y-M-dd') as date, COUNT(*) as tweet_volume \
                                    FROM twitterSQL \
                                    GROUP BY DATE_FORMAT(post_created_at, 'Y-M-dd') \
                                    ORDER BY DATE_FORMAT(post_created_at, 'Y-M-dd')")

In [70]:
# show 
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
tweet_volume.show(100)

[Stage 285:>                                                        (0 + 8) / 9]

+----------+------------+
|      date|tweet_volume|
+----------+------------+
|2021-10-25|          50|
|2021-10-26|          45|
|2021-10-27|         894|
|2021-10-28|        2825|
|2021-10-29|       14021|
|2021-10-30|       12497|
|2021-10-31|       12414|
|2021-11-01|       24108|
|2021-11-02|       17623|
|2021-11-03|        3316|
|2021-11-04|        2560|
|2021-11-05|         593|
|2021-11-06|           6|
|2021-12-03|           4|
|2021-12-04|          66|
|2021-12-05|          72|
|2021-12-06|        1336|
|2021-12-07|        4560|
|2021-12-08|       13077|
|2021-12-09|       12693|
|2021-12-10|       13848|
|2021-12-11|       12213|
|2021-12-12|       10589|
|2021-12-13|        2930|
|2021-12-14|        1941|
|2021-12-15|        1596|
|2021-12-16|         107|
|2021-12-25|         637|
| 2022-1-01|        1744|
| 2022-1-02|         973|
| 2022-1-08|        1326|
| 2022-1-09|        1672|
| 2022-1-10|        1886|
| 2022-1-11|        1891|
| 2022-1-12|        1996|
| 2022-1-13|

                                                                                

In [71]:
# create SQL view
tweet_volume.createOrReplaceTempView("tweet_volumeSQL")

### 3.2 Average likes

We exclude tweets with 0 likes.

In [72]:
# select the relevant data
avg_likes = spark.sql("SELECT DATE_FORMAT(post_created_at, 'Y-M-dd') as date, AVG(favorite_count) as avg_likes \
                           FROM twitterSQL \
                           WHERE favorite_count > 0 \
                           GROUP BY DATE_FORMAT(post_created_at, 'Y-M-dd') \
                           ORDER BY DATE_FORMAT(post_created_at, 'Y-M-dd')")

In [73]:
# show 
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
avg_likes.show()



+----------+------------------+
|      date|         avg_likes|
+----------+------------------+
|2021-10-25|           4.65625|
|2021-10-26|             5.125|
|2021-10-27|11.645669291338583|
|2021-10-28|11.103731815306768|
|2021-10-29| 12.31424108305129|
|2021-10-30|11.979163693449408|
|2021-10-31|12.956186317321688|
|2021-11-01| 13.27580421620833|
|2021-11-02| 8.794319501636576|
|2021-11-03|15.065796937039138|
|2021-11-04|10.239657631954351|
|2021-11-05| 3.459016393442623|
|2021-11-06|               2.5|
|2021-12-03|              20.0|
|2021-12-04|             10.08|
|2021-12-05|5.7105263157894735|
|2021-12-06| 11.53735255570118|
|2021-12-07|26.699334319526628|
|2021-12-08| 14.52754383542731|
|2021-12-09|12.793646370349729|
+----------+------------------+
only showing top 20 rows



                                                                                

In [74]:
# create SQL view
avg_likes.createOrReplaceTempView("avg_likesSQL")

### 3.3 Average Retweets

We exclude tweets with 0 retweets.

In [75]:
# select the relevant data
avg_retweets = spark.sql("SELECT DATE_FORMAT(post_created_at, 'Y-M-dd') as date, AVG(retweet_count) as avg_retweets \
                          FROM twitterSQL \
                          WHERE retweet_count > 0 \
                          GROUP BY DATE_FORMAT(post_created_at, 'Y-M-dd') \
                          ORDER BY DATE_FORMAT(post_created_at, 'Y-M-dd')")

In [76]:
# show 
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
avg_retweets.show()



+----------+------------------+
|      date|      avg_retweets|
+----------+------------------+
|2021-10-25|               3.0|
|2021-10-26| 4.166666666666667|
|2021-10-27| 5.993006993006993|
|2021-10-28| 5.175879396984925|
|2021-10-29|6.7106673161227475|
|2021-10-30| 5.183630640083946|
|2021-10-31| 6.077004219409282|
|2021-11-01| 6.752923976608187|
|2021-11-02| 4.722175732217573|
|2021-11-03| 8.869379014989294|
|2021-11-04| 5.420485175202156|
|2021-11-05|1.7222222222222223|
|2021-11-06|               1.5|
|2021-12-03|               2.5|
|2021-12-04|2.5714285714285716|
|2021-12-05|2.7777777777777777|
|2021-12-06| 7.503703703703704|
|2021-12-07|13.976780185758514|
|2021-12-08| 6.731100963977676|
|2021-12-09|  6.67574931880109|
+----------+------------------+
only showing top 20 rows



                                                                                

In [77]:
# create SQL view
avg_retweets.createOrReplaceTempView("avg_retweetsSQL")

### 3.4 Engagement rate

We define engagement rate of a tweet as the sum of likes and retweets divided by the amount of followers of the account that sent out the tweet. For our purpose we will take the avergage engagement rate per day. We exclude accounts who have no followers and we only take tweets into account which are liked and retweeted at least once.

In [78]:
# select the relevant data
avg_engagement_rate = spark.sql("SELECT DATE_FORMAT(post_created_at, 'Y-M-dd') as date, AVG(engagement_rate) as avg_engagement_rate \
                                     FROM (  SELECT screen_name, post_created_at, (favorite_count+retweet_count)/followers_count as engagement_rate \
                                             FROM twitterSQL \
                                             WHERE favorite_count > 0 AND retweet_count > 0 AND followers_count > 0 ) \
                                     GROUP BY DATE_FORMAT(post_created_at, 'Y-M-dd') \
                                     ORDER BY DATE_FORMAT(post_created_at, 'Y-M-dd')")

In [79]:
# show
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
avg_engagement_rate.show()



22/12/07 16:49:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 16:49:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 16:49:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 16:49:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 16:49:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 16:49:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 16:49:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




+----------+--------------------+
|      date| avg_engagement_rate|
+----------+--------------------+
|2021-10-25|0.035312352622552404|
|2021-10-26|0.041442045473123704|
|2021-10-27| 0.07417525789424631|
|2021-10-28| 0.21275163037467218|
|2021-10-29| 0.07491809784621552|
|2021-10-30| 0.11431344329991702|
|2021-10-31| 0.19613345872986773|
|2021-11-01| 0.09034112086921371|
|2021-11-02| 0.06331765741485894|
|2021-11-03| 0.34234633383160695|
|2021-11-04| 0.05662231773327246|
|2021-11-05| 0.21296895770236038|
|2021-11-06|0.005484460694698354|
|2021-12-03| 0.01529917011031044|
|2021-12-04| 0.32792771260105785|
|2021-12-05|0.009540321788060942|
|2021-12-06| 0.06278172079703152|
|2021-12-07|  0.2260207942040726|
|2021-12-08|  0.1298943012098277|
|2021-12-09| 0.06074244817993021|
+----------+--------------------+
only showing top 20 rows



                                                                                

In [80]:
# create SQL view
avg_engagement_rate.createOrReplaceTempView("avg_engagement_rateSQL")

### 3.5 Number of influencers

We will calculate how many influencers actively tweeted a certain day. We define an influencer as someone with:
- followers > 1000 
- engagement_rate > 0.20 
- weekly tweet frequency > 5

In [81]:
def get_influencers(follower_count_tresh, eng_rate_tresh, freq_week_tresh, data):

    #df
    df = data
    
    # get all users with their amount of followers
    influencers = df.groupBy("screen_name") \
                    .agg(first("followers_count").alias("followers_count"))

    # average engagement rate for each user
    eng_rate = df.withColumn('eng_rate', ((df['favorite_count'] + df['retweet_count'])/df['followers_count']))

    eng_rate_user = eng_rate.groupBy("screen_name") \
                            .agg(avg("eng_rate").alias("eng_rate"))

    # average freq_weekly per user
    freq_week = df.withColumn("year", year(df["post_created_at"]))
    freq_week = freq_week.withColumn('week', weekofyear('post_created_at'))

    freq_week = freq_week.groupBy('screen_name', 'year', 'week').agg(countDistinct("full_text"))\
                    .withColumnRenamed("count(full_text)", "freq") \
                        .sort('screen_name', 'year', 'week', ascending = True)
    freq_week = freq_week.select('screen_name', 'freq')

    freq_week = freq_week.groupby("screen_name").agg(avg(freq_week.freq).alias('freq'))

    # put the data together
    data_joined = eng_rate_user.join(influencers, "screen_name").join(freq_week, "screen_name")

    # filter the data
    data_joined = data_joined.filter((data_joined.followers_count > follower_count_tresh) & (data_joined.eng_rate > eng_rate_tresh) & (data_joined.freq > freq_week_tresh))
    
    # show the data
    data_joined.show()
    return data_joined

In [82]:
influencers = get_influencers(1000, 0.002, 2, final_twitter_df)

[Stage 331:(136 + 7) / 143][Stage 333:>  (0 + 1) / 9][Stage 335:>  (0 + 0) / 9]]

22/12/07 16:52:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 331:(140 + 3) / 143][Stage 333:>  (2 + 5) / 9][Stage 335:>  (0 + 0) / 9]

22/12/07 16:52:15 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




+---------------+--------------------+---------------+------------------+
|    screen_name|            eng_rate|followers_count|              freq|
+---------------+--------------------+---------------+------------------+
|      5GenocIDe|0.003218405440206...|           1140| 4.942857142857143|
|        AQUAB23|0.022003034901365705|           1318|               3.0|
|AlsJane_therapy|0.008247976142192238|           6226|               2.5|
|AmazingArbuckle|0.003063373540111...|           3482|               3.0|
|   AmeliaLynn70|0.014513189093212512|           2234|2.3333333333333335|
|Antoniosaiyajin|0.005135345260946718|           3699|               3.0|
|   BDAWOSBranch|0.002719854941069...|           1103|               3.0|
|    BlogofVegan|0.003437569278129...|           9257| 5.115384615384615|
|   BrianKateman|0.004763913172491486|           1542|               3.5|
|   CathyGreen67|0.003029875597498...|           1161|3.1666666666666665|
|   ChubbieVegan|0.003564221783895...|

                                                                                

In [83]:
# create SQL view
influencers.createOrReplaceTempView("influencersSQL")

In [84]:
# select the relevant data
number_of_influencers = spark.sql(" SELECT DATE_FORMAT(a.post_created_at, 'Y-M-dd') as date, COUNT(b.screen_name) as influencers \
                                    FROM twitterSQL a \
                                    RIGHT OUTER JOIN influencersSQL b ON a.screen_name = b.screen_name\
                                    GROUP BY DATE_FORMAT(post_created_at, 'Y-M-dd') \
                                    ORDER BY DATE_FORMAT(post_created_at, 'Y-M-dd')")

In [85]:
# show
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
number_of_influencers.show()

[Stage 387:>                                                        (0 + 8) / 9]

+----------+-----------+
|      date|influencers|
+----------+-----------+
|2021-10-26|          1|
|2021-10-27|         47|
|2021-10-28|        124|
|2021-10-29|        764|
|2021-10-30|        580|
|2021-10-31|        631|
|2021-11-01|       1216|
|2021-11-02|        887|
|2021-11-03|        180|
|2021-11-04|        147|
|2021-11-05|         20|
|2021-12-04|          3|
|2021-12-05|          6|
|2021-12-06|         26|
|2021-12-07|        264|
|2021-12-08|        584|
|2021-12-09|        698|
|2021-12-10|        759|
|2021-12-11|        775|
|2021-12-12|        528|
+----------+-----------+
only showing top 20 rows



                                                                                

In [86]:
# create SQL view
number_of_influencers.createOrReplaceTempView("number_of_influencersSQL")

## 4. Basetable

In [87]:
# create basetable
basetable = spark.sql("SELECT DATE_FORMAT(a.date, 'Y-M-dd') as date, a.dependent_vegan, b.tweet_volume, COALESCE(c.avg_likes,0) as avg_likes, \
                       COALESCE(d.avg_retweets,0) as avg_retweets, \
                       COALESCE(e.avg_engagement_rate,0) as avg_engagement_rate, COALESCE(f.influencers,0) as influencers \
                       FROM trendSQL a \
                       INNER JOIN tweet_volumeSQL b ON DATE_FORMAT(a.date, 'Y-M-dd') = b.date \
                       LEFT OUTER JOIN avg_likesSQL c ON b.date = c.date \
                       LEFT OUTER JOIN avg_retweetsSQL d ON c.date = d.date \
                       LEFT OUTER JOIN avg_engagement_rateSQL e ON d.date = e.date \
                       LEFT OUTER JOIN number_of_influencersSQL f ON e.date = f.date")

In [88]:
# show
basetable.show(50)

[Stage 427:(140 + 3) / 143][Stage 431:>  (0 + 5) / 9][Stage 433:>  (0 + 0) / 9]]

22/12/07 17:01:35 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:01:35 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 427:(142 + 1) / 143][Stage 431:=> (5 + 4) / 9][Stage 433:>  (0 + 3) / 9]

22/12/07 17:01:36 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:01:36 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 431:==>(6 + 3) / 9][Stage 433:>  (0 + 5) / 9][Stage 435:>  (0 + 0) / 9]

22/12/07 17:01:36 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:01:36 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:01:36 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 433:>  (0 + 8) / 9][Stage 435:>  (0 + 0) / 9][Stage 437:>  (0 + 0) / 9]

22/12/07 17:01:36 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 433:==>(8 + 1) / 9][Stage 435:>  (0 + 7) / 9][Stage 437:>  (0 + 0) / 9]

22/12/07 17:01:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:01:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:01:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:01:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:01:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 439:>  (0 + 8) / 9][Stage 442:>  (0 + 0) / 9][Stage 444:>  (0 + 0) / 9]

22/12/07 17:01:38 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:01:38 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:01:38 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:01:38 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:01:38 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:01:38 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:01:38 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


                                                                                

+----------+---------------+------------+------------------+------------------+--------------------+-----------+
|      date|dependent_vegan|tweet_volume|         avg_likes|      avg_retweets| avg_engagement_rate|influencers|
+----------+---------------+------------+------------------+------------------+--------------------+-----------+
|2021-11-03|              1|        3316|15.065796937039138| 8.869379014989294| 0.34234633383160695|        180|
| 2022-8-15|              0|        1456| 22.95566502463054|        8.76171875|0.061455990873796947|        209|
| 2022-3-03|              0|          26|2.6470588235294117|               1.0| 0.03432893276873259|          1|
|2021-10-25|              0|          50|           4.65625|               3.0|0.035312352622552404|          0|
| 2022-6-13|              0|         108| 7.879310344827586| 2.675675675675676| 0.03734894322976568|         13|
| 2022-8-14|              1|        1194|13.847765363128492| 5.109913793103448|  0.1940645976779

In [89]:
# import the required functions
from pyspark.ml.feature import Binarizer, StringIndexer, VectorIndexer, VectorAssembler, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.sql.types import DoubleType

In [90]:
# define string indexer to index price 
SI = StringIndexer(inputCol = 'dependent_vegan', outputCol = 'label')

# define vector assembler for numeric variables
numColumns = ['avg_likes','avg_retweets','avg_engagement_rate','influencers']
VAnum = VectorAssembler(inputCols=numColumns, outputCol="numFeatures")

In [91]:
# define pipeline stages
stages = [SI, VAnum]
# define pipeline and fit on data
preprocessingPipeline = Pipeline().setStages(stages).fit(basetable)
# apply pipeline on data
basetable = preprocessingPipeline.transform(basetable)

[Stage 528:(141 + 2) / 143][Stage 532:>  (0 + 6) / 9][Stage 534:>  (0 + 0) / 9] 

22/12/07 17:08:05 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:08:05 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:08:05 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:08:05 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 534:==>(8 + 1) / 9][Stage 536:>  (0 + 7) / 9][Stage 538:>  (0 + 0) / 9]]

22/12/07 17:08:07 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:08:07 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:08:07 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:08:07 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:08:07 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:08:07 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:08:07 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 540:>  (0 + 8) / 9][Stage 543:>  (0 + 0) / 9][Stage 545:>  (0 + 0) / 9]

22/12/07 17:08:08 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:08:08 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:08:08 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:08:08 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:08:08 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:08:08 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:08:08 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


                                                                                

In [92]:
# select features and labels
basetable = basetable.select(["numFeatures", "label"])

In [93]:
# check
basetable.show(5)

[Stage 635:(140 + 3) / 143][Stage 637:==>(8 + 1) / 9][Stage 639:>  (0 + 4) / 9] 

22/12/07 17:14:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 643:>  (0 + 8) / 9][Stage 645:>  (0 + 0) / 9][Stage 647:>  (0 + 0) / 9]]

22/12/07 17:14:35 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:14:35 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:14:35 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:14:35 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:14:35 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 647:>  (0 + 8) / 9][Stage 650:>  (0 + 0) / 9][Stage 652:>  (0 + 0) / 9]

22/12/07 17:14:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:14:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:14:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:14:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:14:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:14:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


                                                                                

+--------------------+-----+
|         numFeatures|label|
+--------------------+-----+
|[15.0657969370391...|  0.0|
|[22.9556650246305...|  1.0|
|[2.64705882352941...|  1.0|
|[4.65625,3.0,0.03...|  1.0|
|[7.87931034482758...|  1.0|
+--------------------+-----+
only showing top 5 rows



**Logistic Regression**
- Split the data in a train and test set (70/30).
- Build one pipeline that:
  - standardizes the numerical variables
  - applies a logistic regression to the data
  - check the performance using the AUC.

We cannot use the randomsplit function, because we have time series data, so we have to use another approach

First we look at the amount of observations that will be assigned to the training set 

In [None]:
nr_train = int(basetable.count()*0.7)
nr_train

98

convert the final basetable to a pandas dataset 

In [None]:
basetable_pd = basetable.toPandas()
basetable_pd.head()

Unnamed: 0,avg_emojis,avg_engagement_rate,avg_exclamation_marks,avg_hashtags,avg_num_mentions,avg_polarity,avg_subjectivity,avg_words,dependent,number_of_favorites,number_of_followers,number_of_retweets,number_tweets,polarity
0,0.574555,0.024244,0.268321,1.131913,1.070158,0.122752,0.418258,22.847176,0,25854,64744182,4143,5051,0.122752
1,0.672403,0.035796,0.284315,1.32825,1.268364,0.148792,0.424527,24.398022,0,159869,259167877,26899,20951,0.148792
2,0.729405,0.020413,0.296865,1.205981,1.280541,0.166109,0.446198,24.434049,0,97592,219563281,13709,14545,0.166109
3,0.640164,0.014389,0.24715,1.071324,1.07907,0.130799,0.413306,23.440514,0,27859,99421032,3310,6839,0.130799
4,0.507839,0.038302,0.262527,1.079926,0.893944,0.132347,0.403964,22.838918,0,51875,50409471,7360,6504,0.132347


Split the dataframe into train and test 

In [None]:
train_pd = basetable_pd.iloc[:nr_train,:]
test_pd = basetable_pd.iloc[nr_train:,:]
train = spark.createDataFrame(train_pd)
test = spark.createDataFrame(test_pd)

In [95]:
# check number of observations in train and test set
print(train.count())
print(test.count())

[Stage 736:(140 + 3) / 143][Stage 738:==>(8 + 1) / 9][Stage 740:>  (0 + 4) / 9]]

22/12/07 17:21:02 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:21:02 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 736:(142 + 1) / 143][Stage 740:>  (0 + 7) / 9][Stage 742:>  (0 + 0) / 9]

22/12/07 17:21:02 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 740:==>(8 + 1) / 9][Stage 742:>  (0 + 7) / 9][Stage 744:>  (0 + 0) / 9]]

22/12/07 17:21:02 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 742:=> (5 + 4) / 9][Stage 744:>  (0 + 4) / 9][Stage 746:>  (0 + 0) / 9]

22/12/07 17:21:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:21:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:21:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:21:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:21:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 748:>  (0 + 8) / 9][Stage 751:>  (0 + 0) / 9][Stage 753:>  (0 + 0) / 9]

22/12/07 17:21:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:21:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:21:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:21:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:21:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:21:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:21:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:21:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


                                                                                

169


[Stage 843:(140 + 3) / 143][Stage 845:==>(8 + 1) / 9][Stage 847:>  (0 + 4) / 9]]

22/12/07 17:27:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 847:==>(8 + 1) / 9][Stage 849:>  (0 + 7) / 9][Stage 851:>  (0 + 0) / 9]]

22/12/07 17:27:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 849:==>(8 + 1) / 9][Stage 851:>  (0 + 7) / 9][Stage 853:>  (0 + 0) / 9]

22/12/07 17:27:26 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:27:26 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:27:26 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 855:>  (0 + 8) / 9][Stage 858:>  (0 + 0) / 9][Stage 860:>  (0 + 0) / 9]

22/12/07 17:27:27 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:27:27 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:27:27 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:27:27 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:27:27 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:27:27 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


                                                                                

67


In [96]:
# inspect distribution of label in train and test set
basetable.groupBy("label").count().show()
train.groupBy("label").count().show()
test.groupBy("label").count().show()

[Stage 950:(140 + 3) / 143][Stage 952:==>(8 + 1) / 9][Stage 954:>  (0 + 4) / 9] 

22/12/07 17:33:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:33:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:33:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:33:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 950:(142 + 1) / 143][Stage 954:=> (5 + 4) / 9][Stage 956:>  (0 + 3) / 9]

22/12/07 17:33:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:33:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:33:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:33:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 954:==>(6 + 3) / 9][Stage 956:>  (2 + 6) / 9][Stage 958:>  (0 + 0) / 9]

22/12/07 17:33:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:33:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:33:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:33:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 956:==>(8 + 1) / 9][Stage 958:>  (0 + 7) / 9][Stage 960:>  (0 + 0) / 9]

22/12/07 17:33:51 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 958:=> (5 + 4) / 9][Stage 960:>  (0 + 4) / 9][Stage 962:>  (0 + 0) / 9]

22/12/07 17:33:51 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 960:==>(8 + 1) / 9][Stage 962:>  (0 + 7) / 9][Stage 965:>  (0 + 0) / 9]

22/12/07 17:33:52 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:33:52 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:33:52 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:33:52 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 962:>  (1 + 8) / 9][Stage 965:>  (0 + 0) / 9][Stage 967:>  (0 + 0) / 9]

22/12/07 17:33:52 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


                                                                                

+-----+-----+
|label|count|
+-----+-----+
|  0.0|  121|
|  1.0|  115|
+-----+-----+



[Stage 1057:(140 + 3) / 143][Stage 1061:> (0 + 5) / 9][Stage 1063:> (0 + 0) / 9]

22/12/07 17:40:18 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 1057:(142 + 1) / 143][Stage 1061:=>(5 + 4) / 9][Stage 1063:> (0 + 3) / 9]

22/12/07 17:40:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:40:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:40:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 1063:> (4 + 5) / 9][Stage 1065:> (0 + 3) / 9][Stage 1067:> (0 + 0) / 9]9]

22/12/07 17:40:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:40:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:40:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:40:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:40:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:40:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 1067:=>(5 + 4) / 9][Stage 1069:> (0 + 4) / 9][Stage 1072:> (0 + 0) / 9]

22/12/07 17:40:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:40:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 1069:> (4 + 5) / 9][Stage 1072:> (0 + 3) / 9][Stage 1074:> (0 + 0) / 9]

22/12/07 17:40:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


                                                                                

+-----+-----+
|label|count|
+-----+-----+
|  0.0|   86|
|  1.0|   83|
+-----+-----+



[Stage 1164:(140 + 3) / 143][Stage 1168:> (0 + 5) / 9][Stage 1170:> (0 + 0) / 9]

22/12/07 17:47:00 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 1170:> (2 + 7) / 9][Stage 1172:> (0 + 1) / 9][Stage 1174:> (0 + 0) / 9]9]

22/12/07 17:47:01 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 1172:> (0 + 8) / 9][Stage 1174:> (0 + 0) / 9][Stage 1176:> (0 + 0) / 9]

22/12/07 17:47:01 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:47:01 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 1176:> (0 + 8) / 9][Stage 1179:> (0 + 0) / 9][Stage 1182:> (0 + 0) / 9]

22/12/07 17:47:02 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:47:02 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:47:02 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:47:02 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:47:02 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


                                                                                

+-----+-----+
|label|count|
+-----+-----+
|  0.0|   35|
|  1.0|   32|
+-----+-----+



In [97]:
# import required features
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [98]:
# define scaler
SS = StandardScaler(inputCol = 'numFeatures', outputCol = 'scaledNumFeatures', withStd = True, withMean = False)

# define vector assembler
VA = VectorAssembler(inputCols = ['scaledNumFeatures'], outputCol = 'features')

# define logistic regression model
LR = LogisticRegression(labelCol = 'label', featuresCol = 'features', maxIter = 10)

In [99]:
# define pipeline stages
stages = [SS, VA, LR]
# create pipeline and fit on training set
lrModelPipeline = Pipeline().setStages(stages).fit(train)
# apply pipeline on test set to get predictions
predictions = lrModelPipeline.transform(test)

[Stage 1271:(142 + 1) / 143][Stage 1275:> (4 + 5) / 9][Stage 1277:> (0 + 2) / 9]

22/12/07 17:53:47 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:53:47 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:53:47 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:53:47 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 1279:> (0 + 8) / 9][Stage 1281:> (0 + 0) / 9][Stage 1283:> (0 + 0) / 9]9]

22/12/07 17:53:48 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:53:48 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 1283:> (0 + 8) / 9][Stage 1286:> (0 + 0) / 9][Stage 1288:> (0 + 0) / 9]

22/12/07 17:53:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:53:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:53:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:53:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:53:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 17:53:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 1378:(140 + 3) / 143][Stage 1382:> (0 + 5) / 9][Stage 1384:> (0 + 0) / 9]

22/12/07 18:01:01 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:01:01 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:01:01 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 1390:> (0 + 8) / 9][Stage 1393:> (0 + 0) / 9][Stage 1395:> (0 + 0) / 9]9]

22/12/07 18:01:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:01:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:01:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:01:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:01:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:01:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 1474:(142 + 1) / 143][Stage 1478:=>(7 + 2) / 9][Stage 1480:> (0 + 5) / 9]

22/12/07 18:08:02 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:08:02 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:08:02 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 1482:> (0 + 8) / 9][Stage 1484:> (0 + 0) / 9][Stage 1486:> (0 + 0) / 9]

22/12/07 18:08:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 1486:> (0 + 8) / 9][Stage 1489:> (0 + 0) / 9][Stage 1492:> (0 + 0) / 9]

22/12/07 18:08:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:08:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:08:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:08:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:08:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


                                                                                

22/12/07 18:08:09 WARN InstanceBuilder$JavaBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
22/12/07 18:08:10 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/12/07 18:08:10 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS


[Stage 1635:(138 + 5) / 143][Stage 1637:> (0 + 3) / 9][Stage 1639:> (0 + 0) / 9]

22/12/07 18:15:12 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 1635:(141 + 2) / 143][Stage 1639:> (0 + 6) / 9][Stage 1641:> (0 + 0) / 9]

22/12/07 18:15:13 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:15:13 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:15:13 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:15:13 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 1635:(142 + 1) / 143][Stage 1639:> (2 + 7) / 9][Stage 1641:> (0 + 0) / 9]

22/12/07 18:15:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:15:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:15:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 1639:=>(6 + 3) / 9][Stage 1641:> (0 + 5) / 9][Stage 1643:> (0 + 0) / 9]

22/12/07 18:15:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 1639:=>(8 + 1) / 9][Stage 1641:> (0 + 7) / 9][Stage 1643:> (0 + 0) / 9]

22/12/07 18:15:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 1643:=>(7 + 2) / 9][Stage 1645:> (0 + 6) / 9][Stage 1647:> (0 + 0) / 9]

22/12/07 18:15:15 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 1650:=>(7 + 2) / 9][Stage 1652:> (0 + 6) / 9][Stage 1655:> (0 + 0) / 9]

22/12/07 18:15:17 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:15:17 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:15:17 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:15:17 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:15:17 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 1731:(137 + 6) / 143][Stage 1733:> (0 + 2) / 9][Stage 1735:> (0 + 0) / 9]

22/12/07 18:22:09 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:22:09 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:22:09 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 1731:(139 + 4) / 143][Stage 1733:> (0 + 4) / 9][Stage 1735:> (0 + 0) / 9]

22/12/07 18:22:09 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:22:10 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:22:10 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 1731:(140 + 3) / 143][Stage 1733:> (4 + 5) / 9][Stage 1735:> (0 + 0) / 9]

22/12/07 18:22:10 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 1731:(141 + 2) / 143][Stage 1735:> (0 + 6) / 9][Stage 1737:> (0 + 0) / 9]

22/12/07 18:22:10 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:22:10 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 1737:> (4 + 5) / 9][Stage 1739:> (0 + 3) / 9][Stage 1741:> (0 + 0) / 9]9]

22/12/07 18:22:11 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:22:11 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 1739:=>(5 + 4) / 9][Stage 1741:> (0 + 4) / 9][Stage 1743:> (0 + 0) / 9]

22/12/07 18:22:12 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:22:12 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 1741:=>(6 + 3) / 9][Stage 1743:> (0 + 5) / 9][Stage 1746:> (0 + 0) / 9]

22/12/07 18:22:13 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:22:13 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:22:13 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 1743:> (4 + 5) / 9][Stage 1746:> (0 + 3) / 9][Stage 1748:> (0 + 0) / 9]

22/12/07 18:22:13 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:22:13 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 1746:=>(5 + 4) / 9][Stage 1748:> (0 + 4) / 9][Stage 1751:> (0 + 0) / 9]

22/12/07 18:22:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:22:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:22:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:22:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


                                                                                

In [100]:
# inspect predictions
predictions.show(5)

[Stage 1827:(139 + 4) / 143][Stage 1829:> (3 + 4) / 9][Stage 1831:> (0 + 0) / 9]

22/12/07 18:30:17 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 1827:(140 + 3) / 143][Stage 1829:=>(7 + 2) / 9][Stage 1831:> (0 + 3) / 9]

22/12/07 18:30:18 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:30:18 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:30:18 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 1827:(141 + 2) / 143][Stage 1831:> (3 + 6) / 9][Stage 1833:> (0 + 0) / 9]

22/12/07 18:30:18 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:30:18 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:30:18 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:30:18 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:30:18 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 1831:=>(7 + 2) / 9][Stage 1833:> (1 + 6) / 9][Stage 1835:> (0 + 0) / 9]9]

22/12/07 18:30:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:30:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:30:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 1835:=>(5 + 4) / 9][Stage 1837:> (0 + 4) / 9][Stage 1839:> (0 + 0) / 9]

22/12/07 18:30:20 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 1837:=>(8 + 1) / 9][Stage 1839:> (0 + 7) / 9][Stage 1842:> (0 + 0) / 9]

22/12/07 18:30:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:30:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:30:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 1844:> (0 + 8) / 9][Stage 1847:> (0 + 0) / 9][Stage 1850:> (0 + 0) / 9]

22/12/07 18:30:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:30:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:30:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:30:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:30:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:30:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


                                                                                

+--------------------+-----+--------------------+--------------------+--------------------+--------------------+----------+
|         numFeatures|label|   scaledNumFeatures|            features|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+--------------------+--------------------+----------+
|[3.45901639344262...|  1.0|[0.59587416514823...|[0.59587416514823...|[1.01086761805780...|[0.73318990915824...|       0.0|
|[4.54022988505747...|  0.0|[0.78213150347260...|[0.78213150347260...|[0.17509106410821...|[0.54366127975776...|       0.0|
|[5.125,4.16666666...|  1.0|[0.88286806104012...|[0.88286806104012...|[0.24477190471981...|[0.56088927294919...|       0.0|
|[5.49019607843137...|  1.0|[0.94577927151118...|[0.94577927151118...|[0.13019981263182...|[0.53250404876349...|       0.0|
|[5.71052631578947...|  1.0|[0.98373488701261...|[0.98373488701261...|[0.05310818635663...|[0.51327392683264...|       0.0|
+-------

In [101]:
# define evaluator
evaluator = BinaryClassificationEvaluator()
# get evaluation metric
lrAUC = evaluator.evaluate(predictions, {evaluator.metricName: 'areaUnderROC'})
# inspect model performance
print('AUC lr: %f' %(lrAUC))

[Stage 1928:(140 + 3) / 143][Stage 1932:> (0 + 5) / 9][Stage 1934:> (0 + 0) / 9]

22/12/07 18:37:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:37:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:37:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 1928:(142 + 1) / 143][Stage 1932:=>(5 + 4) / 9][Stage 1934:> (0 + 3) / 9]

22/12/07 18:37:20 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:37:20 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:37:20 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:37:20 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 1934:=>(5 + 4) / 9][Stage 1936:> (0 + 4) / 9][Stage 1938:> (0 + 0) / 9]9]

22/12/07 18:37:20 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:37:20 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 1936:> (0 + 8) / 9][Stage 1938:> (0 + 0) / 9][Stage 1940:> (0 + 0) / 9]

22/12/07 18:37:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:37:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 1940:> (0 + 8) / 9][Stage 1943:> (0 + 0) / 9][Stage 1945:> (0 + 0) / 9]

22/12/07 18:37:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:37:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:37:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 1943:=>(7 + 2) / 9][Stage 1945:> (0 + 6) / 9][Stage 1948:> (0 + 0) / 9]

22/12/07 18:37:23 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:37:23 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:37:23 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:37:23 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:37:23 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/12/07 18:37:23 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


                                                                                

AUC lr: 0.542857
