In [2]:
# import findspark
import findspark
# initialize findspark with spark directory
findspark.init("C:\Program Files\Spark\spark-3.3.1-bin-hadoop3")
# import pyspark
import pyspark
# create spark context
sc = pyspark.SparkContext()
# create spark session 
spark = pyspark.sql.SparkSession(sc)

In [3]:
# import packages
import os 
import pickle
import re
from datetime import datetime
import requests
import pytz
import emoji
import pandas as pd
import numpy as np
import ast
import pyspark.sql.functions as F
from pyspark.sql.types import *
import matplotlib.pyplot as plt 

### 1. General

We want to predict the popularity of the topic vegan. In order to do this we apply regression analysis with as dependent variable external Google trends data. Google trends data represents how much certain topics are searched for in Google. This analysis should give us inside in which independent variables obtained from Twitter are important, as well as a prediction of the popularity on google based on twitter data. Variable importance in this context can certainly have some business value.

### 2. Import Twitter Data & Google Tends Data

#### 2.1 Google Trends

In [14]:
# read trend data 
trend = spark.read.csv(".././../data/Google_trends/interest_in_time_between_12oct2021_12oct2022.csv", header=True, inferSchema=True)

In [15]:
# explore trend data
trend.toPandas()

  series = series.astype(t, copy=False)


Unnamed: 0,Week,vegan: (Wereldwijd)
0,2021-10-17,85
1,2021-10-24,85
2,2021-10-31,88
3,2021-11-07,86
4,2021-11-14,88
5,2021-11-21,98
6,2021-11-28,87
7,2021-12-05,87
8,2021-12-12,88
9,2021-12-19,99


We see the trend data is relatively measured towards its maximum.

#### 2.2 Twitter

We have Twitter data of 1 year between 2021-10-12 and 2022-10-12

In [3]:
# define data dir
#data_dir = "../../data/Topic_vegan/"

# get all twitter files
#tweet_files = [os.path.join(data_dir, obs) for obs in os.listdir(data_dir)] 

In [4]:
list_brands = ["healthyfood",
               "healthylifestyle",
               "vegan",
               "keto",
               "ketodiet",
               "ketolifestyle",
               "veganism",
               "vegetarian"]

data_dir = ".././../data/Topic_vegan/"
tweet_files = [os.path.join(data_dir, obs) for obs in os.listdir(data_dir)]

files_brand = [file for file in tweet_files if (file.find(list_brands[3]) != -1)]            
               
df_brand = spark.read.option("multiline","true").json(files_brand)  

In [None]:
# import twitter data 
#twitter_df = spark.read.json(tweet_files)

In [7]:
# select interesting features
twitter_df = twitter_df.select(F.col('user.name'),
                                F.col('user.screen_name'),
                                F.col('user.followers_count'),
                                F.col('user.following'),
                                F.col('user.statuses_count'),
                                F.col('user.listed_count'),
                                F.col('created_at'),
                                F.col('full_text'),
                                F.col('entities.hashtags'),
                                F.col('favorite_count'),
                                F.col('retweet_count'),
                                F.col('user.friends_count'))

In [8]:
twitter_df.printSchema()

root
 |-- name: string (nullable = true)
 |-- screen_name: string (nullable = true)
 |-- followers_count: long (nullable = true)
 |-- following: boolean (nullable = true)
 |-- statuses_count: long (nullable = true)
 |-- listed_count: long (nullable = true)
 |-- created_at: string (nullable = true)
 |-- full_text: string (nullable = true)
 |-- hashtags: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- indices: array (nullable = true)
 |    |    |    |-- element: long (containsNull = true)
 |    |    |-- text: string (nullable = true)
 |-- favorite_count: long (nullable = true)
 |-- retweet_count: long (nullable = true)
 |-- friends_count: long (nullable = true)



### 2. Preprocess Data

#### 2.1 Check time period

In [9]:
# function to convert Twitter date string format
def getDate(date):
    if date is not None:
        return str(datetime.strptime(date,'%a %b %d %H:%M:%S +0000 %Y').replace(tzinfo=pytz.UTC).strftime("%Y-%m-%d %H:%M:%S"))
    else:
        return None

# UDF declaration
date_udf = F.udf(getDate, StringType())

# apply udf
twitter_df = twitter_df.withColumn('post_created_at', F.to_utc_timestamp(date_udf("created_at"), "UTC"))

In [10]:
# show
twitter_df.select("created_at", "post_created_at").show(5, truncate=False)

+------------------------------+-------------------+
|created_at                    |post_created_at    |
+------------------------------+-------------------+
|Mon Apr 04 10:09:55 +0000 2022|2022-04-04 10:09:55|
|Mon Apr 04 10:09:54 +0000 2022|2022-04-04 10:09:54|
|Mon Apr 04 10:09:54 +0000 2022|2022-04-04 10:09:54|
|Mon Apr 04 10:09:52 +0000 2022|2022-04-04 10:09:52|
|Mon Apr 04 10:09:52 +0000 2022|2022-04-04 10:09:52|
+------------------------------+-------------------+
only showing top 5 rows



In [11]:
# get first post
first_post = F.min('post_created_at').alias('earliest')
# get latest post
latest_post = F.max('post_created_at').alias('latest')
# show tweet period in our dataset
twitter_df.select(first_post, latest_post).show()

+-------------------+-------------------+
|           earliest|             latest|
+-------------------+-------------------+
|2021-10-12 16:08:51|2022-10-11 23:17:33|
+-------------------+-------------------+



#### 2.2 Remove retweets and duplicates

In [12]:
# drop all retweets from dataset
no_retweets_df = twitter_df.filter(~F.col("full_text").startswith("RT"))

In [4]:
# first sort no_retweets_df based on date in chronological order (most recent ones on top)
no_retweets_sorted_df = no_retweets_df.sort("post_created_at", ascending=False)

NameError: name 'no_retweets_df' is not defined

In [14]:
# number of observations before dropping duplicates
no_retweets_sorted_df.count()

1497352

In [15]:
# drop duplicates based on tweet text and the profile it was posted from
final_no_duplicates_df = no_retweets_sorted_df.drop_duplicates(["full_text", "screen_name"])

In [16]:
# number of observations after dropping duplicates
final_no_duplicates_df.count()

1340938

In [17]:
# rename dataframe
final_twitter_df = final_no_duplicates_df
final_twitter_df.limit(3).toPandas()

  series = series.astype(t, copy=False)


Unnamed: 0,name,screen_name,followers_count,following,statuses_count,listed_count,created_at,full_text,hashtags,favorite_count,retweet_count,friends_count,post_created_at
0,Follow the Vegans Ⓥ,vegan_v_vegan,4285,False,25878,20,Sat May 14 00:55:33 +0000 2022,!\n#vegan #GoVegan #dairy https://t.co/G0Jmiwb84g,"[([2, 8], vegan), ([9, 17], GoVegan), ([18, 24...",6,1,4199,2022-05-14 00:55:33
1,🌱Veg-In-Out Market🌱,veginoutmarket,947,False,1719,2,Sat Jan 15 07:17:18 +0000 2022,! We will be open 10am-9pm today and tomorrow!...,"[([69, 79], veganuary), ([80, 94], veganuary20...",0,0,2057,2022-01-15 07:17:18
2,Mix 93.8 FM,Mix938FM,10745,False,16386,68,Wed Sep 07 09:11:40 +0000 2022,!! Daily Updates !!\n\nA taste of Vegan Food t...,"[([52, 67], TheMorningWave), ([104, 115], Chef...",8,3,2812,2022-09-07 09:11:40
