# Initialize pyspark environment

In [1]:
import findspark

# initialize findspark with spark directory

#ALWAYS HAVE TO BE CHANGED 
#path = "/Users/konstantinlazarov/Desktop/Big_Data/PySpark/Week_5/spark"
path = "/Users/Artur/spark"
findspark.init(path) 

# import pyspark
import pyspark
# create spark context
sc = pyspark.SparkContext()
# create spark session 
spark = pyspark.sql.SparkSession(sc)

# Import necessary packages and data

#### Import necessary packages

In [2]:
# import packages
import os 
import pickle

import re
from datetime import datetime
import requests

import pytz
import emojis

import pandas as pd
import numpy as np

import ast

import pyspark.sql.functions as F
from pyspark.sql.types import *

from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import StopWordsRemover

import tweepy
import csv
import time
import pandas as pd
import datetime
import os
import json
from pandas.tseries.holiday import nearest_workday, \
    AbstractHolidayCalendar, Holiday, \
    USMartinLutherKingJr, USPresidentsDay, GoodFriday, \
    USMemorialDay, USLaborDay, USThanksgivingDay
from datetime import date

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from string import punctuation
from nltk.corpus import stopwords
from textblob import TextBlob
from nltk.tokenize import word_tokenize

from nltk.tokenize.treebank import TreebankWordDetokenizer
import re

#### Import the twitter data 

In [3]:
list_brands = ["healthyfood",
               "healthylifestyle",
               "vegan",
               "keto",
               "ketodiet",
               "ketolifestyle",
               "veganism",
               "vegetarian"]
from re import search



data_dir = ".././../data/Topic/"
tweet_files = [os.path.join(data_dir, obs) for obs in os.listdir(data_dir)]



files_brand = [file for file in tweet_files if (file.find(list_brands[2]) != -1)]
files_brand               
               
df_json = spark.read.option("multiline","true").json(files_brand)  
df_json.count()


1827680

# Predict the engagement rate of a tweet

## 1. Goal of our analysis

In this notebook, we are going to predict the engagement rate of tweets. Further, it will be interesting to see the driving factors behind the engagement rate. This can be valuable information when creating an own social media brand or when you want to increase the reach of your tweets.

We start by creating a basetable for this model.

First, we start by defining our dependent variable. The engagement rate has already been discussed in the data exploration section. We will use the same definition in order to create a model to predict the engagement rate. Below, we repeat this definition:

Engagement on Twitter is measured by the number of retweets, follows, replies, favorites, and other people’s reactions to your tweets, including the clicks on the links and hashtags in those tweets. Your Twitter engagement rate is your engagement figure divided by the number of impressions on the tweet.

In order to predict the engagement of a tweet, we will use the following variables:


    1) number of words
    2) number of hashtags
    3) number of tags
    4) number of emojis
    5) the month
    6) day of the month
    7) day of the week
    8) hour of the day
    9) the language
    10) tweeted by an influencer or not
    11) tweeted quote
    13) presence of a symbol
    14) Indicator if a mention to another user was made
    15) The media type present
    16) The number of text characters in the tweet


The goal of our model is not only to predict the engagement rate, but look at the underlying drivers of the engagement rate. This way, we aim to optimize our engagement rate.

# 2. Basetable creation


We start by selecting all the variables that we will need in this analysis.

In [5]:
# Select the interesting variables
basetable_engr = df_json.select(F.col('created_at').alias('tweet_created'), \
                                   F.col('entities.symbols').alias('symbols'), \
                                   F.col('display_text_range').alias('text_range'), \
                                   F.col('extended_entities.media.type').alias('media_type'), \
                                   F.col('favorite_count'), \
                                   F.col('full_text'), \
                                   F.col('is_quote_status').alias('quoted'), \
                                   F.col('lang').alias('language'), \
                                   F.col('retweet_count'),\
                                   F.col('user.created_at').alias('user_created'), \
                                   F.col('user.followers_count').alias('user_followers'), \
                                   F.col('user.friends_count').alias('user_following'), \
                                   F.col('user.verified').alias('user_verified'), \
                                   F.col("user.screen_name"), \
                                   F.col('user.statuses_count').alias('nr_tweets_by_user'))

## 2.1 Data processing

### 2.1.1 Check Time Period 

In [6]:
# https://developer.twitter.com/en/docs/twitter-ads-api/timezones
# function to convert Twitter date string format
def getDate(date):
    if date is not None:
        return str(datetime.datetime.strptime(date,'%a %b %d %H:%M:%S +0000 %Y').replace(tzinfo=pytz.UTC).strftime("%Y-%m-%d %H:%M:%S"))
    else:
        return None

# UDF declaration
date_udf = F.udf(getDate, StringType())

# apply udf
basetable_engr = basetable_engr.withColumn('tweet_created', F.to_utc_timestamp(date_udf("tweet_created"), "UTC"))
basetable_engr = basetable_engr.withColumn('user_created', F.to_utc_timestamp(date_udf("user_created"), "UTC"))


In [7]:
#drop duplicates and retweets 
basetable_engr = basetable_engr.filter(~F.col("full_text").startswith("RT"))\
                        .drop_duplicates()

#sorting such when dropping later we only keep the most recent post 
basetable_engr = basetable_engr.sort("tweet_created", ascending=False)

#removing spam accounts 
basetable_engr = basetable_engr.drop_duplicates(["full_text", "screen_name"])

## 2.2 Create the dependent variable

AAN TE PASSEN

Repeat definition: 
Engagement on Twitter is measured by the number of retweets, follows, replies, favorites, and other people’s reactions to your tweets, including the clicks on the links and hashtags in those tweets. Your Twitter engagement rate is your engagement figure divided by the number of impressions on the tweet.

In [8]:
# add engagement rate to the dataframe
basetable_engr = basetable_engr.withColumn('eng_rate', ((basetable_engr['favorite_count'] + basetable_engr['retweet_count'])/basetable_engr['user_followers']))
basetable_engr.select(F.col('eng_rate')).show()


+--------------------+
|            eng_rate|
+--------------------+
|0.001179245283018...|
|0.003097040605643...|
|0.027937551355792935|
|                 0.0|
|                 0.0|
|                 0.0|
|                 0.0|
|5.963313694153567E-5|
|                 0.0|
|0.004761904761904762|
|                 0.0|
|2.448280083241523E-4|
|1.958863858961802E-4|
|0.043859649122807015|
| 0.03837118245888802|
| 0.04980842911877394|
|0.001669449081803005|
|                 0.0|
|0.003231017770597...|
|  0.0273224043715847|
+--------------------+
only showing top 20 rows



### 2.3 Text cleaning 

Now that we have created the dependent variable, we will clean the text of the tweets. Then, the cleaned tokenized text can now be used to create some new features. Besides, we also use some create some new features out of other variables. For some, we first define the necessary functions. Features created:

    1) number of words
    2) number of hashtags
    3) number of tags
    4) number of emojis
    5) get the number of exclamation marks
    6) the month
    7) day of the month
    8) day of the week
    9) hour of the day
    10) The number of upper case words
    11) tweeted quote
    12) presence of a symbol
    13) The age of the account
    14) The number of media elements
    15) The media type present
    16) The number of text characters in the tweet
    17) Indicator if the account is verified


First, define a function to clean the data.

In [9]:
# define puncutation and stopwords
PUNCTUATION = [char for char in punctuation if char not in ["!", "@", "#"]]
STOPWORDS = stopwords.words("english")
NUMBERS = '0123456789'

In [10]:
# define function to remove punctuation
def remove_punct(text):
    ## Remove punctuation
    text = "".join([char for char in text if char not in PUNCTUATION])
    return(text)



In [11]:
# define function to tokenize the text and remove the stopwords
def remove_stops(text):
    ## Tokenize
    word_tokens = word_tokenize(text)

    ## Remove stopwords
    text_tokenized = [word for word in word_tokens if word not in STOPWORDS]

    ## Return
    return(text_tokenized)

In [12]:
# define function to remove urls
def remove_urls(text):
    ## Remove links
    text = re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''', " ", text)
    
    return(text)

Emojis are at the very core of communication over social channels. One small image can completely describe one or more human emotions. A naive thing to do during pre-processing would be to remove all emojis. This could result in significant loss of meaning.
A good way to achieve this is to replace the emoji with corresponding text explaining the emoji.

In [97]:
# define function to replace all emojis in a text with their corresponding meaning in words
import demoji
def replace_emoji(text):
    emoji_text = demoji.findall(text)
    return emoji_text


Users sometimes combine multiple words into a single word, where the word disambiguation is done by using capital letters, for example GoodMorning, RainyDay, PlayingInTheCold, etc

In [75]:
# Use a spelling correction library to handle nonstandard spellings
#from textblob import TextBlob
#def correct_spelling(text):
#    text = TextBlob(text).correct()
#    return text

In [76]:
# define function to remove numbers
def remove_numbers(text):
    text = "".join([char for char in text if char not in NUMBERS])
    return text

Next, we create extra features for our model. For some, we first create a function.

    1) For the number of words, we can just use the function F.size()

In [77]:
# 2) Define function to count hashtags
def get_hashtags(text):
    counter = 0
    for letter in text:
        if letter == "#":
            counter += 1
    return(counter)

In [78]:
# 3) Define function to count tags
def get_tags(text):
    counter = 0
    for letter in text:
        if letter == "@":
            counter += 1
    return(counter)

In [79]:
# 4) Define a function to get the number of emojis
def emoji_counter(text):
    nr_emojis = emojis.count(text)
    return(nr_emojis)

In [80]:
# 5) Define function to count exclamation marks
def get_exclamation_marks(text):
    counter = 0
    for letter in text:
        if letter ==  "!":
            counter += 1
    return(counter)

    6) the month
    8) day of the week
    9) hour of the day

We saw how to create each of these variables when solving the questions for this assignment. The same code will be used here. This means we need to create one help function.

In [81]:
def dayToInt(dayOfWeek):
    if(dayOfWeek == "Mon"):
        return 1
    if(dayOfWeek == "Tue"):
        return 2
    if(dayOfWeek == "Wed"):
        return 3
    if(dayOfWeek == "Thu"):
        return 4
    if(dayOfWeek == "Fri"):
        return 5
    if(dayOfWeek == "Sat"):
        return 6
    if(dayOfWeek == "Sun"):
        return 7

    10) The variable language is already present.
    11) The number of upper case words

In [82]:
# 11) Define number of upper case words
def get_upper_case_words(text):
    counter = 0
    
    ## Tokenize
    word_tokens = word_tokenize(text)

    ## Check for uppercase words
    for word in word_tokens:
        if word.isupper():
            counter += 1
    return(counter)

    12) tweeted by an influencer or not
For this variable, we first have to determine which accounts we consider to be an influencer account. During the data exploration phase, we created a function in order to look at influencers based on 3 characteristics.

We define an influencer as an account with the following characteristics:

- a lot of followers => follower_count > 10000
- there is a high engagement rate on their tweets which shows their influence => er > 0.05
- tweet frequency is high enough => freq_weekly > 20

This means that we consider about 5% of our tweets comes from an influencer

    13) tweeted quote

    15) presence of a symbol
    16) The age of the account
    17) The media type present
    18) The number of text characters in the tweet



In [83]:
# 13) Define a function that indicates if the tweet was a quote
def tweeted_quote_indicator(quoted):
    quote = 0
    if quoted == True:
        quote = 1
    return quote

In [84]:
# 14) define a function that indicates the presence of a symbol
def symbol_indicator(symbols):
    symbol = 0
    if(symbols > 0):
        symbol = 1
    return symbol


    15) Define the age of the account. This is defined as the number of days since the account has been created and the last day of scraping (2022-10-11). The last day of scraping was calculated in the exploration phase of the data. For this variable, we will use the function datediff.

In [85]:
# 16) define a function to get help get the number of media types included in the tweet
def adjust_nr_media(number):
    if number == -1:
        number = 0
        
    return number
    

In [86]:
# 17) define a function to get the first media element
def get_media_type(media):
    if media == None:
        media = 'no_media'
    else:
        media = media[0]
       
    return media

In [87]:
# 18) define a function to get the first media element
def get_nr_text_characters(text_range):
    number = text_range[1] - text_range[0]  
    return number


In [88]:
# 19) Look if the user is a verified user
def verified_ind(verified):
    indicator = 0
    if verified == True:
        indicator = 1
    return indicator

In [89]:
# 19, 20 + 21) Besides, we also add some sensitivity parameters
def get_sentiment(sentence):

    # initialize sentiment analyzer
    sid_obj = SentimentIntensityAnalyzer()

    # get sentiment dict
    sentiment_dict = sid_obj.polarity_scores(sentence)
    
    # get positive sentiment score
    pos_sentiment = sentiment_dict["pos"]
    
    # return positive sentiment score
    return(pos_sentiment)

# define function to get polarity score of text 
def get_polarity(row):
    textBlob_review = TextBlob(row)
    return textBlob_review.sentiment[0]

# define function to get subjectivity score of text 
def get_subjectivity(row):
    textBlob_review = TextBlob(row)
    return textBlob_review.sentiment[1]

Register the functions as udf

In [98]:
# register the functions as an udf
remove_punct_UDF = F.udf(remove_punct, StringType())
remove_urls_UDF = F.udf(remove_urls, StringType())
emoji_counter_udf = F.udf(emoji_counter, IntegerType())
replace_emoji_UDF = F.udf(replace_emoji, StringType())
remove_numbers_UDF = F.udf(remove_numbers, StringType())
get_upper_case_words_UDF = F.udf(get_upper_case_words, IntegerType()) 
tokenize_and_remove_stops_UDF = F.udf(remove_stops, ArrayType(StringType()))
get_hashtags_udf = F.udf(get_hashtags, IntegerType())
get_tags_udf = F.udf(get_tags, IntegerType())
get_exclamation_marks_UDF = F.udf(get_exclamation_marks, IntegerType())
convert_dayToInt_UDF = F.udf(dayToInt, StringType())
tweeted_quote_indicator_UDF = F.udf(tweeted_quote_indicator, IntegerType())
symbol_indicator_udf = F.udf(symbol_indicator, IntegerType())
adjust_nr_media_udf = F.udf(adjust_nr_media, IntegerType())
get_media_type_udf = F.udf(get_media_type, StringType())
get_nr_text_characters_udf = F.udf(get_nr_text_characters, IntegerType())
verified_ind_udf = F.udf(verified_ind, IntegerType())
get_sentiment_udf = F.udf(get_sentiment, DoubleType())
get_polarity_udf = F.udf(get_polarity, DoubleType())
get_subjectivity_udf = F.udf(get_subjectivity, DoubleType())



In [99]:
# create the final basetable for our analysis
basetable_engr_final = basetable_engr.withColumn("nr_emojis", emoji_counter_udf(F.col("full_text")))\
                            .withColumn('emojis_text', replace_emoji_UDF('full_text'))\
                            .withColumn('upper_case_words', get_upper_case_words_UDF('emojis_text'))\
                            .withColumn('text_lower', F.lower('full_text'))\
                            .withColumn('clean_punct', remove_punct_UDF('text_lower'))\
                            .withColumn('clean_urls', remove_urls_UDF('clean_punct'))\
                            .withColumn('clean_numbers', remove_numbers_UDF('clean_urls'))\
                            .withColumn('text_tokinezed_no_stops', tokenize_and_remove_stops_UDF('clean_numbers'))\
                            .withColumn("num_words", F.size("text_tokinezed_no_stops")) \
                            .withColumn("num_hashtags", get_hashtags_udf("text_tokinezed_no_stops")) \
                            .withColumn("num_mentions", get_tags_udf("text_tokinezed_no_stops")) \
                            .withColumn('nr_exlcamations', get_exclamation_marks_UDF('text_tokinezed_no_stops'))\
                            .withColumn("week_day", F.date_format(F.col("tweet_created"), "E"))\
                            .withColumn("week_day", convert_dayToInt_UDF("week_day"))\
                            .withColumn("hour", F.date_format(F.col("tweet_created"), "H").cast('int'))\
                            .withColumn("month", F.date_format(F.col("tweet_created"), "M").cast('int'))\
                            .withColumn('quoted_ind', tweeted_quote_indicator_UDF('quoted'))\
                            .withColumn('symbol_ind', F.size('symbols'))\
                            .withColumn('symbols_ind', symbol_indicator_udf('symbol_ind'))\
                            .withColumn('user_age_days', F.datediff(F.lit("2022-10-11"), F.col("user_created")))\
                            .withColumn('verified', verified_ind_udf('user_verified'))\
                            .withColumn("nr_media_elements", F.size("media_type"))\
                            .withColumn("nr_media_elements", adjust_nr_media_udf("nr_media_elements"))\
                            .withColumn("media_type", get_media_type_udf('media_type'))\
                            .withColumn("nr_text_char", get_nr_text_characters_udf('text_range'))\
                            .drop('clean_punct')\
                            .drop('clean_urls')\
                            .drop('text_lower')\
                            .drop('clean_numbers')\
                            .drop('week_day')\
                            .drop('post_created_at')\
                            .drop('quoted')\
                            .drop('symbols')\
                            .drop('user_created')\
                            .drop('user_verified')\
                            .drop('display_text_range')\
                            .filter("num_words > 0")






In [100]:
basetable_engr_final.select("full_text", "text_tokinezed_no_stops", "num_words", "num_hashtags", "emojis_text", "nr_exlcamations", "eng_rate").toPandas().head(5)

Unnamed: 0,full_text,text_tokinezed_no_stops,num_words,num_hashtags,emojis_text,nr_exlcamations,eng_rate
0,"This afternoon, live-streaming a relaxed conve...","[afternoon, livestreaming, relaxed, conversati...",22,6,{},0,0.001179
1,Sweet Potato &amp; Courgette Fritters with Min...,"[sweet, potato, amp, courgette, fritters, mint...",13,2,{},0,0.003097
2,Creamy mushroom bucatini #vegan #veganforlife ...,"[creamy, mushroom, bucatini, #, vegan, #, vega...",10,3,{},0,0.027938
3,@Polypieter veggie is al super!\nvegan is quas...,"[@, polypieter, veggie, al, super, !, vegan, q...",44,1,{😥=sad but relieved face},1,0.0
4,Enriched with natural origin ingredients certi...,"[enriched, natural, origin, ingredients, certi...",20,4,{😉=winking face},0,0.0


- Create 15 new columns in the dataframe and drop the necessary variables:
        1. number of words by using the F.size() function
        2. number of hashtags by using udf
        3. number of tags by using udf

In [None]:
# Create the features
basetable_engr = basetable_engr.withColumn("num_words", F.size("text_tokenized_no_stops"))\
                                         .withColumn("num_hashtags", get_hashtags_udf("text_tokenized_no_stops")) \
                                         .withColumn("num_tags", get_tags_udf("text_tokenized_no_stops"))\
                                         .withColumn("num_tags", get_tags_udf("text_tokenized_no_stops"))\



