In [19]:
# import findspark
import findspark
# initialize findspark with spark directory
findspark.init("C:\BigData\BigData\spark-3.1.2-bin-hadoop3.2")
# import pyspark
import pyspark
# create spark context
sc = pyspark.SparkContext()
# create spark session 
spark = pyspark.sql.SparkSession(sc)

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=pyspark-shell, master=local[*]) created by __init__ at C:\Users\vikto\AppData\Local\Temp\ipykernel_20996\2064577265.py:8 

In [20]:
# check
spark

In [67]:
import emojis 

In [97]:
!pip3 install sparknlp

Collecting sparknlp
  Downloading sparknlp-1.0.0-py3-none-any.whl (1.4 kB)
Collecting spark-nlp
  Downloading spark_nlp-4.2.4-py2.py3-none-any.whl (448 kB)
Installing collected packages: spark-nlp, sparknlp
Successfully installed spark-nlp-4.2.4 sparknlp-1.0.0


In [98]:
import sparknlp

In [94]:
# import packages
import os 
import pickle

import re
from datetime import datetime

import requests

import pytz
import emojis

import pandas as pd
import numpy as np

import ast

import pyspark.sql.functions as F
from pyspark.sql.types import *

from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import StopWordsRemover

# 1. Import Data & Exploration

In [112]:
list_brands = ["healthyfood",
               "healthylifestyle",
               "_vegan_",
               "keto",
               "ketodiet",
               "ketolifestyle",
               "veganism",
               "vegetarian"]
from re import search



data_dir = ".././../data/Topic_vegan/"
tweet_files = [os.path.join(data_dir, obs) for obs in os.listdir(data_dir)]



files_brand = [file for file in tweet_files if (file.find(list_brands[2]) != -1)]
files_brand               
               
df_json = spark.read.option("multiline","true").json(files_brand)  
df_json.count()

1595676

In [113]:
# check the schema of our json dataframe
df_json.printSchema()

root
 |-- contributors: string (nullable = true)
 |-- coordinates: struct (nullable = true)
 |    |-- coordinates: array (nullable = true)
 |    |    |-- element: double (containsNull = true)
 |    |-- type: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- display_text_range: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- entities: struct (nullable = true)
 |    |-- hashtags: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- indices: array (nullable = true)
 |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |-- text: string (nullable = true)
 |    |-- media: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- display_url: string (nullable = true)
 |    |    |    |-- expanded_url: string (nullable = true)
 |    |    |    |-- id: long (nullable = true)
 |    |    |    |-- id_str: string (nullable = true)
 |    |    |    |

In [114]:
# select interesting features
df = df_json.select(F.col("user.name"),
                                F.col("user.screen_name"),
                                F.col("created_at"), 
                                F.col("full_text"),
                                F.col("entities.hashtags"),
                                F.col("lang"),
                                F.col("favorite_count"),
                                F.col("retweet_count"),
                                F.col("user.followers_count"),
                                F.col("user.friends_count"),
                                F.col("user.favourites_count"),
                                F.col("entities.urls"),
                                F.col("entities.symbols"))

# 2. Preprocess Data

## 2.1 Check Time Period 

In [115]:
# https://developer.twitter.com/en/docs/twitter-ads-api/timezones
# function to convert Twitter date string format
def getDate(date):
    if date is not None:
        return str(datetime.strptime(date,'%a %b %d %H:%M:%S +0000 %Y').replace(tzinfo=pytz.UTC).strftime("%Y-%m-%d %H:%M:%S"))
    else:
        return None

# UDF declaration
date_udf = F.udf(getDate, StringType())

# apply udf
df = df.withColumn('post_created_at', F.to_utc_timestamp(date_udf("created_at"), "UTC"))

In [45]:
# show
df.select("created_at", "post_created_at").show(5, truncate=False)

+------------------------------+-------------------+
|created_at                    |post_created_at    |
+------------------------------+-------------------+
|Tue Sep 13 22:32:32 +0000 2022|2022-09-13 22:32:32|
|Tue Sep 13 22:32:26 +0000 2022|2022-09-13 22:32:26|
|Tue Sep 13 22:32:26 +0000 2022|2022-09-13 22:32:26|
|Tue Sep 13 22:32:16 +0000 2022|2022-09-13 22:32:16|
|Tue Sep 13 22:32:09 +0000 2022|2022-09-13 22:32:09|
+------------------------------+-------------------+
only showing top 5 rows



In [116]:
#drop duplicates and retweets 
df = df.filter(~F.col("full_text").startswith("RT"))\
                        .drop_duplicates()
#sorting such when dropping later we only keep the most recent post 
df = df.sort("post_created_at", ascending=False)
#removing spam accounts 
df = df.drop_duplicates(["full_text", "screen_name"])

#df.printSchema()
#df.count() #1340938

# Feature Engineering

In [117]:
# define function to count hashtags
def get_hashtags(tokenized_text):
    counter = 0
    for word in tokenized_text:
        if "#" in word:
            counter += 1
    return(counter)

In [118]:
# define function to count mentions
def get_mentions(tokenized_text):
    counter = 0
    for word in tokenized_text:
        if "@" in word:
            counter += 1
    return(counter)

In [119]:
# define function to count exclamation marks
def get_exclamation_marks(tokenized_text):
    counter = 0
    for word in tokenized_text:
        if "!" in word:
            counter += 1
    return(counter)

In [120]:
# define function to count number of emojis used
def emoji_counter(text):
    nr_emojis = emojis.count(text)
    return(nr_emojis)

In [121]:
# define function to calculate engagement rate
def engagement_rate(favorite_count, retweet_count, followers_count):
    if(followers_count == 0):
        eng_rate = 0
    else:
        eng_rate = (favorite_count + retweet_count)/followers_count
    
    return eng_rate

In [122]:
# register functions as udf
get_hashtags_UDF = F.udf(get_hashtags, IntegerType())
get_mentions_UDF = F.udf(get_mentions, IntegerType())
get_exclamation_marks_UDF = F.udf(get_exclamation_marks, IntegerType())
emoji_counter_UDF = F.udf(emoji_counter, IntegerType())
engagement_rate_UDF = F.udf(engagement_rate, DoubleType())

In [123]:
# apply functions to create new features
df = df.withColumn("emoji_count", emoji_counter_UDF("full_text")) \
        .withColumn("text_tokenized", F.split("full_text", " ")) \
        .withColumn("num_words", F.size("text_tokenized")) \
        .withColumn("num_hashtags", get_hashtags_UDF("text_tokenized")) \
        .withColumn("num_mentions", get_mentions_UDF("text_tokenized")) \
        .withColumn("num_exclamation_marks", get_exclamation_marks_UDF("text_tokenized")) \
        .withColumn("engagement_rate", engagement_rate_UDF("favorite_count", "retweet_count", "followers_count"))

# show
df.select("full_text", "emoji_count", "num_words", "num_hashtags", "num_mentions", "num_exclamation_marks", "engagement_rate").toPandas().tail(5)

Unnamed: 0,full_text,emoji_count,num_words,num_hashtags,num_mentions,num_exclamation_marks,engagement_rate
592246,Check out Maddie Payton's video! #TikTok https://t.co/g46yyGy77U #growth #growthmindset #believe #retweet #socialmedia #marketing #contentcreator #influencer #wcw #vegan #tbt #disney #Disneytiktok #affiliatemarketing #trending #trends #fyp #marketing #lifestyle #youtube #hashtag,0,28,22,0,1,0.0
592247,????????? https://t.co/rIWy3nd1tP,0,2,0,0,0,0.0
592248,@YourHostEdge I have sensory issues related to texture and taste that would make it exceedingly difficult to completely switch my diet.\n\nI respect other people’s decisions to go vegan but I don’t see why I should have to go thru it when it could literally cause me pain.,0,47,0,1,0,0.0
592249,My Road To Being A Vegan | Alan Cox | TEDxLambethSalon\nhttps://t.co/VPfV8rnQjx,0,11,0,0,0,0.0
592250,AD|PR - Looking for new beauty products? Read my favourite vegan beauty products\n👇Read More👇\n\nhttps://t.co/YcECxU1IcU #thegirlgang #lbloggers #blogginggals #GRLPWR @sotonbloggers #bloggerstribe #influencerrt #theclqrt #bloggershutrt #teacupclub #bloggerssparkle #cosybloggerclub https://t.co/dRfI2eVvdc,2,27,11,1,0,9.1e-05


# 3. Text Cleaning 

In [124]:
# filter for english tweets (NOTE: for the assignment you can translate non-english tweets using an API)
df = df.filter(F.col("lang") == "en")

In [85]:
# check number of observations
df.count()

439169

In [125]:
# define function to clean text
def clean_text(string):
    
    # define numbers
    NUMBERS = '0123456789'
    PUNCT = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    
    # convert text to lower case
    cleaned_string = string.lower()
    
    # remove URLS
    cleaned_string = re.sub(r'http\S+', ' ', cleaned_string)
    
    # replace emojis by words
    cleaned_string = emoji.demojize(cleaned_string)
    cleaned_string = cleaned_string.replace(":"," ").replace("_"," ")
    cleaned_string = ' '.join(cleaned_string.split())
    
    # remove numbers
    cleaned_string = "".join([char for char in cleaned_string if char not in NUMBERS])
    
    # remove punctuation
    cleaned_string = "".join([char for char in cleaned_string if char not in PUNCT])
    
    # remove words conisting of one character (or less)
    cleaned_string = ' '.join([w for w in cleaned_string.split() if len(w) > 1])
    
    # return
    return(cleaned_string)

In [126]:
# convert to udf
clean_text_udf = F.udf(clean_text, StringType())

In [127]:
# clean string
df = df.withColumn("cleaned_text", clean_text_udf(F.col("full_text")))

In [128]:
# check
pd.set_option('display.max_colwidth', None)
df.select("full_text", "cleaned_text").limit(5).toPandas()

Unnamed: 0,full_text,cleaned_text
0,"""A lot of people say they can't eat vegan, but I'm just like them – and if I can do it they can do it."" Nichole Lewis who is a 26-year-old mom lost 120 pounds and improved her #mentalhealth by eating a plant-based diet. Here are her weight loss secrets: https://t.co/4RAT8rwqTO",lot of people say they cant eat vegan but im just like them and if can do it they can do it nichole lewis who is yearold mom lost pounds and improved her mentalhealth by eating plantbased diet here are her weight loss secrets
1,"""By now, your [vegan/vegetarian] lifestyle may be so easy to maintain that you don't even have to think about it...All of this can change when you study abroad."" Check out Hannah's tips on staying vegan while abroad: https://t.co/fIy45ntvTD",by now your veganvegetarian lifestyle may be so easy to maintain that you dont even have to think about itall of this can change when you study abroad check out hannahs tips on staying vegan while abroad
2,"""Don't stay in the sun""\n""Use sunscreen""\n""Red meat is bad for you""\n""Vegan diet is healthy""\n""Seed oil is healthy""\n""Inflation is a good thing""\n""#Bitcoin is bad for the enviroment""\n\nWhat other things does the media portray as positive while being extremely damaging?",dont stay in the sun use sunscreen red meat is bad for you vegan diet is healthy seed oil is healthy inflation is good thing bitcoin is bad for the enviroment what other things does the media portray as positive while being extremely damaging
3,"""Heroes sacrifice for causes; they do things that others hide from. I may not be some great hero, but I won't hide from this"" - Brandon Mull.\n#Antispeciesism \n#Animalliberation \n#Vegan\n#Love https://t.co/1hK7FXsgRo",heroes sacrifice for causes they do things that others hide from may not be some great hero but wont hide from this brandon mull antispeciesism animalliberation vegan love
4,"""Last year over a million people left the same suicide note... SHOPPING LIST: Butter, Eggs, Milk, Cheese, Beef, Chicken, and Bacon."" - Physicians Committee for Responsible Medicine\n#vegan",last year over million people left the same suicide note shopping list butter eggs milk cheese beef chicken and bacon physicians committee for responsible medicine vegan


Tokenizing the text 

In [129]:
#tokenize the cleaned_text variable 
tokenizer = Tokenizer(inputCol="cleaned_text", outputCol="tokens")
df = tokenizer.transform(df)

In [130]:
df.select('cleaned_text', 'tokens').show()

+--------------------+--------------------+
|        cleaned_text|              tokens|
+--------------------+--------------------+
|lot of people say...|[lot, of, people,...|
|by now your vegan...|[by, now, your, v...|
|dont stay in the ...|[dont, stay, in, ...|
|heroes sacrifice ...|[heroes, sacrific...|
|last year over mi...|[last, year, over...|
|monday mint motiv...|[monday, mint, mo...|
|no nut november m...|[no, nut, novembe...|
|sir why did you s...|[sir, why, did, y...|
|there are many go...|[there, are, many...|
|vegan diets provi...|[vegan, diets, pr...|
|we were given veg...|[we, were, given,...|
|why are vegans so...|[why, are, vegans...|
|would you ever go...|[would, you, ever...|
|you doing anythin...|[you, doing, anyt...|
|defeating russia ...|[defeating, russi...|
|sustainabledecisi...|[sustainabledecis...|
|arfeatures cai st...|[arfeatures, cai,...|
|bgan punks are no...|[bgan, punks, are...|
|cbdoil for anxiet...|[cbdoil, for, anx...|
|didyouknow as lon...|[didyoukno

In [131]:
#remove stop words 
remover = StopWordsRemover(inputCol="tokens", outputCol="clean_tokens")
df = remover.transform(df)
df.select('tokens', 'clean_tokens').show()

+--------------------+--------------------+
|              tokens|        clean_tokens|
+--------------------+--------------------+
|[lot, of, people,...|[lot, people, say...|
|[by, now, your, v...|[veganvegetarian,...|
|[dont, stay, in, ...|[dont, stay, sun,...|
|[heroes, sacrific...|[heroes, sacrific...|
|[last, year, over...|[last, year, mill...|
|[monday, mint, mo...|[monday, mint, mo...|
|[no, nut, novembe...|[nut, november, m...|
|[sir, why, did, y...|[sir, start, laug...|
|[there, are, many...|[many, good, reas...|
|[vegan, diets, pr...|[vegan, diets, pr...|
|[we, were, given,...|[given, vegan, se...|
|[why, are, vegans...|[vegans, annoying...|
|[would, you, ever...|[ever, go, vegeta...|
|[you, doing, anyt...|[anything, specia...|
|[defeating, russi...|[defeating, russi...|
|[sustainabledecis...|[sustainabledecis...|
|[arfeatures, cai,...|[arfeatures, cai,...|
|[bgan, punks, are...|[bgan, punks, vegan]|
|[cbdoil, for, anx...|[cbdoil, anxiety,...|
|[didyouknow, as, ...|[didyoukno

In [132]:
# Stem text
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer(language='english')
stemmer_udf = F.udf(lambda tokens: [stemmer.stem(token) for token in tokens], ArrayType(StringType()))
df = df.withColumn("tokens_stemmed", stemmer_udf("clean_tokens"))

In [133]:
df.select('clean_tokens', 'tokens_stemmed').show()

+--------------------+--------------------+
|        clean_tokens|      tokens_stemmed|
+--------------------+--------------------+
|[lot, people, say...|[lot, peopl, say,...|
|[veganvegetarian,...|[veganvegetarian,...|
|[dont, stay, sun,...|[dont, stay, sun,...|
|[heroes, sacrific...|[hero, sacrific, ...|
|[last, year, mill...|[last, year, mill...|
|[monday, mint, mo...|[monday, mint, mo...|
|[nut, november, m...|[nut, novemb, mea...|
|[sir, start, laug...|[sir, start, laug...|
|[many, good, reas...|[mani, good, reas...|
|[vegan, diets, pr...|[vegan, diet, pro...|
|[given, vegan, se...|[given, vegan, se...|
|[vegans, annoying...|[vegan, annoy, an...|
|[ever, go, vegeta...|[ever, go, vegeta...|
|[anything, specia...|[anyth, special, ...|
|[defeating, russi...|[defeat, russia, ...|
|[sustainabledecis...|[sustainabledecis...|
|[arfeatures, cai,...|[arfeatur, cai, s...|
|[bgan, punks, vegan]| [bgan, punk, vegan]|
|[cbdoil, anxiety,...|[cbdoil, anxieti,...|
|[didyouknow, long...|[didyoukno

# 4. Sentiment 

## 4.1 The Vader Package

In [138]:
!pip3 install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [139]:
#using the vaderSentiment package 
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from pyspark.sql.functions import udf
from pyspark.sql.types import *

In [141]:
# define the function to extract the sentiment
def get_sentiment(sentence):

    # initialize sentiment analyzer
    sid_obj = SentimentIntensityAnalyzer()

    # get sentiment dict
    sentiment_dict = sid_obj.polarity_scores(sentence)
    
    # get positive sentiment score
    pos_sentiment = sentiment_dict["pos"]
    
    # return positive sentiment score
    return(pos_sentiment)

get_sentiment_udf = udf(get_sentiment, DoubleType())

In [152]:
df = df.withColumn("sentiment_vader", get_sentiment_udf(F.col("cleaned_text")))

In [153]:
df.select("cleaned_text", "sentiment_vader").show()

+--------------------+---------------+
|        cleaned_text|sentiment_vader|
+--------------------+---------------+
|lot of people say...|          0.139|
|by now your vegan...|          0.094|
|dont stay in the ...|            0.2|
|heroes sacrifice ...|          0.321|
|last year over mi...|          0.075|
|monday mint motiv...|          0.098|
|no nut november m...|          0.238|
|sir why did you s...|          0.263|
|there are many go...|          0.114|
|vegan diets provi...|          0.314|
|we were given veg...|           0.16|
|why are vegans so...|          0.164|
|would you ever go...|            0.0|
|you doing anythin...|          0.172|
|defeating russia ...|            0.0|
|sustainabledecisi...|          0.061|
|arfeatures cai st...|            0.0|
|bgan punks are no...|            0.0|
|cbdoil for anxiet...|          0.372|
|didyouknow as lon...|            0.0|
+--------------------+---------------+
only showing top 20 rows



## 4.2 TextBlob Package 

In [154]:
#use polarity and subjectivity from TextBlob 
#https://textblob.readthedocs.io/en/dev/
from textblob import TextBlob

In [155]:
# define function to get polarity score of text document
def get_polarity(row):
    textBlob_review = TextBlob(row)
    return textBlob_review.sentiment[0]
# define function to get subjectivity score of text document
def get_subjectivity(row):
    textBlob_review = TextBlob(row)
    return textBlob_review.sentiment[1]
get_polarity_udf = F.udf(get_polarity, DoubleType())
get_subjectivity_udf = F.udf(get_subjectivity, DoubleType())

In [156]:
df = df.withColumn('polarity', get_polarity_udf(F.col('cleaned_text')))\
        .withColumn('subjectivity', get_subjectivity_udf(F.col('cleaned_text')))

In [157]:
df.select("cleaned_text", "sentiment_vader", "polarity", "subjectivity").show()

+--------------------+---------------+--------------------+-------------------+
|        cleaned_text|sentiment_vader|            polarity|       subjectivity|
+--------------------+---------------+--------------------+-------------------+
|lot of people say...|          0.139|                 0.0|                0.0|
|by now your vegan...|          0.094| 0.43333333333333335| 0.8333333333333334|
|dont stay in the ...|            0.2|0.030808080808080826| 0.5393097643097643|
|heroes sacrifice ...|          0.321|                0.65|              0.675|
|last year over mi...|          0.075|-0.07999999999999999| 0.3383333333333333|
|monday mint motiv...|          0.098|                 0.0|                0.0|
|no nut november m...|          0.238|            0.221875|0.47187500000000004|
|sir why did you s...|          0.263|  0.6166666666666667| 0.4666666666666666|
|there are many go...|          0.114|  0.2785714285714286|               0.55|
|vegan diets provi...|          0.314|  

# 5. Basetable 

In [159]:
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- screen_name: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- full_text: string (nullable = true)
 |-- hashtags: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- indices: array (nullable = true)
 |    |    |    |-- element: long (containsNull = true)
 |    |    |-- text: string (nullable = true)
 |-- lang: string (nullable = true)
 |-- favorite_count: long (nullable = true)
 |-- retweet_count: long (nullable = true)
 |-- followers_count: long (nullable = true)
 |-- friends_count: long (nullable = true)
 |-- favourites_count: long (nullable = true)
 |-- urls: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- display_url: string (nullable = true)
 |    |    |-- expanded_url: string (nullable = true)
 |    |    |-- indices: array (nullable = true)
 |    |    |    |-- element: long (containsNull = true)
 |    |    |-- url: string (nullable = true)


## 5.1 Adjust dates to opening hours stock exchange