In [1]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .master("local[*]") \
    .appName("FinalCoursework") \
    .getOrCreate()

sc = spark.sparkContext

In [2]:
%matplotlib inline 
import matplotlib.pyplot as plt
import pyspark.sql.functions as F
from pyspark.sql import Row

# Helper function to test the correctness of the solutions
def test(var, val, msg=""):
    print("1 test passed.") if var == val else print("1 test failed. " + msg)

In [3]:
# <FILL-IN WITH YOUR CODE>
df_trump = spark.read.option("multiline", True).csv("data/hashtag_donaldtrump.csv", header=True, inferSchema=True)
df_biden = spark.read.option("multiline", True).csv("data/hashtag_joebiden.csv", header=True, inferSchema=True)
# Cache your DataFrame
df_trump.cache()
df_biden.cache()
# <FILL-IN WITH YOUR CODE>

DataFrame[created_at: string, tweet_id: string, tweet: string, likes: string, retweet_count: string, source: string, user_id: string, user_name: string, user_screen_name: string, user_description: string, user_join_date: string, user_followers_count: string, user_location: string, lat: string, long: string, city: string, country: string, continent: string, state: string, state_code: string, collected_at: string]

In [4]:
df_trump.count()

1041276

In [5]:
df_biden.count()

823970

In [6]:
df_trump.show(5)

+-------------------+--------------------+--------------------+-----+-------------+------------------+--------------------+--------------------+----------------+--------------------+-------------------+--------------------+--------------------+----------+------------+----------+--------------------+-------------+--------------------+----------+--------------------+
|         created_at|            tweet_id|               tweet|likes|retweet_count|            source|             user_id|           user_name|user_screen_name|    user_description|     user_join_date|user_followers_count|       user_location|       lat|        long|      city|             country|    continent|               state|state_code|        collected_at|
+-------------------+--------------------+--------------------+-----+-------------+------------------+--------------------+--------------------+----------------+--------------------+-------------------+--------------------+--------------------+----------+---------

In [7]:
df_biden.show(5)

+-------------------+--------------------+--------------------+-----+-------------+------------------+--------------------+------------------+----------------+--------------------+-------------------+--------------------+--------------------+----------+-------------------+----+--------------------+-------------+----------+----------+--------------------+
|         created_at|            tweet_id|               tweet|likes|retweet_count|            source|             user_id|         user_name|user_screen_name|    user_description|     user_join_date|user_followers_count|       user_location|       lat|               long|city|             country|    continent|     state|state_code|        collected_at|
+-------------------+--------------------+--------------------+-----+-------------+------------------+--------------------+------------------+----------------+--------------------+-------------------+--------------------+--------------------+----------+-------------------+----+--------

In [8]:
#Dropping columns
remove_columns = ['created_at', 'tweet_id', 'user_id', 'user_screen_name', 'user_join_date', 'user_location', 'lat', 'long', 'state_code', 
                  'collected_at']
df_trump = df_trump.drop(*remove_columns)
df_biden = df_biden.drop(*remove_columns)

#Dropping rows with na values
df_trump = df_trump.na.drop()
df_biden = df_biden.na.drop()

df_trump.show(5)

+--------------------+-----+-------------+-------------------+--------------------+--------------------+--------------------+-----------------+--------------------+-------------+--------------------+
|               tweet|likes|retweet_count|             source|           user_name|    user_description|user_followers_count|             city|             country|    continent|               state|
+--------------------+-----+-------------+-------------------+--------------------+--------------------+--------------------+-----------------+--------------------+-------------+--------------------+
|#Trump: As a stud...|  2.0|          1.0|    Twitter Web App|              snarke|Will mock for foo...|              1185.0|         Portland|United States of ...|North America|              Oregon|
|You get a tie! An...|  4.0|          3.0| Twitter for iPhone|Rana Abtar - ÿ±ŸÜÿß ...|Washington Corres...|              5393.0|       Washington|United States of ...|North America|District of Columbi

In [9]:
df_biden.show(5)

+--------------------+-----+-------------+-------------------+--------------------+--------------------+--------------------+-----------+--------------------+-------------+----------+
|               tweet|likes|retweet_count|             source|           user_name|    user_description|user_followers_count|       city|             country|    continent|     state|
+--------------------+-----+-------------+-------------------+--------------------+--------------------+--------------------+-----------+--------------------+-------------+----------+
|In 2020, #NYPost ...|  0.0|          0.0| Twitter for iPhone|Change Illinois |...|Illinois, home of...|              1397.0|    Chicago|United States of ...|North America|  Illinois|
|"Comments on this...|  0.0|          0.0|    Twitter Web App|         John Ubaldi|Just Facts... No ...|                83.0|      Tampa|United States of ...|North America|   Florida|
|@RealJamesWoods #...|  0.0|          0.0|Twitter for Android|            Sam KE

In [10]:
df_trump.count()

204952

In [11]:
df_biden.count()

167881

In [12]:
from pyspark.sql.functions import lit

#Adding a presidents column where 0 represents trump and 1 represents biden. This will turn it into a classification problem
df_trump = df_trump.withColumn('President', lit(0))
df_biden = df_biden.withColumn('President', lit(1))

#Merging the 2 datasets
df = df_trump.union(df_biden)

In [13]:
df.count()

372833

In [14]:
from pyspark.sql.functions import col, lower
from pyspark.ml.feature import StopWordsRemover, Tokenizer

#Converting all of the words in tweets into lowercase
df = df.withColumn("tweet", lower(col("tweet")))

#Tokenizing the tweets
tokenizer = Tokenizer(inputCol = "tweet", outputCol = "tokenized_tweets")
df = tokenizer.transform(df)

#Column consisting of different stop words which can appear in a tweet
stop_words = [
    "rt", "amp", "via", "...'", "‚Äô", "‚Ä¶", "'s", "n't", "'re", "'m", "'ve", "'ll",
    "lol", "omg", "wtf", "rofl", "brb", "btw", "idk", "imho", "fyi",
    "http", "https", "www", "com", "org", "net", "html", "co", "ly",
    "jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec",
    "mon", "tue", "wed", "thu", "fri", "sat", "sun",
    "january", "february", "march", "april", "june", "july", "august", "september", "october", 
    "november", "december",
    "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday",
    "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"
]

#Removing all the stop words
remover = StopWordsRemover(inputCol = "tokenized_tweets", outputCol = "preprocessed_tweets", 
                           stopWords = stop_words)
df = remover.transform(df)

In [15]:
df.select("tweet", "tokenized_tweets", "preprocessed_tweets").show(5)

+--------------------+--------------------+--------------------+
|               tweet|    tokenized_tweets| preprocessed_tweets|
+--------------------+--------------------+--------------------+
|#trump: as a stud...|[#trump:, as, a, ...|[#trump:, as, a, ...|
|you get a tie! an...|[you, get, a, tie...|[you, get, a, tie...|
|in 2020, #nypost ...|[in, 2020,, #nypo...|[in, 2020,, #nypo...|
|#trump #president...|[#trump, #preside...|[#trump, #preside...|
|@susan_hutch @joe...|[@susan_hutch, @j...|[@susan_hutch, @j...|
+--------------------+--------------------+--------------------+
only showing top 5 rows



In [16]:
remove_columns = ["tweet", "tokenized_tweets"]
df = df.drop(*remove_columns)

In [17]:
df.show(5)

+-----+-------------+-------------------+--------------------+--------------------+--------------------+-----------------+--------------------+-------------+--------------------+---------+--------------------+
|likes|retweet_count|             source|           user_name|    user_description|user_followers_count|             city|             country|    continent|               state|President| preprocessed_tweets|
+-----+-------------+-------------------+--------------------+--------------------+--------------------+-----------------+--------------------+-------------+--------------------+---------+--------------------+
|  2.0|          1.0|    Twitter Web App|              snarke|Will mock for foo...|              1185.0|         Portland|United States of ...|North America|              Oregon|        0|[#trump:, as, a, ...|
|  4.0|          3.0| Twitter for iPhone|Rana Abtar - ÿ±ŸÜÿß ...|Washington Corres...|              5393.0|       Washington|United States of ...|North America|