In [None]:
import datetime

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd 
from numpy.linalg import eig
import plotly.express as px
import plotly.graph_objects as go
# from sklearn.decomposition import PCA
# from sklearn.cluster import KMeans

In [None]:
import os

# Show which dataset folders are mounted under the Kaggle input directory.
os.listdir('/kaggle/input')


In [None]:
# Raw inputs: the two hashtag tweet dumps plus each candidate's own tweets.
hashtag_trump = pd.read_csv(
    "/kaggle/input/us-election-2020-tweets/hashtag_donaldtrump.csv",
    engine="python",
)
hashtag_biden = pd.read_csv(
    "/kaggle/input/us-election-2020-tweets/hashtag_joebiden.csv",
    engine="python",
)
biden_tweets = pd.read_csv(
    "/kaggle/input/individual-tweets/tweets_of_joe_biden.csv",
    engine="python",
)
trump_tweets = pd.read_json("/kaggle/input/trump-tweets/trump-tweets.json")



### Turning Trump's and Joe's tweets into correct dtypes

In [None]:
# First rows followed by the column dtypes, both as plain text.
print(hashtag_biden.head(), hashtag_biden.dtypes, sep="\n")

In [None]:
# Numeric columns of the hashtag datasets (kept under its original name).
columns = ['tweet_id', 'likes', 'retweet_count', 'user_id', 'user_followers_count', 'lat', 'long']
_HASHTAG_DATETIME_COLUMNS = ['created_at', 'collected_at', 'user_join_date']
# Rows missing any of these after coercion are dropped. Note this includes
# 'lat'/'long', so tweets without coordinates are removed as well.
_HASHTAG_REQUIRED_COLUMNS = ['created_at', 'tweet_id', 'likes', 'retweet_count', 'user_id',
                             'user_followers_count', 'lat', 'long', 'collected_at']


def _coerce_hashtag_dtypes(frame):
    """Return a copy of a hashtag dataset with proper dtypes and invalid rows removed.

    Datetime columns are parsed with ``pd.to_datetime`` and the columns listed in
    ``columns`` with ``pd.to_numeric``; unparseable values become NaT/NaN
    (``errors='coerce'``). Rows with a missing value in any required column are
    then dropped. The input frame is left untouched.
    """
    cleaned = frame.copy()
    for name in _HASHTAG_DATETIME_COLUMNS:
        cleaned[name] = pd.to_datetime(cleaned[name], errors='coerce')
    cleaned[columns] = cleaned[columns].apply(pd.to_numeric, errors='coerce')
    return cleaned.dropna(subset=_HASHTAG_REQUIRED_COLUMNS)


# Same cleaning pipeline for both datasets.
hashtag_trump = _coerce_hashtag_dtypes(hashtag_trump)
hashtag_biden = _coerce_hashtag_dtypes(hashtag_biden)

# Data Wrangling 

In [None]:
# Work on an independent (deep) copy so the raw frame stays untouched.
trump = trump_tweets.copy(deep=True)


In [None]:
# Tag every tweet as "Deleted" when its id appears in the deleted-tweets file,
# otherwise "Normal".
trump_deleted = pd.read_json("/kaggle/input/trump-tweets/trump_deleted.json")

# A set gives constant-time membership checks.
id_deleted = set(trump_deleted['id'])

# Vectorised lookup. The previous loop called list.insert(<tweet id>, ...),
# which only produced the right order because the ids exceed the list length.
trump['status'] = trump['id'].isin(id_deleted).map({True: "Deleted", False: "Normal"})
trump.head()


In [None]:
# 'favorites' and 'retweets' arrive as a mix of integers and strings, so both
# columns are cast to integers explicitly.
trump = trump.astype({'favorites': 'int', 'retweets': 'int'})
trump.info()

In [None]:

# The status column so far separates tweets found in the deleted-tweets file
# ("Deleted") from all others ("Normal").
# Retweets have 0 favourites (likes) and start with "RT".
# To distinguish tweets Trump deleted from tweets restricted by Twitter we
# ASSUME that a restricted tweet has both its like and retweet counts set to 0.
is_restricted = (
    (trump['status'] == "Normal")
    & (trump['favorites'] == 0)
    & (trump['retweets'] == 0)
)

# Vectorised relabelling (the earlier loop rebuilt both columns as lists on
# every iteration, which is quadratic in the number of tweets).
new_status = trump['status'].mask(is_restricted, "Restricted")
trump['status'] = new_status
trump[trump['status'] == "Deleted"]

            

The trump data now has a status column which says whether each tweet is "Normal", "Restricted" or "Deleted". We have also converted all the 'favorites' and 'retweets' values into integers. 

In [None]:
# Parse the timestamp once and expose its time-of-day and calendar-day parts.
tweet_timestamps = pd.to_datetime(trump['date'])
trump.insert(1, 'time', tweet_timestamps.dt.time)
trump.insert(1, 'day', tweet_timestamps.dt.date)

# Keep tweets from 7 Oct 2020 onwards. The explicit copy makes `trump` an
# independent frame, so later in-place edits (rename, new columns) do not act
# on a slice of the unfiltered data.
trump = trump[trump['day'] >= datetime.date(2020, 10, 7)].copy()
trump

In [None]:
# Align the Trump column names with the Biden dataset used below.
trump_column_names = {
    'date': 'tweet_date',
    'day': 'tweet_day',
    'time': 'tweet_time',
    'favorites': 'tweet_like_count',
    'id': 'tweet_id',
    'retweets': 'tweet_retweet_count',
    'text': 'tweet_content',
}
trump.rename(columns=trump_column_names, inplace=True)

# Display only: rows in reversed order with a fresh index (not stored).
trump.iloc[::-1].reset_index()

In [None]:
# Display the raw Biden tweet table as loaded from CSV.
biden_tweets

In [None]:
# DataFrame.info() writes its summary itself and returns None, so it must not
# be wrapped in print() (that would add a stray "None" line).
biden_tweets.info()

# Work on a copy so the raw frame stays untouched.
biden = biden_tweets.copy()

In [None]:
# Split the timestamp into a calendar-day column and a time-of-day column,
# both inserted at position 4 (so 'tweet_day' ends up before 'tweet_time').
biden_timestamps = pd.to_datetime(biden['tweet_date'])
biden.insert(4, 'tweet_time', biden_timestamps.dt.time)
biden.insert(4, 'tweet_day', biden_timestamps.dt.date)

# The tweet URL is not needed for the analysis.
biden = biden.drop('tweet_url', axis=1)
biden.info()

In [None]:
# Keep Joe Biden's tweets posted between 7 Oct 2020 and 7 Nov 2020 (inclusive).
window_start = datetime.date(2020, 10, 7)
window_end = datetime.date(2020, 11, 7)
in_window = (biden['tweet_day'] >= window_start) & (biden['tweet_day'] <= window_end)
biden = biden[in_window]
biden

In [None]:
# Sanity check: all tweets should come from a single account (Joe Biden).
pd.unique(biden["username"])

In [None]:
# Sort chronologically and renumber the rows from 0. The result of
# reset_index() has to be assigned, otherwise `biden` keeps its old index.
biden = biden.sort_values(by=['tweet_date'], ascending=True).reset_index(drop=True)
biden

#### Removing tweets with both hashtags

In [None]:
# Tweets present in both hashtag datasets (matched on tweet_id).
hashtag_both = pd.merge(hashtag_trump, hashtag_biden, on='tweet_id', how='inner')

In [None]:
# Drop the tweets that carry both hashtags from each dataset.
shared_tweet_ids = hashtag_both['tweet_id']
biden_is_shared = hashtag_biden['tweet_id'].isin(shared_tweet_ids)
trump_is_shared = hashtag_trump['tweet_id'].isin(shared_tweet_ids)
hashtag_justbiden = hashtag_biden[~biden_is_shared]
hashtag_justtrump = hashtag_trump[~trump_is_shared]


#### Grouping by state

In [None]:
US = ['United States of America']

# Identifier and user-profile columns that are dropped before grouping by state.
to_c_filter = ['tweet_id', 'user_id', 'user_name', 'user_location', 'user_screen_name', 'user_description', 'user_join_date', 'city', 'continent', 'state_code']

# Biden: US-only rows, trimmed columns, grouped by state.
B_US_df = hashtag_biden.loc[hashtag_biden['country'].isin(US)].drop(columns=to_c_filter)
B_groupped_by_states = B_US_df.groupby('state')

# Trump: same steps.
T_US_df = hashtag_trump.loc[hashtag_trump['country'].isin(US)].drop(columns=to_c_filter)
T_groupped_by_states = T_US_df.groupby('state')

# Open question: the following appear under 'state' but are not states, so
# they may have to be removed depending on the analysis.
#   District of Columbia (does have a place in the electoral college)
#   Puerto Rico
#   Northern Mariana Islands
#   Guam


In [None]:
# Plot the mean number of likes per day for #Biden tweets from the US.
B_US_df['created_at'] = pd.to_datetime(B_US_df['created_at'])
B_US_df['likes'] = B_US_df['likes'].astype(float)
B_US_df['user_followers_count'] = B_US_df['user_followers_count'].astype(float)

B_US_df_likes = B_US_df[['created_at', 'likes']]

# Aggregate only the 'likes' column. Averaging the whole frame would also
# average the datetime column on recent pandas, and the resulting 'created_at'
# column would then collide with the index name in reset_index().
df = (
    B_US_df_likes
    .groupby(B_US_df_likes['created_at'].dt.date)['likes']
    .mean()
    .reset_index()
)
plt.bar(df['created_at'], df['likes'])
plt.xlabel("Date")
plt.ylabel("Mean likes per tweet")
plt.title("Mean likes per #Biden tweet by day (US)")

# Make sure the Trump timestamps are datetimes too for the cells below.
T_US_df['created_at'] = pd.to_datetime(T_US_df['created_at'])

# Exploration 

In [None]:
# Bar chart of the total number of likes received by each candidate's tweets.
candidate = ["Biden", "Trump"]
biden_total_likes = biden['tweet_like_count'].sum()
trump_total_likes = trump['tweet_like_count'].sum()
likes = [biden_total_likes, trump_total_likes]
plt.bar(candidate, likes)

# Day of the first row in the Biden frame.
biden['tweet_day'].tolist()[0]


In [None]:
# Tweets with zero likes (restricted tweets and retweets) are removed from the
# Trump data: they account for roughly 50 of 1500 data points and carry no
# information for a retweet-to-like ratio.
biden_tweet_retweet_ratio = biden.copy()
intermediate = trump[trump['isRetweet'] == False]

# Copy, like the Biden frame above, so that columns added later are written to
# an independent frame rather than to a slice of `trump`.
trump_tweet_retweet_ratio = intermediate[intermediate["status"] != "Restricted"].copy()


In [None]:
# How does engagement with each candidate evolve over time? Measured here as
# the retweet-to-like ratio of each tweet, averaged per day.


def _retweet_like_ratio(frame):
    """Per-tweet retweet/like ratio; tweets with zero likes yield NaN, not an error."""
    ratio = frame['tweet_retweet_count'] / frame['tweet_like_count']
    return ratio.replace([np.inf, -np.inf], np.nan)


def _daily_mean_ratio(frame, days):
    """Mean ratio for each day in `days`; days without tweets give NaN."""
    per_day = frame.groupby('tweet_day')['retweet_tweet_like_ratio'].mean()
    return list(per_day.reindex(days))


# assign() returns new frames, so nothing is written to a slice of another frame.
biden_tweet_retweet_ratio = biden_tweet_retweet_ratio.assign(
    retweet_tweet_like_ratio=_retweet_like_ratio(biden_tweet_retweet_ratio))
trump_tweet_retweet_ratio = trump_tweet_retweet_ratio.assign(
    retweet_tweet_like_ratio=_retweet_like_ratio(trump_tweet_retweet_ratio))
ratio_biden = list(biden_tweet_retweet_ratio['retweet_tweet_like_ratio'])
ratio_trump = list(trump_tweet_retweet_ratio['retweet_tweet_like_ratio'])

# The date range is taken from the Trump data via min/max, so it does not
# depend on the row order of the frame.
start = trump_tweet_retweet_ratio['tweet_day'].min()
end = trump_tweet_retweet_ratio['tweet_day'].max()
increment = datetime.timedelta(days=1)
dates = [start + offset * increment for offset in range((end - start).days + 1)]

ratio_biden_average = _daily_mean_ratio(biden_tweet_retweet_ratio, dates)
ratio_trump_average = _daily_mean_ratio(trump_tweet_retweet_ratio, dates)

In [None]:
# Daily average retweet-to-like ratio of both candidates on one set of axes.
ratio_axes = plt.figure(2).gca()
ratio_axes.plot(ratio_biden_average, label='Biden ratio')
ratio_axes.plot(ratio_trump_average, label="Trump ratio")
ratio_axes.set_ylim(0, 0.3)
ratio_axes.set_xlabel("Days")
ratio_axes.set_ylabel("Ratio")
ratio_axes.set_title("Retweet to Like Ratio for Trump and Biden")
ratio_axes.legend()
plt.show()

In [None]:
# Explore the #Trump and #Biden datasets: daily activity in one state, to be
# compared against known events.
STATE = 'California'  # Change this state value to change the graphs


def _daily_totals(frame, days):
    """Return (tweets per day, likes per day) as lists aligned with `days`."""
    by_day = frame.groupby('tweet_day')
    tweet_counts = by_day.size().reindex(days, fill_value=0)
    like_sums = by_day['likes'].sum().reindex(days, fill_value=0)
    return list(tweet_counts), list(like_sums)


# The hashtag frames only carry 'created_at', so the calendar-day column used
# below is derived here.
T_US_df['tweet_day'] = pd.to_datetime(T_US_df['created_at']).dt.date
B_US_df['tweet_day'] = pd.to_datetime(B_US_df['created_at']).dt.date

T_state_df = T_US_df[T_US_df['state'] == STATE]
B_state_df = B_US_df[B_US_df['state'] == STATE]

# One entry per calendar day across the whole Trump US dataset.
start = T_US_df['tweet_day'].min()
end = T_US_df['tweet_day'].max()
increment = datetime.timedelta(days=1)
state_days = [start + offset * increment for offset in range((end - start).days + 1)]

Trump_number_tweets_list, Trump_number_likes_list = _daily_totals(T_state_df, state_days)
Biden_number_tweets_list, Biden_number_likes_list = _daily_totals(B_state_df, state_days)

In [None]:
# Final US debate was on the 22nd, hence the spike on the 23rd in most states.
# The x-axis days are derived from 'created_at' (always present on the hashtag
# frames) and sorted so they line up with the chronological per-day lists.
dates_trump = sorted(pd.to_datetime(T_US_df['created_at']).dt.date.unique())

# Number of tweets related to each candidate, day by day, for the chosen state.
plt.figure(3)
tweet_axes = plt.gca()
tweet_axes.xaxis.set_major_formatter(mdates.DateFormatter('%m/%d'))
tweet_axes.xaxis.set_major_locator(mdates.DayLocator())
plt.plot(dates_trump, Trump_number_tweets_list, label='Number of Trump Tweets')
plt.plot(dates_trump, Biden_number_tweets_list, label='Number of Biden Tweets')
plt.xticks(rotation=70)
plt.title("Number of Tweets Related to Each Candidate")
plt.xlabel("Days")  # from 15th October to 8th Nov
plt.ylabel("Number of Tweets")
plt.legend()
plt.show()
# Large spike at the end for Biden is probably related to him winning the election.

In [None]:

# X-axis days derived from 'created_at' (always present on the hashtag frames),
# sorted so they line up with the chronological per-day lists.
dates_trump = sorted(pd.to_datetime(T_US_df['created_at']).dt.date.unique())
plt.figure(4)

# Total number of likes related to each candidate, day by day, for the chosen state.
like_axes = plt.gca()
like_axes.xaxis.set_major_formatter(mdates.DateFormatter('%m/%d'))
like_axes.xaxis.set_major_locator(mdates.DayLocator())

plt.plot(dates_trump, Trump_number_likes_list, label='Number of Trump Likes')
plt.plot(dates_trump, Biden_number_likes_list, label='Number of Biden Likes')
plt.xticks(rotation=70)
plt.title("Number of Likes Related to Each Candidate")
plt.xlabel("Days")  # from 15th October to 8th Nov
plt.ylabel("Number of Likes")
plt.legend()
plt.show()
# This graph shows the sum of likes relating to Trump and Biden every day,
# which gives some insight into the popularity of each candidate.



In [None]:
# Number of #Trump tweets per hour, with a red line at the hour of each of the
# 50 most-liked Trump tweets, to see whether his influential tweets coincide
# with bursts of activity.


def _tweets_per_hour(date_tweet_frame):
    """Count tweets per (date, hour) and add a 'date_hour' text label column."""
    stamps = date_tweet_frame['created_at']
    hourly = date_tweet_frame.groupby([stamps.dt.date, stamps.dt.hour]).count()
    hourly = hourly.drop(columns={'created_at'})
    hourly.index.names = ['date', 'hour']
    hourly = hourly.reset_index()
    hourly["date_hour"] = [
        str(day) + ' ' + str(hour)
        for day, hour in zip(hourly['date'], hourly['hour'])
    ]
    return hourly


hashtag_biden_us = hashtag_biden.loc[hashtag_biden['country'].isin(US)]
hashtag_trump_us = hashtag_trump.loc[hashtag_trump['country'].isin(US)]

hashtag_trump_date_tweet = hashtag_trump_us[['created_at', 'tweet']]
hashtag_biden_date_tweet = hashtag_biden_us[['created_at', 'tweet']]

trump_hour_tweet = _tweets_per_hour(hashtag_trump_date_tweet)
biden_hour_tweet = _tweets_per_hour(hashtag_biden_date_tweet)
fig1 = px.line(trump_hour_tweet, x='date_hour', y='tweet')

# Trump's own (non-retweet) tweets between 15 Oct and 8 Nov 2020.
trump_tweet_days = trump['tweet_date'].dt.date
trump_no_retweets = trump[
    (trump['isRetweet'] == False)
    & (trump_tweet_days >= datetime.date(2020, 10, 15))
    & (trump_tweet_days <= datetime.date(2020, 11, 8))
]
first_likes = trump_no_retweets.sort_values(by=['tweet_like_count'], ascending=False).head(50)
first_likes['date_hour'] = [
    str(day) + ' ' + str(stamp.hour)
    for day, stamp in zip(first_likes['tweet_day'], first_likes['tweet_date'])
]
for marker in first_likes['date_hour']:
    fig1.add_vline(marker, line_width=1, line_color="red")

fig1

In [None]:
# Number of #Trump tweets per hour, with a red line at the hour of each
# restricted or deleted Trump tweet.
fig2 = px.line(trump_hour_tweet, x='date_hour', y='tweet')
trump_no_retweets = trump[(trump['isRetweet'] == False) & (trump['tweet_date'].dt.date>= datetime.date(2020,10,15)) & (trump['tweet_date'].dt.date<=datetime.date(2020,11,8))]

# NOTE: the name `first_likes` is kept from the previous cell, but here it
# holds the restricted/deleted tweets. The copy makes the column assignment
# below act on an independent frame instead of a filtered slice.
first_likes = trump_no_retweets[trump_no_retweets['status'].isin(['Restricted', 'Deleted'])].copy()
first_likes['date_hour'] = first_likes.apply((lambda x: str(x['tweet_day'])+ ' ' + (str(x['tweet_date'].hour))), axis =1)
for i in first_likes['date_hour']:
    fig2.add_vline(i, line_width=1, line_color="red")

fig2

In [None]:
# Number of #Biden tweets per hour.
fig3 = px.line(data_frame=biden_hour_tweet, x='date_hour', y=['tweet'])
fig3

# Daily activity
- Looking at the daily tweet count, likes and retweets for both candidates

In [None]:
# One row per day: number of Biden tweets and the likes/retweets they received.
# Built with groupby so the column names do not depend on how value_counts()
# names its result (that naming changed between pandas versions).
biden_by_day = biden.groupby("tweet_day")
biden_per_day = pd.DataFrame({
    "Tweet_Count": biden_by_day.size(),
    "Number_of_Likes": biden_by_day["tweet_like_count"].sum(),
    "Number_of_Retweets": biden_by_day["tweet_retweet_count"].sum(),
}).sort_index()
biden_per_day

In [None]:
# One row per day: number of Trump tweets and the likes/retweets they received.
# Built with groupby so the column names do not depend on how value_counts()
# names its result (that naming changed between pandas versions).
trump_by_day = trump.groupby("tweet_day")
trump_per_day = pd.DataFrame({
    "Tweet_Count": trump_by_day.size(),
    "Number_of_Likes": trump_by_day["tweet_like_count"].sum(),
    "Number_of_Retweets": trump_by_day["tweet_retweet_count"].sum(),
}).sort_index()
trump_per_day

In [None]:
# Average tweets per day for each candidate and the Biden/Trump ratio.
ave_biden_tweets = biden_per_day.loc[:, "Tweet_Count"].mean()
ave_trump_tweets = trump_per_day.loc[:, "Tweet_Count"].mean()
ratio_biden_trump_tweets = ave_biden_tweets / ave_trump_tweets
ratio_biden_trump_tweets

### Map of the total number of tweets based on the state of the user

In [None]:
# US-only subsets of both hashtag datasets.
biden_from_us = hashtag_biden['country'].isin(US)
trump_from_us = hashtag_trump['country'].isin(US)
hashtag_biden_us = hashtag_biden.loc[biden_from_us]
hashtag_trump_us = hashtag_trump.loc[trump_from_us]

# Number of tweets per state code.
hashtag_trump_loc_tweet = (
    hashtag_trump_us[['tweet', 'state_code']]
    .groupby('state_code')
    .count()
    .reset_index()
)
hashtag_biden_loc_tweet = (
    hashtag_biden_us[['tweet', 'state_code']]
    .groupby('state_code')
    .count()
    .reset_index()
)


In [None]:
# Choropleth of the total number of #Trump tweets per US state.
trump_total_map_options = dict(
    locations='state_code',
    locationmode="USA-states",
    scope="usa",
    color='tweet',
    color_continuous_scale="YlOrBr",
    range_color=[0, 30000],
)
tweet_trump_map = px.choropleth(hashtag_trump_loc_tweet, **trump_total_map_options)
tweet_trump_map.show()

In [None]:
# Choropleth of the total number of #Biden tweets per US state.
biden_total_map_options = dict(
    locations='state_code',
    locationmode="USA-states",
    scope="usa",
    color='tweet',
    color_continuous_scale="YlOrBr",
    range_color=[0, 30000],
)
tweet_biden_map = px.choropleth(hashtag_biden_loc_tweet, **biden_total_map_options)
tweet_biden_map.show()

### Number of Tweets each day

In [None]:
# Number of US tweets per calendar day for each hashtag.
hashtag_trump_date_tweet = hashtag_trump_us[['created_at', 'tweet']]
hashtag_biden_date_tweet = hashtag_biden_us[['created_at', 'tweet']]

trump_day_key = hashtag_trump_date_tweet['created_at'].dt.date
biden_day_key = hashtag_biden_date_tweet['created_at'].dt.date

trump_date_tweet = (
    hashtag_trump_date_tweet
    .groupby(trump_day_key)
    .count()
    .drop(columns={'created_at'})
    .reset_index()
)
biden_date_tweet = (
    hashtag_biden_date_tweet
    .groupby(biden_day_key)
    .count()
    .drop(columns={'created_at'})
    .reset_index()
)


In [None]:
# Bar chart: #Trump tweets per day.
trump_date_tweet_bar = px.bar(data_frame=trump_date_tweet, x='created_at', y='tweet')
trump_date_tweet_bar.show()

In [None]:

# Bar chart: #Biden tweets per day.
biden_date_tweet_bar = px.bar(data_frame=biden_date_tweet, x='created_at', y='tweet')
biden_date_tweet_bar.show()

In [None]:
# Number of US tweets per (calendar day, state code) for each hashtag.
# Note: this rebinds hashtag_*_loc_tweet and *_date_tweet from earlier cells.
hashtag_trump_loc_tweet = hashtag_trump_us[['created_at', 'tweet', 'state_code']]
hashtag_biden_loc_tweet = hashtag_biden_us[['created_at', 'tweet', 'state_code']]

trump_day_state_keys = [hashtag_trump_loc_tweet['created_at'].dt.date, 'state_code']
biden_day_state_keys = [hashtag_biden_loc_tweet['created_at'].dt.date, 'state_code']

trump_date_tweet = (
    hashtag_trump_loc_tweet
    .groupby(trump_day_state_keys)
    .count()
    .drop(columns={'created_at'})
    .reset_index()
)
biden_date_tweet = (
    hashtag_biden_loc_tweet
    .groupby(biden_day_state_keys)
    .count()
    .drop(columns={'created_at'})
    .reset_index()
)


In [None]:
# Animated choropleth: #Trump tweets per state, one frame per day.
trump_daily_map_options = dict(
    locations='state_code',
    locationmode="USA-states",
    scope="usa",
    color='tweet',
    color_continuous_scale="YlOrBr",
    range_color=[0, 3768],
    animation_frame="created_at",
)
trump_date_tweet_map = px.choropleth(trump_date_tweet, **trump_daily_map_options)
trump_date_tweet_map.show()

In [None]:
# Animated choropleth: #Biden tweets per state, one frame per day.
biden_daily_map_options = dict(
    locations='state_code',
    locationmode="USA-states",
    scope="usa",
    color='tweet',
    color_continuous_scale="YlOrBr",
    range_color=[0, 3768],
    animation_frame="created_at",
)
biden_date_tweet_map = px.choropleth(biden_date_tweet, **biden_daily_map_options)
biden_date_tweet_map.show()

### Number of likes (normalised by tweet) compared to date collected

In [None]:
def _likes_per_tweet_by_collection_day(us_frame):
    """Return (likes-per-tweet frame, tweet-count frame) grouped by collection day.

    Only the 'likes' and 'tweet' columns are aggregated. Summing the whole
    frame would also try to sum the datetime 'collected_at' column, which
    recent pandas versions reject. Both results come from the same grouping
    key, so their rows line up for the division.
    """
    collection_day = us_frame['collected_at'].dt.date
    likes_by_day = us_frame.groupby(collection_day)['likes'].sum().reset_index()
    tweets_by_day = us_frame.groupby(collection_day)['tweet'].count().reset_index()
    likes_by_day['likes'] = likes_by_day['likes'] / tweets_by_day['tweet']
    return likes_by_day, tweets_by_day


hashtag_trump_collected_like = hashtag_trump_us[['collected_at', 'likes']]
hashtag_biden_collected_like = hashtag_biden_us[['collected_at', 'likes']]

hashtag_trump_collected_tweet = hashtag_trump_us[['collected_at', 'tweet']]
hashtag_biden_collected_tweet = hashtag_biden_us[['collected_at', 'tweet']]

# 'likes' in the *_collected_like frames is the average likes per tweet for
# each collection day.
trump_collected_like, trump_collected_tweet = _likes_per_tweet_by_collection_day(hashtag_trump_us)
biden_collected_like, biden_collected_tweet = _likes_per_tweet_by_collection_day(hashtag_biden_us)

In [None]:
# Bar chart: likes per #Trump tweet by collection day.
trump_collected_like_bar = px.bar(data_frame=trump_collected_like, x='collected_at', y='likes')
trump_collected_like_bar.show()

In [None]:
# Bar chart: likes per #Biden tweet by collection day.
biden_collected_like_bar = px.bar(data_frame=biden_collected_like, x='collected_at', y='likes')
biden_collected_like_bar.show()

### Follower Count against Retweets

In [None]:
# Per user: follower count and total retweets received across their tweets.
hashtag_trump_follower_retweet = hashtag_trump_us[['user_id', 'user_followers_count', 'retweet_count']]
hashtag_biden_follower_retweet = hashtag_biden_us[['user_id', 'user_followers_count', 'retweet_count']]

# Retweets are additive across a user's tweets, but the follower count is a
# property of the user repeated on every row. Summing it would multiply it by
# the number of tweets, so the maximum observed value is used instead.
per_user_aggregation = {'user_followers_count': 'max', 'retweet_count': 'sum'}
trump_follower_retweet = hashtag_trump_follower_retweet.groupby(["user_id"]).agg(per_user_aggregation)
biden_follower_retweet = hashtag_biden_follower_retweet.groupby(["user_id"]).agg(per_user_aggregation)

print(trump_follower_retweet.head())

In [None]:
# Followers vs retweets per user: #Trump users first, #Biden users overlaid.
follower_retweet_scatter = px.scatter(
    data_frame=trump_follower_retweet,
    x="user_followers_count",
    y="retweet_count",
)
biden_followers = biden_follower_retweet["user_followers_count"]
biden_retweets = biden_follower_retweet["retweet_count"]
follower_retweet_scatter.add_scatter(x=biden_followers, y=biden_retweets, mode='markers')
follower_retweet_scatter.show()


### Tweets by different platforms

In [None]:
# Number of US tweets per posting platform ('source') for each hashtag.
platform_columns = ['source', 'tweet']
hashtag_trump_tweet_platform = hashtag_trump_us[platform_columns]
hashtag_biden_tweet_platform = hashtag_biden_us[platform_columns]

trump_source_tweet = hashtag_trump_tweet_platform.groupby(["source"]).count()
trump_source_tweet = trump_source_tweet.reset_index()
biden_source_tweet = hashtag_biden_tweet_platform.groupby(["source"]).count()
biden_source_tweet = biden_source_tweet.reset_index()

In [None]:
# Bar chart: #Trump tweets per platform.
trump_source_bar = px.bar(trump_source_tweet, x='source', y='tweet')
trump_source_bar.show()

In [None]:
# Bar chart: #Biden tweets per platform.
biden_source_bar = px.bar(biden_source_tweet, x='source', y='tweet')
biden_source_bar.show()

### Number of times trump was mentioned in hashtag biden vs number of times biden was mentioned in hashtag trump

In [None]:
# Tweets in the Trump-only hashtag set that mention Biden, and vice versa.
# na=False treats a missing tweet text as "no mention", so the mask stays boolean.
trump_set_mentions_biden = hashtag_justtrump['tweet'].str.contains("biden|Biden", na=False)
biden_set_mentions_trump = hashtag_justbiden['tweet'].str.contains("trump|Trump", na=False)
hashtag_trump_containing_biden = hashtag_justtrump.loc[trump_set_mentions_biden, ['tweet']]
hashtag_biden_containing_trump = hashtag_justbiden.loc[biden_set_mentions_trump, ['tweet']]

# Plot the NUMBER of matching tweets (the bar heights), not the tweet texts.
px.bar(
    x=['Trump Containing Biden', 'Biden Containing Trump'],
    y=[len(hashtag_trump_containing_biden), len(hashtag_biden_containing_trump)],
    labels={'x': 'Dataset', 'y': 'Number of tweets'},
).show()

### Languages of tweets

In [None]:
#Use langdetect to determine language of tweets - doing on sample initially as takes long time to run
!pip install langdetect # enable internet on sidebar for this to work
from langdetect import detect

def get_lang(tweet):
    """Return the language code detected for `tweet`, or 'n/a' if detection fails.

    Only `Exception` is caught. A bare `except:` would also swallow
    KeyboardInterrupt, making the long-running language pass over the dataset
    impossible to stop from the keyboard.
    """
    try:
        language = detect(tweet)
    except Exception:
        language = 'n/a'
    return language

Running on sample of 5000 tweets to test graph functionality. Plan is to run on whole dataset once and save to csv - will take 30min+ est so desirable to do only once.

In [None]:
# Combine both US datasets without duplicates. The hashtag_*_us frames are
# used because 'state_code' was dropped from B_US_df / T_US_df earlier on, and
# tweets carrying both hashtags are de-duplicated on their tweet_id.
language_columns = ['tweet_id', 'tweet', 'state_code']
total_US_df = (
    pd.concat([hashtag_biden_us[language_columns], hashtag_trump_us[language_columns]])
    .drop_duplicates(subset='tweet_id')
    [['tweet', 'state_code']]
)

# Language detection is slow on the whole dataset, so work on a random sample.
# A fixed seed selects the same rows on every run.
total_US_dfsample = total_US_df.sample(n=5000, random_state=42).copy()
total_US_dfsample['language'] = total_US_dfsample['tweet'].apply(get_lang)

The code below runs the language detection on the whole dataset and saves the result to CSV - uncomment it when there is time for the long run.

In [None]:
#Uncomment to run on whole dataset - this will take ages! once in output folder just uncomment read in line instead
#out= '../output' # folder to save intermediary data to
#Path(out).mkdir(parents=True, exist_ok=True)
#total_US_dfsample['language'] = total_US_df.tweet.apply(lambda i:get_lang(i))
#total_US_dfsample.to_csv(out + '/total_US_languages.csv', index=False)

#Uncomment to read-in existing csv of language analysis
#total_US_dfsample = pd.read_csv(out + '/total_US_languages.csv', lineterminator='\r')


In [None]:
# Build an explicit two-column table so the plot does not depend on how
# value_counts() names its result (that naming differs between pandas versions).
language_counts = (
    total_US_dfsample['language']
    .value_counts()
    .rename_axis('language')
    .reset_index(name='count')
)
fig = px.bar(language_counts, x='language', y='count')
fig.update_layout(title='Frequency of languages across tweets',xaxis_title='Tweet language', yaxis_title='Count')
fig.show()

In [None]:
# Number of sampled tweets per (state, language) pair.
state_language_sizes = total_US_dfsample.groupby(['state_code', 'language']).size()
counts = state_language_sizes.rename('count').reset_index()

# Share of each language within its state, in percent.
totals = counts.groupby('state_code')['count'].transform('sum')
counts['percentage'] = counts['count'] / totals * 100

# 3D scatter: language x state x percentage, coloured by language.
fig = px.scatter_3d(
    counts,
    x='language',
    y='state_code',
    z='percentage',
    color='language',
)
fig.update_layout(title='Proportion of language separated by state')
fig.show()

We can see above that the majority of tweets from Puerto Rico are in Spanish. Puerto Rico is a US territory, but its citizens do not have Presidential Election voting rights. It might be worth excluding US territories that cannot vote, or perhaps filtering to English only will suffice.

In [None]:
# Share of sampled tweets written in English, per state.
english_counts = counts[counts['language'] == 'en']

# Choropleth of that share. The `labels` key must match the plotted column
# ('percentage'), otherwise the colour-bar label is never applied.
fig = px.choropleth(english_counts, locations='state_code', 
                    locationmode="USA-states", 
                    scope="usa", color='percentage',
                    color_continuous_scale="YlOrBr", 
                    range_color = [60, 100], labels={'percentage': '% of Tweets in English'})

# add map title
fig.update_layout(title_text='Percentage of Tweets in English by State')

# show the plot
fig.show()