<a href="https://colab.research.google.com/github/ab17254/dissertation/blob/main/load_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Upload config.py with API keys
from google.colab import files
uploaded = files.upload()

Saving config.py to config.py


In [2]:
import os
from config import KaggleKey
os.environ['KAGGLE_USERNAME'] = KaggleKey.kaggle_username
os.environ['KAGGLE_KEY'] = KaggleKey.kaggle_key
!kaggle datasets download -d albrace/twitter-data

Downloading twitter-data.zip to /content
 99% 291M/293M [00:03<00:00, 93.9MB/s]
100% 293M/293M [00:03<00:00, 81.5MB/s]


In [3]:
!unzip /content/twitter-data.zip -d /content/data

Archive:  /content/twitter-data.zip
  inflating: /content/data/all_twitter_data.csv  
  inflating: /content/data/political_twitter_data.csv  


In [4]:
import string
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import numpy as np

import nltk
nltk.download('stopwords')
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

sns.set_style('whitegrid')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
pol_df = pd.read_csv('/content/data/political_twitter_data.csv', encoding='cp1252')
all_df = pd.read_csv('/content/data/all_twitter_data.csv',encoding='cp1252')

In [8]:
print(all_df.shape)
all_df.isnull().sum()

(2311105, 21)


tweet_date                          0
tweet_content                       0
tweet_id                            0
tweet_likes                         0
tweet_replies                       0
tweet_retweets                      0
tweet_quotes                        0
user_username                       0
user_id                             0
user_followers                      0
user_friends                        0
user_statuses                       0
user_verified                       0
user_url                            0
tweet_url                           0
mentioned_users               1688666
quotedTweet_id                1924546
quotedTweet_content           1924546
quotedTweet_username          1924546
quotedTweet_userID            1924546
quotedTweet_mentionedUsers    2228417
dtype: int64

In [9]:
print(pol_df.shape)
pol_df.isnull().sum()

(21360, 21)


tweet_date                        0
tweet_content                     0
tweet_id                          0
tweet_likes                       0
tweet_replies                     0
tweet_retweets                    0
tweet_quotes                      0
user_username                     0
user_id                           0
user_followers                    0
user_friends                      0
user_statuses                     0
user_verified                     0
user_url                          0
tweet_url                         0
mentioned_users               13765
quotedTweet_id                18654
quotedTweet_content           18654
quotedTweet_username          18654
quotedTweet_userID            18654
quotedTweet_mentionedUsers    20251
dtype: int64

In [10]:
pol_df['tweet_date'] = pol_df['tweet_date'].map(lambda x: str(x)[:-15])
all_df['tweet_date'] = all_df['tweet_date'].map(lambda x: str(x)[:-15])

all_df['tweet_date'] = pd.to_datetime(all_df['tweet_date'])
all_df = all_df[~(all_df['tweet_date'] < '2017-04-18')]

pol_df['tweet_date'] = pd.to_datetime(pol_df['tweet_date'])

In [11]:
punc = list(string.punctuation)
stop = stopwords.words('english') + punc + ['rt', 'via']

tt = TweetTokenizer()

In [12]:
def hashtag_extract(s):
  hashtags = []
  for i in s:
    ht = re.findall(r"#(\w+)", i)
    hashtags.append(ht)
  return hashtags

In [13]:
def fix_encode(df):
  df['tweet_content'] = df['tweet_content'].str.replace('&amp', '&')
  df['tweet_content'] = df['tweet_content'].str.replace('b"', '')
  df['tweet_content'] = df['tweet_content'].str.replace("b'", '')
  return df

In [14]:
def join_punct(s):
    return ''.join(ch for ch in s if ch not in punc)

In [15]:
def remove_emoji(s):
    emoji_pattern = re.compile(
        r'(\u00a9|\u00ae|[\u2000-\u3300]|\ud83c[\ud000-\udfff]|\ud83d[\ud000-\udfff]|\ud83e[\ud000-\udfff]|(?:\U0001f92d))',
        re.UNICODE)
    return emoji_pattern.sub(r'', s)

In [16]:
def remove_url(s):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    no_url = url_pattern.sub(r'', s)
    return no_url

In [17]:
def clean_tweet(s):
  clean_tweets = []
  for i in s:
    tweet_token = tt.tokenize(join_punct(remove_emoji(remove_url(i.lower()))))
    no_stop_words = [token for token in tweet_token if not token in stop]
    clean_tweets.append(no_stop_words)
  return clean_tweets

In [18]:
def preprocessing_user(df):

  df['mentioned_users']=df['mentioned_users'].fillna("")
  mentioned_user = []
  for i, user in enumerate(df['mentioned_users']):
    usernames = user.split("username='")
    username_list = []
    for username in usernames:
      username_list.append(re.findall("^(.+?)'", username))
    mentioned_user.append(username_list[1:])
  df = df.drop(columns=['mentioned_users'])
  df['mentioned_users'] = mentioned_user
  df['mentioned_users'] = df['mentioned_users'].apply(lambda x: np.nan if len(x)==0 else x)

  df['quotedTweet_mentionedUsers']=df['quotedTweet_mentionedUsers'].fillna("")
  mentioned_user = []
  for i, user in enumerate(df['quotedTweet_mentionedUsers']):
    usernames = user.split("username='")
    username_list = []
    for username in usernames:
      username_list.append(re.findall("^(.+?)'", username))
    mentioned_user.append(username_list[1:])
  df = df.drop(columns=['quotedTweet_mentionedUsers'])
  df['quotedTweet_mentionedUsers'] = mentioned_user
  df['quotedTweet_mentionedUsers'] = df['quotedTweet_mentionedUsers'].apply(lambda x: np.nan if len(x)==0 else x)

  return df

In [19]:
pol_df = fix_encode(pol_df)
pol_df['hashtags'] = hashtag_extract(pol_df['tweet_content'])
pol_df['clean_tweet_content'] = clean_tweet(pol_df['tweet_content'])
pol_df = preprocessing_user(pol_df)

In [20]:
pol_df

Unnamed: 0,tweet_date,tweet_content,tweet_id,tweet_likes,tweet_replies,tweet_retweets,tweet_quotes,user_username,user_id,user_followers,user_friends,user_statuses,user_verified,user_url,tweet_url,quotedTweet_id,quotedTweet_content,quotedTweet_username,quotedTweet_userID,hashtags,clean_tweet_content,mentioned_users,quotedTweet_mentionedUsers
0,2017-06-08,Please #VoteConservative today &; let's have a...,872880278473199616,10,1,4,0,AdamAfriyie,22031058,16392,860,2357,True,https://twitter.com/AdamAfriyie,https://twitter.com/AdamAfriyie/status/8728802...,,,,,[VoteConservative],"[please, voteconservative, today, lets, sensib...",[[theresa_may]],
1,2017-06-08,Visited polling stations across the constituen...,872786978256932864,12,1,5,0,AdamAfriyie,22031058,16392,860,2357,True,https://twitter.com/AdamAfriyie,https://twitter.com/AdamAfriyie/status/8727869...,,,,,[VoteConservative],"[visited, polling, stations, across, constitue...",,
2,2017-05-29,Another uplifting visit to Slough doorsteps to...,869274966352646145,13,2,5,1,AdamAfriyie,22031058,16392,860,2357,True,https://twitter.com/AdamAfriyie,https://twitter.com/AdamAfriyie/status/8692749...,,,,,[VoteConservative],"[another, uplifting, visit, slough, doorsteps,...","[[MarkVivis], [GregHands]]",
3,2017-05-27,The residents of Ascot were on the whole remar...,868539438644723712,9,1,3,0,AdamAfriyie,22031058,16392,860,2357,True,https://twitter.com/AdamAfriyie,https://twitter.com/AdamAfriyie/status/8685394...,,,,,[VoteConservative],"[residents, ascot, whole, remarkably, polite, ...",,
4,2017-05-16,Very happy to sign the #RefugeePledge. Importa...,864500887246577664,0,0,0,0,AdamBernard_HA,855166968525094914,898,1104,13128,False,https://twitter.com/AdamBernard_HA,https://twitter.com/AdamBernard_HA/status/8645...,,,,,"[RefugeePledge, GE2017, RefugeesWelcome]","[happy, sign, refugeepledge, important, rememb...",,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21355,2017-04-30,Tomorrow @mandoconserv need help in the aftern...,858664419269779457,0,0,0,0,LeedsCityTories,1688863044,1564,731,1837,False,https://twitter.com/LeedsCityTories,https://twitter.com/LeedsCityTories/status/858...,,,,,"[GE2017, conhold, Leeds]","[tomorrow, mandoconserv, need, help, afternoon...","[[mandoconserv], [PudseyCons]]",
21356,2017-04-30,Thanks to all our activists who were out campa...,858653858284937216,1,0,1,0,LeedsCityTories,1688863044,1564,731,1837,False,https://twitter.com/LeedsCityTories,https://twitter.com/LeedsCityTories/status/858...,,,,,"[conhold, GE2017]","[thanks, activists, campaigning, yesterday, to...","[[PudseyCons], [mandoconserv]]",
21357,2017-04-27,A great evening with our PM. Lovely to see so ...,857701195040919552,11,0,4,2,LeedsCityTories,1688863044,1564,731,1837,False,https://twitter.com/LeedsCityTories,https://twitter.com/LeedsCityTories/status/857...,,,,,[GE2017],"[great, evening, pm, lovely, see, many, member...",,
21358,2017-04-20,This Sat campaigning for @StuartAndrew @Pudse...,855167700917776384,4,0,1,0,LeedsCityTories,1688863044,1564,731,1837,False,https://twitter.com/LeedsCityTories,https://twitter.com/LeedsCityTories/status/855...,,,,,[Conservatives],"[sat, campaigning, stuartandrew, pudseycons, a...","[[StuartAndrew], [PudseyCons], [andreajenkyns]...",


In [None]:
fig, ax = plt.subplots(figsize=(20,10))
g = sns.histplot(data=pol_df, x='tweet_date')
plt.setp(g.get_xticklabels(), rotation=45)
plt.show()

In [21]:
!mkdir output

In [22]:
pol_df.to_csv('output/pol_actor_users.csv')
all_df.to_csv('output/all_users.csv')

In [23]:
!zip -r /content/output.zip /content/output

  adding: content/output/ (stored 0%)
  adding: content/output/pol_actor_users.csv (deflated 75%)
  adding: content/output/all_users.csv (deflated 75%)
