In [1]:
import pandas as pd
import numpy as np

# cleaned tweet exports, one CSV per account
csv_paths = {'trudeau': 'data//trudeau_clean.csv', 'trump': 'data//trump_clean.csv'}
# account name -> DataFrame of that account's tweets
user_dfs = {name: pd.read_csv(path, encoding='utf-8') for name, path in csv_paths.items()}
# keep the per-account names available; they refer to the same frames as user_dfs
trudeau, trump = user_dfs['trudeau'], user_dfs['trump']

In [9]:
# drop tweets that have no text (empty string or missing) for every user
for u in user_dfs.keys():
    df = user_dfs[u]
    # Turn empty strings into NaN so that dropna() treats them as missing.
    # The result is assigned back instead of using inplace=True on the column,
    # which is not guaranteed to write through to the DataFrame.
    df['Tweets'] = df['Tweets'].replace('', float('NaN'))
    # Rows are dropped on the DataFrame (Series.dropna has no `subset`
    # argument). This is done in place on purpose: `trudeau` / `trump` refer
    # to these same frame objects and must see the cleaned rows as well.
    df.dropna(subset=['Tweets'], inplace=True)


In [27]:
# exploratory - regex for finding key words/phrases
import re

target_phrase = 'covid'
pattern = re.compile(target_phrase)

# collect (and echo) every Trudeau tweet whose lower-cased text matches
matching_tweets=[]
for tweet_text in trudeau['Tweets']:
    if pattern.search(tweet_text.lower()) is None:
        continue
    matching_tweets.append(tweet_text)
    print(tweet_text, '-----', sep='\n')
# number of matching tweets
print(len(matching_tweets))

I’m giving an update on the steps we’re taking to keep you safe from COVID-19, and announcing new funding to help Indigenous families and communities during this crisis. Tune in now for the details:
-----
Every day, @UN peacekeepers lay the groundwork for true and lasting peace. They protect the most vulnerable, promote human rights, and are on the front lines in the fight against COVID-19. Today, we thank them for all they do and the sacrifices they make.
-----
We all have the same goal right now: ending this pandemic. And we know that ramping up the development and distribution of vaccines, testing, and treatment is the way to do it. That’s why we’re stepping up and supporting #GlobalGoalUnite in the fight against COVID-19.
-----
All countries are being tested by the COVID-19 pandemic. Lives and livelihoods everywhere are being threatened by the virus and its impacts. In order to successfully rebuild our economies, we need to work together - and we need to think outside the box.
----

In [32]:
from nltk.corpus import stopwords

# punctuation to strip from tweets (note omission of underscore; '#' and '@'
# are not listed either, so hashtags and mentions keep their prefix).
# The curly apostrophe is written literally (a backslash in front of it is not
# a valid escape sequence) and both curly double quotes are included.
punctuation = '!"$%&\'’“”()*+,-./:;<=>?[\\]^`{|}~'
# characters that appear as words after splitting (to remove)
bad_chars = [',','.',';','!',':','’','-','&','“','”','...','(',')','?','%',
             '', '&amp;','به','را','و','ایران','और','–']

english_stopwords = set(stopwords.words('english'))  # common uninteresting words
# empirically added words to remove
# (every entry needs its own trailing comma: adjacent string literals are
# silently concatenated into a single word)
more_stopwords = {'dont','get','make','even','also','time','said','far','amp','new','would','like','us',
                  'back','two','its','many','want','done','made','really','yet','got','nothing',
                  'ever','read','one','last','well','way','total','see','look','complete','didnt',
                  'keep','today','go','going','must','years','much','pm','always','first','day','let',
                  'know','open','others','better','small','say','need','come','long','doesnt','weve',
                  'wrong','happen','true','everything','getting','three','zero','fact','knew','sure',
                  'ago','including','already','right','every','things','never','fast','im','youre','thats',
                  'around','since','met'}
ism_txt = 'trudeauisms.txt'  # file with the multi-word phrases to merge into one token
words_to_remove = more_stopwords.union(english_stopwords)  # all words to remove
# want to count the common phrases listed in the isms file as one word.
# Each non-blank line of the file is `phrase:replacement`.
common_phrases = {}
with open('data//kw_ana//'+ism_txt, encoding='utf-8') as f:
    for line in f:
        line = line.rstrip('\n')  # otherwise the newline ends up inside the replacement text
        if not line:
            continue  # tolerate blank lines, e.g. at the end of the file
        # split on the first colon only; a line without any colon still raises ValueError
        (key, val) = line.split(':', 1)
        common_phrases[key] = val
        
def tokenize_tweet(tweet):
    """Turn one tweet into a list of keyword tokens.

    The text is lower-cased, every character listed in ``punctuation`` is
    deleted, each key of ``common_phrases`` is replaced by its value, and the
    result is split on whitespace. Tokens found in ``words_to_remove`` or
    ``bad_chars`` are discarded.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Any non-string value (e.g. the float NaN pandas uses
        for a missing tweet) produces an empty list.

    Returns
    -------
    list of str
        The remaining tokens in their original order, duplicates included.
    """
    if not isinstance(tweet, str):
        return []  # missing tweet: nothing to tokenize

    tweet = tweet.lower()  # only want lowercase letters
    tweet = tweet.translate(str.maketrans('', '', punctuation))
    for orig, new in common_phrases.items():
        tweet = tweet.replace(orig, new)

    # split() without a separator breaks on any run of whitespace, so words
    # separated by line breaks or tabs are not glued into a single token
    return [w for w in tweet.split()
            if w not in words_to_remove and w not in bad_chars]

def _top_keywords(tweets, n=100):
    """Return the `n` most frequent tokens in `tweets` as a value_counts Series.

    The index holds the tokens and the values their counts, most frequent
    first. An empty collection of tweets gives an empty Series.
    """
    tokens = [w for t in tweets for w in tokenize_tweet(t)]
    # explicit dtype so that an empty token list still builds a valid Series
    return pd.Series(tokens, dtype=object).value_counts().head(n)


users = ['trudeau']
kw_dict = {u: None for u in users}
for u in users:
    user_tweets = user_dfs[u]
    # column name -> Series. Collected first and assembled in one go so that
    # columns of different length (a month with fewer than 100 distinct
    # keywords) are padded with NaN instead of raising a length mismatch.
    kw_columns = {}
    for m in ['January','February','March','April','May']:  # get top words from each month, create df
        mth = m.lower()[0:3]  # first three letters of lowercase month
        mth_tweets = user_tweets.loc[user_tweets['Month']==m, 'Tweets']  # tweets from specific month
        words = _top_keywords(mth_tweets)
        kw_columns['kw_'+mth] = pd.Series(words.index.values)
        kw_columns['freq_'+mth] = pd.Series(words.values)
    # same ranking over all of the user's tweets
    words = _top_keywords(user_tweets['Tweets'])
    kw_columns['kw_all'] = pd.Series(words.index.values)
    kw_columns['freq_all'] = pd.Series(words.values)
    keyword_freq = pd.DataFrame(kw_columns)

    kw_dict[u] = keyword_freq
    keyword_freq.to_csv('data//kw_ana//'+u+'_words.csv', header=True, encoding='utf-8',index=False)


In [33]:
# For each month (and for all tweets together) build a weighted keyword
# co-occurrence matrix over that period's top keywords and write it to CSV.
# TODO: loop over users as well?

u='trudeau'
# short month key used in the kw_* column names -> value stored in the 'Month' column
month_names = {'jan':'January','feb':'February','mar':'March','apr':'April','may':'May'}
for mth in ['jan','feb','mar','apr','may','all']:
    # dropna(): ignore any missing entries in the keyword column
    top100_words = list(kw_dict[u]['kw_'+mth].dropna())
    word_pos = {w: i for i, w in enumerate(top100_words)}  # keyword -> row/column position
    counts = np.zeros((len(top100_words), len(top100_words)), dtype='int64')  # weighted adjacency matrix (init with 0's)

    # tweets of the period this matrix describes ('all' = every tweet)
    if mth == 'all':
        mth_tweets = user_dfs[u]['Tweets']
    else:
        mth_tweets = user_dfs[u].loc[user_dfs[u]['Month']==month_names[mth], 'Tweets']

    for twt in mth_tweets:  # for each tweet
        # positions of the top keywords present in this tweet (don't care about words outside the top list)
        tweet_pos = {word_pos[w] for w in tokenize_tweet(twt) if w in word_pos}
        for a in tweet_pos:  # for each word in the tweet
            for b in tweet_pos:  # for each other word in the tweet
                if a != b:
                    # smaller position first so that the matrix stays upper triangular.
                    # Each unordered pair is visited from both ends, so a tweet
                    # adds 2 to the pair's weight.
                    counts[min(a, b), max(a, b)] += 1

    adj_mat = pd.DataFrame(counts, index=top100_words, columns=top100_words)
    adj_mat.to_csv('data//kw_ana//'+u+'_adjmat_'+mth+'.csv',sep=';')