In [26]:
import pandas as pd
import numpy as np

trudeau = pd.read_csv('data//trudeau_clean.csv', encoding='utf-8')
trump = pd.read_csv('data//trump_clean.csv', encoding='utf-8')
user_dfs = {'trudeau':trudeau, 'trump':trump}

In [27]:
# remove all links
user_dfs['trump']['Tweets'] = user_dfs['trump']['Tweets'].str.replace('https:\/\/t\.co\/[-a-zA-Z0-9]{1,256}', '')
# trim off whitespace from front and back of tweets
user_dfs['trump']['Tweets'] = user_dfs['trump']['Tweets'].str.strip()


In [28]:
# exploratory - regex for finding key words/phrases
import re

target_phrase = ' '

matching_tweets=[]
for t in trump['Tweets']:
    if re.search(target_phrase, t.lower()) is not None:
        matching_tweets.append(t)
        print(t)
        print('-----')
print(len(matching_tweets))

0


In [4]:
from nltk.corpus import stopwords

english_stopwords = set(stopwords.words('english'))  # common uninteresting words
words_to_remove = {'https','amp'}.union(english_stopwords)  # other words I noticed
# want to count the following common Trump phrases as one word:
trumpisms = {'fake news':'fake_news', 'do nothing democrats':'do_nothing_democrats',
            'do nothing dems':'do_nothing_democrats','impeachment hoax':'impeachment_hoax',
            'white house':'white_house','thank you':'thank_you','sleepy joe':'sleepy_joe',
            'crazy bernie':'crazy_bernie', 'new york':'new_york'}
    
def tokenize_tweet(tweet):
    tweet = tweet.lower()  # only want lowercase letters
    for orig, new in trumpisms.items():
        tweet = tweet.replace(orig, new)
    
    words = tweet.split(sep=' ')
    keywords = []
    bad_chars = [',','.',';','!',':','’','-','&','“','”','...','(',')','?','%', '', '&amp;','به','را','و','ایران']
    for w in words:
        if w not in words_to_remove and w not in bad_chars:
            keywords.append(w)
    return keywords

users = ['trudeau','trump']
kw_dict = {u: None for u in users}
for u in users:
    keyword_freq = pd.DataFrame()
    for m in ['January','February','March','April']: # want cols of DF like this: jt_Jan_word, jt_Jan_freq, jt_Feb_word, jt_Feb_freq, etc
        mth = m.lower()[0:3]  # first three letters of lowercase month
        mth_tweets = user_dfs[u].loc[user_dfs[u]['Month']==m]['Tweets']
        words = pd.Series(np.concatenate([tokenize_tweet(t) for t in mth_tweets])).value_counts()[0:100]
        keyword_freq['kw_'+mth] = words.index
        keyword_freq['freq_'+mth] = words.values
    print(keyword_freq[0:100])
    kw_dict[u] = keyword_freq
    keyword_freq.to_csv('data//'+u+'_words.csv', header=True, encoding='utf-8')


      kw_jan  freq_jan     kw_feb  freq_feb     kw_mar  freq_mar      kw_apr  \
0      we’re        10      we’re        16      we’re        48       we’re   
1        get        10        get        12       help        26        keep   
2       keep         8    workers        10    support        21        help   
3       food         7       help        10       need        17       here:   
4     people         7       keep         9        get        16        need   
..       ...       ...        ...       ...        ...       ...         ...   
95   benefit         2    subsidy         2  essential         3  principles   
96  everyone         2  canadians         2      loved         3        task   
97    entire         2     below.         2      first         3       help,   
98     check         1     united         2    scotia.         3          go   
99  premiers         1      time.         2     better         3       loved   

    freq_apr  
0         27  
1        

In [5]:
list(kw_dict[u]['kw_'+mth])


['great',
 'people',
 'white_house',
 'fake_news',
 'thank_you',
 'states',
 'even',
 'get',
 'conference',
 'news',
 'thank_you!',
 'president',
 'new',
 'many',
 'p.m.',
 'back',
 'big',
 'media',
 'good',
 'today',
 'ventilators',
 'country',
 'job',
 'like',
 'total',
 'strong',
 'never',
 'complete',
 'far',
 'coronavirus',
 'small',
 'state',
 'second',
 'hard',
 'time',
 'congressman',
 'got',
 'lamestream',
 'military',
 'way',
 'endorsement!',
 'american',
 'crime',
 'businesses',
 'would',
 'border',
 'press',
 'said',
 'open',
 'spoke',
 'happy',
 'money',
 'eastern.',
 'york',
 'much',
 'china',
 'want',
 'left',
 'testing',
 'help',
 'united',
 'done',
 'fake_news!',
 'u.s.',
 'us',
 'fighter',
 'don’t',
 'long',
 'do_nothing_democrats',
 '@nytimes',
 'nothing',
 'must',
 'make',
 'failing',
 'democrats',
 'care',
 'invisible',
 'work',
 'it’s',
 'say',
 'always',
 'wanted',
 'governors',
 'take',
 'million',
 '@foxnews',
 'hospitals',
 'countries',
 'america',
 'radical',

In [6]:
# TODO: loop over months and user?

mth='apr'
u='trudeau'

top100_words = list(kw_dict[u]['kw_'+mth])
top100_set = set(top100_words)
adj_mat = pd.DataFrame(0, index=top100_words, columns=top100_words)  # weighted adjacency matrix (init with 0's)
mth_tweets = user_dfs[u].loc[user_dfs[u]['Month']==m]['Tweets']

for twt in mth_tweets:  # for each tweet
    tweet_words = set(tokenize_tweet(twt)) & top100_set  # set intersection (don't care about words outside top 100)
    for wrd in tweet_words:  # for each word in the tweet
        wrd_pos = top100_words.index(wrd)
        remaining_words = set(tweet_words) - {wrd}
        for rem in remaining_words:  # for each other word in the tweet
            rem_pos = top100_words.index(rem)
            if rem_pos<wrd_pos:  # want consistent ordering so that adj matrix is upper trian
                adj_mat.loc[rem,wrd] = adj_mat.loc[rem,wrd] + 1
            else:
                adj_mat.loc[wrd,rem] = adj_mat.loc[wrd,rem] + 1
adj_mat.to_csv('data//'+u+'_adjmat.csv',sep=';')