In [1]:
import pandas as pd
import numpy as np

# Load the cleaned tweet exports, one DataFrame per account, keyed by user name.
user_dfs = {
    'trudeau': pd.read_csv('data//trudeau_clean.csv', encoding='utf-8'),
    'trump': pd.read_csv('data//trump_clean.csv', encoding='utf-8'),
}
# Convenience aliases; these are the very same DataFrame objects stored in user_dfs.
trudeau = user_dfs['trudeau']
trump = user_dfs['trump']

In [2]:
# Remove all t.co links, then trim whitespace from the front and back of each tweet.
# The pattern is a raw string (the old "\/" sequences were invalid string escapes),
# and regex=True is passed explicitly: since pandas 2.0 Series.str.replace treats the
# pattern as literal text by default, which would leave every link in place.
trump_tweets_no_links = user_dfs['trump']['Tweets'].str.replace(
    r'https://t\.co/[-a-zA-Z0-9]{1,256}', '', regex=True)
user_dfs['trump']['Tweets'] = trump_tweets_no_links.str.strip()


In [8]:
# exploratory - regex for finding key words/phrases
import re

target_phrase = 'covid'
phrase_re = re.compile(target_phrase)

# Walk the tweets in order; echo and collect each one whose lowercased text
# contains the target pattern, then report how many were found.
matching_tweets = []
for t in trump['Tweets']:
    if phrase_re.search(t.lower()) is None:
        continue
    matching_tweets.append(t)
    print(t)
    print('-----')
print(len(matching_tweets))

CDC Director was totally misquoted by Fake News @CNN on Covid 19. He will be putting out a statement.
-----
“Economic impact of Covid-19.” @foxandfriends Very interesting analysis. Light at the end of the tunnel. Thank you!
-----
....to State/Local Governments for lost revenues from COVID 19 much needed Infrastructure Investments for Bridges Tunnels Broadband Tax Incentives for Restaurants Entertainment Sports and Payroll Tax Cuts to increase Economic Growth.
-----
“On February 19th there was a Democratic Debate in Las Vegas. Three words weren’t said: Virus CoronaVirus or COVID19. NEVER came up!” @BretBaier
-----
Having been involved in the negotiations to put it mildly the number that OPEC+ is looking to cut is 20 Million Barrels a day not the 10 Million that is generally being reported. If anything near this happens and the World gets back to business from the Covid 19.....
-----
America owes our very hard working food supply workers so much as they produce and deliver high quality f

In [4]:
from nltk.corpus import stopwords

# punctuation to strip from tweets (note omission of underscore; '#' and '@' are
# not listed either, so they are kept)
# The former "\’" was an invalid escape sequence; the backslash it contributed is
# still covered by "\\". The closing curly quote ” is now stripped as well, matching
# the opening “ that was already listed (otherwise tokens such as 'up”' survive).
punctuation = '!"$%&\'’“”()*+,-./:;<=>?[\\]^`{|}~'
# characters that appear as words after splitting (to remove)
bad_chars = [',','.',';','!',':','’','-','&','“','”','...','(',')','?','%',
             '', '&amp;','به','را','و','ایران','और','–']

english_stopwords = set(stopwords.words('english'))  # common uninteresting words
# empirically added words to remove
more_stopwords = {
    'dont','get','make','even','also','time','said','far','amp','new','would','like','us',
    'back','two','its','many','want','done','made','really','yet','got','nothing',
    'ever','read','one','last','well','way','total','see','look','complete','didnt',
    'keep','today','go','going','must','years','much','pm','always','first','day','let',
    'know','open','others','better','small','say','need','come','long','doesnt',
    'wrong','happen','true','everything','getting','three','zero','fact','knew',
    'ago','including','already','right','every','things','never','fast',
}
# not currently removed: 'good','bad',''
words_to_remove = more_stopwords.union(english_stopwords)  # all words to remove
# want to count the following common Trump phrases as one word
# (keys are lowercase, punctuation-free phrases; values are the single-token form):
trumpisms = {
    'fake news':'fake_news', 'do nothing democrats':'do_nothing_democrats',
    'do nothing dems':'do_nothing_democrats','impeachment hoax':'impeachment_hoax',
    'white house':'white_house','thank you':'thank_you', 'second amendment':'second_amendment',
    'new york':'new_york', 'witch hunt':'witch_hunt',
    'mini mike':'mini_mike','cryin chuck':'cryin_chuck',
}
# not currently merged: 'sleepy joe':'sleepy_joe', 'crazy bernie':'crazy_bernie'
            # 'sleepy joe':'sleepy_joe', 'crazy bernie':'crazy_bernie'
    
        
def tokenize_tweet(tweet):
    """Split a tweet into a list of keyword tokens.

    The text is lowercased, every character in the module-level `punctuation`
    string is deleted, each phrase in `trumpisms` is replaced by its
    single-token form, and the result is split on whitespace. Tokens found in
    `words_to_remove` or `bad_chars` are dropped.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Any non-string value (e.g. NaN for a missing tweet)
        yields an empty list instead of raising.

    Returns
    -------
    list of str
        Remaining tokens, in their original order (duplicates preserved).
    """
    if not isinstance(tweet, str):
        return []

    tweet = tweet.lower()  # only want lowercase letters
    tweet = tweet.translate(str.maketrans('', '', punctuation))
    # Phrase merging must happen before stopword filtering: several phrases
    # contain words ('new', 'nothing', ...) that would otherwise be removed.
    for orig, new in trumpisms.items():
        tweet = tweet.replace(orig, new)

    # split() with no separator breaks on any whitespace run (spaces, tabs,
    # newlines) and never produces empty tokens.
    return [w for w in tweet.split()
            if w not in words_to_remove and w not in bad_chars]

def _top_keywords(tweets, n=100):
    """Return the `n` most frequent tokens over `tweets` as a value_counts Series.

    Tokens are flattened with a plain list comprehension, so an empty
    collection of tweets gives an empty Series rather than an error.
    """
    tokens = [tok for twt in tweets for tok in tokenize_tweet(twt)]
    return pd.Series(tokens, dtype=object).value_counts().head(n)


users = ['trump']
kw_dict = {u: None for u in users}
for u in users:
    # Columns are collected as Series and assembled once at the end; columns
    # of unequal length (a month with fewer distinct keywords) are padded with
    # NaN instead of raising a length-mismatch error.
    kw_columns = {}
    for m in ['January','February','March','April','May']:  # get top words from each month
        mth = m.lower()[0:3]  # first three letters of lowercase month
        mth_tweets = user_dfs[u].loc[user_dfs[u]['Month']==m, 'Tweets']  # tweets from specific month
        words = _top_keywords(mth_tweets)
        kw_columns['kw_'+mth] = pd.Series(words.index)
        kw_columns['freq_'+mth] = pd.Series(words.values)
    # same ranking over every tweet of this user, regardless of month
    words = _top_keywords(user_dfs[u]['Tweets'])
    kw_columns['kw_all'] = pd.Series(words.index)
    kw_columns['freq_all'] = pd.Series(words.values)

    keyword_freq = pd.DataFrame(kw_columns)
    kw_dict[u] = keyword_freq
    keyword_freq.to_csv('data//kw_ana//'+u+'_words.csv', header=True, encoding='utf-8',index=False)


In [5]:
# Inspect the May rows for 'trump'. The user and month are spelled out here
# rather than read from loop variables left over in the kernel namespace.
user_dfs['trump'].loc[user_dfs['trump']['Month']=='May']

Unnamed: 0,Source,Tweets,Date,RTs,Favourites,isRT,id_str,Month
1598,Twitter for iPhone,Let New York’s Finest be New York’s Finest. Th...,2020-05-31 02:12:14,33863,202862,False,1266915358914621440,May
1599,Twitter for iPhone,The National Guard has been released in Minnea...,2020-05-31 02:08:42,62393,293293,False,1266914470066036736,May
1600,Twitter for iPhone,,2020-05-31 00:54:36,27811,87057,False,1266895821083234304,May
1601,Twitter for iPhone,MN Gov. Walz: We Estimate 80% of the Rioters A...,2020-05-31 00:20:50,20742,59221,False,1266887324505382912,May
1602,Twitter for iPhone,Hopefully a great successful and safe ROCKET L...,2020-05-30 18:37:07,19300,143675,False,1266800827621990400,May
...,...,...,...,...,...,...,...,...
2143,Twitter for iPhone,The Governor of Michigan should give a little ...,2020-05-01 12:42:23,40674,210242,False,1256202305680158720,May
2144,Twitter for iPhone,Cryin Chuck Schumer compared to what other Sen...,2020-05-01 12:14:36,21049,90556,False,1256195312965877760,May
2145,Twitter for iPhone,Cryin’ Chuck Schumer was on a late night show ...,2020-05-01 12:07:51,25324,110033,False,1256193615724007425,May
2146,Twitter for iPhone,Dirty Cop!,2020-05-01 11:55:42,19665,82710,False,1256190556839051264,May


In [6]:
# TODO: loop over months and user?

# Build a weighted keyword co-occurrence (adjacency) matrix for one user/month:
# entry [a, b] is the number of tweets that contain both keyword a and keyword b.
month = 'May'             # full month name, as stored in the 'Month' column
mth = month.lower()[0:3]  # three-letter suffix used in the kw_dict column names
u = 'trump'

top100_words = list(kw_dict[u]['kw_'+mth].dropna())  # skip missing entries, if any
top100_set = set(top100_words)
word_pos = {w: i for i, w in enumerate(top100_words)}  # keyword -> row/column position
# Select tweets by the month chosen in this cell (previously this relied on a
# loop variable `m` that happened to be left over from an earlier cell).
mth_tweets = user_dfs[u].loc[user_dfs[u]['Month']==month, 'Tweets']

# Accumulate in a plain integer array; it is wrapped in a labelled DataFrame below.
pair_counts = np.zeros((len(top100_words), len(top100_words)), dtype=np.int64)
for twt in mth_tweets:  # for each tweet
    # set intersection (don't care about words outside top 100)
    positions = sorted(word_pos[w] for w in set(tokenize_tweet(twt)) & top100_set)
    # Visit every unordered pair exactly once, smaller position first, so the
    # matrix stays upper triangular and each co-occurring tweet adds 1
    # (visiting the pair from both endpoints would add 2 per tweet).
    for i, row in enumerate(positions):
        for col in positions[i+1:]:
            pair_counts[row, col] += 1

adj_mat = pd.DataFrame(pair_counts, index=top100_words, columns=top100_words)  # weighted adjacency matrix
adj_mat.to_csv('data///kw_ana//'+u+'_adjmat_'+mth+'.csv',sep=';')