In [49]:
import pandas as pd
import numpy as np

# Load each user's cleaned tweet table once and keep them addressable by user name.
csv_paths = {'trudeau': 'data//trudeau_clean.csv', 'trump': 'data//trump_clean.csv'}
user_dfs = {name: pd.read_csv(path, encoding='utf-8') for name, path in csv_paths.items()}
# convenience aliases pointing at the same DataFrame objects held in user_dfs
trudeau = user_dfs['trudeau']
trump = user_dfs['trump']

In [50]:
# remove all links
# regex=True is required: from pandas 2.0 on, Series.str.replace treats the pattern as a
# literal string by default, which would leave every t.co link in place.
# The pattern is a raw string so the backslashes reach the regex engine unchanged.
user_dfs['trump']['Tweets'] = user_dfs['trump']['Tweets'].str.replace(
    r'https:\/\/t\.co\/[-a-zA-Z0-9]{1,256}', '', regex=True)
# trim off whitespace from front and back of tweets
user_dfs['trump']['Tweets'] = user_dfs['trump']['Tweets'].str.strip()


In [138]:
# exploratory - regex for finding key words/phrases
import re

target_phrase = 'second amendment'

# target_phrase is interpreted as a regular expression and matched against the
# lowercased tweet text; matches are printed as they are found and collected.
phrase_re = re.compile(target_phrase)
matching_tweets=[]
for t in trump['Tweets']:
    if phrase_re.search(t.lower()) is None:
        continue
    matching_tweets.append(t)
    print(t)
    print('-----')
print(len(matching_tweets))

Governor @JimJusticeWV is a tremendous fighter for the incredible people of West Virginia. Big Jim is strong on Life the Second Amendment and Building the Wall! With Governors like Jim America will recover and get back to business. Jim has my Complete and Total Endorsement!
-----
Congressman @SteveStivers is doing a terrific job for the People of Ohio! He defends our Borders Supports our Veterans Strong on Crime and the Second Amendment. Steve has my Complete and Total Endorsement!
-----
Congressman Bill Posey is a tremendous fighter for the Great State of Florida. He is a big supporter of our #MAGA Agenda – Strong on Crime the Second Amendment and Loves our Veterans and Law Enforcement. Bill has my Complete and Total Endorsement!
-----
Congressman @MikeTurnerOH is a strong supporter and fighter for the People of Ohio! He will help us #MAGA! He’s Strong on the Border Tough on Crime will Protect our Vets and our GREAT Second Amendment. Mike has my Complete and Total Endorsement!
-----
C

In [141]:
from nltk.corpus import stopwords

# punctuation to strip from tweets (note omission of underscore, which joins the
# multi-word phrases in `trumpisms` below).
# The closing curly quote ” is stripped together with the opening one so that tokens such
# as  word”  are not counted separately from  word . The stray backslash in front of ’ was
# an invalid escape sequence; the backslash character itself is still covered by '\\'.
punctuation = '!"$%&\'’“”()*+,-./:;<=>?[\\]^`{|}~'
# characters that appear as words after splitting (to remove)
bad_chars = [',','.',';','!',':','’','-','&','“','”','...','(',')','?','%',
             '', '&amp;','به','را','و','ایران','और','–']

english_stopwords = set(stopwords.words('english'))  # common uninteresting words
# empirically added words to remove (apostrophes are stripped before lookup, hence 'dont', 'didnt', ...)
more_stopwords = {'dont','get','make','even','also','time','said','far','amp','new','would','like','us',
                  'back','two','its','many','want','done','made','really','yet','got','nothing',
                 'ever','read','one','last','well','way','total','see','look','complete','didnt',
                 'keep','today','go','going','must','years','much','pm','always','first','day','let',
                  'know','open','others','better','small','say','need','come','long','doesnt',
                  'wrong','happen','true','everything','getting','three','zero','fact','knew',
                 'ago','including','already','right','every','things','never','fast'}
# candidates currently not removed: 'good','bad'
words_to_remove = more_stopwords.union(english_stopwords)  # all words to remove
# want to count the following common Trump phrases as one word
# (keys are matched after lowercasing and punctuation stripping):
trumpisms = {'fake news':'fake_news', 'do nothing democrats':'do_nothing_democrats',
            'do nothing dems':'do_nothing_democrats','impeachment hoax':'impeachment_hoax',
            'white house':'white_house','thank you':'thank_you', 'second amendment':'second_amendment',
             'new york':'new_york', 'witch hunt':'witch_hunt',
            'mini mike':'mini_mike','cryin chuck':'cryin_chuck'}
            # 'sleepy joe':'sleepy_joe', 'crazy bernie':'crazy_bernie'
    
        
def tokenize_tweet(tweet):
    """Split a tweet into lowercase keyword tokens.

    The tweet is lowercased, every character in `punctuation` is deleted, each
    phrase in `trumpisms` is collapsed into its single underscore-joined token,
    and the text is split on single spaces. Tokens found in `words_to_remove`
    or `bad_chars` (which includes the empty string produced by repeated
    spaces) are dropped.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Any non-string value (e.g. NaN for a missing tweet)
        yields an empty list instead of raising.

    Returns
    -------
    list of str
        The remaining tokens, in their original order (duplicates kept).
    """
    if not isinstance(tweet, str):  # missing tweets show up as float NaN in the dataframe
        return []
    tweet = tweet.lower()  # only want lowercase letters
    tweet = tweet.translate(str.maketrans('','',punctuation))
    # phrases are replaced after punctuation removal so the keys match the cleaned text
    for orig, new in trumpisms.items():
        tweet = tweet.replace(orig, new)

    # The old  w == "it's"  debug print was dropped: apostrophes are deleted above,
    # so that comparison could never be true.
    return [w for w in tweet.split(sep=' ')
            if w not in words_to_remove and w not in bad_chars]

def top_keywords(tweets, n=100):
    """Return the `n` most frequent tokens over `tweets` as a value_counts Series.

    Tokens come from `tokenize_tweet`. An empty collection of tweets gives an
    empty Series (np.concatenate, used previously, raised on empty input).
    """
    tokens = [w for t in tweets for w in tokenize_tweet(t)]
    return pd.Series(tokens, dtype=object).value_counts().head(n)


# For every user build one table holding the top keywords and their frequencies
# per month plus over all tweets, keep it in kw_dict and write it to CSV.
users = ['trump']
kw_dict = {u: None for u in users}
for u in users:
    columns = {}  # column name -> Series; assembled into the frame in one go
    for m in ['January','February','March','April','May']:  # get top words from each month, create df
        mth = m.lower()[0:3]  # first three letters of lowercase month
        mth_tweets = user_dfs[u].loc[user_dfs[u]['Month']==m, 'Tweets']  # tweets from specific month
        words = top_keywords(mth_tweets)
        columns['kw_'+mth] = pd.Series(words.index)
        columns['freq_'+mth] = pd.Series(words.values)
    words = top_keywords(user_dfs[u]['Tweets'])
    columns['kw_all'] = pd.Series(words.index)
    columns['freq_all'] = pd.Series(words.values)
    # Building from a dict of Series pads shorter columns with NaN, so a period with
    # fewer than 100 distinct keywords no longer raises a length-mismatch error.
    keyword_freq = pd.DataFrame(columns)
    kw_dict[u] = keyword_freq
    keyword_freq.to_csv('data//kw_ana//'+u+'_words.csv', header=True, encoding='utf-8',index=False)


In [41]:
# Inspect the rows whose Month equals `m`; `u` and `m` are whatever earlier cells left behind.
month_mask = user_dfs[u]['Month'] == m
user_dfs[u].loc[month_mask]

Unnamed: 0.1,Unnamed: 0,Tweets,Length,Date,Source,Favourites,RTs,Language,isRT,Month


In [142]:
# Build the keyword co-occurrence matrix (weighted adjacency matrix) for one user and
# one period, and write it to CSV.
# TODO: loop over months and user?

mth='all'
u='trump'

top100_words = list(kw_dict[u]['kw_'+mth].dropna())
top100_set = set(top100_words)
word_pos = {w: i for i, w in enumerate(top100_words)}  # O(1) rank lookup instead of list.index

# Select the tweets that belong to `mth`. Previously this filtered on the stale loop
# variable `m` from the keyword cell, so the 'all' matrix only covered one month.
if mth == 'all':
    mth_tweets = user_dfs[u]['Tweets']
else:
    # same abbreviation rule as the keyword cell: first three lowercase letters of the month
    mth_tweets = user_dfs[u].loc[user_dfs[u]['Month'].str.lower().str[:3] == mth, 'Tweets']

# counts[i, j] with i < j = number of tweets containing both top100_words[i] and top100_words[j]
counts = np.zeros((len(top100_words), len(top100_words)), dtype='int64')
for twt in mth_tweets:  # for each tweet
    # ranks of the distinct top-100 words present in this tweet (words outside the top 100 are ignored)
    positions = sorted(word_pos[w] for w in set(tokenize_tweet(twt)) & top100_set)
    # Visit every unordered pair exactly once, lower rank first, so the matrix stays upper
    # triangular and each co-occurring pair adds 1 per tweet (the old nested loop visited
    # each pair from both sides and therefore added 2).
    for k, lo in enumerate(positions):
        for hi in positions[k+1:]:
            counts[lo, hi] += 1

adj_mat = pd.DataFrame(counts, index=top100_words, columns=top100_words)
adj_mat.to_csv('data///kw_ana//'+u+'_adjmat.csv',sep=';')