In [1]:
import json
import logging
import operator
import pickle
import re
import string

import emoji
import nltk
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

# sklearn
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

# logging for gensim (set to INFO)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
# Load the previously pickled DataFrame from disk.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load pickles produced by a trusted step of this project.
with open("subset1_df.pkl", mode='rb') as subset_file:
    df = pickle.load(subset_file)

In [3]:
# Sanity check: (rows, columns) of the loaded frame.
df.shape

(278602, 12)

In [4]:
# Keep only the rows whose account_category is exactly 'RightTroll'.
is_right_troll = df['account_category'].eq('RightTroll')
df_rtrolls = df.loc[is_right_troll]

In [5]:
# Sanity check: (rows, columns) remaining after the RightTroll filter.
df_rtrolls.shape

(128681, 12)

In [6]:
# Drop short documents: keep rows whose content is longer than
# MIN_CONTENT_CHARS characters. `.str.len()` is vectorized and yields NaN for
# a missing content value (NaN > threshold is False, so the row is dropped),
# whereas `apply(len)` raises TypeError on such a value.
MIN_CONTENT_CHARS = 40
df_rtrolls = df_rtrolls[df_rtrolls['content'].str.len() > MIN_CONTENT_CHARS]

In [7]:
# Collect every author with at least one row whose content contains 'auspol'
# (case-sensitive), then drop ALL rows by those authors, not only the
# matching rows.
mentions_auspol = df_rtrolls['content'].str.contains('auspol')
aus_handles = set(df_rtrolls.loc[mentions_auspol, 'author'])

keep_author = ~df_rtrolls['author'].isin(aus_handles)
df_rtrolls = df_rtrolls.loc[keep_author]
df_rtrolls.head()

Unnamed: 0,author,content,region,language,following,followers,updates,retweet,account_category,date,hour,day
0,10_GOP,"""We have a sitting Democrat US Senator on tria...",Unknown,English,1052,9636,253,0,RightTroll,2017-10-01,19,6
1,10_GOP,Marshawn Lynch arrives to game in anti-Trump s...,Unknown,English,1054,9637,254,0,RightTroll,2017-10-01,22,6
2,10_GOP,Daughter of fallen Navy Sailor delivers powerf...,Unknown,English,1054,9637,255,1,RightTroll,2017-10-01,22,6
3,10_GOP,JUST IN: President Trump dedicates Presidents ...,Unknown,English,1062,9642,256,0,RightTroll,2017-10-01,23,6
4,10_GOP,"19,000 RESPECTING our National Anthem! #StandF...",Unknown,English,1050,9645,246,1,RightTroll,2017-10-01,2,6


In [8]:
# Renumber rows 0..n-1 after the filtering steps; the old index is discarded.
# Rebinding to the returned frame (instead of inplace=True) leaves the filtered
# slice untouched and gives later cells an independent frame to add columns to,
# so re-running this cell is harmless.
df_rtrolls = df_rtrolls.reset_index(drop=True)
df_rtrolls.head()

Unnamed: 0,author,content,region,language,following,followers,updates,retweet,account_category,date,hour,day
0,10_GOP,"""We have a sitting Democrat US Senator on tria...",Unknown,English,1052,9636,253,0,RightTroll,2017-10-01,19,6
1,10_GOP,Marshawn Lynch arrives to game in anti-Trump s...,Unknown,English,1054,9637,254,0,RightTroll,2017-10-01,22,6
2,10_GOP,Daughter of fallen Navy Sailor delivers powerf...,Unknown,English,1054,9637,255,1,RightTroll,2017-10-01,22,6
3,10_GOP,JUST IN: President Trump dedicates Presidents ...,Unknown,English,1062,9642,256,0,RightTroll,2017-10-01,23,6
4,10_GOP,"19,000 RESPECTING our National Anthem! #StandF...",Unknown,English,1050,9645,246,1,RightTroll,2017-10-01,2,6


In [9]:
# remove links
def remove_link(string):
    """Return ``string`` with every http:// or https:// link deleted.

    A link is "http" or "https", then "://", then any single character
    (the pattern allows whitespace here), then one or more non-whitespace
    characters. Consequently "http:// foo" is removed together with "foo",
    and a link with only one character after "://" is left in place.

    Note: the parameter name shadows the ``string`` module inside this
    function only.
    """
    link_pattern = re.compile(r'http[s]?\:\/\/[\S\s]\S+')
    return link_pattern.sub('', string)

In [10]:
# Strip links from every document's text, element by element.
content_without_links = df_rtrolls['content'].map(remove_link)
df_rtrolls['content'] = content_without_links

In [11]:
# Extra characters deleted together with string.punctuation.
_EXTRA_PUNCT = '’‘“”.–…�🇺🇸★➠'

# Deletion tables are built once instead of on every tokenizer call.
_PUNCT_TABLE = str.maketrans('', '', _EXTRA_PUNCT + string.punctuation)
_DIGIT_TABLE = str.maketrans('', '', string.digits)

# Corpus-specific stop words. Entries such as 'dont', 'didnt' and 'im' are the
# apostrophe-free forms left behind once punctuation has been deleted.
_CUSTOM_STOP_WORDS = frozenset([
    'rt', 'retweet', 'get', 'one', 'im', 'thing', 'dont', 'wow',
    'lol', 'amp', 'n', 'didnt', 'people', 'like', 'want', 'know', 'go',
    'think', 'need', 'right', 'good', 'would', 'going', 'never', 'see',
    'time', 'call', 'said', 'got', 'us', 'p', 'look', 'mr',
])

# Filled on first use so that defining the tokenizer does not touch the NLTK
# corpus; the corpus is read at most once per kernel.
_stop_words_cache = None


def _tokenizer_stop_words():
    """Return the custom plus NLTK English stop words as one frozenset."""
    global _stop_words_cache
    if _stop_words_cache is None:
        _stop_words_cache = _CUSTOM_STOP_WORDS | frozenset(stopwords.words('english'))
    return _stop_words_cache


def custom_tokenizer(text):
    """Split ``text`` into lower-case tokens with stop words removed.

    Steps, in order:
      1. delete every character in ``string.punctuation`` plus the extra
         symbols in ``_EXTRA_PUNCT``;
      2. lower-case the text and delete ASCII digits;
      3. tokenize with NLTK's ``word_tokenize``;
      4. drop tokens found in the custom or NLTK English stop-word lists.

    No stemming is applied.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Remaining tokens, in their original order.
    """
    # NOTE(review): apostrophes are deleted in step 1, so stop-list entries that
    # contain an apostrophe can never match a token. Confirm whether more
    # apostrophe-free contractions should be added to the custom list.
    text = text.translate(_PUNCT_TABLE)
    text = text.lower().translate(_DIGIT_TABLE)

    tokens = word_tokenize(text)

    # Set membership is O(1) per token, versus scanning a list.
    stop_words = _tokenizer_stop_words()
    return [token for token in tokens if token not in stop_words]

In [12]:
# Build the TF-IDF document-term matrix from the cleaned text, tokenizing with
# custom_tokenizer. min_df=5 and max_df=0.85 prune very rare and very common
# terms from the vocabulary.
tfidf = TfidfVectorizer(
    tokenizer=custom_tokenizer,
    min_df=5,
    max_df=0.85,
)
doc_vectors = tfidf.fit_transform(df_rtrolls['content'])

In [13]:
# Factorize the TF-IDF matrix into 20 topics. nmf_vecs holds one row of topic
# weights per document. A fixed random_state pins any randomness in the
# initialization/solver so topic numbering is repeatable across kernel restarts.
# NOTE(review): newer scikit-learn releases replace `alpha` with
# `alpha_W` / `alpha_H` — confirm against the installed version before upgrading.
NMF_SEED = 42
nmf = NMF(n_components=20, alpha=.1, l1_ratio=.5, random_state=NMF_SEED)
nmf_vecs = nmf.fit_transform(doc_vectors)

In [14]:
# Vocabulary terms, indexed like the columns of the TF-IDF matrix.
# Newer scikit-learn exposes get_feature_names_out() (returns an array, hence
# .tolist() to get plain Python strings); older releases only have
# get_feature_names(). Support both so the notebook runs on either.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out().tolist()
else:
    feature_names = tfidf.get_feature_names()

In [15]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in ``model``.

    For each row of ``model.components_`` this prints a "Topic #<index>:"
    header line followed by one line with the ``n_top_words`` terms of
    largest weight, strongest first, separated by spaces. A single blank
    line is printed after the last topic.

    Parameters
    ----------
    model : object
        Fitted model exposing ``components_`` (one row of term weights per topic).
    feature_names : sequence of str
        Term for each column index of ``components_``.
    n_top_words : int
        Number of terms to show per topic.
    """
    for topic_no, weights in enumerate(model.components_):
        # argsort is ascending: reverse it, then keep the leading entries.
        strongest_first = weights.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[term_idx] for term_idx in strongest_first]
        print("Topic #%d:" % topic_no)
        print(" ".join(top_terms))
    print()

In [16]:
# Show the 15 highest-weighted terms of each fitted NMF topic.
print_top_words(nmf, feature_names, 15)

Topic #0:
rtamerica maga john cchinesus morning pjn rule exempting rescind life mary hypocrites kumar congress prosecuted
Topic #1:
trump president supporters react watch donald calls cnn coup military sides voters romney traitor gop
Topic #2:
army patriot enlist join freedom awaits enlistment stand read usfreedomarmy truth needs always soon constitution
Topic #3:
breaking arrested dem doj politician dead steve shes death isis state hillary another injured happening
Topic #4:
charlottesville truth tragedy reveals violence car reacts dinesh bombshell response antitrump black mayor exposes tweet
Topic #5:
realdonaldtrump potus president barbmuenchen america great foxnews make thank love support darlovesamerica alwaysactions country vfl
Topic #6:
korea north merkel nuclear stance failed sides comments humiliates scolds angela twofaced chinas fury missile
Topic #7:
pjnet maga hillary tcot american thinker aces flopping clinton russia uranium investigate gifts comey benghazi
Topic #8:
news 

In [19]:
# Map each topic index to a comma-separated string of its 10 highest-weighted
# terms, strongest first.
topic_dict = {
    topic_no: ", ".join(
        feature_names[term_idx] for term_idx in weights.argsort()[::-1][:10]
    )
    for topic_no, weights in enumerate(nmf.components_)
}

print("Dictionary of topics to words:")
print(topic_dict)

Dictionary of topics to words:
{0: 'rtamerica, maga, john, cchinesus, morning, pjn, rule, exempting, rescind, life', 1: 'trump, president, supporters, react, watch, donald, calls, cnn, coup, military', 2: 'army, patriot, enlist, join, freedom, awaits, enlistment, stand, read, usfreedomarmy', 3: 'breaking, arrested, dem, doj, politician, dead, steve, shes, death, isis', 4: 'charlottesville, truth, tragedy, reveals, violence, car, reacts, dinesh, bombshell, response', 5: 'realdonaldtrump, potus, president, barbmuenchen, america, great, foxnews, make, thank, love', 6: 'korea, north, merkel, nuclear, stance, failed, sides, comments, humiliates, scolds', 7: 'pjnet, maga, hillary, tcot, american, thinker, aces, flopping, clinton, russia', 8: 'news, fake, ignoring, today, cnn, fox, bad, change, antitrump, acosta', 9: 'media, dems, liberal, blm, ignore, sides, lying, mainstream, exposing, mcmaster', 10: 'white, house, supremacist, cnn, supremacists, mad, max, pundit, baiting, race', 11: 'enlis

In [21]:
# Persist the fitted NMF model and the fitted TF-IDF vectorizer.
# The vectorizer was constructed with tokenizer=custom_tokenizer, and pickle
# stores functions by reference, so custom_tokenizer must be defined under the
# same name wherever tfidf.pkl is loaded.
with open('nmf.pkl', 'wb') as picklefile:
    pickle.dump(nmf, picklefile)

with open('tfidf.pkl', 'wb') as picklefile:
    pickle.dump(tfidf, picklefile)

# Save the topic -> top-words mapping. JSON object keys are always strings,
# so the integer topic indices are written as "0", "1", ...
with open('topics2words.json', 'w') as fp:
    json.dump(topic_dict, fp)

### Assigning Topics

In [22]:
# Topic weights of one example document (row 40): one value per NMF component.
nmf_vecs[40]

array([0.        , 0.        , 0.        , 0.00026463, 0.        ,
       0.00067424, 0.        , 0.        , 0.        , 0.        ,
       0.00157964, 0.00048485, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.00024661, 0.        ])

In [23]:
# Preview the frame before the topic columns are added.
df_rtrolls.head()

Unnamed: 0,author,content,region,language,following,followers,updates,retweet,account_category,date,hour,day
0,10_GOP,"""We have a sitting Democrat US Senator on tria...",Unknown,English,1052,9636,253,0,RightTroll,2017-10-01,19,6
1,10_GOP,Marshawn Lynch arrives to game in anti-Trump s...,Unknown,English,1054,9637,254,0,RightTroll,2017-10-01,22,6
2,10_GOP,Daughter of fallen Navy Sailor delivers powerf...,Unknown,English,1054,9637,255,1,RightTroll,2017-10-01,22,6
3,10_GOP,JUST IN: President Trump dedicates Presidents ...,Unknown,English,1062,9642,256,0,RightTroll,2017-10-01,23,6
4,10_GOP,"19,000 RESPECTING our National Anthem! #StandF...",Unknown,English,1050,9645,246,1,RightTroll,2017-10-01,2,6


In [24]:
# Label each document with the index of its highest-weighted topic.
# argmax(axis=1) works on the whole matrix at once and returns the first
# maximum in each row, so a document whose weights are all zero is labelled
# topic 0.
topics = nmf_vecs.argmax(axis=1)

# nmf_vecs has one row per row of df_rtrolls, in the same order, so the array
# can be assigned positionally.
df_rtrolls["topicnumber"] = topics

In [25]:
# Preview the frame with the new topicnumber column.
df_rtrolls.head()

Unnamed: 0,author,content,region,language,following,followers,updates,retweet,account_category,date,hour,day,topicnumber
0,10_GOP,"""We have a sitting Democrat US Senator on tria...",Unknown,English,1052,9636,253,0,RightTroll,2017-10-01,19,6,9
1,10_GOP,Marshawn Lynch arrives to game in anti-Trump s...,Unknown,English,1054,9637,254,0,RightTroll,2017-10-01,22,6,7
2,10_GOP,Daughter of fallen Navy Sailor delivers powerf...,Unknown,English,1054,9637,255,1,RightTroll,2017-10-01,22,6,12
3,10_GOP,JUST IN: President Trump dedicates Presidents ...,Unknown,English,1062,9642,256,0,RightTroll,2017-10-01,23,6,1
4,10_GOP,"19,000 RESPECTING our National Anthem! #StandF...",Unknown,English,1050,9645,246,1,RightTroll,2017-10-01,2,6,3


In [26]:
# Record how strongly each document loads on its dominant topic: the largest
# value in its row of NMF weights. These are raw factorization weights, not
# probabilities, despite the variable name.
topics_likelihood = nmf_vecs.max(axis=1)

df_rtrolls["strengthoftopic"] = topics_likelihood

In [28]:
# Model sanity check: number of documents assigned to each topic, largest first.
print(df_rtrolls.topicnumber.value_counts())

1     7885
5     6525
11    6450
3     4743
0     4526
12    4240
7     4180
19    3898
9     3772
18    3280
4     3257
17    2582
10    2566
14    2546
8     2505
2     2488
6     2370
13    1009
15     959
16     925
Name: topicnumber, dtype: int64


In [29]:
# Continuous week index: whole weeks elapsed since Monday 2014-12-29 (the first
# day of ISO week 1 of 2015), plus one. For dates in ISO year 2015 this equals
# the ISO week number, and it keeps increasing by exactly one per week after
# that.
#
# Combining the ISO week number with 52 * (calendar year - 2015) is not
# monotonic at year boundaries: 2016-01-01 falls in ISO week 53, giving
# 53 + 52 = 105, while 2016-01-04 gives 1 + 52 = 53. `DatetimeIndex.week` is
# also deprecated in newer pandas.
# NOTE(review): because 2015 has 53 ISO weeks, values for 2016 onward are one
# higher than the old formula produced — confirm no consumer depends on the
# old numbering.
# NOTE(review): assumes the date column is timezone-naive — confirm.
WEEK_ONE_MONDAY = pd.Timestamp("2014-12-29")
tweet_dates = pd.to_datetime(df_rtrolls["date"])
df_rtrolls['week'] = (tweet_dates - WEEK_ONE_MONDAY).dt.days // 7 + 1

In [31]:
# Persist the filtered frame, now including the topicnumber, strengthoftopic
# and week columns.
with open("rtrolls_df.pkl", mode='wb') as output_file:
    pickle.dump(df_rtrolls, output_file)