In [1]:
import pandas as pd
import nltk
import string
import pickle
import re


from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# logging for gensim (set to INFO)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
with open("all_tweets_df.pkl", 'rb') as picklefile: 
    df = pickle.load(picklefile)

In [3]:
print("Loaded DF with shape: ", df.shape)

(381016, 15)

In [6]:
df_rtrolls = df[df['account_category'] == 'RightTroll']


In [None]:
df_rtrolls = df_rtrolls[df_rtrolls['content'].apply(len) > 40]

In [None]:
aus_handles = set(df_rtrolls[df_rtrolls['content'].str.contains('auspol')].author)

df_rtrolls = df_rtrolls[~df_rtrolls['author'].isin(aus_handles)]

In [7]:
print("Right trolls df shape: ", df_rtrolls.shape)

(128681, 15)

In [116]:
df_rtrolls.reset_index(drop=True, inplace=True)
df_rtrolls.head()

Unnamed: 0,external_author_id,author,content,region,language,publish_date,harvested_date,following,followers,updates,post_type,account_type,new_june_2018,retweet,account_category
0,9.06e+17,10_GOP,"""We have a sitting Democrat US Senator on tria...",Unknown,English,10/1/2017 19:58,10/1/2017 19:59,1052,9636,253,,Right,0,0,RightTroll
1,9.06e+17,10_GOP,Marshawn Lynch arrives to game in anti-Trump s...,Unknown,English,10/1/2017 22:43,10/1/2017 22:43,1054,9637,254,,Right,0,0,RightTroll
2,9.06e+17,10_GOP,Daughter of fallen Navy Sailor delivers powerf...,Unknown,English,10/1/2017 22:50,10/1/2017 22:51,1054,9637,255,RETWEET,Right,0,1,RightTroll
3,9.06e+17,10_GOP,JUST IN: President Trump dedicates Presidents ...,Unknown,English,10/1/2017 23:52,10/1/2017 23:52,1062,9642,256,,Right,0,0,RightTroll
4,9.06e+17,10_GOP,"19,000 RESPECTING our National Anthem! #StandF...",Unknown,English,10/1/2017 2:13,10/1/2017 2:13,1050,9645,246,RETWEET,Right,0,1,RightTroll


In [18]:
# remove links
def remove_link(string):
    return re.sub(r'http[s]?\:\/\/[\S\s]\S+', '', string)

In [24]:
df_rtrolls['content'] = df_rtrolls['content'].apply(remove_link)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [107]:
def custom_tokenizer(text):
    full_punc = '’‘“”.–…�🇺🇸★➠' + string.punctuation
    # remove punctuation
    remove_punct = str.maketrans('', '', full_punc)
    text = text.translate(remove_punct)

    # remove digits and convert to lower case
    remove_digits = str.maketrans('', '', string.digits)
    text = text.lower().translate(remove_digits)

    # tokenize
    tokens = word_tokenize(text)

    # remove stop words
    punc = [str(i) for i in string.punctuation]
    cust_stop_words = (['rt', 'retweet', 'get', 'one', 'im', 'thing', 'get', 'dont', 'wow',
                       'lol', 'amp', 'n', 'didnt', 'people', 'like', 'want', 'know', 'go',
                        'think', 'need', 'right', 'good', 'would', 'going', 'never', 'see',
                        'time', 'call', 'said', 'got', 'us', 'p', 'look', 'mr'])
    stop_words = cust_stop_words + stopwords.words('english')
    tokens_stop = [y for y in tokens if y not in stop_words]

    # stem
#     stemmer = SnowballStemmer('english')
#     tokens_stem = [stemmer.stem(y) for y in tokens_stop] 

    return tokens_stop

In [108]:
tfidf = TfidfVectorizer(tokenizer=custom_tokenizer, min_df=5, max_df=0.85)
doc_vectors = tfidf.fit_transform(df_rtrolls.content)

In [109]:
nmf = NMF(n_components=20, alpha=.1, l1_ratio=.5)
nmf_vecs = nmf.fit_transform(doc_vectors)

In [110]:
feature_names = tfidf.get_feature_names()

In [105]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

In [111]:
print_top_words(nmf, feature_names, 15)

Topic #0:
rtamerica maga trumptrain ourboris americafirst trumppence john cchinesus p pjn morning good rescind exempting rule
Topic #1:
trump president supporters donald supporter calls watch react cnn coup romney voters military going trumptrain
Topic #2:
enlist army patriot usfa stand people read join freedom truth msm usfreedomarmy gt awaits patriots
Topic #3:
breaking arrested statue doj dead death state dem another steve isis shes confederate injured police
Topic #4:
amb auspol traitor mccain obamacare repeal mueller gop liberal mcmaster viral mikecarlton jeff mad watch
Topic #5:
hillary clinton bill doj campaign via election crooked remember deal fbi comey russia plea clintons
Topic #6:
charlottesville truth antifa tragedy media violence black reveals car dinesh bombshell response reacts mayor antitrump
Topic #7:
realdonaldtrump potus president great foxnews barbmuenchen people thank love like make mr vote support country
Topic #8:
news fake media fox cnn today ignoring bad chica

In [None]:
topic_dict = {}
for topic_idx, topic in enumerate(nmf.components_):
    topic_dict[topic_idx] = ", ".join([feature_names[i] \
                                for i in topic.argsort()[:-10 - 1:-1]])
    
print("Dictionary of topics to words:")
print(topic_dict)

### Storage

In [None]:
print("Storing data:")

In [None]:
    
with open('nmf.pkl', 'wb') as picklefile:
    pickle.dump(nmf, picklefile)
    
with open('tfidf.pkl', 'wb') as picklefile:
    pickle.dump(tfidf, picklefile)
    
import json
with open('topics2words.json', 'w') as fp:
    json.dump(topic_dict, fp)

In [None]:
print("adding topics to DF")

In [None]:
import operator
topics = []
for item in nmf_vecs:
    max_index, max_value = max(enumerate(item), key=operator.itemgetter(1))
    topics.append(max_index) 
    
df_rtrolls["topicnumber"] = pd.Series(topics, index=df_rtrolls.index)

In [None]:
topics_likelihood = []
for item in nmf_vecs:
    max_index, max_value = max(enumerate(item), key=operator.itemgetter(1))
    topics_likelihood.append(max_value)
    
df_rtrolls["strengthoftopic"] = pd.Series(topics_likelihood, index=df_rtrolls.index)     

In [None]:
print(df_rtrolls.topicnumber.value_counts()) #let's make sure this is a good model...

In [None]:
df_rtrolls['week'] = (pd.DatetimeIndex(df_rtrolls.date).week + 
                    (pd.DatetimeIndex(df_rtrolls.date).year-2015)*52)

In [None]:
with open("rtrolls_df.pkl", 'wb') as picklefile:
    pickle.dump(df_rtrolls, picklefile) 