In [31]:
import pandas as pd
import nltk
import string
import pickle
import re

import emoji

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# logging for gensim (set to INFO)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
df = pd.read_csv('data/IRAhandle_tweets_1.csv')

In [3]:
df.shape

(381016, 15)

In [6]:
df_rtrolls = df[df['account_category'] == 'RightTroll']

In [7]:
df_rtrolls.shape

(128681, 15)

In [116]:
df_rtrolls.reset_index(drop=True, inplace=True)
df_rtrolls.head()

Unnamed: 0,external_author_id,author,content,region,language,publish_date,harvested_date,following,followers,updates,post_type,account_type,new_june_2018,retweet,account_category
0,9.06e+17,10_GOP,"""We have a sitting Democrat US Senator on tria...",Unknown,English,10/1/2017 19:58,10/1/2017 19:59,1052,9636,253,,Right,0,0,RightTroll
1,9.06e+17,10_GOP,Marshawn Lynch arrives to game in anti-Trump s...,Unknown,English,10/1/2017 22:43,10/1/2017 22:43,1054,9637,254,,Right,0,0,RightTroll
2,9.06e+17,10_GOP,Daughter of fallen Navy Sailor delivers powerf...,Unknown,English,10/1/2017 22:50,10/1/2017 22:51,1054,9637,255,RETWEET,Right,0,1,RightTroll
3,9.06e+17,10_GOP,JUST IN: President Trump dedicates Presidents ...,Unknown,English,10/1/2017 23:52,10/1/2017 23:52,1062,9642,256,,Right,0,0,RightTroll
4,9.06e+17,10_GOP,"19,000 RESPECTING our National Anthem! #StandF...",Unknown,English,10/1/2017 2:13,10/1/2017 2:13,1050,9645,246,RETWEET,Right,0,1,RightTroll


In [18]:
# remove links
def remove_link(string):
    return re.sub(r'http[s]?\:\/\/[\S\s]\S+', '', string)

In [24]:
df_rtrolls['content'] = df_rtrolls['content'].apply(remove_link)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [46]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [107]:
def custom_tokenizer(text):
    full_punc = '’‘“”.–…�🇺🇸★➠' + string.punctuation
    # remove punctuation
    remove_punct = str.maketrans('', '', full_punc)
    text = text.translate(remove_punct)

    # remove digits and convert to lower case
    remove_digits = str.maketrans('', '', string.digits)
    text = text.lower().translate(remove_digits)

    # tokenize
    tokens = word_tokenize(text)

    # remove stop words
    punc = [str(i) for i in string.punctuation]
    cust_stop_words = (['rt', 'retweet', 'get', 'one', 'im', 'thing', 'get', 'dont', 'wow',
                       'lol', 'amp', 'n', 'didnt'])
    stop_words = cust_stop_words + stopwords.words('english')
    tokens_stop = [y for y in tokens if y not in stop_words]

    # stem
#    stemmer = SnowballStemmer('english')
#    tokens_stem = [stemmer.stem(y) for y in tokens_stop] 

#    return tokens_stem

    return tokens_stop

In [108]:
tfidf = TfidfVectorizer(min_df=2, tokenizer=custom_tokenizer)
doc_vectors = tfidf.fit_transform(df_rtrolls.content)

In [109]:
nmf = NMF(n_components=20)
nmf_vecs = nmf.fit_transform(doc_vectors)

In [110]:
feature_names = tfidf.get_feature_names()

In [105]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

In [111]:
print_top_words(nmf, feature_names, 15)

Topic #0:
rtamerica maga trumptrain ourboris americafirst trumppence john cchinesus p pjn morning good rescind exempting rule
Topic #1:
trump president supporters donald supporter calls watch react cnn coup romney voters military going trumptrain
Topic #2:
enlist army patriot usfa stand people read join freedom truth msm usfreedomarmy gt awaits patriots
Topic #3:
breaking arrested statue doj dead death state dem another steve isis shes confederate injured police
Topic #4:
amb auspol traitor mccain obamacare repeal mueller gop liberal mcmaster viral mikecarlton jeff mad watch
Topic #5:
hillary clinton bill doj campaign via election crooked remember deal fbi comey russia plea clintons
Topic #6:
charlottesville truth antifa tragedy media violence black reveals car dinesh bombshell response reacts mayor antitrump
Topic #7:
realdonaldtrump potus president great foxnews barbmuenchen people thank love like make mr vote support country
Topic #8:
news fake media fox cnn today ignoring bad chica

### Without stemmer:

Topic #0:
rtamerica maga trumptrain ourboris americafirst trumppence john cchinesus p pjn morning good rescind exempting rule  
Topic #1:
trump president supporters donald watch supporter react calls cnn coup romney voters going military trumptrain  
Topic #2:
enlist army patriot usfa stand people read join freedom truth msm usfreedomarmy gt awaits enlistment  
Topic #3:
breaking arrested statue doj dead death state dem another steve isis confederate shes injured happening  
Topic #4:
amb auspol traitor mccain obamacare repeal mueller gop viral mcmaster mikecarlton watch jeff mad turnbull  
Topic #5:
hillary clinton bill doj campaign via election crooked remember deal fbi comey russia plea clintons  
Topic #6:
charlottesville truth tragedy antifa violence reveals black car bombshell dinesh reacts response antitrump mayor exposes  
Topic #7:
realdonaldtrump potus president great foxnews barbmuenchen people thank love like mr make vote support country  
Topic #8:
news fake fox cnn today bad ignoring chicago world change post terrible got antitrump man  
Topic #9:
us freedom like enlist join effect vote liberals help military china need tell election wants  
Topic #10:
look said liberals traitor got sides gop dem mccain wants politician nutty demanded pelosi took  
Topic #11:
korea north n merkel nuclear stance failed sides comments well humiliates fury scolds angela fire  
Topic #12:
new poll york shows times economy booming emails john prove mccain evidence implicated agree statues  
Topic #13:
white house people supremacist cnn supremacists mad max black supremacy race run pundit baiting accuses  
Topic #14:
obama iran isis flashback nuclear criminal enterprise deal attack collapsing barack terror left administration didnt  
Topic #15:
america join patriots fight save click stand dare bbsp make great back country first want  
Topic #16:
pjnet tcot maga american thinker ccot wakeupamerica aces flopping mt benghazi russia susan treachery rices  
Topic #17:
media liberal dems mainstream want blm ignore see fake lying today ignoring sides mcmaster exposing  
Topic #18:
video antifa police shock isis viral seattle arrested watch man shes epic american violent terrorists  
Topic #19:
trumps called assassination senator state boom hannity secret service looking blasts demands arrest petition immigration  

### With stemmer:

Topic #0:
trump presid support donald cnn attack react watch tweet slam side ralli tri blame romney  
Topic #1:
rtamerica maga pjnet tcot thinker unmask american trumptrain ace flop enterpris americafirst pjn collaps p  
Topic #2:
enlist patriot armi usfa join stand read freedom await truth need socialist msm usfreedomarmi gt  
Topic #3:
break arrest dem doj dead state death statu emerg back isi shes steve anoth polic  
Topic #4:
amb traitor auspol mccain mueller expos obamacar repeal gop viral dem mcmaster turnbul jeff mad  
Topic #5:
realdonaldtrump potus presid thank great mr foxnew barbmuenchen support love maga trumptrain seanhann pleas makeamericagreatagain  
Topic #6:
charlottesvill video antifa shock violenc truth polic tragedi thug expos violent terrorist report black media  
Topic #7:
ade rtt traitor mccain mueller expos obamacar dem gop watch mcmaster boll repeal eric mad  
Topic #8:
hillari clinton pjnet bill investig tcot email campaign russia comey elect doj uranium fbi benghazi  
Topic #9:
look said traitor like side got mccain dem gop nutti pelosi politician arr ara blame  
Topic #10:
korea north n merkel side nuclear fail stanc china prepar fire attack missil satellit warn  
Topic #11:
news fake media ignor cnn fox today bad report chicago post chang world show mainstream  
Topic #12:
obama pjnet tcot iran defend isi crimin islam flashback enterpris report collaps nuclear still deal  
Topic #13:
peopl go want vote like american say democrat know right need think time countri see  
Topic #14:
white hous supremacist cnn mad max accus race black claim supremaci pundit bait run leaker  
Topic #15:
liber statu remov confeder media monument histori disgrac destroy cover target watch terror hate attack  
Topic #16:
new poll show york time boom economi agre email expos prove john republican mccain post  
Topic #17:
us freedom enlist join effect militari help attack tell elect isi constitut year stand best  
Topic #18:
america make join fight stand great save bbsp click dare back first god countri patriot  
Topic #19:
call senat assassin state fire hanniti coup dem group leader deep terror gop servic demand  

### Vis

In [117]:
count = 0
for i in range(len(df_rtrolls.content)):
    tweet = df_rtrolls.loc[i, 'content']
    if count > 100:
        break
    if 'auspol' in tweet:
        count += 1
        print(df_rtrolls.loc[i, 'author'])

ALDRICH420
ALDRICH420
ALFREDTHREE
ALFREDTHREE
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERLINETR
AMBERL

In [120]:
df_rtrolls[df_rtrolls['author'] == 'AMBERLINETR']

Unnamed: 0,external_author_id,author,content,region,language,publish_date,harvested_date,following,followers,updates,post_type,account_type,new_june_2018,retweet,account_category
24505,8.900000e+17,AMBERLINETR,RT JJ_Kirkwood: Huge turn out for the launch o...,United States,English,7/27/2017 10:03,7/27/2017 10:03,964,2,65,,Right,0,0,RightTroll
24506,8.900000e+17,AMBERLINETR,RT senatorsfanNS: n_e_e_v_ CanadianForces Spec...,United States,English,7/27/2017 10:03,7/27/2017 10:03,964,2,69,,Right,0,0,RightTroll
24507,8.900000e+17,AMBERLINETR,RT CartwheelPrint: Joyce says he gave water ba...,United States,English,7/27/2017 10:03,7/27/2017 10:03,964,2,71,,Right,0,0,RightTroll
24508,8.900000e+17,AMBERLINETR,realDonaldTrump realDonaldTrump Tiny manchild ...,United States,English,7/27/2017 10:03,7/27/2017 10:03,964,2,64,,Right,0,0,RightTroll
24509,8.900000e+17,AMBERLINETR,RT sokra_tease: What would Asbestos Julie know...,United States,English,7/27/2017 10:03,7/27/2017 10:03,964,2,66,,Right,0,0,RightTroll
24510,8.900000e+17,AMBERLINETR,"RT DanielHRLC: Unfortunately for the Govt, jus...",United States,English,7/27/2017 10:03,7/27/2017 10:03,964,2,67,,Right,0,0,RightTroll
24511,8.900000e+17,AMBERLINETR,Kimbobear60 theprojecttv Eeeeuuuwwwww lah lah ...,United States,English,7/27/2017 10:03,7/27/2017 10:03,964,2,70,,Right,0,0,RightTroll
24512,8.900000e+17,AMBERLINETR,RT CanadianForces: We welcome Cdns of all sexu...,United States,English,7/27/2017 10:03,7/27/2017 10:03,964,2,68,,Right,0,0,RightTroll
24513,8.900000e+17,AMBERLINETR,RT sokra_tease: COALition Scott Morrison's pub...,United States,English,7/27/2017 10:11,7/27/2017 10:11,964,2,73,,Right,0,0,RightTroll
24514,8.900000e+17,AMBERLINETR,RT abcnews: 'We will sue in a heartbeat': Righ...,United States,English,7/27/2017 10:11,7/27/2017 10:11,964,2,72,,Right,0,0,RightTroll
