In [5]:
import numpy as np
import pandas as pd

import string
import importlib

import russia_df_utils
importlib.reload(russia_df_utils)
from russia_df_utils import get_uncategorized_RT_df_rows

import nltk
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim import corpora, models, similarities, matutils
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.decomposition import NMF

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [6]:
# Text to cluster: the 'processed_text' column of the frame returned by
# russia_df_utils.get_uncategorized_RT_df_rows().
# NOTE(review): presumably these are retweets that have no category yet -- confirm in russia_df_utils.
clustering_text = get_uncategorized_RT_df_rows()['processed_text']

In [7]:
# TF-IDF over unigrams and bigrams: English stop words are dropped and a term
# must appear in at least 10 tweets to enter the vocabulary.
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=10,
)
vectorized_tweets = tfidf.fit_transform(clustering_text.values)

# Inspection hint: wrap vectorized_tweets.todense() in a pd.DataFrame with the
# vectorizer's feature names as columns and call .head().

# scikit-learn returns one row per document, whereas gensim's Sparse2Corpus reads
# documents from the columns by default, hence the transpose.
tfidf_corpus = matutils.Sparse2Corpus(vectorized_tweets.transpose())

# Invert scikit-learn's term-to-column mapping into a column-to-term lookup.
id2word = {term_id: term for term, term_id in tfidf.vocabulary_.items()}

# Python 3 workaround kept from the original notebook: hand gensim a Dictionary
# built from the corpus instead of the plain dict.
id2word = corpora.Dictionary.from_corpus(tfidf_corpus, id2word=id2word)




2018-03-04 20:15:44,908 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-03-04 20:15:44,998 : INFO : adding document #10000 to Dictionary(0 unique tokens: [])
2018-03-04 20:15:45,090 : INFO : adding document #20000 to Dictionary(0 unique tokens: [])
2018-03-04 20:15:45,182 : INFO : adding document #30000 to Dictionary(0 unique tokens: [])
2018-03-04 20:15:45,272 : INFO : adding document #40000 to Dictionary(0 unique tokens: [])
2018-03-04 20:15:45,363 : INFO : adding document #50000 to Dictionary(0 unique tokens: [])
2018-03-04 20:15:45,454 : INFO : adding document #60000 to Dictionary(0 unique tokens: [])
2018-03-04 20:15:45,546 : INFO : adding document #70000 to Dictionary(0 unique tokens: [])
2018-03-04 20:15:45,639 : INFO : adding document #80000 to Dictionary(0 unique tokens: [])
2018-03-04 20:15:45,732 : INFO : adding document #90000 to Dictionary(0 unique tokens: [])
2018-03-04 20:15:45,825 : INFO : adding document #100000 to Dictionary(0 unique tokens: [])
20

In [8]:
# Fit a 300-topic LSI model on the TF-IDF corpus, asking gensim's stochastic SVD
# for 10 power iterations.
# NOTE(review): this cell originally carried only the bare note "4:11" -- presumably a
# timing or clock reading; confirm or drop.
lsi = models.LsiModel(tfidf_corpus, id2word=id2word, num_topics=300, power_iters=10)

2018-03-04 20:15:46,365 : INFO : using serial LSI version on this node
2018-03-04 20:15:46,366 : INFO : updating model with new documents
2018-03-04 20:15:46,435 : INFO : preparing a new chunk of documents
2018-03-04 20:15:46,528 : INFO : using 100 extra samples and 10 power iterations
2018-03-04 20:15:46,529 : INFO : 1st phase: constructing (21335, 400) action matrix
2018-03-04 20:15:47,034 : INFO : orthonormalizing (21335, 400) action matrix
2018-03-04 20:15:53,055 : INFO : 2nd phase: running dense svd on (400, 20000) matrix
2018-03-04 20:15:54,111 : INFO : computing the final decomposition
2018-03-04 20:15:54,114 : INFO : keeping 300 factors (discarding 15.902% of energy spectrum)
2018-03-04 20:15:54,252 : INFO : processed documents up to #20000
2018-03-04 20:15:54,264 : INFO : topic #0(9.057): 0.693*"realdonaldtrump" + 0.400*"hillaryclinton" + 0.137*"obama" + 0.106*"people" + 0.103*"just" + 0.100*"president" + 0.092*"vote" + 0.091*"like" + 0.088*"america" + 0.086*"don"
2018-03-04 2

In [9]:
# Request only the first five topics instead of formatting all 300 and slicing afterwards.
lsi.show_topics(num_topics=5)

[(0,
  '0.702*"realdonaldtrump" + 0.414*"hillaryclinton" + 0.140*"obama" + 0.101*"people" + 0.099*"just" + 0.092*"president" + 0.088*"like" + 0.087*"vote" + 0.085*"new" + 0.080*"maga"'),
 (1,
  '-0.789*"hillaryclinton" + 0.541*"realdonaldtrump" + -0.087*"obama" + -0.077*"campaign" + -0.069*"foundation" + -0.067*"hillaryclinton campaign" + -0.065*"hillaryclinton foundation" + -0.058*"wikileaks" + -0.054*"emails" + -0.045*"fbi"'),
 (2,
  '-0.348*"realdonaldtrump" + -0.311*"hillaryclinton" + 0.297*"obama" + 0.280*"people" + 0.277*"don" + 0.253*"like" + 0.225*"just" + 0.163*"new" + 0.127*"know" + 0.123*"news"'),
 (3,
  '-0.410*"new" + -0.399*"post" + -0.388*"news" + -0.371*"conservatexian" + -0.235*"news post" + -0.235*"conservatexian news" + 0.210*"don" + 0.190*"people" + 0.182*"like" + -0.173*"new post"'),
 (4,
  '0.761*"obama" + -0.221*"don" + -0.189*"people" + -0.175*"like" + 0.172*"tcot" + 0.159*"president" + -0.133*"new" + 0.130*"pjnet" + -0.117*"just" + -0.097*"post"')]

In [10]:
# Project the TF-IDF corpus into the LSI topic space ("transform" in sklearn terms).
# The result is lazy: documents are projected as the corpus is iterated.
lsi_corpus = lsi[tfidf_corpus]

# Materialise the projected documents once so they can be inspected and reused.
doc_vecs = list(lsi_corpus)

# Densify from the cached vectors rather than running the projection a second time, and
# take the dimensionality from the model so it cannot drift from the num_topics used to
# fit it. corpus2dense yields (topics x documents); transpose to one row per document.
ng_lsi = matutils.corpus2dense(
    doc_vecs,
    num_terms=lsi.num_topics,
    num_docs=len(doc_vecs),
).transpose()


In [11]:
# Partition the LSI document vectors into 8 clusters. k-means starts from randomly
# chosen centroids, so the seed is pinned (42, the same value the NMF and HDP cells use)
# to keep cluster labels stable across kernel restarts.
kmeans = KMeans(n_clusters=8, random_state=42)

# One cluster label per row of ng_lsi.
ng_lsi_clusters = kmeans.fit_predict(ng_lsi)


In [12]:
# Eyeball the first 25 cluster labels, then the 25 tweets they belong to.
for cluster_preview in (ng_lsi_clusters, clustering_text.values):
    print(cluster_preview[:25])

[0 3 0 0 0 7 0 7 0 0 0 0 5 0 0 0 4 0 0 0 0 0 0 0 0]
[' @ltapoll: Who was/is the best president of the past 25 years? (Vote & Retweet)'
 " @jww372: I don't have to guess your religion! #ChristmasAftermath"
 ' @Shareblue: Pence and his lawyers decided which of his official emails the public could see\n\n by @alisonrose711'
 ' @MDBlanchfield: You’ll never guess who tweeted something false that he saw on TV - The Washington Post '
 ' @100PercFEDUP: New post: WATCH: DIAMOND AND SILK Rip On John Kerry Over Israel Comments (VIDEO) '
 ' @AriaWilsonGOP: 3 Women Face Charges After Being Caught Stealing Dozens Of @realDonaldTrump Signs  '
 " @ElPenguinito: #myfarewellwordswouldbe I've buried my fortune in the park under a giant..."
 ' @America_1st_: CW: "The thing that impressed me was that @realDonaldTrump is always comfortable in own skin, but now he was comfortable being the P…'
 ' @AIIAmericanGirI: 🇺🇸\nObama to add 450 Iraq military advisers '
 ' @RadioACR: Chuck Todd vs Kellyanne Conway... a

# NMF #

In [13]:
# Factorise the TF-IDF matrix into 50 non-negative components; the seed is fixed and
# verbose=True prints the per-iteration "violation" values.
nmf = NMF(
    n_components=50,
    random_state=42,
    verbose=True,
)
# Rows of nmf_vecs are the per-tweet component loadings.
nmf_vecs = nmf.fit_transform(vectorized_tweets)

violation: 1.0
violation: 0.2461281225890938
violation: 0.1452092029585538
violation: 0.07952825221431065
violation: 0.07446987653475022
violation: 0.052586663956053444
violation: 0.03045346993545264
violation: 0.01835236338043699
violation: 0.012035856211256142
violation: 0.008689437646966554
violation: 0.006756356204306493
violation: 0.005578075828643977
violation: 0.004782908298547507
violation: 0.004311174891833146
violation: 0.003942252286432447
violation: 0.0037729757521743753
violation: 0.0036882787244867966
violation: 0.0036597476028326784
violation: 0.0036694981676755833
violation: 0.0037131669718605096
violation: 0.003778451433905039
violation: 0.003846823722209038
violation: 0.0038907078732588717
violation: 0.003942165131570739
violation: 0.003991962731535739
violation: 0.00402351679581964
violation: 0.004049354308822327
violation: 0.004069724089910894
violation: 0.004086269589801775
violation: 0.0040872505416875895
violation: 0.004072637985002181
violation: 0.00402916941090

In [14]:
# Second, smaller NMF factorisation of the same TF-IDF matrix.
# NOTE(review): despite the "5" in the name, this model is fitted with 20 components.
nmf5 = NMF(
    n_components=20,
    random_state=42,
    verbose=True,
)
nmf5_vecs = nmf5.fit_transform(vectorized_tweets)

violation: 1.0
violation: 0.38065049394503037
violation: 0.200338270445008
violation: 0.0941451771686548
violation: 0.05550344037620619
violation: 0.03868659341261819
violation: 0.029172674047618176
violation: 0.023128099007763164
violation: 0.01920107505261189
violation: 0.016666717013419537
violation: 0.015093326107775272
violation: 0.014018673213088929
violation: 0.013262134103046613
violation: 0.012695209126430686
violation: 0.012295348680886881
violation: 0.012057796635908788
violation: 0.011913119554762904
violation: 0.011817942575718245
violation: 0.011526312723809254
violation: 0.01145948812763975
violation: 0.01149037276950792
violation: 0.011629690813824712
violation: 0.011880934458351164
violation: 0.012149637483164279
violation: 0.01238415835377196
violation: 0.01278969233222047
violation: 0.01332128921700283
violation: 0.013968760710192447
violation: 0.014712726766971166
violation: 0.015576281521136168
violation: 0.016577190857849446
violation: 0.017738144707116883
violati

In [15]:
# KMeans is already imported in the notebook's first cell, so the duplicate import that
# used to open this cell is dropped.
# Group the NMF document vectors into 5 clusters; the seed is pinned (42, as in the NMF
# and HDP cells) so labels are reproducible across kernel restarts.
kmeans = KMeans(n_clusters=5, random_state=42)
clusters = kmeans.fit_predict(nmf5_vecs)

In [16]:
#agglo_model = AgglomerativeClustering(affinity='cosine', linkage='average')
#a_clusters = agglo_model.fit_predict(nmf_vecs)

#a_clusters[0:5]

In [17]:
# Print tweets 75-99 next to the NMF/k-means cluster label each one received.
lower, upper = 75, 100
zipped_clusters = list(
    zip(
        clusters[lower:upper],
        clustering_text.values[lower:upper],
    )
)
for cluster in zipped_clusters:
    print(cluster)

(1, ' @prchristen: ')
(1, ' @DrewMcKissick: Profile on what RNC & #GOPFaith Engagement Director @ChadConnelly is doing to engage Christian conservatives ')
(1, " @FeministaJones: I will bet cash money @realDonaldTrump doesn't know what district John Lewis represents")
(1, ' @CJperkins48: ENOUGH SAID!!!! ')
(1, ' @unapologetic_us: This so much. ')
(1, ' @cfresq: Ivanka or Tiffany?  #rejecteddebatetopics')
(0, ' @anthony9843: Fact\nShe used paid seat fillers & actors against Obama. ')
(1, ' @prchristen: ')
(1, ' @Latent_Fury: #TeamCap or #TeamIronMan \n\n#RejectedDebateTopics\n\n&lt;As @HillaryClinton whispers&gt; "Hail Hydra..." ')
(3, ' @BobMacAZ: Benghazi Marine: @HillaryClinton Put Her Political Career Ahead Of Doing What&#8217;s Right ')
(1, ' @BitcoinzMan: #Bitcoin foundation welcomes bruce fenton as new executive director #crypto #cryptocurrency  ')
(0, ' @DanMartin_cards: MT @fredbookmann: The Obama administration has perpetrated one scam after another for special interests. ')
(

In [18]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted features of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``; iterating it yields one weight
            vector per topic, and each vector must provide ``argsort()``.
        feature_names: Indexable collection mapping a feature index to its name.
        no_top_words: How many top-weighted feature names to print per topic.

    Returns:
        None. For each topic a ``Topic N:`` header line is printed, followed by
        one line with the feature names, heaviest first, separated by spaces.
    """
    for topic_no, weights in enumerate(model.components_):
        # argsort() is ascending, so read it backwards from the end to get the
        # largest weights first.
        ranked = weights.argsort()[:-no_top_words - 1:-1]
        top_names = [feature_names[idx] for idx in ranked]
        print("Topic %d:" % (topic_no))
        print(" ".join(top_names))


In [19]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2. Use
# get_feature_names_out() when the installed version has it, else the older call.
if hasattr(tfidf, 'get_feature_names_out'):
    nmf_feature_names = tfidf.get_feature_names_out()
else:
    nmf_feature_names = tfidf.get_feature_names()

# Top 30 terms for every topic of the 20-component NMF model.
display_topics(nmf5, nmf_feature_names, 30)

Topic 0:
realdonaldtrump maga realdonaldtrump realdonaldtrump supporters media realdonaldtrump supporters rally support says cnn realdonaldtrump hillaryclinton voting debate voting realdonaldtrump said supporter video trump2016 trumptrain realdonaldtrump supporter trumpsfavoriteheadline poll anti hillaryclinton realdonaldtrump win realdonaldtrump rally won support realdonaldtrump gop anti realdonaldtrump
Topic 1:
hillaryclinton campaign foundation hillaryclinton campaign hillaryclinton foundation wikileaks emails fbi realdonaldtrump hillaryclinton email hillaryclinton hillaryclinton hillaryclinton realdonaldtrump neverhillary debate dnc state obama hillaryclinton breaking video benghazi cnn media hillaryclinton email hillaryclinton emails says watch crooked corruption did haiti
Topic 2:
new new post conservatexian new york new york year video post conservatexian new year music times old york times new video new music happy watch poll check blicqer happy new realdonaldtrump new new hill

# LDA #

In [20]:
# Raw term counts for LDA, using the same stop-word, n-gram and min_df settings as the
# TF-IDF vectorizer.
count_vectorizer = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=10,
)
count_vectorized_tweets = count_vectorizer.fit_transform(clustering_text.values)

# Transposed because gensim's Sparse2Corpus reads documents from columns by default.
corpus = matutils.Sparse2Corpus(count_vectorized_tweets.transpose())

# Column index to term lookup, inverted from scikit-learn's vocabulary.
id2word_lda = {term_id: term for term, term_id in count_vectorizer.vocabulary_.items()}


In [21]:
# 20-topic LDA trained with 20 passes over the count corpus. random_state is pinned (42,
# the same value the NMF and HDP cells use) so the topics do not change on every re-run.
# NOTE(review): with several worker processes results may still differ slightly between
# runs -- confirm if exact reproducibility matters.
lda = models.LdaMulticore(
    corpus=corpus,
    num_topics=20,
    id2word=id2word_lda,
    passes=20,
    random_state=42,
)


2018-03-04 20:19:55,302 : INFO : using symmetric alpha at 0.05
2018-03-04 20:19:55,303 : INFO : using symmetric eta at 0.05
2018-03-04 20:19:55,306 : INFO : using serial LDA version on this node
2018-03-04 20:20:00,055 : INFO : running online LDA training, 20 topics, 20 passes over the supplied corpus of 147723 documents, updating every 14000 documents, evaluating every ~140000 documents, iterating 50x with a convergence threshold of 0.001000
2018-03-04 20:20:00,058 : INFO : training LDA model using 7 processes
2018-03-04 20:20:00,449 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #2000/147723, outstanding queue size 1
2018-03-04 20:20:00,630 : INFO : PROGRESS: pass 0, dispatched chunk #1 = documents up to #4000/147723, outstanding queue size 2
2018-03-04 20:20:00,902 : INFO : PROGRESS: pass 0, dispatched chunk #2 = documents up to #6000/147723, outstanding queue size 3
2018-03-04 20:20:00,910 : INFO : PROGRESS: pass 0, dispatched chunk #3 = documents up to #8000/1477

In [22]:
# show_topics() returns the same (topic_id, formatted words) list as print_topics(), but
# without also writing every topic to the INFO log, so each topic is displayed once
# instead of twice.
lda.show_topics(num_topics=20, num_words=20)

2018-03-04 20:31:04,920 : INFO : topic #0 (0.050): 0.035*"realdonaldtrump" + 0.035*"white" + 0.026*"said" + 0.023*"house" + 0.013*"hillaryclinton" + 0.011*"white house" + 0.009*"foxnews" + 0.007*"soros" + 0.006*"says" + 0.005*"obama" + 0.005*"black" + 0.005*"people" + 0.005*"camp" + 0.004*"30" + 0.004*"allenwestrepub" + 0.004*"george" + 0.004*"kelly" + 0.004*"team" + 0.004*"going" + 0.004*"story"
2018-03-04 20:31:04,921 : INFO : topic #1 (0.050): 0.019*"party" + 0.014*"police" + 0.008*"chicago" + 0.007*"2016in4words" + 0.007*"control" + 0.006*"senate" + 0.006*"thingsmoretrustedthanhillary" + 0.006*"water" + 0.005*"democrat" + 0.005*"ban" + 0.004*"democratic" + 0.004*"elect" + 0.004*"violence" + 0.004*"realdonaldtrump" + 0.004*"baltimore" + 0.004*"deep" + 0.004*"legal" + 0.004*"political" + 0.004*"democrats" + 0.004*"officers"
2018-03-04 20:31:04,922 : INFO : topic #2 (0.050): 0.028*"president" + 0.025*"realdonaldtrump" + 0.015*"mt" + 0.012*"states" + 0.012*"pjnet" + 0.008*"tedcruz" + 0

[(0,
  '0.035*"realdonaldtrump" + 0.035*"white" + 0.026*"said" + 0.023*"house" + 0.013*"hillaryclinton" + 0.011*"white house" + 0.009*"foxnews" + 0.007*"soros" + 0.006*"says" + 0.005*"obama" + 0.005*"black" + 0.005*"people" + 0.005*"camp" + 0.004*"30" + 0.004*"allenwestrepub" + 0.004*"george" + 0.004*"kelly" + 0.004*"team" + 0.004*"going" + 0.004*"story"'),
 (1,
  '0.019*"party" + 0.014*"police" + 0.008*"chicago" + 0.007*"2016in4words" + 0.007*"control" + 0.006*"senate" + 0.006*"thingsmoretrustedthanhillary" + 0.006*"water" + 0.005*"democrat" + 0.005*"ban" + 0.004*"democratic" + 0.004*"elect" + 0.004*"violence" + 0.004*"realdonaldtrump" + 0.004*"baltimore" + 0.004*"deep" + 0.004*"legal" + 0.004*"political" + 0.004*"democrats" + 0.004*"officers"'),
 (2,
  '0.028*"president" + 0.025*"realdonaldtrump" + 0.015*"mt" + 0.012*"states" + 0.012*"pjnet" + 0.008*"tedcruz" + 0.008*"united" + 0.007*"president realdonaldtrump" + 0.007*"ago" + 0.007*"hillaryclinton" + 0.007*"years" + 0.006*"america" 

In [23]:
# Infer a topic mixture for every document in the count corpus.
lda_corpus = lda[corpus]

# Materialise them: each entry is a list of (topic_id, probability) pairs.
lda_docs = list(lda_corpus)
lda_docs[:5]

[[(2, 0.47945994), (3, 0.128222), (4, 0.205), (11, 0.10731801)],
 [(11, 0.18401067), (15, 0.38203514), (17, 0.29228756)],
 [(3, 0.17952), (15, 0.14999999), (17, 0.5490514)],
 [(0, 0.14604317), (12, 0.42497894), (15, 0.35170513)],
 [(4, 0.07126545), (5, 0.19728139), (6, 0.3874147), (12, 0.29697967)]]

In [24]:
# Sanity check: number of per-document topic mixtures collected in lda_docs.
len(lda_docs)

147723

In [25]:
# Score the LDA topics against the count corpus. Each returned entry pairs a topic's
# (weight, word) list with a numeric score.
# NOTE(review): gensim documents this score as topic coherence -- confirm which measure is used.
lda.top_topics(corpus)

2018-03-04 20:33:23,926 : INFO : CorpusAccumulator accumulated stats from 1000 documents
2018-03-04 20:33:23,934 : INFO : CorpusAccumulator accumulated stats from 2000 documents
2018-03-04 20:33:23,942 : INFO : CorpusAccumulator accumulated stats from 3000 documents
2018-03-04 20:33:23,950 : INFO : CorpusAccumulator accumulated stats from 4000 documents
2018-03-04 20:33:23,958 : INFO : CorpusAccumulator accumulated stats from 5000 documents
2018-03-04 20:33:23,966 : INFO : CorpusAccumulator accumulated stats from 6000 documents
2018-03-04 20:33:23,974 : INFO : CorpusAccumulator accumulated stats from 7000 documents
2018-03-04 20:33:23,981 : INFO : CorpusAccumulator accumulated stats from 8000 documents
2018-03-04 20:33:23,990 : INFO : CorpusAccumulator accumulated stats from 9000 documents
2018-03-04 20:33:23,998 : INFO : CorpusAccumulator accumulated stats from 10000 documents
2018-03-04 20:33:24,005 : INFO : CorpusAccumulator accumulated stats from 11000 documents
2018-03-04 20:33:24

[([(0.02319336, 'people'),
   (0.014539365, 'realdonaldtrump'),
   (0.011794468, 'does'),
   (0.010719129, 'things'),
   (0.010181446, 'just'),
   (0.010170504, 'let'),
   (0.009749908, 'don'),
   (0.009194117, 'like'),
   (0.008720424, 'know'),
   (0.0075774677, 'need'),
   (0.007231572, 'real'),
   (0.0064592115, 'say'),
   (0.0063733547, 'happy'),
   (0.006227384, 'thing'),
   (0.0060654967, 'really'),
   (0.005932468, 'talking'),
   (0.0056976327, 'hell'),
   (0.005456358, 'life'),
   (0.005401547, 'doing'),
   (0.0052273604, 'isn')],
  -4.73253675974556),
 ([(0.09800326, 'realdonaldtrump'),
   (0.029963769, 'maga'),
   (0.023350336, 'hillaryclinton'),
   (0.015658705, 'america'),
   (0.014015118, 'great'),
   (0.011591438, 'thank'),
   (0.009843191, 'god'),
   (0.008806232, 'rally'),
   (0.007589515, 'new'),
   (0.00746998, 'trumptrain'),
   (0.006622258, 'neverhillary'),
   (0.0063026724, 'trumppence16'),
   (0.005603052, 'voting'),
   (0.005472317, 'jobs'),
   (0.005310502, 'mak

In [26]:
# Show the text of the second tweet (index 1) for a manual spot check.
clustering_text.values[1]

" @jww372: I don't have to guess your religion! #ChristmasAftermath"

In [27]:
# Fit gensim's HdpModel (Hierarchical Dirichlet Process) on the count corpus, reusing the
# LDA id-to-term mapping; the seed is fixed at 42.
hdp = models.HdpModel(corpus, id2word=id2word_lda, random_state=42)


2018-03-04 20:34:47,275 : INFO : (0, '0.001*realdonaldtrump + 0.001*hillaryclinton + 0.001*tcot + 0.000*obama gets + 0.000*editor + 0.000*teaparty + 0.000*obama + 0.000*patriotjewel + 0.000*genericza + 0.000*surrogate')
2018-03-04 20:34:47,295 : INFO : (1, '0.001*realdonaldtrump + 0.001*hillaryclinton + 0.000*courage + 0.000*teen + 0.000*datpiff promotion + 0.000*title track + 0.000*people + 0.000*staying + 0.000*ring + 0.000*stump')
2018-03-04 20:34:47,315 : INFO : (2, '0.001*realdonaldtrump + 0.001*hillaryclinton + 0.000*carminezozzora steph93065 + 0.000*rnrillinois remember + 0.000*djaytiger listen + 0.000*new + 0.000*years ago + 0.000*obama + 0.000*boston + 0.000*virtual')
2018-03-04 20:34:47,335 : INFO : (3, '0.001*realdonaldtrump + 0.001*hillaryclinton + 0.000*final + 0.000*beayahus + 0.000*fprnradio rundown + 0.000*ga + 0.000*makeamericagreatagain + 0.000*don + 0.000*gop house + 0.000*aborted')
2018-03-04 20:34:47,354 : INFO : (4, '0.001*realdonaldtrump + 0.001*hillaryclinton + 

In [28]:
# Keep 20 HDP topics with 20 words each. print_topics() also writes every topic to the
# log, which is why they appear in the cell output (INFO logging is enabled in the first cell).
topics = hdp.print_topics(num_topics=20, num_words=20)


2018-03-04 20:55:00,577 : INFO : (0, '0.010*realdonaldtrump + 0.008*hillaryclinton + 0.003*post + 0.003*new + 0.003*obama + 0.003*conservatexian + 0.003*news + 0.002*just + 0.002*people + 0.002*news post + 0.002*like + 0.002*conservatexian news + 0.001*don + 0.001*new post + 0.001*president + 0.001*conservatexian new + 0.001*says + 0.001*maga + 0.001*white + 0.001*america')
2018-03-04 20:55:00,597 : INFO : (1, '0.009*realdonaldtrump + 0.007*hillaryclinton + 0.004*tcot + 0.003*obama + 0.002*pjnet + 0.002*ccot + 0.002*people + 0.002*just + 0.001*new + 0.001*teaparty + 0.001*maga + 0.001*like + 0.001*tcot ccot + 0.001*don + 0.001*gop + 0.001*p2 + 0.001*petefrt + 0.001*news + 0.001*black + 0.001*president')
2018-03-04 20:55:00,617 : INFO : (2, '0.011*realdonaldtrump + 0.006*hillaryclinton + 0.003*obama + 0.002*people + 0.002*just + 0.002*new + 0.002*like + 0.001*enlist + 0.001*don + 0.001*vote + 0.001*abninfvet + 0.001*america + 0.001*media + 0.001*president + 0.001*news + 0.001*maga + 0.0

In [29]:
# NOTE(review): `topics` was requested with num_topics=20, so it has no entries at
# positions 50-59 and this slice evaluates to [] -- confirm which topics were meant to be
# inspected (or request more topics when building `topics`).
topics[50:60]

[]

In [30]:
def topic_prob_extractor(gensim_hdp):
    """Summarise a gensim HDP model as one total word weight per topic.

    Args:
        gensim_hdp: Model whose ``show_topics(num_topics=-1, formatted=False)``
            returns a list of ``(topic_id, [(word, weight), ...])`` entries.

    Returns:
        pandas.DataFrame with columns ``topic_id`` and ``weight``: one row per
        entry returned by ``show_topics``, in the same order, where ``weight`` is
        the sum of the weights of the words shown for that topic.
    """
    shown_topics = gensim_hdp.show_topics(num_topics=-1, formatted=False)

    # Read each entry's id and terms directly. Looking the terms up with
    # shown_topics[topic_id] is only correct when ids coincide with list positions;
    # otherwise it raises IndexError or pairs a topic with another topic's weight.
    topics_nos = [entry[0] for entry in shown_topics]
    weights = [sum(term[1] for term in entry[1]) for entry in shown_topics]

    return pd.DataFrame({'topic_id': topics_nos, 'weight': weights})
