In [1]:
import numpy as np
import pandas as pd

import string
import importlib

import russia_df_utils
importlib.reload(russia_df_utils)
from russia_df_utils import get_uncategorized_df_rows

import nltk
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim import corpora, models, similarities, matutils
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.decomposition import NMF

import logging
# Route INFO-level log records to the notebook output, formatted as
# "timestamp : level : message".
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
class LemmaTokenizer(object):
    """Callable tokenizer that returns lemmas of the alphabetic tokens in a document.

    An instance is called with one document string. The text is split with
    NLTK's TweetTokenizer (constructed with ``preserve_case=False``), tokens
    that are not purely alphabetic are discarded, and the survivors are passed
    through a WordNet lemmatizer.
    """

    def __init__(self):
        self.wnl = WordNetLemmatizer()
        self.tokenizer = TweetTokenizer(preserve_case=False)

    def __call__(self, doc):
        """Return the list of lemmatized, purely alphabetic tokens of ``doc``."""
        lemmatize = self.wnl.lemmatize
        return [
            lemmatize(token)
            for token in self.tokenizer.tokenize(doc)
            if token.isalpha()
        ]


In [3]:
# Text to cluster: the 'processed_text' column of the rows returned by the
# russia_df_utils helper, requested with drop_retweets=True.
clustering_text = get_uncategorized_df_rows(drop_retweets=True)['processed_text']

In [4]:
# TF-IDF features over unigrams and bigrams, with English stop words removed
# and a minimum document frequency of 10.
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1,2), min_df=10)
vectorized_tweets = tfidf.fit_transform(clustering_text.values)

# Hand the matrix to gensim transposed (terms x documents), so that each
# column is one document.
tfidf_corpus = matutils.Sparse2Corpus(vectorized_tweets.transpose())

# sklearn's vocabulary_ maps term -> column index; invert it to index -> term
# and wrap it in a gensim Dictionary (noted by the original author as a
# Python 3 workaround).
index_to_term = {index: term for term, index in tfidf.vocabulary_.items()}
id2word = corpora.Dictionary.from_corpus(tfidf_corpus, id2word=index_to_term)




2018-03-05 21:33:44,088 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-03-05 21:33:44,161 : INFO : adding document #10000 to Dictionary(0 unique tokens: [])
2018-03-05 21:33:44,235 : INFO : adding document #20000 to Dictionary(0 unique tokens: [])
2018-03-05 21:33:44,309 : INFO : adding document #30000 to Dictionary(0 unique tokens: [])
2018-03-05 21:33:44,383 : INFO : adding document #40000 to Dictionary(0 unique tokens: [])
2018-03-05 21:33:44,457 : INFO : adding document #50000 to Dictionary(0 unique tokens: [])
2018-03-05 21:33:44,503 : INFO : built Dictionary(6340 unique tokens: ['realdonaldtrump administration', 'unknown', 'presidential candidates', 'realdonaldtrump good', 'elected']...) from 51867 documents (total 116337 corpus positions)


In [5]:
#4:11
# NOTE(review): the "4:11" marker above is unexplained (possibly a timing note) — confirm or remove.
# Fit an LSI model on the TF-IDF corpus, requesting 300 topics and 10 power iterations.
lsi = models.LsiModel(tfidf_corpus, id2word=id2word, num_topics=300, power_iters=10)

2018-03-05 21:33:44,509 : INFO : using serial LSI version on this node
2018-03-05 21:33:44,510 : INFO : updating model with new documents
2018-03-05 21:33:44,651 : INFO : preparing a new chunk of documents
2018-03-05 21:33:44,738 : INFO : using 100 extra samples and 10 power iterations
2018-03-05 21:33:44,741 : INFO : 1st phase: constructing (6340, 400) action matrix
2018-03-05 21:33:45,191 : INFO : orthonormalizing (6340, 400) action matrix
2018-03-05 21:33:47,612 : INFO : 2nd phase: running dense svd on (400, 20000) matrix
2018-03-05 21:33:48,574 : INFO : computing the final decomposition
2018-03-05 21:33:48,578 : INFO : keeping 300 factors (discarding 14.627% of energy spectrum)
2018-03-05 21:33:48,608 : INFO : processed documents up to #20000
2018-03-05 21:33:48,615 : INFO : topic #0(12.545): 0.664*"realdonaldtrump" + 0.471*"hillaryclinton" + 0.347*"politics" + 0.111*"obama" + 0.106*"news" + 0.090*"campaign" + 0.086*"says" + 0.081*"hillaryclinton realdonaldtrump" + 0.076*"people" +

In [6]:
# Show the first five LSI topics. Asking for five directly avoids formatting
# every topic of the model only to slice the list afterwards.
lsi.show_topics(num_topics=5)

[(0,
  '0.684*"realdonaldtrump" + 0.461*"hillaryclinton" + 0.342*"politics" + 0.110*"news" + 0.102*"obama" + 0.090*"campaign" + 0.082*"says" + 0.076*"hillaryclinton realdonaldtrump" + 0.072*"realdonaldtrump hillaryclinton" + 0.069*"people"'),
 (1,
  '0.759*"hillaryclinton" + -0.584*"realdonaldtrump" + 0.073*"don" + 0.053*"like" + 0.053*"just" + 0.050*"hillaryclinton campaign" + 0.049*"trumpforpresident" + 0.047*"people" + 0.044*"obama" + 0.044*"know"'),
 (2,
  '0.402*"don" + 0.322*"people" + 0.288*"like" + 0.277*"love" + -0.242*"hillaryclinton" + 0.218*"just" + 0.192*"want" + -0.192*"politics" + 0.161*"know" + 0.130*"islamkills"'),
 (3,
  '0.701*"trumpforpresident" + 0.406*"hillaryforprison2016" + 0.273*"hillaryforprison2016 trumpforpresident" + -0.249*"politics" + -0.151*"obama" + 0.118*"maga" + 0.111*"trumppence16" + 0.106*"vote" + 0.094*"trump2k16" + 0.091*"realdonaldtrump"'),
 (4,
  '-0.901*"love" + 0.209*"obama" + 0.144*"islamkills" + 0.100*"like" + 0.099*"people" + 0.088*"refugee

In [7]:
# Project the original TF-IDF corpus into the LSI space ("transform" in sklearn).
# The result is lazy: the projection is recomputed each time it is iterated.
lsi_corpus = lsi[tfidf_corpus]

# Materialize the document vectors once so they can be inspected and reused.
doc_vecs = list(lsi_corpus)

# Dense (documents x topics) matrix for clustering. It is built from the
# already-materialized vectors so the projection is not computed a second
# time, and its width comes from the model instead of a hard-coded 300.
ng_lsi = matutils.corpus2dense(doc_vecs, num_terms=lsi.num_topics).transpose()


In [8]:
# Seeded so that cluster labels are reproducible between runs; 42 matches the
# random_state used for the NMF and HDP models in this notebook.
kmeans = KMeans(n_clusters=8, random_state=42)

# Fit on the dense LSI vectors and obtain one cluster label per tweet.
ng_lsi_clusters = kmeans.fit_predict(ng_lsi)


In [9]:
# Spot check: cluster labels of the first 25 tweets, followed by their text.
print(ng_lsi_clusters[0:25])
print(clustering_text.values[0:25])

[3 6 3 3 3 3 3 0 3 3 3 3 3 3 3 3 3 3 3 0 3 6 6 3 3]
['#IslamKills Are you trying to say that there were no terrorist attacks in Europe before refugees were let in?'
 '@HillaryClinton: @realDonaldTrump should’ve apologized more, attacked less '
 '@ModicaGiunta me, too!'
 'One of the ways to remind that #BlackLivesMatter #BlackPressDay'
 'Dave Chappelle: "blacklivesmatter" is the worst slogan I\'ve ever heard! How about "enough is enough"? VotingTrump! '
 '#My2017BiggestHope to reach this level of pettiness '
 'The war is here! \nThis gentleman made more sense in 30 sec than #Obama for all time of his presidency.. '
 "Obama on @realDonaldTrump winning: 'Anything's possible'  #politics"
 'it’s impossible! #TexasJihad'
 'Bewaffnete attackieren Bus mit koptischen Christen #Islamisten #ISIS \n'
 'The bright example of our failing education '
 '@sendavidperdue How are they gonna protect us if they just let a bunch of terrorist walk the cities of our city? #StopIslam #IslamKills'
 'FC Barcelon

# NMF #

In [12]:
# Factorize the TF-IDF matrix with a 50-component NMF (seeded). verbose=True
# produces the per-iteration "violation" lines shown in the cell output.
nmf = NMF(n_components=50, random_state=42, verbose=True)
nmf_vecs = nmf.fit_transform(vectorized_tweets)

violation: 1.0
violation: 0.20416876729542677
violation: 0.10140874147201431
violation: 0.05503858589276304
violation: 0.037237837360234435
violation: 0.03351818757651213
violation: 0.034615198252984355
violation: 0.01962061654325611
violation: 0.011645723828396556
violation: 0.00850062126796758
violation: 0.0068851316745896195
violation: 0.005701861583678972
violation: 0.004791486399580732
violation: 0.0040508308092777
violation: 0.0034942826392379456
violation: 0.003031967724829557
violation: 0.0025872818821764355
violation: 0.0023509107030671044
violation: 0.002194392520670959
violation: 0.00209408193815759
violation: 0.0020620237315415453
violation: 0.0020779268793024467
violation: 0.0021260039065640435
violation: 0.0022011183841074097
violation: 0.0022797382130662182
violation: 0.002401121283151761
violation: 0.0024622634666986803
violation: 0.0024657196126994835
violation: 0.0024023391876014603
violation: 0.0022815307133172676
violation: 0.002107098105879616
violation: 0.00190729

In [13]:
# Second NMF fit on the same TF-IDF matrix, with fewer components.
# NOTE(review): the names say "5" but n_components is 20 — confirm which was intended.
nmf5 = NMF(n_components=20, random_state=42, verbose=True)
nmf5_vecs = nmf5.fit_transform(vectorized_tweets)

violation: 1.0
violation: 0.3185685343303778
violation: 0.1461754050022826
violation: 0.08518156712224928
violation: 0.05528242484762138
violation: 0.0418307642192521
violation: 0.03485910370612976
violation: 0.030352762151508082
violation: 0.026025711721994697
violation: 0.021981088146432938
violation: 0.0187284522312216
violation: 0.016488453514572555
violation: 0.013987063437205155
violation: 0.012346388217148504
violation: 0.011451386490604003
violation: 0.010782266837816414
violation: 0.010111634972094765
violation: 0.00953879516253799
violation: 0.00943849749906012
violation: 0.009505567019445035
violation: 0.009769202354640839
violation: 0.010232920145394496
violation: 0.010890256462860317
violation: 0.011815144081656377
violation: 0.013038274773777874
violation: 0.014711488474852067
violation: 0.01703573573840461
violation: 0.01989010347747923
violation: 0.022470019520870393
violation: 0.023174111961465113
violation: 0.021030636794829177
violation: 0.016893339661045843
violatio

In [14]:
# Cluster tweets on their NMF representation. KMeans is already imported in
# the notebook's import cell, so the duplicate mid-notebook import is dropped.
# A fixed seed keeps the labels stable across runs, matching the
# random_state=42 used for the NMF fits.
# NOTE: this rebinds `kmeans`, replacing the model fitted on the LSI vectors.
kmeans = KMeans(n_clusters=5, random_state=42)
clusters = kmeans.fit_predict(nmf5_vecs)

In [15]:
#agglo_model = AgglomerativeClustering(affinity='cosine', linkage='average')
#a_clusters = agglo_model.fit_predict(nmf_vecs)

#a_clusters[0:5]

In [16]:
# Print (cluster label, tweet text) pairs for a fixed window of tweets.
lower, upper = 75, 100
window = slice(lower, upper)
zipped_clusters = list(zip(clusters[window], clustering_text.values[window]))
for cluster in zipped_clusters:
    print(cluster)

(0, 'eeeew ')
(0, "#Shooting incidents make people feel like tracked animal. It's sad. #RooseveltFieldMall  #GardenCityShooting")
(1, "Today's the day, go out and vote for @realDonaldTrump #TrumpPence16 #Trump2k16 #MAGA #Election2016 #MakeAmericaGreatAgain #TrumpForPresident")
(0, 'Obama’s Kenyan half-brother says he supports @realDonaldTrump  #news')
(0, 'Get the Latest DueyDialer  #LastMinuteGifts2016 #DueyDialerNews #Marketing #YourRights #DUI #Lawyers')
(0, '@GigaWalrus He also lost the popular vote, you forgot to mention it.')
(2, "The Nation doesn't deserve another spectacle of a @HillaryClinton in the WH #HillaryNoThnx")
(0, 'Texas Democrats Indicted for Buying Votes With Cocaine   #tcot #PJNET #ccot #WakeUpAmerica #RedNationRising')
(0, "#TrumpBecause And you ain't seen nothing yet #DonaldTrumpforPresident will make #America great again 🍺🍺🍺")
(0, 'Burned coal is better than no coal!!  #SecondhandGifts')
(0, '#HappyBirthdayHarryTruman He had guts to drop the bomb')
(0, 'NBC Nigh

In [17]:
def display_topics(model, feature_names, no_top_words, sep=" "):
    """Print the highest-weighted features of every topic of a fitted model.

    Args:
        model: object exposing ``components_``, iterated as one array of
            feature weights per topic (the NMF models in this notebook).
        feature_names: sequence mapping a column index of ``components_`` to
            its term.
        no_top_words: number of terms printed per topic, highest weight first.
        sep: string placed between terms. The default (a single space) keeps
            the original output; pass e.g. ``", "`` when the vocabulary
            contains multi-word n-grams, which otherwise run into their
            neighbours and cannot be told apart from single words.

    Returns:
        None. For each topic a ``Topic <index>:`` line and a line of terms are
        written to standard output.
    """
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so read the indices from the end backwards.
        top_indices = topic.argsort()[:-no_top_words - 1:-1]
        print("Topic %d:" % (topic_idx))
        print(sep.join([feature_names[i] for i in top_indices]))


In [18]:
# Top 30 terms for each topic of the `nmf5` model.
# scikit-learn 1.0 deprecated get_feature_names() in favour of
# get_feature_names_out() and 1.2 removed it, so use whichever accessor the
# installed version provides.
if hasattr(tfidf, "get_feature_names_out"):
    tfidf_feature_names = tfidf.get_feature_names_out()
else:
    tfidf_feature_names = tfidf.get_feature_names()
display_topics(nmf5, tfidf_feature_names, 30)

Topic 0:
realdonaldtrump hillaryclinton realdonaldtrump realdonaldtrump hillaryclinton realdonaldtrump realdonaldtrump vote rally media debate vote realdonaldtrump president immigration support says new realdonaldtrump campaign campaign supporters topvideo win realdonaldtrump supporters realdonaldtrump says poll speech mexico tower breaking realdonaldtrump rally realdonaldtrump president melania support realdonaldtrump
Topic 1:
hillaryclinton hillaryclinton realdonaldtrump realdonaldtrump hillaryclinton debate campaign hillaryclinton campaign fbi emails poll foundation hillaryclinton foundation email new health debatenight hillaryclinton hillaryclinton vote hillaryclinton voters benghazi hillaryclinton president vote hillaryclinton politics neverhillary women supporters hillaryclinton says pneumonia prison lies money
Topic 2:
don don know don want don need care don care need don like let don let vote don think people don understand forget don understand don believe believe don forget w

# LDA #

In [21]:
# Raw term counts for the topic models below, built with the same stop-word,
# n-gram and min_df settings as the TF-IDF vectorizer.
count_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,2), min_df=10)
count_vectorized_tweets = count_vectorizer.fit_transform(clustering_text.values)

# gensim corpus view of the transposed count matrix (one column per document).
corpus = matutils.Sparse2Corpus(count_vectorized_tweets.transpose())

# Invert sklearn's term -> index vocabulary into the index -> term mapping
# passed to the gensim models.
id2word_lda = {index: term for term, index in count_vectorizer.vocabulary_.items()}


In [22]:
# 15-topic LDA trained with 20 passes over the count corpus. The seed pins the
# model's random initialisation, in line with the random_state=42 used for the
# NMF and HDP models. NOTE(review): training is multi-process, so confirm that
# runs are fully repeatable on the installed gensim version.
lda = models.LdaMulticore(corpus=corpus, num_topics=15, id2word=id2word_lda, passes=20, random_state=42)


2018-03-05 22:10:19,597 : INFO : using symmetric alpha at 0.06666666666666667
2018-03-05 22:10:19,598 : INFO : using symmetric eta at 0.06666666666666667
2018-03-05 22:10:19,600 : INFO : using serial LDA version on this node
2018-03-05 22:10:20,664 : INFO : running online LDA training, 15 topics, 20 passes over the supplied corpus of 51867 documents, updating every 14000 documents, evaluating every ~51867 documents, iterating 50x with a convergence threshold of 0.001000
2018-03-05 22:10:20,666 : INFO : training LDA model using 7 processes
2018-03-05 22:10:20,830 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #2000/51867, outstanding queue size 1
2018-03-05 22:10:20,937 : INFO : PROGRESS: pass 0, dispatched chunk #1 = documents up to #4000/51867, outstanding queue size 2
2018-03-05 22:10:21,038 : INFO : PROGRESS: pass 0, dispatched chunk #2 = documents up to #6000/51867, outstanding queue size 3
2018-03-05 22:10:21,139 : INFO : PROGRESS: pass 0, dispatched chunk #3 = d

In [23]:
# Show 20 terms per LDA topic. The topics appear twice in the cell output:
# once as INFO log lines and once as the returned list.
lda.print_topics(num_words=20)

2018-03-05 22:14:03,040 : INFO : topic #0 (0.067): 0.041*"need" + 0.019*"american" + 0.014*"realdonaldtrump" + 0.011*"water" + 0.011*"debatenight" + 0.010*"hillaryclinton" + 0.010*"vegasgopdebate" + 0.010*"know" + 0.010*"strong" + 0.009*"2016" + 0.009*"history" + 0.008*"phosphorusdisaster" + 0.008*"gopdebate" + 0.008*"voter" + 0.007*"candidate" + 0.007*"falls" + 0.007*"video" + 0.007*"debates" + 0.007*"lastminutegifts2016" + 0.007*"election"
2018-03-05 22:14:03,040 : INFO : topic #1 (0.067): 0.092*"people" + 0.026*"think" + 0.017*"don" + 0.011*"money" + 0.011*"care" + 0.010*"want" + 0.008*"black" + 0.008*"wall" + 0.007*"just" + 0.007*"pay" + 0.007*"did" + 0.007*"demdebate" + 0.006*"american" + 0.006*"sheriff" + 0.006*"problems" + 0.006*"like" + 0.005*"stupid" + 0.005*"demndebate" + 0.005*"david" + 0.005*"hillaryclinton"
2018-03-05 22:14:03,041 : INFO : topic #2 (0.067): 0.027*"right" + 0.020*"prayers4california" + 0.018*"gun" + 0.012*"just" + 0.012*"guns" + 0.011*"control" + 0.010*"hea

[(0,
  '0.041*"need" + 0.019*"american" + 0.014*"realdonaldtrump" + 0.011*"water" + 0.011*"debatenight" + 0.010*"hillaryclinton" + 0.010*"vegasgopdebate" + 0.010*"know" + 0.010*"strong" + 0.009*"2016" + 0.009*"history" + 0.008*"phosphorusdisaster" + 0.008*"gopdebate" + 0.008*"voter" + 0.007*"candidate" + 0.007*"falls" + 0.007*"video" + 0.007*"debates" + 0.007*"lastminutegifts2016" + 0.007*"election"'),
 (1,
  '0.092*"people" + 0.026*"think" + 0.017*"don" + 0.011*"money" + 0.011*"care" + 0.010*"want" + 0.008*"black" + 0.008*"wall" + 0.007*"just" + 0.007*"pay" + 0.007*"did" + 0.007*"demdebate" + 0.006*"american" + 0.006*"sheriff" + 0.006*"problems" + 0.006*"like" + 0.005*"stupid" + 0.005*"demndebate" + 0.005*"david" + 0.005*"hillaryclinton"'),
 (2,
  '0.027*"right" + 0.020*"prayers4california" + 0.018*"gun" + 0.012*"just" + 0.012*"guns" + 0.011*"control" + 0.010*"heart" + 0.010*"life" + 0.009*"oh" + 0.008*"2a" + 0.008*"point" + 0.008*"time" + 0.007*"second" + 0.007*"truth" + 0.007*"love"

In [24]:
# Apply the LDA model to every document, materialize the per-document
# (topic_id, probability) lists, and preview the first five.
lda_corpus = lda[corpus]
lda_docs = list(lda_corpus)
lda_docs[:5]

[[(3, 0.26706737), (11, 0.64626586)],
 [(0, 0.011111123),
  (1, 0.011111119),
  (2, 0.011111111),
  (3, 0.34971496),
  (4, 0.011111122),
  (5, 0.011111115),
  (6, 0.5058405),
  (7, 0.011111113),
  (8, 0.011111128),
  (9, 0.011111114),
  (10, 0.011111111),
  (11, 0.011111124),
  (12, 0.011111144),
  (13, 0.011111119),
  (14, 0.011111128)],
 [(0, 0.06666667),
  (1, 0.06666667),
  (2, 0.06666667),
  (3, 0.06666667),
  (4, 0.06666667),
  (5, 0.06666667),
  (6, 0.06666667),
  (7, 0.06666667),
  (8, 0.06666667),
  (9, 0.06666667),
  (10, 0.06666667),
  (11, 0.06666667),
  (12, 0.06666667),
  (13, 0.06666667),
  (14, 0.06666667)],
 [(0, 0.013333374),
  (1, 0.013333361),
  (2, 0.013333336),
  (3, 0.15132403),
  (4, 0.013333348),
  (5, 0.0133333355),
  (6, 0.0133333355),
  (7, 0.013333338),
  (8, 0.013333338),
  (9, 0.013333341),
  (10, 0.0133333355),
  (11, 0.0133333355),
  (12, 0.6753425),
  (13, 0.013333341),
  (14, 0.0133333355)],
 [(0, 0.7237306), (3, 0.167936)]]

In [25]:
# Sanity check: number of per-document topic lists (the training log reports 51867 documents).
len(lda_docs)

51867

In [26]:
# Per-topic term lists paired with a score; in the output each entry has the
# form ([(weight, term), ...], score).
# NOTE(review): the score is presumably gensim's default topic coherence — confirm.
lda.top_topics(corpus)

2018-03-05 22:14:40,726 : INFO : CorpusAccumulator accumulated stats from 1000 documents
2018-03-05 22:14:40,734 : INFO : CorpusAccumulator accumulated stats from 2000 documents
2018-03-05 22:14:40,741 : INFO : CorpusAccumulator accumulated stats from 3000 documents
2018-03-05 22:14:40,748 : INFO : CorpusAccumulator accumulated stats from 4000 documents
2018-03-05 22:14:40,756 : INFO : CorpusAccumulator accumulated stats from 5000 documents
2018-03-05 22:14:40,764 : INFO : CorpusAccumulator accumulated stats from 6000 documents
2018-03-05 22:14:40,771 : INFO : CorpusAccumulator accumulated stats from 7000 documents
2018-03-05 22:14:40,779 : INFO : CorpusAccumulator accumulated stats from 8000 documents
2018-03-05 22:14:40,786 : INFO : CorpusAccumulator accumulated stats from 9000 documents
2018-03-05 22:14:40,794 : INFO : CorpusAccumulator accumulated stats from 10000 documents
2018-03-05 22:14:40,801 : INFO : CorpusAccumulator accumulated stats from 11000 documents
2018-03-05 22:14:40

[([(0.046855733, 'don'),
   (0.020158034, 'just'),
   (0.019096669, 'know'),
   (0.0174484, 'like'),
   (0.01628358, 'make'),
   (0.015891112, 'let'),
   (0.013742514, 'll'),
   (0.012903472, 'way'),
   (0.012814646, 'things'),
   (0.0121584, 'oscarhasnocolor'),
   (0.010629916, 'good'),
   (0.010231976, 'teapartynews'),
   (0.010215997, 'want'),
   (0.01014447, 'feel'),
   (0.00969856, 'god'),
   (0.009584805, 'america'),
   (0.008208342, 'really'),
   (0.007830505, 'say'),
   (0.0076780855, 'change'),
   (0.0074653905, 'oscars')],
  -4.752068626873247),
 ([(0.112153135, 'realdonaldtrump'),
   (0.06901939, 'hillaryclinton'),
   (0.06240386, 'politics'),
   (0.018095834, 'says'),
   (0.014348176, 'new'),
   (0.013564908, 'debate'),
   (0.010744272, 'news'),
   (0.009936315, 'obama'),
   (0.009736702, 'gop'),
   (0.009428804, 'realdonaldtrump hillaryclinton'),
   (0.009184326, 'poll'),
   (0.007912887, 'hillaryclinton realdonaldtrump'),
   (0.007820748, 'ohio'),
   (0.007682375, 'voters

In [33]:
# Raw text of document 2, whose LDA topic distribution (lda_docs[2]) is uniform across all 15 topics.
# NOTE(review): this cell's execution count (33) is out of sequence with its neighbours.
clustering_text.values[2]

'@ModicaGiunta me, too!'

In [28]:
# Fit an HDP topic model on the same count corpus and index -> term mapping used for LDA (seeded).
hdp = models.HdpModel(corpus, id2word=id2word_lda, random_state=42)


2018-03-05 22:15:40,486 : INFO : (0, '0.001*forces + 0.001*iphone + 0.001*til + 0.001*realdonaldtrump + 0.001*baseball + 0.001*today realdonaldtrump + 0.001*ad + 0.001*bedroom + 0.001*river + 0.001*international')
2018-03-05 22:15:40,492 : INFO : (1, '0.002*ears + 0.001*guten + 0.001*directly + 0.001*realdonaldtrump + 0.001*hillaryclinton + 0.001*hosted + 0.001*cyber censorship + 0.001*pace + 0.001*trying save + 0.001*sheep')
2018-03-05 22:15:40,498 : INFO : (2, '0.002*notmypresident + 0.001*crazy + 0.001*special + 0.001*realdonaldtrump make + 0.001*helps + 0.001*want realdonaldtrump + 0.001*mlk + 0.001*2a prayers4california + 0.001*colors + 0.001*typical')
2018-03-05 22:15:40,504 : INFO : (3, '0.001*way control + 0.001*winner + 0.001*realdonaldtrump stop + 0.001*hillaryclinton team + 0.001*reporters + 0.001*transparent + 0.001*ahead + 0.001*highest + 0.001*whereshillary + 0.001*maga hillaryforprison2016')
2018-03-05 22:15:40,510 : INFO : (4, '0.001*realdonaldtrump + 0.001*delay + 0.00

In [29]:
# Keep (and log) 20 terms for each of up to 90 HDP topics.
# NOTE(review): 90 is unexplained; hdp_docs contains topic ids above 100
# (e.g. 137, 147), so this may not cover every topic — confirm.
topics = hdp.print_topics(num_topics=90, num_words=20)


2018-03-05 22:20:17,431 : INFO : (0, '0.005*realdonaldtrump + 0.004*hillaryclinton + 0.002*politics + 0.002*news + 0.002*ears + 0.001*obama + 0.001*guten + 0.001*america + 0.001*directly + 0.001*tcot + 0.001*think + 0.001*hosted + 0.001*rnc + 0.001*report + 0.001*cyber censorship + 0.001*pace + 0.001*don + 0.001*trying save + 0.001*people + 0.001*chant')
2018-03-05 22:20:17,438 : INFO : (1, '0.005*realdonaldtrump + 0.003*hillaryclinton + 0.002*obama + 0.002*politics + 0.001*forces + 0.001*news + 0.001*don + 0.001*like + 0.001*sign + 0.001*iphone + 0.001*national + 0.001*til + 0.001*say + 0.001*baseball + 0.001*want + 0.001*ad + 0.001*strong + 0.001*today realdonaldtrump + 0.001*trumpforpresident + 0.001*people')
2018-03-05 22:20:17,444 : INFO : (2, '0.005*realdonaldtrump + 0.004*hillaryclinton + 0.002*politics + 0.001*just + 0.001*obama + 0.001*people + 0.001*don + 0.001*books + 0.001*time + 0.001*eaten + 0.001*zero + 0.001*thingsmoretrustedthanhillary + 0.001*trumpforpresident + 0.001

In [30]:
# Apply the HDP model to every document, materialize the per-document
# (topic_id, weight) lists, and preview the first five.
hdp_corpus = hdp[corpus]
hdp_docs = list(hdp_corpus)
hdp_docs[:5]

[[(5, 0.44733569652744504), (84, 0.45400800573729094)],
 [(56, 0.834451617200027)],
 [],
 [(33, 0.5183615322920262), (137, 0.2843118016342204)],
 [(85, 0.3098128053921488), (96, 0.5668642784883587)]]

In [32]:
# Inspect the HDP topic assignments of documents 10 through 19.
hdp_docs[10:20]

[[(8, 0.8013655892744475)],
 [(6, 0.9236032379521291)],
 [(83, 0.9172370295274734)],
 [(80, 0.5033296966064782)],
 [(102, 0.7516666696004879)],
 [(40, 0.6689661054049558)],
 [(62, 0.503290503741422)],
 [(103, 0.668868668309872)],
 [(41, 0.3200050004572698), (147, 0.5390333067258386)],
 [(106, 0.900672293280412)]]

In [None]:
# Document whose first (topic_id, weight) entry is largest. Documents for
# which HDP returned no topics are empty lists (e.g. hdp_docs[2]); they are
# skipped because indexing them with [0] raises IndexError. Yields None when
# every document is empty.
max((doc for doc in hdp_docs if doc), key=lambda item: item[0], default=None)

In [None]:
def topic_prob_extractor(gensim_hdp):
    """Summarize each topic of a model by the summed weight of its shown terms.

    Args:
        gensim_hdp: model exposing ``show_topics(num_topics=-1,
            formatted=False)`` that returns a sequence of
            ``(topic_id, [(term, weight), ...])`` entries.

    Returns:
        pandas.DataFrame with one row per returned topic and two columns:
        ``topic_id`` and ``weight`` (the sum of the weights of the terms
        returned for that topic, not of the full vocabulary).
    """
    shown_topics = gensim_hdp.show_topics(num_topics=-1, formatted=False)

    topics_nos = []
    weights = []
    # Pair each id with its own term list. Looking the list up by position
    # (shown_topics[topic_id]) is only correct when ids are exactly 0..n-1 in
    # order, and raises IndexError otherwise.
    for entry in shown_topics:
        topic_id, terms = entry[0], entry[1]
        topics_nos.append(topic_id)
        weights.append(sum(item[1] for item in terms))

    return pd.DataFrame({'topic_id' : topics_nos, 'weight' : weights})


# LDA Clustering #

In [34]:
# Peek at the first three per-document LDA topic lists before clustering on them.
# The lists differ in length (document 0 has 2 entries, documents 1 and 2 have 15),
# presumably because low-probability topics are omitted — confirm before
# treating them as fixed-width vectors.
lda_docs[0:3]

[[(3, 0.26706737), (11, 0.64626586)],
 [(0, 0.011111123),
  (1, 0.011111119),
  (2, 0.011111111),
  (3, 0.34971496),
  (4, 0.011111122),
  (5, 0.011111115),
  (6, 0.5058405),
  (7, 0.011111113),
  (8, 0.011111128),
  (9, 0.011111114),
  (10, 0.011111111),
  (11, 0.011111124),
  (12, 0.011111144),
  (13, 0.011111119),
  (14, 0.011111128)],
 [(0, 0.06666667),
  (1, 0.06666667),
  (2, 0.06666667),
  (3, 0.06666667),
  (4, 0.06666667),
  (5, 0.06666667),
  (6, 0.06666667),
  (7, 0.06666667),
  (8, 0.06666667),
  (9, 0.06666667),
  (10, 0.06666667),
  (11, 0.06666667),
  (12, 0.06666667),
  (13, 0.06666667),
  (14, 0.06666667)]]