In [1]:
import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning)

In [4]:
import re
import unidecode
import numpy as np
import pandas as pd

In [39]:
import spacy
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import string

In [64]:
import wordninja
# Importing Gensim
import gensim
from gensim import corpora
from gensim.models import CoherenceModel

In [71]:
# Plotting tools
import pyLDAvis
#import pyLDAvis.gensim  # don't skip this
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt
%matplotlib inline

  and should_run_async(code)


In [68]:
!pip install pyLDAvis

Collecting pyLDAvis
  Downloading pyLDAvis-3.3.1.tar.gz (1.7 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Installing backend dependencies: started
  Installing backend dependencies: finished with status 'done'
    Preparing wheel metadata: started
    Preparing wheel metadata: finished with status 'done'
Collecting funcy
  Downloading funcy-1.16-py2.py3-none-any.whl (32 kB)
Collecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
Building wheels for collected packages: pyLDAvis, sklearn
  Building wheel for pyLDAvis (PEP 517): started
  Building wheel for pyLDAvis (PEP 517): finished with status 'done'
  Created wheel for pyLDAvis: filename=pyLDAvis-3.3.1-py2.py3-none-any.whl size=136897 sha256=e0dec6362355654a3f06eea32d1f1fdab399bc79ab02030f309ceb0b2dd2cd3f
  Stored in directory: c:\users\l_drago\appdata\lo

In [58]:
from pprint import pprint

In [5]:
df=pd.read_csv('Assignment 2_Web Data.csv')

In [6]:
#Selecting only entity with English Language.
df_English=df[df.Language=='English']

In [7]:
#selecting only SNO and Mention Content column as we have to Topic modeeling for comments only.
df_Eng_doc=df_English.loc[:,['SNO','Mention Content']]

In [8]:
#Dropping the rows which do not contain any Data
df_Eng_doc.dropna(inplace=True)

In [9]:
df_Eng_doc.head()

Unnamed: 0,SNO,Mention Content
0,1,90Hz #motog10power #motog30 #AsliAllRounderCha...
1,2,RT @ManojSaru: Another Giveaway for #TechGyan ...
2,3,Wow Magnificent With Amazing Tech Features #Po...
3,4,@motorolaindia A3) Think Shield Follow all pla...
4,5,RT @OnePlusClub: #OnePlus9Pro in Morning Mist


In [42]:
# stop loss words 
stop = set(stopwords.words('english'))

# punctuation 
exclude = set(string.punctuation) 

# lemmatization
lemma = WordNetLemmatizer() 

def clean_doc(text):
    
    # lower-case all characters
    text=text.lower()
    
    # remove twitter handles
    text= re.sub(r'@\S+', '',text) 
    
    # remove urls
    text= re.sub(r'http\S+', '',text) 
    text= re.sub(r'pic.\S+', '',text)
      
    # replace unidecode characters
    text=unidecode.unidecode(text)
      
    # regex only keeps characters
    text= re.sub(r"[^a-zA-Z+']", ' ',text)
    
    # keep words with length>1 only
    text=re.sub(r'\s+[a-zA-Z]\s+', ' ', text+' ') 

    # split words like 'whatisthis' to 'what is this'
    def preprocess_wordninja(sentence):      
        def split_words(x):
            x=wordninja.split(x)
            x= [word for word in x if len(word)>1]
            return x
        new_sentence=[ ' '.join(split_words(word)) for word in sentence.split() ]
        return ' '.join(new_sentence)
    
    text=preprocess_wordninja(text)
    
    # convert text into lower case + split into words
    stop_free = " ".join([i for i in text.lower().split() if i not in stop])
    
    # remove any stop words present
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)  
    
    # remove punctuations + normalize the text
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())  
 
    # regex removes repeated spaces, strip removes leading and trailing spaces
    text= re.sub("\s[\s]+", " ",normalized).strip()  
    
    return text

In [43]:
# clean data stored in a new list
clean_corpus = [clean_doc(doc).split() for doc in df_Eng_doc['Mention Content']]

In [44]:
clean_corpus[:5]

[['hz', 'moto', 'power', 'moto', 'li', 'rounder', 'challenge'],
 ['rt',
  'another',
  'giveaway',
  'tech',
  'gy',
  'family',
  'yes',
  'giving',
  'away',
  'opp',
  'o',
  'new',
  'smartphone',
  'opp',
  'pro',
  'forget',
  'participate',
  'participate',
  'link',
  'feel',
  'free',
  'like',
  'tweet',
  'post',
  'giveaway',
  'flaunt',
  'night'],
 ['wow',
  'magnificent',
  'amazing',
  'tech',
  'feature',
  'powerful',
  'rounder',
  'moto',
  'power',
  'motorola'],
 ['think',
  'shield',
  'follow',
  'platform',
  'li',
  'rounder',
  'challenge',
  'moto',
  'power',
  'tag',
  'friend'],
 ['rt', 'one', 'plus', 'pro', 'morning', 'mist']]

In [48]:
# Creating the term dictionary of our courpus that is of all the words (Sepcific to Genism syntax perspective), 
# where every unique term is assigned an index. 

dict_ = corpora.Dictionary(clean_corpus)
print(dict_)

Dictionary(7083 unique tokens: ['challenge', 'hz', 'li', 'moto', 'power']...)


In [49]:
# Converting list of documents (corpus) into Document Term Matrix using the dictionary 
doc_term_matrix = [dict_.doc2bow(i) for i in clean_corpus]
doc_term_matrix

[[(0, 1), (1, 1), (2, 1), (3, 2), (4, 1), (5, 1)],
 [(6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 2),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 2),
  (22, 2),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1)],
 [(3, 1),
  (4, 1),
  (5, 1),
  (27, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1)],
 [(0, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1)],
 [(24, 1), (25, 1), (42, 1), (43, 1), (44, 1), (45, 1)],
 [(23, 1),
  (25, 1),
  (40, 1),
  (44, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 2),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 1),
  (58, 2),
  (59, 1),
  (60, 1),
  (61, 1),
  (62, 2),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 2),
  (71, 1),
  (72, 1),
  (73, 1),
  (74, 1),
  (75, 1)],
 [(0, 1), (2, 1), (3, 1), (4, 1

In [50]:
# Creating the object for LDA model using gensim library

Lda = gensim.models.ldamodel.LdaModel

In [51]:
# Running and Training LDA model on the document term matrix.

ldamodel = Lda(doc_term_matrix, num_topics=20, id2word = dict_, passes=1, random_state=0, eval_every=None)

In [52]:
ldamodel.print_topics()

[(0,
  '0.023*"phone" + 0.020*"iphone" + 0.020*"business" + 0.020*"ki" + 0.017*"real" + 0.016*"nah" + 0.016*"thanks" + 0.016*"ye" + 0.015*"market" + 0.015*"hai"'),
 (1,
  '0.038*"watch" + 0.034*"price" + 0.022*"mane" + 0.015*"le" + 0.015*"moto" + 0.012*"htc" + 0.012*"rounder" + 0.011*"wildfire" + 0.011*"motorola" + 0.010*"r"'),
 (2,
  '0.290*"trak" + 0.157*"tech" + 0.145*"keep" + 0.007*"sharma" + 0.007*"ghosh" + 0.007*"moto" + 0.007*"rounder" + 0.006*"management" + 0.006*"supa" + 0.006*"power"'),
 (3,
  '0.046*"alert" + 0.037*"rt" + 0.037*"contest" + 0.034*"opp" + 0.029*"pro" + 0.021*"chance" + 0.020*"stand" + 0.019*"mall" + 0.016*"series" + 0.016*"camera"'),
 (4,
  '0.060*"samsung" + 0.048*"galaxy" + 0.047*"review" + 0.030*"giveaway" + 0.030*"sensor" + 0.027*"pro" + 0.022*"even" + 0.022*"opp" + 0.022*"get" + 0.021*"note"'),
 (5,
  '0.127*"moto" + 0.110*"rounder" + 0.085*"li" + 0.072*"power" + 0.067*"challenge" + 0.035*"motorola" + 0.022*"join" + 0.016*"mah" + 0.015*"shield" + 0.013*"s

In [53]:
print(ldamodel.print_topics(num_topics=6, num_words=5))

# num_topics mean: how many topics want to extract 
# num_words: the number of words that want per topic

[(8, '0.062*"sensor" + 0.040*"mode" + 0.037*"night" + 0.033*"contest" + 0.031*"li"'), (2, '0.290*"trak" + 0.157*"tech" + 0.145*"keep" + 0.007*"sharma" + 0.007*"ghosh"'), (15, '0.031*"rounder" + 0.030*"moto" + 0.025*"li" + 0.023*"motorola" + 0.017*"challenge"'), (1, '0.038*"watch" + 0.034*"price" + 0.022*"mane" + 0.015*"le" + 0.015*"moto"'), (16, '0.056*"rounder" + 0.035*"moto" + 0.033*"rt" + 0.032*"li" + 0.031*"challenge"'), (12, '0.069*"mi" + 0.042*"red" + 0.031*"note" + 0.029*"phone" + 0.025*"good"')]


In [54]:
# printing the topic associations with the documents
count = 0
for i in ldamodel[doc_term_matrix]:
    print("doc : ",count,i)
    count += 1

doc :  0 [(5, 0.8812497)]
doc :  1 [(11, 0.9660713)]
doc :  2 [(2, 0.09913396), (5, 0.7174303), (6, 0.10614443)]
doc :  3 [(5, 0.8326405), (7, 0.092356324)]
doc :  4 [(9, 0.8642854)]
doc :  5 [(10, 0.9176556), (16, 0.059266157)]
doc :  6 [(5, 0.89444405)]
doc :  7 [(0, 0.016666744), (1, 0.016666744), (2, 0.016666744), (3, 0.016666744), (4, 0.016666744), (5, 0.016666744), (6, 0.016666744), (7, 0.016666744), (8, 0.016666744), (9, 0.016666744), (10, 0.016666744), (11, 0.016666744), (12, 0.016666748), (13, 0.68333185), (14, 0.016666744), (15, 0.016666744), (16, 0.016666744), (17, 0.016666744), (18, 0.016666744), (19, 0.016666744)]
doc :  8 [(11, 0.9660713)]
doc :  9 [(0, 0.010000007), (1, 0.010000007), (2, 0.8099999), (3, 0.010000007), (4, 0.010000007), (5, 0.010000007), (6, 0.010000007), (7, 0.010000007), (8, 0.010000007), (9, 0.010000007), (10, 0.010000007), (11, 0.010000007), (12, 0.010000007), (13, 0.010000007), (14, 0.010000007), (15, 0.010000007), (16, 0.010000007), (17, 0.010000007)

doc :  309 [(6, 0.11715912), (10, 0.080763645), (11, 0.040679503), (12, 0.04648368), (19, 0.68704563)]
doc :  310 [(5, 0.1383533), (10, 0.13249883), (12, 0.129412), (14, 0.5687946)]
doc :  311 [(0, 0.025000058), (1, 0.025000058), (2, 0.025000058), (3, 0.025000058), (4, 0.025000058), (5, 0.025000058), (6, 0.025000058), (7, 0.025000058), (8, 0.025000058), (9, 0.025000058), (10, 0.025000058), (11, 0.025000058), (12, 0.025000058), (13, 0.52499884), (14, 0.025000058), (15, 0.025000058), (16, 0.025000058), (17, 0.025000058), (18, 0.025000058), (19, 0.025000058)]
doc :  312 [(5, 0.5020769), (18, 0.4416144)]
doc :  313 [(4, 0.28964934), (7, 0.0560562), (10, 0.4811152), (12, 0.13507685)]
doc :  314 [(13, 0.84166646)]
doc :  315 [(3, 0.9499996)]
doc :  316 [(3, 0.5560811), (9, 0.4117744)]
doc :  317 [(17, 0.9634614)]
doc :  318 [(0, 0.012517898), (1, 0.012517897), (2, 0.012517897), (3, 0.012517897), (4, 0.012517897), (5, 0.012517897), (6, 0.012517897), (7, 0.22891247), (8, 0.26283243), (9, 0.012

doc :  636 [(3, 0.13641267), (4, 0.5300668), (5, 0.12609497), (10, 0.18026562)]
doc :  637 [(5, 0.9441102)]
doc :  638 [(11, 0.9660713)]
doc :  639 [(0, 0.16462725), (1, 0.45625192), (2, 0.010019434), (3, 0.010019434), (4, 0.010019434), (5, 0.010019434), (6, 0.010019434), (7, 0.010019435), (8, 0.010019434), (9, 0.010019434), (10, 0.010019434), (11, 0.010019434), (12, 0.010019434), (13, 0.010019434), (14, 0.010019434), (15, 0.010019434), (16, 0.010019434), (17, 0.010019434), (18, 0.010019434), (19, 0.20879044)]
doc :  640 [(0, 0.05), (1, 0.05), (2, 0.05), (3, 0.05), (4, 0.05), (5, 0.05), (6, 0.05), (7, 0.05), (8, 0.05), (9, 0.05), (10, 0.05), (11, 0.05), (12, 0.05), (13, 0.05), (14, 0.05), (15, 0.05), (16, 0.05), (17, 0.05), (18, 0.05), (19, 0.05)]
doc :  641 [(9, 0.94411737)]
doc :  642 [(0, 0.0100001795), (1, 0.45517904), (2, 0.0100001795), (3, 0.0100001795), (4, 0.0100001795), (5, 0.0100001795), (6, 0.0100001795), (7, 0.0100001795), (8, 0.0100001795), (9, 0.0100001795), (10, 0.010000

doc :  956 [(10, 0.93666625)]
doc :  957 [(5, 0.9030174), (8, 0.04960339)]
doc :  958 [(3, 0.24945234), (7, 0.54676527), (16, 0.105568245), (17, 0.07770016)]
doc :  959 [(10, 0.88124883)]
doc :  960 [(0, 0.018231316), (1, 0.05640742), (2, 0.011477851), (3, 0.018090129), (5, 0.05743031), (6, 0.06528008), (7, 0.12208519), (9, 0.022983972), (10, 0.08159326), (13, 0.0635836), (14, 0.12583295), (15, 0.16083516), (16, 0.12945522), (19, 0.06497323)]
doc :  961 [(3, 0.059964035), (5, 0.8259313), (18, 0.06687428)]
doc :  962 [(0, 0.025000101), (1, 0.52499807), (2, 0.025000101), (3, 0.025000101), (4, 0.025000101), (5, 0.025000101), (6, 0.025000101), (7, 0.025000101), (8, 0.025000101), (9, 0.025000101), (10, 0.025000101), (11, 0.025000101), (12, 0.025000101), (13, 0.025000101), (14, 0.025000101), (15, 0.025000101), (16, 0.025000101), (17, 0.025000101), (18, 0.025000101), (19, 0.025000101)]
doc :  963 [(13, 0.84166646)]
doc :  964 [(3, 0.9604162)]
doc :  965 [(11, 0.9660713)]
doc :  966 [(3, 0.920

doc :  1271 [(5, 0.920833)]
doc :  1272 [(6, 0.05921281), (10, 0.1815695), (13, 0.044844463), (14, 0.07341029), (18, 0.6130639)]
doc :  1273 [(8, 0.5283701), (10, 0.19629963), (15, 0.22219554)]
doc :  1274 [(2, 0.71811163), (7, 0.21191515)]
doc :  1275 [(14, 0.967241)]
doc :  1276 [(11, 0.9660713)]
doc :  1277 [(1, 0.031701587), (5, 0.41619715), (8, 0.48466712), (15, 0.046917044)]
doc :  1278 [(5, 0.82139885), (15, 0.109368585)]
doc :  1279 [(7, 0.96199614)]
doc :  1280 [(5, 0.8812497)]
doc :  1281 [(6, 0.96724087)]
doc :  1282 [(3, 0.82689923), (7, 0.1355937)]
doc :  1283 [(7, 0.5707278), (9, 0.27926975)]
doc :  1284 [(0, 0.012504159), (1, 0.012504159), (2, 0.012504159), (3, 0.012504159), (4, 0.012504159), (5, 0.012504159), (6, 0.012504159), (7, 0.012504159), (8, 0.012504159), (9, 0.012504159), (10, 0.51185626), (11, 0.0125041595), (12, 0.012504159), (13, 0.012504159), (14, 0.012504159), (15, 0.26306885), (16, 0.012504159), (17, 0.012504159), (18, 0.012504159), (19, 0.012504159)]
doc 

doc :  1595 [(7, 0.85831785), (9, 0.077380046)]
doc :  1596 [(0, 0.010000006), (1, 0.010000006), (2, 0.80999994), (3, 0.010000006), (4, 0.010000006), (5, 0.010000006), (6, 0.010000006), (7, 0.010000006), (8, 0.010000006), (9, 0.010000006), (10, 0.010000006), (11, 0.010000006), (12, 0.010000006), (13, 0.010000006), (14, 0.010000006), (15, 0.010000006), (16, 0.010000006), (17, 0.010000006), (18, 0.010000006), (19, 0.010000006)]
doc :  1597 [(0, 0.010000007), (1, 0.010000007), (2, 0.8099999), (3, 0.010000007), (4, 0.010000007), (5, 0.010000007), (6, 0.010000007), (7, 0.010000007), (8, 0.010000007), (9, 0.010000007), (10, 0.010000007), (11, 0.010000007), (12, 0.010000007), (13, 0.010000007), (14, 0.010000007), (15, 0.010000007), (16, 0.010000007), (17, 0.010000007), (18, 0.010000007), (19, 0.010000007)]
doc :  1598 [(5, 0.6124246), (7, 0.021361252), (8, 0.033754915), (15, 0.19256891), (16, 0.12633596)]
doc :  1599 [(5, 0.90499836)]
doc :  1600 [(5, 0.51190484), (12, 0.020450715), (15, 0.44

doc :  1871 [(4, 0.9586954)]
doc :  1872 [(0, 0.12801306), (18, 0.031804513), (19, 0.8116451)]
doc :  1873 [(11, 0.9660713)]
doc :  1874 [(17, 0.9634614)]
doc :  1875 [(1, 0.123721525), (5, 0.7301324), (15, 0.116832614)]
doc :  1876 [(13, 0.28731725), (19, 0.5841009)]
doc :  1877 [(1, 0.06202216), (4, 0.3585628), (9, 0.5186828)]
doc :  1878 [(14, 0.13869084), (16, 0.6003444), (18, 0.23057897)]
doc :  1879 [(0, 0.08302699), (9, 0.09361995), (14, 0.33287725), (15, 0.4588465)]
doc :  1880 [(0, 0.25550872), (1, 0.10252853), (6, 0.18064101), (10, 0.3885632)]
doc :  1881 [(5, 0.69989514), (15, 0.15279399), (18, 0.130942)]
doc :  1882 [(13, 0.84166646)]
doc :  1883 [(5, 0.9366645)]
doc :  1884 [(0, 0.21788783), (1, 0.010417873), (2, 0.010417873), (3, 0.5945905), (4, 0.010417873), (5, 0.010417873), (6, 0.010417873), (7, 0.010417873), (8, 0.010417874), (9, 0.010417873), (10, 0.010417873), (11, 0.010417873), (12, 0.010417873), (13, 0.010417873), (14, 0.010417873), (15, 0.010417873), (16, 0.01041

doc :  2147 [(3, 0.25735098), (10, 0.66082966)]
doc :  2148 [(2, 0.8142668), (9, 0.10386375)]
doc :  2149 [(5, 0.92083263)]
doc :  2150 [(5, 0.6009829), (15, 0.37986088)]
doc :  2151 [(7, 0.97563934)]
doc :  2152 [(0, 0.05), (1, 0.05), (2, 0.05), (3, 0.05), (4, 0.05), (5, 0.05), (6, 0.05), (7, 0.05), (8, 0.05), (9, 0.05), (10, 0.05), (11, 0.05), (12, 0.05), (13, 0.05), (14, 0.05), (15, 0.05), (16, 0.05), (17, 0.05), (18, 0.05), (19, 0.05)]
doc :  2153 [(5, 0.43065548), (15, 0.5264853)]
doc :  2154 [(5, 0.25292218), (15, 0.7180423)]
doc :  2155 [(3, 0.9499995)]
doc :  2156 [(0, 0.36488944), (1, 0.016667025), (2, 0.016667025), (3, 0.016667025), (4, 0.016667025), (5, 0.016667025), (6, 0.016667025), (7, 0.016667025), (8, 0.016667025), (9, 0.016667025), (10, 0.016667025), (11, 0.016667025), (12, 0.016667025), (13, 0.33510405), (14, 0.016667025), (15, 0.016667025), (16, 0.016667025), (17, 0.016667025), (18, 0.016667025), (19, 0.016667025)]
doc :  2157 [(0, 0.016666943), (1, 0.016666943), (2,

doc :  2446 [(5, 0.89444405)]
doc :  2447 [(5, 0.49229696), (14, 0.06072488), (15, 0.39974973)]
doc :  2448 [(13, 0.84166646)]
doc :  2449 [(3, 0.9499995)]
doc :  2450 [(0, 0.016666751), (1, 0.016666751), (2, 0.016666751), (3, 0.016666751), (4, 0.016666751), (5, 0.016666751), (6, 0.016666751), (7, 0.016666751), (8, 0.016666751), (9, 0.016666751), (10, 0.016666751), (11, 0.016666751), (12, 0.016666751), (13, 0.6833317), (14, 0.016666751), (15, 0.016666751), (16, 0.016666751), (17, 0.016666751), (18, 0.016666751), (19, 0.016666751)]
doc :  2451 [(5, 0.8812497)]
doc :  2452 [(0, 0.016667226), (1, 0.35000736), (2, 0.016667226), (3, 0.016667226), (4, 0.016667226), (5, 0.016667226), (6, 0.016667226), (7, 0.016667226), (8, 0.016667226), (9, 0.016667226), (10, 0.34998256), (11, 0.016667226), (12, 0.016667226), (13, 0.016667226), (14, 0.016667226), (15, 0.016667226), (16, 0.016667226), (17, 0.016667226), (18, 0.016667226), (19, 0.016667226)]
doc :  2453 [(5, 0.89444405)]
doc :  2454 [(9, 0.1298

doc :  2705 [(0, 0.20640454), (2, 0.21365823), (3, 0.14652243), (6, 0.09519481), (10, 0.20997831), (14, 0.094852045)]
doc :  2706 [(4, 0.9586954)]
doc :  2707 [(13, 0.84166646)]
doc :  2708 [(3, 0.96199924)]
doc :  2709 [(0, 0.16791813), (1, 0.13748546), (4, 0.07966738), (5, 0.5008778), (18, 0.09131793)]
doc :  2710 [(5, 0.89444417)]
doc :  2711 [(17, 0.9634614)]
doc :  2712 [(0, 0.057277367), (4, 0.1346033), (5, 0.43119827), (7, 0.047073055), (12, 0.113070555), (13, 0.03073726), (14, 0.1072219), (15, 0.047057662), (17, 0.020295106)]
doc :  2713 [(6, 0.25485766), (7, 0.19261794), (10, 0.2433303), (14, 0.039803553), (16, 0.03770041), (17, 0.1698382), (18, 0.038629413)]
doc :  2714 [(0, 0.010000088), (1, 0.63609076), (2, 0.010000088), (3, 0.010000088), (4, 0.010000088), (5, 0.010000088), (6, 0.010000088), (7, 0.010000088), (8, 0.010000088), (9, 0.010000088), (10, 0.010000088), (11, 0.18390769), (12, 0.010000088), (13, 0.010000088), (14, 0.010000088), (15, 0.010000088), (16, 0.010000088),

doc :  3030 [(5, 0.30208567), (8, 0.6505457)]
doc :  3031 [(10, 0.47987944), (14, 0.44511512)]
doc :  3032 [(14, 0.96833295)]
doc :  3033 [(0, 0.07096233), (3, 0.043308042), (4, 0.17388599), (6, 0.113608316), (7, 0.19786178), (10, 0.101348795), (14, 0.10811501), (16, 0.057816606), (17, 0.11106632)]
doc :  3034 [(0, 0.010000007), (1, 0.010000007), (2, 0.8099999), (3, 0.010000007), (4, 0.010000007), (5, 0.010000007), (6, 0.010000007), (7, 0.010000007), (8, 0.010000007), (9, 0.010000007), (10, 0.010000007), (11, 0.010000007), (12, 0.010000007), (13, 0.010000007), (14, 0.010000007), (15, 0.010000007), (16, 0.010000007), (17, 0.010000007), (18, 0.010000007), (19, 0.010000007)]
doc :  3035 [(13, 0.84166646)]
doc :  3036 [(5, 0.9269216)]
doc :  3037 [(0, 0.05), (1, 0.05), (2, 0.05), (3, 0.05), (4, 0.05), (5, 0.05), (6, 0.05), (7, 0.05), (8, 0.05), (9, 0.05), (10, 0.05), (11, 0.05), (12, 0.05), (13, 0.05), (14, 0.05), (15, 0.05), (16, 0.05), (17, 0.05), (18, 0.05), (19, 0.05)]
doc :  3038 [(13

doc :  3294 [(3, 0.23395252), (8, 0.06136794), (10, 0.27973408), (16, 0.3849336)]
doc :  3295 [(3, 0.527658), (4, 0.3723412)]
doc :  3296 [(0, 0.010000006), (1, 0.010000006), (2, 0.80999994), (3, 0.010000006), (4, 0.010000006), (5, 0.010000006), (6, 0.010000006), (7, 0.010000006), (8, 0.010000006), (9, 0.010000006), (10, 0.010000006), (11, 0.010000006), (12, 0.010000006), (13, 0.010000006), (14, 0.010000006), (15, 0.010000006), (16, 0.010000006), (17, 0.010000006), (18, 0.010000006), (19, 0.010000006)]
doc :  3297 [(11, 0.9660713)]
doc :  3298 [(0, 0.010000007), (1, 0.010000007), (2, 0.8099999), (3, 0.010000007), (4, 0.010000007), (5, 0.010000007), (6, 0.010000007), (7, 0.010000007), (8, 0.010000007), (9, 0.010000007), (10, 0.010000007), (11, 0.010000007), (12, 0.010000007), (13, 0.010000007), (14, 0.010000007), (15, 0.010000007), (16, 0.010000007), (17, 0.010000007), (18, 0.010000007), (19, 0.010000007)]
doc :  3299 [(5, 0.73886466), (15, 0.1483429), (17, 0.06276052)]
doc :  3300 [(13

doc :  3527 [(6, 0.42041957), (7, 0.17854337), (8, 0.20862071), (10, 0.12571923)]
doc :  3528 [(0, 0.010000267), (1, 0.010000267), (2, 0.010000267), (3, 0.010000267), (4, 0.010000267), (5, 0.010000267), (6, 0.29599625), (7, 0.010000267), (8, 0.010000267), (9, 0.010000267), (10, 0.010000267), (11, 0.010000267), (12, 0.24323753), (13, 0.010000267), (14, 0.010000267), (15, 0.010000267), (16, 0.010000267), (17, 0.010000267), (18, 0.010000267), (19, 0.29076174)]
doc :  3529 [(9, 0.33047068), (16, 0.61952835)]
doc :  3530 [(16, 0.96607125)]
doc :  3531 [(5, 0.904999)]
doc :  3532 [(17, 0.9634614)]
doc :  3533 [(5, 0.9049994)]
doc :  3534 [(3, 0.5561013), (9, 0.41175416)]
doc :  3535 [(0, 0.05), (1, 0.05), (2, 0.05), (3, 0.05), (4, 0.05), (5, 0.05), (6, 0.05), (7, 0.05), (8, 0.05), (9, 0.05), (10, 0.05), (11, 0.05), (12, 0.05), (13, 0.05), (14, 0.05), (15, 0.05), (16, 0.05), (17, 0.05), (18, 0.05), (19, 0.05)]
doc :  3536 [(1, 0.12228742), (3, 0.038752276), (5, 0.54961437), (12, 0.12713373), 

doc :  3852 [(11, 0.11141655), (14, 0.3348633), (15, 0.34791857), (17, 0.116899185)]
doc :  3853 [(9, 0.9634587)]
doc :  3854 [(13, 0.84166557)]
doc :  3855 [(0, 0.010000007), (1, 0.010000007), (2, 0.8099999), (3, 0.010000007), (4, 0.010000007), (5, 0.010000007), (6, 0.010000007), (7, 0.010000007), (8, 0.010000007), (9, 0.010000007), (10, 0.010000007), (11, 0.010000007), (12, 0.010000007), (13, 0.010000007), (14, 0.010000007), (15, 0.010000007), (16, 0.010000007), (17, 0.010000007), (18, 0.010000007), (19, 0.010000007)]
doc :  3856 [(5, 0.9110945), (7, 0.04799036)]
doc :  3857 [(1, 0.10753237), (4, 0.08605179), (6, 0.16818665), (7, 0.11124429), (9, 0.14062896), (14, 0.17109369), (17, 0.08932964), (18, 0.09858207)]
doc :  3858 [(5, 0.92083204)]
doc :  3859 [(5, 0.904999)]
doc :  3860 [(0, 0.3331542), (4, 0.42068198), (9, 0.113077335), (14, 0.07972522)]
doc :  3861 [(11, 0.9660713)]
doc :  3862 [(0, 0.025006356), (1, 0.025006356), (2, 0.025006356), (3, 0.025006356), (4, 0.025006356), (5,

doc :  4192 [(9, 0.12069149), (11, 0.2117839), (14, 0.63056654)]
doc :  4193 [(3, 0.9499995)]
doc :  4194 [(5, 0.90499943)]
doc :  4195 [(5, 0.7929045), (15, 0.13785289)]
doc :  4196 [(0, 0.010000007), (1, 0.010000007), (2, 0.8099999), (3, 0.010000007), (4, 0.010000007), (5, 0.010000007), (6, 0.010000007), (7, 0.010000007), (8, 0.010000007), (9, 0.010000007), (10, 0.010000007), (11, 0.010000007), (12, 0.010000007), (13, 0.010000007), (14, 0.010000007), (15, 0.010000007), (16, 0.010000007), (17, 0.010000007), (18, 0.010000007), (19, 0.010000007)]
doc :  4197 [(0, 0.010000007), (1, 0.010000007), (2, 0.8099999), (3, 0.010000007), (4, 0.010000007), (5, 0.010000007), (6, 0.010000007), (7, 0.010000007), (8, 0.010000007), (9, 0.010000007), (10, 0.010000007), (11, 0.010000007), (12, 0.010000007), (13, 0.010000007), (14, 0.010000007), (15, 0.010000007), (16, 0.010000007), (17, 0.010000007), (18, 0.010000007), (19, 0.010000007)]
doc :  4198 [(2, 0.6753871), (4, 0.17452599)]
doc :  4199 [(3, 0.59

doc :  4526 [(6, 0.10669331), (10, 0.58447826), (11, 0.079104915), (18, 0.16817272)]
doc :  4527 [(0, 0.025000948), (1, 0.025000948), (2, 0.025000948), (3, 0.025000948), (4, 0.025000948), (5, 0.025000948), (6, 0.025000948), (7, 0.025000948), (8, 0.025000948), (9, 0.025000948), (10, 0.025000948), (11, 0.025000948), (12, 0.025000948), (13, 0.025000948), (14, 0.524982), (15, 0.025000948), (16, 0.025000948), (17, 0.025000948), (18, 0.025000948), (19, 0.025000948)]
doc :  4528 [(1, 0.1685788), (5, 0.46506757), (15, 0.25046527), (16, 0.08625113)]
doc :  4529 [(1, 0.16121818), (5, 0.66245157), (18, 0.12910312)]
doc :  4530 [(13, 0.84166646)]
doc :  4531 [(10, 0.21852854), (12, 0.15000851), (18, 0.15913771), (19, 0.39959303)]
doc :  4532 [(0, 0.025000107), (1, 0.025000107), (2, 0.025000107), (3, 0.025000107), (4, 0.025000107), (5, 0.025000107), (6, 0.025000107), (7, 0.025000107), (8, 0.025000107), (9, 0.025000107), (10, 0.025000107), (11, 0.025000107), (12, 0.025000107), (13, 0.52499795), (14,

doc :  4795 [(2, 0.02656474), (5, 0.89297247), (12, 0.038252603), (18, 0.0255377)]
doc :  4796 [(5, 0.86428463)]
doc :  4797 [(0, 0.13660242), (6, 0.07311564), (7, 0.10177118), (10, 0.415148), (16, 0.20517662)]
doc :  4798 [(0, 0.010000006), (1, 0.010000006), (2, 0.80999994), (3, 0.010000006), (4, 0.010000006), (5, 0.010000006), (6, 0.010000006), (7, 0.010000006), (8, 0.010000006), (9, 0.010000006), (10, 0.010000006), (11, 0.010000006), (12, 0.010000006), (13, 0.010000006), (14, 0.010000006), (15, 0.010000006), (16, 0.010000006), (17, 0.010000006), (18, 0.010000006), (19, 0.010000006)]
doc :  4799 [(2, 0.68375516), (12, 0.18767242)]
doc :  4800 [(0, 0.3118194), (7, 0.33259916), (9, 0.20392199), (19, 0.090099886)]
doc :  4801 [(5, 0.894444)]
doc :  4802 [(0, 0.32341382), (1, 0.37656814), (2, 0.016667668), (3, 0.016667668), (4, 0.016667668), (5, 0.016667668), (6, 0.016667668), (7, 0.016667668), (8, 0.016667668), (9, 0.016667668), (10, 0.016667668), (11, 0.016667668), (12, 0.016667668), (

doc :  5085 [(1, 0.058459204), (3, 0.087605424), (6, 0.13563709), (9, 0.219477), (10, 0.46131486)]
doc :  5086 [(0, 0.016667528), (1, 0.016667528), (2, 0.016667528), (3, 0.016667528), (4, 0.016667528), (5, 0.016667528), (6, 0.016667528), (7, 0.016667528), (8, 0.016667528), (9, 0.016667528), (10, 0.016667528), (11, 0.016667528), (12, 0.016667537), (13, 0.35000992), (14, 0.016667528), (15, 0.016667528), (16, 0.016667528), (17, 0.016667528), (18, 0.016667528), (19, 0.34997454)]
doc :  5087 [(4, 0.7348312), (5, 0.234134)]
doc :  5088 [(1, 0.74174047), (5, 0.22922693)]
doc :  5089 [(1, 0.10304697), (5, 0.84064263)]
doc :  5090 [(0, 0.025000526), (1, 0.025000526), (2, 0.025000526), (3, 0.025000526), (4, 0.025000526), (5, 0.025000526), (6, 0.025000526), (7, 0.025000526), (8, 0.025000526), (9, 0.025000526), (10, 0.025000526), (11, 0.025000526), (12, 0.025000526), (13, 0.025000526), (14, 0.52498996), (15, 0.025000526), (16, 0.025000526), (17, 0.025000526), (18, 0.025000526), (19, 0.02500053)]
d

doc :  5407 [(5, 0.6622702), (8, 0.14134948), (15, 0.13971049)]
doc :  5408 [(0, 0.010000006), (1, 0.010000006), (2, 0.80999994), (3, 0.010000006), (4, 0.010000006), (5, 0.010000006), (6, 0.010000006), (7, 0.010000006), (8, 0.010000006), (9, 0.010000006), (10, 0.010000006), (11, 0.010000006), (12, 0.010000006), (13, 0.010000006), (14, 0.010000006), (15, 0.010000006), (16, 0.010000006), (17, 0.010000006), (18, 0.010000006), (19, 0.010000006)]
doc :  5409 [(0, 0.012500937), (1, 0.51245046), (2, 0.012500937), (3, 0.012500937), (4, 0.012500937), (5, 0.012500937), (6, 0.012500937), (7, 0.012500937), (8, 0.012500937), (9, 0.012500937), (10, 0.012500937), (11, 0.012500937), (12, 0.012500937), (13, 0.26253268), (14, 0.012500937), (15, 0.012500937), (16, 0.012500937), (17, 0.012500937), (18, 0.012500937), (19, 0.012500937)]
doc :  5410 [(0, 0.010000007), (1, 0.010000007), (2, 0.8099999), (3, 0.010000007), (4, 0.010000007), (5, 0.010000007), (6, 0.010000007), (7, 0.010000007), (8, 0.010000007), 

doc :  5696 [(5, 0.30201402), (8, 0.6506174)]
doc :  5697 [(17, 0.9634614)]
doc :  5698 [(1, 0.15283996), (7, 0.6091331), (9, 0.07797859), (17, 0.10667169)]
doc :  5699 [(1, 0.12757607), (5, 0.41236478), (15, 0.40693268)]
doc :  5700 [(5, 0.91363555)]
doc :  5701 [(17, 0.9634614)]
doc :  5702 [(12, 0.9743239)]
doc :  5703 [(0, 0.2623816), (1, 0.012501283), (2, 0.012501284), (3, 0.012501283), (4, 0.012501283), (5, 0.012501284), (6, 0.012501283), (7, 0.012501283), (8, 0.012501283), (9, 0.012501283), (10, 0.012501283), (11, 0.012501283), (12, 0.012501283), (13, 0.012501283), (14, 0.012501283), (15, 0.5125953), (16, 0.012501283), (17, 0.012501283), (18, 0.012501283), (19, 0.012501283)]
doc :  5704 [(17, 0.9634614)]
doc :  5705 [(5, 0.8020108), (15, 0.086652644), (18, 0.06409112)]
doc :  5706 [(0, 0.012500255), (1, 0.012500255), (2, 0.012500255), (3, 0.012500255), (4, 0.012500255), (5, 0.012500255), (6, 0.012500255), (7, 0.26231393), (8, 0.012500255), (9, 0.012500255), (10, 0.012500255), (1

doc :  6029 [(5, 0.18669537), (8, 0.75705445)]
doc :  6030 [(5, 0.9208327)]
doc :  6031 [(5, 0.19090912), (6, 0.18667094), (7, 0.27007303), (8, 0.17716517), (12, 0.10696867)]
doc :  6032 [(5, 0.9366651)]
doc :  6033 [(5, 0.88124955)]
doc :  6034 [(17, 0.9634614)]
doc :  6035 [(5, 0.90499955)]
doc :  6036 [(0, 0.05), (1, 0.05), (2, 0.05), (3, 0.05), (4, 0.05), (5, 0.05), (6, 0.05), (7, 0.05), (8, 0.05), (9, 0.05), (10, 0.05), (11, 0.05), (12, 0.05), (13, 0.05), (14, 0.05), (15, 0.05), (16, 0.05), (17, 0.05), (18, 0.05), (19, 0.05)]
doc :  6037 [(5, 0.11344501), (8, 0.8336136)]
doc :  6038 [(0, 0.12850368), (6, 0.15656704), (7, 0.058304746), (8, 0.08435323), (15, 0.07268322), (17, 0.2072956), (18, 0.25165346)]
doc :  6039 [(2, 0.032792933), (3, 0.09294549), (5, 0.54438037), (10, 0.13419153), (15, 0.17425393)]
doc :  6040 [(17, 0.9634614)]
doc :  6041 [(0, 0.04364649), (3, 0.039026298), (8, 0.037006367), (10, 0.083045244), (12, 0.070754774), (15, 0.054237686), (18, 0.45466623), (19, 0.197

doc :  6213 [(0, 0.010000007), (1, 0.010000007), (2, 0.8099999), (3, 0.010000007), (4, 0.010000007), (5, 0.010000007), (6, 0.010000007), (7, 0.010000007), (8, 0.010000007), (9, 0.010000007), (10, 0.010000007), (11, 0.010000007), (12, 0.010000007), (13, 0.010000007), (14, 0.010000007), (15, 0.010000007), (16, 0.010000007), (17, 0.010000007), (18, 0.010000007), (19, 0.010000007)]
doc :  6214 [(3, 0.94999945)]
doc :  6215 [(1, 0.31638834), (4, 0.17744619), (6, 0.36449128)]
doc :  6216 [(5, 0.89443946)]
doc :  6217 [(0, 0.010000006), (1, 0.010000006), (2, 0.80999994), (3, 0.010000006), (4, 0.010000006), (5, 0.010000006), (6, 0.010000006), (7, 0.010000006), (8, 0.010000006), (9, 0.010000006), (10, 0.010000006), (11, 0.010000006), (12, 0.010000006), (13, 0.010000006), (14, 0.010000006), (15, 0.010000006), (16, 0.010000006), (17, 0.010000006), (18, 0.010000006), (19, 0.010000006)]
doc :  6218 [(5, 0.9049996)]
doc :  6219 [(5, 0.9136358)]
doc :  6220 [(5, 0.9136358)]
doc :  6221 [(7, 0.2310823

doc :  6485 [(0, 0.24086662), (1, 0.081413016), (4, 0.09555411), (7, 0.15505931), (8, 0.12736222), (11, 0.08043896), (12, 0.16928264)]
doc :  6486 [(6, 0.8416625)]
doc :  6487 [(0, 0.010000007), (1, 0.010000007), (2, 0.8099999), (3, 0.010000007), (4, 0.010000007), (5, 0.010000007), (6, 0.010000007), (7, 0.010000007), (8, 0.010000007), (9, 0.010000007), (10, 0.010000007), (11, 0.010000007), (12, 0.010000007), (13, 0.010000007), (14, 0.010000007), (15, 0.010000007), (16, 0.010000007), (17, 0.010000007), (18, 0.010000007), (19, 0.010000007)]
doc :  6488 [(5, 0.936653)]
doc :  6489 [(5, 0.9269224)]
doc :  6490 [(0, 0.012500079), (1, 0.7624985), (2, 0.012500079), (3, 0.012500079), (4, 0.012500079), (5, 0.012500079), (6, 0.012500079), (7, 0.012500079), (8, 0.012500079), (9, 0.012500079), (10, 0.012500079), (11, 0.012500079), (12, 0.012500079), (13, 0.012500079), (14, 0.012500079), (15, 0.012500079), (16, 0.012500079), (17, 0.012500079), (18, 0.012500079), (19, 0.012500079)]
doc :  6491 [(4, 

doc :  6809 [(17, 0.9634614)]
doc :  6810 [(13, 0.84166646)]
doc :  6811 [(0, 0.016666783), (1, 0.016666783), (2, 0.016666783), (3, 0.016666783), (4, 0.016666783), (5, 0.016666783), (6, 0.016666783), (7, 0.016666783), (8, 0.016666783), (9, 0.016666783), (10, 0.016666783), (11, 0.016666783), (12, 0.016666783), (13, 0.68333113), (14, 0.016666783), (15, 0.016666783), (16, 0.016666783), (17, 0.016666783), (18, 0.016666783), (19, 0.016666783)]
doc :  6812 [(11, 0.9660713)]
doc :  6813 [(0, 0.010000007), (1, 0.010000007), (2, 0.8099999), (3, 0.010000007), (4, 0.010000007), (5, 0.010000007), (6, 0.010000007), (7, 0.010000007), (8, 0.010000007), (9, 0.010000007), (10, 0.010000007), (11, 0.010000007), (12, 0.010000007), (13, 0.010000007), (14, 0.010000007), (15, 0.010000007), (16, 0.010000007), (17, 0.010000007), (18, 0.010000007), (19, 0.010000007)]
doc :  6814 [(0, 0.04494192), (1, 0.13688646), (5, 0.45290068), (7, 0.23994957), (15, 0.058412638), (16, 0.039982315)]
doc :  6815 [(0, 0.01250008

doc :  7154 [(1, 0.14547381), (3, 0.044909783), (4, 0.09113946), (6, 0.060966678), (7, 0.19835767), (12, 0.2835592), (16, 0.04637926), (17, 0.048344046), (18, 0.057889998)]
doc :  7155 [(3, 0.9499996)]
doc :  7156 [(0, 0.05), (1, 0.05), (2, 0.05), (3, 0.05), (4, 0.05), (5, 0.05), (6, 0.05), (7, 0.05), (8, 0.05), (9, 0.05), (10, 0.05), (11, 0.05), (12, 0.05), (13, 0.05), (14, 0.05), (15, 0.05), (16, 0.05), (17, 0.05), (18, 0.05), (19, 0.05)]
doc :  7157 [(2, 0.57828593), (9, 0.12507193), (12, 0.1752126)]
doc :  7158 [(0, 0.016666746), (1, 0.016666746), (2, 0.016666746), (3, 0.016666746), (4, 0.3860159), (5, 0.016666746), (6, 0.016666746), (7, 0.016666746), (8, 0.016666746), (9, 0.016666746), (10, 0.016666746), (11, 0.016666746), (12, 0.016666746), (13, 0.31398267), (14, 0.016666746), (15, 0.016666746), (16, 0.016666746), (17, 0.016666746), (18, 0.016666746), (19, 0.016666746)]
doc :  7159 [(0, 0.010000006), (1, 0.010000006), (2, 0.80999994), (3, 0.010000006), (4, 0.010000006), (5, 0.010

doc :  7417 [(5, 0.78615916), (19, 0.1446091)]
doc :  7418 [(0, 0.016666714), (1, 0.016666714), (2, 0.016666714), (3, 0.016666714), (4, 0.016666714), (5, 0.016666714), (6, 0.016666714), (7, 0.016666714), (8, 0.016666714), (9, 0.016666714), (10, 0.016666714), (11, 0.016666714), (12, 0.016666714), (13, 0.68333244), (14, 0.016666714), (15, 0.016666714), (16, 0.016666714), (17, 0.016666714), (18, 0.016666714), (19, 0.016666714)]
doc :  7419 [(1, 0.07697575), (3, 0.040523257), (5, 0.4229836), (15, 0.2542906), (16, 0.1843588)]
doc :  7420 [(7, 0.94062436)]
doc :  7421 [(1, 0.43000537), (2, 0.040084783), (5, 0.36998436), (12, 0.03989726), (15, 0.10560359)]
doc :  7422 [(1, 0.10222794), (5, 0.55758184), (8, 0.105949365), (15, 0.19785835)]
doc :  7423 [(7, 0.29697883), (10, 0.18493398), (14, 0.22283186), (19, 0.22858116)]
doc :  7424 [(1, 0.09722314), (6, 0.06794626), (7, 0.3035286), (10, 0.47796276)]
doc :  7425 [(17, 0.9634614)]
doc :  7426 [(15, 0.9472204)]
doc :  7427 [(3, 0.9499995)]
doc :

doc :  7708 [(9, 0.5743472), (10, 0.3506519)]
doc :  7709 [(3, 0.19397084), (7, 0.74002796), (10, 0.043629825)]
doc :  7710 [(0, 0.010000007), (1, 0.010000007), (2, 0.8099999), (3, 0.010000007), (4, 0.010000007), (5, 0.010000007), (6, 0.010000007), (7, 0.010000007), (8, 0.010000007), (9, 0.010000007), (10, 0.010000007), (11, 0.010000007), (12, 0.010000007), (13, 0.010000007), (14, 0.010000007), (15, 0.010000007), (16, 0.010000007), (17, 0.010000007), (18, 0.010000007), (19, 0.010000007)]
doc :  7711 [(0, 0.016666725), (1, 0.6833322), (2, 0.016666725), (3, 0.016666725), (4, 0.016666725), (5, 0.016666725), (6, 0.016666725), (7, 0.016666725), (8, 0.016666725), (9, 0.016666725), (10, 0.016666725), (11, 0.016666725), (12, 0.016666725), (13, 0.016666725), (14, 0.016666725), (15, 0.016666725), (16, 0.016666725), (17, 0.016666725), (18, 0.016666725), (19, 0.016666725)]
doc :  7712 [(11, 0.9660713)]
doc :  7713 [(11, 0.9660713)]
doc :  7714 [(0, 0.30800632), (5, 0.07460321), (7, 0.30926785), (9

doc :  8057 [(16, 0.96607125)]
doc :  8058 [(0, 0.05352812), (9, 0.76800233), (12, 0.06734773), (13, 0.071116164)]
doc :  8059 [(5, 0.8944438)]
doc :  8060 [(10, 0.75462174), (19, 0.13285983)]
doc :  8061 [(17, 0.9634614)]
doc :  8062 [(16, 0.96607125)]
doc :  8063 [(0, 0.18400803), (7, 0.3199423), (9, 0.13552298), (10, 0.1352905), (12, 0.13147135)]
doc :  8064 [(0, 0.010000006), (1, 0.010000006), (2, 0.80999994), (3, 0.010000006), (4, 0.010000006), (5, 0.010000006), (6, 0.010000006), (7, 0.010000006), (8, 0.010000006), (9, 0.010000006), (10, 0.010000006), (11, 0.010000006), (12, 0.010000006), (13, 0.010000006), (14, 0.010000006), (15, 0.010000006), (16, 0.010000006), (17, 0.010000006), (18, 0.010000006), (19, 0.010000006)]
doc :  8065 [(16, 0.96607125)]
doc :  8066 [(1, 0.40100157), (6, 0.18356597), (15, 0.07225011), (16, 0.19111972), (18, 0.11037163)]
doc :  8067 [(17, 0.9634614)]
doc :  8068 [(9, 0.8642854)]
doc :  8069 [(5, 0.16192874), (16, 0.5656754), (19, 0.2500272)]
doc :  8070

doc :  8293 [(0, 0.039730977), (1, 0.101014666), (4, 0.06495848), (5, 0.37324134), (13, 0.07238571), (15, 0.3286612)]
doc :  8294 [(2, 0.71541405), (16, 0.15595636)]
doc :  8295 [(1, 0.15803392), (7, 0.16425617), (13, 0.12691495), (14, 0.43650043)]
doc :  8296 [(3, 0.17407262), (10, 0.69732916)]
doc :  8297 [(2, 0.6930438), (13, 0.15695578)]
doc :  8298 [(0, 0.38518366), (3, 0.06520421), (4, 0.10621712), (5, 0.06486664), (7, 0.22765708), (10, 0.08004886), (13, 0.04915096)]
doc :  8299 [(12, 0.96724075)]
doc :  8300 [(5, 0.88124955)]
doc :  8301 [(9, 0.3273427), (16, 0.6252879)]
doc :  8302 [(16, 0.96607125)]
doc :  8303 [(1, 0.053607788), (5, 0.41846263), (13, 0.25542545), (15, 0.23878363), (18, 0.026837746)]
doc :  8304 [(0, 0.15072846), (2, 0.052160297), (4, 0.2075816), (9, 0.067455806), (10, 0.20574433), (12, 0.11065948), (14, 0.17144161)]
doc :  8305 [(5, 0.8339218), (12, 0.11607709)]
doc :  8306 [(2, 0.062853694), (5, 0.88713527)]
doc :  8307 [(0, 0.016669668), (1, 0.016669668), (

doc :  8553 [(0, 0.010000006), (1, 0.010000006), (2, 0.80999994), (3, 0.010000006), (4, 0.010000006), (5, 0.010000006), (6, 0.010000006), (7, 0.010000006), (8, 0.010000006), (9, 0.010000006), (10, 0.010000006), (11, 0.010000006), (12, 0.010000006), (13, 0.010000006), (14, 0.010000006), (15, 0.010000006), (16, 0.010000006), (17, 0.010000006), (18, 0.010000006), (19, 0.010000006)]
doc :  8554 [(13, 0.84166646)]
doc :  8555 [(5, 0.9049992)]
doc :  8556 [(2, 0.063944675), (3, 0.07154722), (6, 0.1855281), (7, 0.25235444), (10, 0.19495004), (14, 0.06754294), (18, 0.12587689)]
doc :  8557 [(5, 0.055353824), (15, 0.9017772)]
doc :  8558 [(17, 0.9634614)]
doc :  8559 [(17, 0.9634614)]
doc :  8560 [(10, 0.67909276), (15, 0.19231728)]
doc :  8561 [(11, 0.9660713)]
doc :  8562 [(1, 0.11138148), (2, 0.11140891), (6, 0.13633002), (10, 0.4988554), (17, 0.10253541)]
doc :  8563 [(0, 0.016667185), (1, 0.6833235), (2, 0.016667185), (3, 0.016667185), (4, 0.016667185), (5, 0.016667185), (6, 0.016667185), 

doc :  8848 [(4, 0.43296027), (5, 0.23446384), (15, 0.2924487), (17, 0.023095777)]
doc :  8849 [(0, 0.01666673), (1, 0.01666673), (2, 0.01666673), (3, 0.01666673), (4, 0.01666673), (5, 0.01666673), (6, 0.01666673), (7, 0.01666673), (8, 0.01666673), (9, 0.01666673), (10, 0.01666673), (11, 0.01666673), (12, 0.01666673), (13, 0.6833321), (14, 0.01666673), (15, 0.01666673), (16, 0.01666673), (17, 0.01666673), (18, 0.01666673), (19, 0.01666673)]
doc :  8850 [(0, 0.010000007), (1, 0.010000007), (2, 0.8099999), (3, 0.010000007), (4, 0.010000007), (5, 0.010000007), (6, 0.010000007), (7, 0.010000007), (8, 0.010000007), (9, 0.010000007), (10, 0.010000007), (11, 0.010000007), (12, 0.010000007), (13, 0.010000007), (14, 0.010000007), (15, 0.010000007), (16, 0.010000007), (17, 0.010000007), (18, 0.010000007), (19, 0.010000007)]
doc :  8851 [(8, 0.9568176)]
doc :  8852 [(0, 0.010000007), (1, 0.010000007), (2, 0.8099999), (3, 0.010000007), (4, 0.010000007), (5, 0.010000007), (6, 0.010000007), (7, 0.01

# Model 2


In [55]:
ldamodel2 = Lda(doc_term_matrix, num_topics=20, id2word = dict_, passes=10,chunksize=100, random_state=0, eval_every=None,alpha='auto',per_word_topics=True)

In [59]:
pprint(ldamodel2.print_topics())

[(0,
  '0.182*"pro" + 0.146*"opp" + 0.136*"camera" + 0.103*"series" + '
  '0.073*"giveaway" + 0.068*"rt" + 0.066*"mp" + 0.037*"iphone" + 0.024*"apple" '
  '+ 0.023*"medium"'),
 (1,
  '0.117*"hi" + 0.088*"sharma" + 0.077*"al" + 0.071*"tagging" + '
  '0.039*"vaishali" + 0.037*"mai" + 0.036*"ha" + 0.033*"nice" + 0.033*"suri" + '
  '0.033*"bro"'),
 (2,
  '0.154*"make" + 0.142*"online" + 0.118*"week" + 0.112*"rt" + 0.062*"come" + '
  '0.026*"book" + 0.019*"mega" + 0.010*"lakshmi" + 0.007*"car" + 0.003*"age"'),
 (3,
  '0.127*"smartphone" + 0.114*"smart" + 0.069*"google" + 0.058*"year" + '
  '0.055*"pixel" + 0.051*"back" + 0.049*"nokia" + 0.045*"htc" + 0.043*"done" + '
  '0.029*"new"'),
 (4,
  '0.306*"feature" + 0.196*"super" + 0.068*"even" + 0.000*"real" + '
  '0.000*"live" + 0.000*"mi" + 0.000*"red" + 0.000*"excited" + 0.000*"say" + '
  '0.000*"flash"'),
 (5,
  '0.201*"moto" + 0.185*"rounder" + 0.140*"li" + 0.114*"challenge" + '
  '0.110*"power" + 0.057*"motorola" + 0.034*"join" + 0.024*"so

In [61]:
# Compute Perplexity
print('\nPerplexity: ', ldamodel2.log_perplexity(doc_term_matrix))  # a measure of how good the model is. lower the better.



Perplexity:  -12.093692840999907


In [62]:
# Compute Perplexity
print('\nPerplexity: ', ldamodel.log_perplexity(doc_term_matrix))  # a measure of how good the model is. lower the better.



Perplexity:  -6.691321110423637


In [65]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=ldamodel2, texts=clean_corpus, dictionary=dict_, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.4931485290162132


In [66]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=ldamodel, texts=clean_corpus, dictionary=dict_, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.511287578669512


In [72]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(ldamodel2, doc_term_matrix, dict_)
vis

  and should_run_async(code)


In [74]:
def format_topics_sentences(ldamodel=ldamodel2, corpus=doc_term_matrix, texts=clean_corpus):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel2[doc_term_matrix]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel2.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

  and should_run_async(code)


In [75]:
df_topic_sents_keywords = format_topics_sentences(ldamodel=ldamodel2, corpus=doc_term_matrix, texts=clean_corpus)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

  and should_run_async(code)


TypeError: '<' not supported between instances of 'int' and 'tuple'

In [76]:
i

  and should_run_async(code)


[(0, 0.010000007),
 (1, 0.010000007),
 (2, 0.8099999),
 (3, 0.010000007),
 (4, 0.010000007),
 (5, 0.010000007),
 (6, 0.010000007),
 (7, 0.010000007),
 (8, 0.010000007),
 (9, 0.010000007),
 (10, 0.010000007),
 (11, 0.010000007),
 (12, 0.010000007),
 (13, 0.010000007),
 (14, 0.010000007),
 (15, 0.010000007),
 (16, 0.010000007),
 (17, 0.010000007),
 (18, 0.010000007),
 (19, 0.010000007)]

In [77]:
row

  and should_run_async(code)


NameError: name 'row' is not defined

In [79]:
count = 0
for i in ldamodel2[doc_term_matrix]:
    pprint("doc : ",count,i)
    count += 1

  and should_run_async(code)


TypeError: int() argument must be a string, a bytes-like object or a number, not 'tuple'

In [None]:
# Apply Preprocessing on the Corpus

# stop loss words 
stop = set(stopwords.words('english'))

# punctuation 
exclude = set(string.punctuation) 

# lemmatization
lemma = WordNetLemmatizer() 

# One function for all the steps:
def clean(doc):
    
    #To remove any number and alien character
    review = re.sub('[^a-zA-Z]', ' ', doc)
    
    # convert text into lower case + split into words
    stop_free = " ".join([i for i in review.lower().split() if i not in stop])
    
    # remove any stop words present
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)  
    
    # remove punctuations + normalize the text
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())  
    return normalized