In [1]:
import pandas as pd
import json
import numpy as np

### Load JSON Data

In [2]:
data = []
with open('topic_modeling_data.json') as data_file:    
     for line in data_file:
        data.append(json.loads(line))

### Convert the json object to dataframe

In [3]:
documents = pd.DataFrame(data)
print(documents[:5])
topic=['Topic1','Topic2','Topic3','Topic4','Topic5']
prob =['Prob1','Prob2', 'Prob3', 'Prob4','Prob5']
for i in range(len(topic)):
    documents[topic[i]]=np.nan
    documents[prob[i]]=np.nan
print(documents[:5])

                                                 _id  \
0  04ed93fc36f0995d3225b830cf2a07e03953cd8ca76e14...   
1  05fa50761cbdc510c6b464e6f162f2b17b7b94735aad35...   
2  076a6d4900a5ee316cfedcb8310b2701ef1dad86d6eb19...   
3  077da9052c3e1a9e894203030887e825faf0013463e689...   
4  07a9891ecef7bc7175fa205fb036764cc30cd4c9dc50d6...   

                                                text  
0   Windows Has Detected a Malicious Virus On You...  
1  Your session has timed out Click OK to sign in...  
3   Please hold a while as OneDrive Security is s...  
4  Your session has timed out Click OK to sign in...  
                                                 _id  \
0  04ed93fc36f0995d3225b830cf2a07e03953cd8ca76e14...   
1  05fa50761cbdc510c6b464e6f162f2b17b7b94735aad35...   
2  076a6d4900a5ee316cfedcb8310b2701ef1dad86d6eb19...   
3  077da9052c3e1a9e894203030887e825faf0013463e689...   
4  07a9891ecef7bc7175fa205fb036764cc30cd4c9dc50d6...   

                                                tex

In [4]:
documents.head(2)

Unnamed: 0,_id,text,Topic1,Prob1,Topic2,Prob2,Topic3,Prob3,Topic4,Prob4,Topic5,Prob5
0,04ed93fc36f0995d3225b830cf2a07e03953cd8ca76e14...,Windows Has Detected a Malicious Virus On You...,,,,,,,,,,
1,05fa50761cbdc510c6b464e6f162f2b17b7b94735aad35...,Your session has timed out Click OK to sign in...,,,,,,,,,,


### Data Preprocessing

In [5]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
np.random.seed(2018)



In [6]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\abhin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
stemmer = SnowballStemmer('english')
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [8]:
# Preprocess the data
processed_docs = documents['text'].map(preprocess)

### Bag of words on the dataset

In [9]:
dictionary = gensim.corpora.Dictionary(processed_docs)
# Saving the dictionary
dictionary.save('dictionary.gensim')

In [10]:
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 assist
1 certifi
2 contact
3 detect
4 error
5 failur
6 immedi
7 malici
8 restart
9 shutdown
10 support


In [11]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=50000)

In [12]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
# Saving the corpus
import pickle
pickle.dump(bow_corpus, open('bow_corpus.pkl', 'wb'))
bow_corpus[150]

[(10, 1),
 (16, 1),
 (24, 2),
 (41, 1),
 (43, 2),
 (44, 1),
 (63, 2),
 (68, 2),
 (69, 1),
 (73, 1),
 (77, 1),
 (128, 1),
 (130, 3),
 (243, 2),
 (267, 1),
 (305, 1),
 (562, 1),
 (563, 1),
 (564, 3),
 (565, 1),
 (566, 1),
 (567, 1),
 (568, 1),
 (569, 1),
 (570, 1),
 (571, 1)]

In [13]:
bow_doc_1500 = bow_corpus[1500]

for i in range(len(bow_doc_1500)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_1500[i][1], 
                                                     dictionary[bow_doc_1500[i][1]], 
                                                     bow_doc_1500[i][1]))

Word 1 ("certifi") appears 1 time.
Word 2 ("contact") appears 2 time.
Word 80 ("sign") appears 80 time.
Word 2 ("contact") appears 2 time.
Word 1 ("certifi") appears 1 time.
Word 1 ("certifi") appears 1 time.
Word 2 ("contact") appears 2 time.
Word 1 ("certifi") appears 1 time.
Word 1 ("certifi") appears 1 time.
Word 2 ("contact") appears 2 time.
Word 1 ("certifi") appears 1 time.
Word 5 ("failur") appears 5 time.
Word 1 ("certifi") appears 1 time.
Word 2 ("contact") appears 2 time.
Word 1 ("certifi") appears 1 time.
Word 1 ("certifi") appears 1 time.
Word 22 ("alttd") appears 22 time.
Word 1 ("certifi") appears 1 time.
Word 1 ("certifi") appears 1 time.
Word 1 ("certifi") appears 1 time.
Word 1 ("certifi") appears 1 time.
Word 1 ("certifi") appears 1 time.
Word 1 ("certifi") appears 1 time.
Word 1 ("certifi") appears 1 time.
Word 1 ("certifi") appears 1 time.
Word 1 ("certifi") appears 1 time.
Word 3 ("detect") appears 3 time.
Word 1 ("certifi") appears 1 time.
Word 1 ("certifi") appe

### TF-IDF

In [14]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)

In [15]:
corpus_tfidf = tfidf[bow_corpus]

In [16]:
from pprint import pprint

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.33647944042396016),
 (1, 0.36403882290982759),
 (2, 0.13505646578212077),
 (3, 0.15159355960230564),
 (4, 0.11650315604499903),
 (5, 0.1810120874186556),
 (6, 0.27304292264904145),
 (7, 0.18001895151732664),
 (8, 0.22186023852065423),
 (9, 0.25847686949333859),
 (10, 0.055174332689037292),
 (11, 0.21188909303395032),
 (12, 0.52625532727907631),
 (13, 0.23140335771435575),
 (14, 0.097024120684999979),
 (15, 0.2363462036293083)]


### Running LDA using Bag of Words

In [17]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=5, id2word=dictionary, passes=2, workers=2, random_state= 2)
# Saving the model
lda_model.save('lda_model.gensim')

In [18]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.011*"download" + 0.009*"load" + 0.007*"cuenta" + 0.007*"account" + 0.007*"spin" + 0.007*"quatro" + 0.007*"para" + 0.007*"condit" + 0.006*"tout" + 0.006*"pour"
Topic: 1 
Words: 0.054*"download" + 0.040*"deposit" + 0.020*"spin" + 0.018*"machin" + 0.015*"song" + 0.011*"movi" + 0.007*"music" + 0.006*"australia" + 0.006*"mobil" + 0.006*"win"
Topic: 2 
Words: 0.014*"live" + 0.011*"blackjack" + 0.011*"poker" + 0.010*"spin" + 0.008*"tabl" + 0.008*"download" + 0.008*"popular" + 0.008*"wild" + 0.007*"reel" + 0.006*"deposit"
Topic: 3 
Words: 0.054*"descargar" + 0.031*"grati" + 0.027*"para" + 0.015*"descarga" + 0.013*"download" + 0.009*"song" + 0.009*"juego" + 0.009*"window" + 0.007*"musica" + 0.007*"espaol"
Topic: 4 
Words: 0.028*"load" + 0.025*"account" + 0.013*"deposit" + 0.009*"wager" + 0.009*"requir" + 0.009*"promot" + 0.008*"term" + 0.008*"right" + 0.008*"win" + 0.008*"withdraw"


### Running LDA using TF-IDF

In [19]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=5, id2word=dictionary, passes=2, workers=4, random_state=2)
# Saving the model
lda_model.save('lda_model_tf_idf.gensim')

In [20]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.009*"option" + 0.009*"browser" + 0.008*"webmail" + 0.007*"script" + 0.005*"secur" + 0.005*"select" + 0.005*"enabl" + 0.005*"chip" + 0.005*"continu" + 0.005*"click"
Topic: 1 Word: 0.017*"deposit" + 0.013*"download" + 0.009*"machin" + 0.008*"spin" + 0.007*"song" + 0.007*"claim" + 0.005*"australia" + 0.005*"wager" + 0.005*"movi" + 0.005*"award"
Topic: 2 Word: 0.007*"alert" + 0.006*"prevent" + 0.005*"window" + 0.005*"immedi" + 0.005*"disabl" + 0.004*"technician" + 0.004*"toll" + 0.004*"cooki" + 0.004*"rolla" + 0.004*"login"
Topic: 3 Word: 0.020*"descargar" + 0.009*"para" + 0.009*"grati" + 0.006*"descarga" + 0.004*"live" + 0.004*"deposit" + 0.003*"juego" + 0.003*"blackjack" + 0.003*"bitcoin" + 0.003*"song"
Topic: 4 Word: 0.007*"deposit" + 0.005*"wager" + 0.005*"spin" + 0.004*"account" + 0.004*"withdraw" + 0.003*"requir" + 0.003*"poker" + 0.003*"bet" + 0.003*"money" + 0.003*"term"


### Visualising the models

In [21]:
import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [22]:
lda_display_tfidf = pyLDAvis.gensim.prepare(lda_model_tfidf, bow_corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display_tfidf)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [23]:
pyLDAvis.save_html(lda_display, 'pyLDAvis_output.html')
pyLDAvis.save_html(lda_display_tfidf, 'pyLDAvis_output_tfidf.html')

### Classification of the topics
### Performance evaluation by classifying sample document using LDA Bag of Words model


In [24]:
processed_docs[150]

['onedr',
 'offic',
 'onedrivemi',
 'accountmpmer',
 'profilemi',
 'accountsign',
 'right',
 'nowhelpset',
 'access',
 'file',
 'share',
 'onedr',
 'select',
 'email',
 'provid',
 'offic',
 'email',
 'document',
 'lock',
 'privat',
 'documentsign',
 'continuelock',
 'privat',
 'documentsign',
 'continuemor',
 'onedr',
 'pad',
 'color',
 'white',
 'span',
 'classwfiw',
 'msiconalert',
 'stylefonts',
 'pxspan',
 'stylefonts',
 'display',
 'inlin',
 'fontfamili',
 'sego',
 'regular',
 'westeuropean',
 'sego',
 'sego',
 'tahoma',
 'arial',
 'sansserif',
 'page',
 'use',
 'javascript',
 'browser',
 'support',
 'javascript',
 'turn',
 'page',
 'mean',
 'appear',
 'javascript',
 'enabl',
 'browser',
 'ciddb',
 'authmsa',
 'geoweu',
 'depfa',
 'mruplusplusv',
 'rworldwid']

In [25]:
for index, score in sorted(lda_model[bow_corpus[1500]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.9982871413230896	 
Topic: 0.054*"download" + 0.040*"deposit" + 0.020*"spin" + 0.018*"machin" + 0.015*"song" + 0.011*"movi" + 0.007*"music" + 0.006*"australia" + 0.006*"mobil" + 0.006*"win"


### Performance evaluation by classifying sample document using LDA TF-IDF model

In [26]:
for index, score in sorted(lda_model_tfidf[bow_corpus[1500]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.998295247554779	 
Topic: 0.017*"deposit" + 0.013*"download" + 0.009*"machin" + 0.008*"spin" + 0.007*"song" + 0.007*"claim" + 0.005*"australia" + 0.005*"wager" + 0.005*"movi" + 0.005*"award"


### Testing model on seen document

In [27]:
bow_vector = dictionary.doc2bow(preprocess(documents['text'][3]))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.9180396199226379	 Topic: 0.054*"download" + 0.040*"deposit" + 0.020*"spin" + 0.018*"machin" + 0.015*"song"
Score: 0.020586485043168068	 Topic: 0.014*"live" + 0.011*"blackjack" + 0.011*"poker" + 0.010*"spin" + 0.008*"tabl"
Score: 0.020548172295093536	 Topic: 0.028*"load" + 0.025*"account" + 0.013*"deposit" + 0.009*"wager" + 0.009*"requir"
Score: 0.02046196721494198	 Topic: 0.011*"download" + 0.009*"load" + 0.007*"cuenta" + 0.007*"account" + 0.007*"spin"
Score: 0.02036377042531967	 Topic: 0.054*"descargar" + 0.031*"grati" + 0.027*"para" + 0.015*"descarga" + 0.013*"download"


### Updating the topic and probability columns with respective values in decreasing order

In [28]:
for index in range(len(documents)):
    topicsOrder=[]
    bow_vector = dictionary.doc2bow(preprocess(documents['text'][index]))
    topicsOrder = list(map(list, list(lda_model.get_document_topics(bow_vector))))
    topicsOrder.sort(key=lambda x: x[1],reverse=True)
    # Printing the sorted probabilities
    # print(topicsOrder)
    for i in range(len(topicsOrder)):
        topic = "Topic"+str(i+1)
        documents.at[index,topic] = (topicsOrder[i][0])
        prob = "Prob"+str(i+1)
        documents.at[index,prob] = (topicsOrder[i][1])

[[4, 0.96601343]]
[[0, 0.99364626]]
[[2, 0.99171883]]
[[1, 0.91803712], [2, 0.02058897], [4, 0.020548185], [0, 0.02046198], [3, 0.020363782]]
[[0, 0.99364507]]
[[4, 0.48250982], [2, 0.40627778], [1, 0.10592489]]
[[2, 0.5191263], [4, 0.47562799]]
[[0, 0.99364686]]
[[4, 0.48197883], [2, 0.40510777], [1, 0.10762594]]
[[0, 0.99364489]]
[[2, 0.99171782]]
[[2, 0.99171948]]
[[1, 0.91803664], [2, 0.020589408], [4, 0.020548198], [0, 0.020461984], [3, 0.020363783]]
[[2, 0.99603039]]
[[0, 0.9936471]]
[[0, 0.993792]]
[[2, 0.99171877]]
[[2, 0.99171811]]
[[0, 0.99364686]]
[[0, 0.99364698]]
[[4, 0.48238117], [2, 0.40568939], [1, 0.10664203]]
[[4, 0.9660185]]
[[4, 0.96601301]]
[[4, 0.96602029]]
[[0, 0.99364632]]
[[4, 0.96602035]]
[[0, 0.99364656]]
[[2, 0.99171937]]
[[2, 0.99690789]]
[[0, 0.99364638]]
[[2, 0.99690741]]
[[2, 0.99603051]]
[[4, 0.48242426], [2, 0.40589082], [1, 0.10639747]]
[[0, 0.99379283]]
[[2, 0.69168526], [4, 0.28022495], [1, 0.025042901]]
[[1, 0.91804934], [2, 0.020576926], [4, 0.020

[[2, 0.91524434], [4, 0.079114206]]
[[4, 0.96602345]]
[[2, 0.91652757], [4, 0.077830955]]
[[1, 0.38803294], [0, 0.31444314], [4, 0.24620695], [2, 0.025835918], [3, 0.025481062]]
[[0, 0.9936468]]
[[1, 0.9180333], [2, 0.02059263], [4, 0.020548258], [0, 0.020462003], [3, 0.020363798]]
[[2, 0.71100545], [1, 0.28250697]]
[[0, 0.99364656]]
[[2, 0.91630876], [4, 0.078049764]]
[[2, 0.99171686]]
[[0, 0.99364614]]
[[1, 0.70928586], [2, 0.25315464], [4, 0.036922958]]
[[4, 0.96602297]]
[[2, 0.91738206], [4, 0.0769765]]
[[2, 0.99690837]]
[[0, 0.99364626]]
[[1, 0.91804665], [2, 0.020579603], [4, 0.020548103], [0, 0.020461926], [3, 0.020363741]]
[[0, 0.9936465]]
[[2, 0.99811065]]
[[1, 0.99789596]]
[[2, 0.65084553], [4, 0.34809938]]
[[2, 0.80611867], [4, 0.19217263]]
[[2, 0.7428906], [1, 0.22542295], [0, 0.030467227]]
[[4, 0.99973834]]
[[2, 0.9974578]]
[[2, 0.99956548]]
[[4, 0.83279061], [2, 0.16711792]]
[[1, 0.99789035]]
[[1, 0.99789071]]
[[2, 0.99836302]]
[[2, 0.6576767], [4, 0.33537921]]
[[2, 0.769

[[2, 0.84247631], [4, 0.13880809], [1, 0.017270725]]
[[2, 0.79444015], [0, 0.20045884]]
[[4, 0.54853284], [0, 0.23452792], [2, 0.21485467]]
[[1, 0.99789065]]
[[1, 0.86277026], [4, 0.13379878]]
[[1, 0.86265528], [4, 0.13391376]]
[[1, 0.86263418], [4, 0.13393484]]
[[1, 0.99789065]]
[[4, 0.75540292], [0, 0.14493589], [2, 0.099357925]]
[[1, 0.86252105], [4, 0.13404799]]
[[2, 0.58493686], [4, 0.41408092]]
[[4, 0.54492271], [2, 0.44634399]]
[[0, 0.96480244], [1, 0.032899044]]
[[0, 0.99951667]]
[[0, 0.96492189], [1, 0.032779649]]
[[4, 0.83260882], [2, 0.16607918]]
[[4, 0.99380225]]
[[0, 0.99842536]]
[[1, 0.86282426], [4, 0.13374481]]
[[1, 0.86246037], [4, 0.13410866]]
[[4, 0.99820334]]
[[1, 0.86283207], [4, 0.13373697]]
[[2, 0.68892241], [1, 0.16483049], [4, 0.14502522]]
[[1, 0.86276966], [4, 0.13379939]]
[[2, 0.99305773]]
[[1, 0.44234961], [4, 0.38095677], [2, 0.17562094]]
[[1, 0.86258638], [4, 0.13398263]]
[[2, 0.65081054], [4, 0.3481344]]
[[0, 0.99484813]]
[[2, 0.58538085], [4, 0.4136385]]

[[4, 0.99973834]]
[[1, 0.86272025], [4, 0.13384883]]
[[2, 0.73384136], [4, 0.26300102]]
[[1, 0.99789065]]
[[4, 0.96124589], [2, 0.03118854]]
[[4, 0.5449186], [2, 0.44634804]]
[[2, 0.99956548]]
[[1, 0.99789613]]
[[4, 0.92360473], [2, 0.076274939]]
[[2, 0.73906839], [0, 0.23092361], [1, 0.027887441]]
[[4, 0.99500751]]
[[4, 0.99972886]]
[[2, 0.75766313], [4, 0.24154596]]
[[2, 0.99956548]]
[[2, 0.74588627], [4, 0.19341008], [1, 0.058723833]]
[[4, 0.51887804], [2, 0.47759873]]
[[2, 0.99847847]]
[[1, 0.86235785], [4, 0.13421118]]
[[2, 0.83881992], [4, 0.15873598]]
[[2, 0.99956268]]
[[4, 0.7245639], [2, 0.27273563]]
[[4, 0.84430724], [1, 0.11922504], [0, 0.033784892]]
[[2, 0.99695522]]
[[2, 0.5856505], [4, 0.41336825]]
[[2, 0.55884355], [0, 0.3587774], [4, 0.080652446]]
[[1, 0.99789047]]
[[4, 0.82253993], [0, 0.17546883]]
[[4, 0.99416202]]
[[4, 0.68101728], [2, 0.31669948]]
[[2, 0.99759567]]
[[2, 0.6812886], [4, 0.31673396]]
[[1, 0.99789065]]
[[1, 0.86267841], [4, 0.13389061]]
[[2, 0.7786907]

[[4, 0.96056098], [2, 0.031777374]]
[[1, 0.86242408], [4, 0.13414502]]
[[4, 0.99973017]]
[[1, 0.99788517]]
[[2, 0.99876171]]
[[2, 0.99838358]]
[[0, 0.78121924], [2, 0.21663521]]
[[4, 0.99972719]]
[[2, 0.6291185], [4, 0.32286456], [0, 0.045944791]]
[[2, 0.82133698], [1, 0.17667405]]
[[2, 0.99956548]]
[[1, 0.8627398], [4, 0.1338293]]
[[2, 0.71553719], [4, 0.16848548], [0, 0.11529887]]
[[4, 0.99407572]]
[[4, 0.50986356], [2, 0.48731157]]
[[2, 0.99959016]]
[[4, 0.99680829]]
[[2, 0.99631661]]
[[1, 0.9978959]]
[[4, 0.98909068]]
[[1, 0.86276174], [4, 0.13380732]]
[[1, 0.86244369], [4, 0.13412535]]
[[2, 0.75698406], [4, 0.24222612]]
[[2, 0.73665237], [0, 0.1101831], [1, 0.078908816], [4, 0.073496848]]
[[2, 0.77743316], [4, 0.18574622], [0, 0.035410549]]
[[4, 0.99380189]]
[[2, 0.62175894], [4, 0.37615475]]
[[1, 0.59242713], [4, 0.3475669], [2, 0.057581402]]
[[4, 0.46021551], [2, 0.43486622], [0, 0.1025574]]
[[2, 0.98424935]]
[[4, 0.82572514], [0, 0.17228355]]
[[4, 0.81392986], [1, 0.18322855]]


[[2, 0.9995901]]
[[0, 0.99783254]]
[[4, 0.87408686], [2, 0.12175239]]
[[2, 0.99956542]]
[[4, 0.99407524]]
[[0, 0.59317815], [2, 0.3944332], [4, 0.011469286]]
[[4, 0.82170391], [0, 0.17630479]]
[[4, 0.99398857]]
[[2, 0.68935269], [1, 0.164684], [4, 0.14474145]]
[[2, 0.99992865]]
[[1, 0.99796373]]
[[4, 0.89992064], [1, 0.095988795]]
[[1, 0.86249423], [4, 0.13407476]]
[[4, 0.9930039]]
[[2, 0.83384717], [4, 0.16341199]]
[[1, 0.99797893]]
[[0, 0.99502355]]
[[4, 0.84125435], [2, 0.15217085]]
[[4, 0.81394821], [1, 0.18321022]]
[[0, 0.44101188], [2, 0.43264064], [4, 0.12578033]]
[[1, 0.86264592], [4, 0.13392316]]
[[4, 0.99909955]]
[[2, 0.99808842]]
[[1, 0.99789053]]
[[4, 0.99973834]]
[[2, 0.8029629], [4, 0.18444066]]
[[2, 0.74141937], [4, 0.25758046]]
[[4, 0.99972719]]
[[4, 0.99500686]]
[[1, 0.99789065]]
[[4, 0.55727017], [2, 0.43974492]]
[[1, 0.99789071]]
[[4, 0.81394935], [1, 0.18320905]]
[[2, 0.99959016]]
[[2, 0.99623722]]
[[1, 0.99789065]]
[[1, 0.86279488], [4, 0.13377412]]
[[2, 0.59234095

[[1, 0.9988941]]
[[1, 0.99912959]]
[[1, 0.99891555]]
[[1, 0.99645698]]
[[3, 0.99925256]]
[[1, 0.99885929]]
[[3, 0.99912107]]
[[3, 0.99913079]]
[[1, 0.99893689]]
[[1, 0.99890459]]
[[3, 0.99916112]]
[[1, 0.99898392]]
[[3, 0.99924791]]
[[3, 0.99923527]]
[[3, 0.99920738]]
[[3, 0.92318088], [1, 0.076183684]]
[[1, 0.99901843]]
[[1, 0.99881732]]
[[1, 0.99901098]]
[[1, 0.99878591]]
[[1, 0.99896264]]
[[1, 0.99901891]]
[[1, 0.99889064]]
[[3, 0.99919683]]
[[3, 0.99921089]]
[[1, 0.99896872]]
[[1, 0.99889863]]
[[1, 0.99901217]]
[[1, 0.9934997]]
[[3, 0.99922866]]
[[1, 0.99911141]]
[[1, 0.99900311]]
[[3, 0.99925536]]
[[3, 0.99916983]]
[[1, 0.98369616], [2, 0.015948569]]
[[3, 0.99922186]]
[[1, 0.99901849]]
[[1, 0.99895215]]
[[1, 0.99901986]]
[[1, 0.99885786]]
[[1, 0.99891335]]
[[3, 0.99916482]]
[[1, 0.9989115]]
[[3, 0.99929529]]
[[1, 0.99888361]]
[[3, 0.99920928]]
[[3, 0.99929303]]
[[1, 0.99886525]]
[[1, 0.94273275], [3, 0.043471903], [2, 0.01331212]]
[[3, 0.99927002]]
[[1, 0.99897963]]
[[3, 0.9992385

In [29]:
# Labeling the topic numbers in documents dataframe
documents.Topic1=documents.Topic1.replace(0, "First Topic").replace(1, "Second Topic").replace(2, "Third Topic").replace(3, "Fourth Topic").replace(4, "Fifth Topic")
documents.Topic2=documents.Topic2.replace(0, "First Topic").replace(1, "Second Topic").replace(2, "Third Topic").replace(3, "Fourth Topic").replace(4, "Fifth Topic")
documents.Topic3=documents.Topic3.replace(0, "First Topic").replace(1, "Second Topic").replace(2, "Third Topic").replace(3, "Fourth Topic").replace(4, "Fifth Topic")
documents.Topic4=documents.Topic4.replace(0, "First Topic").replace(1, "Second Topic").replace(2, "Third Topic").replace(3, "Fourth Topic").replace(4, "Fifth Topic")
documents.Topic5=documents.Topic5.replace(0, "First Topic").replace(1, "Second Topic").replace(2, "Third Topic").replace(3, "Fourth Topic").replace(4, "Fifth Topic")

In [30]:
documents.head(5)

Unnamed: 0,_id,text,Topic1,Prob1,Topic2,Prob2,Topic3,Prob3,Topic4,Prob4,Topic5,Prob5
0,04ed93fc36f0995d3225b830cf2a07e03953cd8ca76e14...,Windows Has Detected a Malicious Virus On You...,Fifth Topic,0.966013,,,,,,,,
1,05fa50761cbdc510c6b464e6f162f2b17b7b94735aad35...,Your session has timed out Click OK to sign in...,First Topic,0.993646,,,,,,,,
2,076a6d4900a5ee316cfedcb8310b2701ef1dad86d6eb19...,Microsoft Warning Alert ERROR 268d3x89383 ...,Third Topic,0.991719,,,,,,,,
3,077da9052c3e1a9e894203030887e825faf0013463e689...,Please hold a while as OneDrive Security is s...,Second Topic,0.918037,Third Topic,0.020589,Fifth Topic,0.020548,First Topic,0.020462,Fourth Topic,0.020364
4,07a9891ecef7bc7175fa205fb036764cc30cd4c9dc50d6...,Your session has timed out Click OK to sign in...,First Topic,0.993645,,,,,,,,


### Dropping the unnecessary columns from the dataframe documents

In [31]:
documents_dict=documents
documents_dict=documents_dict.drop('text',axis=1)

### Creating a list of dictionary in a proper format

In [32]:
documents_dict = documents_dict.replace(np.nan, '', regex=True)
# Creates a list of dictionaries
documents_dict = documents_dict.to_dict('record')
# Delete the keys with null values
for dict1 in documents_dict:
    empty_key=[]
    for key, value in dict1.items():
        if(value == ''):
            empty_key.append(key)
    for name in empty_key:
        dict1.pop(name)

In [33]:
# Saving the json file
with open('topic_document_lda_model.json', 'w') as f:
    json.dump(documents_dict, f, ensure_ascii=False)

In [34]:
## Loading the dictionary, corpus and model to visualize the data
# dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
# corpus = pickle.load(open('bow_corpus.pkl', 'rb'))
# lda = gensim.models.ldamodel.LdaModel.load('lda_model.gensim')

# import pyLDAvis.gensim
# lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
# pyLDAvis.display(lda_display)
# pyLDAvis.save_html(lda_display, 'pyLDAvis_output.html')