# LDA Topic Modelling with NLTK and Gensim

# Text Cleaning

In [2]:
import spacy

In [3]:
# spacy.load('en') is intentionally not used here; the English language
# class is instantiated directly and called by tokenize() on each text.
from spacy.lang.en import English
parser = English()

In [4]:
def tokenize(text):
    """Split ``text`` into normalised tokens using the notebook-level ``parser``.

    Whitespace-only tokens are skipped. A token flagged ``like_url`` is
    replaced by ``'URL'``, a token whose text starts with ``'@'`` is replaced
    by ``'SCREEN_NAME'``, and any other token contributes its ``lower_`` form.

    Returns:
        list[str]: the normalised tokens, in document order.
    """
    def _normalise(tok):
        # The URL check runs before the '@' check, as in the if/elif chain
        # this helper replaces.
        if tok.like_url:
            return 'URL'
        if tok.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return tok.lower_

    return [_normalise(tok) for tok in parser(text) if not tok.orth_.isspace()]
        

We use:

NLTK's WordNet --> to look up the meanings of words, synonyms, antonyms, and more.

NLTK's WordNetLemmatizer --> to get the root word.

In [5]:
import nltk

In [6]:
# Download the NLTK 'wordnet' package (a no-op when it is already up to date).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/amante/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
from nltk.corpus import wordnet as wn

In [8]:
def get_lemma(word):
    """Return ``wn.morphy(word)``, falling back to ``word`` when it yields ``None``."""
    base_form = wn.morphy(word)
    return word if base_form is None else base_form

In [9]:
from nltk.stem.wordnet import WordNetLemmatizer

In [10]:
def get_lemma2(word):
    """Lemmatise ``word`` with a ``WordNetLemmatizer`` using its default arguments."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

Filter out stop words

In [11]:
# Download the NLTK 'stopwords' package (a no-op when it is already up to date).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/amante/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
# English stop words from NLTK, stored as a set; preprocess_text_for_lda
# tests each token for membership in it.
en_stop = set(nltk.corpus.stopwords.words('english'))

Now define a function that preprocesses the text for topic modelling.

In [13]:
def preprocess_text_for_lda(text, min_token_len=5):
    """Turn raw text into the list of lemmatised tokens used for LDA.

    The text is tokenised with ``tokenize``; tokens shorter than
    ``min_token_len`` characters or present in ``en_stop`` are discarded;
    each surviving token is passed through ``get_lemma`` and then
    ``get_lemma2``.

    Args:
        text: the raw text of one document.
        min_token_len: minimum number of characters a token must have to be
            kept. The default of 5 reproduces the previous hard-coded
            ``len(token) > 4`` filter.

    Returns:
        list[str]: the filtered, lemmatised tokens in document order.

    Note:
        The length filter is applied to the output of ``tokenize``, so with
        the default threshold its 3-character ``'URL'`` placeholder is
        dropped, while ``'SCREEN_NAME'`` is kept.
    """
    return [
        get_lemma2(get_lemma(token))
        for token in tokenize(text)
        # Length is checked first, then stop-word membership, as before.
        if len(token) >= min_token_len and token not in en_stop
    ]

Open up our data,

read it line by line and, for each line, prepare the text for LDA,

then add the resulting tokens to a list.


In [28]:
import random 
# Every line of the input file is treated as one document.
train_data = []
# An explicit encoding keeps decoding independent of the platform locale,
# which is what open() falls back to when no encoding is given.
with open('data/test_doc', encoding='utf-8') as f:
    for line in f:
        tokens = preprocess_text_for_lda(line)
        # Spot check: print the tokens of roughly 1% of the lines.
        if random.random() > .99:
            print(tokens)
        train_data.append(tokens)

# NOTE(review): this prints the entire tokenised corpus; consider showing
# only a slice (e.g. train_data[:1]) for larger inputs.
print(train_data)

[['toward', 'democratic', 'lawful', 'citizenship', 'robot', 'corporation', 'goertzel', 'singularitynet', 'share', 'thought', 'citizenship', 'writing', 'plane', 'flying', 'malta', 'spoke', 'singularitynet', 'malta', 'blockchain', 'summit', 'first', 'malta', 'event', 'afternoon', 'explore', 'elegant', 'quaint', 'ancient', 'neighborhood', 'island', 'walking', 'medieval', 'alleyway', 'rocky', 'coast', 'ironic', 'contrast', 'elegant', 'surroundings', 'reason', 'decide', 'allocate', 'couple', 'insanely', 'schedule', 'malta', 'event', 'conference', 'opportunity', 'level', 'malta', 'government', 'discus', 'enablement', 'maltese', 'citizenship', 'robot', 'automate', 'corporation', 'folk', 'build', 'stone', 'wall', 'lining', 'narrow', 'maltese', 'road', 'still', 'standing', 'strong', 'century', 'later', 'probably', 'foresee', 'lap', 'island', 'become', 'nexus', 'thinking', 'intersection', 'general', 'intelligence', 'theory', 'cryptography', 'distribute', 'system', 'advance', 'legal', 'theory', '

But there is a problem in the preprocessing:

words are still not always reduced to their lemma form, which means the WordNetLemmatizer is not a general-purpose lemmatizer.

e.g. "gathering" should be reduced to "gather" but remains "gathering",

Programming to program but still program, 

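"recommendation" should be reduced to "recommend" but remains "recommendation", and there are many more cases like this.

It also seems to treat some names as stop words: for "Ben Goertzel" it does not know that this is a name, so it treats "Ben" as a stop word and removes it. (Note: `preprocess_text_for_lda` also drops every token shorter than five characters, which on its own would remove "ben".)

The same thing shows up when checking the blog posts: clustering, pricing, simulation, ....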

# Creating a Dictionary

Now Create a Dictionary from the data,

then convert this Dictionary to bag-of-words corpus

then save both dictionary and bag-of-words corpus to the file for future use in the model.


In [29]:
from gensim import corpora
# Build the gensim Dictionary from the tokenised training documents.
dictionary = corpora.Dictionary(train_data)
# One bag-of-words entry per training document, in the same order.
bwd_corpus = list(map(dictionary.doc2bow, train_data))

In [30]:
import pickle
# A context manager guarantees the pickle file is flushed and closed;
# passing a bare open(...) to pickle.dump leaves the handle open.
with open('data/bwd_corpus.pkl', 'wb') as corpus_file:
    pickle.dump(bwd_corpus, corpus_file)
# The dictionary is stored with gensim's own save method.
dictionary.save('data/dictionary.gensim')

In [31]:
# Bare expression: the notebook shows the Dictionary object's repr.
dictionary

<gensim.corpora.dictionary.Dictionary at 0x7f399919de80>

In [21]:
#bwd_corpus

# LDA with Gensim

We are asking to find 5 topics in the data

In [32]:
import gensim

In [33]:
NUM_TOPICS = 5
# The topics of the training documents are grouped into NUM_TOPICS topics.

# For a new (test) document, we then find which of these topics it belongs to.

# Open question: this does not fully make sense. Why fix the set of topics first? A new document is then
# forced into one of the topics predefined from the training data. What happens if it falls outside the
# training data? This would mean the model is not going to find a topic for it (the original note is cut off here).

In [34]:
# Train LDA on the bag-of-words corpus. random_state is fixed so that
# re-running the notebook reproduces the same topics; without it each run
# starts from a different random initialisation.
ldamodel = gensim.models.ldamodel.LdaModel(
    bwd_corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=42,
)

In [35]:
# Persist the NUM_TOPICS-topic model; the pyLDAvis section reloads this file.
ldamodel.save('model5.gensim')

In [36]:
# Summarise every topic with 4 words (num_words=4).
topics = ldamodel.print_topics(num_words=4)

In [37]:
# Each entry prints as a (topic id, weighted-word string) tuple.
for topic in topics:
    print(topic)

(0, '0.002*"citizenship" + 0.002*"human" + 0.002*"citizen" + 0.002*"general"')
(1, '0.002*"citizenship" + 0.002*"human" + 0.002*"intelligence" + 0.002*"legal"')
(2, '0.002*"human" + 0.002*"citizenship" + 0.002*"citizen" + 0.002*"legal"')
(3, '0.030*"citizenship" + 0.019*"human" + 0.015*"citizen" + 0.013*"legal"')
(4, '0.002*"citizenship" + 0.002*"human" + 0.002*"citizen" + 0.002*"intelligence"')


# Let's try by creating new document

In [38]:
# A new, unseen document used to query the trained model.
test_data = 'practical Bayesian Optimization of machine learning algorithims'
# Alternative input kept for experimentation: the empty string.
#test_data = ""

In [40]:
# Apply the same preprocessing that was applied to each training line.
preprocess_testdata = preprocess_text_for_lda(test_data)

In [41]:
# Convert the test tokens to bag-of-words using the training dictionary.
# NOTE(review): only two ids appear in the recorded output, presumably
# because tokens missing from the dictionary are ignored - confirm.
bow_testdata = dictionary.doc2bow(preprocess_testdata)

In [42]:
# Show the bag-of-words of the test document as a list of 2-tuples.
print(bow_testdata)

[(331, 1), (435, 1)]


In [43]:
# Print the test document's topic distribution as (topic id, probability) pairs.
print(ldamodel.get_document_topics(bow_testdata))

[(0, 0.06671533), (1, 0.06671529), (2, 0.06671475), (3, 0.73313975), (4, 0.06671485)]


remember that the above 5 probabilities add up to 1

# now we are asking LDA to find 3 topics in the data

In [151]:
# Same training call as for the 5-topic model, but asking for 3 topics.
# random_state is fixed so that re-running the notebook reproduces the
# same topics.
ldamodel3 = gensim.models.ldamodel.LdaModel(
    bwd_corpus,
    num_topics=3,
    id2word=dictionary,
    passes=15,
    random_state=42,
)

In [152]:
# Persist the 3-topic model to its own file.
ldamodel3.save('model3.gensim')

In [153]:
# Rebinds `topics` (previously the 5-topic summary) to the 3-topic model's
# summary, again with 4 words per topic.
topics = ldamodel3.print_topics(num_words=4)

In [154]:
# Each entry prints as a (topic id, weighted-word string) tuple.
for topic in topics:
    print(topic)

(0, '0.017*"database" + 0.014*"using" + 0.013*"system" + 0.008*"network"')
(1, '0.018*"base" + 0.018*"network" + 0.017*"power" + 0.014*"system"')
(2, '0.024*"network" + 0.019*"base" + 0.013*"multi" + 0.011*"search"')


# pyLDAvis

pyLDAvis is designed to help users interpret the topics in a topic model that has been fit to a corpus of text data.

the package extracts information from a fitted LDA topic model to inform an interactive web based visualization.

Visualizing 5 topics

In [155]:
# Reload the artifacts saved earlier so the visualisation works from the
# files on disk; this rebinds `dictionary`.
dictionary = gensim.corpora.Dictionary.load('data/dictionary.gensim')
# NOTE: pickle.load can execute arbitrary code - only load files that this
# notebook wrote itself. The context manager closes the file afterwards,
# which a bare open(...) passed to pickle.load does not.
with open('data/bwd_corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)

In [156]:
# Load the model file written earlier by ldamodel.save('model5.gensim').
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')

In [157]:
import pyLDAvis.gensim

In [158]:
# Prepare the visualisation data from the model, corpus and dictionary.
# NOTE(review): sort_topics=False presumably keeps pyLDAvis topic numbering
# aligned with the model's own topic ids - confirm against the pyLDAvis docs.
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [159]:
# Render the prepared visualisation as the cell's output.
pyLDAvis.display(lda_display)

Saliency: a measure of how much the term tells you about the topic.