In [0]:
import spacy
from spacy.lang.en import English

# Blank English pipeline used purely as a tokenizer. `tokenize` below only
# reads per-token lexical attributes (`orth_`, `like_url`, `lower_`), so no
# trained model is needed. The previous `spacy.load('en')` call loaded a full
# model and threw the result away; it also relies on the 'en' shortcut name,
# which newer spaCy releases no longer resolve.
parser = English()

def tokenize(text):
    """Split ``text`` into normalised tokens for LDA.

    Whitespace-only tokens are dropped, URL-like tokens become the placeholder
    ``'URL'``, tokens starting with ``'@'`` become ``'SCREEN_NAME'``, and every
    other token is replaced by its lower-cased form.

    Returns a list of strings in document order.
    """
    def _normalise(tok):
        # Order matters: the URL check takes precedence over the '@' check.
        if tok.like_url:
            return 'URL'
        if tok.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return tok.lower_

    return [_normalise(tok) for tok in parser(text) if not tok.orth_.isspace()]

In [2]:
import nltk
# Download the 'wordnet' NLTK package; the lemmatization helpers below import
# WordNet from nltk.corpus / nltk.stem and need this data to be present.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

## Lemmatization using WordNet

In [0]:
from nltk.corpus import wordnet as wn
def get_lemma(word):
    """Return the WordNet ``morphy`` base form of ``word``.

    If ``wn.morphy`` returns ``None`` for the word, the word is returned
    unchanged.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    """Lemmatize ``word`` with a freshly constructed ``WordNetLemmatizer``."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

In [5]:
# Fetch NLTK's stop-word lists, then keep the English one as a set so the
# membership test in prepare_text_for_lda is a hash lookup.
nltk.download('stopwords')
en_stop = {stop_word for stop_word in nltk.corpus.stopwords.words('english')}

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [0]:
def prepare_text_for_lda(text):
    """Turn raw ``text`` into the list of tokens fed to LDA.

    The text is tokenized with ``tokenize``; tokens of four characters or
    fewer and tokens found in ``en_stop`` are discarded; each surviving token
    is passed through ``get_lemma``.
    """
    return [
        get_lemma(tok)
        for tok in tokenize(text)
        if len(tok) > 4 and tok not in en_stop
    ]

In [9]:
import random

# Build the training documents from a random ~10% sample of the lines in
# mission.csv. Each sampled line is tokenized, echoed, and appended.
# NOTE(review): the sample is unseeded, so every run selects different lines.
text_data = []
# The file contains non-ASCII (e.g. Devanagari) text; read it as UTF-8
# explicitly instead of relying on the platform's default encoding.
with open('mission.csv', encoding='utf-8') as f:
    for line in f:
        # Decide whether to keep the line *before* tokenizing it, so the
        # discarded ~90% of lines are not tokenized for nothing. One random
        # draw is still made per line, exactly as before.
        if random.random() > .90:
            tokens = prepare_text_for_lda(line)
            print(tokens)
            text_data.append(tokens)

[]
['अंतरिक्ष']
['SCREEN_NAME', 'ऐतिहासिक', 'शक्ति', 'बनाने', 'हमारे', 'वैज्ञानिकों', 'ऑर्बिट', 'सैटेलाइट']
[]
['मजबूर', 'सरकार', 'मजबूत', 'सरकार', 'missionshakti']
['SCREEN_NAME', 'despite', 'hurdles', 'doubt', 'criticism', 'missionshakti', 'represent', 'india', 'rising', 'space', 'capability', 'futuri']
[]
['nobel', 'prize', 'mathematics', 'madam', 'SCREEN_NAME', 'hereby', 'nominate', 'nobel', 'prize']
[]
[]
['SCREEN_NAME', 'satellite', 'missile', 'works']
[]
['ancona', 'ancora', 'barcaespanyol', 'barca', 'missionshakti', 'ascoltatelo', 'merita']
['सुरक्षा', 'दृष्टिकोण', 'सराहनीय', 'कदम।जय', 'missionshakti']
['missionshakti', 'SCREEN_NAME']
['तुम्हाला', 'माहितीये', 'वर्षात', 'काहीच', 'झालं.']
[]
['SCREEN_NAME', 'proud', 'india', 'advance', 'select', 'group', 'nation', 'posse', 'satellite', 'missile', 'congratulations']
['हमारे', 'वैज्ञानिकों', 'अंतरिक्ष', 'earth', 'orbit', 'सैटेलाइट', 'गिराया']
['small', 'debris', 'damage', 'international', 'space', 'station', 'astronaut', 'increase'

## LDA with Gensim

Create a dictionary from the data, convert the data to a bag-of-words corpus, and save both the dictionary and the corpus for future use.

In [0]:
from gensim import corpora

# Map every distinct token in the sampled documents to an integer id.
dictionary = corpora.Dictionary(documents=text_data)

In [0]:
# Bag-of-words form of each document: a list of (token_id, count) pairs.
corpus = list(map(dictionary.doc2bow, text_data))

In [0]:
import pickle

# Persist the bag-of-words corpus and the dictionary so later cells can
# reload them from disk. The context manager guarantees the pickle file is
# flushed and closed; the previous bare open() left the handle dangling.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')

## Using LDA to find 5 topics in the data

In [0]:
import gensim

NUM_TOPICS = 5

# Fit a 5-topic LDA model over the bag-of-words corpus (15 training passes)
# and store it on disk for the visualisation section.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    id2word=dictionary,
    num_topics=NUM_TOPICS,
    passes=15,
)
ldamodel.save('model5.gensim')

In [15]:
# Show the four highest-weighted words of every topic, one topic per line.
topics = ldamodel.print_topics(num_words=4)
for topic_id, topic_terms in topics:
    print((topic_id, topic_terms))

(0, '0.062*"nobel" + 0.062*"prize" + 0.062*"अंतरिक्ष" + 0.034*"missionshakti"')
(1, '0.059*"missionshakti" + 0.058*"barca" + 0.058*"ancora" + 0.058*"barcaespanyol"')
(2, '0.077*"सरकार" + 0.042*"missionshakti" + 0.042*"शक्ति" + 0.042*"झालं."')
(3, '0.047*"missionshakti" + 0.047*"SCREEN_NAME" + 0.047*"space" + 0.047*"india"')
(4, '0.088*"SCREEN_NAME" + 0.046*"satellite" + 0.046*"missile" + 0.025*"group"')


# New Doc

In [16]:
# Run an unseen document through the same preprocessing, convert it to
# bag-of-words with the existing dictionary, and ask the model for its
# topic distribution.
new_doc = prepare_text_for_lda(
    'Practical Bayesian Optimization of Machine Learning Algorithms'
)
new_doc_bow = dictionary.doc2bow(new_doc)
print(new_doc_bow)
new_doc_topics = ldamodel.get_document_topics(new_doc_bow)
print(new_doc_topics)

[]
[(0, 0.2), (1, 0.2), (2, 0.2), (3, 0.2), (4, 0.2)]


In [17]:
# Refit with three topics, save the model, and list the top four words of
# each topic. Note that this rebinds `ldamodel`.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    id2word=dictionary,
    num_topics=3,
    passes=15,
)
ldamodel.save('model3.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic_id, topic_terms in topics:
    print((topic_id, topic_terms))

(0, '0.045*"सरकार" + 0.045*"SCREEN_NAME" + 0.045*"missile" + 0.045*"satellite"')
(1, '0.066*"SCREEN_NAME" + 0.046*"nobel" + 0.046*"prize" + 0.046*"missionshakti"')
(2, '0.043*"missionshakti" + 0.042*"SCREEN_NAME" + 0.042*"हमारे" + 0.042*"सैटेलाइट"')


## For finding 10 topics

In [18]:
# Refit with ten topics, save the model, and list the top four words of
# each topic. Note that this rebinds `ldamodel`.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    id2word=dictionary,
    num_topics=10,
    passes=15,
)
ldamodel.save('model10.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic_id, topic_terms in topics:
    print((topic_id, topic_terms))

(0, '0.112*"सरकार" + 0.059*"अंतरिक्ष" + 0.059*"missionshakti" + 0.059*"हमारे"')
(1, '0.086*"missionshakti" + 0.086*"कदम।जय" + 0.086*"दृष्टिकोण" + 0.086*"सराहनीय"')
(2, '0.120*"SCREEN_NAME" + 0.081*"nobel" + 0.081*"prize" + 0.043*"ऑर्बिट"')
(3, '0.070*"space" + 0.070*"cloud" + 0.070*"astronaut" + 0.070*"increase"')
(4, '0.062*"missionshakti" + 0.062*"ancora" + 0.062*"barcaespanyol" + 0.062*"works"')
(5, '0.015*"SCREEN_NAME" + 0.015*"missionshakti" + 0.015*"अंतरिक्ष" + 0.015*"missile"')
(6, '0.065*"शक्ति" + 0.065*"वर्षात" + 0.065*"झालं." + 0.065*"तुम्हाला"')
(7, '0.059*"missionshakti" + 0.059*"SCREEN_NAME" + 0.059*"capability" + 0.059*"criticism"')
(8, '0.062*"SCREEN_NAME" + 0.062*"missile" + 0.062*"satellite" + 0.062*"india"')
(9, '0.015*"अंतरिक्ष" + 0.015*"SCREEN_NAME" + 0.015*"missionshakti" + 0.015*"missile"')


# pyLDAvis

In [0]:
# Reload the artefacts written earlier in the notebook: the dictionary, the
# pickled bag-of-words corpus, and the 5-topic model.
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
# Close the pickle file deterministically instead of leaking the handle.
# pickle.load executes arbitrary code from the file, so only load a
# corpus.pkl that this notebook itself produced.
with open('corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')

In [20]:
pip install pyLDAvis

Collecting pyLDAvis
[?25l  Downloading https://files.pythonhosted.org/packages/a5/3a/af82e070a8a96e13217c8f362f9a73e82d61ac8fff3a2561946a97f96266/pyLDAvis-2.1.2.tar.gz (1.6MB)
[K    100% |████████████████████████████████| 1.6MB 13.8MB/s 
Collecting funcy (from pyLDAvis)
  Downloading https://files.pythonhosted.org/packages/47/a4/204fa23012e913839c2da4514b92f17da82bf5fc8c2c3d902fa3fa3c6eec/funcy-1.11-py2.py3-none-any.whl
Building wheels for collected packages: pyLDAvis
  Building wheel for pyLDAvis (setup.py) ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/98/71/24/513a99e58bb6b8465bae4d2d5e9dba8f0bef8179e3051ac414
Successfully built pyLDAvis
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-1.11 pyLDAvis-2.1.2


In [21]:
import pyLDAvis.gensim

# Prepare the interactive visualisation for the 5-topic model; topic order is
# kept as in the model (sort_topics=False) so ids match the printed topics.
lda_display = pyLDAvis.gensim.prepare(
    topic_model=lda,
    corpus=corpus,
    dictionary=dictionary,
    sort_topics=False,
)
pyLDAvis.display(lda_display)

In [22]:
# Same visualisation for the saved 10-topic model.
lda10 = gensim.models.ldamodel.LdaModel.load('model10.gensim')
lda_display10 = pyLDAvis.gensim.prepare(
    topic_model=lda10,
    corpus=corpus,
    dictionary=dictionary,
    sort_topics=False,
)
pyLDAvis.display(lda_display10)