In [12]:
import spacy
from stop_words import get_stop_words
# NOTE(review): the pipeline returned by spacy.load('en') is not bound to a
# name, so nothing below uses it; the call only has whatever side effects
# loading has. Confirm whether it is still needed.
spacy.load('en')
from spacy.lang.en import English
# `parser` is the callable that tokenize() applies to raw text.
parser = English()

In [13]:
def tokenize(text):
    """Split ``text`` with the module-level ``parser`` and normalise each token.

    Whitespace-only tokens are dropped. Of the remaining tokens:

    * a token whose ``like_url`` flag is truthy becomes the placeholder ``'URL'``;
    * otherwise, a token whose text starts with ``'@'`` becomes ``'SCREEN_NAME'``;
    * every other token is emitted as its lower-cased text (``lower_``).

    Returns:
        list[str]: the normalised tokens, in document order.
    """
    def _normalise(tok):
        # The URL check takes precedence over the '@' check.
        if tok.like_url:
            return 'URL'
        if tok.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return tok.lower_

    return [_normalise(tok) for tok in parser(text) if not tok.orth_.isspace()]

In [14]:
import nltk
# Request the 'wordnet' package through NLTK's downloader before importing
# the corpus reader; `wn` is what get_lemma() calls morphy() on.
nltk.download('wordnet')
from nltk.corpus import wordnet as wn

[nltk_data] Downloading package wordnet to /home/akul/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [15]:
def get_lemma(word):
    """Return the base form of ``word`` according to ``wn.morphy``.

    If ``wn.morphy`` returns ``None``, ``word`` itself is returned unchanged.
    """
    base_form = wn.morphy(word)
    # Only None triggers the fallback; any other result is returned as-is.
    return word if base_form is None else base_form

In [16]:
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    """Return ``word`` lemmatised by a newly constructed ``WordNetLemmatizer``."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

In [17]:
def prepare_text_for_lda(text):
    """Turn raw ``text`` into the list of tokens fed to the LDA model.

    The text is split with ``tokenize``. A token is kept only if it is longer
    than four characters and is not in the English stop-word list returned by
    ``get_stop_words('en')``. Each surviving token is then passed through
    ``get_lemma``.

    Returns:
        list: the lemmatised tokens, in their original order.
    """
    candidates = tokenize(text)
    stop_words = set(get_stop_words('en'))
    return [
        get_lemma(candidate)
        for candidate in candidates
        # Length filter first, then stop-word filter.
        if len(candidate) > 4 and candidate not in stop_words
    ]

In [18]:
# One token list per line of video2.txt; the literal '%HESITATION' marker is
# removed from each line before it is tokenised.
text_data = []
with open('video2.txt') as transcript:
    text_data.extend(
        prepare_text_for_lda(raw_line.replace('%HESITATION', ''))
        for raw_line in transcript
    )

In [19]:
from gensim import corpora
import pickle

# Build the dictionary from the tokenised documents, then convert every
# document to its bag-of-words representation via doc2bow.
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

# Persist both artefacts. The `with` block guarantees the pickle file is
# flushed and closed; previously the handle from open() was never closed.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')

In [20]:
import gensim
NUM_TOPICS = 5
# LDA training is stochastic; a fixed seed makes a fresh-kernel re-run
# produce the same topics instead of a different set each time.
RANDOM_SEED = 42
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=RANDOM_SEED,
)
ldamodel.save('model5.gensim')
# Show the four highest-weighted words of each topic.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.023*"think" + 0.017*"designer" + 0.015*"design" + 0.015*"things"')
(1, '0.001*"think" + 0.001*"designer" + 0.001*"right" + 0.001*"people"')
(2, '0.001*"think" + 0.001*"designer" + 0.001*"things" + 0.001*"people"')
(3, '0.001*"think" + 0.001*"designer" + 0.001*"actually" + 0.001*"things"')
(4, '0.001*"think" + 0.001*"design" + 0.001*"things" + 0.001*"people"')


In [21]:
# Run an unseen title through the same preprocessing as the training text,
# map it onto the existing dictionary, and ask the model for its topic mix.
new_doc = prepare_text_for_lda(
    'Practical Bayesian Optimization of Machine Learning Algorithms'
)
new_doc_bow = dictionary.doc2bow(new_doc)
print(new_doc_bow)
new_doc_topics = ldamodel.get_document_topics(new_doc_bow)
print(new_doc_topics)

[(516, 1), (620, 1)]
[(0, 0.7324773), (1, 0.0668801), (2, 0.0668836), (3, 0.06687914), (4, 0.066879846)]


In [22]:
# Retrain with 3 topics for comparison. NOTE: this rebinds `ldamodel`, so any
# cell run after this one sees the 3-topic model rather than the 5-topic one.
# random_state pins the otherwise stochastic training so that a fresh-kernel
# re-run reproduces the same topics.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=3,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
ldamodel.save('model3.gensim')
# Show the four highest-weighted words of each topic.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.001*"think" + 0.001*"things" + 0.001*"people" + 0.001*"designer"')
(1, '0.001*"think" + 0.001*"things" + 0.001*"right" + 0.001*"design"')
(2, '0.022*"think" + 0.016*"designer" + 0.015*"design" + 0.015*"things"')
