In [0]:
import spacy
from spacy.lang.en import English

# A blank English pipeline is sufficient here: the notebook only relies on
# tokenization and lexical flags (e.g. `like_url`), so no trained model has to
# be loaded with `spacy.load(...)`.
parser = English()

def tokenize(text):
    """Split ``text`` into normalized tokens for topic modelling.

    Whitespace-only tokens are discarded. URL-like tokens become the
    placeholder ``'URL'``, tokens starting with ``'@'`` become
    ``'SCREEN_NAME'``, and everything else is lower-cased.

    Returns a list of strings in document order.
    """
    def normalize(token):
        # The URL check takes precedence over the '@' check.
        if token.like_url:
            return 'URL'
        if token.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return token.lower_

    return [
        normalize(token)
        for token in parser(text)
        if not token.orth_.isspace()
    ]

In [0]:
import nltk
# Fetch the WordNet corpus; the lemmatization helpers in this notebook import
# `nltk.corpus.wordnet`, which needs this data to be present locally.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [0]:
from nltk.corpus import wordnet as wn
def get_lemma(word):
    """Return the WordNet ``morphy`` base form of ``word``.

    When ``wn.morphy`` finds no base form (returns ``None``), the word is
    returned unchanged.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word, pos='n'):
    """Lemmatize ``word`` with NLTK's ``WordNetLemmatizer``.

    Parameters
    ----------
    word : str
        The token to lemmatize.
    pos : str, optional
        Part-of-speech tag forwarded to ``WordNetLemmatizer.lemmatize``.
        Defaults to ``'n'`` (noun), which is what the lemmatizer uses when no
        tag is given, so existing single-argument calls behave as before.
        Pass ``'v'`` to lemmatize verbs.

    Returns
    -------
    str
        Whatever ``WordNetLemmatizer.lemmatize`` returns for the given word
        and part of speech.
    """
    return WordNetLemmatizer().lemmatize(word, pos=pos)

In [0]:
# Print each sample word next to the result of both lemmatization helpers.
for w in ('dogs', 'ran', 'discouraged'):
    print(w, *(lemmatize(w) for lemmatize in (get_lemma, get_lemma2)))

dogs dog dog
ran run ran
discouraged discourage discouraged


In [0]:
# Download the stop-word lists, then keep the English ones in a set so that
# membership tests during preprocessing are fast.
nltk.download('stopwords')
en_stop = {*nltk.corpus.stopwords.words('english')}

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [0]:
def prepare_text_for_lda(text):
    """Turn raw ``text`` into the list of tokens fed to the topic model.

    Steps, applied in order to each token produced by ``tokenize``:

    1. keep only tokens longer than four characters (so the three-character
       ``'URL'`` placeholder emitted by ``tokenize`` never survives);
    2. drop tokens found in ``en_stop``;
    3. replace each remaining token with ``get_lemma(token)``.
    """
    return [
        get_lemma(candidate)
        for candidate in tokenize(text)
        if len(candidate) > 4 and candidate not in en_stop
    ]

In [0]:
from google.colab import files
# Colab-only step: prompts for a file upload from the local machine. In the
# recorded run the uploaded file was saved as `poetry.csv`, the path read by
# the sampling cell later in the notebook.
uploaded = files.upload()

Saving poetry.csv to poetry.csv


In [0]:
import random

# A line is kept only when the random draw exceeds this value, i.e. roughly
# 10% of the input lines end up in the training data.
SAMPLE_THRESHOLD = .90
# Dedicated, seeded generator so the same lines are selected on every
# Restart & Run All (the global `random` state is left untouched).
sampler = random.Random(42)

text_data = []
with open('poetry.csv') as f:
    for line in f:
        # Draw first: lines that are not selected are skipped before the
        # comparatively expensive tokenization/lemmatization step.
        if sampler.random() > SAMPLE_THRESHOLD:
            tokens = prepare_text_for_lda(line)
            print(tokens)
            text_data.append(tokens)

['unruly', 'scrawl']
[]
['worldpoetryday']
['worldpoetryday']
['beautiful', 'reflection', 'charlotte', 'delbo', 'auschwitz', 'survivor']
[]
['SCREEN_NAME', 'rhyme']
['worldpoetryday']
['SCREEN_NAME', 'courageous']
['SCREEN_NAME', 'worldpoetryday', 'hashtagmu', 'bukan']
['recent', 'relax', 'spread']
['greenness']
['SCREEN_NAME', 'brexit']
[]
[]
[]
[]
['dream']
['tread', 'softly', 'tread']
['seem', 'sound']
[]
['another']
['worldpoetryday']
['robert', 'junior']
['eyesight', 'fail']
['violet']


In [0]:
from gensim import corpora
# Build the gensim vocabulary (token <-> integer id mapping) from the sampled,
# tokenized documents collected in `text_data`.
dictionary = corpora.Dictionary(text_data)

In [0]:
# Convert every tokenized document into its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, text_data))

In [0]:
import pickle

# Persist the bag-of-words corpus. The context manager guarantees the file is
# flushed and closed even if pickling fails part-way through.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
# The dictionary uses gensim's own serialization.
dictionary.save('dictionary.gensim')

In [0]:
import gensim

NUM_TOPICS = 5
# LDA training is stochastic; fixing `random_state` makes the learned topics
# reproducible from one run of the notebook to the next.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
ldamodel.save('model5.gensim')

In [0]:
# Show each topic as (topic id, weighted-term string) limited to its 4
# highest-weighted words.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.289*"worldpoetryday" + 0.122*"SCREEN_NAME" + 0.067*"hashtagmu" + 0.067*"bukan"')
(1, '0.080*"delbo" + 0.080*"reflection" + 0.080*"charlotte" + 0.080*"survivor"')
(2, '0.086*"relax" + 0.086*"recent" + 0.086*"spread" + 0.086*"junior"')
(3, '0.134*"SCREEN_NAME" + 0.133*"brexit" + 0.133*"violet" + 0.022*"worldpoetryday"')
(4, '0.183*"tread" + 0.100*"softly" + 0.100*"courageous" + 0.100*"dream"')


# Inferring topics for a new document

In [0]:
# Run an unseen title through the same preprocessing as the training data,
# then map it onto the existing vocabulary and ask the model for its topics.
# NOTE(review): the result depends on which model `ldamodel` is bound to when
# this cell runs; later cells rebind that name to models with other topic
# counts, so run the notebook top to bottom.
new_doc = prepare_text_for_lda(
    'Practical Bayesian Optimization of Machine Learning Algorithms'
)
new_doc_bow = dictionary.doc2bow(new_doc)
print(new_doc_bow)
new_doc_topics = ldamodel.get_document_topics(new_doc_bow)
print(new_doc_topics)

[]
[(0, 0.1), (1, 0.1), (2, 0.1), (3, 0.1), (4, 0.1), (5, 0.1), (6, 0.1), (7, 0.1), (8, 0.1), (9, 0.1)]


In [0]:
# Train a 3-topic model for comparison. `random_state` is fixed so the topics
# are reproducible across runs.
# Note: this rebinds `ldamodel`, so any cell executed afterwards that reads
# `ldamodel` sees the 3-topic model.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=3,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
ldamodel.save('model3.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.206*"worldpoetryday" + 0.052*"recent" + 0.052*"spread" + 0.052*"relax"')
(1, '0.106*"tread" + 0.061*"auschwitz" + 0.061*"survivor" + 0.061*"beautiful"')
(2, '0.173*"SCREEN_NAME" + 0.066*"brexit" + 0.066*"fail" + 0.066*"eyesight"')


In [0]:
# Train a 10-topic model for comparison. `random_state` is fixed so the topics
# are reproducible across runs.
# Note: this rebinds `ldamodel`, so any cell executed afterwards that reads
# `ldamodel` sees the 10-topic model.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=10,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
ldamodel.save('model10.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.275*"greenness" + 0.025*"worldpoetryday" + 0.025*"SCREEN_NAME" + 0.025*"another"')
(1, '0.100*"recent" + 0.100*"relax" + 0.100*"spread" + 0.100*"sound"')
(2, '0.138*"SCREEN_NAME" + 0.137*"bukan" + 0.137*"hashtagmu" + 0.137*"dream"')
(3, '0.034*"worldpoetryday" + 0.033*"SCREEN_NAME" + 0.033*"dream" + 0.033*"greenness"')
(4, '0.220*"scrawl" + 0.220*"unruly" + 0.020*"worldpoetryday" + 0.020*"SCREEN_NAME"')
(5, '0.455*"worldpoetryday" + 0.122*"eyesight" + 0.122*"fail" + 0.011*"SCREEN_NAME"')
(6, '0.233*"tread" + 0.122*"softly" + 0.122*"robert" + 0.122*"junior"')
(7, '0.220*"SCREEN_NAME" + 0.220*"rhyme" + 0.020*"worldpoetryday" + 0.020*"another"')
(8, '0.122*"reflection" + 0.122*"auschwitz" + 0.122*"charlotte" + 0.122*"survivor"')
(9, '0.220*"SCREEN_NAME" + 0.220*"brexit" + 0.020*"worldpoetryday" + 0.020*"dream"')


# Visualizing the topics with pyLDAvis

In [0]:
# Reload the artifacts written earlier in the notebook so the visualization
# works from the files on disk.
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
# SECURITY: unpickling can execute arbitrary code; only load a corpus.pkl that
# this notebook produced itself. The context manager closes the file handle.
with open('corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')

In [0]:
pip install pyLDAvis

Collecting pyLDAvis
[?25l  Downloading https://files.pythonhosted.org/packages/a5/3a/af82e070a8a96e13217c8f362f9a73e82d61ac8fff3a2561946a97f96266/pyLDAvis-2.1.2.tar.gz (1.6MB)
[K    100% |████████████████████████████████| 1.6MB 8.2MB/s 
Collecting funcy (from pyLDAvis)
  Downloading https://files.pythonhosted.org/packages/47/a4/204fa23012e913839c2da4514b92f17da82bf5fc8c2c3d902fa3fa3c6eec/funcy-1.11-py2.py3-none-any.whl
Building wheels for collected packages: pyLDAvis
  Building wheel for pyLDAvis (setup.py) ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/98/71/24/513a99e58bb6b8465bae4d2d5e9dba8f0bef8179e3051ac414
Successfully built pyLDAvis
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-1.11 pyLDAvis-2.1.2


In [0]:
import pyLDAvis.gensim

# Prepare the interactive view for the reloaded model `lda`; sorting is
# disabled so topics keep the order the model assigns them.
lda_display = pyLDAvis.gensim.prepare(
    lda,
    corpus,
    dictionary,
    sort_topics=False,
)
pyLDAvis.display(lda_display)

In [0]:
# Same visualization for the 10-topic model, reloaded from disk so it does not
# depend on whatever `ldamodel` currently refers to.
lda10 = gensim.models.ldamodel.LdaModel.load('model10.gensim')
lda_display10 = pyLDAvis.gensim.prepare(
    lda10,
    corpus,
    dictionary,
    sort_topics=False,
)
pyLDAvis.display(lda_display10)