## LDA in Gensim
https://radimrehurek.com/gensim/models/ldamodel.html

In [1]:
%%time
# Goal: clean up the words in a text and return them as a list of tokens

#from spacy.en import English
#parser = English()

import spacy
# spaCy v3 removed shortcut names such as 'en', so load the English pipeline by
# its package name first. Fall back to the legacy 'en' link for older spaCy 2.x
# installs where only the shortcut is available.
try:
    parser = spacy.load('en_core_web_sm')
except OSError:
    parser = spacy.load('en')

def tokenize(text):
    """Split `text` into normalized tokens using the module-level `parser`.

    Whitespace-only tokens are dropped, URL-like tokens are replaced by the
    placeholder 'URL', tokens starting with '@' are replaced by 'SCREEN_NAME',
    and every other token is returned lower-cased.

    Returns a list of strings in the order the tokens appear in `text`.
    """
    def _normalize(tok):
        # URL detection takes precedence over the '@' check, as in the demo output.
        if tok.like_url:
            return 'URL'
        if tok.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return tok.lower_

    return [_normalize(tok) for tok in parser(text) if not tok.orth_.isspace()]

Wall time: 8.3 s


In [2]:
%%time
# Try the tokenizer on a tweet-like sentence containing a mention, hashtags and a URL.
sent = '@bob said the #chicken was at the #junkyard. See http://www.jonathanmugan.com.'
out_tokens = tokenize(sent)
# In the printed list '@bob' shows up as 'SCREEN_NAME', the link as 'URL',
# and each '#' is split off as its own token.
print(out_tokens)

['SCREEN_NAME', 'said', 'the', '#', 'chicken', 'was', 'at', 'the', '#', 'junkyard', '.', 'see', 'URL', '.']
Wall time: 110 ms


In [3]:
%%time
# we want to lemmatize so dogs goes to dog and ran goes to run
# Lemmatization means to get the "dictionary entry" for a word

# Some documentation here http://www.nltk.org/howto/wordnet.html

from nltk.corpus import wordnet as wn
def get_lemma(word):
    """Return the base form WordNet's `morphy` finds for `word`.

    When `wn.morphy` returns None (no base form found), `word` is returned
    unchanged.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
# or can use this
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    """Lemmatize `word` with NLTK's WordNetLemmatizer (alternative to `get_lemma`).

    A new lemmatizer is created on every call and `lemmatize` is invoked with
    the word only. In the comparison output of this notebook, verb forms such
    as 'ran' come back unchanged from this function -- presumably because no
    part of speech is passed; confirm against the NLTK docs.
    """
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

Wall time: 4.21 s


In [4]:
%%time
# Compare the two lemmatizers side by side.
# Each printed row is: original word, get_lemma result, get_lemma2 result.
for w in ['dogs','ran','discouraged']:
    print(w,get_lemma(w),get_lemma2(w))

dogs dog dog
ran run ran
discouraged discourage discouraged
Wall time: 2.51 s


In [5]:
%%time
import nltk
# English stop words from NLTK's stopwords corpus, stored as a set because
# prepare_text_for_lda tests membership for every token.
en_stop = set(nltk.corpus.stopwords.words('english'))

Wall time: 258 ms


In [6]:
%%time
def prepare_text_for_lda(text, min_token_length=5):
    """Turn raw `text` into the list of lemmas fed to the LDA model.

    The text is tokenized with `tokenize`. Tokens shorter than
    `min_token_length` characters and tokens found in `en_stop` are discarded,
    and the survivors are mapped through `get_lemma`.

    Parameters
    ----------
    text : str
        Raw document text.
    min_token_length : int, optional
        Minimum number of characters a token must have to be kept. The default
        of 5 reproduces the original hard-coded ``len(token) > 4`` filter.

    Returns
    -------
    list of str
        Lemmatized tokens in their original order.
    """
    # Length and stop-word filters run on the raw token, before lemmatization,
    # so a lemma can be shorter than `min_token_length` (e.g. 'books' -> 'book').
    return [
        get_lemma(token)
        for token in tokenize(text)
        if len(token) >= min_token_length and token not in en_stop
    ]

Wall time: 0 ns


In [7]:
%%time
# Run the whole preprocessing pipeline on one sentence. In the printed result
# the short words and the final '.' are gone and the plurals are reduced to
# their singular form.
sent = 'I enjoy going to restaurants to eat hamburgers.'
print(prepare_text_for_lda(sent))

['enjoy', 'going', 'restaurant', 'hamburger']
Wall time: 19.5 ms


In [9]:
%%time
# get the data
import random
# One preprocessed token list per line of the tweets file.
text_data = []
with open('jonathan_mugan_tweets.txt', encoding="utf8") as f:
    for line in f:
        tokens = prepare_text_for_lda(line)
        text_data.append(tokens)
        # Echo roughly one line in twenty so the preprocessing can be eyeballed.
        if random.random() > 0.95:
            print(tokens)
        

['yesterday', 'nothing', 'significant', 'product', 'epiphany', 'moment', 'today']
['finish', 'breakfast', 'champion', 'vonnegut', 'yesterday', 'book', 'love']
['first', 'breakfast', 'buy', 'another', 'airportfood']
['looking', 'refrigerator', 'harvest', 'world']
['tomato', 'really', 'settle', 'stomach', 'hope', 'would']
['think', 'another', 'century', 'actually', 'guess']
['amaze', 'story', 'make', 'experience', 'movie', 'times', 'place', 'association']
['drove', 'water', 'softener', 'front', 'hassle', 'carsarestupid']
['people', 'realize', 'dangerous', 'could', 'confuse', 'saving', 'cousin']
['cognitive', 'humility', 'crucial', 'adapt', 'world', 'constantly', 'change', 'curiosity', 'cycle']
['imagine', 'freeze', 'focus', 'reality', 'relationship', 'princess', 'hermit', 'doesnotendwell']
['want', 'movie', 'relax', 'could', 'worse', 'choice', 'different', 'perhaps']
['place', 'klout', 'wrong', 'perk', 'perk', 'potential', 'perk', 'advertisement']
['record', 'history', 'small', 'history'

In [10]:
%%time
# Create a gensim Dictionary from the tokenized tweets in text_data.
# It is used further down (via doc2bow) to turn token lists into
# bag-of-words vectors.
from gensim import corpora
dictionary = corpora.Dictionary(text_data)

# Warning message shows that you can also do lemmatization through Pattern



Wall time: 6.42 s


In [11]:
%%time
# Turn every tokenized tweet into its bag-of-words representation via the dictionary.
corpus = list(map(dictionary.doc2bow, text_data))

Wall time: 31 ms


In [12]:
%%time
# save the corpus and dictionary, we will use these in another video to visualize
import pickle
# Write the corpus through a context manager so the file is flushed and closed
# as soon as the dump finishes, instead of leaving an open handle behind.
with open("corpus.pkl", "wb") as corpus_file:
    pickle.dump(corpus, corpus_file)
# The dictionary is saved with gensim's own save().
dictionary.save('dictionary.gensim')

Wall time: 14 ms


In [13]:
%%time
import gensim
NUM_TOPICS = 5
# Train a 5-topic LDA model with 15 passes over the bag-of-words corpus.
# random_state pins the random initialization so that re-running the cell on
# the same corpus gives the same topics and topic ids; without it, comments
# that refer to a topic by number go stale after every retraining.
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=NUM_TOPICS,
                                           id2word=dictionary, passes=15,
                                           random_state=42)
# Persist the trained model to disk.
ldamodel.save('model5.gensim')

Wall time: 23.5 s


In [14]:
%%time
# Show the 4 highest-weighted words of each topic in the current model.
topics = ldamodel.print_topics(num_words=4)
# Each entry prints as a (topic_id, 'weight*"word" + ...') tuple.
for topic in topics:
    print(topic)

(0, '0.014*"want" + 0.011*"remember" + 0.011*"memory" + 0.009*"going"')
(1, '0.016*"coffee" + 0.009*"funny" + 0.007*"people" + 0.006*"would"')
(2, '0.012*"people" + 0.010*"amaze" + 0.009*"could" + 0.008*"think"')
(3, '0.010*"child" + 0.010*"people" + 0.008*"story" + 0.008*"funny"')
(4, '0.013*"watch" + 0.011*"would" + 0.008*"great" + 0.008*"funny"')
Wall time: 14.5 ms


In [15]:
%%time
# try a new document
# In the output shown below it is mostly topic 4 (probability ~0.73).
# Topic ids depend on the trained model, so the dominant id can differ
# after the model is retrained.
new_doc = 'I watch movies.'
# new_doc is rebound here: from the raw string to its list of lemmatized tokens.
new_doc = prepare_text_for_lda(new_doc)
new_doc_bow = dictionary.doc2bow(new_doc)
# (token_id, count) pairs
print(new_doc_bow)
# (topic_id, probability) pairs
print(ldamodel.get_document_topics(new_doc_bow))

[(8, 1), (191, 1)]
[(0, 0.067139052), (1, 0.067170478), (2, 0.068138503), (3, 0.068003409), (4, 0.72954857)]
Wall time: 19.5 ms


In [16]:
%%time
# try three topics
# NOTE: this rebinds `ldamodel`, replacing the model trained earlier in the
# notebook; later cells that use `ldamodel` will see this 3-topic model.
# random_state pins the random initialization so re-running the cell on the
# same corpus reproduces the same topics.
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=3,
                                           id2word=dictionary, passes=15,
                                           random_state=42)
ldamodel.save('model3.gensim')
# Show the 4 highest-weighted words per topic as (topic_id, description) tuples.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.012*"funny" + 0.011*"people" + 0.007*"dream" + 0.007*"computer"')
(1, '0.013*"would" + 0.008*"always" + 0.007*"think" + 0.006*"something"')
(2, '0.010*"could" + 0.010*"want" + 0.007*"watch" + 0.006*"remember"')
Wall time: 27.5 s


In [17]:
%%time
# try ten topics
# NOTE: this rebinds `ldamodel`, replacing the model trained earlier in the
# notebook; later cells that use `ldamodel` will see this 10-topic model.
# random_state pins the random initialization so re-running the cell on the
# same corpus reproduces the same topics.
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=10,
                                           id2word=dictionary, passes=15,
                                           random_state=42)
ldamodel.save('model10.gensim')
# Show the 4 highest-weighted words per topic as (topic_id, description) tuples.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.012*"interest" + 0.012*"drink" + 0.012*"video" + 0.011*"people"')
(1, '0.019*"movie" + 0.018*"watch" + 0.017*"going" + 0.012*"could"')
(2, '0.015*"amaze" + 0.012*"people" + 0.010*"could" + 0.010*"still"')
(3, '0.013*"night" + 0.012*"around" + 0.010*"someone" + 0.010*"funny"')
(4, '0.022*"something" + 0.016*"funny" + 0.011*"would" + 0.010*"wrong"')
(5, '0.027*"dream" + 0.024*"remember" + 0.015*"memory" + 0.014*"want"')
(6, '0.018*"people" + 0.013*"think" + 0.011*"anyone" + 0.010*"picture"')
(7, '0.018*"coffee" + 0.011*"young" + 0.009*"reading" + 0.009*"china"')
(8, '0.015*"always" + 0.013*"make" + 0.012*"anything" + 0.010*"actually"')
(9, '0.025*"would" + 0.017*"computer" + 0.014*"people" + 0.012*"ask"')
Wall time: 21.5 s


In [18]:
%%time
# Exercise: Run LDA on Newsgroup Data
# The Newsgroup Data
# http://scikit-learn.org/stable/datasets/twenty_newsgroups.html#newsgroups
from sklearn.datasets import fetch_20newsgroups
# Load the training subset of the 20 newsgroups dataset.
texts = fetch_20newsgroups(subset='train')
# List the attributes available on the returned object.
print(dir(texts))
# 11,314 posts
print(len(texts.target))
# One integer label per post; each value appears to index into target_names
# (e.g. 7 -> 'rec.autos' for the first post, which is about a car).
print(texts.target)
# The 20 newsgroup names.
print(texts.target_names)
# Raw text of the first post, headers included.
print(texts.data[0])

['DESCR', 'data', 'description', 'filenames', 'target', 'target_names']
11314
[7 4 4 ..., 3 1 8]
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name