## LDA in Gensim
https://radimrehurek.com/gensim/models/ldamodel.html

In [1]:
# Goal of this cell: clean up raw text and return it as a list of tokens.

# Alternative way of building the parser, kept for reference (disabled):
#from spacy.en import English
#parser = English()

import spacy
# Shared spaCy pipeline; tokenize() below calls it on the text it is given.
parser = spacy.load('en')

def tokenize(text):
    """Split ``text`` into normalised tokens for LDA.

    The text is run through the module-level spaCy ``parser``.  Whitespace
    tokens are dropped, URL-like tokens become the placeholder ``'URL'``,
    tokens starting with ``@`` become ``'SCREEN_NAME'``, and every other
    token is replaced by its lower-cased form.

    Returns a list of token strings in document order.
    """
    def normalise(tok):
        # The URL check takes precedence over the screen-name check.
        if tok.like_url:
            return 'URL'
        if tok.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return tok.lower_

    return [normalise(tok) for tok in parser(text) if not tok.orth_.isspace()]

In [2]:
# A lot of times you will get text from some unknown encoding. 
# UTF-8 is the most common representation.
# If you need to catch errors, you can remove errors='ignore'
def convert_unicode(text):
    """Return ``text`` as a unicode string, decoding UTF-8 byte strings.

    Byte strings are decoded as UTF-8 with ``errors='ignore'``, so bytes
    that are not valid UTF-8 are silently dropped.  Any other value
    (already-decoded text, ``None``, ...) is returned unchanged.

    The check uses ``bytes`` rather than ``str``: on Python 2 the two names
    refer to the same type, so behaviour there is unchanged, while on
    Python 3 ``str`` is already unicode and has no ``decode`` method.
    """
    if isinstance(text, bytes):
        return text.decode('utf-8', errors='ignore')
    return text

In [3]:
# Example containing a screen name, two hashtags and a URL.
sent = '@bob said the #chicken was at the #junkyard. See http://www.jonathanmugan.com.'
sent_unicode = convert_unicode(sent)
out_tokens = tokenize(sent_unicode)
print(out_tokens)

['SCREEN_NAME', u'said', u'the', u'#', u'chicken', u'was', u'at', u'the', u'#', u'junkyard', u'.', u'see', 'URL', u'.']


In [5]:
# we want to lemmatize so dogs goes to dog and ran goes to run
# Lemmatization means to get the "dictionary entry" for a word

# Some documentation here http://www.nltk.org/howto/wordnet.html

from nltk.corpus import wordnet as wn
def get_lemma(word):
    """Look up the base form of ``word`` via ``wn.morphy``.

    Returns the form reported by ``wn.morphy``; when it reports ``None``
    the word itself is returned unchanged.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
# or can use this
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    """Lemmatize ``word`` with a freshly created ``WordNetLemmatizer``.

    Returns whatever ``WordNetLemmatizer.lemmatize`` yields for the word
    when called with only the word as argument.
    """
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

In [6]:
# Compare both lemmatizers side by side on a few sample words.
sample_words = ['dogs','ran','discouraged']
for w in sample_words:
    morphy_lemma = get_lemma(w)
    wordnet_lemma = get_lemma2(w)
    print(w,morphy_lemma,wordnet_lemma)

('dogs', u'dog', u'dog')
('ran', u'run', 'ran')
('discouraged', u'discourage', 'discouraged')


In [7]:
import nltk
# English stop words, kept as a set for membership tests.
english_stop_words = nltk.corpus.stopwords.words('english')
en_stop = set(english_stop_words)

In [13]:
def prepare_text_for_lda(text, min_token_length=5):
    """Turn raw ``text`` into the list of tokens that is fed to LDA.

    Steps, applied per token in this order:
      1. tokenize the text with ``tokenize``;
      2. drop tokens shorter than ``min_token_length`` characters;
      3. drop tokens found in the ``en_stop`` stop-word set;
      4. replace each remaining token by ``get_lemma(token)``.

    Args:
        text: the text to process, in whatever form ``tokenize`` accepts.
        min_token_length: minimum number of characters a token needs in
            order to be kept.  The default of 5 keeps only tokens longer
            than four characters.

    Returns:
        A list of lemmatized tokens in their original order.
    """
    return [
        get_lemma(token)
        for token in tokenize(text)
        if len(token) >= min_token_length and token not in en_stop
    ]

In [14]:
# Run the full preparation pipeline on a sample sentence.
sent = 'I enjoy going to restaurants to eat hamburgers.'
lda_ready_tokens = prepare_text_for_lda(convert_unicode(sent))
print(lda_ready_tokens)

[u'enjoy', u'going', u'restaurant', u'hamburger']


In [18]:
%%time
# get the data
import random
text_data = []
with open('jonathan_mugan_tweets.txt') as f:
    for line in f:
        tokens = prepare_text_for_lda(convert_unicode(line))
        if random.random() > .95:
            print(tokens)
        text_data.append(tokens)
        

[u'recently', u'finish', u'star', u'peter', u'heller', u'beautiful', u'especially', u'middle', u'enjoy', u'apocalypse']
[u'remember', u'waiting', u'people', u'phone', u'would', u'sometimes', u'amaze']
[u'still', u'diner', u'cards']
[u'coach', u'announce', u'soccer', u'practice', u'optional', u'going', u'optional']
[u'recently', u'watch', u'stripe', u'pajama']
[u'buy', u'bottle', u'whole', u'food', u'try', u'confirm', u'appropriately', u'price']
[u'recently', u'watch', u'pretty', u'movie', u'wilson', u'tiger']
[u'drove', u'water', u'softener', u'front', u'hassle', u'carsarestupid']
[u'attest', u'derisive', u'weekend', u'warrior', u'usually', u'recover', u'sunday', u'soccer', u'wednesday', u'oldman']
[u'culture', u'tip', u'things', u'valet', u'parking', u'culture', u'carry', u'something']
[u'spend', u'try', u'little', u'notification', u'icon', u'skinner']
[u'expose', u'travel', u'always', u'engross', u'thing', u'important']
[u'taste', u'think', u'taking', u'omega-3', u'thing', u'little']

In [19]:
# Create a gensim Dictionary from the tokenized documents in text_data.
# Its doc2bow method is what later converts documents to bag-of-words form.
from gensim import corpora
dictionary = corpora.Dictionary(text_data)

# Note: the warning message shown when running this cell indicates that
# lemmatization can also be done through Pattern.



In [20]:
# convert every tokenized document to its bag-of-words representation
corpus = list(map(dictionary.doc2bow, text_data))

In [21]:
%%time
# save the corpus and dictionary, we will use these in another video to visualize
import pickle
pickle.dump( corpus, open( "corpus-py27.pkl", "wb" ) )
dictionary.save('dictionary-py27.gensim')

Wall time: 171 ms


In [22]:
%%time
import gensim
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=NUM_TOPICS,
                                           id2word= dictionary, passes = 15)
ldamodel.save('model5-py27.gensim')

Wall time: 25.5 s


In [23]:
%%time
# Print every topic as an (id, 'weight*"word" + ...') pair,
# showing 4 words per topic.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, u'0.010*"watch" + 0.008*"think" + 0.008*"going" + 0.008*"wonder"')
(1, u'0.014*"people" + 0.010*"coffee" + 0.009*"would" + 0.007*"picture"')
(2, u'0.010*"people" + 0.009*"change" + 0.008*"write" + 0.007*"funny"')
(3, u'0.012*"funny" + 0.012*"dream" + 0.007*"child" + 0.006*"anymore"')
(4, u'0.012*"would" + 0.011*"could" + 0.011*"remember" + 0.009*"never"')
Wall time: 16 ms


In [26]:
%%time
# try a new document
# we see it is mostly topic 3
new_doc = 'I watch movies.'
new_doc = prepare_text_for_lda(convert_unicode(new_doc))
new_doc_bow = dictionary.doc2bow(new_doc)
print(new_doc_bow)
print(ldamodel.get_document_topics(new_doc_bow))

[(3, 1), (190, 1)]
[(0, 0.73031123020059985), (1, 0.067371280336751493), (2, 0.067096699075636451), (3, 0.067087415721834839), (4, 0.068133374665177465)]
Wall time: 88 ms


In [27]:
%%time
# try three topics
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=3,
                                           id2word= dictionary, passes = 15)
ldamodel.save('model3-py27.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, u'0.010*"would" + 0.010*"remember" + 0.009*"could" + 0.007*"funny"')
(1, u'0.013*"movie" + 0.012*"watch" + 0.011*"people" + 0.010*"would"')
(2, u'0.010*"dream" + 0.007*"people" + 0.007*"reading" + 0.006*"amaze"')
Wall time: 27.6 s


In [28]:
%%time
# try ten topics
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=10,
                                           id2word= dictionary, passes = 15)
ldamodel.save('model10-py27.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, u'0.014*"child" + 0.013*"sugar" + 0.010*"movie" + 0.009*"perfect"')
(1, u'0.014*"could" + 0.011*"google" + 0.009*"getting" + 0.009*"people"')
(2, u'0.015*"enough" + 0.014*"funny" + 0.013*"really" + 0.013*"watch"')
(3, u'0.018*"could" + 0.016*"place" + 0.013*"first" + 0.011*"things"')
(4, u'0.021*"funny" + 0.015*"something" + 0.014*"always" + 0.014*"seem"')
(5, u'0.030*"remember" + 0.019*"want" + 0.017*"memory" + 0.014*"friend"')
(6, u'0.013*"people" + 0.012*"think" + 0.011*"commercial" + 0.010*"change"')
(7, u'0.022*"would" + 0.016*"people" + 0.014*"coffee" + 0.014*"wonder"')
(8, u'0.020*"robot" + 0.014*"machine" + 0.013*"every" + 0.012*"person"')
(9, u'0.016*"dream" + 0.015*"thing" + 0.015*"around" + 0.011*"would"')
Wall time: 26.8 s


In [29]:
%%time
# Exercise: Run LDA on Newsgroup Data
# The Newsgroup Data is documented at
# http://scikit-learn.org/stable/datasets/twenty_newsgroups.html#newsgroups
from sklearn.datasets import fetch_20newsgroups
# fetch the training subset only
texts = fetch_20newsgroups(subset='train')
# list the attributes available on the returned object
print(dir(texts))
# number of entries in the target array: 11,314 posts
print(len(texts.target))
# the target array, followed by the list of newsgroup names
print(texts.target)
print(texts.target_names)
# raw text of the first post
print(texts.data[0])

No handlers could be found for logger "sklearn.datasets.twenty_newsgroups"


['DESCR', 'data', 'description', 'filenames', 'target', 'target_names']
11314
[7 4 4 ..., 3 1 8]
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name