## LDA in Gensim
https://radimrehurek.com/gensim/models/ldamodel.html

In [1]:
# Goal: clean the raw text and return it as a list of tokens.
# `parser` is the shared spaCy English pipeline object; `tokenize` calls it on
# each raw string and reads `orth_`, `like_url` and `lower_` from the tokens.
# NOTE(review): `spacy.en` looks like the spaCy 1.x import path -- confirm the
# installed spaCy version still provides it.

from spacy.en import English
parser = English()

def tokenize(text):
    """Split ``text`` into a list of normalised tokens for LDA.

    The text is run through the module-level ``parser``. Tokens that are pure
    whitespace are dropped; every other token is mapped as follows:

    * URL-like tokens become the placeholder ``'URL'``
    * tokens starting with ``'@'`` become the placeholder ``'SCREEN_NAME'``
    * anything else is replaced by its lower-cased form

    Returns:
        list of str, in the order the tokens appear in ``text``.
    """
    def normalise(tok):
        # Order matters: the URL check takes precedence over the '@' check.
        if tok.like_url:
            return 'URL'
        if tok.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return tok.lower_

    return [normalise(tok) for tok in parser(text) if not tok.orth_.isspace()]

In [2]:
# Smoke-test the tokenizer on a tweet-like sentence that contains a screen
# name, two hashtags and a URL, and show the resulting token list.
sent = '@bob said the #chicken was at the #junkyard. See http://www.jonathanmugan.com.'
out_tokens = tokenize(sent)
print(out_tokens)

['SCREEN_NAME', 'said', 'the', '#', 'chicken', 'was', 'at', 'the', '#', 'junkyard', '.', 'see', 'URL', '.']


In [3]:
# we want to lemmatize so dogs goes to dog and ran goes to run
# Lemmatization means to get the "dictionary entry" for a word

# Some documentation here http://www.nltk.org/howto/wordnet.html

from nltk.corpus import wordnet as wn
def get_lemma(word):
    """Return the WordNet base form of ``word``.

    Looks the word up with ``wn.morphy``; when that lookup yields ``None``
    the word is returned unchanged.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
# or can use this
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    """Alternative lemmatizer based on NLTK's ``WordNetLemmatizer``.

    A new lemmatizer instance is created on every call and ``lemmatize`` is
    invoked with its default arguments.
    """
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

In [4]:
# Compare the two lemmatizers side by side: each printed row is
# "<word> <get_lemma result> <get_lemma2 result>".
for w in ['dogs','ran','discouraged']:
    print(w,get_lemma(w),get_lemma2(w))

dogs dog dog
ran run ran
discouraged discourage discouraged


In [5]:
import nltk
# English stop words, stored as a set for fast membership tests when
# filtering tokens.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be
# available locally -- confirm.
en_stop = set(nltk.corpus.stopwords.words('english'))

In [6]:
def prepare_text_for_lda(text):
    """Turn raw ``text`` into the list of lemmas fed to the LDA model.

    Steps, applied to every token produced by ``tokenize``:

    1. keep only tokens longer than 4 characters (this also removes the
       ``'URL'`` placeholder, while ``'SCREEN_NAME'`` is long enough to stay),
    2. drop tokens found in the ``en_stop`` stop-word set,
    3. replace each remaining token with ``get_lemma(token)``.

    Returns:
        list of str, in original token order.
    """
    return [
        get_lemma(tok)
        for tok in tokenize(text)
        if len(tok) > 4 and tok not in en_stop
    ]

In [7]:
# Run the full preprocessing pipeline on one sentence: short words and stop
# words are removed and the remaining tokens are lemmatized.
sent = 'I enjoy going to restaurants to eat hamburgers.'
print(prepare_text_for_lda(sent))

['enjoy', 'going', 'restaurant', 'hamburger']


In [8]:
# get the data
import random
# Build the corpus: one preprocessed token list per line (tweet) of the file.
text_data = []
# Decode explicitly as UTF-8. The tweets contain non-ASCII characters (a
# typographic apostrophe shows up in the sample output), so relying on the
# platform default encoding would make this cell fail or mis-decode on
# systems whose default is not UTF-8.
with open('jonathan_mugan_tweets.txt', encoding='utf-8') as f:
    for line in f:
        tokens = prepare_text_for_lda(line)
        # Print roughly 5% of the documents as a spot check of the
        # preprocessing; which ones are shown varies from run to run.
        if random.random() > .95:
            print(tokens)
        text_data.append(tokens)
        

['watch', 'great', 'beauty', 'still', 'morning', 'world', 'begin']
['recently', 'attend', 'wedding', 'priest', 'sound', 'exactly', 'christopher', 'walken']
['recently', 'watch', 'transcendence', 'appear', 'johnny', 'assume', 'scientist', 'soulless', 'automaton', 'play']
['could', 'fully', 'trust', 'documentary', 'filmmaker', 'can’t', 'expect', 'people', 'spend', 'something', 'report']
['defensive', 'writing', 'write', 'point', 'across', 'avoid', 'people', 'dismiss', 'mention']
['twitter', 'whale', 'funny', 'appreciate', 'annoyance']
['interest', 'impression', 'world', 'movie', 'still', 'picture']
['everything', 'design', 'human', 'improvable', 'SCREEN_NAME']
['dream', 'feel', 'people', 'experience']
['learning', 'take', 'bravery', 'willing', 'stupid', 'question']
['install', 'software', 'spit', 'page', 'error', 'everything', 'sucessfully', 'instal']
['front', 'midnight', 'stroll', 'snake', 'sitting', 'looking', 'guess', 'watch', 'movie', 'instead']
['computer', 'enjoy', 'learning', 'al

In [9]:
# Create a gensim Dictionary from the tokenized documents; it is used below
# (via doc2bow) to turn token lists into (token_id, count) pairs.
from gensim import corpora
dictionary = corpora.Dictionary(text_data)

# A warning emitted when running this cell mentions that lemmatization can
# also be done through the Pattern library (the warning text itself is not
# captured in this notebook's output).



In [10]:
# Bag-of-words corpus: apply dictionary.doc2bow to every tokenized document,
# keeping the documents in their original order.
corpus = list(map(dictionary.doc2bow, text_data))

In [11]:
# save the corpus and dictionary, we will use these in another video to visualize
import pickle
# Write through a context manager so the file handle is flushed and closed as
# soon as the dump finishes, instead of leaking an open handle that is only
# closed whenever the interpreter gets around to collecting it.
with open("corpus.pkl", "wb") as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')

In [12]:
import gensim
NUM_TOPICS = 5
# LDA training starts from a random initialisation, so without a fixed seed
# the topic ids and word weights change on every run. Pinning random_state
# makes Restart & Run All reproduce the same model.
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=NUM_TOPICS,
                                           id2word=dictionary, passes=15,
                                           random_state=42)
# Persist the trained 5-topic model next to the notebook.
ldamodel.save('model5.gensim')

In [13]:
# Show every topic as a (topic_id, 'weight*"word" + ...') pair, limited to
# the 4 highest-weighted words per topic.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.014*"remember" + 0.013*"could" + 0.012*"amaze" + 0.008*"funny"')
(1, '0.011*"people" + 0.009*"seem" + 0.009*"think" + 0.008*"funny"')
(2, '0.009*"people" + 0.007*"funny" + 0.007*"really" + 0.007*"never"')
(3, '0.013*"dream" + 0.012*"would" + 0.010*"want" + 0.010*"watch"')
(4, '0.008*"child" + 0.008*"coffee" + 0.007*"people" + 0.007*"going"')


In [14]:
# try a new document
# In the recorded run below the document is mostly topic 3; topic ids depend
# on the random initialisation of the trained model, so read the id off the
# printed distribution rather than assuming it is always 3.
new_doc = 'I watch movies.'
# Same preprocessing as the training data; note that `new_doc` is rebound
# from the raw string to its list of tokens here.
new_doc = prepare_text_for_lda(new_doc)
# Map the tokens to (token_id, count) pairs using the training dictionary.
new_doc_bow = dictionary.doc2bow(new_doc)
print(new_doc_bow)
# Per-topic probabilities for the new document as (topic_id, probability).
print(ldamodel.get_document_topics(new_doc_bow))

[(8, 1), (191, 1)]
[(0, 0.068171450237185072), (1, 0.068203317548867867), (2, 0.066931796547873221), (3, 0.72843889002983986), (4, 0.068254545636233885)]


In [15]:
# try three topics
# NOTE: this rebinds `ldamodel`, replacing the previously trained model in
# memory (that one was saved to 'model5.gensim').
# random_state is pinned because LDA training is randomly initialised;
# without it the topics printed below differ on every run.
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=3,
                                           id2word=dictionary, passes=15,
                                           random_state=42)
ldamodel.save('model3.gensim')
# Show the 4 highest-weighted words of each topic.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.008*"dream" + 0.007*"never" + 0.007*"could" + 0.006*"things"')
(1, '0.010*"always" + 0.007*"going" + 0.007*"coffee" + 0.005*"could"')
(2, '0.016*"would" + 0.011*"people" + 0.010*"funny" + 0.008*"movie"')


In [16]:
# try ten topics
# NOTE: this rebinds `ldamodel` again; earlier models are only available from
# the files they were saved to.
# random_state is pinned because LDA training is randomly initialised;
# without it the topics printed below differ on every run.
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=10,
                                           id2word=dictionary, passes=15,
                                           random_state=42)
ldamodel.save('model10.gensim')
# Show the 4 highest-weighted words of each topic.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.018*"always" + 0.013*"instead" + 0.011*"really" + 0.011*"would"')
(1, '0.019*"try" + 0.012*"machine" + 0.011*"never" + 0.010*"means"')
(2, '0.018*"think" + 0.015*"funny" + 0.013*"would" + 0.011*"always"')
(3, '0.033*"movie" + 0.030*"watch" + 0.012*"make" + 0.010*"night"')
(4, '0.017*"could" + 0.015*"email" + 0.014*"place" + 0.012*"things"')
(5, '0.020*"would" + 0.013*"something" + 0.012*"wrong" + 0.012*"sense"')
(6, '0.019*"world" + 0.018*"remember" + 0.012*"clean" + 0.010*"dream"')
(7, '0.012*"water" + 0.012*"problem" + 0.012*"coffee" + 0.010*"spend"')
(8, '0.022*"people" + 0.018*"amaze" + 0.013*"every" + 0.011*"funny"')
(9, '0.016*"change" + 0.016*"funny" + 0.014*"computer" + 0.010*"memory"')


In [17]:
# Exercise: Run LDA on Newsgroup Data
# The Newsgroup Data
# http://scikit-learn.org/stable/datasets/twenty_newsgroups.html#newsgroups
from sklearn.datasets import fetch_20newsgroups
# Load only the training split of the 20 Newsgroups dataset.
texts = fetch_20newsgroups(subset='train')
# List the attributes available on the returned object.
print(dir(texts))
# 11,314 posts
print(len(texts.target))
# One integer label per post (presumably an index into target_names -- confirm).
print(texts.target)
# The newsgroup names.
print(texts.target_names)
# Raw text of the first post, including its headers.
print(texts.data[0])

['DESCR', 'data', 'description', 'filenames', 'target', 'target_names']
11314
[7 4 4 ..., 3 1 8]
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name