# Topic Modeling

In [2]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np, pandas as pd
import re, random, os, string

from nltk.tokenize import word_tokenize
from pprint import pprint

### Quick and dirty LSA on small data

In [1]:
TextCorpus = [
"the sky is blue",
"the sun is bright",
"the sun in the sky is bright",
"we can see the shining sun, the bright sun"
]

In [3]:
text_tokens = [word_tokenize(sent) for sent in TextCorpus]
text_tokens

[['the', 'sky', 'is', 'blue'],
 ['the', 'sun', 'is', 'bright'],
 ['the', 'sun', 'in', 'the', 'sky', 'is', 'bright'],
 ['we', 'can', 'see', 'the', 'shining', 'sun', ',', 'the', 'bright', 'sun']]

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
transformer = TfidfVectorizer()
tfidf = transformer.fit_transform(TextCorpus) 

In [6]:
tfidf.todense()

matrix([[0.65919112, 0.        , 0.        , 0.        , 0.42075315,
         0.        , 0.        , 0.51971385, 0.        , 0.34399327,
         0.        ],
        [0.        , 0.52210862, 0.        , 0.        , 0.52210862,
         0.        , 0.        , 0.        , 0.52210862, 0.42685801,
         0.        ],
        [0.        , 0.3218464 , 0.        , 0.50423458, 0.3218464 ,
         0.        , 0.        , 0.39754433, 0.3218464 , 0.52626104,
         0.        ],
        [0.        , 0.23910199, 0.37459947, 0.        , 0.        ,
         0.37459947, 0.37459947, 0.        , 0.47820398, 0.39096309,
         0.37459947]])

In [7]:
from sklearn.decomposition import TruncatedSVD

In [8]:
svd = TruncatedSVD(n_components = 3)
lsa = svd.fit_transform(tfidf)

In [9]:
lsa

array([[ 0.62306129,  0.69703778,  0.34250357],
       [ 0.88015639, -0.15398603, -0.30658728],
       [ 0.89278882,  0.12337056, -0.25257634],
       [ 0.67267265, -0.60788755,  0.41913607]])

#### Can measure association using cosine similarity

In [11]:
from sklearn.metrics.pairwise import cosine_similarity

In [12]:
cosine_similarity(lsa[0,:].reshape(1,-1),lsa[1,:].reshape(1,-1))

array([[0.35728045]])

In [13]:
cosine_similarity(lsa)

array([[1.        , 0.35728045, 0.59632771, 0.1397158 ],
       [0.35728045, 1.        , 0.95480174, 0.59048322],
       [0.59632771, 0.95480174, 1.        , 0.44891338],
       [0.1397158 , 0.59048322, 0.44891338, 1.        ]])

## Using LDA package on clean, dummy dataset

In [2]:
import numpy as np
import lda

In [3]:
X = lda.datasets.load_reuters()
vocab = lda.datasets.load_reuters_vocab()
titles = lda.datasets.load_reuters_titles()

In [4]:
titles

('0 UK: Prince Charles spearheads British royal revolution. LONDON 1996-08-20',
 '1 GERMANY: Historic Dresden church rising from WW2 ashes. DRESDEN, Germany 1996-08-21',
 "2 INDIA: Mother Teresa's condition said still unstable. CALCUTTA 1996-08-23",
 '3 UK: Palace warns British weekly over Charles pictures. LONDON 1996-08-25',
 '4 INDIA: Mother Teresa, slightly stronger, blesses nuns. CALCUTTA 1996-08-25',
 "5 INDIA: Mother Teresa's condition unchanged, thousands pray. CALCUTTA 1996-08-25",
 '6 INDIA: Mother Teresa shows signs of strength, blesses nuns. CALCUTTA 1996-08-26',
 "7 INDIA: Mother Teresa's condition improves, many pray. CALCUTTA, India 1996-08-25",
 '8 INDIA: Mother Teresa improves, nuns pray for "miracle". CALCUTTA 1996-08-26',
 '9 UK: Charles under fire over prospect of Queen Camilla. LONDON 1996-08-26',
 '10 UK: Britain tells Charles to forget Camilla. LONDON 1996-08-27',
 "11 COTE D'IVOIRE: FEATURE - Quiet homecoming for reprieved Ivory Coast maid. ABIDJAN 1996-08-28",


In [5]:
X.shape

(395, 4258)

In [6]:
?lda.LDA

In [7]:
%%time
model = lda.LDA(n_topics=20, n_iter=500, random_state=1)
model.fit(X)

INFO:lda:n_documents: 395
INFO:lda:vocab_size: 4258
INFO:lda:n_words: 84010
INFO:lda:n_topics: 20
INFO:lda:n_iter: 500
INFO:lda:<0> log likelihood: -1051748
INFO:lda:<10> log likelihood: -719800
INFO:lda:<20> log likelihood: -699115
INFO:lda:<30> log likelihood: -689370
INFO:lda:<40> log likelihood: -684918
INFO:lda:<50> log likelihood: -681322
INFO:lda:<60> log likelihood: -678979
INFO:lda:<70> log likelihood: -676598
INFO:lda:<80> log likelihood: -675383
INFO:lda:<90> log likelihood: -673316
INFO:lda:<100> log likelihood: -672761
INFO:lda:<110> log likelihood: -671320
INFO:lda:<120> log likelihood: -669744
INFO:lda:<130> log likelihood: -669292
INFO:lda:<140> log likelihood: -667940
INFO:lda:<150> log likelihood: -668038
INFO:lda:<160> log likelihood: -667429
INFO:lda:<170> log likelihood: -666475
INFO:lda:<180> log likelihood: -665562
INFO:lda:<190> log likelihood: -664920
INFO:lda:<200> log likelihood: -664979
INFO:lda:<210> log likelihood: -664722
INFO:lda:<220> log likelihood: -6

Wall time: 6.51 s


In [8]:
topic_word = model.topic_word_

In [10]:
len(topic_word[0])

4258

In [11]:
n_top_words = 10

In [12]:
[np.argsort(topic_word[0])][:-n_top_words:-1]

[array([   0, 2773, 2774, ...,   32,   24,   36], dtype=int64)]

In [38]:
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-n_top_words:-1]
     print(" ".join(topic_words))

Topic 0: government british minister west group letters party former million
Topic 1: church first during people political country ceremony visit government
Topic 2: elvis king wright fans presley concert life death first
Topic 3: yeltsin russian russia president kremlin michael romania orthodox operation
Topic 4: pope vatican paul surgery pontiff john hospital trip rome
Topic 5: family police miami versace cunanan funeral home church kennedy
Topic 6: south simpson born york white north african black wife
Topic 7: order church mother successor since election religious head nuns
Topic 8: charles prince diana royal queen king parker bowles camilla
Topic 9: film france french against actor paris bardot magazine poster
Topic 10: germany german war nazi christian letter book scientology jews
Topic 11: east prize peace timor quebec belo indonesia nobel award
Topic 12: n't told life people church show very public come
Topic 13: years world time year last say three later news
Topic 14: mother 

### Amazon Tap reviews data

In [11]:
df = pd.read_csv('Tap Reviews.csv')

In [12]:
df.head()

Unnamed: 0,asins,name,reviews.numHelpful,reviews.rating,reviews.text,reviews.title
0,B01BH83OOM,Amazon Tap - Alexa-Enabled Portable Bluetooth ...,806,5,IMPORTANT UPDATE (3/8/17): As you read my orig...,"The Echo's Little Sister Is AMAZING!! 3,"
1,B01BH83OOM,Amazon Tap - Alexa-Enabled Portable Bluetooth ...,426,5,It was just a few weeks ago that I was bemoani...,Tap Alexa on the go!
2,B01BH83OOM,Amazon Tap - Alexa-Enabled Portable Bluetooth ...,295,5,The Amazon Tap is a portable Bluetooth speaker...,Amazon Tap Review - Best Bluetooth Speaker in ...
3,B01BH83OOM,Amazon Tap - Alexa-Enabled Portable Bluetooth ...,0,5,Look at this product as a portable speaker fir...,Great for what it does
4,B01BH83OOM,Amazon Tap - Alexa-Enabled Portable Bluetooth ...,3,5,This Amazon tap is not only a great Bluetooth ...,"Awesome, smart little portable speaker"


In [23]:
df = df.rename(columns={'reviews.text':'review'})

In [25]:
df.columns

Index(['asins', 'name', 'reviews.numHelpful', 'reviews.rating', 'review',
       'reviews.title'],
      dtype='object')

In [24]:
df.shape

(542, 6)

#### Text pre-processing

In [15]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english') + list(string.punctuation)

In [17]:
from gensim.utils import simple_preprocess

In [18]:
simple_preprocess("I don't like this product!")

['don', 'like', 'this', 'product']

In [19]:
?simple_preprocess

In [20]:
word_tokenize("I don't like this product!")

['I', 'do', "n't", 'like', 'this', 'product', '!']

**Important Note:** Models are not sutomatically downloaded with Spacy, so you may need to do a ```python -m spacy download en``` to use its preprocessing methods.

In [None]:
# gensim for LDA 
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [None]:
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# create dictionary and corpus
# Create Dictionary
id2word = corpora.Dictionary(data)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[2])

### Building the Topic Model

In [14]:
from gensim.models import ldamodel

INFO:summarizer.preprocessing.cleaner:'pattern' package not found; tag filters are not available for English


In [15]:
?ldamodel.LdaModel

In [None]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=42,
                                           passes=10,
                                           per_word_topics=True)

In [244]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.100*"use" + 0.055*"love" + 0.045*"easy" + 0.044*"great" + 0.042*"buy" + '
  '0.029*"excellent" + 0.028*"gift" + 0.028*"enjoy" + 0.019*"work" + '
  '0.016*"product"'),
 (1,
  '0.052*"echo" + 0.047*"alexa" + 0.031*"button" + 0.030*"tap" + 0.026*"great" '
  '+ 0.022*"hand" + 0.022*"talk" + 0.019*"portable" + 0.018*"sound" + '
  '0.018*"set"'),
 (2,
  '0.055*"phone" + 0.027*"love" + 0.021*"light" + 0.021*"information" + '
  '0.020*"move" + 0.018*"old" + 0.018*"month" + 0.017*"apple" + 0.016*"app" + '
  '0.015*"system"'),
 (3,
  '0.036*"button" + 0.028*"size" + 0.024*"alexa" + 0.022*"speaker" + '
  '0.021*"loud" + 0.020*"dot" + 0.020*"press" + 0.019*"put" + 0.019*"get" + '
  '0.017*"echo"'),
 (4,
  '0.044*"question" + 0.035*"need" + 0.030*"get" + 0.029*"time" + 0.028*"ask" '
  '+ 0.027*"lot" + 0.023*"love" + 0.022*"pretty" + 0.020*"anywhere" + '
  '0.020*"answer"'),
 (5,
  '0.036*"fun" + 0.032*"carry" + 0.021*"super" + 0.019*"high" + 0.017*"news" + '
  '0.017*"case" + 0.016*"clock

In [None]:
# Compute Perplexity
print('Perplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

In [None]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [246]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

In [243]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=42,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

### Get topics for any new document

In [None]:
new_doc = "enter text here"

In [None]:
new_doc_bow = diction.doc2bow(new_doc)
new_doc_bow

In [None]:
lda_mod.get_document_topics(new_doc_bow)

### Choosing the number of topics

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    coherence_values = []
    model_list = []

    for num_topics in range(start, limit, step):
        model_list.append(model)
        lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics, 
                                           random_state=42, passes=10, per_word_topics=True)
        coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
        coherence_values.append(coherence_model_lda.get_coherence())

    return model_list, coherence_values

In [None]:
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=2, limit=10, step=2)

In [None]:
limit=40; start=2; step=6;
x = range(start, limit, step)

for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))