<img align="left" src="https://lever-client-logos.s3.amazonaws.com/864372b1-534c-480e-acd5-9711f850815c-1524247202159.png" width=200>
<br><br>
<br><br>

# Topic Modeling
## *Data Science Unit 4 Sprint 1 Assignment 4*

Analyze a corpus of Amazon reviews from Unit 4 Sprint 1 Module 1's lecture using topic modeling: 

- Fit a Gensim LDA topic model on Amazon Reviews
- Select an appropriate number of topics
- Create a compelling visualization of the topics
- Write a few bullets on your findings in markdown at the end
- **Note**: You don't *have* to use generators for this assignment

In [12]:
import os
import re
import sys
import zipfile

import gensim
import numpy as np
import pandas as pd
import pyLDAvis.gensim

from gensim import corpora
from gensim.models.ldamulticore import LdaMulticore
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess

In [35]:
# Locate the zipped review CSV relative to the notebook's working directory.
data_path = os.path.join(os.getcwd(), '../module1-text-data/data/Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv.zip')

# Read the first archive member straight from the zip. Nothing is extracted to
# the working directory, so no stray CSV is left behind if parsing fails, and
# the context managers close the archive handle deterministically.
with zipfile.ZipFile(data_path) as archive:
    files = archive.namelist()
    with archive.open(files[0]) as csv_file:
        df = pd.read_csv(csv_file)

In [42]:
# (rows, columns) of the loaded reviews frame
df.shape

(28332, 25)

In [37]:
def tokenize(text):
    """Split a review into accent-stripped tokens with stopwords removed.

    Args:
        text: Raw review text. ``None`` and float NaN (how pandas represents a
            missing string cell) are treated as an empty review.

    Returns:
        list of str: Tokens produced by gensim's ``simple_preprocess`` (called
        with ``deacc=True``), minus any token found in ``STOPWORDS``. An empty
        list for missing input.
    """
    # NaN is the only float that is not equal to itself; catching it here keeps
    # a single missing review from aborting a whole `Series.apply` pass.
    if text is None or (isinstance(text, float) and text != text):
        return []
    return [token for token in simple_preprocess(text, deacc=True) if token not in STOPWORDS]

In [38]:
# Tokenize every review. `tokenize` takes a single argument, so it is handed
# to `apply` directly instead of being wrapped in a lambda.
df['tokens'] = df['reviews.text'].apply(tokenize)
df['tokens'].head()

0    [order, item, bad, quality, missing, backup, s...
1               [bulk, expensive, way, products, like]
2                             [duracell, price, happy]
3              [work, brand, batteries, better, price]
4             [batteries, long, lasting, price, great]
Name: tokens, dtype: object

In [47]:
# Build a single vocabulary (token <-> integer id mapping) across all
# tokenized reviews.
id2word = corpora.Dictionary(documents=df['tokens'])

In [48]:
# NOTE: sys.getsizeof is shallow -- it reports the size of the Dictionary
# object itself, not of the token/id mappings it references, so this number
# says nothing about the vocabulary's real memory footprint.
sys.getsizeof(id2word)

56

In [49]:
# Number of distinct tokens in the vocabulary at this point
len(id2word)

9620

In [55]:
# Prune the vocabulary in place: drop tokens that appear in fewer than 5
# documents or in more than 95% of all documents.
id2word.filter_extremes(no_below=5, no_above=0.95)

In [56]:
# Number of distinct tokens in the vocabulary at this point
len(id2word)

3581

In [57]:
# Encode each tokenized review as a sparse bag-of-words of (token_id, count) pairs.
corpus = [id2word.doc2bow(review_tokens) for review_tokens in df['tokens']]

In [58]:
# First ten (token_id, count) pairs of the first review
corpus[0][:10]

[(0, 1),
 (1, 1),
 (2, 1),
 (3, 1),
 (4, 1),
 (5, 1),
 (6, 1),
 (7, 1),
 (8, 1),
 (9, 1)]

In [59]:
# Fit a 10-topic LDA model over the bag-of-words corpus using 4 worker
# processes and 10 passes; random_state pins the seed the model starts from.
lda = LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    passes=10,
    workers=4,
    random_state=42,
)

In [60]:
# Show each topic as a weighted list of its top words
lda.print_topics()

[(0,
  '0.093*"good" + 0.041*"best" + 0.040*"buy" + 0.040*"product" + 0.033*"amazon" + 0.027*"quality" + 0.027*"price" + 0.016*"excellent" + 0.014*"like" + 0.014*"value"'),
 (1,
  '0.054*"kindle" + 0.036*"love" + 0.035*"easy" + 0.022*"size" + 0.022*"use" + 0.022*"screen" + 0.020*"great" + 0.016*"reading" + 0.016*"tablet" + 0.016*"nice"'),
 (2,
  '0.048*"tablet" + 0.035*"games" + 0.032*"kids" + 0.029*"old" + 0.026*"year" + 0.018*"time" + 0.016*"loves" + 0.015*"apps" + 0.015*"love" + 0.015*"bought"'),
 (3,
  '0.047*"batteries" + 0.021*"amazon" + 0.015*"use" + 0.012*"box" + 0.010*"battery" + 0.009*"like" + 0.009*"work" + 0.009*"ve" + 0.007*"ok" + 0.007*"time"'),
 (4,
  '0.045*"kindle" + 0.017*"new" + 0.017*"charge" + 0.014*"like" + 0.013*"better" + 0.011*"read" + 0.010*"buy" + 0.010*"screen" + 0.009*"battery" + 0.009*"time"'),
 (5,
  '0.119*"great" + 0.080*"tablet" + 0.043*"use" + 0.034*"easy" + 0.029*"price" + 0.028*"works" + 0.022*"product" + 0.014*"recommend" + 0.014*"good" + 0.012*"lo

In [62]:
# Take each topic's top words from the model's structured API instead of
# regex-parsing the formatted strings returned by print_topics(). Iterating
# over range(lda.num_topics) guarantees that position i in `words` is topic i;
# print_topics() returns at most 20 topics by default, so with a larger model
# the list would be truncated and positions would stop matching topic ids.
words = [
    [word for word, _ in lda.show_topic(topic_id, topn=10)]
    for topic_id in range(lda.num_topics)
]

In [63]:
# Label each topic with its first five words, space-separated.
topics = [' '.join(top_words[:5]) for top_words in words]

In [64]:
# The loop variable is `topic_id`, not `id`: a notebook-level variable named
# `id` would shadow the builtin id() for every cell run afterwards.
for topic_id, label in enumerate(topics):
    print(f"----- topic {topic_id} -----")
    print(label, end="\n\n")

----- topic 0 -----
good best buy product amazon

----- topic 1 -----
kindle love easy size use

----- topic 2 -----
tablet games kids old year

----- topic 3 -----
batteries amazon use box battery

----- topic 4 -----
kindle new charge like better

----- topic 5 -----
great tablet use easy price

----- topic 6 -----
tablet amazon apps screen books

----- topic 7 -----
batteries use tablet toys remote

----- topic 8 -----
bought gift kids loves love

----- topic 9 -----
batteries great price good long



In [65]:
# Render pyLDAvis visualizations inline in the notebook.
pyLDAvis.enable_notebook()

In [66]:
# Build the interactive topic visualization from the model, corpus and vocabulary
pyLDAvis.gensim.prepare(lda, corpus, id2word)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


## Stretch Goals

* Incorporate Named Entity Recognition in your analysis
* Incorporate some custom pre-processing from our previous lessons (like spacy lemmatization)
* Analyze a dataset of interest to you with topic modeling