<img align="left" src="https://lever-client-logos.s3.amazonaws.com/864372b1-534c-480e-acd5-9711f850815c-1524247202159.png" width=200>
<br></br>
<br></br>

# Topic Modeling
## *Data Science Unit 4 Sprint 1 Assignment 4*

Analyze a corpus of Amazon reviews from Unit 4 Sprint 1 Module 1's lecture using topic modeling: 

- Fit a Gensim LDA topic model on Amazon Reviews
- Select appropriate number of topics
- Create some dope visualization of the topics
- Write a few bullets on your findings in markdown at the end
- **Note**: You don't *have* to use generators for this assignment

In [42]:
import numpy as np
import gensim
import os
import re

from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora

from gensim.models.ldamulticore import LdaMulticore

import pandas as pd
os.getcwd()
df = pd.read_csv('Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv')

In [44]:
STOPWORDS = set(STOPWORDS).union('amazon')
def tokenize(corpus):
    '''Tokenizer, excludes stop words'''
    return [token for token in simple_preprocess(corpus) if token not in STOPWORDS]

In [45]:
df['review_tokenized'] = df['reviews.text'].apply(tokenize)

In [46]:
df['review_tokenized'].head()

0    [order, item, bad, quality, missing, backup, s...
1               [bulk, expensive, way, products, like]
2                             [duracell, price, happy]
3              [work, brand, batteries, better, price]
4             [batteries, long, lasting, price, great]
Name: review_tokenized, dtype: object

In [47]:
def doc_stream(Series):
    '''Stream tokens from pandas Series for corpora construction'''
    for token_list in Series:
        yield token_list
    

In [48]:
#Building a corpora of words from Amazon review
id2word = corpora.Dictionary(doc_stream(df['review_tokenized']))

In [49]:
#Filter most common words
id2word.filter_extremes(no_below=3, no_above=.97)

In [50]:
#Build corpus
corpus = [id2word.doc2bow(text) for text in doc_stream(df['review_tokenized'])]
corpus[:5]

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1)],
 [(11, 1), (12, 1), (13, 1), (14, 1), (15, 1)],
 [(16, 1), (17, 1), (18, 1)],
 [(10, 1), (18, 1), (19, 1), (20, 1), (21, 1)],
 [(18, 1), (19, 1), (22, 1), (23, 1), (24, 1)]]

In [51]:
lda = LdaMulticore(corpus=corpus,
                   id2word=id2word,
                   random_state=42,
                   num_topics=10,
                   passes=10,
                   workers=4)
lda.print_topics()

[(0,
  '0.060*"kindle" + 0.017*"bought" + 0.016*"reading" + 0.015*"love" + 0.014*"screen" + 0.014*"read" + 0.014*"tablet" + 0.012*"books" + 0.012*"use" + 0.012*"new"'),
 (1,
  '0.181*"great" + 0.113*"price" + 0.055*"batteries" + 0.042*"work" + 0.034*"product" + 0.028*"long" + 0.025*"value" + 0.023*"buy" + 0.019*"works" + 0.017*"deal"'),
 (2,
  '0.039*"battery" + 0.025*"life" + 0.020*"recommend" + 0.020*"kindle" + 0.019*"charge" + 0.019*"case" + 0.015*"highly" + 0.014*"screen" + 0.013*"time" + 0.012*"long"'),
 (3,
  '0.094*"batteries" + 0.018*"amazon" + 0.018*"long" + 0.014*"battery" + 0.014*"use" + 0.014*"brand" + 0.013*"good" + 0.012*"work" + 0.011*"time" + 0.010*"duracell"'),
 (4,
  '0.075*"best" + 0.026*"device" + 0.023*"buy" + 0.015*"great" + 0.015*"bought" + 0.014*"light" + 0.010*"tablet" + 0.008*"thanks" + 0.008*"use" + 0.008*"tablets"'),
 (5,
  '0.047*"easy" + 0.043*"loves" + 0.042*"use" + 0.038*"love" + 0.035*"great" + 0.031*"tablet" + 0.028*"bought" + 0.023*"old" + 0.021*"game

In [52]:
words = [re.findall(r'"([^"]*)"',t[1]) for t in lda.print_topics()]

In [53]:
topics = [' '.join(t[0:5]) for t in words]

In [54]:
for id, t in enumerate(topics): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")

------ Topic 0 ------
kindle bought reading love screen

------ Topic 1 ------
great price batteries work product

------ Topic 2 ------
battery life recommend kindle charge

------ Topic 3 ------
batteries amazon long battery use

------ Topic 4 ------
best device buy great bought

------ Topic 5 ------
easy loves use love great

------ Topic 6 ------
tablet kids year great old

------ Topic 7 ------
tablet amazon apps kids play

------ Topic 8 ------
amazon like ok batteries lot

------ Topic 9 ------
good price great tablet nice



## Stretch Goals

* Incorporate Named Entity Recognition in your analysis
* Incorporate some custom pre-processing from our previous lessons (like spacy lemmatization)
* Analyze a dataset of interest to you with topic modeling