<img align="left" src="https://lever-client-logos.s3.amazonaws.com/864372b1-534c-480e-acd5-9711f850815c-1524247202159.png" width=200>
<br></br>
<br></br>

# Topic Modeling
## *Data Science Unit 4 Sprint 1 Assignment 4*

Analyze a corpus of Amazon reviews from Unit 4 Sprint 1 Module 1's lecture using topic modeling: 

- Fit a Gensim LDA topic model on Amazon Reviews
- Select appropriate number of topics
- Create some dope visualization of the topics
- Write a few bullets on your findings in markdown at the end
- **Note**: You don't *have* to use generators for this assignment

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import zipfile
import re
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora
from gensim.models.ldamulticore import LdaMulticore

import pyLDAvis.gensim
pyLDAvis.enable_notebook()

import warnings
warnings.filterwarnings('ignore')

  from collections import Iterable


In [2]:
# creates gensim tokenizer
def tokenize(text):
    return [token for token in simple_preprocess(text) if token not in STOPWORDS]

In [3]:
# import amazon dataframe
amzn = pd.read_csv('data/amazon_data.zip')

# create column of tokenized data
amzn['tokens'] = amzn['reviews.text'].apply(lambda x: tokenize(x))

In [4]:
# creates dataframe with
df = amzn[['primaryCategories', 'brand', 'name', 'tokens', 'reviews.rating', ]].copy()
df.head()

Unnamed: 0,primaryCategories,brand,name,tokens,reviews.rating
0,Health & Beauty,Amazonbasics,AmazonBasics AAA Performance Alkaline Batterie...,"[order, item, bad, quality, missing, backup, s...",3
1,Health & Beauty,Amazonbasics,AmazonBasics AAA Performance Alkaline Batterie...,"[bulk, expensive, way, products, like]",4
2,Health & Beauty,Amazonbasics,AmazonBasics AAA Performance Alkaline Batterie...,"[duracell, price, happy]",5
3,Health & Beauty,Amazonbasics,AmazonBasics AAA Performance Alkaline Batterie...,"[work, brand, batteries, better, price]",5
4,Health & Beauty,Amazonbasics,AmazonBasics AAA Performance Alkaline Batterie...,"[batteries, long, lasting, price, great]",5


In [5]:
# creating corpora
id2word = corpora.Dictionary(df['tokens'])

# filter extreme values
id2word.filter_extremes(no_below=10, no_above=0.80)

# checks length of id2word
len(id2word)

2359

In [6]:
# creates corpus for each row
corpus = [id2word.doc2bow(text) for text in df['tokens']]

In [7]:
# # selects a product to look at

# product1 = df['name'].unique()[0]
# print(f'Product 1 is: {product1}')

# product2 = df['name'].unique()[3]
# print(f'Product 2 is: {product2}')

# product3 = df['name'].unique()[30]
# print(f'Product 2 is: {product3}')

In [8]:
# # creating list of tokens for each product
# p1t = df[df['name'] == product1].sum()
# p2t = df[df['name'] == product2].sum()
# p3t = df[df['name'] == product3].sum()

In [9]:
# setting up lda model
lda = LdaMulticore(corpus=corpus,
                   id2word=id2word,
                   num_topics=8,
                   passes=10,
                   workers=4
                  )

In [10]:
pyLDAvis.gensim.prepare(lda, corpus, id2word)

In [11]:
# shows topics with lecture regex
word_list = [re.findall(r'"([^"]*)"',t[1]) for t in lda.print_topics()]

# creates topic list
topics = []
for words in word_list:
    topic = ' '.join(words)
    topics.append(topic)   

In [12]:
lda_values = [lda[x] for x in corpus]

In [13]:
# topic_vals = []
# for x in range(len(lda_values)):
#     for i in range(9):
#         topic_vals.append(lda_values[x][i][1])

In [14]:
# lecture functrion for formating data to use in dataframe
def update(doc):
        d_dist = {k:0 for k in range(0,8)}
        for t in doc:
            d_dist[t[0]] = t[1]
        return d_dist
    
new_distro = [update(d) for d in lda_values]

In [15]:


DF = pd.DataFrame.from_records(new_distro)
DF.columns = topics
DF.head()

Unnamed: 0,kindle light read reading screen battery love like better new,far bought ipad purchase purchased tablet happy year working old,tablet kids loves old year great love bought games daughter,batteries good great price long work battery amazon brand buy,good tablet apps amazon screen use nice size perfect play,great tablet price good product works amazon value quality love,amazon tablet time like use charge work buy wifi alexa,easy use kindle love bought books read gift tablet reading
0,0.012507,0.012537,0.012504,0.713534,0.012535,0.012534,0.211338,0.01251
1,0.020857,0.020874,0.020839,0.854022,0.020855,0.020847,0.020854,0.020851
2,0.031261,0.208858,0.031258,0.603483,0.031258,0.031316,0.03126,0.031307
3,0.020849,0.020842,0.020834,0.854102,0.020841,0.020859,0.020838,0.020836
4,0.020837,0.020836,0.020842,0.854084,0.020836,0.020893,0.020836,0.020836


In [16]:
# what is the difference between from_recoreds and not using from records?
DF2 = pd.DataFrame(new_distro)
DF2.columns = topics
DF2.head()

Unnamed: 0,kindle light read reading screen battery love like better new,far bought ipad purchase purchased tablet happy year working old,tablet kids loves old year great love bought games daughter,batteries good great price long work battery amazon brand buy,good tablet apps amazon screen use nice size perfect play,great tablet price good product works amazon value quality love,amazon tablet time like use charge work buy wifi alexa,easy use kindle love bought books read gift tablet reading
0,0.012507,0.012537,0.012504,0.713534,0.012535,0.012534,0.211338,0.01251
1,0.020857,0.020874,0.020839,0.854022,0.020855,0.020847,0.020854,0.020851
2,0.031261,0.208858,0.031258,0.603483,0.031258,0.031316,0.03126,0.031307
3,0.020849,0.020842,0.020834,0.854102,0.020841,0.020859,0.020838,0.020836
4,0.020837,0.020836,0.020842,0.854084,0.020836,0.020893,0.020836,0.020836


In [17]:
# concats DF with category dataframe
finalDF = pd.concat([df['primaryCategories'], DF], axis=1)

In [18]:
# looks at topics used in different category reviews

finalDF.groupby('primaryCategories').mean()

Unnamed: 0_level_0,kindle light read reading screen battery love like better new,far bought ipad purchase purchased tablet happy year working old,tablet kids loves old year great love bought games daughter,batteries good great price long work battery amazon brand buy,good tablet apps amazon screen use nice size perfect play,great tablet price good product works amazon value quality love,amazon tablet time like use charge work buy wifi alexa,easy use kindle love bought books read gift tablet reading
primaryCategories,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Animals & Pet Supplies,0.119633,0.145504,0.082952,0.141049,0.106739,0.245451,0.031277,0.119372
Electronics,0.092283,0.081155,0.210641,0.029242,0.116477,0.195677,0.089238,0.1728
"Electronics,Furniture",0.0,0.0,0.119464,0.0,0.200271,0.0,0.629416,0.0
"Electronics,Media",0.563683,0.037766,0.02359,0.021713,0.042163,0.073561,0.076965,0.145292
Health & Beauty,0.041548,0.069638,0.033273,0.557638,0.048495,0.15395,0.052277,0.035608
Home & Garden,0.041691,0.04168,0.342622,0.157301,0.291653,0.041676,0.041686,0.041691
Office Supplies,0.068214,0.245575,0.028121,0.053607,0.307375,0.182194,0.028129,0.086785
"Office Supplies,Electronics",0.273397,0.065127,0.046532,0.035876,0.089003,0.149591,0.067755,0.261338
"Toys & Games,Electronics",0.038054,0.08303,0.516212,0.022656,0.094401,0.07214,0.078792,0.081422
