<img align="left" src="https://lever-client-logos.s3.amazonaws.com/864372b1-534c-480e-acd5-9711f850815c-1524247202159.png" width=200>
<br></br>
<br></br>

# Topic Modeling
## *Data Science Unit 4 Sprint 1 Assignment 4*

Analyze a corpus of Amazon reviews from Unit 4 Sprint 1 Module 1's lecture using topic modeling: 

- Fit a Gensim LDA topic model on Amazon Reviews
- Select an appropriate number of topics
- Create some dope visualization of the topics
- Write a few bullets on your findings in markdown at the end
- **Note**: You don't *have* to use generators for this assignment

In [195]:
#Start Here

from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel

import spacy

import pandas as pd

In [196]:
%pwd

'/Users/willsn/Desktop/DS-Unit-4-Sprint-1-NLP/module4-topic-modeling'

In [197]:
%ls ../module1-text-data/data

libc++abi.dylib: terminating with uncaught exception of type std::runtime_error: Couldn't close file


In [198]:
df = pd.read_csv('../module1-text-data/data/Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv')

In [199]:
df.shape

(28332, 24)

In [200]:
# Subset to just one product or Category here

list(df)

['id',
 'dateAdded',
 'dateUpdated',
 'name',
 'asins',
 'brand',
 'categories',
 'primaryCategories',
 'imageURLs',
 'keys',
 'manufacturer',
 'manufacturerNumber',
 'reviews.date',
 'reviews.dateSeen',
 'reviews.didPurchase',
 'reviews.doRecommend',
 'reviews.id',
 'reviews.numHelpful',
 'reviews.rating',
 'reviews.sourceURLs',
 'reviews.text',
 'reviews.title',
 'reviews.username',
 'sourceURLs']

In [201]:
df['primaryCategories'].value_counts()

Electronics                    13995
Health & Beauty                12071
Toys & Games,Electronics        1676
Office Supplies,Electronics      386
Electronics,Media                185
Office Supplies                    9
Animals & Pet Supplies             6
Electronics,Furniture              2
Home & Garden                      2
Name: primaryCategories, dtype: int64

In [202]:
# Keep only the 'Health & Beauty' reviews. The filtered frame keeps the row
# labels of the full dataset (no reset_index), and .copy() makes it an
# independent frame so new columns can be added to it later.
df = df[df['primaryCategories'] == 'Health & Beauty'].copy()

## Get Tokens

In [203]:
nlp = spacy.load("en_core_web_lg")

In [204]:
# Build one list of lower-cased lemmas per review, skipping stop words and
# punctuation. nlp.pipe processes the review texts in batches of 500.
# The result is assigned straight to the frame, so no temporary `tokens`
# list is left behind in the notebook namespace.
df['tokens'] = [
    [
        token.lemma_.lower()
        for token in doc
        if not token.is_stop and not token.is_punct
    ]
    for doc in nlp.pipe(df['reviews.text'], batch_size=500)
]

KeyboardInterrupt: 

In [None]:
df['tokens'].head()


In [None]:
df.shape

In [None]:
df.tail()

# Create id2word


In [None]:
id2word = Dictionary(df['tokens'])
print(len(id2word))

In [None]:
# Prune the dictionary in place, then print the vocabulary size again so it can
# be compared with the size printed right after the dictionary was built.
# NOTE(review): no_below=5 / no_above=.98 are presumably the minimum document
# count and maximum document fraction a token may have - confirm against the
# gensim Dictionary.filter_extremes documentation.
id2word.filter_extremes(no_below=5, no_above=.98)
print(len(id2word))

## Create Corpus Object

In [None]:
corpus = [id2word.doc2bow(d) for d in df['tokens']]

In [None]:
# Fit a 10-topic LDA model on the bag-of-words corpus.
# random_state seeds the model so a Restart & Run All starts from the same
# initialisation; the seed matches the one used for the `lda` fit further down.
model = LdaMulticore(corpus=corpus,
                     num_topics=10,
                     id2word=id2word,
                     workers=12,
                     passes=5,
                     random_state=723812)

In [None]:
# Collect the top 10 words of every topic directly from the model.
# This replaces regex-parsing the formatted strings from print_topics(), which
# breaks on tokens containing a double quote and only covers the number of
# topics print_topics() returns by default.
words = [
    [word for word, _ in model.show_topic(topic_id, topn=10)]
    for topic_id in range(model.num_topics)
]

In [None]:
topics = [' '.join(t[0:5]) for t in words]

In [None]:
# Print a short header and the summary words for each topic.
# The loop variable is named topic_id so the builtin id() is not shadowed for
# the rest of the notebook session.
for topic_id, topic_words in enumerate(topics):
    print(f"------ Topic {topic_id} ------")
    print(topic_words, end="\n\n")

In [None]:
# Steps to Score Training Documents
# Already have BOW Represented called 'corpus'
lda = LdaMulticore(corpus=corpus,
                   id2word=id2word,
                   random_state=723812,
                   num_topics=15,
                   passes=10,
                   workers=8
                  )

# Score every training document: each entry is a list of (topic id, weight)
# tuples for that document.
distro = [lda[d] for d in corpus]

# Take the topic count from the fitted model instead of a separate literal.
# The previous hard-coded 20 did not match the 15 topics the model was trained
# with and produced five topic columns that were always zero.
num_topics = lda.num_topics


def update(doc):
    """Expand one document's (topic id, weight) tuples into a dense dict.

    Parameters
    ----------
    doc : iterable of (int, float)
        Topic id / weight pairs for a single document, as found in `distro`.

    Returns
    -------
    dict
        One entry per topic id in range(num_topics); topics that do not
        appear in `doc` keep the default weight 0.
    """
    d_dist = {k: 0 for k in range(num_topics)}
    for topic_id, weight in doc:
        d_dist[topic_id] = weight
    return d_dist


new_distro = [update(d) for d in distro]


In [None]:
import pyLDAvis.gensim

pyLDAvis.enable_notebook()

# Note: this renders `model` (the fit created with num_topics=10), not the
# separate `lda` fit that is used for the per-document topic scores.
pyLDAvis.gensim.prepare(model, corpus, id2word)

In [None]:
list(df)

In [None]:
# How are topics correlated with rating? 

topics = pd.DataFrame.from_records(new_distro)
topics.head()

#topics['primaryTopic'] = topics.idxmax(axis=1)

In [None]:
# `topics` was built from a plain list and so has a fresh 0..n-1 index, while
# `df` still carries the row labels of the full dataset after the category
# filter. Assigning the Series directly would align on those labels and give
# NaN or the wrong topic, so assign by position (same row order, same length).
df['primaryTopic'] = topics.idxmax(axis=1).values

In [None]:
df[['primaryTopic', 'reviews.rating']].groupby('primaryTopic').mean()

In [None]:
import seaborn as sns

ax = sns.boxplot(x="primaryTopic", y="reviews.rating",
                 data=df)

In [None]:
# Are certain topics more helpful?

df['reviews.numHelpful'].describe()

In [None]:
# Continuous Variable

# Bin - using boxplot of upvotes per topic? 

topics_df = pd.DataFrame.from_records(new_distro)

In [None]:
topics_df.head()

In [None]:
topics_df['primaryTopic'] = topics_df.idxmax(axis=1)

In [None]:
topics_df['primaryTopic'].value_counts()

In [None]:
# `topics_df` has a 0..n-1 index but `df` keeps its original row labels after
# filtering, so a direct Series assignment would align on mismatched labels.
# Both frames are in the same row order, so copy the values by position.
topics_df['numHelpful'] = df['reviews.numHelpful'].values

In [None]:
#sns.boxplot(x="numHelpful", y="primaryTopic",  data=topics_df[(topics_df['numHelpful'] > 5) & (topics_df['numHelpful'] < 100)]);

# What would it look like to just examine reviews on one product? Could you find features that people liked/disliked?


In [205]:
df = pd.read_csv('../module1-text-data/data/Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv')

In [206]:
df.shape

(28332, 24)

In [207]:
df['name'].value_counts().head(20)

AmazonBasics AAA Performance Alkaline Batteries (36 Count)                                                                                      8343
AmazonBasics AA Performance Alkaline Batteries (48 Count) - Packaging May Vary                                                                  3728
Fire HD 8 Tablet with Alexa, 8 HD Display, 16 GB, Tangerine - with Special Offers                                                               2443
All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi, 16 GB - Includes Special Offers, Black                                                           2370
Fire Kids Edition Tablet, 7 Display, Wi-Fi, 16 GB, Pink Kid-Proof Case                                                                          1676
Fire Kids Edition Tablet, 7 Display, Wi-Fi, 16 GB, Blue Kid-Proof Case                                                                          1425
Fire Kids Edition Tablet, 7 Display, Wi-Fi, 16 GB, Green Kid-Proof Case                                   

In [208]:
#### Restrict the analysis to reviews of the pink Fire Kids Edition tablet

In [209]:
df = df[df['name']=='Fire Kids Edition Tablet, 7 Display, Wi-Fi, 16 GB, Pink Kid-Proof Case'].copy()

In [210]:
df.shape

(1676, 24)

In [211]:
nlp = spacy.load("en_core_web_lg")

In [212]:
# Build one list of lower-cased lemmas per review, skipping stop words and
# punctuation. nlp.pipe processes the review texts in batches of 500.
# The result is assigned straight to the frame, so no temporary `tokens`
# list is left behind in the notebook namespace.
df['tokens'] = [
    [
        token.lemma_.lower()
        for token in doc
        if not token.is_stop and not token.is_punct
    ]
    for doc in nlp.pipe(df['reviews.text'], batch_size=500)
]

KeyboardInterrupt: 

In [None]:
df['tokens'].head()

In [None]:
id2word = Dictionary(df['tokens'])
print(len(id2word))

In [None]:
id2word.filter_extremes(no_below=5, no_above=.98)
print(len(id2word))

In [None]:
print(id2word[0])

In [None]:
df['tokens']

In [None]:
#df

In [None]:
### Creating a corpus

In [None]:
corpus = [id2word.doc2bow(d) for d in df['tokens']]

In [None]:
# Preview only the first few bag-of-words documents; displaying the whole
# corpus floods the notebook output with one list per review.
corpus[:5]

In [None]:
# Fit a 10-topic LDA model on the single-product corpus.
# random_state seeds the model so a Restart & Run All starts from the same
# initialisation; the seed matches the one used for the `lda` fit further down.
model = LdaMulticore(corpus=corpus,
                     num_topics=10,
                     id2word=id2word,
                     workers=12,
                     passes=5,
                     random_state=723812)

In [None]:
# Collect the top 10 words of every topic directly from the model.
# This replaces regex-parsing the formatted strings from print_topics(), which
# breaks on tokens containing a double quote and only covers the number of
# topics print_topics() returns by default.
words = [
    [word for word, _ in model.show_topic(topic_id, topn=10)]
    for topic_id in range(model.num_topics)
]

In [None]:
topics = [' '.join(t[0:5]) for t in words]

In [None]:
# Print a short header and the summary words for each topic.
# The loop variable is named topic_id so the builtin id() is not shadowed for
# the rest of the notebook session.
for topic_id, topic_words in enumerate(topics):
    print(f"------ Topic {topic_id} ------")
    print(topic_words, end="\n\n")

In [None]:
import pyLDAvis.gensim

pyLDAvis.enable_notebook()

# Renders `model`, the fit created with num_topics=10 for this product subset.
pyLDAvis.gensim.prepare(model, corpus, id2word)

In [None]:
# Steps to Score Training Documents
# Already have BOW Represented called 'corpus'
lda = LdaMulticore(corpus=corpus,
                   id2word=id2word,
                   random_state=723812,
                   num_topics=15,
                   passes=10,
                   workers=8
                  )

# Score every training document: each entry is a list of (topic id, weight)
# tuples for that document.
distro = [lda[d] for d in corpus]

# Take the topic count from the fitted model instead of a separate literal.
# The previous hard-coded 20 did not match the 15 topics the model was trained
# with and produced five topic columns that were always zero.
num_topics = lda.num_topics


def update(doc):
    """Expand one document's (topic id, weight) tuples into a dense dict.

    Parameters
    ----------
    doc : iterable of (int, float)
        Topic id / weight pairs for a single document, as found in `distro`.

    Returns
    -------
    dict
        One entry per topic id in range(num_topics); topics that do not
        appear in `doc` keep the default weight 0.
    """
    d_dist = {k: 0 for k in range(num_topics)}
    for topic_id, weight in doc:
        d_dist[topic_id] = weight
    return d_dist


new_distro = [update(d) for d in distro]


In [None]:
# How are topics correlated with rating? 

topics = pd.DataFrame.from_records(new_distro)
topics.head()

In [None]:
# `topics` was built from a plain list and so has a fresh 0..n-1 index, while
# `df` still carries the row labels of the full dataset after the product
# filter. Assigning the Series directly would align on those labels and give
# NaN or the wrong topic, so assign by position (same row order, same length).
df['primaryTopic'] = topics.idxmax(axis=1).values

In [None]:
#df['primaryTopic1'] = topics.idxmax(axis=1)

In [None]:
df['primaryTopic']

In [None]:
list(df)

In [None]:
len(topics)

In [None]:
len(df)

In [None]:
# Inspect the assigned topics. The former 'primaryTopic1' column is never
# created (its assignment is commented out), so looking it up raised KeyError
# on a fresh run; show the existing 'primaryTopic' column instead.
df['primaryTopic'].head()

In [None]:
df[['primaryTopic', 'reviews.rating']].groupby('primaryTopic').mean()

## Stretch Goals

* Incorporate Named Entity Recognition in your analysis
* Incorporate some custom pre-processing from our previous lessons (like spacy lemmatization)
* Analyze a dataset of interest to you with topic modeling