<img align="left" src="https://lever-client-logos.s3.amazonaws.com/864372b1-534c-480e-acd5-9711f850815c-1524247202159.png" width=200>
<br></br>
<br></br>

# Topic Modeling
## *Data Science Unit 4 Sprint 1 Assignment 4*

Analyze a corpus of Amazon reviews from Unit 4 Sprint 1 Module 1's lecture using topic modeling: 

- Fit a Gensim LDA topic model on Amazon Reviews
- Select an appropriate number of topics
- Create a compelling visualization of the topics
- Write a few bullets on your findings in markdown at the end
- **Note**: You don't *have* to use generators for this assignment

In [2]:
import warnings
# Silence every warning for the rest of the session. Note that this is a
# blanket filter: deprecation and convergence warnings are hidden as well.
warnings.filterwarnings('ignore')

In [3]:
import numpy as np
import gensim 
import os
import re

from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora

from gensim.models.ldamulticore import LdaMulticore

import pandas as pd

In [5]:
# Load the reviews CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('./data/Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv')

In [11]:
df['reviews.text'].head()

0    I order 3 of them and one of the item is bad q...
1    Bulk is always the less expensive way to go fo...
2    Well they are not Duracell but for the price i...
3    Seem to work as well as name brand batteries a...
4    These batteries are very long lasting the pric...
Name: reviews.text, dtype: object

In [26]:
# Add corpus-specific words ('amazon', 'bought', 'price') to gensim's stop-word list.
STOPWORDS = set(STOPWORDS).union(['amazon', 'bought', 'price'])


def tokenize(text):
    """Split a review into tokens and drop stop words.

    Parameters
    ----------
    text : str
        Raw review text; it is passed unchanged to gensim's ``simple_preprocess``.

    Returns
    -------
    list of str
        The tokens produced by ``simple_preprocess`` that are not in ``STOPWORDS``,
        in their original order.
    """
    # STOPWORDS is already a set, so test membership against it directly
    # instead of copying the whole stop list once per token.
    return [token for token in simple_preprocess(text) if token not in STOPWORDS]

In [27]:
# Tokenize every review and keep the token lists in a new 'tokens' column.
df['tokens'] = df['reviews.text'].apply(tokenize)

In [28]:
df['tokens'].head()

0    [order, item, bad, quality, missing, backup, s...
1               [bulk, expensive, way, products, like]
2                                    [duracell, happy]
3                     [work, brand, batteries, better]
4                    [batteries, long, lasting, great]
Name: tokens, dtype: object

In [29]:
# Build the token <-> integer id mapping from the tokenized reviews.
id2word = corpora.Dictionary(df['tokens'])

In [30]:
id2word.token2id['batteries']

18

In [31]:
id2word[4000]

'switches'

In [32]:
# Convert each tokenized review into its bag-of-words representation.
corpus = [id2word.doc2bow(tokens) for tokens in df['tokens']]

In [44]:
# Fit a 5-topic LDA model on the bag-of-words corpus with a fixed seed.
lda = LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=5,
    passes=10,
    workers=6,
    random_state=723812,
)

In [45]:
lda.print_topics()

[(0,
  '0.039*"good" + 0.025*"great" + 0.023*"product" + 0.022*"batteries" + 0.014*"use" + 0.013*"works" + 0.011*"deal" + 0.009*"long" + 0.008*"box" + 0.007*"time"'),
 (1,
  '0.047*"tablet" + 0.032*"kids" + 0.024*"great" + 0.022*"apps" + 0.021*"old" + 0.020*"year" + 0.018*"games" + 0.016*"loves" + 0.016*"love" + 0.014*"play"'),
 (2,
  '0.036*"great" + 0.035*"tablet" + 0.030*"use" + 0.030*"kindle" + 0.029*"easy" + 0.023*"love" + 0.015*"loves" + 0.014*"books" + 0.012*"reading" + 0.011*"gift"'),
 (3,
  '0.074*"batteries" + 0.046*"great" + 0.034*"good" + 0.023*"buy" + 0.021*"work" + 0.021*"long" + 0.014*"value" + 0.014*"brand" + 0.014*"battery" + 0.012*"quality"'),
 (4,
  '0.019*"tablet" + 0.017*"like" + 0.016*"screen" + 0.013*"good" + 0.013*"use" + 0.013*"great" + 0.011*"device" + 0.009*"kindle" + 0.009*"ipad" + 0.008*"battery"')]

In [47]:
# Top 10 words of every topic, ordered by topic id. Read them from the model's
# structured output rather than regex-parsing the strings built for display
# by print_topics().
words = [[word for word, _ in word_probs]
         for _, word_probs in lda.show_topics(num_topics=-1, num_words=10, formatted=False)]

In [48]:
# Label each topic with the first five words of its word list.
topics = [' '.join(top_words[:5]) for top_words in words]

In [49]:
# Use `topic_id` rather than `id` so the builtin id() is not shadowed
# for the cells that follow.
for topic_id, label in enumerate(topics):
    print(f"------ Topic {topic_id} ------")
    print(label, end="\n\n")

------ Topic 0 ------
good great product batteries use

------ Topic 1 ------
tablet kids great apps old

------ Topic 2 ------
great tablet use kindle easy

------ Topic 3 ------
batteries great good buy work

------ Topic 4 ------
tablet like screen good use



In [41]:
from gensim.models.coherencemodel import CoherenceModel

def compute_coherence_values(dictionary, corpus, limit, start=2, step=3, passes=5,
                             workers=4, random_state=None):
    """
    Compute u_mass coherence for LDA models over a range of topic counts.

    For each of `passes` repetitions, one LdaMulticore model is fitted for every
    topic count in range(start, limit, step) and scored with CoherenceModel using
    the 'u_mass' measure. The total number of models fitted is therefore
    passes * len(range(start, limit, step)), so keep both small for a quick run.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    limit : Upper bound on the number of topics (exclusive, as in range())
    start : Smallest number of topics to try
    step : Increment between successive topic counts
    passes : Number of times the whole sweep over topic counts is repeated.
        This is not forwarded to LdaMulticore; the models train with the
        library's default number of training passes.
    workers : Number of worker processes handed to LdaMulticore
    random_state : Optional integer seed. When given, repetition i trains its
        models with seed random_state + i, so repetitions differ from each
        other while the sweep as a whole uses fixed seeds. None (the default)
        leaves seeding to gensim.

    Returns:
    -------
    coherence_values : List of dicts, one per fitted model in the order fitted,
        with keys 'pass', 'num_topics' and 'coherence_score'
    """

    coherence_values = []

    for iter_ in range(passes):
        # Offset the seed per repetition; reusing one seed would make every
        # repetition an exact copy of the first.
        seed = None if random_state is None else random_state + iter_
        for num_topics in range(start, limit, step):
            model = LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=dictionary,
                                 workers=workers, random_state=seed)
            coherencemodel = CoherenceModel(model=model, dictionary=dictionary,
                                            corpus=corpus, coherence='u_mass')
            coherence_values.append({'pass': iter_,
                                     'num_topics': num_topics,
                                     'coherence_score': coherencemodel.get_coherence()
                                     })

    return coherence_values

In [43]:
# Sweep 7 topic counts (2, 8, ..., 38); each is fitted `passes` times.
# passes=40 meant 280 LDA fits and the run had to be interrupted, so use
# 5 repetitions (35 fits) to keep a full re-run of the notebook feasible.
coherence_values = compute_coherence_values(dictionary=id2word,
                                            corpus=corpus,
                                            start=2,
                                            limit=40,
                                            step=6,
                                            passes=5)

KeyboardInterrupt: 

## Stretch Goals

* Incorporate Named Entity Recognition in your analysis
* Incorporate some custom pre-processing from our previous lessons (like spaCy lemmatization)
* Analyze a dataset of interest to you with topic modeling