 ## Table Of Contents:

   <a href='#imports'>Import</a><br/>
   <a href='#import data'>Import Data</a><br/>
   <a href='#pre-processing'>Text Preprocessing</a><br/>
   <a href='#LDA Model'>LDA Using GenSim</a><br/>
   <a href='#pyLDAvis'>Visualization Using pyLDAvis</a><br/>
   <a href='#predictions'>Predictions</a><br/>
   <a href='#Scratchpad'>Scratchpad</a><br/>

## Import Packages
<a id='imports'></a>

In [24]:
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 11 09:06:24 2017
@author: abhijitgp
"""
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk import pos_tag
#from stop_words import get_stop_words
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
import json
import pandas as pd
from nltk.stem import WordNetLemmatizer 
from collections import Counter
import matplotlib.pyplot as plt
import seaborn
import numpy as np

## Import Data
<a id='import data'></a>

In [25]:
sc_reviews = pd.read_json('sc_reviews.json')

## Pre-processing Pipeline
<a id='pre-processing'></a>

In [26]:
source = sc_reviews.review_text#[sc_reviews.category=='Face Moisturizer']
texts = []

stpwrds = set(stopwords.words('english'))
stpwrds.update(['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}','/'])
stpwrds.update(['skin','product','feel', 'use','line','look','nan','le', 'get', 'put','tell','since','dont'])
stpwrds.update(list('aeiou'))
stpwrds.update(['go','it','try','really','make','also','however','i\'ve','it\'s','don\'t'])
lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'\w+')

def reviewcleaner(text):
    tok = tokenizer.tokenize(text.lower())
    if len(tok)>5:
        tok = [w for w in tok if len(w)>2]
        postg = pos_tag(tok)
        postg = [(word,pos) for word,pos in postg if pos !='NNP']
        lem=[]
        for tk in postg:
            if tk[1][0]=='V':
                lem.append(lemmatizer.lemmatize(tk[0],'v'))
            else:
                lem.append(lemmatizer.lemmatize(tk[0]))
        bag = [word for word in lem if word not in stpwrds]
    else:
        bag = []
    return bag

src = source.apply(lambda x: reviewcleaner(x))
texts = [[word for word in doc if word not in stpwrds] for doc in src]

from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
texts = [[token for token in text if frequency[token] > 1]
         for text in texts]

# f = []
# for i in source:
#     f.append(reviewcleaner(i))
# texts = f
# texts = [item for sublist in f for item in sublist]  

In [27]:
print texts[:5]

[[u'work'], [], [u'steam', u'mask', u'face', u'least', u'month', u'single', u'pouch', u'awesome', u'variety', u'without', u'spending', u'fortune'], [u'like', u'leave', u'clean'], [u'super', u'easy', u'face', u'soft', u'afterwards']]


In [28]:
# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)
# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]

## LDA Model
<a id='LDA Model'></a>

In [29]:
# generate LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus, \
                                           num_topics=5, \
                                           id2word = dictionary, \
                                           passes=20)

all_topics = ldamodel.get_document_topics(corpus, per_word_topics=True)

for t in ldamodel.print_topics():
    print t

(0, u'0.031*"acne" + 0.021*"week" + 0.018*"day" + 0.015*"help" + 0.014*"dry" + 0.014*"spot" + 0.014*"work" + 0.013*"see" + 0.012*"breakout" + 0.011*"face"')
(1, u'0.072*"mask" + 0.053*"face" + 0.034*"great" + 0.029*"love" + 0.028*"dry" + 0.027*"like" + 0.025*"leave" + 0.021*"smell" + 0.019*"soft" + 0.019*"clean"')
(2, u'0.027*"serum" + 0.025*"great" + 0.022*"love" + 0.021*"like" + 0.018*"receive" + 0.017*"light" + 0.017*"greasy" + 0.015*"buy" + 0.015*"always" + 0.015*"sample"')
(3, u'0.039*"love" + 0.033*"night" + 0.033*"wrinkle" + 0.027*"day" + 0.026*"year" + 0.024*"face" + 0.020*"daily" + 0.016*"serum" + 0.015*"much" + 0.015*"oily"')
(4, u'0.035*"work" + 0.016*"like" + 0.014*"little" + 0.013*"would" + 0.012*"smell" + 0.011*"nose" + 0.011*"time" + 0.011*"amp" + 0.010*"well" + 0.009*"good"')


In [None]:
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

##############  Latent Semantic Indexing Model 
# lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)
# lsi.print_topics()

# model = models.TfidfModel(corpus, normalize=True)
# lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=5)
# lsi_model.show_topics()

## Perform pyLDAvis-based visualization of LDA topic modeling.
<a id='pyLDAvis'></a>

In [None]:
from __future__ import division
import graphlab as gl
import pandas as pd
import pyLDAvis
import pyLDAvis.graphlab

pyLDAvis.enable_notebook()

In [30]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary,R=30)
# p = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
# pyLDAvis.save_html(p,'/Users/abhijitgp/Documents/INSIGHT/All50.html')

## For Predicting the topics in the new text:
<a id='predictions'></a>

In [None]:
from textblob import TextBlob

'''data_1 is a 5-star rated review whereas data_2 is a 1-star rated review for
https://www.amazon.com/Pure-Body-Naturals-Purifying-Treatment/dp/B00XTAFYKY/ref=cm_cr_arp_d_product_top?ie=UTF8'''

data_1 = "At the end of a long week, I love throwing this on as it works it's magic. \
I have sensitive skin and after using only two times, I've noticed a brighter complexion \
and even a little reduction of rocesa I've battled forever. I started using a charcoal \
face mask I found at Target before using this one and they are pretty \
comparable but Pure Body Naturals gives you four times as much for the same price."

data_2 = "If I could give this 0 stars, I would. A product has never made my face break out like this one. \
Of course, you give a product a try so I continued to use it for a couple of months (1 or 2x a week) \
to see if my skin would improve and it just made it worse. I haven't had so many breakouts since a teenager. \
I do still get acne, but 1-2 a month and I did have blackheads that I was hoping this would clear, but it didn't. \
I stopped using the product completely and my skin has finally returned back to normal."

data = data_1  # Change the input choice here.

print "The review sentiment polarity is:",TextBlob(data).sentiment.polarity
# print data
vec = dictionary.doc2bow(data.split())
# print vec
ldamodel.get_document_topics(vec)

##  -------------------- --------------------  SCRATCHPAD ---------------------------------------- 
<a id='Scratchpad'></a>