# Import packages

In [1]:
import numpy as np
import pandas as pd
import re
import time
import nltk

In [2]:
# Fetch the NLTK corpora used later in the notebook: the stopword lists
# (read through nltk.corpus.stopwords) and the WordNet data
# (presumably what WordNetLemmatizer looks words up in).
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [3]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [4]:
import gensim
from gensim import corpora, models
from gensim.models.ldamulticore import LdaModel

In [5]:
import pyLDAvis
import pyLDAvis.gensim

# Load the review data

In [6]:
# Load the Yelp review training set; the path is relative to the notebook's
# working directory.
# NOTE(review): the preview shows an 'Unnamed: 0' column, presumably a row index
# that was saved into the CSV -- confirm before relying on it.
df = pd.read_csv('yelp_training_set_review.csv', encoding='utf-8')

In [7]:
# Preview the first rows to check that the columns loaded as expected.
df.head()

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,votes.cool,votes.funny,votes.useful
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,0,5
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,0,1
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,0,2
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0


In [8]:
# (number of reviews, number of columns) in the loaded frame.
df.shape

(229907, 10)

# Clean the reviews

In [9]:
%%time
wordnet_lemmatizer = WordNetLemmatizer()
stopset = list(set(stopwords.words('english')))
clean_reviews_text = []
for review in df['text']:  # Loop through the tokens (the words or symbols) in each review. 
    try:    
        cleaned_review = re.sub("[^a-zA-Z]"," ", review)  # Remove numbers and punctuation.
        cleaned_review = cleaned_review.lower()  # Convert the text to lower case.
        cleaned_review = ' '.join([word for word in cleaned_review.split() if word not in stopset])  # Keep only words that are not stopwords.
        cleaned_review = ' '.join([wordnet_lemmatizer.lemmatize(word, pos='n') for word in cleaned_review.split()])  # Keep each noun's lemma.
        cleaned_review = ' '.join([wordnet_lemmatizer.lemmatize(word, pos='v') for word in cleaned_review.split()])  # Keep each verb's lemma.
        cleaned_review = re.sub(r"(http\S+)"," ", cleaned_review)  # Remove http links.
        cleaned_review = ' '.join(cleaned_review.split())  # Remove white space.
    except TypeError:
        pass
    clean_reviews_text.append(cleaned_review)

CPU times: user 4min 37s, sys: 2.22 s, total: 4min 39s
Wall time: 4min 39s


In [10]:
# Store the cleaned text next to the raw review. clean_reviews_text was built by
# iterating df['text'] in order, so entry i belongs to row i.
df['cleanText'] = clean_reviews_text
df.head()

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,votes.cool,votes.funny,votes.useful,cleanText
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,0,5,wife take birthday breakfast excellent weather...
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0,idea people give bad review place go show plea...
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,0,1,love gyro plate rice good also dig candy selec...
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,0,2,rosie dakota love chaparral dog park convenien...
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0,general manager scott petello good egg go deta...


Look at a review, before and after cleaning.

In [11]:
# Show the first review side by side: the raw text, then its cleaned form.
first_review = df.iloc[0]
print('ORIGINAL: ' + first_review['text'])
print(' ')
print('CLEANED: ' + first_review['cleanText'])

ORIGINAL: My wife took me here on my birthday for breakfast and it was excellent.  The weather was perfect which made sitting outside overlooking their grounds an absolute pleasure.  Our waitress was excellent and our food arrived quickly on the semi-busy Saturday morning.  It looked like the place fills up pretty quickly so the earlier you get here the better.

Do yourself a favor and get their Bloody Mary.  It was phenomenal and simply the best I've ever had.  I'm pretty sure they only use ingredients from their garden and blend them fresh when you order it.  It was amazing.

While EVERYTHING on the menu looks excellent, I had the white truffle scrambled eggs vegetable skillet and it was tasty and delicious.  It came with 2 pieces of their griddled bread with was amazing and it absolutely made the meal complete.  It was the best "toast" I've ever had.

Anyway, I can't wait to go back!
 
CLEANED: wife take birthday breakfast excellent weather perfect make sit outside overlook gr

# Perform semantic analysis using LDA.

Preprocess the reviews by creating a dictionary of words used and a bag-of-words corpus. Note that each of the steps below takes several minutes.

In [12]:
%%time
tokens_by_doc = [review.split() for review in clean_reviews_text]
dictionary = corpora.Dictionary(tokens_by_doc)
bow_corpus = [dictionary.doc2bow(tokens) for tokens in tokens_by_doc]

CPU times: user 43 s, sys: 748 ms, total: 43.7 s
Wall time: 43.8 s


Run the LDA model. This will take about 10 minutes.

In [13]:
%%time
# Fit a 10-topic LDA model on the bag-of-words corpus. id2word lets the model
# report words instead of integer ids; random_state fixes the seed so the run
# can be repeated.
# NOTE(review): LdaModel is imported from gensim.models.ldamulticore but appears
# to be the single-process implementation (CPU time ~= wall time) -- confirm
# whether LdaMulticore was intended.
lda_model = LdaModel(bow_corpus, num_topics=10, id2word=dictionary, random_state=201)

CPU times: user 10min 43s, sys: 222 ms, total: 10min 44s
Wall time: 10min 44s


Make a list of the most likely topic for each review. This will also take about 10 minutes.

In [14]:
%%time
topics_list = []
for doc_topics in lda_model.get_document_topics(bow_corpus):
    topics_list.append(sorted(doc_topics, key=lambda doc: -doc[1])[0][0])

CPU times: user 6min 51s, sys: 153 ms, total: 6min 51s
Wall time: 6min 51s


In [15]:
# Inspect the (topic_id, probability) pairs for a single review (index 1).
# Fewer than 10 pairs are listed; presumably topics below gensim's minimum
# probability threshold are omitted -- confirm against the gensim docs.
lda_model.get_document_topics(bow_corpus[1])

[(1, 0.65749589578948586),
 (2, 0.027202527864085457),
 (3, 0.026109629012616129),
 (8, 0.28383360413610059)]

Add a new column to the data frame called 'Top Topic'.

In [16]:
# Attach each review's most probable topic id. topics_list was built by
# iterating bow_corpus in order, so it lines up with the rows of df.
df['Top Topic'] = topics_list
df.head()

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,votes.cool,votes.funny,votes.useful,cleanText,Top Topic
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,0,5,wife take birthday breakfast excellent weather...,2
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0,idea people give bad review place go show plea...,1
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,0,1,love gyro plate rice good also dig candy selec...,9
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,0,2,rosie dakota love chaparral dog park convenien...,6
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0,general manager scott petello good egg go deta...,5


Create a pivot table of the average number of useful votes by top topic. Which topic gets the most useful votes? Do the same analysis for stars. Which topic gets the most stars?

In [17]:
# Mean number of "useful" votes per top topic (one column per topic id).
df.pivot_table(values='votes.useful', columns='Top Topic', aggfunc='mean')

Top Topic,0,1,2,3,4,5,6,7,8,9
votes.useful,1.42551,1.643496,1.644544,1.506919,1.846467,1.275724,1.573795,1.186448,1.314423,1.11718


In [18]:
# Mean star rating per top topic (one column per topic id).
df.pivot_table(values='stars', columns='Top Topic', aggfunc='mean')

Top Topic,0,1,2,3,4,5,6,7,8,9
stars,4.1069,2.613003,4.138665,3.895518,4.163043,3.624386,3.956246,4.04478,3.850044,3.891745


Look at the 20 most probable words in Topic 5, together with their probabilities.

In [19]:
# List the 20 highest-probability (word, probability) pairs for topic 5.
lda_model.show_topic(topicid=5, topn=20)

[('get', 0.014583716381568577),
 ('go', 0.011807179761282066),
 ('time', 0.010875527405797673),
 ('call', 0.0096648951492867979),
 ('take', 0.0096433997560053525),
 ('work', 0.0093190344065860325),
 ('service', 0.0082792584297509339),
 ('would', 0.0082012522852191981),
 ('car', 0.0076914782720942003),
 ('say', 0.007617794254765357),
 ('need', 0.0074760666105620037),
 ('tell', 0.0073642497473562765),
 ('make', 0.0071528496211106136),
 ('back', 0.0069438132115129166),
 ('day', 0.0064086920690013061),
 ('customer', 0.0061722227505412244),
 ('know', 0.0056777046532892998),
 ('give', 0.0055192946849838987),
 ('year', 0.0052047466414405297),
 ('one', 0.0050669229777149131)]

# Visualize your LDA results

Preparing the visualization will take several minutes.

In [20]:
%%time
# Build the data structure pyLDAvis needs from the fitted model, the
# bag-of-words corpus and the dictionary.
# NOTE(review): newer pyLDAvis releases expose this module as
# pyLDAvis.gensim_models -- confirm against the installed version.
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary)

CPU times: user 6min 45s, sys: 1.28 s, total: 6min 46s
Wall time: 6min 53s


The plot on the left below is generated using multidimensional scaling, which is a general purpose algorithm for plotting, in two dimensions, items that are multidimensional (e.g., topics which are described by probabilities of thousands of words). For further reading on multidimensional scaling, see https://en.wikipedia.org/wiki/Multidimensional_scaling. 

When you hover over a topic on the left plot, you can then see a ranking of the most salient words on the right plot. Using these two plots, you might be able to create names for your ten topics.

In [21]:
%%time
# Render the prepared visualization as the cell's output.
pyLDAvis.display(LDAvis_prepared)

CPU times: user 5.85 ms, sys: 3.99 ms, total: 9.84 ms
Wall time: 9.44 ms
