## Sentiment Analysis: Yelp Reviews

In [4]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from six.moves import range
import json

# Pandas display configuration. Every option is listed once with the value it
# ends up with after this cell has run.
for option_name, option_value in [
    ('display.width', 500),
    ('display.notebook_repr_html', True),
    ('display.max_columns', 500),
    ('display.max_rows', 500),
    ('display.max_colwidth', 500),
]:
    pd.set_option(option_name, option_value)

# Garbage-collection module
import gc

# Seaborn plot appearance
sns.set_style("whitegrid")
sns.set_context("poster")

In [4]:
import os
# Work from the project root so that the relative data directories below resolve.
# The path is a raw string: it contains backslashes followed by letters
# (\M, \P, \C, \S) that are not valid escape sequences, which newer Python
# versions warn about. The resulting string value is unchanged.
# NOTE(review): this absolute path is machine-specific - adjust before running elsewhere.
os.chdir(r"C:\Master\PD_Career\Courses\Springboard\Capstone Project 2")

# Data directories, relative to the working directory set above
data_path = "yelp-datasets/"
intermediate_data_path = "yelp-datasets/intermediate/"
source_data_path = "yelp-datasets/source/"
submit_data_path = "yelp-datasets/submit/"

## Data Preview
First, we will preview the datasets to see how the data is stored in the source files. <br>

In [7]:
# Show the first line of the business file to see how a record is stored
business_json_path = source_data_path + "yelp_academic_dataset_business.json"
with open(business_json_path, encoding='utf_8') as business_json_file:
    first_business_line = business_json_file.readline()
print(first_business_line)

{"business_id":"1SWheh84yJXfytovILXOAQ","name":"Arizona Biltmore Golf Club","address":"2818 E Camino Acequia Drive","city":"Phoenix","state":"AZ","postal_code":"85016","latitude":33.5221425,"longitude":-112.0184807,"stars":3.0,"review_count":5,"is_open":0,"attributes":{"GoodForKids":"False"},"categories":"Golf, Active Life","hours":null}



In [6]:
# Show the first line of the review file to see how a record is stored
review_json_path = source_data_path + "yelp_academic_dataset_review.json"
with open(review_json_path, encoding='utf_8') as review_json_file:
    first_review_line = review_json_file.readline()
print(first_review_line)

{"review_id":"Q1sbwvVQXV2734tPgoKj4Q","user_id":"hG7b0MtEbXx5QzbzE6C_VA","business_id":"ujmEBvifdJM6h6RLv4wQIg","stars":1.0,"useful":6,"funny":1,"cool":0,"text":"Total bill for this horrible service? Over $8Gs. These crooks actually had the nerve to charge us $69 for 3 pills. I checked online the pills can be had for 19 cents EACH! Avoid Hospital ERs at all costs.","date":"2013-05-07 04:34:36"}



From the two previews above, it is evident that each row of the dataset is stored in JSON format. <br>
As we are concerned only with restaurant reviews, we will extract all the businesses in the restaurant category and then find all of their corresponding reviews. <br> All other business categories are out of scope for this project. <br>

In [14]:
# Collect the ids of all businesses whose category string mentions 'Restaurants'.
# Only a sample of the file is scanned: lines with 0-based index 0..50000.
MAX_BUSINESS_LINE_NO = 50000

restaurant_id_set = set()
business_json_path = source_data_path + "yelp_academic_dataset_business.json"
with open(business_json_path, encoding='utf_8') as business_json_file:
    for line_no, business_json in enumerate(business_json_file):
        if line_no > MAX_BUSINESS_LINE_NO:
            break

        business_record = json.loads(business_json)
        categories = business_record[u'categories']

        # Records without categories, or without 'Restaurants' among them, are ignored
        if categories is not None and u'Restaurants' in categories:
            restaurant_id_set.add(business_record[u'business_id'])

In [18]:
# Number of distinct restaurant ids collected from the sampled business records
print(len(restaurant_id_set))
# 15304 unique restaurants - a good-sized sample of restaurants to start with.

# Once the complete dataset is processed, freeze the set so that it cannot be
# modified accidentally:
# restaurant_id_set = frozenset(restaurant_id_set)

15304


In [22]:
## Data Preparation
## Collect the reviews of the restaurants in restaurant_id_set and store them,
## one review per line, in a plain-text file.
if True:  # constant guard; NOTE(review): presumably a manual switch for skipping this cell
    review_count = 0

    # Output: one escaped review per line. Input: one JSON record per line.
    with open(intermediate_data_path+"yelp_restaurants_review.txt", 'w', encoding='utf_8') as review_txt_file:
        with open(source_data_path+"yelp_academic_dataset_review.json", encoding='utf_8') as review_json_file:
            for line_no, review_json in enumerate(review_json_file):
                # Only a sample of the file is scanned: line indices 0..100000
                if line_no > 100000:
                    break

                review_record = json.loads(review_json)

                # Skip reviews of businesses that are not restaurants
                if review_record[u'business_id'] not in restaurant_id_set:
                    continue

                # Normalise '\r\n' and bare '\r' to '\n' before escaping. A '\r'
                # left in the text would be written as-is and then treated as a
                # line break when the file is read back in text mode, splitting
                # one review across several lines.
                review_text = review_record[u'text'].replace('\r\n', '\n').replace('\r', '\n')

                # Escape newlines so that every review occupies exactly one line
                review_txt_file.write(review_text.replace('\n', '\\n') + '\n')
                review_count += 1
    print("Total number of reviews: ", review_count)

Total number of reviews:  63667


Now, using spaCy, we can make sense of the review data. <br>
spaCy can do the following: <br>
(1) Tokenization <br> 
(2) Text normalization, such as lowercasing and lemmatization <br>
(3) Part-of-speech tagging <br>
(4) Syntactic dependency parsing <br>
(5) Sentence boundary detection <br>
(6) Named entity recognition and annotation <br>

In [1]:
import spacy
import pandas as pd
import itertools as it

# Load the 'en_core_web_sm' spaCy model; `nlp` is applied to review text in the cells below
nlp = spacy.load('en_core_web_sm')

We will apply these transformations to a sample review.

In [7]:
# Take the second line (index 1) of the restaurant review file as the sample
review_txt_path = intermediate_data_path + "yelp_restaurants_review.txt"
with open(review_txt_path, encoding='utf_8') as review:
    escaped_sample = list(it.islice(review, 1, 2))[0]

# Turn the escaped '\n' sequences back into real line breaks
sample_review = escaped_sample.replace('\\n', '\n')

print(sample_review)

I'll be the first to admit that I was not excited about going to La Tavolta. Being a food snob, when a group of friends suggested we go for dinner I looked online at the menu and to me there was nothing special and it seemed overpriced.  Im also not big on ordering pasta when I go out. Alas, I was outnumbered. Thank goodness! I ordered the sea bass special. It was to die for. Cooked perfectly, seasoned perfectly, perfect portion. I can not say enough good things about this dish. When the server asked how it was he seemed very proud of the dish and said, " doesn't she (the chef) do an incredible job?" She does. 

My hubby got the crab tortellini and also loved his. I heard "mmmm this is so good" from all around the table. Our waiter was super nice and even gave us free desserts because we were some of the last people in the restaurant. Service was very slow and the place was PACKED but we had our jugs of wine and a large group with good conversation so it didn't seem to bother anyone.



In [8]:
%%time
# Run the loaded spaCy pipeline on the sample review; %%time reports how long the cell takes
parsed_review = nlp(sample_review)

Wall time: 296 ms


In [9]:
# Printing the parsed document shows the review text
print(parsed_review)

I'll be the first to admit that I was not excited about going to La Tavolta. Being a food snob, when a group of friends suggested we go for dinner I looked online at the menu and to me there was nothing special and it seemed overpriced.  Im also not big on ordering pasta when I go out. Alas, I was outnumbered. Thank goodness! I ordered the sea bass special. It was to die for. Cooked perfectly, seasoned perfectly, perfect portion. I can not say enough good things about this dish. When the server asked how it was he seemed very proud of the dish and said, " doesn't she (the chef) do an incredible job?" She does. 

My hubby got the crab tortellini and also loved his. I heard "mmmm this is so good" from all around the table. Our waiter was super nice and even gave us free desserts because we were some of the last people in the restaurant. Service was very slow and the place was PACKED but we had our jugs of wine and a large group with good conversation so it didn't seem to bother anyone.



In [11]:
# Sentence detection and segmentation: print each detected sentence, numbered from 1
for sentence_no, sentence in enumerate(parsed_review.sents, start=1):
    print('Sentence {}:'.format(sentence_no))
    print(sentence)
    print()

Sentence 1:
I'll be the first to admit that I was not excited about going to La Tavolta.

Sentence 2:
Being a food snob, when a group of friends suggested we go for dinner

Sentence 3:
I looked online at the menu and to me there was nothing special

Sentence 4:
and it seemed overpriced.  

Sentence 5:
Im also not big on ordering pasta when I go out.

Sentence 6:
Alas, I was outnumbered.

Sentence 7:
Thank goodness!

Sentence 8:
I ordered the sea bass special.

Sentence 9:
It was to die for.

Sentence 10:
Cooked perfectly, seasoned perfectly, perfect portion.

Sentence 11:
I can not say enough good things about this dish.

Sentence 12:
When the server asked how it was he seemed very proud of the dish and said, " doesn't she (the chef) do an incredible job?

Sentence 13:
"

Sentence 14:
She does. 



Sentence 15:
My hubby got the crab tortellini and also loved his.

Sentence 16:
I heard "mmmm this is so good" from all around the table.

Sentence 17:
Our waiter was super nice and even gav

In [12]:
# Named Entity Recognition (NER): print each detected entity with its label, numbered from 1
for entity_no, entity in enumerate(parsed_review.ents, start=1):
    print('Entity {}:'.format(entity_no), entity, '-', entity.label_)
    print()

Entity 1: first - ORDINAL

Entity 2: La Tavolta - ORG

Entity 3: Romano - ORG



In [13]:
# Part-of-speech tagging: pair every token's text with its POS tag.
# token_text is reused by the lemmatization cell further down.
token_text = [tok.orth_ for tok in parsed_review]
token_pos = [tok.pos_ for tok in parsed_review]

pd.DataFrame({'token_text': token_text, 'part_of_speech': token_pos},
             columns=['token_text', 'part_of_speech'])

Unnamed: 0,token_text,part_of_speech
0,I,PRON
1,'ll,AUX
2,be,VERB
3,the,DET
4,first,ADJ
5,to,PART
6,admit,VERB
7,that,ADP
8,I,PRON
9,was,VERB


In [14]:
# Lemmatization and token shape: show each token's text next to its lemma and shape.
# token_text comes from the part-of-speech cell above.
token_lemma = [tok.lemma_ for tok in parsed_review]
token_shape = [tok.shape_ for tok in parsed_review]

pd.DataFrame({'token_text': token_text,
              'token_lemma': token_lemma,
              'token_shape': token_shape},
             columns=['token_text', 'token_lemma', 'token_shape'])

Unnamed: 0,token_text,token_lemma,token_shape
0,I,-PRON-,X
1,'ll,will,'xx
2,be,be,xx
3,the,the,xxx
4,first,first,xxxx
5,to,to,xx
6,admit,admit,xxxx
7,that,that,xxxx
8,I,-PRON-,X
9,was,be,xxx


In [15]:
# Per-token attributes: log probability plus flags for stop word, punctuation,
# whitespace, number-like and out-of-vocabulary tokens
token_attributes = [(token.orth_,
                     token.prob,
                     token.is_stop,
                     token.is_punct,
                     token.is_space,
                     token.like_num,
                     token.is_oov)
                    for token in parsed_review]

attribute_columns = ['text',
                     'log_probability',
                     'stop?',
                     'punctuation?',
                     'whitespace?',
                     'number?',
                     'out of vocab.?']

# Render the boolean flags as 'Yes' / '' before building the frame. Writing
# strings into already-created boolean columns relies on pandas silently
# changing the column dtype, which newer pandas versions deprecate, and
# DataFrame.applymap is deprecated there as well.
display_rows = [(text, log_probability) + tuple(u'Yes' if flag else u'' for flag in flags)
                for text, log_probability, *flags in token_attributes]

df = pd.DataFrame(display_rows, columns=attribute_columns)

df

Unnamed: 0,text,log_probability,stop?,punctuation?,whitespace?,number?,out of vocab.?
0,I,-20.0,Yes,,,,Yes
1,'ll,-20.0,Yes,,,,Yes
2,be,-20.0,Yes,,,,Yes
3,the,-20.0,Yes,,,,Yes
4,first,-20.0,Yes,,,,Yes
5,to,-20.0,Yes,,,,Yes
6,admit,-20.0,,,,,Yes
7,that,-20.0,Yes,,,,Yes
8,I,-20.0,Yes,,,,Yes
9,was,-20.0,Yes,,,,Yes
