In [None]:
#!pip install nltk

In [165]:
import os
import requests
from bs4 import BeautifulSoup
import ast
import re
import numpy as np
import pandas as pd

In [166]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import nltk
from nltk import tokenize
# Fetch the NLTK 'punkt' data package (presumably what tokenize.sent_tokenize
# relies on for sentence splitting -- TODO confirm for the installed NLTK version).
nltk.download('punkt')
# One shared VADER analyzer instance; analyzeParaSentiment reads it as a module-level name.
analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\astanchi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [167]:
# Load the Yelp business URLs to analyze, one per line, from a text file in
# the current working directory.
websites = []  # reset first so a failed re-run never leaves a stale list behind
root_path = os.getcwd()
file_input = os.path.join(root_path, 'yelp_closed_biz.txt')
with open(file_input,'r') as fh:
    # Strip surrounding whitespace and drop blank lines: an empty entry would
    # otherwise be handed to requests.get() as a URL by the scraping loop.
    websites = [line.strip() for line in fh if line.strip()]
websites

['https://www.yelp.com/biz/maxs-restaurants-san-francisco-3',
 'https://www.yelp.com/biz/franceschis-san-francisco',
 'https://www.yelp.com/biz/butterfly-san-francisco',
 'https://www.yelp.com/biz/la-terrasse-san-francisco',
 'https://www.yelp.com/biz/titas-restaurant-san-francisco',
 'https://www.yelp.com/biz/the-public-san-francisco',
 'https://www.yelp.com/biz/tin-pan-san-francisco',
 'https://www.yelp.com/biz/eat-restaurant-san-francisco',
 'https://www.yelp.com/biz/bonanza-restaurant-san-francisco',
 'https://www.yelp.com/biz/the-metro-bar-and-restaurant-san-francisco',
 'https://www.yelp.com/biz/sinbads-pier2-restaurant-san-francisco-2',
 'https://www.yelp.com/biz/the-monte-carlo-san-francisco',
 'https://www.yelp.com/biz/myth-san-francisco',
 'https://www.yelp.com/biz/2223-restaurant-san-francisco',
 'https://www.yelp.com/biz/the-palace-san-francisco',
 'https://www.yelp.com/biz/watergate-san-francisco',
 'https://www.yelp.com/biz/tallula-san-francisco',
 'https://www.yelp.com/b

In [168]:
def cleanText(dirtyText):
    """Normalize raw script text before it is evaluated as a Python literal.

    Steps, in order:
      1. every occurrence of the substring ``null`` becomes ``""`` (this is a
         plain substring replace, so it also hits ``null`` inside longer words);
      2. tab characters are removed;
      3. runs of spaces are collapsed to a single space;
      4. the first character of the result is dropped.

    Returns the cleaned string (empty string for empty input).
    """
    without_nulls_or_tabs = dirtyText.replace('null', '""').replace('\t', '')
    # Collapse consecutive spaces into one.
    single_spaced = re.sub(' +', ' ', without_nulls_or_tabs)
    # Discard the leading character (presumably a leading newline/space in the
    # script tag's text -- TODO confirm against a live page).
    return single_spaced[1:]

In [169]:
def analyzeParaSentiment(paragraph, doprint = False):
    """Score a paragraph sentence by sentence and return its average scores.

    The paragraph is split with ``tokenize.sent_tokenize`` and each sentence is
    scored with the module-level ``analyzer``. Every sentence's scores are also
    appended to the module-level ``rest_by_sen_*`` lists, which the caller is
    expected to have initialized.

    Parameters
    ----------
    paragraph : str
        Text to analyze.
    doprint : bool, optional
        When True, print each sentence with its compound score, followed by
        the four paragraph averages.

    Returns
    -------
    tuple
        ``(compound, positive, neutral, negative)`` means over the sentences,
        each rounded to 4 decimals. A paragraph that yields no sentences
        returns ``(0.0, 0.0, 0.0, 0.0)`` -- the same "no data" value
        analyzeYelp uses -- rather than NaN, which ``np.mean([])`` would
        produce and which would then turn every downstream average into NaN.
    """
    global rest_by_sen_cmpd, rest_by_sen_pos, rest_by_sen_neu, rest_by_sen_neg

    sentence_list = tokenize.sent_tokenize(paragraph)

    para_cmpd = []
    para_pos = []
    para_neu = []
    para_neg = []

    for sentence in sentence_list:
        vs = analyzer.polarity_scores(sentence)
        para_cmpd.append(vs["compound"])
        para_pos.append(vs["pos"])
        para_neu.append(vs["neu"])
        para_neg.append(vs["neg"])

        # Also feed the restaurant-wide, per-sentence accumulators.
        rest_by_sen_cmpd.append(vs["compound"])
        rest_by_sen_pos.append(vs["pos"])
        rest_by_sen_neu.append(vs["neu"])
        rest_by_sen_neg.append(vs["neg"])

        if doprint:
            print("{:-<100} {}".format(sentence, str(vs["compound"])))

    if sentence_list:
        # Compute each mean once; the values are reused for printing and returning.
        avg_cmpd = round(np.mean(para_cmpd),4)
        avg_pos = round(np.mean(para_pos),4)
        avg_neu = round(np.mean(para_neu),4)
        avg_neg = round(np.mean(para_neg),4)
    else:
        # No sentences to average: avoid np.mean([]) -> NaN + RuntimeWarning.
        avg_cmpd = avg_pos = avg_neu = avg_neg = 0.0

    if doprint:
        print("AVERAGE COMPOUND SENTIMENT FOR PARAGRAPH: \t" + str(avg_cmpd))
        print("AVERAGE POSITIVE SENTIMENT FOR PARAGRAPH: \t" + str(avg_pos))
        print("AVERAGE NEUTRAL SENTIMENT FOR PARAGRAPH: \t" + str(avg_neu))
        print("AVERAGE NEGATIVE SENTIMENT FOR PARAGRAPH: \t" + str(avg_neg))

    return (avg_cmpd, avg_pos, avg_neu, avg_neg)

In [170]:
def analyzeYelpPage(yelp_dict,doprint = False):
    """Score every review on one parsed Yelp page.

    Parameters
    ----------
    yelp_dict : dict
        Parsed page data. Reads ``yelp_dict["review"]``, an iterable of
        mappings that each carry a ``"description"`` text.
    doprint : bool, optional
        When True, print each review and the page's average compound score.

    Returns
    -------
    tuple of four lists
        ``(compound, positive, neutral, negative)`` with one paragraph-level
        average per successfully scored review.

    Notes
    -----
    This is best-effort: any ordinary error (for example a missing
    ``"review"`` key) prints a message and returns whatever was collected up
    to that point; reviews after the failing one on the page are not scored.
    Only ``Exception`` subclasses are handled, so Ctrl-C (KeyboardInterrupt)
    and SystemExit still stop a long scraping run.
    """
    page_cmpd = []
    page_pos = []
    page_neu = []
    page_neg = []

    try:
        for idx, review in enumerate(yelp_dict["review"]):
            if doprint:
                print("\nReview #%s: %s\n" % (idx+1,review["description"]))
            para_cmpd, para_pos, para_neu, para_neg = analyzeParaSentiment(review["description"], doprint)

            # Appended together, so the four lists always stay the same length.
            page_cmpd.append(para_cmpd)
            page_pos.append(para_pos)
            page_neu.append(para_neu)
            page_neg.append(para_neg)

        if doprint:
            print("\n\nAVERAGE YELP PAGE REVIEW SENTIMENT: \t" + str(round(np.mean(page_cmpd),4)))
            print ("************************************************************\n\n")

    except Exception:
        # Was a bare `except:`, which also trapped KeyboardInterrupt/SystemExit.
        print("Error parsing review.")

    return (page_cmpd, page_pos, page_neu, page_neg)

In [191]:
def _loadYelpJson(soup):
    """Return the page's first ``application/ld+json`` script as a Python object.

    The script text is passed through cleanText and evaluated with
    ``ast.literal_eval``. Returns None when the page has no such script tag.
    Evaluation errors propagate to the caller.
    """
    stuff = soup.find("script", type="application/ld+json")
    if not stuff:
        return None
    # NOTE(review): json.loads looks like the natural parser for this payload;
    # the cleanText + literal_eval pair is kept so existing results do not change.
    return ast.literal_eval(cleanText(stuff.text))


def _scoreStats(values):
    """Return (mean rounded to 4 decimals, median, min, max) of a non-empty list."""
    return (round(np.mean(values),4), np.median(values), np.min(values), np.max(values))


def analyzeYelp(url, doprint=False, max_pages=4):
    """Download a Yelp business page (plus follow-up review pages) and summarize sentiment.

    Parameters
    ----------
    url : str
        Business page URL. Follow-up pages are requested as
        ``url + "?start=" + offset`` in steps of 20 reviews.
    doprint : bool, optional
        When True, print the restaurant header, per-review details and the
        overall average compound score.
    max_pages : int, optional
        Upper bound on the number of review pages read, first page included.
        The default of 4 matches the previous hard-coded limit.

    Returns
    -------
    dict
        Restaurant name, rating, review count, review-level averages
        (``compound``/``positive``/``neutral``/``negative``), and
        sentence-level averages, medians and min/max ranges. Every statistic
        is 0 when nothing could be scored.

    Notes
    -----
    Resets the module-level ``rest_by_sen_*`` lists, which
    analyzeParaSentiment fills while the pages are scored. Errors raised while
    reading ratings or follow-up pages are reported with a printed message and
    the results collected so far are still summarized.
    """
    global rest_by_sen_cmpd, rest_by_sen_pos, rest_by_sen_neu, rest_by_sen_neg
    print("processing url: %s" % url)
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'lxml')

    # Total number of review pages = last space-separated token of the
    # pagination widget's text (presumably "Page 1 of N" -- TODO confirm).
    # Defaults to 1 so the name is always bound; previously a missing or
    # unparsable widget caused a NameError further down that was reported as
    # a review-parsing error.
    total_pages = 1
    pagination = soup.find("div", {"class":"page-of-pages arrange_unit arrange_unit--fill"})
    if pagination:
        page_list = pagination.text.strip().split(" ")
        try:
            total_pages = int(page_list[-1])
        except ValueError:
            pass  # not a number: keep the single-page default

    rest_name = ""
    rest_review = 0.0
    rest_reviewcnt = 0
    rest_cmpd = []
    rest_pos = []
    rest_neu = []
    rest_neg = []

    # reinitialize restaurant by sentence lists
    rest_by_sen_cmpd = []
    rest_by_sen_pos = []
    rest_by_sen_neu = []
    rest_by_sen_neg = []

    def _absorbPage(page_dict):
        # Score one parsed page and fold its review-level results into the totals.
        page_scores = analyzeYelpPage(page_dict, doprint)
        for bucket, page_values in zip((rest_cmpd, rest_pos, rest_neu, rest_neg), page_scores):
            bucket.extend(page_values)

    yelp_dict = _loadYelpJson(soup)
    if yelp_dict is not None:
        rest_name = yelp_dict["name"]

        if ("aggregateRating" in yelp_dict):
            try:
                rest_review = yelp_dict["aggregateRating"]["ratingValue"]
                rest_reviewcnt = yelp_dict["aggregateRating"]["reviewCount"]

                if doprint:
                    print ("\n\n************************************************************")
                    print("Restaurant: %s" % rest_name)
                    print("Yelp Rating: %s" % rest_review)
                _absorbPage(yelp_dict)

                reviews_per_page = 20
                # Follow-up pages: never past the last page the site reports,
                # and never more than max_pages pages in total.
                for page_cnt in range(1, min(total_pages, max_pages)):
                    page_url = url + "?start=" + str(reviews_per_page * page_cnt)
                    next_page = requests.get(page_url)
                    next_soup = BeautifulSoup(next_page.text, 'lxml')
                    next_dict = _loadYelpJson(next_soup)
                    if next_dict is not None:
                        _absorbPage(next_dict)

                if doprint:
                    print("\n\nAVERAGE YELP REVIEW SENTIMENT: \t" + str(round(np.mean(rest_cmpd),4)))
                    print ("************************************************************\n\n")

            except Exception:
                # Was a bare `except:`; Ctrl-C now interrupts the run as expected.
                print("Error parsing review for %s" % rest_name)

    # Review-level averages (one value per review).
    if rest_cmpd:
        avg_cmpd = round(np.mean(rest_cmpd),4)
        avg_pos = round(np.mean(rest_pos),4)
        avg_neu = round(np.mean(rest_neu),4)
        avg_neg = round(np.mean(rest_neg),4)
    else:
        avg_cmpd = 0.0
        avg_pos = 0.0
        avg_neu = 0.0
        avg_neg = 0.0

    # Sentence-level statistics. Guarded on the sentence list as well, because
    # np.min/np.max raise ValueError on an empty sequence.
    if rest_cmpd and rest_by_sen_cmpd:
        avg_rest_by_sen_cmpd, rest_median_cmpd, rest_cmpd_min, rest_cmpd_max = _scoreStats(rest_by_sen_cmpd)
        avg_rest_by_sen_pos, rest_median_pos, rest_pos_min, rest_pos_max = _scoreStats(rest_by_sen_pos)
        avg_rest_by_sen_neu, rest_median_neu, rest_neu_min, rest_neu_max = _scoreStats(rest_by_sen_neu)
        avg_rest_by_sen_neg, rest_median_neg, rest_neg_min, rest_neg_max = _scoreStats(rest_by_sen_neg)
    else:
        avg_rest_by_sen_cmpd, rest_median_cmpd, rest_cmpd_min, rest_cmpd_max = 0.0, 0.0, 0, 0
        avg_rest_by_sen_pos, rest_median_pos, rest_pos_min, rest_pos_max = 0.0, 0.0, 0, 0
        avg_rest_by_sen_neu, rest_median_neu, rest_neu_min, rest_neu_max = 0.0, 0.0, 0, 0
        avg_rest_by_sen_neg, rest_median_neg, rest_neg_min, rest_neg_max = 0.0, 0.0, 0, 0

    return {
        "name": rest_name,
        "rating": rest_review,
        "compound": avg_cmpd,
        "positive": avg_pos,
        "neutral": avg_neu,
        "negative": avg_neg,
        "num_reviews": rest_reviewcnt,
        "compound_by_sent": avg_rest_by_sen_cmpd,
        "positive_by_sent": avg_rest_by_sen_pos,
        "neutral_by_sent": avg_rest_by_sen_neu,
        "negative_by_sent": avg_rest_by_sen_neg,
        "median_cmpd": rest_median_cmpd,
        "median_pos": rest_median_pos,
        "median_neu": rest_median_neu,
        "median_neg": rest_median_neg,
        "range_cmpd_min": rest_cmpd_min,
        "range_cmpd_max": rest_cmpd_max,
        "range_pos_min": rest_pos_min,
        "range_pos_max": rest_pos_max,
        "range_neu_min": rest_neu_min,
        "range_neu_max": rest_neu_max,
        "range_neg_min": rest_neg_min,
        "range_neg_max": rest_neg_max,

    }

In [194]:
# Module-level per-sentence score accumulators; analyzeYelp resets them for
# each restaurant and analyzeParaSentiment appends to them.
rest_by_sen_cmpd, rest_by_sen_pos, rest_by_sen_neu, rest_by_sen_neg = [], [], [], []

# One summary dict per URL. Kept as an explicit loop so that results gathered
# before a failure remain available in `reviews`.
reviews = []
for url in websites: # websites[0:1]:
    summary = analyzeYelp(url, False)
    reviews.append(summary)

#reviews

processing url: https://www.yelp.com/biz/maxs-restaurants-san-francisco-3
processing url: https://www.yelp.com/biz/franceschis-san-francisco
processing url: https://www.yelp.com/biz/butterfly-san-francisco
processing url: https://www.yelp.com/biz/la-terrasse-san-francisco
processing url: https://www.yelp.com/biz/titas-restaurant-san-francisco
processing url: https://www.yelp.com/biz/the-public-san-francisco
processing url: https://www.yelp.com/biz/tin-pan-san-francisco
processing url: https://www.yelp.com/biz/eat-restaurant-san-francisco
processing url: https://www.yelp.com/biz/bonanza-restaurant-san-francisco
processing url: https://www.yelp.com/biz/the-metro-bar-and-restaurant-san-francisco
processing url: https://www.yelp.com/biz/sinbads-pier2-restaurant-san-francisco-2
processing url: https://www.yelp.com/biz/the-monte-carlo-san-francisco
processing url: https://www.yelp.com/biz/myth-san-francisco
processing url: https://www.yelp.com/biz/2223-restaurant-san-francisco
processing url

In [195]:
# Build one row per restaurant from the summary dicts returned by analyzeYelp.
reviews_df = pd.DataFrame(reviews)
# Bare last expression so the notebook renders the frame.
reviews_df

Unnamed: 0,compound,compound_by_sent,median_cmpd,median_neg,median_neu,median_pos,name,negative,negative_by_sent,neutral,...,positive_by_sent,range_cmpd_max,range_cmpd_min,range_neg_max,range_neg_min,range_neu_max,range_neu_min,range_pos_max,range_pos_min,rating
0,0.1346,0.0497,0.00000,0.000,0.8580,0.0000,Max’s Restaurants,0.0876,0.1100,0.8159,...,0.0783,0.7269,-0.4404,1.000,0.0,1.0,0.000,0.357,0.0,3.0
1,0.0624,0.0624,0.00000,0.061,0.8080,0.1150,Franceschi’s,0.0619,0.0619,0.8240,...,0.1142,0.7050,-0.4336,0.150,0.0,1.0,0.527,0.343,0.0,2.0
2,0.2836,0.2691,0.31820,0.000,0.7990,0.1490,Butterfly Restaurant,0.0483,0.0429,0.7384,...,0.1848,0.9546,-0.8377,1.000,0.0,1.0,0.000,1.000,0.0,3.5
3,0.1785,0.1455,0.00000,0.000,0.8110,0.0490,La Terrasse,0.0676,0.0690,0.7766,...,0.1410,0.9523,-0.9015,1.000,0.0,1.0,0.000,1.000,0.0,2.5
4,0.2955,0.2688,0.38180,0.000,0.8140,0.1800,Tita’s Restaurant,0.0560,0.0585,0.7835,...,0.1470,0.8550,-0.3818,1.000,0.0,1.0,0.000,0.610,0.0,3.0
5,0.2926,0.2662,0.29675,0.000,0.7860,0.1500,The Public,0.0410,0.0429,0.7519,...,0.1882,0.9775,-0.8622,1.000,0.0,1.0,0.000,1.000,0.0,4.0
6,0.2505,0.0579,0.00000,0.000,0.9650,0.0000,Tin Pan,0.0839,0.0671,0.7681,...,0.0593,0.7579,-0.5994,0.262,0.0,1.0,0.473,0.400,0.0,2.5
7,0.3689,0.2781,0.31820,0.000,0.8090,0.1600,EAT Restaurant,0.0286,0.0342,0.7373,...,0.1945,0.9635,-0.8553,1.000,0.0,1.0,0.000,1.000,0.0,4.0
8,0.3146,0.2322,0.20680,0.000,0.8070,0.1280,Bonanza Restaurant,0.0433,0.0399,0.7200,...,0.1914,0.9464,-0.9313,1.000,0.0,1.0,0.000,1.000,0.0,4.0
9,0.3310,0.3089,0.38180,0.000,0.7700,0.1820,The Metro Bar and Restaurant,0.0332,0.0377,0.7533,...,0.2004,0.9880,-0.8594,1.000,0.0,1.0,0.000,1.000,0.0,4.0


In [196]:
# Persist the per-restaurant summary next to the notebook, without the row index column.
reviews_df.to_csv('yelp_review_analysis.csv', index=False)