In [None]:
#!pip install nltk

In [145]:
import os
import requests
from bs4 import BeautifulSoup
import ast
import re
import numpy as np
import pandas as pd

In [146]:
# Sentiment-analysis setup: VADER for scoring, NLTK for sentence splitting.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import nltk
from nltk import tokenize
# Fetch the 'punkt' NLTK data package (the recorded output shows it is skipped
# when already up to date). Presumably required by tokenize.sent_tokenize,
# which analyzeParaSentiment calls -- confirm for the installed NLTK version.
nltk.download('punkt')
# Single module-level analyzer instance, used by analyzeParaSentiment.
analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\astanchi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [147]:
# Load the Yelp business URLs to analyse: one URL per line of
# 'yelp_closed_biz.txt' in the current working directory.
root_path = os.getcwd()
file_input = os.path.join(root_path, 'yelp_closed_biz.txt')
with open(file_input, 'r') as fh:
    # Strip surrounding whitespace and drop blank lines so that an empty row
    # or stray spaces never reach requests.get() as an invalid URL.
    websites = [line.strip() for line in fh if line.strip()]
websites

['https://www.yelp.com/biz/maxs-restaurants-san-francisco-3',
 'https://www.yelp.com/biz/franceschis-san-francisco',
 'https://www.yelp.com/biz/butterfly-san-francisco',
 'https://www.yelp.com/biz/la-terrasse-san-francisco',
 'https://www.yelp.com/biz/titas-restaurant-san-francisco',
 'https://www.yelp.com/biz/the-public-san-francisco',
 'https://www.yelp.com/biz/tin-pan-san-francisco',
 'https://www.yelp.com/biz/eat-restaurant-san-francisco',
 'https://www.yelp.com/biz/bonanza-restaurant-san-francisco',
 'https://www.yelp.com/biz/the-metro-bar-and-restaurant-san-francisco',
 'https://www.yelp.com/biz/sinbads-pier2-restaurant-san-francisco-2',
 'https://www.yelp.com/biz/the-monte-carlo-san-francisco',
 'https://www.yelp.com/biz/myth-san-francisco',
 'https://www.yelp.com/biz/2223-restaurant-san-francisco',
 'https://www.yelp.com/biz/the-palace-san-francisco',
 'https://www.yelp.com/biz/watergate-san-francisco',
 'https://www.yelp.com/biz/tallula-san-francisco',
 'https://www.yelp.com/b

In [148]:
def cleanText(dirtyText):
    """Tidy a raw text payload and return it without its first character.

    Steps, in order:
      1. every occurrence of the substring ``null`` becomes ``""``
         (this is a plain substring replace, so it also hits ``null`` inside
         longer words or quoted text);
      2. all tab characters are removed;
      3. each run of one or more spaces is collapsed to a single space;
      4. the first character of the result is dropped
         (presumably a leading newline in the scraped payload -- confirm).

    Args:
        dirtyText: The string to clean.

    Returns:
        The cleaned string.
    """
    without_nulls = dirtyText.replace('null', '""')
    without_tabs = without_nulls.replace('\t', '')
    single_spaced = re.sub(' +', ' ', without_tabs)
    return single_spaced[1:]

In [149]:
def analyzeParaSentiment(paragraph, doprint = False):
    """Score a paragraph sentence by sentence and average the results.

    The paragraph is split with ``tokenize.sent_tokenize`` and every sentence
    is scored with the module-level ``analyzer.polarity_scores``.

    Args:
        paragraph: Text to analyse.
        doprint: When True, print each sentence next to its compound score,
            followed by the four paragraph-level averages.

    Returns:
        A tuple ``(compound, positive, neutral, negative)`` holding the mean of
        the per-sentence scores, each rounded to 4 decimal places.  When the
        paragraph yields no sentences the result is ``(0.0, 0.0, 0.0, 0.0)``.
    """
    score_keys = ("compound", "pos", "neu", "neg")
    per_sentence = {key: [] for key in score_keys}

    for sentence in tokenize.sent_tokenize(paragraph):
        vs = analyzer.polarity_scores(sentence)
        for key in score_keys:
            per_sentence[key].append(vs[key])
        if doprint:
            print("{:-<100} {}".format(sentence, str(vs["compound"])))

    if per_sentence["compound"]:
        # Compute each mean exactly once; it feeds both the printout and the return.
        averages = tuple(round(np.mean(per_sentence[key]), 4) for key in score_keys)
    else:
        # np.mean([]) yields nan (plus a RuntimeWarning), and a single nan would
        # poison every page- and restaurant-level average built on top of it.
        averages = (0.0, 0.0, 0.0, 0.0)

    if doprint:
        avg_cmpd, avg_pos, avg_neu, avg_neg = averages
        print("AVERAGE COMPOUND SENTIMENT FOR PARAGRAPH: \t" + str(avg_cmpd))
        print("AVERAGE POSITIVE SENTIMENT FOR PARAGRAPH: \t" + str(avg_pos))
        print("AVERAGE NEUTRAL SENTIMENT FOR PARAGRAPH: \t" + str(avg_neu))
        print("AVERAGE NEGATIVE SENTIMENT FOR PARAGRAPH: \t" + str(avg_neg))

    return averages

In [150]:
def analyzeYelpPage(yelp_dict,doprint = False):
    """Run paragraph sentiment analysis over every review of one parsed page.

    Args:
        yelp_dict: Mapping parsed from a Yelp page.  Its ``"review"`` entry is
            iterated and each item's ``"description"`` is passed to
            ``analyzeParaSentiment``.
        doprint: When True, print each review's text and the page-level
            average compound score; also forwarded to ``analyzeParaSentiment``.

    Returns:
        A tuple of four lists ``(compound, positive, neutral, negative)`` with
        one entry per review that was processed.  Processing is best-effort:
        if an error occurs, a message is printed and the scores gathered up to
        that point are returned.
    """
    page_cmpd = []
    page_pos = []
    page_neu = []
    page_neg = []

    try:
        for idx, review in enumerate(yelp_dict["review"]):
            if doprint:
                print("\nReview #%s: %s\n" % (idx+1,review["description"]))
            para_cmpd, para_pos, para_neu, para_neg = analyzeParaSentiment(review["description"], doprint)

            page_cmpd.append(para_cmpd)
            page_pos.append(para_pos)
            page_neu.append(para_neu)
            page_neg.append(para_neg)

        if doprint:
            print("\n\nAVERAGE YELP PAGE REVIEW SENTIMENT: \t" + str(round(np.mean(page_cmpd),4)))
            print ("************************************************************\n\n")

    except Exception as err:
        # Catch Exception rather than using a bare except, so KeyboardInterrupt
        # and SystemExit still propagate; report the cause instead of hiding it.
        print("Error parsing review. (%r)" % (err,))

    return (page_cmpd, page_pos, page_neu, page_neg)

In [151]:
def _parseYelpLdJson(soup):
    """Return the dict parsed from the page's first ld+json script tag, or None if there is none."""
    stuff = soup.find("script", type="application/ld+json")
    if not stuff:
        return None
    # The payload is cleaned and then evaluated as a Python literal (not parsed as JSON).
    return ast.literal_eval(cleanText(stuff.text))


def _collectPageScores(yelp_dict, totals, doprint):
    """Analyse one parsed page and append its per-review scores to the running totals."""
    for total, page_scores in zip(totals, analyzeYelpPage(yelp_dict, doprint)):
        total.extend(page_scores)


def analyzeYelp(url, doprint=False, max_pages=4):
    """Download a Yelp business page and summarise the sentiment of its reviews.

    The first page is always fetched.  If the page's pagination element
    reports more than one page, follow-up pages are requested with
    ``?start=<20 * n>`` until either the reported last page or ``max_pages``
    is reached.

    Args:
        url: Address of the business page.
        doprint: When True, print the restaurant header, per-review details
            and the overall average compound score.
        max_pages: Upper bound on the number of pages fetched per business
            (the first page included).

    Returns:
        A dict with the keys ``"name"``, ``"rating"``, ``"compound"``,
        ``"positive"``, ``"neutral"``, ``"negative"`` and ``"num_reviews"``.
        Name, rating and review count come from the page's ld+json data; the
        four sentiment values are means over all analysed reviews, rounded to
        4 decimals, or 0.0 when no review was analysed.
    """
    print("processing url: %s" % url)
    # NOTE(review): requests.get is called without a timeout, as before.
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'lxml')

    # Default to a single page so max_page is always bound, even when the
    # pagination element is missing or its text does not end in a number.
    max_page = 1
    pagination = soup.find("div", {"class":"page-of-pages arrange_unit arrange_unit--fill"})
    if pagination:
        page_list = pagination.text.strip().split(" ")
        try:
            max_page = int(page_list[-1])
        except ValueError:
            pass  # unparsable page count: treat the listing as single-page

    rest_name = ""
    rest_review = 0.0
    rest_reviewcnt = 0
    rest_cmpd = []
    rest_pos = []
    rest_neu = []
    rest_neg = []
    totals = (rest_cmpd, rest_pos, rest_neu, rest_neg)

    yelp_dict = _parseYelpLdJson(soup)
    if yelp_dict is not None:
        rest_name = yelp_dict["name"]

        if ("aggregateRating" in yelp_dict):
            try:
                rest_review = yelp_dict["aggregateRating"]["ratingValue"]
                rest_reviewcnt = yelp_dict["aggregateRating"]["reviewCount"]

                if doprint:
                    print ("\n\n************************************************************")
                    print("Restaurant: %s" % rest_name)
                    print("Yelp Rating: %s" % rest_review)
                _collectPageScores(yelp_dict, totals, doprint)

                reviews_per_page = 20
                # Never request pages beyond the last one the site reports.
                last_page = min(max_page, max_pages)
                for page_cnt in range(1, last_page):
                    page_url = url + "?start=" + str(reviews_per_page * page_cnt)
                    page = requests.get(page_url)
                    soup = BeautifulSoup(page.text, 'lxml')
                    yelp_dict = _parseYelpLdJson(soup)
                    if yelp_dict is not None:
                        _collectPageScores(yelp_dict, totals, doprint)

                if doprint:
                    print("\n\nAVERAGE YELP REVIEW SENTIMENT: \t" + str(round(np.mean(rest_cmpd),4)))
                    print ("************************************************************\n\n")

            except Exception as err:
                # Best-effort: keep whatever was collected so far, but let
                # KeyboardInterrupt/SystemExit through and report the cause.
                print("Error parsing review for %s (%r)" % (rest_name, err))

    if rest_cmpd:
        avg_cmpd = round(np.mean(rest_cmpd),4)
        avg_pos = round(np.mean(rest_pos),4)
        avg_neu = round(np.mean(rest_neu),4)
        avg_neg = round(np.mean(rest_neg),4)
    else:
        avg_cmpd = 0.0
        avg_pos = 0.0
        avg_neu = 0.0
        avg_neg = 0.0

    return {
        "name": rest_name,
        "rating": rest_review,
        "compound": avg_cmpd,
        "positive": avg_pos,
        "neutral": avg_neu,
        "negative": avg_neg,
        "num_reviews": rest_reviewcnt
    }

In [152]:
# Analyse every URL in turn; each call yields one summary dict per business.
# (Slice the list, e.g. websites[0:1], for a quick trial run.)
reviews = [analyzeYelp(url, False) for url in websites]

#reviews

processing url: https://www.yelp.com/biz/maxs-restaurants-san-francisco-3
processing url: https://www.yelp.com/biz/franceschis-san-francisco
processing url: https://www.yelp.com/biz/butterfly-san-francisco
processing url: https://www.yelp.com/biz/la-terrasse-san-francisco
processing url: https://www.yelp.com/biz/titas-restaurant-san-francisco
processing url: https://www.yelp.com/biz/the-public-san-francisco
processing url: https://www.yelp.com/biz/tin-pan-san-francisco
processing url: https://www.yelp.com/biz/eat-restaurant-san-francisco
processing url: https://www.yelp.com/biz/bonanza-restaurant-san-francisco
processing url: https://www.yelp.com/biz/the-metro-bar-and-restaurant-san-francisco
processing url: https://www.yelp.com/biz/sinbads-pier2-restaurant-san-francisco-2
processing url: https://www.yelp.com/biz/the-monte-carlo-san-francisco
processing url: https://www.yelp.com/biz/myth-san-francisco
processing url: https://www.yelp.com/biz/2223-restaurant-san-francisco
processing url

In [153]:
# Build a table with one row per entry of `reviews` (the summary dicts
# returned by analyzeYelp); the dict keys become the columns.
reviews_df = pd.DataFrame(reviews)
# Bare last expression so the notebook renders the frame.
reviews_df

Unnamed: 0,compound,name,negative,neutral,num_reviews,positive,rating
0,0.1346,Max’s Restaurants,0.0876,0.8159,6,0.0965,3.0
1,0.0624,Franceschi’s,0.0619,0.8240,1,0.1142,2.0
2,0.2836,Butterfly Restaurant,0.0483,0.7384,1445,0.2064,3.5
3,0.1785,La Terrasse,0.0676,0.7766,265,0.1515,2.5
4,0.2955,Tita’s Restaurant,0.0560,0.7835,6,0.1604,3.0
5,0.2926,The Public,0.0410,0.7519,195,0.1982,4.0
6,0.2505,Tin Pan,0.0839,0.7681,3,0.1479,2.5
7,0.3689,EAT Restaurant,0.0286,0.7373,25,0.2340,4.0
8,0.3146,Bonanza Restaurant,0.0433,0.7200,24,0.2307,4.0
9,0.3310,The Metro Bar and Restaurant,0.0332,0.7533,56,0.2106,4.0


In [154]:
# Persist the results; index=False keeps the row index out of the CSV file.
reviews_df.to_csv('yelp_review_analysis.csv', index=False)