In [1]:
#!pip install nltk

In [2]:
import ast
import json
import os
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import nltk
from nltk import tokenize
# Fetch NLTK's 'punkt' package. Presumably this is the model that
# tokenize.sent_tokenize relies on -- TODO confirm against the installed NLTK version.
nltk.download('punkt')
# Single shared sentiment analyzer; analyzeYelp calls its polarity_scores() per sentence.
analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\astanchi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# Load the Yelp business URLs to scrape: one URL per line in
# 'yelp_closed_biz.txt', located in the current working directory.
websites = []  # reset first so a failed read never leaves a stale list from an earlier run
root_path = os.getcwd()
file_input = os.path.join(root_path, 'yelp_closed_biz.txt')
with open(file_input, 'r') as fh:
    # Strip surrounding whitespace and drop blank lines: an empty or padded
    # entry is not a valid URL and would make requests.get() raise later on.
    websites = [line.strip() for line in fh if line.strip()]
websites

['https://www.yelp.com/biz/maxs-restaurants-san-francisco-3',
 'https://www.yelp.com/biz/franceschis-san-francisco',
 'https://www.yelp.com/biz/butterfly-san-francisco',
 'https://www.yelp.com/biz/la-terrasse-san-francisco',
 'https://www.yelp.com/biz/titas-restaurant-san-francisco',
 'https://www.yelp.com/biz/the-public-san-francisco',
 'https://www.yelp.com/biz/tin-pan-san-francisco',
 'https://www.yelp.com/biz/eat-restaurant-san-francisco',
 'https://www.yelp.com/biz/bonanza-restaurant-san-francisco',
 'https://www.yelp.com/biz/the-metro-bar-and-restaurant-san-francisco',
 'https://www.yelp.com/biz/sinbads-pier2-restaurant-san-francisco-2',
 'https://www.yelp.com/biz/the-monte-carlo-san-francisco',
 'https://www.yelp.com/biz/myth-san-francisco',
 'https://www.yelp.com/biz/2223-restaurant-san-francisco',
 'https://www.yelp.com/biz/the-palace-san-francisco',
 'https://www.yelp.com/biz/watergate-san-francisco',
 'https://www.yelp.com/biz/tallula-san-francisco',
 'https://www.yelp.com/b

In [5]:
def cleanText(dirtyText):
    """Normalise raw text so it can be evaluated as a Python literal.

    Every occurrence of ``null`` becomes an empty quoted string, tab
    characters are removed, runs of spaces collapse to a single space, and
    the first character of the result is dropped.
    """
    # Substitutions are applied in this order: null -> "", then tabs removed.
    substituted = dirtyText.replace('null', '""').replace('\t', '')
    # Squeeze consecutive spaces into one.
    squeezed = re.sub(' +', ' ', substituted)
    # The leading character is always discarded, whatever it is.
    return squeezed[1:]

In [None]:
def analyzeYelp(yelp_dict, doprint=False):
    """Summarise the sentiment of the reviews embedded in one Yelp listing.

    Each review's ``"description"`` is split into sentences with
    ``tokenize.sent_tokenize``; every sentence is scored with the module-level
    ``analyzer``; sentence scores are averaged per review, and the per-review
    averages are averaged again for the restaurant.

    Parameters
    ----------
    yelp_dict : dict
        Parsed listing data. Keys read here: ``"name"`` (required), optional
        ``"aggregateRating"`` (with ``"ratingValue"`` and ``"reviewCount"``)
        and optional ``"review"`` (an iterable of dicts that each carry a
        ``"description"``).
    doprint : bool, default False
        When true, print each review, the compound score of every sentence
        and the per-review / per-restaurant averages.

    Returns
    -------
    dict
        Keys ``"name"``, ``"rating"``, ``"compound"``, ``"positive"``,
        ``"neutral"``, ``"negative"`` and ``"num_reviews"``. The four
        sentiment averages are rounded to 4 decimals and are ``0.0`` when no
        review could be scored.

    Notes
    -----
    Parsing is best-effort: an exception raised while reading the listing is
    reported on stdout (with its type and message) and whatever was collected
    up to that point is still returned. A listing without reviews is not an
    error. Reviews that contain no sentences are skipped so that an empty
    mean (NaN) cannot contaminate the restaurant averages.
    """
    rest_name = ""
    rest_review = 0.0
    rest_reviewcnt = 0
    rest_cmpd = []
    rest_pos = []
    rest_neu = []
    rest_neg = []

    def _avg(values):
        # Rounded mean, or 0.0 for an empty list (np.mean([]) would be NaN).
        return round(np.mean(values), 4) if values else 0.0

    try:
        rest_name = yelp_dict["name"]
        aggregate = yelp_dict.get("aggregateRating")
        if aggregate:
            rest_review = aggregate["ratingValue"]
            rest_reviewcnt = aggregate["reviewCount"]

        if doprint:
            print ("\n\n************************************************************")
            print("Restaurant: %s" % rest_name)
            print("Yelp Rating: %s" % rest_review)

        # `or []` also covers a "review" key that is present but empty/None.
        for idx, review in enumerate(yelp_dict.get("review") or []):
            description = review["description"]
            if doprint:
                print("\nReview #%s: %s\n" % (idx+1, description))

            sentence_list = tokenize.sent_tokenize(description)
            if not sentence_list:
                # Nothing to score; averaging an empty list would yield NaN.
                continue

            scores = [analyzer.polarity_scores(sentence) for sentence in sentence_list]
            para_cmpd = [vs["compound"] for vs in scores]

            if doprint:
                for sentence, vs in zip(sentence_list, scores):
                    print("{:-<100} {}".format(sentence, str(vs["compound"])))
                print("AVERAGE SENTIMENT FOR PARAGRAPH: \t" + str(round(np.mean(para_cmpd),4)))

            rest_cmpd.append(np.mean(para_cmpd))
            rest_pos.append(np.mean([vs["pos"] for vs in scores]))
            rest_neu.append(np.mean([vs["neu"] for vs in scores]))
            rest_neg.append(np.mean([vs["neg"] for vs in scores]))

        if doprint:
            print("\n\nAVERAGE YELP REVIEW SENTIMENT: \t" + str(_avg(rest_cmpd)))
            print ("************************************************************\n\n")

    except Exception as exc:
        # Deliberately best-effort, but never hide the cause and never trap
        # KeyboardInterrupt/SystemExit the way a bare `except:` does.
        print("Error parsing %s: %s: %s" % (rest_name, type(exc).__name__, exc))

    return {
        "name": rest_name,
        "rating": rest_review,
        "compound": _avg(rest_cmpd),
        "positive": _avg(rest_pos),
        "neutral": _avg(rest_neu),
        "negative": _avg(rest_neg),
        "num_reviews": rest_reviewcnt
    }

In [None]:
# Scrape every listing page, pull out its JSON-LD block and score its reviews.
REQUEST_TIMEOUT_S = 30  # seconds; without a timeout a stalled connection can hang the run

reviews = []
for url in websites: # use websites[0:1] for a quick single-page trial
    print("processing url: %s" % url)

    try:
        page = requests.get(url, timeout=REQUEST_TIMEOUT_S)
        page.raise_for_status()  # do not try to parse error pages
    except requests.RequestException as exc:
        # One unreachable page must not abort the whole batch.
        print("  skipped, request failed: %s" % exc)
        continue

    soup = BeautifulSoup(page.text, 'lxml')
    stuff = soup.find("script", type="application/ld+json")
    if not stuff:
        print("  skipped, no application/ld+json block found")
        continue

    try:
        # The block is JSON, so parse it as JSON: null/true/false are handled
        # natively and review text is left untouched. strict=False tolerates
        # raw control characters (e.g. tabs) inside string values.
        stuff1_dict = json.loads(stuff.text, strict=False)
    except json.JSONDecodeError:
        try:
            # Legacy fallback for pages that only parsed via the old path.
            good_stuff = cleanText(stuff.text)
            stuff1_dict = ast.literal_eval(good_stuff)
        except (ValueError, TypeError, SyntaxError) as exc:
            print("  skipped, could not parse listing data: %s" % exc)
            continue

    # Pass True as the second argument for a per-sentence trace.
    reviews.append(analyzeYelp(stuff1_dict))

processing url: https://www.yelp.com/biz/maxs-restaurants-san-francisco-3
processing url: https://www.yelp.com/biz/franceschis-san-francisco
processing url: https://www.yelp.com/biz/butterfly-san-francisco
processing url: https://www.yelp.com/biz/la-terrasse-san-francisco
processing url: https://www.yelp.com/biz/titas-restaurant-san-francisco
processing url: https://www.yelp.com/biz/the-public-san-francisco
processing url: https://www.yelp.com/biz/tin-pan-san-francisco
processing url: https://www.yelp.com/biz/eat-restaurant-san-francisco
processing url: https://www.yelp.com/biz/bonanza-restaurant-san-francisco
processing url: https://www.yelp.com/biz/the-metro-bar-and-restaurant-san-francisco
processing url: https://www.yelp.com/biz/sinbads-pier2-restaurant-san-francisco-2
processing url: https://www.yelp.com/biz/the-monte-carlo-san-francisco
processing url: https://www.yelp.com/biz/myth-san-francisco
processing url: https://www.yelp.com/biz/2223-restaurant-san-francisco
processing url

In [None]:
# Turn the list of per-restaurant summary dicts in `reviews` into a table
# (one row per dict, one column per dict key).
reviews_df = pd.DataFrame(reviews)
# Bare last expression so the notebook displays the frame.
reviews_df

In [None]:
# Write the summary table to a CSV file; the path is relative to the current
# working directory and index=False leaves the row index out of the file.
reviews_df.to_csv('yelp_review_analysis.csv', index=False)