In [63]:
import pandas as pd
import numpy as np
import json
import requests
import re
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
%matplotlib inline

#Need to use a delay between page scrapes in order to limit getting blocked by Yelp
from time import sleep

In [23]:
#ENTER SEARCH TERMS BELOW:
cuisine_type = ""
location = "Singapore"

#Generate URL based on search terms
base_url = "https://www.yelp.com"
search_url = f"{base_url}/search?find_desc={cuisine_type}&find_loc={location}"

#Or manually set search_url by copying directly from Yelp Page if desired
#search_url = "https://www.yelp.com/search?find_desc=burger&find_loc

In [69]:
star_container_class = "lemon--div__373c0__1mboc attribute__373c0__1hPI_ display--inline-block__373c0__2de_K u-space-r1 border-color--default__373c0__2oFDT"
price_range_class = "lemon--span__373c0__3997G text__373c0__2pB8f priceRange__373c0__2DY87 text-color--normal__373c0__K_MKN text-align--left__373c0__2pnx_ text-bullet--after__373c0__1ZHaA"
review_count_class = "lemon--span__373c0__3997G text__373c0__2pB8f reviewCount__373c0__2r4xT text-color--mid__373c0__3G312 text-align--left__373c0__2pnx_"
next_page_class = "lemon--a__373c0__IEZFH link__373c0__29943 next-link navigation-button__373c0__1D3Ug link-color--blue-dark__373c0__1mhJo link-size--default__373c0__1skgq"
search_result_class = "container__09f24__mpR8_ hoverable__09f24__wQ_on margin-t3__09f24__riq4X margin-b3__09f24__l9v5d padding-t3__09f24__TMrIW padding-r3__09f24__eaF7p padding-b3__09f24__S8R2d padding-l3__09f24__IOjKY border--top__09f24__exYYb border--right__09f24__X7Tln border--bottom__09f24___mg5X border--left__09f24__DMOkM border-color--default__09f24__NPAKY"


In [71]:
next_page_url = search_url
page_counter = 1
business_list = []

#Run continuously until there is no longer a "next page" url found.
while next_page_url:
    #Request HTML page and load into Beautiful Soup object
    request = requests.get(next_page_url)
    soup = BeautifulSoup(request.content,'html.parser')
    
    #Find search results container on page.
    search_results = soup.findAll("div", class_=search_result_class)
    print(f"Page {page_counter}, {len(search_results)-1} results {next_page_url}")
    result_counter = 1

    #Loop through search results and store information for each business
    for search_result in search_results:
        business_info = {}
        try:
            business_name_url = search_result.findAll('a', href=True)[1]
            business_info['url'] = f"https://www.yelp.com{business_name_url['href']}"
            business_info['name'] = business_name_url['name']
            business_info['biz_id'] = business_name_url['href'].split('/biz/')[1].split('?')[0]
        except:
            continue
            
        try:
            business_info['address'] = search_result.find('address').text
        except:
            pass
        try:
            business_info['category'] = [category.text for category in search_result.findAll("a",attrs={"role":"link"})]
        except:
            pass
        try:
            business_info['star_rating'] = float(re.findall(r"[-+]?\d*\.\d+|\d+", 
                                                      search_result.find(
                                                          class_=star_container_class).find('div')['aria-label'] )[0] )
        except:
            pass
        try:
            business_info['price_range'] = search_result.find(class_=price_range_class).text
        except:
            pass
        try:
            business_info['num_reviews'] = int(re.findall(r"[-+]?\d*\.\d+|\d+",
                                                      search_result.find(
                                                          class_=review_count_class).text )[0] )
        except:
            pass
        try:
            business_info['image_shown'] = search_result.find('img')['src']
        except:
            pass
        
        #Append business information for each search result to a list containing all businesses.
        if business_info:
            business_list.append(business_info)
            
        result_counter+=1
    
    #Set url for next page. If not found, break out of loop.
    if soup.find(class_=next_page_class):
        next_page_url = base_url + soup.find(class_=next_page_class)['href']
        page_counter+=1
    else:
        break
    
    #Random delay between 2 and 20 seconds to prevent getting blocked
    sleep(np.random.randint(2,20))

print(len(business_list), "businesses scraped")
business_list

Page 1, 9 results https://www.yelp.com/search?find_desc=&find_loc=Singapore
10 businesses scraped


[{'url': 'https://www.yelp.com/biz/gardens-by-the-bay-singapore-3',
  'name': 'Gardens By The Bay',
  'biz_id': 'gardens-by-the-bay-singapore-3',
  'category': ['Botanical Gardens'],
  'image_shown': 'https://s3-media0.fl.yelpcdn.com/bphoto/qT8KatwbuJJvqrRJP4gmzw/348s.jpg'},
 {'url': 'https://www.yelp.com/biz/singapore-botanic-gardens-singapore-5',
  'name': 'Singapore Botanic Gardens',
  'biz_id': 'singapore-botanic-gardens-singapore-5',
  'category': ['Botanical Gardens'],
  'image_shown': 'https://s3-media0.fl.yelpcdn.com/bphoto/k_j7-pjkx6V3mL2nj3Un0w/348s.jpg'},
 {'url': 'https://www.yelp.com/biz/tian-tian-hainanese-chicken-rice-singapore-7',
  'name': 'Tian Tian Hainanese Chicken Rice',
  'biz_id': 'tian-tian-hainanese-chicken-rice-singapore-7',
  'category': ['Hainan', 'Chicken Shop'],
  'image_shown': 'https://s3-media0.fl.yelpcdn.com/bphoto/p295N0p6K52CzkmBOxx73w/348s.jpg'},
 {'url': 'https://www.yelp.com/biz/singapore-zoo-singapore-2',
  'name': 'Singapore Zoo',
  'biz_id': 's

In [6]:
business_info_df = pd.DataFrame(business_list)
#Drop businesses with no reviews
business_info_df.dropna(subset=['num_reviews'], inplace=True)
#Drop duplicates
business_info_df.drop(business_info_df[business_info_df.biz_id.duplicated(keep='first')].index, inplace=True)
print(len(business_info_df))
business_info_df.tail()

KeyError: ['num_reviews']

In [None]:
#CLEAN UP CATEGORY VALUES - remove parenthesis
business_info_categories = []
for category in business_info_df.category:
    cat_list = []
    for cat in category:
        cat = cat.replace('(',' ')
        cat = cat.replace(')',' ')
        cat = re.sub(' +',' ', cat).strip()
        cat_list.append(cat)
    business_info_categories.append(cat_list)
business_info_df.category = business_info_categories

In [None]:
#CLEAN UP BUSINESS NAMES
business_names = []
for name in business_info_df.name:
    name = name.replace('â\x80\x99',"\'")
    business_names.append(name)
business_info_df.name = business_names

In [None]:
def get_reviews(business_name, business_index, yelp_business_url, verbose=False):
    """
    This function will iterate through all of the review pages for a particular business and
    return a list populated with all reviews found.
    
    INPUTS:
    business_name     = The name of the business. It is contained in the results list records.
    business_index    = The business index (unique identifier). It is contained in the results records.
    yelp_business_url = The URL for the starting page of reviews for the business.
    verbose           = Summary info is always printed, but with verbose validation of each page is printed.
    
    OUTPUT:
    List of reviews. Each review is a dictionary containing desired review information.
    """
    
    #Class names used in Yelp Review pages.
    #There are two flavors of page design that yelp uses
    search_result_class_v1 = "lemon--li__373c0__1r9wz u-space-b3 u-padding-b3 border--bottom__373c0__uPbXS border-color--default__373c0__2oFDT"
    search_result_class_v2 = "review review--with-sidebar"
    
    #Set starting page (first page of reviews)
    next_page_url = yelp_business_url

    reviews_list = []
    page_counter=1

    #Continue to loop through review pages until there is no longer a "next" link at the bottom.
    while next_page_url:
        if verbose:
            #Print the page url being parsed
            print(f"Page {page_counter}, {next_page_url}")

        #Request html for page and load into BeautifulSoup object.
        request = requests.get(next_page_url)
        soup = BeautifulSoup(request.content,'html.parser')
        
        #Check which version of the page is being used. If neither is found, print error message.
        if len(soup.findAll(class_=search_result_class_v1))!=0:
            reviews_list.extend(get_reviews_page_v1(soup,business_name,business_index,verbose))
        elif len(soup.findAll(class_=search_result_class_v2))!=0:
            reviews_list.extend(get_reviews_page_v2(soup,business_name,business_index,verbose))
        else:
            print("Could not parse page: ", next_page_url)
        
        #Check for "next" page link - update next_page_url if found.
        #Break from while loop if there is no next page.
        if soup.find("link", attrs={'rel':'next'}):
            next_page_url = soup.find("link", attrs={'rel':'next'})['href']
            page_counter+=1
        else:
            break
        
        #Random delay between 1 and 4 seconds to prevent getting blocked
        sleep(np.random.randint(1,3))
    
    return reviews_list

In [None]:
def get_reviews_page_v1(soup, business_name, business_index, verbose=False):
    """
    This function will extract reviews information from the BeautifulSoup object representing
    version 1 of a Yelp review page.
    
    INPUTS:
    soup           = BeautifulSoup object to traverse.
    business_name  = The name of the business. It is contained in the results list records.
    business_index = The business index (unique identifier). It is contained in the results records.
    verbose        = If True, print status of review extraction.
    
    OUTPUT:
    List of reviews. Each review is a dictionary containing desired review information.
    """
    search_result_class = "lemon--li__373c0__1r9wz u-space-b3 u-padding-b3 border--bottom__373c0__uPbXS border-color--default__373c0__2oFDT"
    star_container_class = "lemon--div__373c0__1mboc arrange-unit__373c0__1piwO border-color--default__373c0__2oFDT"
    date_class = "lemon--span__373c0__3997G text__373c0__2pB8f text-color--mid__373c0__3G312 text-align--left__373c0__2pnx_"
    pic_class = "lemon--span__373c0__3997G photo-box-grid-item__373c0__2kFqV display--inline__373c0__1DbOG u-space-r2 u-space-b2 border-color--default__373c0__2oFDT"
    pic_url_class = "lemon--img__373c0__3GQUb photo-box-img__373c0__O0tbt"
    
    #Get each review block
    reviews = soup.findAll(class_=search_result_class)
    reviews_list=[]
    skipped_review_counter=0
    #Loop through each review and pull out pertinent information. Put into list of dictionaries.
    for review in reviews:
        try:
            review_info = {}
            review_info["business_name"] = business_name
            review_info["business_index"] = business_index
            review_info["date"] = review.find(class_=date_class).text.strip()
            #review_info["review"] = review.find("span", attrs={"class": "lemon--span__373c0__3997G", "lang": "en"}).text
            review_info["review"] = review.find(attrs={"lang": "en"}).text
            review_info['star_rating'] = float(re.findall(r"[-+]?\d*\.\d+|\d+", 
                           review.find(class_=star_container_class).find('div')['aria-label'] )[0] )
            review_info["pic_count"] = len(review.find_all(class_=pic_class))
            review_info["pic_urls"] = [obj['src'] for obj in review.findAll(class_=pic_url_class)]

            #Sometimes the user id is not being found
            try:
                review_info["user_id"] = review.find('a')['href'].split('userid=')[1]
            except:
                None

            reviews_list.append(review_info)
        except:
            skipped_review_counter+=1
            
    if verbose:
        if skipped_review_counter!=0:
            print(f"Skipped {skipped_review_counter} reviews")

    return(reviews_list)

In [None]:
def get_reviews_page_v2(soup, business_name, business_index, verbose=False):
    """
    This function will extract reviews information from the BeautifulSoup object representing
    version 2 of a Yelp review page.
    
    INPUTS:
    soup           = BeautifulSoup object to traverse.
    business_name  = The name of the business. It is contained in the results list records.
    business_index = The business index (unique identifier). It is contained in the results records.
    verbose        = If True, print status of review extraction.
    
    OUTPUT:
    List of reviews. Each review is a dictionary containing desired review information.
    """
    
    search_result_class = "review review--with-sidebar"
    star_container_class = "biz-rating__stars"
    date_class = "rating-qualifier"
    review_photo_box_class = "photo-box-grid clearfix js-content-expandable lightbox-media-parent"
    
    #Get each review block
    reviews = soup.findAll(class_=search_result_class)
    reviews_list=[]
    skipped_review_counter=0
    #Loop through each review and pull out pertinent information. Put into list of dictionaries.
    for review in reviews:
        try:
            review_info = {}
            review_info["business_name"] = business_name
            review_info["business_index"] = business_index
            review_info["date"] = review.find(class_=date_class).text.strip()
            review_info["review"] = review.find(attrs={"lang": "en"}).text
            review_info['star_rating'] = float(re.findall(r"[-+]?\d*\.\d+|\d+", 
                           review.find(class_=star_container_class).find('div')['title'])[0] )
            try:
                pic_line_items = review.find(class_=review_photo_box_class).findAll('li')
                review_info["pic_count"] = len(pic_line_items)
                review_info["pic_urls"] = [obj.find('img')['src'] for obj in pic_line_items]
            except:
                review_info["pic_count"] = 0
                review_info["pic_urls"] = []

            #Sometimes the user id is not being found
            try:
                review_info["user_id"] = review.find('a')['href'].split('userid=')[1]
            except:
                None

            reviews_list.append(review_info)
        except:
            skipped_review_counter+=1

    if verbose:
        if skipped_review_counter!=0:
            print(f"Skipped {skipped_review_counter} reviews")
            
    return(reviews_list)

In [None]:
#TESTING SCRAPER FOR A SINGLE BUSINESS
index_num = 4
business_url = business_info_df.url[index_num]
business_name = business_info_df.name[index_num]
business_index = business_info_df.biz_id[index_num]

reviews_df = pd.DataFrame(get_reviews(business_name,business_index, business_url,verbose=True))