# Functions for extraction of data using YELP API and web scraping

In [1]:
import io, time, json
import requests
from pathlib import Path
from bs4 import BeautifulSoup
from urllib.parse import quote

In [2]:
def retrieve_html(url):
    """
    Fetch a URL with a GET request and hand back its status and body.

    Args:
        url (string): address of the page to download

    Returns:
        status_code (integer): HTTP status code of the response
        raw_html (string): body of the response as text (`Response.text` from requests)
    """
    response = requests.get(url)

    return response.status_code, response.text

In [3]:
def read_api_key(filepath="api_key.txt"):
    """
    Load the Yelp API Key stored in a text file.

    Args:
        filepath (string): location of the file holding the API Key
    Returns:
        api_key (string): file content with surrounding whitespace removed
    """

    # Adapt this function if the API Key is kept somewhere else
    key_path = Path(filepath)
    raw_key = key_path.read_text()
    return raw_key.strip()

In [4]:
def yelp_search(api_key, query):
    """
    Make an authenticated request to the Yelp API.

    Args:
        api_key (string): Yelp API Key, sent as a Bearer token
        query (string): Search text; it is sent as the `location` parameter

    Returns:
        total (integer): value of the "total" field of the response, i.e. the
            number of businesses Yelp reports for the query

    Raises:
        KeyError: if the response JSON has no "total" field
    """
    host = 'https://api.yelp.com'
    path = '/v3/businesses/search'

    # NOTE(review): requests URL-encodes parameter values on its own, so the
    # '+' inserted here presumably reaches Yelp as a literal plus sign rather
    # than a space. Kept as-is so the request stays unchanged - confirm.
    url_params = {'location': query.replace(' ', '+')}

    url = '{0}{1}'.format(host, quote(path))

    headers = {
        'Authorization': 'Bearer %s' % api_key,
    }

    response = requests.request('GET', url, headers=headers, params=url_params)

    # Decode the body once; only the total is part of this function's result.
    payload = response.json()

    return payload["total"]

In [5]:
def all_restaurants(api_key, query):
    """
    Retrieve ALL the restaurants on Yelp for a given query.

    The search endpoint is read page by page (`limit` results per request),
    pausing between requests.

    Args:
        api_key (string): Yelp API Key, sent as a Bearer token
        query (string): Search text; it is sent as the `location` parameter

    Returns:
        results (list): list of dicts representing each business, or None if
            a follow-up page comes back without a "businesses" field

    Raises:
        KeyError: if the first response has no "total" or "businesses" field
    """
    host = 'https://api.yelp.com'
    path = '/v3/businesses/search'

    limit = 20
    # Offsets are zero-based: starting at 1 would skip the first result.
    offset = 0

    url = '{0}{1}'.format(host, quote(path))

    headers = {
        'Authorization': 'Bearer %s' % api_key,
    }

    # NOTE(review): requests URL-encodes parameter values on its own, so this
    # '+' presumably reaches Yelp as a literal plus sign - confirm.
    location = query.replace(' ', '+')

    def fetch_page(page_offset):
        # Every page uses the same page size so the offsets stay aligned.
        url_params = {'location': location, 'limit': limit, 'offset': page_offset}
        response = requests.request('GET', url, headers=headers, params=url_params)
        return response.json()

    first_page = fetch_page(offset)
    total = first_page["total"]
    businesses = first_page["businesses"]

    while len(businesses) < total:
        time.sleep(.800)
        offset += limit

        page = fetch_page(offset)

        if "businesses" not in page:
            # The payload carries no results (e.g. an error): report progress and give up.
            print(len(businesses), total, query)
            return None

        if not page["businesses"]:
            # An empty page adds nothing; stop instead of requesting forever.
            break

        businesses.extend(page["businesses"])

    return businesses

In [6]:
def parse_api_response(data):
    """
    Pair each restaurant name with its URL.

    Args:
        data (list): list of restaurant dicts, each with "name" and "url" keys.

    Returns:
        (list): one single-entry dict {name: url} per restaurant, in input order.
    """
    return [{entry["name"]: entry["url"]} for entry in data]

In [7]:
def parse_page(html):
    """
    Parse the reviews on a single page of a restaurant.

    Args:
        html (string): String of HTML corresponding to a Yelp restaurant

    Returns:
        Tuple[List[Dict], int]: a tuple of two elements
            first element: list of dictionaries with the keys 'author',
                'rating' (float), 'date' and 'description', one per review
            second element: number of pages total, as an integer.

    Raises:
        IndexError: if the page has no "application/ld+json" script tag
        KeyError: if the embedded JSON lacks one of the expected fields
    """
    soup = BeautifulSoup(html, "html.parser")
    # The review data is read from the last JSON-LD script tag of the page.
    ld_scripts = soup.find_all("script", type="application/ld+json")
    data = json.loads(ld_scripts[-1].string)

    review_list = [
        {
            'author': review['author'],
            'rating': float(review['reviewRating']['ratingValue']),
            'date': review['datePublished'],
            'description': review['description'],
        }
        for review in data['review']
    ]

    review_count = float(data['aggregateRating']['reviewCount'])
    # Pages are counted in steps of 20 reviews, rounding up; the count is
    # returned as an int so it can be used directly as a page counter.
    num_pages = int(review_count // 20)

    if review_count % 20 != 0:
        num_pages += 1

    return review_list, num_pages

In [98]:
def extract_reviews(url):
    """
    Retrieve ALL of the reviews for a single restaurant on Yelp.

    The first page is fetched to learn the page count; the remaining pages
    are requested through the "start" query parameter, pausing between
    requests.

    Parameters:
        url (string): Yelp URL corresponding to the restaurant of interest.

    Returns:
        reviews (list): list of dictionaries containing extracted review
            information, or None if fetching or parsing a follow-up page fails
    """
    limit = 20

    restaurant = requests.get(url)
    review_list, num_pages = parse_page(restaurant.text)

    try:
        # Page 0 is already parsed above; int() also accepts a float page count.
        for page_number in range(1, int(num_pages)):
            time.sleep(.800)
            restaurant = requests.get(url, params={"start": limit * page_number})
            review_list.extend(parse_page(restaurant.text)[0])
    except Exception:
        # Best effort: any failure on a follow-up page discards the result.
        # Unlike a bare except, Ctrl-C and interpreter exit still propagate.
        return None

    return review_list

In [9]:
def city_neighbourhoods(city_url):
    """
    Scrape a Yelp city page to extract the neighbourhoods of the city.

    Args:
        city_url (string): URL of the city.

    Returns:
        (list): list of neighbourhoods as strings. If the page cannot be
            parsed, the URL is printed and `city_url` itself is returned
            instead of a list.
    """
    html = retrieve_html(city_url)[1]
    soup = BeautifulSoup(html, "html.parser")
    try:
        # The data is read from the third-to-last JSON script tag of the page.
        raw_json = soup.find_all("script", type="application/json")[-3].string
        # Drop 4 leading and 3 trailing characters around the JSON text
        # (presumably an HTML comment wrapper, "<!--" and "-->" - confirm).
        neigh_dict = json.loads(raw_json[4:-3])

        return [link['text'] for link in neigh_dict['links']]
    except Exception:
        # Best effort: report the failing URL and return it as a marker.
        # Unlike a bare except, Ctrl-C and interpreter exit still propagate.
        print(city_url)
        return city_url

In [10]:
def restaurant_data(restaurants,neighbourhood = None):
    """
    Reduce Yelp API business records to the fields of interest.

    Args:
        restaurants (list): business dicts as obtained from the yelp api.
        neighbourhood (optional): when not None, stored under the
            "neighbourhood" key of every extracted record.

    Returns:
        (list): one dictionary per restaurant holding name, review count,
            category titles, coordinates, rating, price ('$' when the record
            has no price), city and display address.
    """
    extracted = []

    for shop in restaurants:
        record = {
            "name": shop["name"],
            "review_count": shop["review_count"],
            "categories": [category["title"] for category in shop["categories"]],
            "latitude": shop["coordinates"]["latitude"],
            "longitude": shop["coordinates"]["longitude"],
            "rating": shop["rating"],
            # Records without a price are given the lowest price marker.
            "price": shop["price"] if "price" in shop.keys() else '$',
        }
        if neighbourhood is not None:
            record["neighbourhood"] = neighbourhood
        location = shop["location"]
        record["city"] = location["city"]
        record["address"] = location["display_address"]

        extracted.append(record)

    return extracted

In [11]:
def extract_cities_url(url):
    """
    Collect the city links listed per state on a Yelp page.

    Args:
        url (string): Yelp City url

    Returns:
        (dict): maps each state title to a dict of {city text: city uri}.
    """
    page_html = retrieve_html(url)[1]
    parsed = BeautifulSoup(page_html,"html.parser")
    # The data sits in the last JSON script tag; 4 leading and 3 trailing
    # characters around the JSON text are cut off before decoding.
    embedded = parsed.find_all("script",type = "application/json")[-1].string
    payload = json.loads(embedded[4:-3])

    state_dict = {}
    for group in payload["stateLinkLists"]:
        title = group["title"]
        state_dict[title] = {}
        for link in group["links"]:
            state_dict[title][link["text"]] = link["uri"]

    return state_dict

In [12]:
def city_dict_creation(state_dict):
    """
    Download every city page and collect its counter-widget statistics.

    For each city, the text of the <p> tags whose first CSS class is
    "counter-widget_num" or "counter-widget_text" is gathered in document
    order and read as alternating (number, label) pairs.

    Args:
        state_dict (dict): maps each state to a dict of {city: relative uri},
            the shape produced by extract_cities_url.

    Returns:
        (tuple): two dicts keyed by city name, each value mapping
            label -> integer count plus a "url" entry with the city URL.
            first element: cities whose page yields 8 counter strings
            second element: cities whose page yields 10 counter strings
                (presumably the extra pair is a neighbourhood counter - confirm)
            Cities with any other number of counter strings are left out.
    """
    counter_classes = ("counter-widget_num", "counter-widget_text")
    cities = {}
    cities_with_neighborhood = {}

    for state in state_dict:
        for city, uri in state_dict[state].items():
            city_url = "https://www.yelp.com/" + uri
            html = retrieve_html(city_url)[1]
            soup = BeautifulSoup(html, "html.parser")

            # Read the text straight from the matched tags; re-parsing each
            # tag through a second, parser-less BeautifulSoup is unnecessary.
            city_data = [
                tag.string.replace(",", "")
                for tag in soup.find_all("p")
                if tag.has_attr("class") and tag["class"][0] in counter_classes
            ]

            if len(city_data) == 8:
                target = cities
            elif len(city_data) == 10:
                target = cities_with_neighborhood
            else:
                continue

            # Even positions hold the numbers, the following odd ones their labels.
            stats = {}
            for m in range(0, len(city_data), 2):
                stats[city_data[m + 1]] = int(city_data[m])
            stats["url"] = city_url
            target[city] = stats

    return cities,cities_with_neighborhood

In [13]:
def save_file(f_name,entity):
    """
    Write `entity` to the file `f_name` as JSON, replacing existing content.

    Args:
        f_name (string): path of the file to write
        entity: JSON-serializable object to store
    """
    with open(f_name, mode="w") as sink:
        json.dump(entity, fp=sink)

In [14]:
def open_file(f_name):
    """
    Read the file `f_name` and return its decoded JSON content.

    Args:
        f_name (string): path of the JSON file to read

    Returns:
        the Python object decoded from the file
    """
    with open(f_name, mode="r") as source:
        return json.load(source)