In [1]:
# setup library imports
import io, time, json, re
import requests, datetime
from bs4 import BeautifulSoup

## Basic HTTP Requests



In [2]:
def retrieve_html(url):
    """
    Fetch a page with an HTTP GET and hand back its status and body.

    Args:
        url (string): address of the page to request.

    Returns:
        tuple: ``(status_code, raw_html)``
            status_code (integer): HTTP status code of the response.
            raw_html: the response body exactly as exposed by
                ``response.content`` (in ``requests`` this is the raw,
                undecoded byte string, not ``response.text``).
    """
    page = requests.get(url)
    return page.status_code, page.content



## Authenticated HTTP Request with the Yelp API


In [4]:
def read_api_key(filepath):
    """
    Read the Yelp API Key from file.

    Args:
        filepath (string): path of the file containing the API Key
    Returns:
        api_key (string): the file's contents with leading and trailing
            whitespace (including the final newline) removed
    """
    # Open the path supplied by the caller; the key file is not assumed to
    # be named 'api_key.txt' or to live in the current working directory.
    with open(filepath, 'r') as f:
        return f.read().strip()
    


In [5]:
def yelp_search(api_key, query):
    """
    Make an authenticated request to the Yelp API.

    Args:
        api_key (string): Yelp API key, sent as a Bearer token in the
            Authorization header
        query (string): Search term; it is passed to the API as the
            'location' parameter

    Returns:
        total (integer): total number of businesses on Yelp corresponding to the query
        businesses (list): list of dicts representing each business

        If the API answers with any status other than 200, (0, []) is returned.
    """
    END_POINT = "https://api.yelp.com/v3/businesses/search"
    head = {'Authorization': 'Bearer %s' % api_key}
    parameters = {'location': query}

    response = requests.get(END_POINT, headers=head, params=parameters)

    # Defaults for a non-200 answer. Both names must be bound before the
    # return statement, otherwise a failed request raises UnboundLocalError.
    total = 0
    businesses = []
    if response.status_code == 200:
        resp_json_obj = json.loads(response.content.decode("utf-8"))
        total = resp_json_obj['total']
        businesses = resp_json_obj['businesses']
    return total, businesses


## Parameterization and Pagination


## Acquire all of the restaurants in Pittsburgh (on Yelp)


In [7]:
def all_restaurants(api_key, query):
    """
    Retrieve ALL the restaurants on Yelp for a given query.

    The search endpoint is paged: the first request reports the total number
    of matches and already carries the first page of businesses; the
    remaining pages are fetched by increasing the 'offset' parameter in steps
    of the page size, pausing 0.3 seconds before each follow-up request.

    Args:
        api_key (string): Yelp API key, sent as a Bearer token in the
            Authorization header
        query (string): Search term; it is passed to the API as the
            'location' parameter

    Returns:
        results (list): list of dicts representing each business. Empty if
            the first request does not return status 200; a later page that
            does not return status 200 is skipped.
    """
    END_POINT = "https://api.yelp.com/v3/businesses/search"
    PAGE_SIZE = 20
    head = {'Authorization': 'Bearer %s' % api_key}
    url_params = {'location': query, 'limit': PAGE_SIZE, 'categories': 'restaurants'}

    response = requests.get(END_POINT, headers=head, params=url_params)
    if response.status_code != 200:
        # Without a first answer the total is unknown, so there is nothing to page through.
        return []

    first_page = json.loads(response.content.decode("utf-8"))
    total = first_page['total']

    # The first answer already holds the businesses for offset 0; keep them
    # instead of spending a second request on the same page.
    results = list(first_page['businesses'])

    for offset in range(PAGE_SIZE, total, PAGE_SIZE):
        # Pause between consecutive requests.
        time.sleep(.300)
        page_params = dict(url_params, offset=offset)
        response = requests.get(END_POINT, headers=head, params=page_params)
        if response.status_code == 200:
            page = json.loads(response.content.decode("utf-8"))
            results.extend(page['businesses'])
    return results


## Parse the API Responses and Extract the URLs


In [9]:
def parse_api_response(data):
    """
    Pull the restaurant URLs out of a Yelp API search response.

    Args:
        data (string): JSON text of a search response. The decoded object
            is read through its 'businesses' list, and each entry through
            its 'url' key.

    Returns:
        (list): the 'url' value of every business, in the order they appear
            in the input. An empty list when `data` is empty (falsy).
    """
    # Nothing to decode for an empty payload.
    if not data:
        return []

    decoded = json.loads(data)
    return [business['url'] for business in decoded['businesses']]


## Parse a Yelp restaurant Page

Using `BeautifulSoup`, parse the HTML of a single Yelp restaurant page to extract the reviews in a structured form as well as the URL to the next page of reviews (or `None` if it is the last page). The function returns two things:
* the reviews, as a list of structured Python dictionaries (one per review, in the format shown below)
* the URL of the next page of reviews, taken from the page's "next" pagination link (or `None` if there is no such link).

```python
{
    'review_id': str,
    'user_id': str,
    'rating': float,
    'date': str ('yyyy-mm-dd'),
    'text': str
}

# Example
{
    'review_id': '12345',
    'user_id': '6789',
    'rating': 4.7,
    'date': '2016-01-23',
    'text': "Wonderful!"
}
```

In [15]:
def parse_page(html):
    """
    Parse the reviews on a single page of a restaurant.

    Args:
        html (string): String of HTML corresponding to a Yelp restaurant

    Returns:
        tuple(list, string): a tuple of two elements
            first element: list of dictionaries corresponding to the extracted
                review information, each with the keys 'review_id', 'user_id',
                'rating', 'date' and 'text'
            second element: URL (the 'href' of the "next" pagination link) for
                the next page of reviews (or None if it is the last page)
    """
    soup = BeautifulSoup(html, "html.parser")
    review_list = []
    # Each review lives in a <div> whose class attribute is exactly
    # "review review--with-sidebar".
    for review_tag in soup.find_all("div", "review review--with-sidebar"):
        review_info = {}

        # review id
        review_info["review_id"] = str(review_tag.get("data-review-id"))

        # user id: the attribute value carries a "user_id:" prefix that is removed
        review_info["user_id"] = str(review_tag.get("data-signup-object").replace("user_id:", ""))

        # rating: read from the title of the stars element ("<number> star rating")
        stars_tag = review_tag.find("div", {"class": (lambda x: x and x.startswith("i-stars i-stars--"))})
        review_info["rating"] = round(float(stars_tag.get("title").replace(" star rating", "")), 1)

        # date: text of the first <span> inside the rating block, looked up once.
        # Surrounding whitespace and newlines are trimmed; an empty span yields ''.
        # NOTE(review): the text is kept as shown on the page and is not
        # converted to another date format - confirm what callers expect.
        date_text = review_tag.find("div", {"class": "biz-rating biz-rating-large clearfix"}).find("span").text
        review_info["date"] = date_text.strip()

        # text: content of the first <p> inside the review body
        review_info["text"] = review_tag.find("div", {"class": "review-content"}).find("p").text

        review_list.append(review_info)

    # Link to the next page of reviews; absent on the last page.
    next_obj = soup.find("a", {"class": "u-decoration-none next pagination-links_anchor"})
    next_url = next_obj.get("href") if next_obj is not None else None
    return review_list, next_url


## Extract all of the Yelp reviews for a Single Restaurant


In [17]:
def extract_reviews(url):
    """
    Retrieve ALL of the reviews for a single restaurant on Yelp.

    Starting at `url`, each page is downloaded and handed to `parse_page`;
    the "next page" URL it returns is followed until it is None. Pages are
    walked in a loop rather than by recursion, so the number of pages is not
    bounded by Python's recursion limit.

    Parameters:
        url (string): Yelp URL corresponding to the restaurant of interest.

    Returns:
        reviews (list): list of dictionaries containing extracted review
            information, in page order
    """
    final_list = []
    page_url = url
    while True:
        html_response = requests.get(page_url)
        page_reviews, page_url = parse_page(html_response.text)
        final_list.extend(page_reviews)
        if page_url is None:
            # Last page reached.
            break
        # Pause between consecutive page requests.
        time.sleep(.300)
    return final_list
