# Business Recommendation By Review
- Scrape and analyze review data from NYC
- Use NLP techniques to identify features common to highly reviewed establishments, then train a model on them to generate recommendations

In [13]:
import asyncio
import requests
import nest_asyncio
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

In [14]:
def async_web_scrape(iterable, individual_scrape_function, *args, max_workers=40):
    """
    Run ``individual_scrape_function`` once per item of ``iterable`` on a thread pool.

    Each call is made as ``individual_scrape_function(item, *args)``. The calls
    are submitted to a ``ThreadPoolExecutor`` through the asyncio event loop, and
    this function blocks until all of them have finished. Return values of the
    scrape function are discarded, so it has to report its results through its
    arguments (for example by extending a list passed in ``args``).

    Parameters
    ----------
    iterable : iterable
        Items to scrape; each one becomes the first positional argument.
    individual_scrape_function : callable
        Blocking function executed in a worker thread for every item.
    *args
        Extra positional arguments forwarded unchanged to every call.
    max_workers : int, optional
        Size of the thread pool (default 40).

    Raises
    ------
    Exception
        An exception raised by a call propagates to the caller through
        ``asyncio.gather``.
    """
    # nest_asyncio allows run_until_complete to be called while an event loop
    # is already running, which is the situation inside a notebook kernel.
    nest_asyncio.apply()

    async def create_scrape_loop():
        """Submit one executor job per item and wait for all of them."""
        running_loop = asyncio.get_running_loop()
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            tasks = [
                running_loop.run_in_executor(
                    executor, individual_scrape_function, item, *args
                )
                for item in iterable
            ]
            await asyncio.gather(*tasks)

    loop = asyncio.get_event_loop()
    loop.run_until_complete(create_scrape_loop())

In [15]:
def filter_biz_url(url):
    """
    Return ``url`` unchanged, or ``None`` if it contains an excluded marker.

    A link is rejected when any of the substrings in ``excluded_markers``
    occurs anywhere in it. Callers can therefore drop unwanted links with an
    ``is not None`` check.

    Parameters
    ----------
    url : str
        The href to test.

    Returns
    -------
    str or None
        ``url`` itself when no marker matches, otherwise ``None``.
    """
    excluded_markers = (
        "?hrid",
        "/adredir?ad_business_id",
        "/search?cflt",
        "yelp.com/city/nyc",
        "databyacxiom",
    )
    if any(marker in url for marker in excluded_markers):
        return None
    return url

In [4]:
def scrape_yelp_search(start_number, business_type, location, all_links):
    """
    Scrape one Yelp search-results page and append its business links to ``all_links``.

    Parameters
    ----------
    start_number : int
        Offset of the first result on the page (the ``start`` query parameter).
    business_type : str
        Value of the ``cflt`` query parameter, e.g. ``'coffee'``.
    location : str
        Value of the ``find_loc`` query parameter. It is inserted into the URL
        verbatim, so it must already be percent-encoded
        (e.g. ``'New%20York%2C%20NY'``).
    all_links : list
        Output list; extended in place with every href that ``filter_biz_url``
        does not reject.
    """
    url = (
        "https://www.yelp.com/search?cflt=" + business_type
        + "&find_loc=" + location
        + "&start=" + str(start_number)
    )
    page_html = requests.get(url).text
    soup = BeautifulSoup(page_html, "html.parser")
    # NOTE(review): these look like generated CSS class names; confirm the
    # selector still matches the live page before relying on the results.
    anchors = soup.find_all("a",{"class":"lemon--a__373c0__IEZFH link__373c0__29943 link-color--blue-dark__373c0__1mhJo link-size--inherit__373c0__2JXk5"})
    # Run the filter once per link, then keep the ones that were not rejected.
    candidates = (filter_biz_url(anchor["href"]) for anchor in anchors)
    links = [link for link in candidates if link is not None]
    all_links.extend(links)

In [16]:
def scrape_yelp_search(start_number, all_links, business_type="coffee", location="New%20York%2C%20NY"):
    """
    Scrape one Yelp search-results page and append its business links to ``all_links``.

    With the default ``business_type`` and ``location`` the request URL is
    the coffee / New York, NY search.

    Parameters
    ----------
    start_number : int
        Offset of the first result on the page (the ``start`` query parameter).
    all_links : list
        Output list; extended in place with every href that ``filter_biz_url``
        does not reject.
    business_type : str, optional
        Value of the ``cflt`` query parameter (default ``'coffee'``).
    location : str, optional
        Value of the ``find_loc`` query parameter. It is inserted into the URL
        verbatim, so it must already be percent-encoded
        (default ``'New%20York%2C%20NY'``).
    """
    url = (
        "https://www.yelp.com/search?cflt=" + business_type
        + "&find_loc=" + location
        + "&start=" + str(start_number)
    )
    page_html = requests.get(url).text
    soup = BeautifulSoup(page_html, "html.parser")
    # NOTE(review): these look like generated CSS class names; confirm the
    # selector still matches the live page before relying on the results.
    anchors = soup.find_all("a",{"class":"lemon--a__373c0__IEZFH link__373c0__29943 link-color--blue-dark__373c0__1mhJo link-size--inherit__373c0__2JXk5"})
    # Run the filter once per link, then keep the ones that were not rejected.
    candidates = (filter_biz_url(anchor["href"]) for anchor in anchors)
    links = [link for link in candidates if link is not None]
    all_links.extend(links)

In [17]:
# Search settings.
# NOTE(review): business_type and location are not passed to the
# async_web_scrape call below, so they do not influence which search is scraped.
business_type = 'coffee' # business type, e.g. painter, autorepair, burger
location = "New York, NY" # "city, state"
# Percent-encode the spaces and the comma by hand for use in a query string.
location = location.replace(" ", "%20").replace(",", "%2C")
# Yelp shows 30 results per page, so page offsets step by 30:
# 0, 30, ..., 1170 (40 pages).
# all_links is filled in place by scrape_yelp_search, one call per offset.
all_links = []
async_web_scrape(list(range(0,1200,30)), scrape_yelp_search, all_links)

In [18]:
# Preview the first ten scraped links.
all_links[:10]

['/biz/kaffe-1668-south-new-york',
 '/biz/harbs-new-york-13',
 '/biz/outpost-caf%C3%A9-brooklyn-3',
 '/biz/la-colombe-coffee-new-york-5',
 '/biz/marcy-and-myrtle-cafe-bedford-stuyvesant',
 '/biz/swallow-cafe-cobble-hill',
 '/biz/cranberrys-brooklyn',
 '/biz/yanni-s-coffee-new-york',
 '/biz/mud-new-york-3',
 '/biz/joe-coffee-company-brooklyn-2']

In [19]:
# Give the scraped links a search-specific name. This binds a second name to
# the same list object (no copy is made). Then show how many links there are.
coffee_ny_ny_list = all_links
len(coffee_ny_ny_list)

990

In [20]:
import pandas as pd
import requests
from lxml import html

def scraper(biz_url, review_index=20):
    """
    Fetch one page of a business's Yelp review feed as a DataFrame.

    Parameters
    ----------
    biz_url : str
        Business path appended to ``https://www.yelp.com``,
        e.g. ``'/biz/harbs-new-york-13'``.
    review_index : int, optional
        Value of the feed's ``start`` query parameter (default 20).

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``review`` and ``star_rating``. ``date`` is
        whitespace-stripped; ``star_rating`` holds the raw ``title`` attribute
        text. The frame is empty when the response status is not 200.
    """
    colnames = ['date', 'review', 'star_rating']
    full_url = (
        "https://www.yelp.com" + biz_url
        + "/review_feed?sort_by=date_desc&start=" + str(review_index)
    )

    with requests.get(full_url, timeout=20) as response:
        if response.status_code != 200:
            return pd.DataFrame(columns=colnames)
        payload = response.json()

    # The JSON payload carries the rendered review markup under 'review_list'.
    div = html.fromstring(payload['review_list'])
    content = "//div[@class='review-content']"
    # NOTE(review): e.text is only the text before an element's first child
    # (e.g. a <br>); confirm multi-paragraph reviews are not being truncated.
    reviews = [e.text for e in div.xpath(content + "/p")]
    dates = div.xpath(content + "/descendant::span[@class='rating-qualifier']/text()")
    star_ratings = div.xpath(content + "/descendant::div[@class='biz-rating__stars']/div/@title")

    # Build row-wise and transpose (rather than from a dict of columns) so that
    # shorter lists are padded with missing values when the lengths differ.
    df = pd.DataFrame([dates, reviews, star_ratings]).T
    df.columns = colnames
    df['date'] = df['date'].apply(lambda x: x.strip())
    return df

In [None]:
# Smoke test: fetch the review page at start offset 20 for the second
# business in the list and look at the first rows.
df = scraper(coffee_ny_ny_list[1], 20)
df.head()

In [None]:
# Collect reviews for every business: 50 scraper() calls per business, at
# start offsets 0, 20, ..., 980.
df_list = []
for biz_url in coffee_ny_ny_list:
    for x in range(0, 1000, 20):
        df = scraper(biz_url, x)
        df_list.append(df)
coffee_ny_ny = pd.concat(df_list, join='outer', axis=0, ignore_index=True)
# Drop incomplete rows *before* parsing the rating: a missing star_rating is
# not a string, so slicing it with x[:3] would raise TypeError.
coffee_ny_ny = coffee_ny_ny.dropna()
coffee_ny_ny = coffee_ny_ny.assign(
    # The first three characters of the rating text are parsed as the number.
    star_rating=coffee_ny_ny['star_rating'].apply(lambda x: float(x[:3])),
    # Tag every row with the search keyword it came from.
    keyword='coffee',
)
print(coffee_ny_ny.shape)
coffee_ny_ny.head()

In [22]:
# Number of per-page DataFrames currently held in df_list.
len(df_list)

2042

In [None]:
# Write the review table to a CSV file (relative path).
coffee_ny_ny.to_csv('coffee_ny_ny_yelp_reviews.csv')

In [87]:
import pandas as pd
import requests
from lxml import html

def yelp_review_scraper(biz_id, starting_review_index=0):
    """
    Fetch one page of a business's Yelp review feed as a DataFrame.

    Parameters
    ----------
    biz_id : str
        Business path appended to ``https://www.yelp.com``,
        e.g. ``'/biz/harbs-new-york-13'``.
    starting_review_index : int, optional
        Value of the feed's ``start`` query parameter (default 0).

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``review`` and ``star_rating``. ``date`` is
        whitespace-stripped and ``star_rating`` is a float. The frame is empty
        when the response status is not 200.
    """
    cols = ['date', 'review', 'star_rating']
    full_url = (
        "https://www.yelp.com" + biz_id
        + "/review_feed?sort_by=date_desc&start=" + str(starting_review_index)
    )

    with requests.get(full_url, timeout=20) as response:
        if response.status_code != 200:
            return pd.DataFrame(columns=cols)
        json_text = response.json()

    # The JSON payload carries the rendered review markup under 'review_list'.
    div = html.fromstring(json_text['review_list'])
    content = "//div[@class='review-content']"
    # NOTE(review): e.text is only the text before an element's first child
    # (e.g. a <br>); confirm multi-paragraph reviews are not being truncated.
    reviews = [e.text for e in div.xpath(content + "/p")]
    dates = div.xpath(content + "/descendant::span[@class='rating-qualifier']/text()")
    star_titles = div.xpath(content + "/descendant::div[@class='biz-rating__stars']/div/@title")
    # The xpath returns a list of title strings, so the rating has to be parsed
    # per element; calling float() on a slice of the list raises TypeError.
    # Presumably each title looks like "5.0 star rating" — TODO confirm.
    star_ratings = [float(title[:3]) for title in star_titles]

    # Build row-wise and transpose (rather than from a dict of columns) so that
    # shorter lists are padded with missing values when the lengths differ.
    df = pd.DataFrame([dates, reviews, star_ratings]).T
    df.columns = cols
    df['date'] = df['date'].apply(lambda x: x.strip())
    return df

In [None]:
# Trial run of yelp_review_scraper on the first business only:
# 25 calls at start offsets 0, 20, ..., 480.
# Note that this rebinds df_list and coffee_ny_ny from the earlier cell.
df_list = []
for x in range(0,500,20):
    df = yelp_review_scraper(coffee_ny_ny_list[0], x)
    df_list.append(df)
# Stack the per-page frames into one table, then drop rows with any missing value.
coffee_ny_ny = pd.concat(df_list, join='outer', axis=0, ignore_index=True)
coffee_ny_ny = coffee_ny_ny.dropna()
print(coffee_ny_ny.shape)
coffee_ny_ny.head()