# Business Recommendation By Review
- Scrape and analyze Yelp review data for businesses in New York City
- Using NLP techniques, train a model that identifies the features highly reviewed establishments have in common, and use those features to generate recommendations

https://www.yelp.com/biz/coffee-project-ny-new-york?osq=coffee%20shop&start=20&sort_by=rating_desc

In [11]:
import asyncio
import requests
import nest_asyncio
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

In [12]:
def async_web_scrape(iterable, individual_scrape_function, *args, max_workers=40):
    """
    Run ``individual_scrape_function`` once per item of ``iterable`` on a thread pool.

    Every call has the form ``individual_scrape_function(item, *args)``. The
    calls are submitted through ``loop.run_in_executor`` and this function
    blocks until all of them have finished. Return values of the individual
    calls are discarded, so the scrape function is expected to deliver its
    results through one of ``args`` (for example by extending a shared list).

    ``nest_asyncio.apply()`` is called first so that ``run_until_complete``
    can be used even when an event loop is already running, as is the case
    inside a Jupyter notebook.

    Args:
        iterable: Items to process; each one becomes the first positional
            argument of a call.
        individual_scrape_function: Blocking callable executed in a worker
            thread.
        *args: Extra positional arguments forwarded unchanged to every call.
        max_workers: Size of the thread pool (keyword-only, default 40).

    Returns:
        None.

    Raises:
        Exception: The first exception raised by any call is propagated by
            ``asyncio.gather``.
    """
    nest_asyncio.apply()

    async def create_scrape_loop():
        """Submit one executor job per item and wait for all of them."""
        running_loop = asyncio.get_running_loop()
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            tasks = [
                running_loop.run_in_executor(
                    executor, individual_scrape_function, item, *args
                )
                for item in iterable
            ]
            await asyncio.gather(*tasks)

    # run_until_complete wraps the coroutine in a task itself, so no explicit
    # ensure_future call is needed.
    loop = asyncio.get_event_loop()
    loop.run_until_complete(create_scrape_loop())

In [38]:
def filter_biz_url(url):
    """
    Return ``url`` unchanged unless it contains an excluded substring.

    A URL containing any of the fragments listed below is rejected and
    ``None`` is returned instead.
    """
    excluded_fragments = (
        "?hrid",
        "/adredir?ad_business_id",
        "/search?cflt",
        "yelp.com/city/nyc",
        "databyacxiom",
    )
    for fragment in excluded_fragments:
        if fragment in url:
            return None
    return url

In [31]:
def scrape_yelp_search(start_number, all_links, business_type="coffee",
                       location="New%20York%2C%20NY", timeout=30):
    """
    Fetch one Yelp search-results page and append its business links to ``all_links``.

    The page is requested from
    ``https://www.yelp.com/search?cflt=<business_type>&find_loc=<location>&start=<start_number>``.
    Every matching anchor's ``href`` is passed through ``filter_biz_url``;
    the ones it keeps are added to ``all_links`` in place.

    Args:
        start_number: Value of the ``start`` query parameter (result offset).
        all_links: List that receives the kept links; mutated in place.
        business_type: Value of the ``cflt`` query parameter. Defaults to
            ``"coffee"``.
        location: Value of the ``find_loc`` query parameter. It is inserted
            into the URL as-is, so it must already be percent-encoded.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block a worker indefinitely.

    Returns:
        None. Results are delivered through ``all_links``.
    """
    url = (
        "https://www.yelp.com/search?cflt=" + business_type
        + "&find_loc=" + location
        + "&start=" + str(start_number)
    )
    html = requests.get(url, timeout=timeout).text
    soup = BeautifulSoup(html, "html.parser")
    # NOTE(review): these look like build-generated class names and may stop
    # matching if the site markup changes - confirm before relying on them.
    anchors = soup.find_all("a",{"class":"lemon--a__373c0__IEZFH link__373c0__29943 link-color--blue-dark__373c0__1mhJo link-size--inherit__373c0__2JXk5"})
    links = []
    for anchor in anchors:
        href = anchor.get("href")
        if href is None:
            # Anchor without an href attribute: nothing to collect.
            continue
        kept = filter_biz_url(href)  # evaluated once per link
        if kept is not None:
            links.append(kept)
    all_links.extend(links)

In [22]:
# Build the search URL for a single results page.
business_type = "italian"  # business type: painter, autorepair, burger
# city, state - spaces and commas are percent-encoded by hand
location = "New York, NY".replace(" ", "%20").replace(",", "%2C")
start_num = 0  # yelp shows 30 per page, so increments of 30
url = "https://www.yelp.com/search?cflt={}&find_loc={}&start={}".format(
    business_type, location, start_num
)
url

'https://www.yelp.com/search?cflt=italian&find_loc=New%20York%2C%20NY&start=0'

In [40]:
# Scrape 40 result pages (start offsets 0, 30, ..., 1170) and pool their links.
# NOTE(review): business_type and location are set here but are not passed to
# scrape_yelp_search, which builds its own URL - confirm which search is intended.
business_type = "italian"  # business type: painter, autorepair, burger
location = "New York, NY".replace(" ", "%20").replace(",", "%2C")  # city, state
all_links = []  # filled in place by scrape_yelp_search
async_web_scrape(list(range(0, 1200, 30)), scrape_yelp_search, all_links)

In [42]:
# Number of links gathered across all scraped pages (the list is not de-duplicated).
len(all_links)

990