In [40]:
import csv
import json
import re
from typing import List, Dict

import requests
from bs4 import BeautifulSoup as bs

from YELP_API_KEY import YELP_API_KEY

In [2]:
def search_nested(nested_dictionary, keys: set, aliases=None, result=None):
    """Collect the wanted key/value pairs from a (possibly nested) dictionary.

    Walks ``nested_dictionary`` in iteration order. A key found in ``keys`` is
    recorded as a one-item dict, under its name from ``aliases`` when one is
    given; its value is recorded whole and not searched any further. Any other
    value that is itself a dict is searched recursively.

    Args:
        nested_dictionary: Dictionary to search.
        keys: Key names to collect.
        aliases: Optional mapping from an original key name to the name the
            match should be recorded under.
        result: Optional list to append matches to. A new list is created when
            omitted, so separate calls never share matches.

    Returns:
        The list of one-item dicts, in the order the matches were found.
    """
    # A mutable default (``result=[]``) is built once and shared by every
    # call, which leaks matches from one search into the next.
    if result is None:
        result = []
    for key, value in nested_dictionary.items():
        if key in keys:
            name = aliases[key] if aliases is not None and key in aliases else key
            result.append({name: value})
        elif isinstance(value, dict):
            # Recurse with the same accumulator so nested matches land in
            # the caller's list.
            search_nested(value, keys, aliases, result)
    return result

In [3]:
def get_business_info(ids: set,
                      keys: set,
                      api_key: str,
                      aliases=None):
    """Look up each business on the Yelp business API and keep the wanted fields.

    Args:
        ids: Yelp business ids; one GET request is made per id.
        keys: Names of the response fields to keep. Fields are matched at the
            top level of the response and inside nested dicts.
        api_key: Yelp API key, sent as a Bearer token with every request.
        aliases: Optional mapping from a Yelp field name to the name it
            should be stored under.

    Returns:
        A list with one dict per id. Each dict holds ``'id'`` plus every
        requested field found in that business's response.
    """
    businesses: List[Dict] = []
    url = 'https://api.yelp.com/v3/businesses/'
    # Authenticate with the key the caller passed in, not the module-level one.
    headers = {'Authorization': f'Bearer {api_key}'}
    for id_ in ids:
        response = requests.get(f'{url}{id_}', headers=headers)
        # Decode the body once instead of re-parsing it for every field.
        payload = response.json()
        business = {'id': id_}
        for key, value in payload.items():
            if key in keys:
                name = aliases[key] if aliases is not None and key in aliases else key
                business[name] = value
            elif isinstance(value, dict):
                # Hand over a fresh list so matches from an earlier business
                # or nested dict can never carry over into this one.
                for match in search_nested(value, keys, aliases, []):
                    business.update(match)
        businesses.append(business)
    return businesses

In [50]:
def scrape_business_info(yelp_url):
    """Scrape a business's own website address from its Yelp page.

    Fetches ``yelp_url`` and looks for the first ``<a role="link">`` element
    whose ``href`` starts with ``/biz_redir?url=``.

    Args:
        yelp_url: URL of the business's Yelp page.

    Returns:
        The text of that link (e.g. ``'barisabel.com'``), or ``None`` when the
        page has no such link.
    """
    response = requests.get(yelp_url)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = bs(response.text, 'html.parser')
    # Raw string: in a plain literal "\/" and "\?" are invalid escape sequences.
    website_link = soup.find('a', href=re.compile(r'^/biz_redir\?url='), role='link')
    if website_link is None:
        return None
    return website_link.text

In [5]:
# Collect the business ids from the first column of the CSV file.
csv_file = 'businesses_v2.csv'
# newline='' leaves line-ending handling to the csv module. The handle gets
# its own name so `csv_file` keeps holding the path instead of a closed file.
with open(csv_file, 'r', newline='') as csv_handle:
    # Blank lines are read as empty rows, which have no first column to take.
    ids: set = {row[0] for row in csv.reader(csv_handle) if row}

In [6]:
# Yelp response fields to keep; they are matched at the top level of each
# response and inside its nested dicts.
keys = {
    # listing details
    'name', 'image_url', 'url', 'phone', 'categories', 'photos', 'price',
    # address parts
    'address1', 'city', 'zip_code', 'country', 'state',
    # coordinates
    'latitude', 'longitude',
}
# Fields that are stored under a different name than the one Yelp uses.
aliases = dict(
    url='yelp_url',
    address1='street_address',
    zip_code='postal_code',
    state='province',
)
businesses = get_business_info(ids, keys, YELP_API_KEY, aliases)
# Yelp's own `url` was stored as `yelp_url` above, which leaves `url` free for
# the value scraped from that page.
for business in businesses:
    business.update(url=scrape_business_info(business['yelp_url']))

In [53]:
# Fill in the scraped website for every business that does not have one yet.
# Records that already carry a 'url' entry are skipped, so re-running this
# cell does not download pages that were already scraped.
for business in businesses:
    if 'url' not in business:
        business['url'] = scrape_business_info(business['yelp_url'])

In [54]:
# Show the collected records, including the scraped 'url' added to each one.
businesses

[{'id': 'q5xrVJ4kivx_yEfJeOKNYQ',
  'name': 'Bar Isabel',
  'image_url': 'https://s3-media3.fl.yelpcdn.com/bphoto/BnLe7mg4YaszrnZntiSPQQ/o.jpg',
  'yelp_url': 'https://www.yelp.com/biz/bar-isabel-toronto?adjust_creative=dWJeA2crTuyuRlT4LXbwew&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_lookup&utm_source=dWJeA2crTuyuRlT4LXbwew',
  'phone': '+14165322222',
  'categories': [{'alias': 'spanish', 'title': 'Spanish'},
   {'alias': 'tapas', 'title': 'Tapas Bars'}],
  'street_address': '797 College Street',
  'city': 'Toronto',
  'postal_code': 'M6G 1C6',
  'country': 'CA',
  'province': 'ON',
  'latitude': 43.65463,
  'longitude': -79.42075,
  'photos': ['https://s3-media3.fl.yelpcdn.com/bphoto/BnLe7mg4YaszrnZntiSPQQ/o.jpg',
   'https://s3-media4.fl.yelpcdn.com/bphoto/A4FvPG0Kqz3NiHHWtLVhSg/o.jpg',
   'https://s3-media1.fl.yelpcdn.com/bphoto/m-VsYexkbT8kWw7YDjXIIA/o.jpg'],
  'price': '$$$',
  'url': 'barisabel.com'},
 {'id': 'fh8a_k9oslEDSHbmJLzUrQ',
  'name': 'Kekou Gelato',
  'image