# Yelp API Webscraping

In [25]:
import pandas as pd 
import json
import requests

# Compiled Code
- Don't forget to comment so we can understand!

## known issues:
- function populates dataframe with NaN rows
- do we want business name?
- businesses have multiple categories and function only picks up one at a time
    - currently the 'alias' portion picks up the alias of the business (name), not the category alias

In [9]:
# NOTHING IN THIS CELL NEEDS TO GET CHANGED 
# IF YOU HAVE YOUR API KEY IN 'creds.json' IN ./Assets

# format your json file as a dictionary containing api key with DOUBLE QUOTES
# {"api": "your_super_long_api_key"}
creds_file = open('../Assets/creds.json')

# load credentials into variable
yelp_credentials = json.loads(creds_file.read())
api_key = yelp_credentials['api']
headers = {'Authorization': 'Bearer %s' % api_key}

# this is the url we use to make broad business searches
# https://www.yelp.com/developers/documentation/v3/business_search

url = 'https://api.yelp.com/v3/businesses/search'

In [10]:
# depending on what we want to search we can change values in this dictionary
params = {'term':'food', 'location': 'Los Angeles'}

In [11]:
# ONLY RUN THIS CELL IF YOU WANT TO MAKE A REQUEST
req = requests.get(url, params=params, headers=headers)
print(f'Status Code: {req.status_code}')

Status Code: 200


In [12]:
yelp = json.loads(req.text)

In [13]:
businesses_object = yelp['businesses']
businesses_object[15]

# id
# coordinates
# price
# review_count
# rating
# location > zip_code
# location > city
# categories > alias

{'id': 'oNHjZ3wC0F9l5KpuqHd45w',
 'alias': 'rickys-fish-tacos-los-angeles-3',
 'name': "Ricky's Fish Tacos",
 'image_url': 'https://s3-media2.fl.yelpcdn.com/bphoto/xOUSwBSssYYBgbLzqV_2dg/o.jpg',
 'is_closed': False,
 'url': 'https://www.yelp.com/biz/rickys-fish-tacos-los-angeles-3?adjust_creative=1hBkN0UhEY8hcT3qXV3ivQ&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=1hBkN0UhEY8hcT3qXV3ivQ',
 'review_count': 1160,
 'categories': [{'alias': 'foodtrucks', 'title': 'Food Trucks'},
  {'alias': 'tacos', 'title': 'Tacos'}],
 'rating': 4.5,
 'coordinates': {'latitude': 34.114824, 'longitude': -118.269945},
 'transactions': [],
 'price': '$',
 'location': {'address1': '3201 Riverside Dr',
  'address2': None,
  'address3': '',
  'city': 'Los Angeles',
  'zip_code': '90027',
  'country': 'US',
  'state': 'CA',
  'display_address': ['3201 Riverside Dr', 'Los Angeles, CA 90027']},
 'phone': '+13233956233',
 'display_phone': '(323) 395-6233',
 'distance': 7581.229822603469}

In [14]:
# stole this function from project 3 - jerry

def get_price(yelp_object):
    
    # empty list container containing dictionaries representing each unique business
    businesses = []
    businesses_object = yelp_object['businesses']
    
    # looking at each subreddit which is passed into our function
    for i, business in enumerate(list(businesses_object)):
        
        # each dictionary will contain all of the desired information from each post
        business_dict = {}

        if 'price' in businesses_object[i].keys():
            business_dict['id']           = business['id']
            business_dict['latitude']     = business['coordinates']['latitude']
            business_dict['longitude']    = business['coordinates']['longitude']
            business_dict['price']        = business['price']
            business_dict['review_count'] = business['review_count']
            business_dict['rating']       = business['rating']
            business_dict['zip_code']     = business['location']['zip_code']
            business_dict['city']         = business['location']['city']
            business_dict['alias']        = business['alias']

        # populate the posts list with each post dictionary
        businesses.append(business_dict)

        # just a little sanity check to see how far along our function is going
        print(f'{i+1} out of {len(businesses_object)}')
    
    businesses = pd.DataFrame(businesses)
    
    return businesses

In [15]:
get_price(yelp)

1 out of 20
2 out of 20
3 out of 20
4 out of 20
5 out of 20
6 out of 20
7 out of 20
8 out of 20
9 out of 20
10 out of 20
11 out of 20
12 out of 20
13 out of 20
14 out of 20
15 out of 20
16 out of 20
17 out of 20
18 out of 20
19 out of 20
20 out of 20


Unnamed: 0,id,latitude,longitude,price,review_count,rating,zip_code,city,alias
0,CcqraT0cuGKYEcZ1ri_kxg,34.040403,-118.253512,$$,689,5.0,90015,Los Angeles,broken-mouth-lees-homestyle-los-angeles-5
1,KQBGm5G8IDkE8LeNY45mbA,34.045605,-118.236061,$$,8205,4.0,90013,Los Angeles,wurstküche-los-angeles-2
2,pjh40JY5YwWeV8aKhkXERg,34.06577,-118.30847,$,702,4.5,90020,Los Angeles,myungrang-hot-dog-california-market-la-los-ang...
3,b4SH4SbQUJfXxh6hNkF0wg,34.050529,-118.248619,$,5597,4.0,90013,Los Angeles,eggslut-los-angeles-7
4,bvpxd7o3RZtz50YLpDsgLA,34.057125,-118.346074,$,1266,4.5,90019,Los Angeles,el-chato-taco-truck-los-angeles-2
5,DrToq9357afdpOyO5w0Y5w,34.064009,-118.300701,$$,1961,4.5,90020,Los Angeles,yup-dduk-la-los-angeles
6,9OhKC782fYh0akM1VIfMxQ,34.09745,-118.349941,$$,661,4.5,90046,Los Angeles,the-carving-board-los-angeles
7,26fIBSlQkaTa29uoKAvZOw,34.099915,-118.258928,$$,331,4.0,90039,Los Angeles,burgers-never-say-die-los-angeles-2
8,omaJSGvnj2vaIJ_MBxeyBw,34.032251,-118.334644,$,96,4.5,90018,Los Angeles,l-a-birria-los-angeles
9,WjuO8PhtqINeWqm-ei8NQA,34.0693,-118.29253,$$,508,4.5,90020,Los Angeles,dumpling-house-los-angeles-2


## aerika section

In [1]:
hi

NameError: name 'hi' is not defined

In [2]:
my

NameError: name 'my' is not defined

In [3]:
name

NameError: name 'name' is not defined

## echo section

added comment

In [16]:
import time

In [17]:
# stole this function from project 3 - jerry-----Echo: comment out a few lines to make them work with time in next cell

def get_price(yelp_object):
    businesses = []
    # empty list container containing dictionaries representing each unique business
    
    businesses_object = yelp_object['businesses']
    
    # looking at each subreddit which is passed into our function
    for i, business in enumerate(list(businesses_object)):
        
        # each dictionary will contain all of the desired information from each post
        business_dict = {}

        if 'price' in businesses_object[i].keys():
            business_dict['id']           = business['id']
            business_dict['latitude']     = business['coordinates']['latitude']
            business_dict['longitude']    = business['coordinates']['longitude']
            business_dict['price']        = business['price']
            business_dict['review_count'] = business['review_count']
            business_dict['rating']       = business['rating']
            business_dict['zip_code']     = business['location']['zip_code']
            business_dict['city']         = business['location']['city']
            business_dict['alias']        = business['alias']

        # populate the posts list with each post dictionary
        businesses.append(business_dict)

        # just a little sanity check to see how far along our function is going
        #print(f'{i+1} out of {len(businesses_object)}')
    return businesses
    #businesses = pd.DataFrame(businesses)
   

In [None]:
##let the function run 1 time every second to get data list

In [18]:
business_list = []  ## set n to your ideal number. Yelp claims we canget 5000 a day
n=0
while n<6:
    business_list.extend(get_price(yelp))
    time.sleep(1)
    n+=1

In [22]:
df = pd.DataFrame(business_list)

In [24]:
df

Unnamed: 0,id,latitude,longitude,price,review_count,rating,zip_code,city,alias
0,CcqraT0cuGKYEcZ1ri_kxg,34.040403,-118.253512,$$,689,5.0,90015,Los Angeles,broken-mouth-lees-homestyle-los-angeles-5
1,KQBGm5G8IDkE8LeNY45mbA,34.045605,-118.236061,$$,8205,4.0,90013,Los Angeles,wurstküche-los-angeles-2
2,pjh40JY5YwWeV8aKhkXERg,34.065770,-118.308470,$,702,4.5,90020,Los Angeles,myungrang-hot-dog-california-market-la-los-ang...
3,b4SH4SbQUJfXxh6hNkF0wg,34.050529,-118.248619,$,5597,4.0,90013,Los Angeles,eggslut-los-angeles-7
4,bvpxd7o3RZtz50YLpDsgLA,34.057125,-118.346074,$,1266,4.5,90019,Los Angeles,el-chato-taco-truck-los-angeles-2
...,...,...,...,...,...,...,...,...,...
115,oNHjZ3wC0F9l5KpuqHd45w,34.114824,-118.269945,$,1160,4.5,90027,Los Angeles,rickys-fish-tacos-los-angeles-3
116,23Qrcz2i9e2e8MstUIIO0A,34.062181,-118.348153,$$,1527,4.5,90036,Los Angeles,yuko-kitchen-los-angeles
117,HXWdcnzYG1zmf0vDplmEEQ,34.059960,-118.419820,$$,354,4.0,90067,Los Angeles,the-crack-shack-los-angeles-2
118,F1_EZV0z5gjoZu6K4BUUzQ,34.154819,-118.431921,$$,314,4.5,91423,Sherman Oaks,hot-motha-clucker-sherman-oaks


In [None]:
#df.to_csv('../data/echo1023.csv')

In [30]:
df = pd.read_csv('../data/echo1023.csv',index_col=0)
df

Unnamed: 0,id,latitude,longitude,price,review_count,rating,zip_code,city,alias
0,CcqraT0cuGKYEcZ1ri_kxg,34.040403,-118.253512,$$,689.0,5.0,90015.0,Los Angeles,broken-mouth-lees-homestyle-los-angeles-5
1,KQBGm5G8IDkE8LeNY45mbA,34.045605,-118.236061,$$,8205.0,4.0,90013.0,Los Angeles,wurstküche-los-angeles-2
2,pjh40JY5YwWeV8aKhkXERg,34.065770,-118.308470,$,702.0,4.5,90020.0,Los Angeles,myungrang-hot-dog-california-market-la-los-ang...
3,b4SH4SbQUJfXxh6hNkF0wg,34.050529,-118.248619,$,5597.0,4.0,90013.0,Los Angeles,eggslut-los-angeles-7
4,sYn3SNQP-j2t2XSwjlCbRg,34.064690,-118.308760,$$,1618.0,4.5,90020.0,Los Angeles,montys-good-burger-los-angeles
...,...,...,...,...,...,...,...,...,...
6095,CbW8U0QAwh5XRkaLt0xNZA,34.077585,-118.259599,$$,4023.0,4.0,90026.0,Los Angeles,masa-of-echo-park-los-angeles
6096,fjWpZPr4YDJRKdnj0fqlkg,34.083320,-118.327380,$$,33.0,5.0,90038.0,Los Angeles,omakase-teriyaki-los-angeles
6097,ohosmz6FXVAeoW5nUkYwng,34.090596,-118.277172,$$,2151.0,4.5,90026.0,Los Angeles,pine-and-crane-los-angeles
6098,,,,,,,,,


In [34]:
df[df.isna().any(axis=1)] ## show all the rows with nan

Unnamed: 0,id,latitude,longitude,price,review_count,rating,zip_code,city,alias
14,,,,,,,,,
18,,,,,,,,,
34,,,,,,,,,
38,,,,,,,,,
54,,,,,,,,,
...,...,...,...,...,...,...,...,...,...
6058,,,,,,,,,
6074,,,,,,,,,
6078,,,,,,,,,
6094,,,,,,,,,


In [None]:
## gonna drop the nans before we learning the missing values

In [35]:
df = df.dropna()

Unnamed: 0,id,latitude,longitude,price,review_count,rating,zip_code,city,alias
0,CcqraT0cuGKYEcZ1ri_kxg,34.040403,-118.253512,$$,689.0,5.0,90015.0,Los Angeles,broken-mouth-lees-homestyle-los-angeles-5
1,KQBGm5G8IDkE8LeNY45mbA,34.045605,-118.236061,$$,8205.0,4.0,90013.0,Los Angeles,wurstküche-los-angeles-2
2,pjh40JY5YwWeV8aKhkXERg,34.065770,-118.308470,$,702.0,4.5,90020.0,Los Angeles,myungrang-hot-dog-california-market-la-los-ang...
3,b4SH4SbQUJfXxh6hNkF0wg,34.050529,-118.248619,$,5597.0,4.0,90013.0,Los Angeles,eggslut-los-angeles-7
4,sYn3SNQP-j2t2XSwjlCbRg,34.064690,-118.308760,$$,1618.0,4.5,90020.0,Los Angeles,montys-good-burger-los-angeles
...,...,...,...,...,...,...,...,...,...
6093,WjuO8PhtqINeWqm-ei8NQA,34.069300,-118.292530,$$,508.0,4.5,90020.0,Los Angeles,dumpling-house-los-angeles-2
6095,CbW8U0QAwh5XRkaLt0xNZA,34.077585,-118.259599,$$,4023.0,4.0,90026.0,Los Angeles,masa-of-echo-park-los-angeles
6096,fjWpZPr4YDJRKdnj0fqlkg,34.083320,-118.327380,$$,33.0,5.0,90038.0,Los Angeles,omakase-teriyaki-los-angeles
6097,ohosmz6FXVAeoW5nUkYwng,34.090596,-118.277172,$$,2151.0,4.5,90026.0,Los Angeles,pine-and-crane-los-angeles


In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6100 entries, 0 to 6099
Data columns (total 9 columns):
id              5490 non-null object
latitude        5490 non-null float64
longitude       5490 non-null float64
price           5490 non-null object
review_count    5490 non-null float64
rating          5490 non-null float64
zip_code        5490 non-null float64
city            5490 non-null object
alias           5490 non-null object
dtypes: float64(5), object(4)
memory usage: 476.6+ KB


In [None]:
## seems there is no business with only a few columns of nan, dont know what to clean yet

## jerry section

In [None]:
# ds;alkfjds;lfkas;dghas'dlfjas;dlkfjas'dfj

In [1]:
# this is another change a;sdfja;dfkasd;fajsd;fasdkfjas;fwef

In [2]:
# yet another change ;asdlfkjas;dknva;djaweoifsd;laknfas;dfknasd

In [3]:
x = np.exp(52)
x

3.831008000716577e+22