# Yelp API Webscraping

In [3]:
import pandas as pd 
import json
import requests
import time, datetime

# Compiled Code
- Don't forget to comment so we can understand!

## known issues:
- function populates dataframe with NaN rows
- do we want business name?
- businesses have multiple categories and function only picks up one at a time
    - currently the 'alias' portion picks up the alias of the business (name), not the category alias

In [4]:
# NOTHING IN THIS CELL NEEDS TO GET CHANGED
# IF YOU HAVE YOUR API KEY IN 'creds.json' IN ../Assets

# format your json file as a dictionary containing api key with DOUBLE QUOTES
# {"api": "your_super_long_api_key"}

# load credentials into variable; the `with` block closes the file as soon as
# the JSON has been parsed instead of leaving the handle open for the session
with open('../Assets/creds.json') as creds_file:
    yelp_credentials = json.load(creds_file)

api_key = yelp_credentials['api']

# every request sends the key as a bearer token in the Authorization header
headers = {'Authorization': 'Bearer %s' % api_key}

# this is the url we use to make broad business searches
# https://www.yelp.com/developers/documentation/v3/business_search

url = 'https://api.yelp.com/v3/businesses/search'

In [52]:

def get_businesses(location):
    """Query the Yelp business search for `location` and tabulate priced businesses.

    Pages through the search endpoint (module-level `url`, authenticated with the
    module-level `headers`) 50 results at a time for offsets 0 through 950, with
    `radius` set to 3200. Paging stops early on any non-200 response or on an
    empty page.

    Only businesses that carry a 'price' field are kept. Each kept business
    becomes one record with: id, latitude, longitude, price, review_count,
    rating, zip_code, city, alias (the business alias, not a category alias) and
    category (a list holding the alias of every category of the business).

    Parameters
    ----------
    location : str
        Free-text location passed to the search, e.g. 'Koreatown'. It is also
        embedded verbatim in the output file name.

    Returns
    -------
    str
        The CSV path "../data/<location>_businesses<YYYY-MM-DD_HH_MM>.csv".
        Nothing is written to that path until the `to_csv` line near the end
        of the function is uncommented.
    """
    businesses_object = []
    for offset in range(0, 1000, 50):
        params = {
            'limit': 50,
            # requests URL-encodes parameter values itself; replacing spaces
            # with '+' beforehand would send a literal '+' to the API
            'location': location,
            'radius': 3200,
            'offset': offset
        }
        response = requests.get(url, headers=headers, params=params)
        if response.status_code == 400:
            print('400 Bad Request')
            break
        if response.status_code != 200:
            # report other failures instead of silently requesting the next page
            print(f'Unexpected status code {response.status_code}; stopping pagination')
            break

        page = response.json()['businesses']
        if not page:
            # an empty page means there is nothing left to fetch
            break
        businesses_object += page

    # one dictionary per business that has a price
    businesses = []
    total = len(businesses_object)

    for i, business in enumerate(businesses_object):

        # businesses without a price are skipped entirely; appending a partial
        # record for them is what filled the dataframe with NaN rows
        if 'price' in business:
            businesses.append({
                'id':           business['id'],
                'latitude':     business['coordinates']['latitude'],
                'longitude':    business['coordinates']['longitude'],
                'price':        business['price'],
                'review_count': business['review_count'],
                'rating':       business['rating'],
                'zip_code':     business['location']['zip_code'],
                'city':         business['location']['city'],
                'alias':        business['alias'],
                # a business can have several categories; keep all their aliases
                'category':     [cat['alias'] for cat in business.get('categories', [])],
            })

        # just a little sanity check to see how far along our function is going
        if (i+1) % 100 == 0:
            print(f'{i+1} out of {total}')

    businesses = pd.DataFrame(businesses)

    # Builds the csv file name for the business dataframe from the current time
    now = datetime.datetime.now()
    businesses_path = "../data/" + location +"_businesses" + now.strftime("%Y-%m-%d_%H_%M") + ".csv"

    # Uncomment this line when you're ready to start saving files
    # businesses.to_csv(businesses_path,index=False)

    return businesses_path

In [54]:
# Run the search for Koreatown; the cell displays the CSV path the function returns.
# NOTE(review): the output recorded under this cell names 'Koreatown Los Angeles',
# which this argument cannot produce -- it looks stale from an earlier run; re-run to refresh.
get_businesses('Koreatown')

100 out of 1000
200 out of 1000
300 out of 1000
400 out of 1000
500 out of 1000
600 out of 1000
700 out of 1000
800 out of 1000
900 out of 1000
1000 out of 1000


'../data/Koreatown Los Angeles_businesses2019-10-24_12_25.csv'

## aerika section

## echo section

added comment

In [16]:
import time

In [17]:
# borrowed this function from project 3 - jerry. Echo: commented out a few lines so it works with the timed loop in a later cell

def get_price(yelp_object):
    """Flatten a Yelp search response into one record per priced business.

    Parameters
    ----------
    yelp_object : dict
        Parsed search response; its 'businesses' entry is the list of
        business dictionaries to process.

    Returns
    -------
    list of dict
        One dictionary per business that has a 'price' field, with the keys
        id, latitude, longitude, price, review_count, rating, zip_code, city
        and alias. Businesses without a price are left out. The result is a
        plain list so the caller can extend a running list and build the
        DataFrame once at the end.

    Raises
    ------
    KeyError
        If `yelp_object` has no 'businesses' entry, or a priced business is
        missing one of the fields listed above.
    """
    businesses = []

    for business in yelp_object['businesses']:
        # Skip unpriced businesses. Appending an empty dictionary for them
        # produced rows that were NaN in every column of the DataFrame.
        if 'price' not in business:
            continue

        businesses.append({
            'id':           business['id'],
            'latitude':     business['coordinates']['latitude'],
            'longitude':    business['coordinates']['longitude'],
            'price':        business['price'],
            'review_count': business['review_count'],
            'rating':       business['rating'],
            'zip_code':     business['location']['zip_code'],
            'city':         business['location']['city'],
            'alias':        business['alias'],
        })

    return businesses
   

In [None]:
## call the function once per second to build up the list of business records

In [18]:
# Accumulate the records returned by get_price over repeated calls, pausing
# one second between calls.
# NOTE(review): `yelp` is not assigned in any cell visible here -- it must come
# from kernel state; define it (the parsed search response) before this cell.
# NOTE(review): the same `yelp` object is parsed on every pass, so each pass
# appends the same records again -- confirm whether a fresh request per
# iteration was intended.
business_list = []  ## set the loop bound (6 below) to your ideal number. Yelp claims we can get 5000 a day
n=0
while n<6:
    business_list.extend(get_price(yelp))
    time.sleep(1)
    n+=1

In [22]:
df = pd.DataFrame(business_list)

In [24]:
df

Unnamed: 0,id,latitude,longitude,price,review_count,rating,zip_code,city,alias
0,CcqraT0cuGKYEcZ1ri_kxg,34.040403,-118.253512,$$,689,5.0,90015,Los Angeles,broken-mouth-lees-homestyle-los-angeles-5
1,KQBGm5G8IDkE8LeNY45mbA,34.045605,-118.236061,$$,8205,4.0,90013,Los Angeles,wurstküche-los-angeles-2
2,pjh40JY5YwWeV8aKhkXERg,34.065770,-118.308470,$,702,4.5,90020,Los Angeles,myungrang-hot-dog-california-market-la-los-ang...
3,b4SH4SbQUJfXxh6hNkF0wg,34.050529,-118.248619,$,5597,4.0,90013,Los Angeles,eggslut-los-angeles-7
4,bvpxd7o3RZtz50YLpDsgLA,34.057125,-118.346074,$,1266,4.5,90019,Los Angeles,el-chato-taco-truck-los-angeles-2
...,...,...,...,...,...,...,...,...,...
115,oNHjZ3wC0F9l5KpuqHd45w,34.114824,-118.269945,$,1160,4.5,90027,Los Angeles,rickys-fish-tacos-los-angeles-3
116,23Qrcz2i9e2e8MstUIIO0A,34.062181,-118.348153,$$,1527,4.5,90036,Los Angeles,yuko-kitchen-los-angeles
117,HXWdcnzYG1zmf0vDplmEEQ,34.059960,-118.419820,$$,354,4.0,90067,Los Angeles,the-crack-shack-los-angeles-2
118,F1_EZV0z5gjoZu6K4BUUzQ,34.154819,-118.431921,$$,314,4.5,91423,Sherman Oaks,hot-motha-clucker-sherman-oaks


In [None]:
#df.to_csv('../data/echo1023.csv')

In [30]:
# Reload the saved scrape from disk; index_col=0 uses the first CSV column as
# the row index. This rebinds `df`, replacing the frame built from business_list.
df = pd.read_csv('../data/echo1023.csv',index_col=0)
df

Unnamed: 0,id,latitude,longitude,price,review_count,rating,zip_code,city,alias
0,CcqraT0cuGKYEcZ1ri_kxg,34.040403,-118.253512,$$,689.0,5.0,90015.0,Los Angeles,broken-mouth-lees-homestyle-los-angeles-5
1,KQBGm5G8IDkE8LeNY45mbA,34.045605,-118.236061,$$,8205.0,4.0,90013.0,Los Angeles,wurstküche-los-angeles-2
2,pjh40JY5YwWeV8aKhkXERg,34.065770,-118.308470,$,702.0,4.5,90020.0,Los Angeles,myungrang-hot-dog-california-market-la-los-ang...
3,b4SH4SbQUJfXxh6hNkF0wg,34.050529,-118.248619,$,5597.0,4.0,90013.0,Los Angeles,eggslut-los-angeles-7
4,sYn3SNQP-j2t2XSwjlCbRg,34.064690,-118.308760,$$,1618.0,4.5,90020.0,Los Angeles,montys-good-burger-los-angeles
...,...,...,...,...,...,...,...,...,...
6095,CbW8U0QAwh5XRkaLt0xNZA,34.077585,-118.259599,$$,4023.0,4.0,90026.0,Los Angeles,masa-of-echo-park-los-angeles
6096,fjWpZPr4YDJRKdnj0fqlkg,34.083320,-118.327380,$$,33.0,5.0,90038.0,Los Angeles,omakase-teriyaki-los-angeles
6097,ohosmz6FXVAeoW5nUkYwng,34.090596,-118.277172,$$,2151.0,4.5,90026.0,Los Angeles,pine-and-crane-los-angeles
6098,,,,,,,,,


In [34]:
df[df.isna().any(axis=1)] ## show all the rows with nan

Unnamed: 0,id,latitude,longitude,price,review_count,rating,zip_code,city,alias
14,,,,,,,,,
18,,,,,,,,,
34,,,,,,,,,
38,,,,,,,,,
54,,,,,,,,,
...,...,...,...,...,...,...,...,...,...
6058,,,,,,,,,
6074,,,,,,,,,
6078,,,,,,,,,
6094,,,,,,,,,


In [None]:
## going to drop the NaN rows first, before we deal with any remaining missing values

In [35]:
# Keep only complete rows: dropna() with its defaults removes every row that
# contains at least one missing value.
# NOTE(review): the df.info() output recorded further down still reports 6100
# entries with 5490 non-null, so it appears to predate this cell -- re-run in order.
df = df.dropna()

Unnamed: 0,id,latitude,longitude,price,review_count,rating,zip_code,city,alias
0,CcqraT0cuGKYEcZ1ri_kxg,34.040403,-118.253512,$$,689.0,5.0,90015.0,Los Angeles,broken-mouth-lees-homestyle-los-angeles-5
1,KQBGm5G8IDkE8LeNY45mbA,34.045605,-118.236061,$$,8205.0,4.0,90013.0,Los Angeles,wurstküche-los-angeles-2
2,pjh40JY5YwWeV8aKhkXERg,34.065770,-118.308470,$,702.0,4.5,90020.0,Los Angeles,myungrang-hot-dog-california-market-la-los-ang...
3,b4SH4SbQUJfXxh6hNkF0wg,34.050529,-118.248619,$,5597.0,4.0,90013.0,Los Angeles,eggslut-los-angeles-7
4,sYn3SNQP-j2t2XSwjlCbRg,34.064690,-118.308760,$$,1618.0,4.5,90020.0,Los Angeles,montys-good-burger-los-angeles
...,...,...,...,...,...,...,...,...,...
6093,WjuO8PhtqINeWqm-ei8NQA,34.069300,-118.292530,$$,508.0,4.5,90020.0,Los Angeles,dumpling-house-los-angeles-2
6095,CbW8U0QAwh5XRkaLt0xNZA,34.077585,-118.259599,$$,4023.0,4.0,90026.0,Los Angeles,masa-of-echo-park-los-angeles
6096,fjWpZPr4YDJRKdnj0fqlkg,34.083320,-118.327380,$$,33.0,5.0,90038.0,Los Angeles,omakase-teriyaki-los-angeles
6097,ohosmz6FXVAeoW5nUkYwng,34.090596,-118.277172,$$,2151.0,4.5,90026.0,Los Angeles,pine-and-crane-los-angeles


In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6100 entries, 0 to 6099
Data columns (total 9 columns):
id              5490 non-null object
latitude        5490 non-null float64
longitude       5490 non-null float64
price           5490 non-null object
review_count    5490 non-null float64
rating          5490 non-null float64
zip_code        5490 non-null float64
city            5490 non-null object
alias           5490 non-null object
dtypes: float64(5), object(4)
memory usage: 476.6+ KB


In [None]:
## it seems no business is missing only some of its columns (rows are either complete or entirely NaN), so we don't know what else to clean yet

## jerry section

In [None]:
# ds;alkfjds;lfkas;dghas'dlfjas;dlkfjas'dfj

In [1]:
# this is another change a;sdfja;dfkasd;fajsd;fasdkfjas;fwef

In [2]:
# yet another change ;asdlfkjas;dknva;djaweoifsd;laknfas;dfknasd

In [3]:
# numpy is not imported by the notebook's import cell, so bring it in here;
# without this the cell raises NameError on a fresh kernel
import numpy as np

# e raised to the 52nd power; the bare `x` on the last line displays the value
x = np.exp(52)
x

3.831008000716577e+22