In [38]:
from config import client_id, api_key

import pandas as pd
import requests
import json
import time
import sqlite3

# Yelp Pet Stores and Pet Services in NYC (ETL): Extract

In [3]:
#function to parse Yelp Fusion API response for a list of Yelp businesses
def parse_response(response):
    """Reduce a Yelp Fusion business-search response to a list of flat records.

    Parameters
    ----------
    response : object exposing ``.json()``
        The HTTP response whose decoded JSON contains a ``'businesses'`` list.

    Returns
    -------
    list of dict
        One dict per business with the keys ``categories`` (list of category
        titles), ``id``, ``name``, ``is_closed``, ``review_count``, ``rating``
        and ``zip_code`` (read from the nested ``location`` entry).

    Raises
    ------
    KeyError
        If the payload or a business entry lacks one of the keys read here.
    """
    payload = response.json()

    def flatten(entry):
        #keep only the category titles and lift zip_code out of 'location'
        return {
            'categories': [tag['title'] for tag in entry['categories']],
            'id': entry['id'],
            'name': entry['name'],
            'is_closed': entry['is_closed'],
            'review_count': entry['review_count'],
            'rating': entry['rating'],
            'zip_code': entry['location']['zip_code'],
        }

    return [flatten(entry) for entry in payload['businesses']]
    
#function to call Yelp Fusion API with given search term, location and max search results
def call_yelp(term, location, search_max):
    """Page through the Yelp Fusion business search and collect parsed results.

    Parameters
    ----------
    term : str
        Search term, e.g. ``'Pet Stores'``.
    location : str
        Location text, e.g. ``'New York, NY'``.
    search_max : int
        Maximum number of businesses to request in total.

    Returns
    -------
    list of dict
        The concatenated output of ``parse_response`` for every page fetched.

    Raises
    ------
    requests.HTTPError
        If a page comes back with a 4xx/5xx status.
    """
    url = 'https://api.yelp.com/v3/businesses/search'
    headers = {'Authorization': 'Bearer {}'.format(api_key)}
    search_limit = 50
    search_max = int(search_max)
    search_results = []

    #step by page size up to search_max so a partial last page is still requested
    for search_offset in range(0, search_max, search_limit):
        page_limit = min(search_limit, search_max - search_offset)

        #pass the raw text: requests URL-encodes params itself, so replacing
        #spaces with '+' beforehand would send a literal plus sign
        url_params = {'term': term,
                      'location': location,
                      'limit': page_limit,
                      'offset': search_offset
                     }

        response = requests.get(url, headers=headers, params=url_params, timeout=30)
        print('term: {}, offset: {}, response: {}'.format(term, search_offset, response))

        #fail with the HTTP error instead of a KeyError inside parse_response
        response.raise_for_status()

        page = parse_response(response)
        if not page:
            #an empty page means the result set is exhausted
            break
        search_results.extend(page)

        #pause between requests only; no need to wait after the last page
        if search_offset + search_limit < search_max:
            time.sleep(2)

    return search_results

In [4]:
#pull and save 1,000 (max) pet stores in NYC
#the search term is passed inline rather than bound to `pet_stores`: that name
#holds the pet-stores DataFrame in the Transform section, and re-running this
#cell would otherwise overwrite the DataFrame with a string
location = 'New York, NY'
search_max = 1000

yelp_pet_stores = call_yelp('Pet Stores', location, search_max)

term: Pet Stores, offset: 0, response: <Response [200]>
term: Pet Stores, offset: 50, response: <Response [200]>
term: Pet Stores, offset: 100, response: <Response [200]>
term: Pet Stores, offset: 150, response: <Response [200]>
term: Pet Stores, offset: 200, response: <Response [200]>
term: Pet Stores, offset: 250, response: <Response [200]>
term: Pet Stores, offset: 300, response: <Response [200]>
term: Pet Stores, offset: 350, response: <Response [200]>
term: Pet Stores, offset: 400, response: <Response [200]>
term: Pet Stores, offset: 450, response: <Response [200]>
term: Pet Stores, offset: 500, response: <Response [200]>
term: Pet Stores, offset: 550, response: <Response [200]>
term: Pet Stores, offset: 600, response: <Response [200]>
term: Pet Stores, offset: 650, response: <Response [200]>
term: Pet Stores, offset: 700, response: <Response [200]>
term: Pet Stores, offset: 750, response: <Response [200]>
term: Pet Stores, offset: 800, response: <Response [200]>
term: Pet Stores,

In [17]:
#pull and save 1,000 (max) pet services in NYC
#the search term is passed inline rather than bound to `pet_services`: that name
#holds the pet-services DataFrame in the Transform section, and re-running this
#cell would otherwise overwrite the DataFrame with a string
location = 'New York, NY'
search_max = 1000

yelp_pet_services = call_yelp('Pet Services', location, search_max)

term: Pet Services, offset: 0, response: <Response [200]>
term: Pet Services, offset: 50, response: <Response [200]>
term: Pet Services, offset: 100, response: <Response [200]>
term: Pet Services, offset: 150, response: <Response [200]>
term: Pet Services, offset: 200, response: <Response [200]>
term: Pet Services, offset: 250, response: <Response [200]>
term: Pet Services, offset: 300, response: <Response [200]>
term: Pet Services, offset: 350, response: <Response [200]>
term: Pet Services, offset: 400, response: <Response [200]>
term: Pet Services, offset: 450, response: <Response [200]>
term: Pet Services, offset: 500, response: <Response [200]>
term: Pet Services, offset: 550, response: <Response [200]>
term: Pet Services, offset: 600, response: <Response [200]>
term: Pet Services, offset: 650, response: <Response [200]>
term: Pet Services, offset: 700, response: <Response [200]>
term: Pet Services, offset: 750, response: <Response [200]>
term: Pet Services, offset: 800, response: 

# Yelp Pet Stores and Pet Services in NYC (ETL): Transform

In [34]:
# build the pet stores dataframe: one row per parsed Yelp record
pet_stores = pd.DataFrame(yelp_pet_stores)
pet_stores.head()

Unnamed: 0,categories,id,is_closed,name,rating,review_count,zip_code
0,[Pet Stores],gxMLTap163ma5RJNK0EiNg,False,Hoboken Pet,4.5,30,7030
1,[Pet Stores],eTqmm-qOh4nStAcR6Ezn7A,False,Zee.Dog - SoHo,4.5,14,10013
2,"[Pet Stores, Pet Groomers]",tYZEFqmbGEA0cN6SGz7Swg,False,Petropolis,4.5,76,10006
3,[Pet Stores],YtJXZdh3lnuH2JuDtXBqqQ,False,NYC Pet,2.5,15,11211
4,[Pet Stores],IMw1Jr7T96is-7lccQUT3w,False,Petopia,4.0,52,10009


In [64]:
# build the pet services dataframe: one row per parsed Yelp record
pet_services = pd.DataFrame(yelp_pet_services)
pet_services.head()

Unnamed: 0,categories,id,is_closed,name,rating,review_count,zip_code
0,"[Dog Walkers, Pet Sitting]",81v--_Id0CEOEPNe8AwhmA,False,Happy Pants,5.0,33,10014
1,"[Dog Walkers, Pet Sitting]",2I0mcwFT6FZswHhb6ldaHw,False,Pet Sitting Pod,5.0,19,10002
2,[Pet Sitting],60kRIrCCxQCns1gIH-0CoQ,False,Paws & Relax,4.5,33,10009
3,"[Pet Groomers, Pet Stores, Dog Walkers]",FUNFrflWo_F0sLeUM3bCgQ,False,Le Pet Spa,5.0,39,10280
4,"[Dog Walkers, Pet Sitting]",VbfjSxyG0VTe4MYUIKIfAQ,False,Frida's Premium Pet Services,5.0,26,7302


In [129]:
#create list of unique pet store and service categories
#the set removes duplicates across both dataframes; sorting fixes the order,
#and a category's position in this list is what later serves as its id
pet_categories = sorted({
    category
    for businesses in (pet_stores, pet_services)
    for categories in businesses['categories']
    for category in categories
})
pet_categories

['Accessories',
 'Acupuncture',
 'Animal Shelters',
 'Aquarium Services',
 'Aquariums',
 'Art Galleries',
 'Beer, Wine & Spirits',
 'Bird Shops',
 'Cafes',
 'Child Care & Day Care',
 "Children's Clothing",
 'Coffee & Tea',
 'Community Service/Non-Profit',
 'Couriers & Delivery Services',
 'Cremation Services',
 'Doctors',
 'Dog Parks',
 'Dog Walkers',
 'Drugstores',
 'Emergency Pet Hospital',
 'Event Photography',
 'Gift Shops',
 'Grocery',
 'Handyman',
 'Holistic Animal Care',
 'Home & Garden',
 'Home Cleaning',
 'Home Organization',
 'Horse Boarding',
 'Horseback Riding',
 'Hospice',
 'House Sitters',
 'Ice Cream & Frozen Yogurt',
 'Insurance',
 'Landscaping',
 'Lighting Fixtures & Equipment',
 'Livestock Feed & Supply',
 'Local Fish Stores',
 'Office Cleaning',
 'Personal Assistants',
 'Pet Adoption',
 'Pet Boarding',
 'Pet Breeders',
 'Pet Cremation Services',
 'Pet Groomers',
 'Pet Insurance',
 'Pet Photography',
 'Pet Services',
 'Pet Sitting',
 'Pet Stores',
 'Pet Training',
 'P

In [132]:
#create junction table for businesses and categories
#look each category's position up in a dict built once, instead of scanning
#pet_categories with list.index() for every (business, category) pair
category_index = {name: position for position, name in enumerate(pet_categories)}

#parallel lists: business_ids[k] belongs to the category numbered category_ids[k]
business_ids = []
category_ids = []

#NOTE(review): every business is included here, while insert_businesses skips
#closed businesses and non-numeric zip codes, so some business_ids may have no
#row in stores_and_services - confirm whether that is intended
for businesses in (pet_stores, pet_services):
    for business_id, categories in zip(businesses['id'], businesses['categories']):
        for category in categories:
            business_ids.append(business_id)
            category_ids.append(category_index[category])

print(len(business_ids))
print(len(category_ids))

3274
3274


[49,
 49,
 49,
 44,
 49,
 49,
 49,
 49,
 49,
 49,
 49,
 49,
 49,
 49,
 49,
 49,
 49,
 49,
 49,
 49,
 50,
 44,
 49,
 49,
 49,
 49,
 49,
 44,
 48,
 49,
 47,
 49,
 47,
 49,
 49,
 49,
 44,
 49,
 49,
 49,
 49,
 49,
 49,
 49,
 50,
 44,
 49,
 49,
 49,
 49,
 49,
 50,
 47,
 49,
 49,
 49,
 50,
 49,
 49,
 49,
 44,
 49,
 49,
 49,
 49,
 49,
 49,
 49,
 49,
 49,
 44,
 49,
 41,
 44,
 4,
 61,
 49,
 49,
 49,
 49,
 0,
 46,
 49,
 44,
 49,
 49,
 49,
 49,
 49,
 49,
 49,
 49,
 42,
 49,
 49,
 49,
 49,
 44,
 41,
 44,
 49,
 49,
 44,
 50,
 44,
 49,
 41,
 49,
 49,
 49,
 49,
 49,
 49,
 49,
 49,
 49,
 49,
 44,
 49,
 49,
 44,
 17,
 49,
 49,
 48,
 44,
 44,
 49,
 50,
 49,
 49,
 49,
 48,
 44,
 49,
 50,
 49,
 49,
 49,
 49,
 49,
 49,
 49,
 49,
 49,
 49,
 49,
 42,
 49,
 49,
 49,
 49,
 49,
 49,
 49,
 44,
 49,
 49,
 44,
 49,
 50,
 44,
 44,
 17,
 49,
 44,
 49,
 50,
 49,
 49,
 44,
 49,
 49,
 44,
 49,
 49,
 50,
 44,
 49,
 49,
 44,
 17,
 48,
 49,
 49,
 50,
 44,
 49,
 40,
 2,
 12,
 49,
 49,
 49,
 49,
 8,
 49,
 49,
 49,
 49,
 50,

# Yelp Pet Stores and Pet Services in NYC (ETL): Load

In [39]:
#creating SQL connection
#the database path is relative to the notebook's working directory;
#`conn` and `c` are module-level and shared by the helper functions and cells below
conn = sqlite3.connect('../Data/pet_care_industry.db')
c = conn.cursor()

#function to create table
def create_table(query):
    """Execute ``query`` on the shared module-level cursor ``c``.

    Parameters
    ----------
    query : str
        SQL statement to run; the notebook passes CREATE TABLE statements.

    Notes
    -----
    The statement is passed to ``c.execute`` unchanged and nothing is
    committed here.
    """
    c.execute(query)

#function to close connection
def close_c_conn():
    """Close the shared cursor ``c`` and then the shared connection ``conn``.

    Nothing is committed here; the insert helpers commit their own work.
    """
    c.close()
    conn.close()

In [73]:
#create pet stores and services table
#one row per business, keyed by the Yelp business id
#NOTE(review): ZipCode is INTEGER and insert_businesses casts zip codes with
#int(), so a zip code with a leading zero is stored without it (the sample
#query output further down shows 7002) - confirm this suits downstream joins
create_query = """CREATE TABLE stores_and_services
                (id TEXT PRIMARY KEY,
                 Name TEXT,
                 Rating REAL,
                 Review_Count INTEGER,
                 ZipCode INTEGER);"""

#drop any previous copy first so re-running this cell starts from an empty table
c.execute('DROP TABLE IF EXISTS stores_and_services')
create_table(create_query)

In [75]:
#function to insert businesses into table
def insert_businesses(businesses):
    """Insert the open businesses with a numeric zip code into stores_and_services.

    Parameters
    ----------
    businesses : pandas.DataFrame
        Must provide the columns ``id``, ``name``, ``rating``,
        ``review_count``, ``zip_code`` and ``is_closed``.

    Notes
    -----
    Rows whose ``is_closed`` is truthy, or whose ``zip_code`` is not made up
    solely of decimal digits, are skipped. Existing rows with the same id are
    replaced. All rows are written through the shared cursor ``c`` and then
    committed on ``conn``.
    """
    insert_query = """INSERT OR REPLACE INTO stores_and_services
                  (id,
                   Name,
                   Rating,
                   Review_Count,
                   ZipCode)
                   VALUES
                   (?,?,?,?,?)"""

    #walk the needed columns in lockstep instead of building a row Series
    #with .iloc[i] for every single field
    columns = zip(businesses['id'],
                  businesses['name'],
                  businesses['rating'],
                  businesses['review_count'],
                  businesses['zip_code'],
                  businesses['is_closed'])

    rows = []
    for business_id, name, rating, review_count, zip_code, is_closed in columns:
        if is_closed:
            continue

        #isdecimal() accepts exactly the digit strings int() can parse;
        #isnumeric() also accepts characters such as fractions, which make
        #int() raise ValueError
        zip_text = str(zip_code)
        if not zip_text.isdecimal():
            continue

        rows.append((business_id,
                     name,
                     float(rating),
                     int(review_count),
                     int(zip_text)))

    c.executemany(insert_query, rows)
    conn.commit()
    
#insert pet store and services into table (stores first, then services)
for businesses in (pet_stores, pet_services):
    insert_businesses(businesses)

In [84]:
#check SQL pet store and services table, indexed by business name
stores_and_services = pd.read_sql_query("""SELECT Name, Rating, Review_Count, ZipCode
                                        FROM stores_and_services;""", conn).set_index('Name')
stores_and_services

Unnamed: 0_level_0,Rating,Review_Count,ZipCode
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
NYC Pet,2.5,15,11211
NYC Pet,5.0,8,11211
Petland Discounts,3.5,6,11237
Park slope Pet,5.0,10,11215
Petland Discounts,3.0,19,11216
Pet Central,3.5,6,10128
Petland Discounts,2.5,9,11206
Pet Valu,3.0,14,7002
"Slope Pet Food Supplies, Inc",5.0,8,11215
Dee's Pet Food,4.5,14,11238


In [113]:
#create unique categories table
#id is INTEGER to match the integer positions inserted by insert_categories and
#the INTEGER category_id column of businesses_categories that references it
create_query = """CREATE TABLE categories
                (id INTEGER PRIMARY KEY,
                 Name TEXT);"""

#drop any previous copy first so re-running this cell starts from an empty table
c.execute('DROP TABLE IF EXISTS categories')
create_table(create_query)

In [115]:
#function to insert categories into table
def insert_categories(categories):
    """Insert each category name into the categories table, keyed by its position.

    Parameters
    ----------
    categories : list of str
        Category names; the element at position ``i`` is stored with id ``i``.

    Notes
    -----
    Uses INSERT OR REPLACE, like the other insert helpers in this notebook,
    so re-running the cell overwrites existing ids instead of failing on the
    primary key. Rows are written through the shared cursor ``c`` and then
    committed on ``conn``.
    """
    c.executemany("""INSERT OR REPLACE INTO categories
                  (id,
                   Name)
                   VALUES
                   (?,?)""",
                  list(enumerate(categories)))

    conn.commit()
    
#insert categories into table
#each category's position in pet_categories becomes its id
insert_categories(pet_categories)

In [117]:
#check SQL categories table, indexed by category id
categories = pd.read_sql_query("""SELECT * FROM categories;""", conn).set_index('id')
categories

Unnamed: 0_level_0,Name
id,Unnamed: 1_level_1
0,Accessories
1,Animal Shelters
2,Aquarium Services
3,Aquariums
4,Art Galleries
5,"Beer, Wine & Spirits"
6,Bird Shops
7,Cafes
8,Children's Clothing
9,Coffee & Tea


In [119]:
#defining SQL query to create junction table for businesses and categories
#one row per (business, category) pair; the composite primary key keeps each pair unique
#NOTE(review): SQLite enforces FOREIGN KEY clauses only when
#`PRAGMA foreign_keys = ON` has been issued on the connection, and no such
#pragma is visible in this notebook - confirm whether enforcement is wanted
create_query = """CREATE TABLE IF NOT EXISTS businesses_categories
(   business_id TEXT    NOT NULL,
    category_id INTEGER NOT NULL,
    PRIMARY KEY(business_id, category_id),
    FOREIGN KEY(business_id) REFERENCES stores_and_services(id),
    FOREIGN KEY(category_id) REFERENCES categories(id));"""

#the table is dropped first, so the IF NOT EXISTS above never skips the create
c.execute('DROP TABLE IF EXISTS businesses_categories')
create_table(create_query)

In [134]:
#function to insert businesses and categories into table
def insert_businesses_categories(business_ids, category_ids):
    """Write (business id, category id) pairs to the businesses_categories table.

    Parameters
    ----------
    business_ids : list
        Business id for each pair.
    category_ids : list
        Category id for each pair; ``category_ids[k]`` goes with
        ``business_ids[k]``.

    Notes
    -----
    A pair that is already present is replaced. Rows are written through the
    shared cursor ``c`` and then committed on ``conn``.
    """
    statement = """INSERT OR REPLACE INTO businesses_categories
                  (business_id,
                   category_id)
                   VALUES
                   (?,?)"""

    #pairs are produced lazily by position, one per entry of business_ids
    pairs = ((business_ids[position], category_ids[position])
             for position in range(len(business_ids)))
    c.executemany(statement, pairs)

    conn.commit()
    
#insert business/category pairs into the junction table
insert_businesses_categories(business_ids, category_ids)

In [136]:
#read the businesses/categories junction table back for a visual check
businesses_categories = pd.read_sql_query(sql="SELECT * FROM businesses_categories;", con=conn)
businesses_categories

Unnamed: 0,business_id,category_id
0,YtJXZdh3lnuH2JuDtXBqqQ,49
1,yc0AK8DAKTQF3JuDI1Ta4w,49
2,8AKa6ep2-fW4zFkvg_yhKw,49
3,o7ja0NftF0t8bIQJJgGP4Q,49
4,JwbfRncSOiv2akciJcwVHA,47
5,JwbfRncSOiv2akciJcwVHA,49
6,uBoaVbgLCPTwa2ab5890JA,47
7,uBoaVbgLCPTwa2ab5890JA,49
8,jYfy42Dx_in0SLYQIunehg,49
9,bUp8mGkEoz68vW4K_7ItVQ,49


In [137]:
#close connection
#closes the shared cursor `c` and the connection `conn` via the helper defined with them
close_c_conn()