In [1]:
import json
import pandas as pd
import requests

import synapseclient
from synapseclient import File

# Synapse client shared by the upload steps later in this notebook (`syn.store(...)`).
syn = synapseclient.Synapse()
# No credentials are passed explicitly — presumably login() falls back to locally
# cached/configured credentials; confirm before running on a new machine.
syn.login()

Welcome, Luke Waninger!



Load the categories from the Yelp documentation at https://www.yelp.com/developers/documentation/v3/all_category_list

In [2]:
# Download Yelp's full category list as JSON. The entries are consumed further down
# via their 'alias' and 'parents' fields.
r = requests.get(
    'https://www.yelp.com/developers/documentation/v3/all_category_list/categories.json',
    timeout=30,  # never hang the notebook indefinitely on a stalled connection
)
# Fail loudly on an HTTP error instead of silently leaving `j` undefined, which would
# only surface later as a confusing NameError in the mapping cell.
r.raise_for_status()
j = r.json()

Manual mappings, based primarily on the Yelp-defined hierarchy, although several have been manually tuned.

In [3]:
# Hand-tuned lookup table: each key is one of our place types and each value is the list
# of Yelp category aliases that roll up into it. Most lists also contain the key itself.
#
# NOTE(review): a few aliases are listed under more than one key ('professional' in
# repair and home_store, 'apartments' in lodging and home_store, 'wholesalers' in
# supermarket and consumer_goods), so the resolved type depends on the lookup order
# used by the consumer — confirm these duplicates are intentional.
YELP_TYPE_MAPPINGS = dict(
    government_offices=[
        'publicservicesgovt', 'animalshelters', 'government_offices'
    ],
    place_of_mourning=[
        'funeralservices', 'place_of_mourning'
    ],
    education=[
        'education', 'tastingclasses', 'specialtyschools', 'adultedu'
    ],
    place_of_worship=[
        'religiousorgs', 'place_of_worship'
    ],
    lodging=[
        'hotels', 'hotelstravel', 'agriturismi', 'apartments', 'condominiums', 'university_housing',
        'homelessshelters', 'lodging'
    ],
    entertainment=[
        'active', 'adultentertainment', 'artclasses', 'arts', 'artsandcrafts', 'entertainment',
        'bars', 'breweries', 'diving', 'festivals', 'martialarts', 'movietheaters',
        'museums', 'nightlife', 'tours', 'wineries', 'zoos', 'social_clubs', 'localflavor'
    ],
    health=[
        'dentalhygienists', 'dentists', 'fitness', 'gyms', 'health', 'medcenters', 'medicalspa',
        'opthamalogists', 'opticians', 'physicians', 'tcm', 'c_and_mh', 'acnetreatment', 'acupuncture',
        'addictionmedicine', 'allergist', 'alternativemedicine', 'anesthesiologists',
        'animalassistedtherapy'
    ],
    finance=[
        'estateplanning', 'financialservices', 'insurance', 'accountants', 'finance'
    ],
    repair=[
        'autoglass', 'autorepair', 'diagnosticservices', 'itservices', 'homeservices', 'repair',
        'junkremovalandhauling', 'laundryservices', 'localservices', 'musicinstrumentservices',
        'plumbing', 'professional'
    ],
    transit=[
        'airports', 'transport', 'travelservices', 'carrental', 'motorcycle_rental', 'trainstations', 'transit',
    ],
    # Both the misspelled and the corrected spellings ('japanse'/'japanese',
    # 'portugese'/'portuguese') are listed on purpose so either form matches.
    dining_out=[
        'cafes', 'food', 'restaurants', 'jpsweets', 'african', 'arabian', 'dining_out',
        'belgian', 'brazilian', 'caribbean', 'chinese', 'donburi', 'french', 'german', 'turkish',
        'italian', 'japanse', 'latin', 'malaysian', 'mediterranean', 'mexican', 'mideastern', 'polish',
        'portugese', 'spanish', 'portuguese', 'japanese'
    ],
    # The comma after 'home_store' matters: without it Python concatenates the adjacent
    # string literals into the single bogus alias 'home_storerealestateagents'.
    home_store=[
        'gardening', 'homeandgarden', 'professional', 'kitchenandbath', 'landscaping', 'realestate', 'home_store',
        'realestateagents', 'realestatesvcs', 'apartmentagents', 'apartments', 'appliances', 'appraisalservices'
    ],
    # NOTE(review): 'food, shopping' is ONE string containing a comma, so it only matches
    # a category literally named that. It may have been meant as two separate entries;
    # left untouched because splitting it would change the resulting mappings — confirm.
    supermarket=[
        'food, shopping', 'farms', 'wholesale_stores', 'wholesalers', 'gourmet', 'grocery', 'ethicgrocery', 'beverage_stores',
        'butcher', 'csa', 'convenience', 'farmersmarket', 'organic_stores', 'supermarket'
    ],
    automotive=[
        'auto', 'automotive'
    ],
    consumer_goods=[
        'flowers', 'bicycles', 'cannabis_clinics', 'fashion', 'shopping', 'partyequipmentrentals',
        'sportgoods', 'sportswear', 'wholesalers', 'pets', 'petstore', 'petservices',
        'accessories', 'petbreeders', 'antiques', 'shoppingcenters'
    ],
    personal_services=[
        'eventservices', 'beautysvc', 'hair', 'hairremoval', 'othersalons', 'psychic_astrology', 'skincare',
        'tanning', 'photographers', 'utilities', 'pet_sitting', 'aestheticians', 'animalholistic',
        'animalphysicaltherapy', '3dprinting', 'personal_services', 'lawyers', 'legalservices'
    ],
    park=[
        'parks', 'park'
    ],
    other_organization=[
        'massmedia', 'media', 'nonprofit', 'adoptionservices', 'advertising', 'other', 'other_organization'
    ]
)

Now we iterate through the categories. For each one we first look up its alias in the predefined mappings; only if the alias is not listed there do we fall back to checking its parents. Mostly, the parent will be our source, but in some instances we use the alias because it gives a 'finer-grained' mapping.

In [4]:
# Resolve every Yelp category to one of our place types, write the result to CSV and
# upload it to Synapse.
#
# Rows are collected in a plain list and turned into a DataFrame once at the end;
# growing a DataFrame row by row through `.loc` re-allocates on every insert.
yelp_records = []

for category in j:
    alias = category.get('alias')
    # Guard against a missing or null 'parents' field so the fallback below
    # cannot raise a TypeError.
    parents = category.get('parents') or []

    # 1) Prefer a direct hit on the alias itself; the first matching place type wins.
    mapping = 'none'
    for place_type, aliases in YELP_TYPE_MAPPINGS.items():
        if alias in aliases:
            mapping = place_type
            break

    # 2) Otherwise fall back to the category's parents.
    # NOTE(review): there is deliberately no `break` here — when parents match more than
    # one place type, the LAST matching type in dict order wins. This preserves the
    # existing output; confirm whether first-match was actually intended.
    if mapping == 'none':
        for place_type, aliases in YELP_TYPE_MAPPINGS.items():
            if any(p in aliases for p in parents):
                mapping = place_type

    # Categories that match nothing keep the literal mapping 'none'.
    yelp_records.append((alias, mapping))

yelp_mappings = pd.DataFrame(yelp_records, columns=['cat', 'mapping'])

# `index` expects a bool; False keeps the row index out of the CSV.
yelp_mappings.to_csv('yelp_mappings.csv', index=False)
# Bind and immediately delete the returned entity so the cell does not echo it.
t = syn.store(File(name='yelp_mappings', path='yelp_mappings.csv', parent='syn16816579')); del t


##################################################
 Uploading file to Synapse storage 
##################################################



In [5]:
# Lookup table from our place types (keys) to the Google Maps place types (values)
# that roll up into them. Key order and list order are significant for the flattened
# table built from this dict.
#
# NOTE(review): 'travel_agency' is listed under both 'repair' and 'transit', so it
# appears twice with different mappings in any table flattened from this dict — confirm.
# NOTE(review): the 'government_offices' list contains the singular 'government_office'
# — confirm that spelling is intended.
GMAP_TYPE_MAPPINGS = {
    'government_offices': [
        'post_office',
        'city_hall',
        'courthouse',
        'embassy',
        'local_government_office',
        'police',
        'fire_station',
        'government_office',
    ],
    'place_of_mourning': ['cemetery', 'funeral_home', 'place_of_mourning'],
    'education': ['school', 'university', 'education'],
    'place_of_worship': [
        'church',
        'hindu_temple',
        'mosque',
        'synagogue',
        'place_of_worship',
    ],
    'lodging': ['campground', 'lodging', 'rv_park'],
    'entertainment': [
        'bar',
        'amusement_park',
        'aquarium',
        'art_gallery',
        'bowling_alley',
        'casino',
        'movie_rental',
        'movie_theater',
        'museum',
        'night_club',
        'stadium',
        'zoo',
        'library',
        'entertainment',
    ],
    'health': [
        'dentist',
        'doctor',
        'gym',
        'hospital',
        'pharmacy',
        'physiotherapist',
        'health',
    ],
    'finance': ['atm', 'bank', 'insurance_agency', 'finance', 'accounting'],
    'repair': [
        'car_repair',
        'car_wash',
        'electrician',
        'plumber',
        'general_contractor',
        'roofing_contractor',
        'painter',
        'locksmith',
        'travel_agency',
        'repair',
    ],
    'transit': [
        'airport',
        'bus_station',
        'taxi_stand',
        'train_station',
        'transit_station',
        'subway_station',
        'travel_agency',
        'transit',
    ],
    'dining_out': [
        'bakery',
        'cafe',
        'meal_delivery',
        'meal_takeaway',
        'restaurant',
        'dining_out',
        'food',
    ],
    'home_store': [
        'furniture_store',
        'electronics_store',
        'hardware_store',
        'home_goods_store',
        'moving_company',
        'real_estate_agency',
        'storage',
        'laundry',
        'home_store',
    ],
    'supermarket': [
        'convenience_store',
        'liquor_store',
        'supermarket',
        'grocery_or_supermarket',
    ],
    'automotive': ['car_dealer', 'car_rental', 'gas_station', 'parking', 'automotive'],
    'consumer_goods': [
        'book_store',
        'bicycle_store',
        'clothing_store',
        'department_store',
        'florist',
        'jewelry_store',
        'pet_store',
        'shoe_store',
        'shopping_mall',
        'consumer_goods',
        'store',
    ],
    'personal_services': [
        'beauty_salon',
        'hair_care',
        'spa',
        'personal_services',
        'lawyer',
        'veterinary_care',
    ],
    'other_organization': ['other'],
    'park': ['parks', 'park'],
}

In [6]:
# Flatten GMAP_TYPE_MAPPINGS into a two-column table — one row per
# (Google Maps type, our place type) pair, in dict/list order — then write it to CSV
# and upload it to Synapse.
#
# The frame is built in a single call; inserting rows one at a time through `.loc`
# with a manual counter re-allocates on every insert.
gmap_mappings = pd.DataFrame(
    [
        (gmap_type, place_type)
        for place_type, gmap_types in GMAP_TYPE_MAPPINGS.items()
        for gmap_type in gmap_types
    ],
    columns=['cat', 'mapping'],
)

# `index` expects a bool; False keeps the row index out of the CSV.
gmap_mappings.to_csv('gmap_mappings.csv', index=False)
# Bind and immediately delete the returned entity so the cell does not echo it.
t = syn.store(File(name='gmap_mappings', path='gmap_mappings.csv', parent='syn16816579')); del t


##################################################
 Uploading file to Synapse storage 
##################################################

