# Setup

In [1]:
import json
from os import path
import pandas as pd

In [4]:
# Folder that holds every input file used by this notebook.
dataset_dir = "./yelp_dataset"

# Derive the three input paths from their file names in one pass so the
# directory is joined in exactly one place.
business_filepath, review_filepath, restaurants_categories_filepath = (
    path.join(dataset_dir, filename)
    for filename in ("business.json", "review.json", "restaurant_categories.json")
)

# Manually Extract Categories

In [9]:
# Hand-picked category labels that this analysis counts as "Asian" cuisine.
# Note that the list is broad: besides East/South/Southeast Asian labels it also
# contains Middle Eastern / West Asian ones (e.g. "Lebanese", "Israeli",
# "Turkish", "Persian/Iranian") and a few product-style labels ("Bubble Tea",
# "Kombucha"). Matching against these labels is by exact string equality.
asian_categories = [
    "Bubble Tea","Izakaya","Vietnamese","Pan Asian","Lebanese","Dumplings","Szechuan",
    "Shaved Snow","Chinese","Japanese Curry","Japanese","Mongolian","Armenian","Syrian",
    "Hakka","Kebab","Donburi","Hot Pot","Tonkatsu","Teppanyaki","Pita",
    "Taiwanese","Oriental","Sushi Bars","Kombucha","Conveyor Belt Sushi","Ramen",
    "Middle Eastern","Filipino","Pakistani","Thai","Fuzhou","Himalayan/Nepalese","Bangladeshi",
    "Dim Sum","Singaporean","Uzbek","Korean","Shanghainese","Burmese","Noodles","Israeli",
    "Indian","Arabic","Asian Fusion","Cambodian","Malaysian","Hainan","Indonesian","Laotian",
    "Falafel","Cantonese","Hong Kong Style Cafe","Sri Lankan","Turkish","Persian/Iranian"
]

# Hand-picked category labels that this analysis counts as "American / European"
# cuisine. The two lists share no label, so a business is only labelled "Both"
# when it carries at least one label from each list.
american_european_categories = [
    "French","Brasseries","Bistros","Burgers","Macarons","Bagels","Sicilian","Pretzels",
    "Tapas Bars","British","Pizza","Greek","Serbo Croatian",
    "Modern European","Gelato","Creperies","Pasta Shops","Irish","Scottish",
    "German","Ukrainian","Polish","Soul Food","Georgian","Sardinian","Tex-Mex",
    "Fondue","Cucina campana","Czech","Portuguese","Irish Pub","Cheesesteaks",
    "Roman","Belgian","Russian","Scandinavian","Fish & Chips","Spanish",
    "Guamanian","Pancakes","Cupcakes","Patisserie/Cake Shop",
    "American (Traditional)","Tapas/Small Plates","Donairs","Calabrian","Poutineries",
    "Waffles","Southern","American (New)","New Mexican Cuisine",
    "Donuts","Supper Clubs","Iberian","Italian","Kosher","Austrian","Tuscan","Hungarian",
    "Eastern European","Hot Dogs"
]


# Combined allow-list (56 Asian + 61 American/European labels) that
# business_satisfies_categories checks each business against.
categories = asian_categories + american_european_categories
# Sanity check: the printed count should equal the sum of both list lengths.
print(f"Number of categories: {len(categories)}")

Number of categories: 117


# Filter Businesses and Reviews

In [10]:
def business_satisfies_categories(business, allowed_categories=None):
    """Tell whether a business carries at least one tracked category label.

    Args:
        business: Mapping-like record (a dict or a DataFrame row) whose
            ``'categories'`` entry is a ``", "``-separated string of labels,
            or a missing value.
        allowed_categories: Optional collection of labels to match against.
            Defaults to the notebook-level ``categories`` list, looked up at
            call time.

    Returns:
        bool: True if any label of the business is in ``allowed_categories``;
        False otherwise, including when the categories value is missing,
        empty, or not a string.
    """
    if allowed_categories is None:
        allowed_categories = categories

    raw_categories = business['categories']
    # A missing field can surface as None or as NaN (a truthy float) once the
    # records sit in a DataFrame; a plain truthiness test lets NaN through and
    # then fails on .split(), so require an actual non-empty string.
    if not isinstance(raw_categories, str) or not raw_categories:
        return False

    return any(
        label in allowed_categories
        for label in raw_categories.split(", ")
    )

def get_businesses(filepath=None):
    """Load the business records, keep those in a tracked category, and label their cuisine.

    The input is read as JSON Lines (one JSON object per line).

    Args:
        filepath: Optional path to the business file. Defaults to the
            notebook-level ``business_filepath``, looked up at call time.

    Returns:
        pandas.DataFrame: The rows for which ``business_satisfies_categories``
        returns True, keeping their row index from the full file, with an
        added ``cuisine`` column holding ``"Asian"``, ``"American European"``
        or ``"Both"``.
    """
    if filepath is None:
        filepath = business_filepath

    # JSON text is UTF-8; relying on the platform default encoding (e.g.
    # cp1252 on Windows) can fail or mis-decode non-ASCII business names.
    # Iterating the handle parses line by line instead of first holding the
    # whole file in memory; whitespace-only lines are skipped.
    with open(filepath, "r", encoding="utf-8") as f:
        records = [json.loads(line) for line in f if line.strip()]
    businesses = pd.DataFrame(records)

    keep = businesses.apply(business_satisfies_categories, axis=1)
    # .copy() makes the filtered frame independent of the full one, so adding
    # a column below does not write into a slice (SettingWithCopyWarning).
    businesses = businesses[keep].copy()

    # add column to differentiate between Asian and American / European cuisine
    def cuisine(category_string):
        labels = category_string.split(", ")
        is_asian = any(label in asian_categories for label in labels)
        is_american_european = any(
            label in american_european_categories for label in labels
        )

        if is_asian and is_american_european:
            return "Both"
        # Every remaining row matched at least one list in the filter above,
        # so "not Asian" means American / European here.
        return "Asian" if is_asian else "American European"

    # Mapping over the single column (rather than apply(axis=1) over rows)
    # still yields a Series when the filtered frame is empty.
    businesses['cuisine'] = businesses['categories'].map(cuisine)

    return businesses
    
# Build the filtered, cuisine-labelled business table once; the review cell
# below joins against it on business_id.
businesses = get_businesses()

In [11]:
# get reviews filtered and joined by businesses
def get_reviews(businesses, filepath=None):
    """Load the reviews that belong to the given businesses and join the two tables.

    The input is read as JSON Lines (one JSON object per line).

    Args:
        businesses: DataFrame with a ``business_id`` column; its remaining
            columns are attached to every matching review.
        filepath: Optional path to the review file. Defaults to the
            notebook-level ``review_filepath``, looked up at call time.

    Returns:
        pandas.DataFrame: Inner join of reviews and ``businesses`` on
        ``business_id`` (one row per review of a listed business).
    """
    if filepath is None:
        filepath = review_filepath

    wanted_ids = set(businesses['business_id'])

    kept = []
    fallback_columns = None
    # Stream the file and drop non-matching reviews while reading: the inner
    # join would discard them anyway, and this avoids materialising every
    # review in memory first. UTF-8 is given explicitly because the platform
    # default encoding may not be able to decode the review text.
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            review = json.loads(line)
            if fallback_columns is None:
                # Remember the review schema so that a run with zero matches
                # still yields a frame with a business_id column to join on.
                fallback_columns = list(review)
            if review.get('business_id') in wanted_ids:
                kept.append(review)

    if kept:
        reviews = pd.DataFrame(kept)
    else:
        reviews = pd.DataFrame(columns=fallback_columns)

    return pd.merge(reviews, businesses, on='business_id', how='inner')

# Join every review onto its (already filtered and cuisine-labelled) business.
reviews = get_reviews(businesses)    

In [8]:
# Save the joined reviews to "reviews.csv" in the current working directory;
# index=False leaves the DataFrame's row index out of the file.
# NOTE(review): this cell's execution count (8) is lower than the counts of the
# cells that build `businesses` and `reviews` (10, 11), so the CSV on disk may
# come from an earlier run. Re-run this cell after them to refresh it.
reviews.to_csv("reviews.csv", index=False)
