In [1]:
# Import libraries
import pandas as pd

In [2]:
# Load the business table produced by the previous stage into `df_b`.
# The path is relative, so it resolves against the kernel's working directory.
df_b = pd.read_csv('../datasets/business4.csv')

In [3]:
# Print a schema overview: row count, columns, non-null counts, and dtypes.
df_b.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19534 entries, 0 to 19533
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   business_id        19534 non-null  object 
 1   name               19534 non-null  object 
 2   address            19467 non-null  object 
 3   city               19534 non-null  object 
 4   state              19534 non-null  object 
 5   postal_code        19532 non-null  float64
 6   latitude           19534 non-null  float64
 7   longitude          19534 non-null  float64
 8   stars              19534 non-null  float64
 9   review_count       19534 non-null  int64  
 10  is_open            19534 non-null  int64  
 11  attributes         19527 non-null  object 
 12  categories         19534 non-null  object 
 13  hours              18432 non-null  object 
 14  stability_score    19534 non-null  float64
 15  loyalty_score      19534 non-null  float64
 16  reliability_score  195

In [4]:
# Peek at the raw comma-separated category strings next to each business id.
preview_cols = ["business_id", "categories"]
df_b.loc[:, preview_cols].head()

Unnamed: 0,business_id,categories
0,MTSW4McQd7CbVtyjqoe9mw,"Restaurants, Food, Bubble Tea, Coffee & Tea, B..."
1,il_Ro8jwPlHresjw9EGmBg,"American (Traditional), Restaurants, Diners, B..."
2,0bPLkL0QhhPO5kt1_EXmNQ,"Food, Delis, Italian, Bakeries, Restaurants"
3,MUTTqe8uqyMdBl186RmNeA,"Sushi Bars, Restaurants, Japanese"
4,ROeacJQwBeh05Rqg7F6TCg,"Korean, Restaurants"


In [5]:
# -----------------------------
# 1. Define Category Mapping
# -----------------------------
# Each key is a consolidated category label (it becomes a feature column name);
# each value lists the raw Yelp category strings that belong to that group.
# The mapping is later used to assign every business to one or more groups.
#
# Notes:
# - Groups are not mutually exclusive: "Bagels", "Donuts" and "Pretzels" each
#   appear in two groups, so a single label can switch on two indicator columns.
# - Keywords should be spelled exactly like the category label they target, so
#   the mapping behaves the same under substring and exact-label comparison.
#   "Desserts" is therefore written in its plural form
#   (assumes the Yelp label is "Desserts" — verify against the data).


category_map = {
    "c_nightlife": [
        "Bars", "Lounges", "Pubs", "Sports Bars", "Dive Bars", "Cocktail Bars",
        "Whiskey Bars", "Wine Bars", "Beer Bar", "Champagne Bars", "Hookah Bars",
        "Cigar Bars", "Tiki Bars", "Speakeasies", "Nightlife"
    ],
    "c_cafes": [
        "Cafes", "Coffee & Tea", "Coffee Roasteries", "Themed Cafes",
        "Internet Cafes", "Parent Cafes", "Hong Kong Style Cafe", "Bubble Tea"
    ],
    "c_brunch": [
        "Breakfast & Brunch", "Pancakes", "Waffles", "Bagels", "Cafeteria", "Diners"
    ],
    "c_sandwich": [
        "Sandwiches", "Delis", "Delicatessen", "Cheesesteaks", "Hot Dogs", "Wraps", "Pretzels"
    ],
    "c_american_trad": [
        "American (Traditional)", "Soul Food", "Southern", "Comfort Food",
        "Barbeque", "BBQ", "Steakhouses"
    ],
    "c_american_new": [
        "American (New)", "Gastropubs", "Modern European", "New American"
    ],
    "c_italian": [
        "Italian", "Pasta Shops", "Tuscan", "Sicilian", "Roman", "Pizza"
    ],
    "c_burgers": [
        "Burgers", "Fast Food (Burger Chains)", "Drive-Thru Bars"
    ],
    "c_mexican": [
        "Mexican", "Tex-Mex", "Tacos", "New Mexican Cuisine", "Latin American"
    ],
    "c_seafood": [
        "Seafood", "Fish & Chips", "Seafood Markets", "Smokehouse", "Grill Services"
    ],
    "c_fastfood": [
        "Fast Food", "Chicken Shop", "Chicken Wings", "Donuts", "Buffets", "Poutineries"
    ],
    "c_asian_fusion": [
        "Asian Fusion", "Pan Asian", "Noodles", "Hot Pot", "Dim Sum", "Ramen",
        "Japanese Curry", "Izakaya", "Teppanyaki", "Sushi Bars", "Japanese", "Korean",
        "Chinese", "Thai", "Vietnamese", "Indonesian", "Malaysian", "Singaporean",
        "Filipino", "Burmese", "Cantonese", "Szechuan", "Himalayan/Nepalese", "Taiwanese"
    ],
    "c_bakeries": [
        "Bakeries", "Patisserie/Cake Shop", "Cupcakes", "Macarons", "Bagels",
        "Donuts", "Desserts", "Pretzels", "Creperies"
    ],
    "c_dietary": [
        "Vegetarian", "Vegan", "Gluten-Free", "Halal", "Kosher"
    ]
}


In [6]:
# -----------------------------
# 2. Apply Category Mapping
# -----------------------------
# For each mapped category, create a binary indicator column (1 = category matched).
#
# A keyword matches only when it equals one of the business's comma-separated
# category labels (case-insensitive, surrounding whitespace ignored). A plain
# substring test is avoided because short keywords fire inside unrelated labels:
# "Bars" is contained in "Sushi Bars", which would flag a sushi restaurant as
# nightlife, and "Pubs" is contained in "Gastropubs".

# Normalise every `categories` string into a set of labels once, up front.
# Missing or non-string values become an empty set instead of raising.
label_sets = (
    df_b["categories"]
    .fillna("")
    .str.lower()
    .str.split(",")
    .apply(
        lambda parts: {part.strip() for part in parts if part.strip()}
        if isinstance(parts, list)
        else set()
    )
)

# Every distinct label present in the data; used to spot keywords that can never match.
all_labels = set().union(*label_sets)

unmatched = {}
for col, keywords in category_map.items():
    wanted = {keyword.strip().lower() for keyword in keywords}
    # 1 when the business carries at least one label from this group, else 0.
    df_b[col] = label_sets.apply(
        lambda labels, wanted=wanted: int(not wanted.isdisjoint(labels))
    )
    missing = [k for k in keywords if k.strip().lower() not in all_labels]
    if missing:
        unmatched[col] = missing

# Exact matching makes keyword spelling significant, so surface any keyword that
# equals no label anywhere in the data (likely a typo or a singular/plural slip).
if unmatched:
    print("Keywords with no exactly matching category label:", unmatched)

# Create an "other" category that flags businesses
# which do not belong to any predefined category group
category_cols = list(category_map.keys())
df_b["c_other"] = (df_b[category_cols].sum(axis=1) == 0).astype(int)

# Preview the result
df_b.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,c_american_new,c_italian,c_burgers,c_mexican,c_seafood,c_fastfood,c_asian_fusion,c_bakeries,c_dietary,c_other
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107.0,39.955505,-75.155564,4.0,80,...,0,0,0,0,0,0,0,1,0,0
1,il_Ro8jwPlHresjw9EGmBg,Denny's,8901 US 31 S,Indianapolis,IN,46227.0,39.637133,-86.127217,2.5,28,...,0,0,0,0,0,0,0,0,0,0
2,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771.0,27.916116,-82.760461,4.5,100,...,0,1,0,0,0,0,0,1,0,0
3,MUTTqe8uqyMdBl186RmNeA,Tuna Bar,205 Race St,Philadelphia,PA,19106.0,39.953949,-75.143226,4.0,245,...,0,0,0,0,0,0,1,0,0,0
4,ROeacJQwBeh05Rqg7F6TCg,BAP,1224 South St,Philadelphia,PA,19147.0,39.943223,-75.162568,4.5,205,...,0,0,0,0,0,0,1,0,0,0


In [7]:
# Persist the frame, including the category indicator columns added above,
# for the next stage. `index=False` keeps the RangeIndex out of the file.
df_b.to_csv('../datasets/business5.csv', index=False)