In [None]:
# 1. Import Libraries and Load Files

In [1]:
# Import libraries
import numpy as np
import pandas as pd
from pandas import json_normalize
import ast

from sklearn.neighbors import BallTree

from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

In [2]:
# Load the business table into df_b (one row per business; later cells add feature columns to it)
df_b = pd.read_csv('../datasets/business5.csv')

In [3]:
# Load the filtered review table into df_r (used later for review-length, rating-spread and sentiment features)
df_r = pd.read_parquet("../datasets/parquet_file/review_filtered.parquet")

# 2. Feature Engineering for ML

### (1) Attribute-Derived Columns

In [4]:
# Turn the stringified attribute dictionaries into real dict objects;
# values that are not strings (e.g. NaN) are passed through untouched.
def _parse_attributes(raw):
    return ast.literal_eval(raw) if isinstance(raw, str) else raw

df_b['attributes'] = df_b['attributes'].apply(_parse_attributes)

# Expand the dictionaries of every business that has attributes
# into a flat table with one column per attribute key.
rows_with_attributes = df_b['attributes'].notna()
attr_flat = json_normalize(df_b.loc[rows_with_attributes, 'attributes'])

In [5]:
# attr_flat was built from the rows of df_b whose attributes are present,
# in the same order, so their business_id values line up positionally.
rows_with_attributes = df_b['attributes'].notna()
attr_flat['business_id'] = df_b.loc[rows_with_attributes, 'business_id'].to_numpy()

# Show the first rows of the flattened attribute table
attr_flat.head()

Unnamed: 0,AcceptsInsurance,AgesAllowed,Alcohol,Ambience,BYOB,BYOBCorkage,BestNights,BikeParking,BusinessAcceptsBitcoin,BusinessAcceptsCreditCards,...,RestaurantsDelivery,RestaurantsGoodForGroups,RestaurantsPriceRange2,RestaurantsReservations,RestaurantsTableService,RestaurantsTakeOut,Smoking,WheelchairAccessible,WiFi,business_id
0,,,u'none',,,,,True,,False,...,False,,1,,,True,,,u'free',MTSW4McQd7CbVtyjqoe9mw
1,,,'none',"{'touristy': None, 'hipster': None, 'romantic'...",,,,False,,True,...,True,True,1,False,,True,,,u'no',il_Ro8jwPlHresjw9EGmBg
2,,,u'none',"{'romantic': False, 'intimate': False, 'touris...",,,,True,False,True,...,True,False,1,False,False,True,,True,u'no',0bPLkL0QhhPO5kt1_EXmNQ
3,,,'full_bar',"{'touristy': False, 'hipster': False, 'romanti...",,,,,,True,...,True,True,2,True,True,True,,True,'free',MUTTqe8uqyMdBl186RmNeA
4,,,u'none',"{'touristy': False, 'hipster': False, 'romanti...",,,,True,,True,...,,False,1,False,True,True,,,u'no',ROeacJQwBeh05Rqg7F6TCg


In [6]:
# Inspect column names, non-null counts and dtypes of the flattened attributes
attr_flat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19527 entries, 0 to 19526
Data columns (total 39 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   AcceptsInsurance            0 non-null      object
 1   AgesAllowed                 36 non-null     object
 2   Alcohol                     17679 non-null  object
 3   Ambience                    18175 non-null  object
 4   BYOB                        2141 non-null   object
 5   BYOBCorkage                 841 non-null    object
 6   BestNights                  2535 non-null   object
 7   BikeParking                 16672 non-null  object
 8   BusinessAcceptsBitcoin      3402 non-null   object
 9   BusinessAcceptsCreditCards  18921 non-null  object
 10  BusinessParking             19041 non-null  object
 11  ByAppointmentOnly           1794 non-null   object
 12  Caters                      16245 non-null  object
 13  CoatCheck                   2263 non-null   ob

In [7]:
# -------------------------
# 1. a_full_expansion
# -------------------------
# A business gets a_full_expansion = 1 only when delivery, take-out
# and catering are all flagged as true.
flag_cols = ["RestaurantsDelivery", "RestaurantsTakeOut", "Caters"]
cols = ["business_id", *flag_cols]
attr_subset = attr_flat[cols].copy()

# Bring every flag to a lower-case string so 'True' and 'true' compare equal
for flag in flag_cols:
    lowered = attr_subset[flag].astype(str).str.lower()
    attr_subset[flag] = lowered.replace({"none": None})

# 1 when all three flags are "true", otherwise 0
all_offered = attr_subset[flag_cols].eq("true").all(axis=1)
attr_subset["a_full_expansion"] = all_offered.astype(int)

# Attach the flag to df_b; businesses without attribute info get 0
df_b = df_b.merge(
    attr_subset[["business_id", "a_full_expansion"]],
    on="business_id",
    how="left",
).assign(
    a_full_expansion=lambda frame: frame["a_full_expansion"].fillna(0).astype(int)
)

# Check results
df_b.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,c_italian,c_burgers,c_mexican,c_seafood,c_fastfood,c_asian_fusion,c_bakeries,c_dietary,c_other,a_full_expansion
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107.0,39.955505,-75.155564,4.0,80,...,0,0,0,0,0,0,1,0,0,0
1,il_Ro8jwPlHresjw9EGmBg,Denny's,8901 US 31 S,Indianapolis,IN,46227.0,39.637133,-86.127217,2.5,28,...,0,0,0,0,0,0,0,0,0,0
2,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771.0,27.916116,-82.760461,4.5,100,...,1,0,0,0,0,0,1,0,0,1
3,MUTTqe8uqyMdBl186RmNeA,Tuna Bar,205 Race St,Philadelphia,PA,19106.0,39.953949,-75.143226,4.0,245,...,0,0,0,0,0,1,0,0,0,0
4,ROeacJQwBeh05Rqg7F6TCg,BAP,1224 South St,Philadelphia,PA,19147.0,39.943223,-75.162568,4.5,205,...,0,0,0,0,0,1,0,0,0,0


In [8]:
# -------------------------
# 2. a_reservation_model
# -------------------------
# A business gets a_reservation_model = 1 when at least two of
# reservations / table service / by-appointment-only are flagged as true.
flag_cols = ["RestaurantsReservations", "RestaurantsTableService", "ByAppointmentOnly"]
cols = ["business_id", *flag_cols]
attr_subset = attr_flat[cols].copy()

# Lower-case each flag's string form, then encode it as 1 ("true") or 0 (anything else)
for flag in flag_cols:
    lowered = attr_subset[flag].astype(str).str.lower().replace({"none": None})
    attr_subset[flag] = lowered.eq("true").astype(int)

# Count the positive flags per business and threshold at two
positive_flags = attr_subset[flag_cols].sum(axis=1)
attr_subset["a_reservation_model"] = positive_flags.ge(2).astype(int)

# Attach the flag to df_b; businesses without attribute info get 0
df_b = df_b.merge(
    attr_subset[["business_id", "a_reservation_model"]],
    on="business_id",
    how="left",
).assign(
    a_reservation_model=lambda frame: frame["a_reservation_model"].fillna(0).astype(int)
)

# Check results
df_b.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,c_burgers,c_mexican,c_seafood,c_fastfood,c_asian_fusion,c_bakeries,c_dietary,c_other,a_full_expansion,a_reservation_model
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107.0,39.955505,-75.155564,4.0,80,...,0,0,0,0,0,1,0,0,0,0
1,il_Ro8jwPlHresjw9EGmBg,Denny's,8901 US 31 S,Indianapolis,IN,46227.0,39.637133,-86.127217,2.5,28,...,0,0,0,0,0,0,0,0,0,0
2,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771.0,27.916116,-82.760461,4.5,100,...,0,0,0,0,0,1,0,0,1,0
3,MUTTqe8uqyMdBl186RmNeA,Tuna Bar,205 Race St,Philadelphia,PA,19106.0,39.953949,-75.143226,4.0,245,...,0,0,0,0,1,0,0,0,0,1
4,ROeacJQwBeh05Rqg7F6TCg,BAP,1224 South St,Philadelphia,PA,19147.0,39.943223,-75.162568,4.5,205,...,0,0,0,0,1,0,0,0,0,0


In [9]:
# -------------------------
# 3. a_parking_space
# -------------------------
# Safely parse BusinessParking column
def safe_eval(x):
    """Parse a BusinessParking cell into a dictionary without raising.

    Parameters
    ----------
    x : Any
        Raw cell value: an already-parsed dict, a string holding a dict
        literal (e.g. "{'garage': False, 'lot': True}"), or anything else
        (NaN, 'None', ...).

    Returns
    -------
    dict
        The parsed dictionary, or an empty dict when the value is missing,
        is not a dict literal, or cannot be parsed.
    """
    # Values that are already dictionaries are kept instead of being discarded
    if isinstance(x, dict):
        return x
    if not isinstance(x, str):
        return {}

    text = x.strip()
    if not text.startswith("{"):
        return {}

    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError):
        return {}

    # A string such as "{1, 2}" also starts with "{" but evaluates to a set
    return parsed if isinstance(parsed, dict) else {}

# Parse every BusinessParking cell with safe_eval and keep the result in a helper column
attr_flat["BusinessParking_dict"] = attr_flat["BusinessParking"].apply(safe_eval)

# Extract parking features and determine availability
def has_parking(d):
    """Return 1 if any known parking option in `d` is exactly True, else 0.

    Non-dict inputs yield 0. Only the boolean True counts; truthy values
    such as 1 or 'True' do not.
    """
    if isinstance(d, dict):
        for option in ("street", "lot", "valet", "validated", "garage"):
            if d.get(option) is True:
                return 1
    return 0

# 1 when at least one parking option is available, 0 otherwise
attr_flat["a_parking_space"] = attr_flat["BusinessParking_dict"].map(has_parking)

# Attach the flag to df_b by business_id; businesses without
# attribute info are treated as having no parking (0)
df_b = df_b.merge(
    attr_flat[["business_id", "a_parking_space"]],
    on="business_id",
    how="left",
).assign(
    a_parking_space=lambda frame: frame["a_parking_space"].fillna(0).astype(int)
)

# Check results
df_b.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,c_mexican,c_seafood,c_fastfood,c_asian_fusion,c_bakeries,c_dietary,c_other,a_full_expansion,a_reservation_model,a_parking_space
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107.0,39.955505,-75.155564,4.0,80,...,0,0,0,0,1,0,0,0,0,1
1,il_Ro8jwPlHresjw9EGmBg,Denny's,8901 US 31 S,Indianapolis,IN,46227.0,39.637133,-86.127217,2.5,28,...,0,0,0,0,0,0,0,0,0,1
2,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771.0,27.916116,-82.760461,4.5,100,...,0,0,0,0,1,0,0,1,0,1
3,MUTTqe8uqyMdBl186RmNeA,Tuna Bar,205 Race St,Philadelphia,PA,19106.0,39.953949,-75.143226,4.0,245,...,0,0,0,1,0,0,0,0,1,1
4,ROeacJQwBeh05Rqg7F6TCg,BAP,1224 South St,Philadelphia,PA,19147.0,39.943223,-75.162568,4.5,205,...,0,0,0,1,0,0,0,0,0,1


In [10]:
# -------------------------
# 4. a_outdoor_seating
# -------------------------
# Bring in the raw OutdoorSeating attribute and rename it to the a_ convention
outdoor_seating = attr_flat[["business_id", "OutdoorSeating"]]
df_b = (
    df_b
    .merge(outdoor_seating, on="business_id", how="left")
    .rename(columns={"OutdoorSeating": "a_outdoor_seating"})
)

In [11]:
# -------------------------
# 5. a_price_range
# -------------------------
# Bring in the raw RestaurantsPriceRange2 attribute and rename it to the a_ convention
price_range = attr_flat[["business_id", "RestaurantsPriceRange2"]]
df_b = (
    df_b
    .merge(price_range, on="business_id", how="left")
    .rename(columns={"RestaurantsPriceRange2": "a_price_range"})
)

In [12]:
# -------------------------
# 6. a_good_for_group
# -------------------------
# Bring in the raw RestaurantsGoodForGroups attribute and rename it to the a_ convention
good_for_groups = attr_flat[["business_id", "RestaurantsGoodForGroups"]]
df_b = (
    df_b
    .merge(good_for_groups, on="business_id", how="left")
    .rename(columns={"RestaurantsGoodForGroups": "a_good_for_group"})
)

In [13]:
# -------------------------
# 7. a_good_for_kids
# -------------------------
# Bring in the raw GoodForKids attribute and rename it to the a_ convention
good_for_kids = attr_flat[["business_id", "GoodForKids"]]
df_b = (
    df_b
    .merge(good_for_kids, on="business_id", how="left")
    .rename(columns={"GoodForKids": "a_good_for_kids"})
)

In [14]:
# -------------------------
# 8. a_alcohol
# -------------------------
def clean_alcohol(x):
    """Standardise a raw Alcohol attribute value.

    Missing values become None. Strings are trimmed, lower-cased and
    stripped of the u'' prefix and quotes; the result is kept only if it
    is 'none', 'beer_and_wine' or 'full_bar', otherwise None is returned.
    Non-string, non-missing values are returned unchanged.
    """
    if pd.isna(x):
        return None
    if not isinstance(x, str):
        return x

    # e.g. "u'full_bar'" -> "full_bar"
    label = x.strip().lower().replace("u'", "").replace("'", "")
    return label if label in ('none', 'beer_and_wine', 'full_bar') else None

# Standardise the Alcohol attribute value by value
attr_flat['Alcohol_clean'] = attr_flat['Alcohol'].map(clean_alcohol)

# Attach the cleaned value to df_b and rename it to the a_ convention
df_b = (
    df_b
    .merge(attr_flat[['business_id', 'Alcohol_clean']], on='business_id', how='left')
    .rename(columns={'Alcohol_clean': 'a_alcohol'})
)

In [15]:
# -------------------------
# 9. a_has_tv
# -------------------------
# Bring in the raw HasTV attribute and rename it to the a_ convention
has_tv = attr_flat[["business_id", "HasTV"]]
df_b = (
    df_b
    .merge(has_tv, on="business_id", how="left")
    .rename(columns={"HasTV": "a_has_tv"})
)

In [16]:
# -------------------------
# 10. a_noise_level
# -------------------------
def clean_noise(x):
    """Standardise a raw NoiseLevel attribute value.

    Missing values become None. Strings are trimmed, lower-cased and
    stripped of the u'' prefix and quotes; the result is kept only if it
    is 'quiet', 'average', 'loud' or 'very_loud', otherwise None is
    returned. Non-string, non-missing values are returned unchanged.
    """
    if pd.isna(x):
        return None
    if not isinstance(x, str):
        return x

    # e.g. "u'very_loud'" -> "very_loud"
    label = x.strip().lower().replace("u'", "").replace("'", "")
    return label if label in ('quiet', 'average', 'loud', 'very_loud') else None

# Standardise the NoiseLevel attribute value by value
attr_flat['NoiseLevel_clean'] = attr_flat['NoiseLevel'].map(clean_noise)

# Attach the cleaned value to df_b and rename it to the a_ convention
df_b = (
    df_b
    .merge(attr_flat[['business_id', 'NoiseLevel_clean']], on='business_id', how='left')
    .rename(columns={'NoiseLevel_clean': 'a_noise_level'})
)

In [17]:
# -------------------------
# 11. a_ambience
# -------------------------
# Parse the Ambience field into a dictionary
def parse_ambience(x):
    """Parse an Ambience cell into a dictionary without raising.

    Parameters
    ----------
    x : Any
        Raw cell value: an already-parsed dict, a string holding a dict
        literal, or anything else (NaN, None, 'None', ...).

    Returns
    -------
    dict
        The parsed dictionary, or an empty dict when the value is missing,
        cannot be parsed, or parses to something that is not a dict
        (for example the string 'None').
    """
    if isinstance(x, dict):
        return x
    # NaN / None / any other non-string value carries no ambience information
    if not isinstance(x, str):
        return {}

    try:
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Exactly the failures ast.literal_eval documents for bad input;
        # anything else is a genuine bug and should surface.
        return {}

    return parsed if isinstance(parsed, dict) else {}

# Extract the primary ambience label based on available True values
def extract_main_ambience(d):
    """Pick a single ambience label from a parsed Ambience dictionary.

    Returns the first key, in the dictionary's own order, whose value is
    exactly True and which is a recognised ambience class. Returns 'none'
    for non-dict input, an empty dict, or when no such key exists.
    """
    if not isinstance(d, dict) or not d:
        return 'none'

    recognised = (
        'hipster', 'romantic', 'divey', 'classy', 'casual', 'trendy',
        'upscale', 'intimate', 'touristy', 'none',
    )

    matches = (label for label, flag in d.items() if flag is True and label in recognised)
    return next(matches, 'none')

# Parse each Ambience cell, then reduce it to a single label
attr_flat['Ambience_dict'] = attr_flat['Ambience'].map(parse_ambience)
attr_flat['Ambience_clean'] = attr_flat['Ambience_dict'].map(extract_main_ambience)

# Attach the label to df_b and rename it to the a_ convention
df_b = (
    df_b
    .merge(attr_flat[['business_id', 'Ambience_clean']], on='business_id', how='left')
    .rename(columns={'Ambience_clean': 'a_ambience'})
)


In [18]:
# -------------------------
# 12. a_happy_hour
# -------------------------
# Bring in the raw HappyHour attribute and rename it to the a_ convention
happy_hour = attr_flat[["business_id", "HappyHour"]]
df_b = (
    df_b
    .merge(happy_hour, on="business_id", how="left")
    .rename(columns={"HappyHour": "a_happy_hour"})
)

In [19]:
# Attribute columns whose textual "missing" markers are converted to real NaN.
# a_alcohol is not in this list, so its 'none' value is left as-is.
cols_to_clean = [
    'a_outdoor_seating', 'a_price_range', 'a_good_for_group',
    'a_good_for_kids', 'a_has_tv', 'a_noise_level', 'a_ambience', 'a_happy_hour'
]

# Strings that stand in for a missing value
missing_markers = ['None', 'none', 'NaN', 'nan']

for col in cols_to_clean:
    df_b[col] = df_b[col].replace(missing_markers, np.nan)

### (2) Features Derived from Operating Hours

In [20]:
# -------------------------
# 1. Operating availability by weekday and time slot
# -------------------------
# Parse the stringified hours dictionaries; non-string values are left unchanged
df_b['hours'] = df_b['hours'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)

# Weekday keys used in the hours dictionaries
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# One column per weekday holding that day's hours string (None when unavailable)
for day in days:
    df_b[day] = df_b['hours'].apply(
        lambda schedule: schedule.get(day) if isinstance(schedule, dict) else None
    )

# Preview the per-day operating hours
df_b[days].head()

Unnamed: 0,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
0,7:0-20:0,7:0-20:0,7:0-20:0,7:0-20:0,7:0-21:0,7:0-21:0,7:0-21:0
1,6:0-22:0,6:0-22:0,6:0-22:0,6:0-22:0,6:0-22:0,6:0-22:0,6:0-22:0
2,10:0-18:0,10:0-20:0,10:0-20:0,10:0-20:0,10:0-20:0,10:0-20:0,
3,,13:30-22:0,13:30-22:0,13:30-22:0,13:30-23:0,13:30-23:0,13:30-22:0
4,11:30-20:30,11:30-20:30,11:30-20:30,11:30-20:30,11:30-20:30,11:30-20:30,


In [21]:
# Define time blocks (start hour, end hour) in 24-hour format
time_blocks = {
    'morning': (6, 12),
    'afternoon': (12, 18),
    'evening': (18, 24),
    'night': (0, 6)
}

# List of weekdays
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']


def _clock_to_minutes(clock):
    """Convert an 'H:M' (or bare 'H') string into minutes after midnight."""
    parts = clock.split(':')
    hour = int(parts[0])
    minute = int(parts[1]) if len(parts) > 1 and parts[1].strip() else 0
    return hour * 60 + minute


def _open_time_blocks(value):
    """Return the names of the time blocks during which a business is open.

    Parameters
    ----------
    value : Any
        One day's opening hours as 'H:M-H:M' (e.g. '11:30-20:30').
        Non-string or unparseable values (NaN, None, 'None') mean closed.

    Returns
    -------
    set of str
        Keys of `time_blocks` that overlap the opening interval.
    """
    if not isinstance(value, str):
        return set()
    try:
        start_str, end_str = value.split('-')
        start_min = _clock_to_minutes(start_str)
        end_min = _clock_to_minutes(end_str)
    except ValueError:
        return set()

    # Closing at or before the opening time means the interval wraps past midnight.
    # Comparing full times (not only the hour) keeps '9:0-9:30' a 30-minute
    # interval instead of a 24-hour one.
    if end_min <= start_min:
        end_min += 24 * 60

    # Every clock hour touched by the interval; the closing hour is included
    # when the business closes part-way through it (e.g. 18:30 touches hour 18).
    first_hour = start_min // 60
    last_hour = -(-end_min // 60)  # ceiling division
    open_hours = {hour % 24 for hour in range(first_hour, last_hour)}

    return {
        block
        for block, (block_start, block_end) in time_blocks.items()
        if any(block_start <= hour < block_end for hour in open_hours)
    }


# Build one binary column per day-time block combination (e.g. 'mon_morning')
for day in days:
    open_blocks = df_b[day].map(_open_time_blocks)
    for block in time_blocks:
        df_b[f"{day[:3].lower()}_{block}"] = open_blocks.map(
            lambda blocks, name=block: name in blocks
        ).astype(int)


In [22]:
# Preview df_b with the newly added day/time-block columns
df_b.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,fri_evening,fri_night,sat_morning,sat_afternoon,sat_evening,sat_night,sun_morning,sun_afternoon,sun_evening,sun_night
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107.0,39.955505,-75.155564,4.0,80,...,1,0,1,1,1,0,1,1,1,0
1,il_Ro8jwPlHresjw9EGmBg,Denny's,8901 US 31 S,Indianapolis,IN,46227.0,39.637133,-86.127217,2.5,28,...,1,0,1,1,1,0,1,1,1,0
2,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771.0,27.916116,-82.760461,4.5,100,...,1,0,1,1,1,0,0,0,0,0
3,MUTTqe8uqyMdBl186RmNeA,Tuna Bar,205 Race St,Philadelphia,PA,19106.0,39.953949,-75.143226,4.0,245,...,1,0,0,1,1,0,0,1,1,0
4,ROeacJQwBeh05Rqg7F6TCg,BAP,1224 South St,Philadelphia,PA,19147.0,39.943223,-75.162568,4.5,205,...,1,0,1,1,1,0,0,0,0,0


In [23]:
# -------------------------
# 2. Weekend Operating Status
# -------------------------
# open_weekend = 1 when any Saturday or Sunday time-block flag equals 1
weekend_flags = [
    f"{day}_{block}"
    for day in ('sat', 'sun')
    for block in ('morning', 'afternoon', 'evening', 'night')
]
df_b['open_weekend'] = df_b[weekend_flags].eq(1).any(axis=1).astype(int)


In [24]:
# -------------------------
# 3. Late-Night Operating Status
# -------------------------
# open_night = 1 when the night block (00:00–06:00) is flagged on at least one day
night_flags = [
    f"{day}_night" for day in ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun')
]
df_b['open_night'] = df_b[night_flags].eq(1).any(axis=1).astype(int)

In [25]:
# -------------------------
# 4. 7-Day Operating Status
# -------------------------
# A day counts as "open" when any of its four time-block flags equals 1;
# open_7days = 1 only when that holds for every day of the week.
day_prefixes = ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun')
block_names = ('morning', 'afternoon', 'evening', 'night')

open_on_day = pd.DataFrame({
    prefix: df_b[[f"{prefix}_{block}" for block in block_names]].eq(1).any(axis=1)
    for prefix in day_prefixes
})
df_b['open_7days'] = open_on_day.all(axis=1).astype(int)


In [26]:
# -------------------------
# 5. Total Weekly Operating Hours
# -------------------------
# Names of the per-day hours columns in df_b; read by calc_total_weekly_hours below
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

def parse_hours(time_range):
    """Return the number of operating hours described by one 'H:M-H:M' string.

    Parameters
    ----------
    time_range : Any
        One day's opening hours, e.g. '7:0-20:0' or '18:0-2:0'.

    Returns
    -------
    float or int
        Length of the interval in hours. A closing time at or before the
        opening time is read as running past midnight, the same rule the
        day/time-block flags use, so '0:0-0:0' counts as 24 hours.
        Returns 0 for missing, non-string or unparseable values.
    """
    if not isinstance(time_range, str):
        return 0

    try:
        start, end = time_range.split('-')
        start_h, start_m = map(float, start.split(':'))
        end_h, end_m = map(float, end.split(':'))
    except ValueError:
        # Wrong number of parts (e.g. 'None') or non-numeric components
        return 0

    start_time = start_h + (start_m / 60)
    end_time = end_h + (end_m / 60)

    # Handle cases where the business operates past midnight (e.g., 18:00 → 02:00)
    if end_time <= start_time:
        end_time += 24

    return end_time - start_time


def calc_total_weekly_hours(row):
    """
    Add up the daily operating hours of one business row.

    Each weekday column listed in the module-level `days` is converted with
    `parse_hours`, and the seven results are summed.
    """
    return sum(parse_hours(row[weekday]) for weekday in days)

# Compute total weekly operating hours for each business (row-wise over the seven weekday columns)
df_b['total_weekly_hours'] = df_b.apply(calc_total_weekly_hours, axis=1)

### (3) Category Variety

In [27]:
# Define category keywords to exclude from counting
exclude_keywords = {"Restaurants", "Food"}


def _count_specific_categories(raw):
    """Count the comma-separated category tags in `raw`, ignoring generic ones.

    Missing or non-string values count as 0 (instead of being stringified to
    'nan' and counted as one tag), and empty tokens produced by stray commas
    or an empty string are skipped.
    """
    if not isinstance(raw, str):
        return 0
    tags = (tag.strip() for tag in raw.split(','))
    return sum(1 for tag in tags if tag and tag not in exclude_keywords)


# Create a feature indicating the number of category tags (excluding generic labels)
df_b['category_variety'] = df_b['categories'].apply(_count_specific_categories)


### (4) Tip Count

In [28]:
# Read the filtered tip table
df_t = pd.read_parquet("../datasets/parquet_file/tip_filtered.parquet")

# Number of non-null tip texts per business, as a two-column frame
tip_count = (
    df_t.groupby('business_id')['text']
        .count()
        .rename('tip_count')
        .reset_index()
)

# Attach the counts to df_b; businesses with no tips get NaN at this point
df_b = df_b.merge(tip_count, on='business_id', how='left')

# Turn infinities and NaN into 0 and store the count as an integer
tips_per_business = df_b['tip_count'].replace([np.inf, -np.inf], np.nan)
df_b['tip_count'] = tips_per_business.fillna(0).astype(int)

# Preview the result
df_b[['business_id', 'tip_count']].head()

Unnamed: 0,business_id,tip_count
0,MTSW4McQd7CbVtyjqoe9mw,10
1,il_Ro8jwPlHresjw9EGmBg,5
2,0bPLkL0QhhPO5kt1_EXmNQ,18
3,MUTTqe8uqyMdBl186RmNeA,17
4,ROeacJQwBeh05Rqg7F6TCg,20


### (5) Latitude/Longitude-Based Columns

In [29]:
# -------------------------
# 1. Nearby Restaurant Density
# -------------------------
# The haversine metric works on (latitude, longitude) pairs in radians
lat_lon_deg = df_b[['latitude', 'longitude']].to_numpy()
coords = np.deg2rad(lat_lon_deg)

# Spatial index over all businesses
tree = BallTree(coords, metric='haversine')

# 1 km expressed as an angle: distance divided by Earth's radius (~6371 km)
radius = 1 / 6371

# Number of businesses within the radius of each business; each query
# point is itself part of the tree, so it is counted too
counts = tree.query_radius(coords, r=radius, count_only=True)

# Subtract one to leave out the business itself
df_b['neighbor_density'] = counts - 1


In [30]:
# -------------------------
# 2. Category Similarity Among Nearby Stores
# -------------------------
# List of major category indicator columns in df_b
# (passed to calc_neighbor_similarity_bool to compare category overlap)
cat_cols = [
    'c_nightlife', 'c_cafes', 'c_brunch', 'c_sandwich', 
    'c_american_trad', 'c_american_new', 'c_italian', 
    'c_burgers', 'c_mexican', 'c_seafood', 'c_fastfood',
    'c_asian_fusion', 'c_bakeries', 'c_dietary', 'c_other'
]

def calc_neighbor_similarity_bool(df_b, tree, cat_cols, radius_km=1.0):
    """
    Calculates the proportion of nearby restaurants (within a given radius)
    that share at least one category with the focal business.

    Parameters
    ----------
    df_b : pandas.DataFrame
        Business table with 'latitude', 'longitude' and the `cat_cols` columns.
    tree : object
        Spatial index exposing `query_radius(X, r=...)`. The indices it
        returns are used as row positions in `df_b`, so it is assumed to have
        been built from `df_b`'s coordinates in the same row order.
    cat_cols : list of str
        Category indicator columns (0/1 or boolean).
    radius_km : float, default 1.0
        Search radius in kilometres.

    Returns
    -------
    list
        One value per row of `df_b`: the share of neighbours that overlap in
        at least one category, or 0 when the business has no neighbours.
    """
    if len(df_b) == 0:
        return []

    # Convert search radius from kilometers to radians (Earth radius ~6371 km)
    radius = radius_km / 6371

    # Work purely with row positions so the result does not depend on the
    # DataFrame's index labels being 0..n-1
    cat_matrix = df_b[cat_cols].to_numpy().astype(bool)
    coords = np.radians(df_b[['latitude', 'longitude']].to_numpy())

    # One batched query for all businesses instead of one query per row
    neighbor_positions = tree.query_radius(coords, r=radius)

    results = []
    for pos, ind in enumerate(neighbor_positions):
        # Exclude the store itself
        others = ind[ind != pos]

        if len(others) == 0:
            results.append(0)
            continue

        # Determine if each neighbor shares at least one category
        shared = (cat_matrix[others] & cat_matrix[pos]).any(axis=1)

        # Compute the share of category-overlapping neighbors
        results.append(shared.sum() / len(shared))

    return results

# Apply function to compute category similarity score
# (reuses the BallTree `tree` built for the density feature; 1 km radius)
df_b['neighbor_similarity'] = calc_neighbor_similarity_bool(df_b, tree, cat_cols, radius_km=1.0)


In [31]:
# -------------------------
# 3. Average Rating of Nearby Restaurants
# -------------------------
def calc_neighbor_avg_stars(df_b, tree, radius_km=1.0):
    """
    Computes the average rating of nearby restaurants within a specified radius.
    The focal restaurant is excluded from the calculation.

    Parameters
    ----------
    df_b : pandas.DataFrame
        Business table with 'latitude', 'longitude' and 'stars' columns.
    tree : object
        Spatial index exposing `query_radius(X, r=...)`. The indices it
        returns are used as row positions in `df_b`, so it is assumed to have
        been built from `df_b`'s coordinates in the same row order.
    radius_km : float, default 1.0
        Search radius in kilometres.

    Returns
    -------
    list
        One value per row of `df_b`: the mean 'stars' of its neighbours, or
        NaN when the business has no neighbours.
    """
    if len(df_b) == 0:
        return []

    # Convert search radius from kilometers to radians (Earth radius ~6371 km)
    radius = radius_km / 6371

    coords = np.radians(df_b[['latitude', 'longitude']].to_numpy())
    stars = df_b['stars']

    # One batched query for all businesses instead of one query per row
    neighbor_positions = tree.query_radius(coords, r=radius)

    results = []
    for pos, ind in enumerate(neighbor_positions):
        # Exclude the restaurant itself (by row position, not index label)
        others = ind[ind != pos]

        if len(others) == 0:
            results.append(np.nan)
            continue

        # Compute the mean star rating among neighbors (pandas mean skips NaN)
        results.append(stars.iloc[others].mean())

    return results

# Apply function to generate neighbor rating feature
# (reuses the BallTree `tree` built for the density feature; 1 km radius)
df_b['neighbor_avg_stars'] = calc_neighbor_avg_stars(df_b, tree, radius_km=1.0)


### (6) Review-Based Derived Columns

In [32]:
# -------------------------
# 1. Average Review Length
# -------------------------
# Word count of every review (missing text counts as zero words)
review_texts = df_r['text'].fillna('')
df_r['review_length'] = review_texts.map(lambda text: len(text.split()))

# Mean word count per business
avg_review_len = (
    df_r.groupby('business_id')['review_length']
        .mean()
        .rename('avg_review_length')
        .reset_index()
)

# Attach the per-business average to df_b
df_b = df_b.merge(avg_review_len, on='business_id', how='left')

In [33]:
# -------------------------
# 2. Rating Variance
# -------------------------
# Spread of review ratings per business, measured as the standard deviation
df_var = (
    df_r.groupby('business_id')['stars']
        .std()
        .rename('stars_std')
        .reset_index()
)

# Attach the per-business spread to df_b
df_b = df_b.merge(df_var, on='business_id', how='left')

In [34]:
# Inspect df_b's columns, non-null counts and dtypes after the review-based features
df_b.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19534 entries, 0 to 19533
Data columns (total 91 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   business_id          19534 non-null  object 
 1   name                 19534 non-null  object 
 2   address              19467 non-null  object 
 3   city                 19534 non-null  object 
 4   state                19534 non-null  object 
 5   postal_code          19532 non-null  float64
 6   latitude             19534 non-null  float64
 7   longitude            19534 non-null  float64
 8   stars                19534 non-null  float64
 9   review_count         19534 non-null  int64  
 10  is_open              19534 non-null  int64  
 11  attributes           19527 non-null  object 
 12  categories           19534 non-null  object 
 13  hours                18432 non-null  object 
 14  stability_score      19534 non-null  float64
 15  loyalty_score        19509 non-null 

### (7) VADER Sentiment Score

In [35]:
# Fetch the VADER lexicon and create the analyzer
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()


def _compound_score(text):
    """Return VADER's compound polarity score for one review text."""
    return sia.polarity_scores(str(text))['compound']


# Score every review
df_r['sentiment_score'] = df_r['text'].map(_compound_score)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\toomu\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [36]:
# Average the per-review sentiment scores within each business

# Mean sentiment score per business_id, as a two-column frame
sentiment_by_business = (
    df_r.groupby('business_id')['sentiment_score']
        .mean()
        .rename('avg_sentiment')
        .reset_index()
)

In [37]:
# Merge the sentiment score into the main business table
# (left join: businesses missing from sentiment_by_business get NaN in avg_sentiment)
df_b = df_b.merge(sentiment_by_business, on='business_id', how='left')


In [38]:
# Inspect df_b's columns, non-null counts and dtypes after adding avg_sentiment
df_b.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19534 entries, 0 to 19533
Data columns (total 92 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   business_id          19534 non-null  object 
 1   name                 19534 non-null  object 
 2   address              19467 non-null  object 
 3   city                 19534 non-null  object 
 4   state                19534 non-null  object 
 5   postal_code          19532 non-null  float64
 6   latitude             19534 non-null  float64
 7   longitude            19534 non-null  float64
 8   stars                19534 non-null  float64
 9   review_count         19534 non-null  int64  
 10  is_open              19534 non-null  int64  
 11  attributes           19527 non-null  object 
 12  categories           19534 non-null  object 
 13  hours                18432 non-null  object 
 14  stability_score      19534 non-null  float64
 15  loyalty_score        19509 non-null 

# 3. Save the final dataset as a CSV file

In [39]:
# Set the three score columns aside, keyed by business_id
df_targets = df_b[['business_id', 'stability_score', 'loyalty_score', 'reliability_score']]

In [40]:
# Copy of df_b without the three score columns
df_final = df_b.drop(columns=['stability_score', 'loyalty_score', 'reliability_score'])

In [41]:
# Join the score columns back on business_id; the merge appends them
# after all existing columns, so they end up last in df_final
df_final = df_final.merge(
    df_targets[['business_id', 'stability_score', 'loyalty_score', 'reliability_score']],
    on='business_id',
    how='left'
)

In [42]:
# Inspect the final column order, non-null counts and dtypes before saving
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19534 entries, 0 to 19533
Data columns (total 92 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   business_id          19534 non-null  object 
 1   name                 19534 non-null  object 
 2   address              19467 non-null  object 
 3   city                 19534 non-null  object 
 4   state                19534 non-null  object 
 5   postal_code          19532 non-null  float64
 6   latitude             19534 non-null  float64
 7   longitude            19534 non-null  float64
 8   stars                19534 non-null  float64
 9   review_count         19534 non-null  int64  
 10  is_open              19534 non-null  int64  
 11  attributes           19527 non-null  object 
 12  categories           19534 non-null  object 
 13  hours                18432 non-null  object 
 14  store_status         19534 non-null  object 
 15  c_nightlife          19534 non-null 

In [43]:
# Write the final table to CSV without the DataFrame index
df_final.to_csv('../datasets/business6.csv', index=False)