In [1]:
# Packages
import json
import pandas as pd
import tarfile
import matplotlib.pyplot as plt
import numpy as np
import datetime
import tqdm

In [9]:
# Data import
red_city = "Tampa"
blue_city = "Philadelphia"

business_full = pd.read_json("yelp_academic_dataset_business.json", lines=True, engine="pyarrow")
print(business_full['city'].value_counts())
red_business = business_full[business_full['city'] == red_city]["business_id"].values
blue_business = business_full[business_full['city'] == blue_city]["business_id"].values
business = business_full[business_full['city'].isin([red_city, blue_city])]


try:
    checkins = pd.read_json("yelp_academic_dataset_checkin.json", lines=True, engine="pyarrow")
    print(checkins.columns.tolist())
    reviews = pd.read_json("yelp_academic_dataset_review.json", lines=True,engine = "pyarrow")
    print(reviews.columns.tolist())
    if 'business_id' not in reviews.columns:
        raise KeyError("Column 'business_id' not found in review data.")
    reviews = reviews[reviews['business_id'].isin((red_business.tolist() + blue_business.tolist()))]
    checkins = checkins[checkins['business_id'].isin((red_business.tolist() + blue_business.tolist()))]
except Exception as e:
    # Fallback: Load a sample to debug
    reviews = pd.read_json("yelp_academic_dataset_review.json", lines=True, nrows=10)
    print("Error loading data:", e)

print("number of reviews: ", len(reviews))
print("number of checkins: ", len(checkins))

reviews.loc[reviews['business_id'].isin(red_business), 'city'] = red_city
reviews.loc[reviews['business_id'].isin(blue_business), 'city'] = blue_city
reviews['date'] = pd.to_datetime(reviews['date'])


checkins.loc[checkins['business_id'].isin(red_business), 'city'] = red_city
checkins.loc[checkins['business_id'].isin(blue_business), 'city'] = blue_city   


checkins['date_list'] = None
checkins['date_list'] = checkins['date'].apply(lambda x: [datetime.datetime.strptime(date.strip(), "%Y-%m-%d %H:%M:%S") 
                                                         for date in x.split(',')])


city
Philadelphia      14569
Tucson             9250
Tampa              9050
Indianapolis       7540
Nashville          6971
                  ...  
Gentilly              1
pennsauken            1
Hamiltion             1
Newtown square        1
Apollo beach          1
Name: count, Length: 1416, dtype: int64
['business_id', 'date']
['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny', 'cool', 'text', 'date']
number of reviews:  1422441
number of checkins:  20897


In [None]:
print(business_full['city'] == 'St. Petersburg')

['business_id', 'name', 'address', 'city', 'state', 'postal_code', 'latitude', 'longitude', 'stars', 'review_count', 'is_open', 'attributes', 'categories', 'hours']


In [1]:
import pickle

# Import pickle data
file = 'restaurants.pkl'
with open(file, 'rb') as f:
    restaurants = pickle.load(f)

file = 'restaurant_reviews.pkl'
with open(file, 'rb') as f:
    reviews = pickle.load(f)

# Check the data
print("Number of restaurants: ", len(restaurants))
print("Number of reviews: ", len(reviews))

  restaurants = pickle.load(f)
  reviews = pickle.load(f)


Number of restaurants:  8857
Number of reviews:  992701


In [2]:
import calplot
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.pyplot as plt

# Define restriction dates
start_restrictions = datetime.datetime(2020, 3, 16)
end_restrictions_blue = datetime.datetime(2021, 6, 2)
end_restrictions_red = datetime.datetime(2021, 5, 3)

# Get restaurants that are open before and after the restrictions
# Get businesses that are currently open
business_open_after = business[business['is_open'] == 1]
# Ger reviews with date before the restrictions
reviews_before = reviews[reviews['date'] < start_restrictions]
# Get businesses whose business_id is in the reviews before the restrictions
business_open_after = business_open_after[business_open_after['business_id'].isin(reviews_before['business_id'])]

# Get closes businesses
business_closed = business[business['is_open'] == 0]
# Get businesses with business_id in the reviews before the restrictions
business_closed = business_closed[business_closed['business_id'].isin(reviews_before['business_id'])]


# Filter by red and blue cities
business_open_after_red = business_open_after[business_open_after['city'] == red_city]
business_open_after_blue = business_open_after[business_open_after['city'] == blue_city]
business_closed_red = business_closed[business_closed['city'] == red_city]
business_closed_blue = business_closed[business_closed['city'] == blue_city]

# Get the number of businesses in each category
print(len(business_open_after_red), len(business_open_after_blue))
print(len(business_closed_red), len(business_closed_blue))

# Get reviews for each category
reviews_open_after_red = reviews[reviews['business_id'].isin(business_open_after_red['business_id'])]
reviews_open_after_blue = reviews[reviews['business_id'].isin(business_open_after_blue['business_id'])]
reviews_closed_red = reviews[reviews['business_id'].isin(business_closed_red['business_id'])]
reviews_closed_blue = reviews[reviews['business_id'].isin(business_closed_blue['business_id'])]

# Subset all reviews to only 2019 onwards
reviews_open_after_red = reviews_open_after_red[reviews_open_after_red['date'] >= '2019-01-01']
reviews_open_after_blue = reviews_open_after_blue[reviews_open_after_blue['date'] >= '2019-01-01']
reviews_closed_red = reviews_closed_red[reviews_closed_red['date'] >= '2019-01-01']
reviews_closed_blue = reviews_closed_blue[reviews_closed_blue['date'] >= '2019-01-01']

# Calplot of reviews
colors_normal = [
    (1.00, 1.00, 0.90),  
    (0.90, 0.97, 0.70),  
    (0.78, 0.94, 0.55),  
    (0.65, 0.90, 0.40),  
    (0.45, 0.80, 0.20),  
    (0.30, 0.70, 0.10),  
    (0.15, 0.50, 0.05),  
    (0.00, 0.00, 0.00),   
]
reviews_indexed = reviews_open_after_red['date'].value_counts().sort_index()
cmap_normal = LinearSegmentedColormap.from_list("custom_YlGn", colors_normal, N=100)
calplot.calplot(reviews_indexed, edgecolor='black', linewidth=0.5, cmap=cmap_normal, suptitle='Number of Reviews')

reviews_indexed = reviews_open_after_blue['date'].value_counts().sort_index()
cmap_normal = LinearSegmentedColormap.from_list("custom_YlGn", colors_normal, N=100)
calplot.calplot(reviews_indexed, edgecolor='black', linewidth=0.5, cmap=cmap_normal, suptitle='Number of Reviews')

reviews_indexed = reviews_closed_red['date'].value_counts().sort_index()
cmap_normal = LinearSegmentedColormap.from_list("custom_YlGn", colors_normal, N=100)
calplot.calplot(reviews_indexed, edgecolor='black', linewidth=0.5, cmap=cmap_normal, suptitle='Number of Reviews')

reviews_indexed = reviews_closed_blue['date'].value_counts().sort_index()
cmap_normal = LinearSegmentedColormap.from_list("custom_YlGn", colors_normal, N=100)
calplot.calplot(reviews_indexed, edgecolor='black', linewidth=0.5, cmap=cmap_normal, suptitle='Number of Reviews')

NameError: name 'datetime' is not defined

In [3]:
restaurants['categories'] = restaurants['categories'].str.split(',').apply(lambda lst: [x.strip() for x in lst])

category_counts = restaurants.explode('categories')['categories'].value_counts()

In [5]:
print(category_counts.head(20))

categories
Restaurants                  8857
Food                         2910
Nightlife                    1564
Bars                         1492
Sandwiches                   1407
Pizza                        1106
American (New)               1105
American (Traditional)       1091
Breakfast & Brunch           1023
Coffee & Tea                  814
Italian                       721
Fast Food                     710
Burgers                       648
Chinese                       617
Seafood                       582
Cafes                         549
Mexican                       546
Event Planning & Services     482
Salad                         471
Chicken Wings                 442
Name: count, dtype: int64


In [13]:
import pandas as pd

# Convert review date to datetime
reviews['date'] = pd.to_datetime(reviews['date'])

# Define time periods
pre_covid = reviews[reviews['date'] < '2020-03-01']
during_covid = reviews[(reviews['date'] >= '2020-03-01') & (reviews['date'] <= '2021-06-30')]
post_covid = reviews[reviews['date'] > '2021-06-30']

# Convert comma-separated category strings to lists
# restaurants['categories'] = restaurants['categories'].fillna('').apply(lambda x: [cat.strip() for cat in x.split(',') if cat.strip()])

# Explode categories so each row has one category
restaurants_exploded = restaurants.explode('categories')

# Function to count reviews per category for a given period
def count_reviews_by_period(reviews_period, label):
    merged = reviews_period.merge(restaurants_exploded[['business_id', 'categories']], on='business_id')
    return merged.groupby('categories').size().rename(label)

# Count reviews for each period
pre = count_reviews_by_period(pre_covid, 'pre')
during = count_reviews_by_period(during_covid, 'during')
post = count_reviews_by_period(post_covid, 'post')

# Combine into one DataFrame
combined = pd.concat([pre, during, post], axis=1).fillna(0)

# Normalize by total reviews in each period
combined['pre_norm'] = combined['pre'] / len(pre_covid)
combined['during_norm'] = combined['during'] / len(during_covid)
combined['post_norm'] = combined['post'] / len(post_covid)

# Calculate relative growth
combined['post_vs_pre'] = (combined['post_norm'] - combined['pre_norm']) / combined['pre_norm']
combined['during_vs_pre'] = (combined['during_norm'] - combined['pre_norm']) / combined['pre_norm']

# Optional: sort by post-pandemic growth
combined = combined.sort_values(by='post_vs_pre', ascending=False)

# Example: view changes for Mexican and American restaurants
print(combined.loc[category_counts.index])


                     pre   during     post  pre_norm  during_norm  post_norm  \
categories                                                                     
Restaurants     875106.0  78618.0  38977.0  1.000000     1.000000   1.000000   
Food            284454.0  27224.0  12982.0  0.325051     0.346282   0.333068   
Nightlife       270952.0  20768.0  11632.0  0.309622     0.264163   0.298432   
Bars            263180.0  20245.0  11315.0  0.300741     0.257511   0.290299   
Sandwiches      134994.0  12056.0   5565.0  0.154260     0.153349   0.142777   
...                  ...      ...      ...       ...          ...        ...   
Hainan               0.0     44.0     25.0  0.000000     0.000560   0.000641   
Costumes           408.0      9.0     11.0  0.000466     0.000114   0.000282   
South African        8.0     10.0      3.0  0.000009     0.000127   0.000077   
Pet Stores          51.0     15.0      2.0  0.000058     0.000191   0.000051   
Spiritual Shop      11.0      7.0      5