In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
import pandas as pd
import re, json

In [4]:
# load yelp_academic_dataset_review.json
review_path = '/content/drive/MyDrive/data/raw/yelp_extracted/yelp_academic_dataset_review.json'

cols_to_keep = ["review_id", "user_id", "business_id", "stars", "text"]
chunks = pd.read_json(review_path, lines=True, chunksize=100000)

filtered_reviews = []

for chunk in chunks:
    chunk = chunk[cols_to_keep]
    filtered_reviews.append(chunk)

reviews_df = pd.concat(filtered_reviews, ignore_index=True)



In [5]:
reviews_df.columns


Index(['review_id', 'user_id', 'business_id', 'stars', 'text'], dtype='object')

In [6]:
# Check NaN
reviews_df.isna().sum()

Unnamed: 0,0
review_id,0
user_id,0
business_id,0
stars,0
text,0


In [7]:
# load yelp_academic_dataset_business.json
business_path = '/content/drive/MyDrive/data/raw/yelp_extracted/yelp_academic_dataset_business.json'

business_df = pd.read_json(business_path, lines=True)


In [8]:
# Check NaN
business_df.isna().sum()

Unnamed: 0,0
business_id,0
name,0
address,0
city,0
state,0
postal_code,0
latitude,0
longitude,0
stars,0
review_count,0


In [9]:
# Extract only restaurant in FL with at least 50 reviews ((3917, 14))
pattern = r'\b(?:Restaurants?|Caf(?:e|és)|Coffee|Food)\b'

restaurant_fl = business_df[
    (business_df['state'] == 'FL') &
    (business_df['categories'].fillna('').str.contains(pattern, case=False, regex=True)) &
    (business_df['review_count'] >= 50)
]


In [10]:
restaurant_fl.shape

(3917, 14)

In [11]:
# Merge reviews_df with restaurant_fl on business_id
reviews_fl = reviews_df.merge(
    restaurant_fl[['business_id']],
    on='business_id',
    how='inner'
)

In [12]:
# Count total reviews
reviews_fl['business_id'].value_counts().sum()

np.int64(718557)

In [13]:
reviews_fl.columns

Index(['review_id', 'user_id', 'business_id', 'stars', 'text'], dtype='object')

In [14]:
# Combine all reviews into one text per restaurant and rename a column to all_reviews
agg_reviews = (
    reviews_fl.groupby('business_id')['text'].apply(lambda x: ''.join(x)).reset_index().rename(columns = {'text':'all_reviews'})
)

In [15]:
# Merge all_reviews back to restaurant_fl
restaurant_fl = restaurant_fl.merge(agg_reviews,on='business_id', how='left')


In [16]:
# Filter restaurants open only
dat = restaurant_fl[restaurant_fl['is_open'] == 1]

In [17]:
# Check dimension
print(dat.shape)
# Count total review
print(dat['review_count'].sum())
dat.isna().sum()

(3066, 15)
588377


Unnamed: 0,0
business_id,0
name,0
address,0
city,0
state,0
postal_code,0
latitude,0
longitude,0
stars,0
review_count,0


In [18]:
dat.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,all_reviews
1,pJfh3Ct8iL58NZa8ta-a5w,Top Shelf Sports Lounge,3173 Cypress Ridge Blvd,Wesley Chapel,FL,33544,28.196252,-82.380615,4.5,95,1,"{'BestNights': '{'monday': False, 'tuesday': F...","Burgers, Sports Bars, Bars, Lounges, Restauran...","{'Monday': '11:30-22:0', 'Tuesday': '11:30-23:...",Best sports bar with amazingly good food. Ice ...
2,vje0KIiE7vtpx7JzmBx5LQ,The Pearl,163 107th Ave,Treasure Island,FL,33706,27.769405,-82.767317,4.0,129,1,"{'WiFi': ''free'', 'NoiseLevel': 'u'average'',...","Restaurants, French, Moroccan, Seafood, Medite...","{'Tuesday': '16:0-21:0', 'Wednesday': '16:0-21...",I went to the pearl for my birthday and every ...
3,CtMEJxpVMlNzFpB4PtFjfA,Aussie Grill,25340 Sierra Center Blvd,Lutz,FL,33559,28.192191,-82.396367,4.0,75,1,"{'BYOB': 'False', 'OutdoorSeating': 'True', 'B...","Restaurants, American (New), Burgers, Fast Foo...","{'Monday': '0:0-0:0', 'Tuesday': '11:0-21:0', ...",I've always loved Outback and the fact that th...
4,Ucl9Vo5lwrUmYbV8Dv8X5g,O'Briens Irish Pub,15435 N Dale Mabry Hwy,Tampa,FL,33618,28.09236,-82.500588,4.0,108,1,"{'RestaurantsGoodForGroups': 'True', 'Alcohol'...","Bars, Pubs, Nightlife, Arts & Entertainment, I...","{'Monday': '18:0-2:30', 'Tuesday': '15:0-1:0',...",I must say that this location is much better t...
5,py5aKmlTB2NarfsfcOpHOQ,Cafe Con Leche,4100 George J Bean Pkwy,Tampa,FL,33607,27.976775,-82.537423,2.5,55,1,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Restaurants, Breakfast & Brunch, Coffee Roaste...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...",Cashier was very rude fat Hispanic girl at the...


In [19]:
# Save as csv
path = '/content/drive/MyDrive/data/processed/yelp_restaurant_florida_reviews.csv'
dat.to_csv(path, index=False)

In [24]:
dat.to_parquet("/content/drive/MyDrive/data/processed/yelp_restaurant_florida_reviews.parquet", index=False)