In [3]:
#import
import os
import json

import numpy as np
import pandas as pd
from hashlib import sha1

from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from surprise import SVD
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate
from surprise import accuracy

In [17]:
data_offerings = pd.read_csv("data/offerings.csv")
data_reviews = pd.read_csv("data/reviews.csv")

In [18]:
data_offerings.head()

Unnamed: 0,hotel_class,region_id,url,phone,details,address,type,id,name
0,4.0,60763,http://www.tripadvisor.com/Hotel_Review-g60763...,,,"{'region': 'NY', 'street-address': '147 West 4...",hotel,113317,Casablanca Hotel Times Square
1,5.0,32655,http://www.tripadvisor.com/Hotel_Review-g32655...,,,"{'region': 'CA', 'street-address': '300 S Dohe...",hotel,76049,Four Seasons Hotel Los Angeles at Beverly Hills
2,3.5,60763,http://www.tripadvisor.com/Hotel_Review-g60763...,,,"{'region': 'NY', 'street-address': '790 Eighth...",hotel,99352,Hilton Garden Inn Times Square
3,4.0,60763,http://www.tripadvisor.com/Hotel_Review-g60763...,,,"{'region': 'NY', 'street-address': '152 West 5...",hotel,93589,The Michelangelo Hotel
4,4.0,60763,http://www.tripadvisor.com/Hotel_Review-g60763...,,,"{'region': 'NY', 'street-address': '130 West 4...",hotel,217616,The Muse Hotel New York


In [19]:
data_reviews.head()

Unnamed: 0,ratings,title,text,author,date_stayed,offering_id,num_helpful_votes,date,id,via_mobile
0,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...","“Truly is ""Jewel of the Upper Wets Side""”",Stayed in a king suite for 11 nights and yes i...,"{'username': 'Papa_Panda', 'num_cities': 22, '...",December 2012,93338,0,2012-12-17,147643103,False
1,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...",“My home away from home!”,"On every visit to NYC, the Hotel Beacon is the...","{'username': 'Maureen V', 'num_reviews': 2, 'n...",December 2012,93338,0,2012-12-17,147639004,False
2,"{'service': 4.0, 'cleanliness': 5.0, 'overall'...",“Great Stay”,This is a great property in Midtown. We two di...,"{'username': 'vuguru', 'num_cities': 12, 'num_...",December 2012,1762573,0,2012-12-18,147697954,False
3,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...",“Modern Convenience”,The Andaz is a nice hotel in a central locatio...,"{'username': 'Hotel-Designer', 'num_cities': 5...",August 2012,1762573,0,2012-12-17,147625723,False
4,"{'service': 4.0, 'cleanliness': 5.0, 'overall'...",“Its the best of the Andaz Brand in the US....”,I have stayed at each of the US Andaz properti...,"{'username': 'JamesE339', 'num_cities': 34, 'n...",December 2012,1762573,0,2012-12-17,147612823,False


In [21]:

# Merge datasets on the appropriate keys
merged_df = pd.merge(data_reviews, data_offerings, left_on='offering_id', right_on='id', how='inner')

merged_df = merged_df.rename(columns={'id_x': 'review_id', 'id_y': 'hotel_id'})

# Drop redundant columns
merged_df = merged_df.drop(columns=['offering_id']) 
merged_df.head()




Unnamed: 0,ratings,title,text,author,date_stayed,num_helpful_votes,date,review_id,via_mobile,hotel_class,region_id,url,phone,details,address,type,hotel_id,name
0,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...","“Truly is ""Jewel of the Upper Wets Side""”",Stayed in a king suite for 11 nights and yes i...,"{'username': 'Papa_Panda', 'num_cities': 22, '...",December 2012,0,2012-12-17,147643103,False,3.0,60763,http://www.tripadvisor.com/Hotel_Review-g60763...,,,"{'region': 'NY', 'street-address': '2130 Broad...",hotel,93338,Hotel Beacon
1,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...",“My home away from home!”,"On every visit to NYC, the Hotel Beacon is the...","{'username': 'Maureen V', 'num_reviews': 2, 'n...",December 2012,0,2012-12-17,147639004,False,3.0,60763,http://www.tripadvisor.com/Hotel_Review-g60763...,,,"{'region': 'NY', 'street-address': '2130 Broad...",hotel,93338,Hotel Beacon
2,"{'service': 4.0, 'cleanliness': 5.0, 'overall'...",“Great Stay”,This is a great property in Midtown. We two di...,"{'username': 'vuguru', 'num_cities': 12, 'num_...",December 2012,0,2012-12-18,147697954,False,4.0,60763,http://www.tripadvisor.com/Hotel_Review-g60763...,,,"{'region': 'NY', 'street-address': '485 5th Av...",hotel,1762573,Andaz 5th Avenue
3,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...",“Modern Convenience”,The Andaz is a nice hotel in a central locatio...,"{'username': 'Hotel-Designer', 'num_cities': 5...",August 2012,0,2012-12-17,147625723,False,4.0,60763,http://www.tripadvisor.com/Hotel_Review-g60763...,,,"{'region': 'NY', 'street-address': '485 5th Av...",hotel,1762573,Andaz 5th Avenue
4,"{'service': 4.0, 'cleanliness': 5.0, 'overall'...",“Its the best of the Andaz Brand in the US....”,I have stayed at each of the US Andaz properti...,"{'username': 'JamesE339', 'num_cities': 34, 'n...",December 2012,0,2012-12-17,147612823,False,4.0,60763,http://www.tripadvisor.com/Hotel_Review-g60763...,,,"{'region': 'NY', 'street-address': '485 5th Av...",hotel,1762573,Andaz 5th Avenue


In [29]:
# Safely parse 'details' and 'address' fields
def safe_json_parse(x, key):
    """Safely parse a JSON-like string and extract a specific key."""
    if isinstance(x, str):  # Only process strings
        try:
            data = json.loads(x.replace("'", '"'))  # Handle single quotes
            return data.get(key, None)  # Return the value for the given key
        except (json.JSONDecodeError, TypeError):
            return None
    return None

# Use the function to extract 'username' from 'author' column
merged_df['user_id'] = merged_df['author'].apply(lambda x: safe_json_parse(x, 'username'))

# Extract 'region' from 'details'
merged_df['region'] = merged_df['details'].apply(lambda x: safe_json_parse(x, 'region'))

# Extract 'street-address' from 'address'
merged_df['street_address'] = merged_df['address'].apply(lambda x: safe_json_parse(x, 'street-address'))

# Extract review date features
merged_df['review_year'] = pd.to_datetime(merged_df['date'], errors='coerce').dt.year
merged_df['review_month'] = pd.to_datetime(merged_df['date'], errors='coerce').dt.month

In [30]:
merged_df.head()

Unnamed: 0,ratings,title,text,author,date_stayed,num_helpful_votes,date,review_id,via_mobile,hotel_class,...,details,address,type,hotel_id,name,region,street_address,review_year,review_month,user_id
0,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...","“Truly is ""Jewel of the Upper Wets Side""”",Stayed in a king suite for 11 nights and yes i...,"{'username': 'Papa_Panda', 'num_cities': 22, '...",December 2012,0,2012-12-17,147643103,False,3.0,...,,"{'region': 'NY', 'street-address': '2130 Broad...",hotel,93338,Hotel Beacon,,2130 Broadway at 75th Street,2012,12,Papa_Panda
1,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...",“My home away from home!”,"On every visit to NYC, the Hotel Beacon is the...","{'username': 'Maureen V', 'num_reviews': 2, 'n...",December 2012,0,2012-12-17,147639004,False,3.0,...,,"{'region': 'NY', 'street-address': '2130 Broad...",hotel,93338,Hotel Beacon,,2130 Broadway at 75th Street,2012,12,Maureen V
2,"{'service': 4.0, 'cleanliness': 5.0, 'overall'...",“Great Stay”,This is a great property in Midtown. We two di...,"{'username': 'vuguru', 'num_cities': 12, 'num_...",December 2012,0,2012-12-18,147697954,False,4.0,...,,"{'region': 'NY', 'street-address': '485 5th Av...",hotel,1762573,Andaz 5th Avenue,,485 5th Avenue,2012,12,vuguru
3,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...",“Modern Convenience”,The Andaz is a nice hotel in a central locatio...,"{'username': 'Hotel-Designer', 'num_cities': 5...",August 2012,0,2012-12-17,147625723,False,4.0,...,,"{'region': 'NY', 'street-address': '485 5th Av...",hotel,1762573,Andaz 5th Avenue,,485 5th Avenue,2012,12,Hotel-Designer
4,"{'service': 4.0, 'cleanliness': 5.0, 'overall'...",“Its the best of the Andaz Brand in the US....”,I have stayed at each of the US Andaz properti...,"{'username': 'JamesE339', 'num_cities': 34, 'n...",December 2012,0,2012-12-17,147612823,False,4.0,...,,"{'region': 'NY', 'street-address': '485 5th Av...",hotel,1762573,Andaz 5th Avenue,,485 5th Avenue,2012,12,JamesE339


In [31]:
merged_df.columns

Index(['ratings', 'title', 'text', 'author', 'date_stayed',
       'num_helpful_votes', 'date', 'review_id', 'via_mobile', 'hotel_class',
       'region_id', 'url', 'phone', 'details', 'address', 'type', 'hotel_id',
       'name', 'region', 'street_address', 'review_year', 'review_month',
       'user_id'],
      dtype='object')

In [32]:
merged_df.nunique()

ratings               55523
title                631300
text                 877573
author               577942
date_stayed             168
num_helpful_votes       100
date                   3948
review_id            878561
via_mobile                2
hotel_class               9
region_id                25
url                    3945
phone                     0
details                   0
address                3937
type                      1
hotel_id               3945
name                   3876
region                    0
street_address         3916
review_year              12
review_month             12
user_id              536180
dtype: int64

In [33]:
merged_df.isnull().sum()

ratings                   0
title                     0
text                      0
author                    0
date_stayed           67594
num_helpful_votes         0
date                      0
review_id                 0
via_mobile                0
hotel_class           34937
region_id                 0
url                       0
phone                878561
details              878561
address                   0
type                      0
hotel_id                  0
name                      0
region               878561
street_address         6894
review_year               0
review_month              0
user_id                1145
dtype: int64

In [46]:
# Selected a subset with customers and products with at least 100 reviews
# Step 1: Filter customers with at least 100 reviews
customer_review_counts = merged_df.groupby('user_id').size().reset_index(name='review_count')
customers_with_at_least_100_reviews = customer_review_counts[customer_review_counts['review_count'] >= 100]

# Step 2: Filter products with at least 100 reviews
hotel_review_counts = merged_df.groupby('hotel_id').size().reset_index(name='review_count')
hotels_with_at_least_100_reviews = hotel_review_counts[hotel_review_counts['review_count'] >= 100]

# Step 3: Filter the original dataset to only include customers and products with at least 100 reviews
filtered_data = merged_df[
    (merged_df['user_id'].isin(customers_with_at_least_100_reviews['user_id'])) &
    (merged_df['hotel_id'].isin(hotels_with_at_least_100_reviews['hotel_id']))
]
filtered_data.shape

(71480, 23)

In [42]:
filtered_data.head()

Unnamed: 0,ratings,title,text,author,date_stayed,num_helpful_votes,date,review_id,via_mobile,hotel_class,...,details,address,type,hotel_id,name,region,street_address,review_year,review_month,user_id
1689,{'overall': 5.0},“Lovely with HUGE rooms”,I have stayed at the hotel twice. It is excell...,"{'username': '', 'id': '', 'location': ''}",,15,2005-05-02,3431120,False,4.5,...,,"{'region': 'NY', 'street-address': '781 Fifth ...",hotel,93559,The Sherry-Netherland Hotel,,781 Fifth Avenue,2005,5,
1690,{'overall': 4.0},"“If you like traditional hotels, then you will...",We have just returned from our first trip to N...,"{'username': '', 'id': '', 'location': ''}",,15,2005-01-26,3087505,False,4.5,...,,"{'region': 'NY', 'street-address': '781 Fifth ...",hotel,93559,The Sherry-Netherland Hotel,,781 Fifth Avenue,2005,1,
1692,{'overall': 4.0},“See yourself at the Sherry-Netherland”,"Many years ago, as a student and NYC taxi driv...","{'username': '', 'id': '', 'location': ''}",,20,2003-08-20,1237061,False,4.5,...,,"{'region': 'NY', 'street-address': '781 Fifth ...",hotel,93559,The Sherry-Netherland Hotel,,781 Fifth Avenue,2003,8,
1900,{'overall': 5.0},“We will be back”,Allt i toppskick. Verkligen ett toppenläge. En...,"{'username': '', 'id': '', 'location': ''}",,0,2011-04-29,106447112,False,4.5,...,,"{'region': 'NY', 'street-address': '781 Fifth ...",hotel,93559,The Sherry-Netherland Hotel,,781 Fifth Avenue,2011,4,
3900,"{'cleanliness': 5.0, 'value': 5.0, 'overall': ...",“Wonderful Hotel !”,The staff was very helpful and courteous. The ...,"{'username': '', 'id': '', 'location': ''}",April 2006,0,2006-05-02,5028014,False,3.5,...,,"{'region': 'TX', 'street-address': '2900 Briar...",hotel,98920,Houston Marriott Westchase,,2900 Briarpark Dr.,2006,5,


In [4]:
yelp_business = pd.read_json("data/yelp/yelp_academic_dataset_business.json", lines=True)
yelp_reviews = pd.read_json("data/yelp/yelp_academic_dataset_review.json", lines = True)



In [5]:
yelp_business.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


In [6]:
yelp_reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15


In [7]:
yelp_merged_df = pd.merge(yelp_business, yelp_reviews, on='business_id')

yelp_merged_df.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars_x,review_count,...,categories,hours,review_id,user_id,stars_y,useful,funny,cool,text,date
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,...,"Doctors, Traditional Chinese Medicine, Naturop...",,9vwYDBVI3ymdqcyJ5WW2Tg,e0imecnX_9MtLnS2rUZM-A,5,3,2,1,I've had acupuncture treatments with Abby over...,2012-05-02 18:07:38
1,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,...,"Doctors, Traditional Chinese Medicine, Naturop...",,OXgg1LdxHDv3CBU5-xi2lA,_Q0fdLVoTnlNkEypUvNkHA,5,1,0,0,Abby is an amazing practitioner. In a treatmen...,2013-03-01 06:11:05
2,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,...,"Doctors, Traditional Chinese Medicine, Naturop...",,DG64cjud9cWB4fANskVxSw,ycUooVIDWPgXPf6niW-FWQ,4,2,0,0,I went to see Abby for some digestive issues. ...,2013-01-17 00:05:43
3,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,...,"Doctors, Traditional Chinese Medicine, Naturop...",,hzvRRb40oPttxAdyr7kfow,CiwVvb7jWijWB5jkmatzKA,5,0,1,0,"Abby helped me with some longstanding issues, ...",2015-03-16 03:43:08
4,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,...,"Doctors, Traditional Chinese Medicine, Naturop...",,xUkBPk-QfcW4i3MRU5TeXw,QkCbMKBktkrkOFJugHvY6w,5,0,0,0,"Recently, I referred a patient of mine with mu...",2013-03-05 18:45:07


In [8]:
restaurants_df = yelp_merged_df[yelp_merged_df['categories'].str.contains('Restaurant', case=False, na=False)]

print(f"Number of restaurants: {restaurants_df.shape[0]}")


Number of restaurants: 4724684


In [9]:
restaurants_df.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars_x,review_count,...,categories,hours,review_id,user_id,stars_y,useful,funny,cool,text,date
46,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,...,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ...",BXQcBN0iAi1lAUxibGLFzA,6_SpY41LIHZuIaiDs5FMKA,4,0,0,1,This is nice little Chinese bakery in the hear...,2014-05-26 01:09:53
47,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,...,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ...",uduvUCvi9w3T2bSGivCfXg,tCXElwhzekJEH6QJe3xs7Q,4,3,1,2,This is the bakery I usually go to in Chinatow...,2013-10-05 15:19:06
48,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,...,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ...",a0vwPOqDXXZuJkbBW2356g,WqfKtI-aGMmvbA9pPUxNQQ,5,0,0,0,"A delightful find in Chinatown! Very clean, an...",2013-10-25 01:34:57
49,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,...,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ...",MKNp_CdR2k2202-c8GN5Dw,3-1va0IQfK-9tUMzfHWfTA,5,5,0,5,I ordered a graduation cake for my niece and i...,2018-05-20 17:58:57
50,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,...,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ...",D1GisLDPe84Rrk_R4X2brQ,EouCKoDfzaVG0klEgdDvCQ,4,2,1,1,HK-STYLE MILK TEA: FOUR STARS\n\nNot quite su...,2013-10-25 02:31:35


In [None]:
# Selected a subset with customers and products with at least 100 reviews
# Step 1: Filter customers with at least 100 reviews
yelp_customer_review_counts = restaurants_df.groupby('user_id').size().reset_index(name='review_count')
yelp_customers_with_at_least_100_reviews = customer_review_counts[customer_review_counts['review_count'] >= 100]

# Step 2: Filter products with at least 100 reviews
yelp_review_counts = restaurants_df.groupby('business_id').size().reset_index(name='review_count')
yelp_with_at_least_100_reviews = hotel_review_counts[hotel_review_counts['review_count'] >= 100]

# Step 3: Filter the original dataset to only include customers and products with at least 100 reviews
yelp_filtered_data = restaurants_df[
    (restaurants_df['user_id'].isin(yelp_customers_with_at_least_100_reviews['user_id'])) &
    (restaurants_df['hotel_id'].isin(yelp_with_at_least_100_reviews['hotel_id']))
]
filtered_data.shape