In [1]:
import pandas as pd
from sklearn.metrics import mean_squared_error
import numpy as np
from collections import defaultdict
from sklearn.model_selection import train_test_split
from itertools import product
from surprise import Dataset, Reader, SVD, accuracy
import pandas as pd

In [2]:
# Load the dataset
file_path = './hawaii_ratings.csv.gz'
df = pd.read_csv(file_path)

In [3]:
# Split into training and temp (for validation + test)
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)

# Split temp into validation and test
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)
test_df_baseline = test_df.copy()

# Print sizes to verify
print("Training set size:", len(train_df))
print("Validation set size:", len(val_df))
print("Test set size:", len(test_df))

Training set size: 2459448
Validation set size: 307431
Test set size: 307431


In [4]:
# Baseline
globalAverage = train_df['rating'].mean()

userAverage = train_df.groupby('user')['rating'].mean().to_dict()

predictions = []

for _, row in test_df_baseline.iterrows():
    user = row['user']
    business = row['business']
    if user in userAverage:
        predictions.append(userAverage[user])
    else:
        predictions.append(globalAverage)

test_df_baseline['prediction'] = predictions
rmse = np.sqrt(mean_squared_error(test_df_baseline['rating'], test_df_baseline['prediction']))
print("RMSE:", rmse)

RMSE: 1.0215813911155576


In [5]:
reader = Reader(rating_scale=(1, 5))

train_data = Dataset.load_from_df(train_df[['user', 'business', 'rating']], reader)
trainset = train_data.build_full_trainset()

validset = list(zip(val_df['user'], val_df['business'], val_df['rating']))

param_grid = {
    'n_factors': [1],
    'n_epochs': [20],
    'lr_bu': [0.01],
    'lr_bi': [0.003],
    'lr_pu': [0.001],
    'lr_qi': [0.001],
    'reg_all': [0.02],
}

param_names = list(param_grid.keys())
param_values = list(param_grid.values())
param_combinations = [dict(zip(param_names, v)) for v in product(*param_values)]

best_rmse = float('inf')
best_params = None

# Hyperparameter tuning
for params in param_combinations:
    algo_params = {k: v for k, v in params.items() if v is not None}
    algo_params['random_state'] = 1234
    algo = SVD(**algo_params)
    algo.fit(trainset)
    predictions = algo.test(validset)
    rmse = accuracy.rmse(predictions, verbose=False)
    print(f"Parameters: {algo_params} => RMSE: {rmse:.4f}")
    if rmse < best_rmse:
        best_rmse = rmse
        best_params = algo_params

print(f"\nBest RMSE: {best_rmse:.4f} with parameters: {best_params}")

# Use best parameters to make predictions on training and validation sets
best_params['random_state'] = 1234
best_algo = SVD(**best_params)
best_algo.fit(trainset)

# Make predictions on training set
train_predictions = [
    best_algo.predict(row['user'], row['business']).est for _, row in train_df.iterrows()
]
train_df['prediction'] = train_predictions

# Make predictions on validation set
val_predictions = [
    best_algo.predict(row['user'], row['business']).est for _, row in val_df.iterrows()
]
val_df['prediction'] = val_predictions

# Calculate and print RMSE for training and validation sets
train_rmse = np.sqrt(mean_squared_error(train_df['rating'], train_df['prediction']))
val_rmse = np.sqrt(mean_squared_error(val_df['rating'], val_df['prediction']))
print(f"\nTraining RMSE: {train_rmse:.4f}")
print(f"Validation RMSE: {val_rmse:.4f}")


Parameters: {'n_factors': 1, 'n_epochs': 20, 'lr_bu': 0.01, 'lr_bi': 0.003, 'lr_pu': 0.001, 'lr_qi': 0.001, 'reg_all': 0.02, 'random_state': 1234} => RMSE: 0.9162

Best RMSE: 0.9162 with parameters: {'n_factors': 1, 'n_epochs': 20, 'lr_bu': 0.01, 'lr_bi': 0.003, 'lr_pu': 0.001, 'lr_qi': 0.001, 'reg_all': 0.02, 'random_state': 1234}

Training RMSE: 0.8068
Validation RMSE: 0.9162


In [10]:
best_params['random_state'] = 1234
algo = SVD(**best_params)
algo.fit(trainset)

predictions = [
    algo.predict(row['user'], row['business']).est for _, row in test_df.iterrows()
]

test_df['prediction'] = predictions

if 'rating' in test_df.columns:
    rmse = np.sqrt(mean_squared_error(test_df['rating'], test_df['prediction']))
    print(f"Test RMSE: {rmse:.4f}")

Test RMSE: 0.9225


In [8]:
# Load the dataset
metadata_path = './hawaii_metadata.json.gz'
metadata_df = pd.read_json(metadata_path, lines=True)

metadata_df.columns

Index(['name', 'address', 'gmap_id', 'description', 'latitude', 'longitude',
       'category', 'avg_rating', 'num_of_reviews', 'price', 'hours', 'MISC',
       'state', 'relative_results', 'url'],
      dtype='object')

In [6]:
from sklearn.linear_model import LinearRegression

In [11]:
# Create feature dataframe from metadata
feature_df = metadata_df[['gmap_id', 'avg_rating', 'num_of_reviews', 'price']].copy()
feature_df['price_len'] = feature_df['price'].str.len()
feature_df = feature_df.drop('price', axis=1)

# Combine features with training data
train_features = train_df.merge(feature_df, left_on='business', right_on='gmap_id', how='left')
val_features = val_df.merge(feature_df, left_on='business', right_on='gmap_id', how='left')
test_features = test_df.merge(feature_df, left_on='business', right_on='gmap_id', how='left')

# Fill any missing values with means
for df in [train_features, val_features, test_features]:
    df['avg_rating'] = df['avg_rating'].fillna(df['avg_rating'].mean())
    df['num_of_reviews'] = df['num_of_reviews'].fillna(df['num_of_reviews'].mean())
    df['price_len'] = df['price_len'].fillna(df['price_len'].mean())

# Prepare feature matrices
X_train = train_features[['prediction', 'avg_rating', 'num_of_reviews', 'price_len']]
X_val = val_features[['prediction', 'avg_rating', 'num_of_reviews', 'price_len']]
X_test = test_features[['prediction', 'avg_rating', 'num_of_reviews', 'price_len']]

y_train = train_features['rating']
y_val = val_features['rating']

# Train linear regression model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Make final predictions
train_final_pred = lr_model.predict(X_train)
val_final_pred = lr_model.predict(X_val)
test_final_pred = lr_model.predict(X_test)

# Calculate and print final RMSE scores
train_final_rmse = np.sqrt(mean_squared_error(y_train, train_final_pred))
val_final_rmse = np.sqrt(mean_squared_error(y_val, val_final_pred))
print("\nFinal Model Results:")
print(f"Training RMSE: {train_final_rmse:.4f}")
print(f"Validation RMSE: {val_final_rmse:.4f}")

if 'rating' in test_df.columns:
    test_final_rmse = np.sqrt(mean_squared_error(test_features['rating'], test_final_pred))
    print(f"Test RMSE: {test_final_rmse:.4f}")

# Add print statements to debug the sizes
print("Original dataframe sizes:")
print(f"Train: {len(train_df)}")
print(f"Val: {len(val_df)}")
print(f"Test: {len(test_df)}")

print("\nAfter merge sizes:")
print(f"Train features: {len(train_features)}")
print(f"Val features: {len(val_features)}")
print(f"Test features: {len(test_features)}")

# Modify the final prediction assignments to match the merged dataframes
train_df = train_features[['user', 'business', 'rating', 'prediction']].copy()
train_df['final_prediction'] = train_final_pred

val_df = val_features[['user', 'business', 'rating', 'prediction']].copy()
val_df['final_prediction'] = val_final_pred

test_df = test_features[['user', 'business', 'rating', 'prediction']].copy()
test_df['final_prediction'] = test_final_pred
# Print model coefficients and intercept
print("\nModel Coefficients:")
feature_names = ['prediction', 'avg_rating', 'num_of_reviews', 'price_len']
for feature, coef in zip(feature_names, lr_model.coef_):
    print(f"{feature}: {coef:.4f}")
print(f"Intercept: {lr_model.intercept_:.4f}")



Final Model Results:
Training RMSE: 0.7812
Validation RMSE: 0.9414
Test RMSE: 0.9484
Original dataframe sizes:
Train: 2459448
Val: 307431
Test: 307431

After merge sizes:
Train features: 2462415
Val features: 307808
Test features: 307795

Model Coefficients:
prediction: 1.5309
avg_rating: -0.4604
num_of_reviews: 0.0000
price_len: -0.0000
Intercept: -0.2966


In [39]:
import torch
import torch.nn as nn
import torch.optim as optim


In [None]:
# Create feature dataframe from metadata
feature_df = metadata_df[['gmap_id', 'avg_rating', 'num_of_reviews', 'price']].copy()
feature_df['price_len'] = feature_df['price'].str.len()
feature_df = feature_df.drop('price', axis=1)

# Combine features with training data
train_features = train_df.merge(feature_df, left_on='business', right_on='gmap_id', how='left')
val_features = val_df.merge(feature_df, left_on='business', right_on='gmap_id', how='left')
test_features = test_df.merge(feature_df, left_on='business', right_on='gmap_id', how='left')

# Fill any missing values with means
for df in [train_features, val_features, test_features]:
    df['avg_rating'] = df['avg_rating'].fillna(df['avg_rating'].mean())
    df['num_of_reviews'] = df['num_of_reviews'].fillna(df['num_of_reviews'].mean())
    df['price_len'] = df['price_len'].fillna(df['price_len'].mean())

# Prepare feature matrices
X_train = train_features[['prediction', 'avg_rating', 'num_of_reviews', 'price_len']].values
X_val = val_features[['prediction', 'avg_rating', 'num_of_reviews', 'price_len']].values
X_test = test_features[['prediction', 'avg_rating', 'num_of_reviews', 'price_len']].values

y_train = train_features['rating'].values
y_val = val_features['rating'].values

X_train = torch.FloatTensor(X_train)
X_val = torch.FloatTensor(X_val)
X_test = torch.FloatTensor(X_test)
y_train = torch.FloatTensor(y_train)
y_val = torch.FloatTensor(y_val)

class RatingPredictor(nn.Module):
    def __init__(self, input_size=4):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_size, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 1)
        )
    
    def forward(self, x):
        return self.layers(x)

model = RatingPredictor()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

n_epochs = 100
batch_size = 64
for epoch in range(n_epochs):
    model.train()
    for i in range(0, len(X_train), batch_size):
        batch_X = X_train[i:i+batch_size]
        batch_y = y_train[i:i+batch_size]
        
        optimizer.zero_grad()
        outputs = model(batch_X).squeeze()
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
    
    model.eval()
    with torch.no_grad():
        train_preds = model(X_train).squeeze()
        val_preds = model(X_val).squeeze()
        train_rmse = torch.sqrt(criterion(train_preds, y_train))
        val_rmse = torch.sqrt(criterion(val_preds, y_val))
        
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}/{n_epochs}")
        print(f"Training RMSE: {train_rmse:.4f}")
        print(f"Validation RMSE: {val_rmse:.4f}\n")

model.eval()
with torch.no_grad():
    train_final_pred = model(X_train).squeeze().numpy()
    val_final_pred = model(X_val).squeeze().numpy()
    test_final_pred = model(X_test).squeeze().numpy()

train_final_rmse = np.sqrt(mean_squared_error(y_train, train_final_pred))
val_final_rmse = np.sqrt(mean_squared_error(y_val, val_final_pred))
print("\nFinal Model Results:")
print(f"Training RMSE: {train_final_rmse:.4f}")
print(f"Validation RMSE: {val_final_rmse:.4f}")

if 'rating' in test_df.columns:
    test_final_rmse = np.sqrt(mean_squared_error(test_features['rating'], test_final_pred))
    print(f"Test RMSE: {test_final_rmse:.4f}")

print("\nOriginal dataframe sizes:")
print(f"Train: {len(train_df)}")
print(f"Val: {len(val_df)}")
print(f"Test: {len(test_df)}")

print("\nAfter merge sizes:")
print(f"Train features: {len(train_features)}")
print(f"Val features: {len(val_features)}")
print(f"Test features: {len(test_features)}")

train_df = train_features[['user', 'business', 'rating', 'prediction']].copy()
train_df['final_prediction'] = train_final_pred

val_df = val_features[['user', 'business', 'rating', 'prediction']].copy()
val_df['final_prediction'] = val_final_pred

test_df = test_features[['user', 'business', 'rating', 'prediction']].copy()
test_df['final_prediction'] = test_final_pred
