In [1]:
import pandas as pd
import numpy as np

import optuna

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read data from Cleaned JSON
df = pd.read_json('ratebeer_cleaned.json.gz', compression='gzip', lines=True)

In [3]:
# Preprocess the data
le_user = LabelEncoder()
le_item = LabelEncoder()

df['user_id'] = le_user.fit_transform(df['review/profileName'])
df['item_id'] = le_item.fit_transform(df['beer/beerId'])

# Convert 'review/taste' to numeric and normalize
df['rating'] = df['review/taste'].apply(lambda x: int(x.split('/')[0]) / int(x.split('/')[1]))


In [4]:
# Split the data into training and testing sets
train_data, test_data = train_test_split(df, test_size=0.1, random_state=42)

# Initialize model parameters
n_users = df['user_id'].nunique()
n_items = df['item_id'].nunique()
n_factors = 10  # Number of latent factors

alpha = np.mean(train_data['rating'])
beta_u = np.zeros(n_users)
beta_i = np.zeros(n_items)
gamma_u = np.random.normal(0, 0.1, (n_users, n_factors))
gamma_i = np.random.normal(0, 0.1, (n_items, n_factors))

# Hyperparameters
learning_rate = 0.01
regularization = 0.02
n_epochs = 50

In [5]:
# Function to train the model and return test loss
def objective(trial):
    # Hyperparameters to optimize
    # learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
    reg1 = trial.suggest_loguniform('reg1', 1e-4, 1e-1)  # Regularization for biases
    reg2 = trial.suggest_loguniform('reg2', 1e-4, 1e-1)  # Regularization for latent factors
    n_factors = trial.suggest_int('n_factors', 1, 10)  # Number of latent factors
    learning_rate = 0.01

    # Initialize model parameters
    n_users = df['user_id'].nunique()
    n_items = df['item_id'].nunique()
    # n_factors = 10  # Number of latent factors

    alpha = np.mean(train_data['rating'])
    beta_u = np.zeros(n_users)
    beta_i = np.zeros(n_items)
    gamma_u = np.random.normal(0, 0.1, (n_users, n_factors))
    gamma_i = np.random.normal(0, 0.1, (n_items, n_factors))

    # Training loop
    for epoch in range(1):  # Use a smaller number of epochs for faster optimization
        for _, row in train_data.iterrows():
            u, i, r = row['user_id'], row['item_id'], row['rating']
            
            # Predict rating
            r_pred = alpha + beta_u[u] + beta_i[i] + np.dot(gamma_u[u], gamma_i[i])
            
            # Compute error
            e = r - r_pred
            
            # Update parameters with separate regularizations
            beta_u[u] += learning_rate * (e - reg1 * beta_u[u])
            beta_i[i] += learning_rate * (e - reg1 * beta_i[i])
            gamma_u[u] += learning_rate * (e * gamma_i[i] - reg2 * gamma_u[u])
            gamma_i[i] += learning_rate * (e * gamma_u[u] - reg2 * gamma_i[i])

    # Compute test loss
    test_loss = 0
    for _, row in test_data.iterrows():
        u, i, r = row['user_id'], row['item_id'], row['rating']
        r_pred = alpha + beta_u[u] + beta_i[i] + np.dot(gamma_u[u], gamma_i[i])
        test_loss += (r - r_pred) ** 2
    test_loss /= len(test_data)
    
    return test_loss

In [6]:
# Use Optuna to find the best hyperparameters
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=500)

print("Best hyperparameters: ", study.best_params)


[I 2024-12-03 01:22:33,000] A new study created in memory with name: no-name-9c3966bb-4b1c-4d26-92bc-1c31491d494e
  reg1 = trial.suggest_loguniform('reg1', 1e-4, 1e-1)  # Regularization for biases
  reg2 = trial.suggest_loguniform('reg2', 1e-4, 1e-1)  # Regularization for latent factors
[I 2024-12-03 01:23:19,410] Trial 0 finished with value: 0.013433345612251308 and parameters: {'reg1': 0.00011320889941452063, 'reg2': 0.0013627942564387147, 'n_factors': 3}. Best is trial 0 with value: 0.013433345612251308.
[I 2024-12-03 01:24:07,392] Trial 1 finished with value: 0.013545354146855154 and parameters: {'reg1': 0.0003297443902782769, 'reg2': 0.011828070866803014, 'n_factors': 6}. Best is trial 0 with value: 0.013433345612251308.
[I 2024-12-03 01:24:53,699] Trial 2 finished with value: 0.013824572019084 and parameters: {'reg1': 0.00011174493178332685, 'reg2': 0.0008819476190752157, 'n_factors': 8}. Best is trial 0 with value: 0.013433345612251308.
[I 2024-12-03 01:25:40,075] Trial 3 finish

Best hyperparameters:  {'reg1': 0.00017214465658472533, 'reg2': 0.09858916869989881, 'n_factors': 1}


In [7]:
# Train final model with best hyperparameters found by Optuna
# best_learning_rate = study.best_params['learning_rate']
best_learning_rate = 0.01
best_reg1 = study.best_params['reg1']
best_reg2 = study.best_params['reg2']
n_factors = study.best_params['n_factors']

# Reinitialize model parameters using best hyperparameters
alpha = np.mean(train_data['rating'])
beta_u = np.zeros(n_users)
beta_i = np.zeros(n_items)
gamma_u = np.random.normal(0, 0.1, (n_users, n_factors))
gamma_i = np.random.normal(0, 0.1, (n_items, n_factors))


In [8]:
# Final training loop with best hyperparameters
for epoch in range(50):  # Full number of epochs for final training
    for _, row in train_data.iterrows():
        u, i, r = row['user_id'], row['item_id'], row['rating']
        
        # Predict rating
        r_pred = alpha + beta_u[u] + beta_i[i] + np.dot(gamma_u[u], gamma_i[i])
        
        # Compute error
        e = r - r_pred
        
        # Update parameters with separate regularizations
        beta_u[u] += best_learning_rate * (e - best_reg1 * beta_u[u])
        beta_i[i] += best_learning_rate * (e - best_reg1 * beta_i[i])
        gamma_u[u] += best_learning_rate * (e * gamma_i[i] - best_reg2 * gamma_u[u])
        gamma_i[i] += best_learning_rate * (e * gamma_u[u] - best_reg2 * gamma_i[i])

    # Compute and print test loss after each epoch during final training
    test_loss = 0
    for _, row in test_data.iterrows():
        u, i, r = row['user_id'], row['item_id'], row['rating']
        r_pred = alpha + beta_u[u] + beta_i[i] + np.dot(gamma_u[u], gamma_i[i])
        test_loss += (r - r_pred) ** 2
    test_loss /= len(test_data)
    
    print(f"Epoch {epoch + 1}/50, Test Loss: {test_loss:.4f}")


Epoch 1/50, Test Loss: 0.0132
Epoch 2/50, Test Loss: 0.0131
Epoch 3/50, Test Loss: 0.0130
Epoch 4/50, Test Loss: 0.0130
Epoch 5/50, Test Loss: 0.0130
Epoch 6/50, Test Loss: 0.0129
Epoch 7/50, Test Loss: 0.0129
Epoch 8/50, Test Loss: 0.0129
Epoch 9/50, Test Loss: 0.0129
Epoch 10/50, Test Loss: 0.0129
Epoch 11/50, Test Loss: 0.0129
Epoch 12/50, Test Loss: 0.0129
Epoch 13/50, Test Loss: 0.0129
Epoch 14/50, Test Loss: 0.0129
Epoch 15/50, Test Loss: 0.0129
Epoch 16/50, Test Loss: 0.0129
Epoch 17/50, Test Loss: 0.0129
Epoch 18/50, Test Loss: 0.0129
Epoch 19/50, Test Loss: 0.0129
Epoch 20/50, Test Loss: 0.0129
Epoch 21/50, Test Loss: 0.0129
Epoch 22/50, Test Loss: 0.0130
Epoch 23/50, Test Loss: 0.0130
Epoch 24/50, Test Loss: 0.0130
Epoch 25/50, Test Loss: 0.0130
Epoch 26/50, Test Loss: 0.0130
Epoch 27/50, Test Loss: 0.0130
Epoch 28/50, Test Loss: 0.0130
Epoch 29/50, Test Loss: 0.0130
Epoch 30/50, Test Loss: 0.0130
Epoch 31/50, Test Loss: 0.0130
Epoch 32/50, Test Loss: 0.0130
Epoch 33/50, Test

In [9]:
# Function to predict rating given beerID and profileName using final trained model
def predict_rating(beer_id, profile_name):
    u = le_user.transform([profile_name])[0]
    i = le_item.transform([beer_id])[0]
    return alpha + beta_u[u] + beta_i[i] + np.dot(gamma_u[u], gamma_i[i])

# Example usage with final model
beer_id = '51'
profile_name = 'azlondon'
predicted_rating = predict_rating(beer_id, profile_name)
print(f"Predicted rating for beer {beer_id} by user {profile_name}: {predicted_rating:.2f}")

Predicted rating for beer 51 by user azlondon: 0.74


In [14]:
eval(df[(df['beer/beerId'] == 51) & (df['review/profileName'] == 'azlondon')]['review/taste'].item())

0.8