# Review Generator
The code takes a dataset with books as input (could also be used for movies or similar). The dataset should include titles, plots, genres and reviews (specified as a number). The code then uses BERT to predict what review a book would get based on the plot, title and genre.

## Imports

Necessary libraries and modules are imported.

In [1]:
import numpy as np
import pandas as pd
import os
from transformers import AutoTokenizer, AutoModel

import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset, random_split


# Limiting libraries to use all but one CPU core, in order to prevent thread oversubscription and improve system responsiveness
torch.set_num_threads(max(1, os.cpu_count() - 1))
os.environ["OMP_NUM_THREADS"] = str(max(1, os.cpu_count() - 1))
os.environ["MKL_NUM_THREADS"] = str(max(1, os.cpu_count() - 1))

## Load data

The dataset is loaded and names are changed accordingly.

In [2]:
books = pd.read_csv("booksdata.csv",low_memory=False)
title = 'title'
plot = 'description'
genre = 'categories'
review = 'average_rating'

titles = books[title].fillna("").astype(str).tolist()
plots  = books[plot].fillna("").astype(str).tolist()
genres = books[genre].fillna("").astype(str).tolist()

# Creating pytorch device object, making tensors & models run on CPU
device = torch.device("cpu")

## Create tensors with BERT

AutoTokenizer and AutoModel are used to convert strings into tokens and then convert those into contextual embeddings. The pretrained model ("distilbert-base-uncased") is a lightweight, fast version of BERT.

In [3]:
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
bert = AutoModel.from_pretrained("distilbert-base-uncased").to(device).eval()


# Function for encoding text into embeddings
def encode_text(titles, plots, genres, batch_size=32, max_len=128):
    texts = [f"Title: {t}. Genre: {g}. Plot: {p}" for t, p, g in zip(titles, plots, genres)]
    all_embs = []

    with torch.no_grad():     # Disable gradient tracking to save memory and time
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i+batch_size]

            # Convert text into token IDs and attention masks that BERT understands
            enc = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=max_len)
            
            # Run BERT model on tokenized inputs
            outputs = bert(**enc)

            # Extract first token embedding and store it
            cls = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            all_embs.append(cls)
    
    all_embs = np.vstack(all_embs)
    return all_embs


# Create tensors (x=inputs, y=targets)
x = encode_text(titles, plots, genres, batch_size=32, max_len=128)
x = torch.from_numpy(x).float()
y = books[review].fillna(0).astype(float).to_numpy()
y = torch.from_numpy(y).float()

## Create datasets

A TensorDataset is created and split into train, val and test sets. The number in val and test can be changed to change the ratio.

In [4]:
full_dataset = TensorDataset(x, y)

# Split data
val_amount = int(len(full_dataset)*0.15)
test_amount = int(len(full_dataset)*0.15)
train_amount = len(full_dataset) - val_amount - test_amount
train_data, val_data, test_data = random_split(full_dataset, [train_amount, val_amount, test_amount])

## Create model

A simple feedforward neural network is created, using two hidden layers with ReLU activation and dropout for regularization to help prevent overfitting.

In [5]:
class BERTReviewPredictor(nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1)
        )
        
    def forward(self, x):
        return self.net(x).squeeze(1)

## Training

A function for training and evaluation the model over multiple epochs is created. The function takes a model, optimizer, loss function, number of epochs, device and dataloaders for train and val sets as input. The function prints the average training and validation losses after each epoch.

In [6]:
def train_eval(model, optimizer, loss_fn, num_epochs, train_dataloader, val_dataloader, device):
    model.to(device)
    for epoch in range(num_epochs):

        # Training
        model.train()
        train_loss = []
        for batch_x, batch_y in train_dataloader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            
            optimizer.zero_grad()
            pred = model(batch_x)
            loss = loss_fn(pred, batch_y)
            loss.backward()
            optimizer.step()
            train_loss.append(loss.item())

        # Evaluation
        model.eval()
        eval_loss = []
        with torch.no_grad():
            for batch_x, batch_y in val_dataloader:
                batch_x = batch_x.to(device)
                batch_y = batch_y.to(device)
            
                pred = model(batch_x)
                loss = loss_fn(pred, batch_y)
                eval_loss.append(loss.item())

        # Printing
        print(f"Epoch {epoch:03d} | Train Loss: {np.mean(train_loss):.4f} | Val Loss: {np.mean(eval_loss):.4f}")
        

## Run training

Hyperparameters and model architecture are chosen and the model is created and trained. The Adam optimizer and Mean Squared Error loss was chosen.

In [7]:
# Create data loaders
batch = 128
num_workers = max(1, min(8, os.cpu_count()-1))
loader_train = DataLoader(train_data, batch_size=batch, shuffle=True, num_workers=num_workers)
loader_val = DataLoader(val_data, batch_size=batch, num_workers=num_workers)
loader_test = DataLoader(test_data, batch_size=batch, num_workers=num_workers)

# Create model
input_size = 768
hidden_size = 256
model = BERTReviewPredictor(input_size, hidden_size)

# Train model
num_epochs = 10
lr = 0.0001
optimizer = Adam(model.parameters(), lr=lr)
loss_fn = nn.MSELoss()
train_eval(model,optimizer,loss_fn,num_epochs,loader_train,loader_val,"cpu")

Epoch 000 | Train Loss: 9.4014 | Val Loss: 2.2582
Epoch 001 | Train Loss: 0.6006 | Val Loss: 0.3430
Epoch 002 | Train Loss: 0.2969 | Val Loss: 0.3281
Epoch 003 | Train Loss: 0.2792 | Val Loss: 0.3136
Epoch 004 | Train Loss: 0.2814 | Val Loss: 0.3051
Epoch 005 | Train Loss: 0.2584 | Val Loss: 0.3010
Epoch 006 | Train Loss: 0.2517 | Val Loss: 0.2980
Epoch 007 | Train Loss: 0.2470 | Val Loss: 0.2959
Epoch 008 | Train Loss: 0.2503 | Val Loss: 0.2946
Epoch 009 | Train Loss: 0.2491 | Val Loss: 0.2942


## Evaluation

The model is evaluated using the test dataset. Mean Squared Error (MSE) and Mean Absolute Error (MAE) are computed to measure how accurate the predictions are.

In [8]:
# Evaluate model
model.eval()
with torch.no_grad():    # gradient computation is disabled for efficiency
    preds, labels = [], []
    for x_batch, y_batch in loader_test:
        x_batch = x_batch.to(device)
        pred = model(x_batch).cpu()
        preds.append(pred)
        labels.append(y_batch)
    preds = torch.cat(preds).squeeze()
    labels = torch.cat(labels)

# Calculate and print MSE & MAE
mse = nn.MSELoss()(preds, labels)
mae = torch.mean(torch.abs(preds - labels))
print(f"Test MSE: {mse:.4f}, MAE: {mae:.4f}")

Test MSE: 0.2131, MAE: 0.2831


## Test predictions

A function is created to allow for predicting the review of a specific book.

In [10]:
def predict_book(title: str, plot: str, genre: str, model):
    model.eval()
    xp = encode_text([title], [plot], [genre])
    xp = torch.from_numpy(xp).float()
    device = next(model.parameters()).device
    xp = xp.to(device)
    
    with torch.no_grad():
        pred = model(xp)
    if pred.numel() == 1:
        return pred.item()
    else:
        return pred.squeeze(0).cpu().numpy()


print(predict_book(
    title="Harry Potter",
    plot="A young wizard.",
    genre="fantasy",
    model=model
))

4.022887706756592
