## Book Recommender System using Goodreads Ratings

This Jupyter notebook implements a book recommender system using ratings data from Goodreads, a popular online platform for book enthusiasts.

In [1]:
import os
import json
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn

import data

from IPython.display import HTML as html_print
from IPython.display import Markdown

In [2]:
# --- notebook configuration: adjust these values before running ---

# folder holding the dataset files, and the file the model weights are saved to / loaded from
data_dir = 'data'
model_save_path = 'models/goodreads_recsys.pth'

# True: load the weights stored at model_save_path; False: train a new model
skip_training = True
# passed to the dataset as its `explicit` flag; also selects the rating scale
# used when predictions are displayed
explicit = False

# do not truncate the rows of displayed DataFrames
pd.set_option('display.max_rows', None)

In [3]:
# make sure the data, model and log folders exist (nothing happens if they already do)
for required_dir in (data_dir, 'models', 'logs'):
    os.makedirs(required_dir, exist_ok=True)

In [4]:
# choose the compute device: runs that only load a saved model stay on the CPU,
# training runs use the first CUDA GPU when one is available
train_on_gpu = (not skip_training) and torch.cuda.is_available()
device_type = 'cuda:0' if train_on_gpu else 'cpu'

# set manually if needed e.g. device_type = 'cpu'
print("Using device type:", device_type)
device = torch.device(device_type)

Using device type: cpu


In [5]:
# build the train and test splits with identical settings (train first, then test)
trainset, testset = (
    data.GoodReadsRatingsDataset(root=data_dir, mode=split, explicit=explicit)
    for split in ('train', 'test')
)

Dataset: train, User count: 6769, Book count: 92034, Ratings count: 966946
Dataset: test, User count: 6376, Book count: 79887, Ratings count: 288004


In [6]:
# training batches are shuffled; the test loader keeps the dataset order so that
# its concatenated predictions can later be indexed with test-set positions
trainloader = torch.utils.data.DataLoader(
    dataset=trainset,
    batch_size=1024,
    shuffle=True,
)
testloader = torch.utils.data.DataLoader(
    dataset=testset,
    batch_size=128,
    shuffle=False,
)

In [7]:
class RecommenderSystem(nn.Module):
    """Feed-forward rating predictor over concatenated user and item embeddings.

    The user and item embeddings are concatenated and passed through two hidden
    ReLU layers (each followed by dropout) and a final sigmoid unit, so every
    prediction lies between 0 and 1.
    """

    def __init__(self, n_users, n_items, user_dim=20, item_dim=100,
                 hidden_dims=(100, 10), dropout=0.02):
        """
        Args:
          n_users: Number of users.
          n_items: Number of items.
          user_dim: Size of each user embedding vector.
          item_dim: Size of each item embedding vector.
          hidden_dims: Pair with the widths of the two hidden layers.
          dropout: Dropout probability applied after the embeddings and after
            each hidden layer.
        """
        super().__init__()
        hidden1, hidden2 = hidden_dims

        self.user_em = nn.Embedding(n_users, user_dim)
        self.item_em = nn.Embedding(n_items, item_dim)
        self.drop0 = nn.Dropout(dropout)

        # the first layer consumes the concatenated [user | item] embedding
        self.fc1 = nn.Linear(user_dim + item_dim, hidden1)
        self.relu1 = nn.ReLU()
        self.drop1 = nn.Dropout(dropout)

        self.fc2 = nn.Linear(hidden1, hidden2)
        self.relu2 = nn.ReLU()
        self.drop2 = nn.Dropout(dropout)

        self.fc3 = nn.Linear(hidden2, 1)

    def forward(self, user_ids, item_ids):
        """
        Args:
          user_ids of shape (batch_size): User ids (starting from 0).
          item_ids of shape (batch_size): Item ids (starting from 0).

        Returns:
          outputs of shape (batch_size): Predictions of ratings, squashed by a
          sigmoid.
        """
        x = torch.cat([self.user_em(user_ids), self.item_em(item_ids)], dim=1)
        x = self.drop0(x)

        x = self.drop1(self.relu1(self.fc1(x)))
        x = self.drop2(self.relu2(self.fc2(x)))

        x = torch.sigmoid(self.fc3(x))
        # flatten (batch_size, 1) to (batch_size)
        return x.view(-1)

In [8]:
# build the network from the training set's user/item counts and move it to the chosen device
model = RecommenderSystem(n_users=trainset.n_users, n_items=trainset.n_items)
model.to(device)

RecommenderSystem(
  (user_em): Embedding(6769, 20)
  (item_em): Embedding(92034, 100)
  (drop0): Dropout(p=0.02, inplace=False)
  (fc1): Linear(in_features=120, out_features=100, bias=True)
  (relu1): ReLU()
  (drop1): Dropout(p=0.02, inplace=False)
  (fc2): Linear(in_features=100, out_features=10, bias=True)
  (relu2): ReLU()
  (drop2): Dropout(p=0.02, inplace=False)
  (fc3): Linear(in_features=10, out_features=1, bias=True)
)

In [9]:
# computes the loss:
def compute_loss(model, testloader, device=None):
    """Evaluate `model` on every batch of `testloader` with mean squared error.

    Args:
      model: Module called as model(user_ids, item_ids). It is switched to
        eval mode and left in that mode.
      testloader: Iterable of (user_ids, item_ids, labels) tensor batches.
      device: Device the batches are moved to. Defaults to the device of the
        model's first parameter (CPU for a model without parameters).

    Returns:
      (predictions, loss): the predictions of all batches concatenated on the
      CPU in loader order, and the MSE averaged over all samples as a float.

    Raises:
      ValueError: If `testloader` yields no samples.
    """
    if device is None:
        # follow the model instead of depending on a notebook-level global
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    cost = nn.MSELoss()
    total_loss = 0
    total_data = 0
    prediction_list = []

    with torch.no_grad():
        for user_ids, item_ids, labels in testloader:
            user_ids, item_ids, labels = user_ids.to(device), item_ids.to(device), labels.to(device)
            predictions = model(user_ids, item_ids)
            prediction_list.append(predictions.cpu())

            # MSELoss returns the batch mean; weight it by the batch size so that
            # a smaller final batch does not skew the overall average
            loss = cost(predictions, labels)
            total_loss += (loss.item() * labels.size(0))
            total_data += labels.size(0)

    if total_data == 0:
        raise ValueError("compute_loss received an empty loader; the loss is undefined")

    loss = total_loss / total_data
    return torch.cat(prediction_list), loss

In [10]:
# training loop
if not skip_training:
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.001)
    cost = nn.MSELoss()

    # per-epoch history of the train and test losses
    hist_metrics = {'epoch': [], 'train_loss': [], 'test_loss': []}

    epochs = 20
    for epoch in range(epochs):
        # compute_loss() leaves the model in eval mode, so training mode is
        # re-enabled at the start of every epoch
        model.train()

        # reset the running sums every epoch: accumulating them across epochs
        # would report a cumulative average over all epochs so far, not the
        # loss of the current epoch
        total_loss = 0
        total_data = 0

        for user_ids, item_ids, labels in trainloader:
            user_ids, item_ids, labels = user_ids.to(device), item_ids.to(device), labels.to(device)

            optimizer.zero_grad()
            predictions = model(user_ids, item_ids)

            loss = cost(predictions, labels)
            # the criterion returns the batch mean; weight it by the batch size
            total_loss += (loss.item() * labels.size(0))
            total_data += labels.size(0)
            loss.backward()
            optimizer.step()

        train_loss = total_loss / total_data
        _, test_loss = compute_loss(model, testloader)

        hist_metrics['epoch'].append(epoch)
        hist_metrics['train_loss'].append(train_loss)
        hist_metrics['test_loss'].append(test_loss)

        print('Epoch {}: Train error: {:.4f}, Test error: {:.4f}'.format(epoch, train_loss, test_loss))

In [11]:
# plot and save historical train/test loss
def plot_metrics(metrics, save_path='logs/goodreads_recsys_{}.{}'):
    """Draw the train/test loss curves, then save the figure and the raw history.

    Args:
      metrics: Dict providing 'train_loss' and 'test_loss' sequences; the
        whole dict is also written out as JSON.
      save_path: Template with two `{}` slots (name, extension) from which the
        PNG and JSON file names are derived.
    """
    figure_file = save_path.format('loss', "png")
    history_file = save_path.format("hist", "json")

    for curve in ('train_loss', 'test_loss'):
        plt.plot(metrics[curve])
    plt.xlabel("epochs")
    plt.ylabel("loss")
    plt.legend(['train', 'test'], loc='upper left')
    plt.savefig(figure_file)
    plt.show()

    with open(history_file, 'w') as history_fp:
        json.dump(metrics, history_fp)

In [12]:
# after a training run: store the learned weights, then plot and save the loss history
if not skip_training:
    torch.save(obj=model.state_dict(), f=model_save_path)
    plot_metrics(metrics=hist_metrics)

In [13]:
# inference-only run: rebuild the network and restore the saved weights
if skip_training:
    model = RecommenderSystem(n_users=trainset.n_users, n_items=trainset.n_items)
    # NOTE(review): torch.load may unpickle arbitrary objects depending on the
    # torch version and its weights_only setting; only load trusted checkpoints.
    # map_location='cpu' keeps every tensor on the CPU while loading.
    model.load_state_dict(torch.load(model_save_path, map_location='cpu'))
    print('Model loaded from: {}'.format(model_save_path))
    model.to(device)
    model.eval()

Model loaded from: models/goodreads_recsys.pth


In [14]:
# evaluate on the whole test set; `predictions` holds one score per test record in
# test-set order (testloader is not shuffled) and is indexed by test-set position
# in the hit-rate computation further down
predictions, loss = compute_loss(model, testloader)
print('Test loss: {:.4f}'.format(loss))

Test loss: 0.1816


In [15]:
# book metadata indexed by the encoded book id
book_titles_df = (
    pd.read_csv(os.path.join(data_dir, 'book_titles.csv'), header=0)
    .set_index('encoded_book_id')
)

In [16]:
def _topk_hit_rate(user_predictions, user_labels, k):
    """Mean label of the `k` highest-scored records of a single user."""
    _, top_reco = torch.topk(user_predictions, k=k)
    return user_labels[top_reco].sum() / k


next_user_i = 0
top_half_hit = 0
top10_hit = 0
top01_hit = 0
count = 0

# Walk through the test set one user at a time and average, over users, the
# labels of the best-scored records (top half, top 10, top 1).
# NOTE(review): this relies on each user's records being stored contiguously in
# testset and on `predictions` being aligned with testset positions - confirm.
while next_user_i < len(testset):
    user_id = testset[next_user_i][0]
    last_user_i = next_user_i
    while next_user_i < len(testset) and testset[next_user_i][0] == user_id:
        next_user_i += 1
    indices = range(last_user_i, next_user_i)

    labels = testset[indices][2]
    user_predictions = predictions[last_user_i:next_user_i]

    # always rank at least one record: a user with a single test record would
    # otherwise give k = 0 and a 0/0 = nan that poisons the running averages
    half_k = max(len(indices) // 2, 1)

    top_half_hit += _topk_hit_rate(user_predictions, labels, half_k)
    top10_hit += _topk_hit_rate(user_predictions, labels, min(half_k, 10))
    top01_hit += _topk_hit_rate(user_predictions, labels, 1)

    count += 1

if count == 0:
    raise ValueError("testset is empty; hit rates are undefined")

top_half_hit = top_half_hit / count
print("top 50% hit rate:", top_half_hit.item())
top10_hit = top10_hit / count
print("top 10 hit rate:", top10_hit.item())
top01_hit = top01_hit / count
print("top 1 hit rate:", top01_hit.item())

top 50% hit rate: 0.7215026617050171
top 10 hit rate: 0.7813742160797119
top 1 hit rate: 0.8574341535568237


In [17]:
# Score the test records returned for one user and show, per book, whether the
# model's recommendation agrees with the label.
indices = testset.get_user_record()
sampleset = torch.utils.data.Subset(testset, indices)
# a single batch holding every selected record
sampleloader = torch.utils.data.DataLoader(sampleset, batch_size=len(indices), shuffle=False)

# the network ends in a sigmoid (outputs between 0 and 1); max_range rescales
# labels and scores for display
if explicit:
    max_range = 5.0
else:
    max_range = 1.0

# scores at or above this value are shown as "recommend"
recommend_threshold = 0.5
yes_html = '<span style="color: #00ff00">yes</span>'
no_html = '<span style="color: #ff0000">no</span>'
yes_no = {True: yes_html, False: no_html}

with torch.no_grad():
    for user_ids, item_ids, labels in sampleloader:
        print("Showing (sample) books from user_enc_id={}".format(user_ids[0]))
        user_ids, item_ids, labels = user_ids.to(device), item_ids.to(device), labels.to(device)
        # NOTE: this rebinds the notebook-level `predictions` (until here the
        # predictions for the whole test set)
        predictions = model(user_ids, item_ids)

        # .copy() makes sure the new columns are written to an independent
        # frame and never to book_titles_df itself
        tmp_df = book_titles_df.loc[item_ids.cpu().numpy()].copy()
        tmp_df['actual_ratings'] = (labels.cpu() * max_range).numpy()
        tmp_df['predicted_ratings'] = (predictions.cpu().round(decimals=2) * max_range).numpy()

        if explicit:
            tmp_df['correct'] = (tmp_df['predicted_ratings'].round() == tmp_df['actual_ratings'])
        else:
            # judge correctness with the same threshold as the "recommend?"
            # column: Series.round() rounds 0.5 to 0 (half to even), which
            # showed a score of exactly 0.50 as recommended but evaluated it
            # as not recommended
            tmp_df['correct'] = (
                (tmp_df['predicted_ratings'] >= recommend_threshold)
                == (tmp_df['actual_ratings'] == 1)
            )

        # show at most 40 randomly chosen records, read books first
        tmp_df = tmp_df.sample(n=min(len(tmp_df.index), 40))
        tmp_df = tmp_df.sort_values(by=['actual_ratings', 'title'], ascending=[False, True])

        tmp_df['read?'] = (tmp_df['actual_ratings'] == 1).map(yes_no)
        tmp_df['recommend?'] = (tmp_df['predicted_ratings'] >= recommend_threshold).map(yes_no)
        tmp_df['correctly recommended?'] = tmp_df['correct'].astype(bool).map(yes_no)

        print("{:.2f}% of the books have been correctly recommended".format(tmp_df['correct'].mean() * 100))
        display(Markdown(tmp_df[['book_id', 'title', 'read?', 'recommend?', 'correctly recommended?']].to_markdown()))

Showing (sample) books from user_enc_id=1117
85.00% of the books have been correctly recommended


|   encoded_book_id |   book_id | title                                                                                                   | read?                                   | recommend?                              | correctly recommended?                  |
|------------------:|----------:|:--------------------------------------------------------------------------------------------------------|:----------------------------------------|:----------------------------------------|:----------------------------------------|
|              6406 |    256683 | City of Bones (The Mortal Instruments, #1)                                                              | <span style="color: #00ff00">yes</span> | <span style="color: #00ff00">yes</span> | <span style="color: #00ff00">yes</span> |
|               188 |      6969 | Emma                                                                                                    | <span style="color: #00ff00">yes</span> | <span style="color: #00ff00">yes</span> | <span style="color: #00ff00">yes</span> |
|             41724 |  11857408 | Fifty Shades Darker (Fifty Shades, #2)                                                                  | <span style="color: #00ff00">yes</span> | <span style="color: #00ff00">yes</span> | <span style="color: #00ff00">yes</span> |
|             38998 |  10818853 | Fifty Shades of Grey (Fifty Shades, #1)                                                                 | <span style="color: #00ff00">yes</span> | <span style="color: #00ff00">yes</span> | <span style="color: #00ff00">yes</span> |
|              9585 |    476494 | Heart of the Dragon (Atlantis, #1)                                                                      | <span style="color: #00ff00">yes</span> | <span style="color: #00ff00">yes</span> | <span style="color: #00ff00">yes</span> |
|              9585 |    476494 | Heart of the Dragon (Atlantis, #1)                                                                      | <span style="color: #00ff00">yes</span> | <span style="color: #00ff00">yes</span> | <span style="color: #00ff00">yes</span> |
|             22674 |   3475054 | Hex Appeal (Hex #2)                                                                                     | <span style="color: #00ff00">yes</span> | <span style="color: #00ff00">yes</span> | <span style="color: #00ff00">yes</span> |
|             26156 |   6376794 | Hex in High Heels (Hex, #4)                                                                             | <span style="color: #00ff00">yes</span> | <span style="color: #ff0000">no</span>  | <span style="color: #ff0000">no</span>  |
|             26156 |   6376794 | Hex in High Heels (Hex, #4)                                                                             | <span style="color: #00ff00">yes</span> | <span style="color: #ff0000">no</span>  | <span style="color: #ff0000">no</span>  |
|              4684 |    146744 | Jewel of Atlantis (Atlantis, #2)                                                                        | <span style="color: #00ff00">yes</span> | <span style="color: #00ff00">yes</span> | <span style="color: #00ff00">yes</span> |
|              4684 |    146744 | Jewel of Atlantis (Atlantis, #2)                                                                        | <span style="color: #00ff00">yes</span> | <span style="color: #00ff00">yes</span> | <span style="color: #00ff00">yes</span> |
|              1782 |     38548 | Love Bites (Argeneau #2)                                                                                | <span style="color: #00ff00">yes</span> | <span style="color: #00ff00">yes</span> | <span style="color: #00ff00">yes</span> |
|             27888 |   6668868 | Pride Mates (Shifters Unbound, #1)                                                                      | <span style="color: #00ff00">yes</span> | <span style="color: #00ff00">yes</span> | <span style="color: #00ff00">yes</span> |
|             32115 |   8112340 | Primal Bonds (Shifters Unbound, #2)                                                                     | <span style="color: #00ff00">yes</span> | <span style="color: #00ff00">yes</span> | <span style="color: #00ff00">yes</span> |
|             31114 |   7812659 | Safe Haven                                                                                              | <span style="color: #00ff00">yes</span> | <span style="color: #00ff00">yes</span> | <span style="color: #00ff00">yes</span> |
|             60093 |  17407748 | The Longest Ride                                                                                        | <span style="color: #00ff00">yes</span> | <span style="color: #00ff00">yes</span> | <span style="color: #00ff00">yes</span> |
|              3570 |     96131 | The Nymph King (Atlantis, #3)                                                                           | <span style="color: #00ff00">yes</span> | <span style="color: #00ff00">yes</span> | <span style="color: #00ff00">yes</span> |
|              3570 |     96131 | The Nymph King (Atlantis, #3)                                                                           | <span style="color: #00ff00">yes</span> | <span style="color: #00ff00">yes</span> | <span style="color: #00ff00">yes</span> |
|              9072 |    432522 | The Pleasure Slave (Imperia, #2)                                                                        | <span style="color: #00ff00">yes</span> | <span style="color: #00ff00">yes</span> | <span style="color: #00ff00">yes</span> |
|              9945 |    501435 | Under Cover                                                                                             | <span style="color: #00ff00">yes</span> | <span style="color: #ff0000">no</span>  | <span style="color: #ff0000">no</span>  |
|             24578 |   5999949 | Wicked by Any Other Name (Hex, #3)                                                                      | <span style="color: #00ff00">yes</span> | <span style="color: #ff0000">no</span>  | <span style="color: #ff0000">no</span>  |
|             82701 |  23736027 | Alpha Contender Volume 1                                                                                | <span style="color: #ff0000">no</span>  | <span style="color: #ff0000">no</span>  | <span style="color: #00ff00">yes</span> |
|             87812 |  25944381 | Cruel Crown (Red Queen, #0.1-#0.2)                                                                      | <span style="color: #ff0000">no</span>  | <span style="color: #00ff00">yes</span> | <span style="color: #ff0000">no</span>  |
|             49240 |  13601567 | Gifted (Donovan Circus, #1)                                                                             | <span style="color: #ff0000">no</span>  | <span style="color: #ff0000">no</span>  | <span style="color: #00ff00">yes</span> |
|             58609 |  17254498 | Gölge ve Kemik  (The Grisha, #1)                                                                        | <span style="color: #ff0000">no</span>  | <span style="color: #ff0000">no</span>  | <span style="color: #00ff00">yes</span> |
|             22603 |   3430869 | Mister Mistress, Volume 2                                                                               | <span style="color: #ff0000">no</span>  | <span style="color: #ff0000">no</span>  | <span style="color: #00ff00">yes</span> |
|             79219 |  22857416 | Off Campus (Bend or Break, #1)                                                                          | <span style="color: #ff0000">no</span>  | <span style="color: #ff0000">no</span>  | <span style="color: #00ff00">yes</span> |
|             31895 |   8029972 | Our World (The Dresden Files Roleplaying Game, #2; The Dresden Files, #10.11)                           | <span style="color: #ff0000">no</span>  | <span style="color: #ff0000">no</span>  | <span style="color: #00ff00">yes</span> |
|             20414 |   2409564 | Ravenous                                                                                                | <span style="color: #ff0000">no</span>  | <span style="color: #ff0000">no</span>  | <span style="color: #00ff00">yes</span> |
|              9710 |    485933 | Record of Lodoss War: Chronicles of the Heroic Knight, Book Three (Chronicles of the Heroic Knight, #3) | <span style="color: #ff0000">no</span>  | <span style="color: #ff0000">no</span>  | <span style="color: #00ff00">yes</span> |
|             24475 |   5971977 | Rough Stock                                                                                             | <span style="color: #ff0000">no</span>  | <span style="color: #ff0000">no</span>  | <span style="color: #00ff00">yes</span> |
|             80975 |  23311422 | Shiver: 13 Sexy Tales of Humor and Horror                                                               | <span style="color: #ff0000">no</span>  | <span style="color: #ff0000">no</span>  | <span style="color: #00ff00">yes</span> |
|             33131 |   8439097 | The Ambassador's Mission (Traitor Spy Trilogy, #1)                                                      | <span style="color: #ff0000">no</span>  | <span style="color: #ff0000">no</span>  | <span style="color: #00ff00">yes</span> |
|             13781 |    851218 | The Supernaturalist                                                                                     | <span style="color: #ff0000">no</span>  | <span style="color: #ff0000">no</span>  | <span style="color: #00ff00">yes</span> |
|             30341 |   7556058 | Tracking the Tempest (Jane True, #2)                                                                    | <span style="color: #ff0000">no</span>  | <span style="color: #00ff00">yes</span> | <span style="color: #ff0000">no</span>  |
|             85388 |  25192682 | Tragic Soul (Triple Threat, #0.75)                                                                      | <span style="color: #ff0000">no</span>  | <span style="color: #ff0000">no</span>  | <span style="color: #00ff00">yes</span> |
|             79124 |  22846945 | Traitor's Blade (Greatcoats, #1)                                                                        | <span style="color: #ff0000">no</span>  | <span style="color: #ff0000">no</span>  | <span style="color: #00ff00">yes</span> |
|             25048 |   6110386 | Virgin Mistress, Scandalous Love-Child                                                                  | <span style="color: #ff0000">no</span>  | <span style="color: #ff0000">no</span>  | <span style="color: #00ff00">yes</span> |
|             83860 |  24578509 | War                                                                                                     | <span style="color: #ff0000">no</span>  | <span style="color: #ff0000">no</span>  | <span style="color: #00ff00">yes</span> |
|             15668 |   1108017 | Where Angels Go (Angels Everywhere, #6)                                                                 | <span style="color: #ff0000">no</span>  | <span style="color: #ff0000">no</span>  | <span style="color: #00ff00">yes</span> |

In [19]:
# Score every book in book_titles_df for one user and list the 20 best-scored titles.
# Uses `max_range` as set by the previous cell.
indices = testset.get_user_record()
# repeat the user id once per book so that every (user, book) pair is scored
user_ids = testset[indices[0]][0].repeat(book_titles_df.shape[0])
item_ids = torch.LongTensor(book_titles_df.index.values)

with torch.no_grad():
    user_ids, item_ids = user_ids.to(device), item_ids.to(device)
    # NOTE: this rebinds the notebook-level `predictions`
    predictions = model(user_ids, item_ids)

    scores = predictions.cpu()
    # rank on the unrounded scores: sorting values that were already rounded to
    # two decimals leaves the order inside each tie, and therefore the cut-off
    # of the top 20, arbitrary
    ranking = torch.argsort(scores, descending=True).numpy()

    tmp_df = book_titles_df.copy()
    # rounding is applied for display only
    tmp_df['predicted_ratings'] = (scores.round(decimals=2) * max_range).numpy()
    tmp_df = tmp_df.iloc[ranking]
    # hide entries without a title
    tmp_df = tmp_df.loc[tmp_df['title'].notna()]
    display(Markdown(tmp_df[['title', 'predicted_ratings']].head(20).to_markdown()))

|   encoded_book_id | title                                                                       |   predicted_ratings |
|------------------:|:----------------------------------------------------------------------------|--------------------:|
|                 2 | Harry Potter and the Sorcerer's Stone (Harry Potter, #1)                    |                0.94 |
|                 0 | Harry Potter and the Half-Blood Prince (Harry Potter, #6)                   |                0.93 |
|                 4 | Harry Potter and the Goblet of Fire (Harry Potter, #4)                      |                0.93 |
|               728 | Harry Potter and the Chamber of Secrets (Harry Potter, #2)                  |                0.93 |
|              4516 | Harry Potter and the Deathly Hallows (Harry Potter, #7)                     |                0.93 |
|                18 | The Fellowship of the Ring (The Lord of the Rings, #1)                      |                0.92 |
|               438 | A Game of Thrones (A Song of Ice and Fire, #1)                              |                0.91 |
|                57 | Pride and Prejudice                                                         |                0.91 |
|              1165 | The Lightning Thief (Percy Jackson and the Olympians, #1)                   |                0.91 |
|                 1 | Harry Potter and the Order of the Phoenix (Harry Potter, #5)                |                0.91 |
|              5298 | The Name of the Wind (The Kingkiller Chronicle, #1)                         |                0.9  |
|              3697 | The Lion, the Witch, and the Wardrobe (Chronicles of Narnia, #1)            |                0.9  |
|               634 | The Two Towers (The Lord of the Rings, #2)                                  |                0.9  |
|               843 | The Return of the King (The Lord of the Rings, #3)                          |                0.9  |
|               160 | The Hobbit                                                                  |                0.9  |
|                 8 | The Hitchhiker's Guide to the Galaxy (Hitchhiker's Guide to the Galaxy, #1) |                0.9  |
|             24562 | Blood Promise (Vampire Academy, #4)                                         |                0.89 |
|               602 | Neverwhere                                                                  |                0.89 |
|              4562 | Dead to the World (Sookie Stackhouse, #4)                                   |                0.89 |
|             26772 | Spirit Bound (Vampire Academy, #5)                                          |                0.89 |