In [1]:
import os
import json
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn

import data

In [2]:
# Notebook configuration -- edit these values before running the cells below.

# True: load the weights stored at `model_save_path`; False: train a new model.
skip_training = True
# Forwarded to the dataset constructor; also selects the scale (5.0 vs 1.0)
# applied to ratings when they are displayed further down.
explicit = False
data_dir = 'data'
model_save_path = 'models/goodreads_recsys.pth'

# None lifts the pandas row limit, so displayed frames are never truncated.
pd.set_option('display.max_rows', None)

In [3]:
# create dirs if not existing
os.makedirs(data_dir, exist_ok=True)
# Derive the checkpoint directory from the configured path instead of
# hard-coding it, so changing `model_save_path` cannot leave torch.save
# without a directory to write to. `or '.'` covers a bare file name.
os.makedirs(os.path.dirname(model_save_path) or '.', exist_ok=True)
# Default output location of plot_metrics.
os.makedirs('logs', exist_ok=True)

In [4]:
# Device selection: a loaded (pre-trained) model always runs on the CPU;
# training uses the first CUDA device when one is available, else the CPU.
device_type = 'cuda:0' if (not skip_training and torch.cuda.is_available()) else 'cpu'

# To force a particular device, reassign it here, e.g. device_type = 'cpu'
print("Using device type:", device_type)
device = torch.device(device_type)

Using device type: cpu


In [5]:
# Build the train and test splits with identical settings (train first, then test).
trainset, testset = (
    data.GoodReadsRatingsDataset(root=data_dir, mode=split, explicit=explicit)
    for split in ('train', 'test')
)

Dataset: train, User count: 24405, Book count: 84384, Ratings count: 1963906
Dataset: test, User count: 22198, Book count: 84150, Ratings count: 852278


In [6]:
# Training batches are reshuffled every epoch.
trainloader = torch.utils.data.DataLoader(
    dataset=trainset,
    batch_size=1024,
    shuffle=True,
)
# Test batches keep dataset order: later cells index the concatenated
# predictions with test-set row positions.
testloader = torch.utils.data.DataLoader(
    dataset=testset,
    batch_size=128,
    shuffle=False,
)

In [7]:
class RecommenderSystem(nn.Module):
    """Embedding + MLP rating predictor.

    A user embedding and an item embedding are concatenated and passed through
    two hidden ReLU layers (each followed by dropout) and a final sigmoid unit.

    Attribute names and registration order are fixed (`user_em`, `item_em`,
    `fc1`, `fc2`, `fc3`, ...) so that state dicts saved by earlier versions of
    this class keep loading.
    """

    def __init__(self, n_users, n_items, embedding_dim=100, hidden_dims=(100, 10), dropout=0.02):
        """
        Args:
          n_users: Number of users.
          n_items: Number of items.
          embedding_dim: Size of each user / item embedding vector.
          hidden_dims: Pair ``(h1, h2)`` with the widths of the two hidden layers.
          dropout: Dropout probability applied after the embeddings and after
            each hidden layer.

        Raises:
          ValueError: If `hidden_dims` does not contain exactly two sizes.
        """
        super().__init__()

        if len(hidden_dims) != 2:
            raise ValueError(
                "hidden_dims must contain exactly two layer sizes, got {!r}".format(hidden_dims)
            )
        hidden1, hidden2 = hidden_dims

        self.user_em = nn.Embedding(n_users, embedding_dim)
        self.item_em = nn.Embedding(n_items, embedding_dim)
        self.drop0 = nn.Dropout(dropout)

        # Input is the concatenation of both embeddings, hence 2 * embedding_dim.
        self.fc1 = nn.Linear(2 * embedding_dim, hidden1)
        self.relu1 = nn.ReLU()
        self.drop1 = nn.Dropout(dropout)

        self.fc2 = nn.Linear(hidden1, hidden2)
        self.relu2 = nn.ReLU()
        self.drop2 = nn.Dropout(dropout)

        self.fc3 = nn.Linear(hidden2, 1)

    def forward(self, user_ids, item_ids):
        """
        Args:
          user_ids of shape (batch_size): User ids (starting from 0).
          item_ids of shape (batch_size): Item ids (starting from 0).

        Returns:
          outputs of shape (batch_size): Predicted scores. The final sigmoid
          bounds them to the range 0..1; rescaling to another rating range is
          left to the caller.
        """
        x = torch.cat([self.user_em(user_ids), self.item_em(item_ids)], dim=1)
        x = self.drop0(x)

        x = self.fc1(x)
        x = self.relu1(x)
        x = self.drop1(x)

        x = self.fc2(x)
        x = self.relu2(x)
        x = self.drop2(x)

        x = torch.sigmoid(self.fc3(x))
        # Flatten (batch_size, 1) -> (batch_size,) to match the label shape.
        x = x.view(-1)

        return x

In [8]:
# Size the embedding tables from the training split and place the model on
# the selected device (Module.to returns the module itself).
model = RecommenderSystem(trainset.n_users, trainset.n_items).to(device)
# Bare expression so the notebook shows the layer summary.
model

RecommenderSystem(
  (user_em): Embedding(24405, 100)
  (item_em): Embedding(84384, 100)
  (drop0): Dropout(p=0.02, inplace=False)
  (fc1): Linear(in_features=200, out_features=100, bias=True)
  (relu1): ReLU()
  (drop1): Dropout(p=0.02, inplace=False)
  (fc2): Linear(in_features=100, out_features=10, bias=True)
  (relu2): ReLU()
  (drop2): Dropout(p=0.02, inplace=False)
  (fc3): Linear(in_features=10, out_features=1, bias=True)
)

In [9]:
# computes the loss:
def compute_loss(model, testloader):
    """Evaluate `model` on every batch of `testloader`.

    The model is switched to eval mode and left in it; callers that continue
    training must call ``model.train()`` again.

    Args:
      model: Module called as ``model(user_ids, item_ids)``.
      testloader: Iterable yielding ``(user_ids, item_ids, labels)`` batches.

    Returns:
      Tuple ``(predictions, loss)``: the predictions of all batches concatenated
      on the CPU in loader order, and the mean squared error averaged over
      every sample (not over batches).
    """
    model.eval()
    cost = nn.MSELoss()
    total_loss = 0
    total_data = 0
    prediction_list = []

    # Move batches to wherever the model's parameters live instead of relying
    # on a notebook-global `device`, so the function works for any model
    # placement. A parameter-free model leaves the batches where they are.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    with torch.no_grad():
        for user_ids, item_ids, labels in testloader:
            if target_device is not None:
                user_ids = user_ids.to(target_device)
                item_ids = item_ids.to(target_device)
                labels = labels.to(target_device)
            predictions = model(user_ids, item_ids)
            prediction_list.append(predictions.cpu())

            loss = cost(predictions, labels)
            # MSELoss returns the batch mean; weight it by the batch size so a
            # smaller final batch does not skew the overall average.
            total_loss += (loss.item() * labels.size(0))
            total_data += labels.size(0)

    loss = total_loss / total_data
    return torch.cat(prediction_list), loss

In [10]:
# training loop
if not skip_training:
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.001)
    cost = nn.MSELoss()

    # per-epoch history of train/test losses (plotted and saved after training)
    hist_metrics = {'epoch': [], 'train_loss': [], 'test_loss': []}

    epochs = 20
    for epoch in range(epochs):
        # compute_loss leaves the model in eval mode, so re-enable training
        # mode (dropout active) at the start of every epoch.
        model.train()

        # Reset the accumulators every epoch. Initialising them once before
        # the loop made the reported train loss a running average over all
        # epochs so far instead of the loss of the current epoch.
        total_loss = 0
        total_data = 0

        for user_ids, item_ids, labels in trainloader:
            user_ids, item_ids, labels = user_ids.to(device), item_ids.to(device), labels.to(device)

            optimizer.zero_grad()
            predictions = model(user_ids, item_ids)

            loss = cost(predictions, labels)
            # weight the batch-mean loss by batch size for a per-sample average
            total_loss += (loss.item() * labels.size(0))
            total_data += labels.size(0)
            loss.backward()
            optimizer.step()

        # Note: the train loss is accumulated while the weights are still
        # changing and with dropout on; the test loss is measured afterwards
        # in eval mode.
        train_loss = total_loss / total_data
        _, test_loss = compute_loss(model, testloader)

        hist_metrics['epoch'].append(epoch)
        hist_metrics['train_loss'].append(train_loss)
        hist_metrics['test_loss'].append(test_loss)

        print('Epoch {}: Train error: {:.4f}, Test error: {:.4f}'.format(epoch, train_loss, test_loss))

In [11]:
# plot the train/test loss curves and persist both the figure and the raw history
def plot_metrics(metrics, save_path='logs/goodreads_recsys_{}.{}'):
    """Plot the loss history, save it as a PNG and dump the metrics as JSON.

    Args:
      metrics: Mapping with at least the keys 'train_loss' and 'test_loss',
        each a sequence with one value per epoch. The whole mapping is written
        to the JSON file.
      save_path: Template with two ``{}`` placeholders, filled with a name and
        a file extension ('loss'/'png' for the figure, 'hist'/'json' for the
        history).
    """
    figure_file = save_path.format('loss', "png")
    history_file = save_path.format("hist", "json")

    for series_key in ('train_loss', 'test_loss'):
        plt.plot(metrics[series_key])
    plt.ylabel("loss")
    plt.xlabel("epochs")
    plt.legend(['train', 'test'], loc='upper left')
    # Save before show(): showing can release the figure, leaving nothing to save.
    plt.savefig(figure_file)
    plt.show()

    with open(history_file, 'w') as history_fp:
        json.dump(metrics, history_fp)

In [12]:
# After a training run: write the learned weights to disk, then plot and save
# the loss history. Nothing happens when an existing model is being loaded.
if not skip_training:
    trained_weights = model.state_dict()
    torch.save(trained_weights, model_save_path)
    plot_metrics(hist_metrics)

In [13]:
# When training is skipped, rebuild the network and restore the saved weights.
if skip_training:
    model = RecommenderSystem(trainset.n_users, trainset.n_items)
    # Returning `storage` unchanged keeps every tensor where it was
    # deserialised instead of restoring it to the device it was saved from.
    saved_weights = torch.load(model_save_path, map_location=lambda storage, loc: storage)
    model.load_state_dict(saved_weights)
    print('Model loaded from: {}'.format(model_save_path))
    model.to(device)
    # Inference only from here on: switch dropout off.
    model.eval()

Model loaded from: models/goodreads_recsys.pth


In [14]:
# Score the whole test split once. `predictions` holds one score per test row
# in loader order and is read by the hit-rate cell that follows.
predictions, loss = compute_loss(model=model, testloader=testloader)
print('Test loss: {:.4f}'.format(loss))

Test loss: 0.1644


In [15]:
# Book metadata keyed by the encoded book id, so model item ids can be used
# directly as row labels.
titles_csv_path = os.path.join(data_dir, 'book_titles.csv')
book_titles_df = pd.read_csv(titles_csv_path, header=0).set_index('encoded_book_id')

In [16]:
# Per-user hit rates: for each user, take the highest-scored test items and
# average their labels, then average those per-user rates over all users.
#
# Requirements visible from the code:
#   * `predictions` must be the full test-set predictions from compute_loss
#     (row i belongs to testset[i]); later cells reuse the name `predictions`,
#     so run this cell before them.
#   * rows of one user must be contiguous in `testset`; a user whose rows are
#     scattered would be counted as several users.
next_user_i = 0
top_half_hit = 0
top10_hit = 0
top01_hit = 0
count = 0

while next_user_i < len(testset):
    # Advance to the end of the current user's contiguous run of rows.
    user_id = testset[next_user_i][0]
    last_user_i = next_user_i
    while next_user_i < len(testset) and testset[next_user_i][0] == user_id:
        next_user_i += 1
    indices = range(last_user_i, next_user_i)

    labels = testset[indices][2]
    user_predictions = predictions[indices]

    # Clamp to at least one item: with a single test row, len // 2 is 0, and
    # the resulting 0 / 0 would turn every aggregate below into NaN.
    half_k = max(len(indices) // 2, 1)
    top10_k = min(half_k, 10)

    # `.indices` avoids assigning to `_`, which IPython uses for the last output.
    top_reco = torch.topk(user_predictions, k=half_k).indices
    top_half_hit += labels[top_reco].sum() / half_k

    top_reco = torch.topk(user_predictions, k=top10_k).indices
    top10_hit += labels[top_reco].sum() / top10_k

    top_reco = torch.topk(user_predictions, k=1).indices
    top01_hit += labels[top_reco].sum()

    count += 1


top_half_hit = top_half_hit / count
print("top 50% hit rate:", top_half_hit.item())
top10_hit = top10_hit / count
print("top 10 hit rate:", top10_hit.item())
top01_hit = top01_hit / count
print("top 1 hit rate:", top01_hit.item())

top 50% hit rate: 0.7641916275024414
top 10 hit rate: 0.8197729587554932
top 1 hit rate: 0.8867915868759155


In [17]:
# Show the actual vs. predicted ratings for the test records of one user.
indices = testset.get_user_record()
sampleset = torch.utils.data.Subset(testset, indices)
# A single batch containing all of the selected records.
sampleloader = torch.utils.data.DataLoader(sampleset, batch_size=len(indices), shuffle=False)

# Factor applied to labels and model outputs before they are displayed.
max_range = 5.0 if explicit else 1.0

with torch.no_grad():
    for user_ids, item_ids, labels in sampleloader:
        print("Showing book ratings of user enc_id={}".format(user_ids[0]))
        user_ids = user_ids.to(device)
        item_ids = item_ids.to(device)
        labels = labels.to(device)
        # NOTE: this rebinds `predictions`, replacing the full test-set
        # predictions produced by compute_loss.
        predictions = model(user_ids, item_ids)

        # Look up the titles of the rated books, then attach both rating columns.
        tmp_df = book_titles_df.loc[item_ids.cpu()]
        tmp_df['actual_ratings'] = labels.cpu() * max_range
        tmp_df['predicted_ratings'] = predictions.cpu().round(decimals=2) * max_range
        tmp_df = tmp_df.sort_values(by=['actual_ratings'], ascending=False)
        display(tmp_df)

Showing book ratings of user enc_id=5250


Unnamed: 0_level_0,book_id,original_title,actual_ratings,predicted_ratings
encoded_book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
471,13821,Lord of Light,1.0,0.82
3315,92121,Nine Princes in Amber,1.0,0.66
22876,3437309,Empire in Black and Gold,1.0,0.65
1321,30036,Elric of Melnibone,1.0,0.72
5819,234225,Dune,1.0,0.93
13969,944073,The Blade Itself,1.0,0.92
3134,82192,The Crystal Cave,1.0,0.89
2543,62012,The Guns of Avalon,1.0,0.79
1842,43545,The Once and Future King,1.0,0.91
5372,209522,The Encyclopedia of Fantasy,0.0,0.24


In [18]:
# Score every book in the title table for the sampled user and show the top 20.
# `indices` and `max_range` come from the previous cell.
user_ids = testset[indices[0]][0].repeat(book_titles_df.shape[0])
item_ids = torch.LongTensor(book_titles_df.index.values)

with torch.no_grad():
    user_ids, item_ids = user_ids.to(device), item_ids.to(device)
    predictions = model(user_ids, item_ids)

# Build a new frame instead of aliasing `book_titles_df`: adding the column
# and sorting in place on the alias modified the shared title table, so other
# cells saw a reordered frame carrying this user's scores.
tmp_df = (
    book_titles_df
    .assign(predicted_ratings=(predictions.cpu().round(decimals=2) * max_range).numpy())
    .sort_values(by=['predicted_ratings'], ascending=False)
)
# Books without a title cannot be shown as recommendations.
tmp_df = tmp_df.loc[tmp_df['original_title'].notna()]
display(tmp_df.head(20))

Unnamed: 0_level_0,book_id,original_title,predicted_ratings
encoded_book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,3,Harry Potter and the Philosopher's Stone,0.97
4279,136251,Harry Potter and the Deathly Hallows,0.97
4,6,Harry Potter and the Goblet of Fire,0.97
730,15881,Harry Potter and the Chamber of Secrets,0.97
1,2,Harry Potter and the Order of the Phoenix,0.97
18,34,The Fellowship of the Ring,0.96
0,1,Harry Potter and the Half-Blood Prince,0.96
399,13496,A Game of Thrones,0.96
606,15241,The Two Towers,0.95
1150,28187,The Lightning Thief,0.95
