In [47]:
import gzip
import torch
import json
import random

import numpy as np
import torch.nn.functional as F

from tqdm.notebook import tqdm
from random import randrange
from collections import defaultdict
from random import randrange, shuffle
from transformers import BertForNextSentencePrediction, AutoTokenizer

In [52]:
# --- Reproducibility and base checkpoint -------------------------------------
RANDOM_SEED = 42
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'

# Token budget passed to the tokenizer as `max_length` for an encoded pair.
MAX_SEQUENCE_LENGTH = 256

# --- Input artifacts ----------------------------------------------------------
MODEL_PATH = 'data/best_model_state_10k_8ep_title_desc.bin'
BOOKS_PATH = 'data/top_200000_eng_books_filtered.json'
REVIEWS_PATH = 'data/goodreads_reviews_dedup.json.gz'

# --- Evaluation knobs ---------------------------------------------------------
NUM_OF_USER_REVIEWS = 1_000_000       # how many review lines to read from REVIEWS_PATH
NUM_USERS_TO_EVALUATE = 100           # most active users kept for the evaluation
K_MOST_SIMILAR_TO_CONSIDER = 10       # neighbours used for the rating estimate

In [16]:
# Seed every random number generator the notebook touches (stdlib, NumPy,
# PyTorch) so the shuffles further down are repeatable.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

# User reviews data loading

In [17]:
def load_data(file_name, n):
    """Read up to ``n`` reviews from a gzipped JSON-lines file, grouped by user.

    Args:
        file_name: Path to a gzip-compressed file holding one JSON object per
            line. Every object must have a ``'user_id'`` key.
        n: Maximum number of reviews to read. Values ``<= 0`` read nothing.

    Returns:
        A ``defaultdict(list)`` mapping each ``user_id`` to the list of review
        dicts of that user, in file order.
    """
    user_reviews = defaultdict(list)

    # Nothing requested: do not even open the file.
    if n <= 0:
        return user_reviews

    count = 0
    with gzip.open(file_name) as fin:
        for line in fin:
            # Tolerate blank lines (e.g. a trailing newline) instead of
            # crashing inside json.loads; they do not count towards `n`.
            if not line.strip():
                continue

            review = json.loads(line)
            user_reviews[review['user_id']].append(review)
            count += 1

            # Stop as soon as the requested number of reviews has been read.
            if count >= n:
                break

    return user_reviews

In [18]:
# Read the first NUM_OF_USER_REVIEWS reviews and group them by user id.
reviews = load_data(file_name=REVIEWS_PATH, n=NUM_OF_USER_REVIEWS)
print(
    f'Number of unique users among the specified number of reviews: {len(reviews)}'
)

Number of unique users among the specified number of reviews: 20152


In [34]:
# Rank users by how many reviews they wrote (most active first) and keep the
# top NUM_USERS_TO_EVALUATE of them. `sorted` is stable, like `list.sort`.
ranked_users = sorted(reviews.items(), key=lambda pair: len(pair[1]), reverse=True)

# Shuffle a *copy* of each user's reviews: the lists stored in `reviews` stay
# untouched, so re-running this cell does not silently reorder upstream data.
user_reviews = []
for user_id, reviews_of_user in ranked_users[:NUM_USERS_TO_EVALUATE]:
    shuffled_reviews = list(reviews_of_user)
    shuffle(shuffled_reviews)
    user_reviews.append((user_id, shuffled_reviews))

# Index the last kept user with [-1] rather than NUM_USERS_TO_EVALUATE - 1 so
# the report also works when fewer users than requested are available.
print(f'Maximum number of reviews for a single user among top {len(user_reviews)} users with most ratings: {len(user_reviews[0][1])}')
print(f'Minimum number of reviews for a single user among top {len(user_reviews)} users with most ratings: {len(user_reviews[-1][1])}')

Maximum number of reviews for a single user among top 100 users with most ratings: 5345
Minimum number of reviews for a single user among top 100 users with most ratings: 892


In [35]:
num_examples = 5
# Slice rather than index: a user with fewer than `num_examples` reviews then
# prints what is available instead of raising IndexError.
for example_review in user_reviews[0][1][:num_examples]:
    print(f'{example_review}\n')

{'user_id': '8e7e5b546a63cb9add8431ee6914cf59', 'book_id': '4374400', 'review_id': 'f3e2013a3417bd15b3eed4c766eaeac3', 'rating': 5, 'review_text': "I can't be even a tiny bit objective about this, I don't think. Mia's family and friends are in many ways like my own. I don't identify with her, but with her mother, and with the musical milieu and the changing jobs of the adults in the story. \n So I cried. I wept for an hour straight last night as I read, tears slowing to a trickle for the funnier scenes,but never stopping entirely. I cried as if Forman had hooked up electrodes directly to my tear ducts.", 'date_added': 'Wed Jul 09 08:31:00 -0700 2014', 'date_updated': 'Wed Jul 16 07:48:37 -0700 2014', 'read_at': '', 'started_at': '', 'n_votes': 0, 'n_comments': 0}

{'user_id': '8e7e5b546a63cb9add8431ee6914cf59', 'book_id': '24765', 'review_id': '03cdbbd9133d42fb39b885b8c9dc63cc', 'rating': 4, 'review_text': 'Westerfeld\'s tricky plot reversals continue to delight. Like classic science f

# Books loading

In [36]:
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding.
with open(BOOKS_PATH, 'r', encoding='utf-8') as books_file:
    books = json.load(books_file)

print(f'Loaded first {len(books)} books.')

Loaded first 200000 books.


In [37]:
# For every selected user keep only the reviews whose book is present in
# `books`, then re-rank the users by how many reviews survived the filter.
filtered_user_reviews = [
    (user_id, [book_review for book_review in reviews_of_user if book_review['book_id'] in books])
    for user_id, reviews_of_user in user_reviews
]
filtered_user_reviews.sort(key=lambda pair: len(pair[1]), reverse=True)

# Index the last user with [-1] rather than NUM_USERS_TO_EVALUATE - 1 so the
# report also works when fewer users than requested are available.
print(f'Maximum number of reviews for a single user among top {len(filtered_user_reviews)} users with most ratings: {len(filtered_user_reviews[0][1])}')
print(f'Minimum number of reviews for a single user among top {len(filtered_user_reviews)} users with most ratings: {len(filtered_user_reviews[-1][1])}')

Maximum number of reviews for a single user among top 100 users with most ratings: 816
Minimum number of reviews for a single user among top 100 users with most ratings: 11


In [38]:
num_examples = 5
# Slice rather than index: a user with fewer than `num_examples` filtered
# reviews then prints what is available instead of raising IndexError.
for example_review in filtered_user_reviews[0][1][:num_examples]:
    print(f'{example_review}\n')

{'user_id': '37b3e60b4e4152c580fd798d405150ff', 'book_id': '22383', 'review_id': 'e18d225e12ffb3f123131a3c882eb386', 'rating': 4, 'review_text': 'This book is a bit wobby as we trade off between the two main sequences of Seven Soldiers books. Fortunately, Bulleteer and Frankenstein both start off strong.', 'date_added': 'Fri Jul 05 14:05:26 -0700 2013', 'date_updated': 'Fri Jul 05 14:05:57 -0700 2013', 'read_at': 'Sun Jul 04 00:00:00 -0700 2010', 'started_at': '', 'n_votes': 0, 'n_comments': 0}

{'user_id': '37b3e60b4e4152c580fd798d405150ff', 'book_id': '22349', 'review_id': '632a31f93a316e7958ec722c341a6807', 'rating': 5, 'review_text': 'The stories continue to gain momentum, with the finales to Guardian and Shining Knight both being quite good. The crossovers, especially regarding the Newsboy Army are also very intriguing.', 'date_added': 'Fri Jul 05 14:04:56 -0700 2013', 'date_updated': 'Fri Jul 05 14:06:09 -0700 2013', 'read_at': 'Fri Jul 02 00:00:00 -0700 2010', 'started_at': '', 

# Model and tokenizer loading

In [4]:
# Start from the public pre-trained checkpoint, then overwrite its weights
# with the fine-tuned state dict stored at MODEL_PATH.
model = BertForNextSentencePrediction.from_pretrained(PRE_TRAINED_MODEL_NAME)
# map_location remaps the stored tensors onto the selected device, so a
# checkpoint saved on a GPU machine also loads when only a CPU is available.
model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
model = model.to(device)
# Switch to evaluation mode; the model is only used for inference here.
model.eval()

tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForNextSentencePrediction: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Item-based collaborative filtering evaluation

In [55]:
def calculate_books_similarity(book1_id, book2_id, books):
    """Score how similar two books are from their descriptions.

    The two descriptions are encoded as one (sentence A, sentence B) pair and
    passed through the module-level next-sentence-prediction ``model``; the
    softmax probability of class 0 is used as the similarity score.

    Relies on the module-level ``tokenizer``, ``model``, ``device`` and
    ``MAX_SEQUENCE_LENGTH``.

    Args:
        book1_id: Key of the first book in ``books`` (used as sentence A).
        book2_id: Key of the second book in ``books`` (used as sentence B).
        books: Mapping from book id to a record with a ``'description'`` entry.

    Returns:
        The class-0 probability as a Python float.

    Raises:
        KeyError: If either id is missing from ``books``.
    """
    book1, book2 = books[book1_id], books[book2_id]
    encoded_sequences = tokenizer(
        book1['description'],
        book2['description'],
        padding='longest',
        truncation='longest_first',
        return_tensors='pt',
        max_length=MAX_SEQUENCE_LENGTH
    )

    # Inference only: disabling autograd avoids building (and holding on to) a
    # computation graph for every pair of books that gets scored.
    with torch.no_grad():
        outputs = model(
            input_ids=encoded_sequences['input_ids'].to(device),
            attention_mask=encoded_sequences['attention_mask'].to(device),
            token_type_ids=encoded_sequences['token_type_ids'].to(device)
        )
        probs = F.softmax(outputs.logits, dim=1)

    # Class 0 notes if sent B follows sent A; the batch holds a single pair.
    return probs[0][0].item()

In [60]:
def evaluate_user(user_reviews_pair, books):
    """Estimate one held-out rating of a user and return the absolute error.

    The user's first review is held out. Every remaining review is scored
    against it with ``calculate_books_similarity``; the
    ``K_MOST_SIMILAR_TO_CONSIDER`` most similar ones are kept and their ratings
    are averaged, weighted by similarity, to form the estimate.

    Args:
        user_reviews_pair: ``(user_id, reviews)`` where ``reviews`` is a
            sequence of dicts with ``'book_id'`` and ``'rating'`` entries.
        books: Mapping passed through to ``calculate_books_similarity``.

    Returns:
        A 0-dim float tensor holding ``|real rating - estimated rating|``.

    Raises:
        ValueError: If the user has fewer than two reviews, so there is nothing
            left to estimate the held-out rating from.
    """
    user_all_reviews = user_reviews_pair[1]
    if len(user_all_reviews) < 2:
        raise ValueError(
            'evaluate_user needs at least two reviews per user: '
            'one to estimate and one or more to estimate it from'
        )

    # We want to estimate the rating of the first rated book
    # hence we get it and remove it from the rest of the reviews
    review_to_estimate = user_all_reviews[0]
    other_reviews = user_all_reviews[1:]

    similarities = [
        (
            calculate_books_similarity(review_to_estimate['book_id'], review['book_id'], books),
            review['rating'],
        )
        for review in other_reviews
    ]
    # Keep only the K most similar books, most similar first.
    similarities.sort(key=lambda pair: pair[0], reverse=True)
    similarities = similarities[:K_MOST_SIMILAR_TO_CONSIDER]

    sims_torch = torch.tensor([sim for sim, _ in similarities], dtype=torch.float)
    ratings_torch = torch.tensor([real_rating for _, real_rating in similarities], dtype=torch.float)

    total_similarity = torch.sum(sims_torch)
    if total_similarity > 0:
        # Similarity-weighted average of the neighbours' ratings.
        estimated_rating = torch.dot(sims_torch, ratings_torch) / total_similarity
    else:
        # Every similarity is zero: a weighted average would be 0/0 = NaN,
        # so fall back to the plain mean of the neighbours' ratings.
        estimated_rating = torch.mean(ratings_torch)

    print(f'Similarities and real ratings: {similarities}')
    print(f'Real rating: {review_to_estimate["rating"]}')
    print(f'Estimated rating: {estimated_rating}\n')

    return abs(review_to_estimate['rating'] - estimated_rating)

In [61]:
# Absolute error between the real and the estimated rating, one per user.
abs_diffs = [
    evaluate_user(user_reviews_pair, books)
    for user_reviews_pair in tqdm(filtered_user_reviews)
]

# Mean absolute error over all evaluated users.
mean_abs_diff = torch.tensor(abs_diffs).sum() / len(abs_diffs)
print(f'Mean absolute rating - estimated rating difference: {mean_abs_diff}')

  0%|          | 0/100 [00:00<?, ?it/s]

4), (0.9997041821479797, 3), (0.9996845722198486, 4), (0.9996647834777832, 4), (0.9996498823165894, 4), (0.9996355772018433, 4), (0.9996016621589661, 4), (0.9995859265327454, 4), (0.9995842576026917, 4)]
Real rating: 4
Estimated rating: 3.899995803833008

Similarities and real ratings: [(0.9998377561569214, 5), (0.9998227953910828, 5), (0.9998143315315247, 5), (0.9998142123222351, 5), (0.9998136162757874, 5), (0.9998125433921814, 5), (0.9998108744621277, 5), (0.9998094439506531, 5), (0.9998090863227844, 5), (0.9998086094856262, 5)]
Real rating: 5
Estimated rating: 5.000000476837158

Similarities and real ratings: [(0.9998161196708679, 4), (0.9998113512992859, 4), (0.9998109936714172, 4), (0.9998109936714172, 5), (0.9998101592063904, 4), (0.9998064637184143, 5), (0.9998055100440979, 3), (0.9998053908348083, 5), (0.9998016953468323, 4), (0.9998001456260681, 2)]
Real rating: 4
Estimated rating: 4.000001430511475

Similarities and real ratings: [(0.9998219609260559, 5), (0.9998213648796082