In [1]:
import gzip
import torch
import json
import random

import numpy as np
import torch.nn.functional as F

from tqdm.notebook import tqdm
from random import randrange
from collections import defaultdict
from random import randrange, shuffle
from transformers import BertForNextSentencePrediction, AutoTokenizer

In [2]:
# Reproducibility seed and the pre-trained checkpoint used for both the model
# and the tokenizer.
RANDOM_SEED = 42
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'

# Maximum token length of an encoded pair of book descriptions.
MAX_SEQUENCE_LENGTH = 256

# Input artefacts: fine-tuned weights, book metadata and raw user reviews.
MODEL_PATH = '800k_100k_100k_best_model_state.pth'
BOOKS_PATH = 'data/top_200000_eng_books_filtered.json'
REVIEWS_PATH = 'data/goodreads_reviews_dedup.json.gz'

# Evaluation size: reviews read from disk, users evaluated, and number of
# nearest neighbours used for each rating estimate.
NUM_OF_USER_REVIEWS = 1_000_000
NUM_USERS_TO_EVALUATE = 100
K_MOST_SIMILAR_TO_CONSIDER = 10

In [3]:
# Seed every random number generator the notebook relies on (Python, NumPy,
# PyTorch) with the same value.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

# User reviews data loading

In [4]:
def load_data(file_name, n):
    """Read up to ``n`` reviews from a gzipped JSON-lines file, grouped by user.

    Args:
        file_name: Path to a gzip-compressed file holding one JSON object per
            line; every object must contain a ``'user_id'`` key.
        n: Maximum number of lines (reviews) to read. Values ``<= 0`` read
            nothing and return an empty mapping.

    Returns:
        A ``defaultdict(list)`` mapping each ``user_id`` to the list of that
        user's review dicts, in the order they appear in the file.
    """
    user_reviews = defaultdict(list)
    if n <= 0:
        # Without this guard the first line would be consumed before the
        # limit is ever checked.
        return user_reviews

    with gzip.open(file_name) as fin:
        for count, line in enumerate(fin, start=1):
            review = json.loads(line)
            user_reviews[review['user_id']].append(review)

            # Stop once the requested number of lines has been read.
            if count >= n:
                break

    return user_reviews

In [5]:
# Read the first NUM_OF_USER_REVIEWS reviews and group them by user id.
reviews = load_data(file_name=REVIEWS_PATH, n=NUM_OF_USER_REVIEWS)
print(f'Number of unique users among the specified number of reviews: {len(reviews)}')

Number of unique users among the specified number of reviews: 20152


In [6]:
# Keep the NUM_USERS_TO_EVALUATE users with the most reviews, most active first.
user_reviews = sorted(reviews.items(), key=lambda pair: len(pair[1]), reverse=True)
user_reviews = user_reviews[:NUM_USERS_TO_EVALUATE]

# Shuffle every user's reviews in place so that the first review, which is
# later held out as the rating to estimate, is a random one.
# Note: this mutates the lists stored in `reviews` as well.
for user_reviews_pair in user_reviews:
    shuffle(user_reviews_pair[1])

# Index the last element instead of NUM_USERS_TO_EVALUATE - 1 so the report
# still works when fewer users than requested were loaded.
print(f'Maximum number of reviews for a single user among top {NUM_USERS_TO_EVALUATE} users with most ratings: {len(user_reviews[0][1])}')
print(f'Minimum number of reviews for a single user among top {NUM_USERS_TO_EVALUATE} users with most ratings: {len(user_reviews[-1][1])}')

Maximum number of reviews for a single user among top 100 users with most ratings: 5345
Minimum number of reviews for a single user among top 100 users with most ratings: 892


In [7]:
# Show a few reviews of the most active user. Slicing (rather than indexing
# with range) avoids an IndexError when fewer than `num_examples` exist.
num_examples = 5
for example_review in user_reviews[0][1][:num_examples]:
    print(f'{example_review}\n')

{'user_id': '8e7e5b546a63cb9add8431ee6914cf59', 'book_id': '22138435', 'review_id': '3f47f8fcd54034bbde53ec9d12d557c9', 'rating': 4, 'review_text': 'I liked it, the simple rhyme scheme, the graphic art style, the whole thing. \n Library copy', 'date_added': 'Thu Jan 22 05:57:11 -0800 2015', 'date_updated': 'Mon Feb 23 10:09:54 -0800 2015', 'read_at': 'Mon Feb 09 00:00:00 -0800 2015', 'started_at': 'Sun Feb 08 00:00:00 -0800 2015', 'n_votes': 0, 'n_comments': 0}

{'user_id': '8e7e5b546a63cb9add8431ee6914cf59', 'book_id': '7163889', 'review_id': '0940680fc4d04b912525833b4f27a93e', 'rating': 5, 'review_text': "Less than 24 hours after receiving our copies, the eldest and I have already finished it, and I've read the first 20 pages aloud to the KitKatPandaBatWolf. Oh, we love these books. And this morning I realized why. Unlike so many books for the new novel reader (those Daisy Meadows books, for example) there's a little bit of snark here. Not a lot. Just enough to keep the fantasy groun

# Books loading

In [8]:
# Decode the JSON explicitly as UTF-8 so the result does not depend on the
# platform's default text encoding.
with open(BOOKS_PATH, 'r', encoding='utf-8') as books_file:
    books = json.load(books_file)

print(f'Loaded first {len(books)} books.')

Loaded first 200000 books.


In [9]:
# For every selected user keep only the reviews whose book id is present in
# `books`, then re-rank the users by how many reviews survived the filter.
filtered_user_reviews = [
    (
        user_id,
        [book_review for book_review in user_book_reviews if book_review['book_id'] in books],
    )
    for user_id, user_book_reviews in user_reviews
]
filtered_user_reviews.sort(key=lambda pair: len(pair[1]), reverse=True)

# Index the last element instead of NUM_USERS_TO_EVALUATE - 1 so the report
# still works when fewer users than requested are available.
print(f'Maximum number of reviews for a single user among top {NUM_USERS_TO_EVALUATE} users with most ratings: {len(filtered_user_reviews[0][1])}')
print(f'Minimum number of reviews for a single user among top {NUM_USERS_TO_EVALUATE} users with most ratings: {len(filtered_user_reviews[-1][1])}')

Maximum number of reviews for a single user among top 100 users with most ratings: 816
Minimum number of reviews for a single user among top 100 users with most ratings: 11


In [10]:
# Show a few filtered reviews of the top user. Slicing (rather than indexing
# with range) avoids an IndexError when fewer than `num_examples` survived
# the filtering.
num_examples = 5
for example_review in filtered_user_reviews[0][1][:num_examples]:
    print(f'{example_review}\n')

{'user_id': '37b3e60b4e4152c580fd798d405150ff', 'book_id': '6839093', 'review_id': 'b66a3eefb08c74170fab1ef71c383615', 'rating': 3, 'review_text': "I'm really not thrilled with the book's central premise, but in the latter half of the volume, the story started building toward something more interesting that _might_ entice me to return.", 'date_added': 'Mon Jun 24 21:35:07 -0700 2013', 'date_updated': 'Mon Jun 24 21:35:36 -0700 2013', 'read_at': 'Wed Oct 19 00:00:00 -0700 2011', 'started_at': '', 'n_votes': 0, 'n_comments': 0}

{'user_id': '37b3e60b4e4152c580fd798d405150ff', 'book_id': '2108198', 'review_id': '6b6109fd53966b4e07b4bb4a0e20c46a', 'rating': 4, 'review_text': "A cute little book with nice Norse mythology. It's a little bit weak in the middle, but overall a good read.", 'date_added': 'Fri Jun 21 13:33:08 -0700 2013', 'date_updated': 'Fri Jun 21 13:33:34 -0700 2013', 'read_at': 'Tue Apr 06 00:00:00 -0700 2010', 'started_at': '', 'n_votes': 0, 'n_comments': 0}

{'user_id': '37

# Model and tokenizer loading

In [11]:
# Build the architecture from the pre-trained checkpoint, then overwrite its
# weights with the fine-tuned state dict.
model = BertForNextSentencePrediction.from_pretrained(PRE_TRAINED_MODEL_NAME)
# map_location remaps the stored tensors onto the selected device, so a
# checkpoint saved from a CUDA device also loads on a CPU-only machine.
model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
model = model.to(device)
# Inference only: switch dropout and similar layers to evaluation mode.
model.eval()

tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForNextSentencePrediction: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Item-based collaborative filtering evaluation

In [12]:
def calculate_books_similarity(book1_id, book2_id, books):
    """Score how similar two books are from their descriptions.

    Both descriptions are encoded as a single sentence pair and passed to the
    next-sentence-prediction model; the probability of class 0 is used as the
    similarity score.

    Args:
        book1_id: Key of the first book in ``books``.
        book2_id: Key of the second book in ``books``.
        books: Mapping from book id to a dict with a ``'description'`` entry.

    Returns:
        The class-0 probability as a Python float.
    """
    book1, book2 = books[book1_id], books[book2_id]
    encoded_sequences = tokenizer(
        book1['description'],
        book2['description'],
        padding='longest',
        truncation='longest_first',
        return_tensors='pt',
        max_length=MAX_SEQUENCE_LENGTH
    )

    # Pure inference: disable autograd so no computation graph is built and
    # kept alive for each of the many pairwise calls.
    with torch.no_grad():
        outputs = model(
            input_ids=encoded_sequences['input_ids'].to(device),
            attention_mask=encoded_sequences['attention_mask'].to(device),
            token_type_ids=encoded_sequences['token_type_ids'].to(device)
        )
        probs = F.softmax(outputs.logits, dim=1)

    # Class 0 means "sentence B follows sentence A"; a single pair is encoded,
    # so the batch has exactly one row.
    similarity_score = probs[0][0]
    return similarity_score.item()

In [13]:
def evaluate_user(user_reviews_pair, books):
    """Estimate one held-out rating of a user and return the absolute error.

    The first review in the user's list is held out. Every remaining review is
    scored by the similarity between its book and the held-out book, and the
    estimate is the similarity-weighted mean of the ratings of the
    ``K_MOST_SIMILAR_TO_CONSIDER`` most similar books. The neighbours, the
    real rating and the estimate are printed.

    Args:
        user_reviews_pair: ``(user_id, reviews)`` pair; every review is a dict
            with at least ``'book_id'`` and ``'rating'`` entries.
        books: Book mapping forwarded to ``calculate_books_similarity``.

    Returns:
        A 0-dim float tensor holding ``|real rating - estimated rating|``.

    Raises:
        ValueError: If the user has fewer than two reviews, because one review
            is held out and at least one more is needed to estimate it.
    """
    reviews = user_reviews_pair[1]
    if len(reviews) < 2:
        # With no neighbours the weighted mean below would be 0 / 0 = NaN and
        # would silently poison the mean error computed by the caller.
        raise ValueError(
            f'Need at least two reviews to evaluate a user, got {len(reviews)}'
        )

    # We want to estimate the rating of the first rated book,
    # hence we take it out and keep the rest as candidate neighbours.
    review_to_estimate = reviews[0]
    other_reviews = reviews[1:]

    similarities = [
        (
            calculate_books_similarity(review_to_estimate['book_id'], review['book_id'], books),
            review['rating'],
        )
        for review in other_reviews
    ]
    similarities.sort(key=lambda pair: pair[0], reverse=True)
    similarities = similarities[:K_MOST_SIMILAR_TO_CONSIDER]

    # NOTE(review): ratings of 0 show up among the neighbours in the recorded
    # outputs; presumably 0 means "not rated" - confirm whether such reviews
    # should be excluded from the estimate.
    sims_torch = torch.tensor([sim for sim, _ in similarities], dtype=torch.float)
    ratings_torch = torch.tensor([real_rating for _, real_rating in similarities], dtype=torch.float)
    estimated_rating = torch.dot(sims_torch, ratings_torch) / torch.sum(sims_torch)

    print(f'Similarities and real ratings: {similarities}')
    print(f'Real rating: {review_to_estimate["rating"]}')
    print(f'Estimated rating: {estimated_rating}\n')

    return abs(review_to_estimate['rating'] - estimated_rating)

In [14]:
# Collect one absolute rating error per evaluated user.
abs_diffs = []
for user_reviews_pair in tqdm(filtered_user_reviews):
    abs_diffs.append(evaluate_user(user_reviews_pair, books))

# Mean absolute error over all evaluated users.
mean_abs_diff = torch.tensor(abs_diffs).sum() / len(abs_diffs)
print(f'Mean absolute rating - estimated rating difference: {mean_abs_diff}')

  0%|          | 0/100 [00:00<?, ?it/s]

Similarities and real ratings: [(0.9999223947525024, 4), (0.9999150037765503, 5), (0.9999055862426758, 5), (0.999904990196228, 4), (0.9999024868011475, 4), (0.9998922348022461, 4), (0.9998916387557983, 3), (0.999889612197876, 5), (0.9998887777328491, 4), (0.9998857975006104, 5)]
Real rating: 3
Estimated rating: 4.3000006675720215

Similarities and real ratings: [(0.9999351501464844, 3), (0.9999350309371948, 4), (0.9999346733093262, 4), (0.9999301433563232, 5), (0.9999288320541382, 5), (0.9999279975891113, 4), (0.9999253749847412, 5), (0.9999208450317383, 2), (0.9999198913574219, 5), (0.9999198913574219, 5)]
Real rating: 5
Estimated rating: 4.1999993324279785

Similarities and real ratings: [(0.999936580657959, 4), (0.9999362230300903, 4), (0.9999357461929321, 5), (0.9999352693557739, 2), (0.9999344348907471, 4), (0.9999339580535889, 5), (0.9999333620071411, 3), (0.999933123588562, 5), (0.9999330043792725, 5), (0.9999328851699829, 5)]
Real rating: 4
Estimated rating: 4.199999809265137



Similarities and real ratings: [(0.9999340772628784, 5), (0.9999333620071411, 5), (0.999932050704956, 5), (0.999931812286377, 5), (0.9999303817749023, 1), (0.9999303817749023, 5), (0.9999301433563232, 4), (0.9999301433563232, 3), (0.9999299049377441, 4), (0.9999289512634277, 2)]
Real rating: 3
Estimated rating: 3.9000015258789062

Similarities and real ratings: [(0.9999328851699829, 4), (0.9999324083328247, 4), (0.9999319314956665, 4), (0.999931812286377, 4), (0.9999284744262695, 3), (0.9999278783798218, 4), (0.9999268054962158, 4), (0.9999256134033203, 3), (0.9999252557754517, 4), (0.9999231100082397, 4)]
Real rating: 3
Estimated rating: 3.8000006675720215

Similarities and real ratings: [(0.9998149275779724, 4), (0.999718964099884, 3), (0.9987132549285889, 4), (0.9966124892234802, 5), (0.9642419815063477, 4), (0.08868501335382462, 5), (0.023146022111177444, 4), (0.013803047128021717, 4), (0.00932248029857874, 3), (0.009224471636116505, 2)]
Real rating: 5
Estimated rating: 4.011327266

Similarities and real ratings: [(0.9999181032180786, 3), (0.9998934268951416, 0), (0.9998698234558105, 3), (0.9998693466186523, 3), (0.9998679161071777, 2), (0.9998606443405151, 3), (0.9998587369918823, 2), (0.9998550415039062, 4), (0.9998418092727661, 4), (0.9998334646224976, 3)]
Real rating: 4
Estimated rating: 2.6999893188476562

Similarities and real ratings: [(0.999935507774353, 4), (0.9999338388442993, 5), (0.9999295473098755, 2), (0.9999207258224487, 5), (0.9999203681945801, 5), (0.9999133348464966, 4), (0.9999102354049683, 4), (0.9998990297317505, 5), (0.9998900890350342, 2), (0.9998778104782104, 3)]
Real rating: 5
Estimated rating: 3.9000072479248047

Similarities and real ratings: [(0.999936580657959, 5), (0.9999361038208008, 5), (0.999935507774353, 5), (0.9999349117279053, 5), (0.9999321699142456, 5), (0.9999308586120605, 4), (0.9999270439147949, 5), (0.9999262094497681, 5), (0.9999233484268188, 5), (0.9999229907989502, 5)]
Real rating: 5
Estimated rating: 4.899999618530273


Similarities and real ratings: [(0.99993896484375, 2), (0.9999388456344604, 4), (0.9999326467514038, 4), (0.999927282333374, 3), (0.9999257326126099, 0), (0.9999160766601562, 4), (0.9999148845672607, 4), (0.9999104738235474, 4), (0.9999020099639893, 4), (0.999896764755249, 3)]
Real rating: 1
Estimated rating: 3.199995756149292

Similarities and real ratings: [(0.9999325275421143, 3), (0.9999315738677979, 3), (0.9999302625656128, 5), (0.9999301433563232, 2), (0.9999241828918457, 5), (0.9999220371246338, 4), (0.999919056892395, 5), (0.9999008178710938, 4), (0.999900221824646, 1), (0.9998823404312134, 5)]
Real rating: 2
Estimated rating: 3.699998378753662

Similarities and real ratings: [(0.999752938747406, 5), (0.9988570213317871, 4), (0.9987429976463318, 4), (0.9983898401260376, 4), (0.9975073933601379, 2), (0.9973190426826477, 5), (0.9907773733139038, 4), (0.9890027046203613, 4), (0.9760870337486267, 4), (0.974953830242157, 2)]
Real rating: 4
Estimated rating: 3.803671360015869

Simila