In [1]:
import torch
import json
import random

import numpy as np
import torch.nn.functional as F

from tqdm.notebook import tqdm
from random import randrange
from transformers import BertForNextSentencePrediction, AutoTokenizer

In [2]:
# --- Reproducibility ---------------------------------------------------------
# Single seed shared by every RNG that gets seeded further down.
RANDOM_SEED = 42

# --- Model -------------------------------------------------------------------
# Checkpoint name handed to `from_pretrained` for both the model and tokenizer.
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'
# Passed to the tokenizer as `max_length` when encoding a description pair.
MAX_SEQUENCE_LENGTH = 256

# --- Input files -------------------------------------------------------------
# State dict loaded on top of the pre-trained model.
MODEL_PATH = '800k_100k_100k_best_model_state.pth'
# JSON file holding the book records.
BOOKS_PATH = 'data/top_200000_eng_books_filtered.json'
# Number of books taken from the start of BOOKS_PATH.
NUM_OF_BOOKS_TO_USE = 10_000

In [3]:
# Seed NumPy, PyTorch and the stdlib RNG with the same value so that the
# random source-book pick below is repeatable across kernel restarts.
for seed_fn in (np.random.seed, torch.manual_seed, random.seed):
    seed_fn(RANDOM_SEED)

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

# Model and tokenizer loading

In [4]:
# Build the next-sentence-prediction architecture from the public checkpoint,
# then replace its weights with the state dict stored at MODEL_PATH.
model = BertForNextSentencePrediction.from_pretrained(PRE_TRAINED_MODEL_NAME)

# map_location=device remaps the saved tensors onto the device selected above,
# so a checkpoint written from GPU tensors still loads on a CPU-only machine.
# NOTE(review): torch.load unpickles the file -- only load checkpoints that
# come from a trusted source.
state_dict = torch.load(MODEL_PATH, map_location=device)
model.load_state_dict(state_dict)

model = model.to(device)
# Evaluation mode: layers such as dropout behave deterministically.
model.eval()

tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForNextSentencePrediction: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Books loading

In [5]:
def extract_first_n_books(n, books_path=None):
    """Load the books JSON file and keep only its first ``n`` entries.

    The file is expected to contain a single JSON object whose keys are book
    ids and whose values are the book records.

    Args:
        n: Maximum number of books to keep. If the file holds fewer than
            ``n`` books, all of them are returned; zero or a negative value
            yields an empty result.
        books_path: Path of the JSON file to read. When omitted, the
            module-level ``BOOKS_PATH`` is used.

    Returns:
        A ``(books_to_use, book_ids)`` tuple where ``books_to_use`` maps each
        selected book id to its record and ``book_ids`` lists the same ids in
        file order.
    """
    if books_path is None:
        books_path = BOOKS_PATH

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(books_path, 'r', encoding='utf-8') as books_file:
        books = json.load(books_file)

    # Slicing (instead of indexing 0..n-1) tolerates n > len(books), and the
    # clamp keeps the id list and the dict in agreement for negative n.
    selected_ids = list(books)[:max(n, 0)]
    books_to_use = {book_id: books[book_id] for book_id in selected_ids}

    return books_to_use, selected_ids

In [6]:
# Working subset for the rest of the notebook: the records keyed by id, plus
# the ids as an ordered list.
books, book_ids = extract_first_n_books(n=NUM_OF_BOOKS_TO_USE)
print(f'{len(books)} books loaded.')

10000 books loaded.


# Books similar to a source one

In [7]:
# Pick one loaded book at random (repeatable thanks to the seeding above) and
# keep its id and its record under separate names.
source_book_id = book_ids[randrange(len(books))]
source_book = books[source_book_id]

print(f'Source book selected: {source_book}')

Source book selected: {'book_id': '12429335', 'title': 'Water Sings Blue: Ocean Poems', 'description': 'Come down to the shore with this rich and vivid celebration of the ocean! With watercolors gorgeous enough to wade in by award-winning artist Meilo So and playful, moving poems by Kate Coombs, Water Sings Blue evokes the beauty and power, the depth and mystery, and the endless resonance of the sea.', 'similar_books': ['18166919', '12962487']}


In [8]:
def get_top_k_most_similar(source_book, book_ids, books_data, k=10):
    """Rank candidate books by the model's score against ``source_book``.

    Each candidate description is paired with the source description
    (source as sentence A, candidate as sentence B) and scored with the
    module-level ``model``; the softmax probability of class 0 is used as the
    similarity score.

    Args:
        source_book: Book record (dict) with at least a ``'description'`` key.
        book_ids: Iterable of ids of the candidate books to score.
        books_data: Mapping from book id to book record; every record needs a
            ``'description'`` key.
        k: Number of top-scoring books to return.

    Returns:
        A list of at most ``k`` ``(book_id, score)`` tuples sorted by score in
        descending order. The source book itself is never included.
    """
    source_description = source_book['description']
    source_id = source_book.get('book_id')

    results = []
    # Pure inference: without no_grad every forward pass would record an
    # autograd graph that is never used, wasting memory and time.
    with torch.no_grad():
        for book_id in tqdm(book_ids):
            book = books_data[book_id]

            # A book is not a meaningful recommendation for itself.
            if book is source_book or (
                source_id is not None and book.get('book_id') == source_id
            ):
                continue

            encoded_sequences = tokenizer(
                source_description,
                book['description'],
                padding='longest',
                truncation='longest_first',
                return_tensors='pt',
                max_length=MAX_SEQUENCE_LENGTH
            )

            outputs = model(
                input_ids=encoded_sequences['input_ids'].to(device),
                attention_mask=encoded_sequences['attention_mask'].to(device),
                token_type_ids=encoded_sequences['token_type_ids'].to(device)
            )
            probs = F.softmax(outputs.logits, dim=1)

            # Class 0 means "sentence B follows sentence A"; the batch holds a
            # single pair, hence row 0.
            similarity_score = probs[0][0]
            results.append((book_id, similarity_score.item()))

    results.sort(key=lambda pair: pair[1], reverse=True)
    return results[:k]

In [9]:
# Score every loaded book against the selected source book; keep the best 10.
similar_books = get_top_k_most_similar(source_book, book_ids, books, k=10)

  0%|          | 0/10000 [00:00<?, ?it/s]

In [10]:
# Recap of the query book.
print(
    f'Source book title: {source_book["title"]}',
    f'Source book description: {source_book["description"]}\n\n',
    sep='\n',
)

# Ranked recommendations, one four-line entry per book.
print('Top 10 most similar books:\n')
for idx, (book_id, score) in enumerate(similar_books):
    similar_book = books[book_id]
    print(
        f'{idx + 1}.',
        f'Title: {similar_book["title"]}',
        f'Score 0-1: {score}',
        f'Description: {similar_book["description"]}\n',
        sep='\n',
    )

Source book title: Water Sings Blue: Ocean Poems
Source book description: Come down to the shore with this rich and vivid celebration of the ocean! With watercolors gorgeous enough to wade in by award-winning artist Meilo So and playful, moving poems by Kate Coombs, Water Sings Blue evokes the beauty and power, the depth and mystery, and the endless resonance of the sea.


Top 10 most similar books:

1.
Title: Welou, My Brother
Score 0-1: 0.9999364614486694
Description: Welou, My Brotheris the story of a boy and his family, and their struggle to live between three cultures. They are Islanders, descended from labourers who were shanghaied from Vanuatu and put to work in the Queensland cane fields. Welou, his mother Ivy and father Wacvie, and his brothers and sisters live a simple and close family life centered on farming and domestic chores. In this book, Faith Bandler recreates the timeless, slow and relaxed rhythm of life of her childhood. It is a story that will be enjoyed by all age

# Books similar to a description

In [11]:
# Free-text query used in place of a book record; adjacent literals are joined
# by the parser into one string.
description = (
    "Inventor Tony Stark is no ordinary billionaire. "
    "He may have a big house and lots of money, but he also has a powerful suit "
    "that he wears to help fight evil and protect people in need. "
    "Tony Stark is the Invincible Iron Man!"
)

In [12]:
def get_top_k_most_similar_for_description(description, book_ids, books_data, k=10):
    """Rank candidate books by the model's score against a free-text description.

    This is the same ranking as ``get_top_k_most_similar``; the text is wrapped
    in a minimal record so that the scoring loop exists in one place only.

    Args:
        description: Query text, used as sentence A of every pair.
        book_ids: Iterable of ids of the candidate books to score.
        books_data: Mapping from book id to book record; every record needs a
            ``'description'`` key.
        k: Number of top-scoring books to return.

    Returns:
        A list of at most ``k`` ``(book_id, score)`` tuples sorted by score in
        descending order.
    """
    # Pure inference: disable autograd bookkeeping for the whole ranking pass.
    with torch.no_grad():
        return get_top_k_most_similar(
            {'description': description}, book_ids, books_data, k
        )

In [13]:
# Score every loaded book against the free-text description; keep the best 10.
similar_books_to_desc = get_top_k_most_similar_for_description(
    description, book_ids, books, k=10
)

  0%|          | 0/10000 [00:00<?, ?it/s]

In [14]:
# Recap of the query text.
print(f'Source description: {description}\n\n')

# Ranked recommendations, one four-line entry per book.
print('Top 10 most similar books based on the description:\n')
for idx, (book_id, score) in enumerate(similar_books_to_desc):
    similar_book = books[book_id]
    print(
        f'{idx + 1}.',
        f'Title: {similar_book["title"]}',
        f'Score 0-1: {score}',
        f'Description: {similar_book["description"]}\n',
        sep='\n',
    )

Source description: Inventor Tony Stark is no ordinary billionaire. He may have a big house and lots of money, but he also has a powerful suit that he wears to help fight evil and protect people in need. Tony Stark is the Invincible Iron Man!


Top 10 most similar books based on the description:

1.
Title: The Compleat Terminal City
Score 0-1: 0.9999097585678101
Description: Visionary designer and comics creator Dean Motter (Mister X, Electropolis, Batman: Nine Lives) returns with the purest expression to date of his patented retro futurism! Terminal City is a place where transistor-tube robots rub elbows with old-time gangsters, where bright, shiny technologies cast deep noir shadows. Teaming Motter with celebrated artist Michael Lark (Daredevil, Gotham Central), this massive collection reprints the original series, along with its sequel, Terminal City: Aerial Graffiti! * All fourteen issues of the classic series, collected for the first time! * Eisner-and Harvey Award-nominated serie