In [1]:
import numpy as np
import pandas as pd
import torch
import transformers

import torch.nn as nn
import torch.nn.functional as F

from tqdm import tqdm
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import BertModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup

In [2]:
# Seed shared by the NumPy / PyTorch RNGs and by both train_test_split calls.
RANDOM_SEED = 11
# Checkpoint name used for both the tokenizer and the BERT encoder.
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'

# NOTE(review): the fine-tuning notes next to the optimizer setup list batch
# sizes of 16/32 and 2-4 epochs; the values below fall outside those ranges --
# presumably a concession to limited GPU memory; confirm.
BATCH_SIZE = 1
NUM_EPOCHS = 10
# Predicted probabilities strictly above this value count as the positive class
# (target_class == 1) when accuracy is computed.
POSITIVE_SIMILARITY_THRESHOLD = 0.5

# The learning rate is one of the values listed for BERT fine-tuning (5e-5, 3e-5, 2e-5).
# NOTE(review): the dropout of 0.3 does not appear in those notes and looks like
# a local choice rather than a paper recommendation -- confirm.
DROPOUT_RATE = 0.3
LEARNING_RATE = 2e-5

In [3]:
# Seed the global NumPy and PyTorch RNGs so repeated runs are comparable.
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Bare expression: displays the selected device as the cell output.
device

device(type='cuda')

# Book pair datasets and dataloaders

In [4]:
# Load the labelled book pairs; the path is relative to the notebook's working directory.
book_pairs_df = pd.read_csv('data/similar_book_pairs.csv')
# Preview the first rows only -- never display the full frame.
book_pairs_df.head(10)

Unnamed: 0,pair_id,book1_id,book1_title,book1_description,book2_id,book2_title,book2_description,target_class
0,0,15984348,"X-Men, by Brian Wood, Volume 1: Blank Generation",Superstar writer Brian Wood (Wolverine & the X...,16002124,"Astonishing X-Men, Volume 11: Weaponized",It's the explosive aftermath to the year's mos...,1
1,1,15779244,"1971 (Mark Miller's One, #10)",100% of the author's proceeds will be donated ...,18270710,The Tiger (The Donkey and the Wall #3),"""Lost and found..."" The Tiger chronicles the L...",0
2,2,17338329,"Beware of Bad Boy (Beware of Bad Boy, #1)","**Ebook FREE on Amazon, iTunes, Google Play an...",17611210,The Mistress Mistake,Jessica Conway is at the end of her rope. She ...,1
3,3,32329580,"Fur (Becoming, #1)",Have you ever wondered what it's like to becom...,12434408,"Goodnight, Angels","Goodnight, rubber duckie. Thank you for the sc...",0
4,4,18584581,Exposed,Nia doesn't know why the TV show Dirty Rotten ...,17665070,Unearthing Cole,Cole Alston swore he'd never return to his chi...,0
5,5,30282842,Day Dreamer (Undeadly Secrets Book 2),Alex Hensley thought that discovering vampires...,12042965,Raphael's Mating,[Siren Classic ManLove: Erotic Alternative Par...,0
6,6,417803,"Shooting Chant (Ella Clah, #5)",Navajo Police Special Investigator Ella Clah i...,22022427,When Lines Are Blurred,J2 Fanfic High school au in which Jensen Ackle...,0
7,7,21531497,The Laughing Monsters,Denis Johnson's The Laughing Monsters is a hig...,18490553,American Innovations: Stories,A brilliant new collection of short stories fr...,1
8,8,19462885,"The Adventure Continues (Adventure, #2)",Rose and Ian were a match made in heaven. Or s...,25088474,Senses Series Box Set (Senses #1-5),This box set contains Senses Series Books 1-5....,0
9,9,31867595,Dreaming of Love (The Bradens at Trusty #5; T...,DREAMING OF LOVE is a USA TODAY BESTSELLER Emi...,22031611,Man from the Sky,"For seventy-three-year-old Jaime, the answer t...",0


In [5]:
class SimilarBooksDataset(Dataset):
    """Map-style dataset over a DataFrame of labelled book pairs.

    Each row must provide the columns ``book1_title``, ``book1_description``,
    ``book2_title``, ``book2_description`` and ``target_class``.
    Rows are addressed positionally (``iloc``), so the frame's index labels
    do not matter.
    """

    def __init__(self, book_pairs_df):
        self.book_pairs_df = book_pairs_df

    def __len__(self):
        return len(self.book_pairs_df)

    @staticmethod
    def _as_sequence(title, description):
        """Render one book as the text `"<title>" - <description>`."""
        return '"{}" - {}'.format(title, description)

    def __getitem__(self, item):
        """Return the text of both books and the label for the row at position ``item``."""
        row = self.book_pairs_df.iloc[item]

        sample = {}
        sample['book1_sequence'] = self._as_sequence(
            row['book1_title'], row['book1_description']
        )
        sample['book2_sequence'] = self._as_sequence(
            row['book2_title'], row['book2_description']
        )
        # The label is passed through untouched; conversion to a tensor
        # happens later, at batching time.
        sample['target_class'] = row['target_class']
        return sample

In [6]:
class SequenceCollate:
    """
    Batch collator for the dataloader: tokenizes every (book 1, book 2) text
    pair as one paired input and pads the batch to its longest sequence.
    """
    def __init__(self):
        self.tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

    def __call__(self, batch):
        """Turn a list of dataset samples into a dict of batched tensors.

        Returns a dict with ``sequences`` (token ids), ``attention_masks``
        and ``target_classes`` (float labels, one per pair).
        """
        # Single pass over the batch, then split into three parallel lists.
        rows = [
            (sample['book1_sequence'], sample['book2_sequence'], sample['target_class'])
            for sample in batch
        ]
        first_texts = [row[0] for row in rows]
        second_texts = [row[1] for row in rows]
        labels = [row[2] for row in rows]

        # Passing two parallel lists makes the tokenizer encode them as pairs.
        encoding = self.tokenizer(
            first_texts,
            second_texts,
            padding='longest',
            truncation='longest_first',
            return_tensors='pt',
        )

        # Labels are cast to float because they are compared against
        # probabilities by the loss downstream.
        return dict(
            sequences=encoding['input_ids'],
            attention_masks=encoding['attention_mask'],
            target_classes=torch.as_tensor(labels, dtype=torch.float),
        )

In [7]:
def create_data_loader(df, batch_size, mode='train'):
    """Build a DataLoader over the book pairs in ``df``.

    Args:
        df: DataFrame of book pairs, wrapped in a SimilarBooksDataset.
        batch_size: number of pairs per batch.
        mode: ``'train'`` shuffles the data every epoch; any other value
            keeps the original row order.

    Returns:
        A single-process DataLoader whose batches are produced by a fresh
        SequenceCollate instance.
    """
    pairs = SimilarBooksDataset(book_pairs_df=df)

    loader_options = {
        'batch_size': batch_size,
        'num_workers': 0,
        'collate_fn': SequenceCollate(),
        'shuffle': mode == 'train',
    }
    return DataLoader(pairs, **loader_options)

In [8]:
# Hold out 20% of the pairs, then split that hold-out evenly into validation
# and test, giving an 80/10/10 split overall.
# NOTE(review): neither split is stratified on target_class -- confirm the
# class balance is acceptable in all three parts.
df_train, df_test = train_test_split(book_pairs_df, test_size=0.2, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=RANDOM_SEED)

In [9]:
# Sanity check: (rows, columns) of each split.
print(f'Train shape     : {df_train.shape}')
print(f'Validation shape: {df_val.shape}')
print(f'Test shape      : {df_test.shape}')

Train shape     : (441700, 8)
Validation shape: (55213, 8)
Test shape      : (55213, 8)


In [10]:
# Only the training loader shuffles; mode='eval' keeps the row order fixed
# so validation and test passes are repeatable.
train_data_loader = create_data_loader(df_train, BATCH_SIZE, mode='train')
val_data_loader = create_data_loader(df_val, BATCH_SIZE, mode='eval')
test_data_loader = create_data_loader(df_test, BATCH_SIZE, mode='eval')

### Example data:

In [11]:
# Pull one collated batch from the training loader to inspect its structure.
data = next(iter(train_data_loader))
data.keys()

dict_keys(['sequences', 'attention_masks', 'target_classes'])

In [12]:
# Token ids and attention masks share one shape; the targets hold one value per pair.
print(data['sequences'].shape)
print(data['attention_masks'].shape)
print(data['target_classes'].shape)

print(f'Target classes: {data["target_classes"]}')

torch.Size([1, 468])
torch.Size([1, 468])
torch.Size([1])
Target classes: tensor([0.])


# BERT for book similarity training

In [13]:
class BookSimilarityBERT(nn.Module):
    """Pre-trained BERT encoder followed by dropout and a one-unit linear head.

    The forward pass yields one value in (0, 1) per input sequence (the
    sigmoid of the head's output), i.e. a probability rather than a logit.
    """

    def __init__(self):
        super().__init__()
        encoder = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.bert = encoder
        self.drop = nn.Dropout(p=DROPOUT_RATE)
        self.out = nn.Linear(encoder.config.hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Score a batch of tokenized book pairs.

        Args:
            input_ids: token ids, batch size x sequence length.
            attention_mask: mask of the same shape as ``input_ids``.

        Returns:
            1-D tensor with one probability per batch element.
        """
        # With return_dict=False the encoder returns a tuple; its second
        # element is the pooled representation derived from the first token,
        # which is the one intended for classification.
        _token_states, pooled = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False,
        )

        # (batch, 1) -> (batch,) before squashing into a probability.
        scores = self.out(self.drop(pooled)).squeeze(dim=1)
        return self.sigmoid(scores)

In [14]:
# Build the classifier (this loads the pre-trained BERT weights) and move
# all of its parameters to the selected device.
model = BookSimilarityBERT()
model = model.to(device)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Example forward:

In [15]:
# Move the example batch's model inputs to the device; the targets are not
# needed for the forward pass and stay where the collate function put them.
input_ids = data['sequences'].to(device)
attention_mask = data['attention_masks'].to(device)
target_classes = data['target_classes']

print(input_ids.shape) # batch size x seq length
print(attention_mask.shape) # batch size x seq length
print(target_classes.shape) # batch size (1-D: one label per pair)

torch.Size([1, 468])
torch.Size([1, 468])
torch.Size([1])


In [16]:
# Smoke-test the untrained head on the example batch: one probability per pair.
model(input_ids, attention_mask)

tensor([0.5434], device='cuda:0', grad_fn=<SigmoidBackward>)

### Actual training

In [17]:
# This setup is recommended for BERT training by the authors
# Batch size: 16, 32
# Learning rate (Adam): 5e-5, 3e-5, 2e-5
# Number of epochs: 2, 3, 4
# NOTE(review): BATCH_SIZE and NUM_EPOCHS as configured in this notebook fall
# outside these ranges -- confirm that is deliberate.

# NOTE(review): correct_bias=False presumably disables Adam's bias correction
# to mirror the original BERT optimizer -- see the transformers AdamW docs.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, correct_bias=False)
# The scheduler is stepped once per batch in train_epoch, so the schedule has
# to span every batch of every epoch.
total_steps = len(train_data_loader) * NUM_EPOCHS

# No warm-up steps are requested; the schedule covers total_steps in all.
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

# BCELoss takes probabilities, which matches the sigmoid output of
# BookSimilarityBERT.
loss_fn = nn.BCELoss().to(device)

In [18]:
def train_epoch(model, data_loader, loss_fn, optimizer, scheduler):
    """Run one optimisation pass over ``data_loader``.

    For every batch: forward pass, loss, backward pass, gradient clipping to
    a total norm of 1.0, optimizer step, scheduler step, gradient reset.

    Args:
        model: network called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: iterable of dicts with the keys ``sequences``,
            ``attention_masks`` and ``target_classes``.
        loss_fn: callable taking ``(outputs, targets)``.
        optimizer: optimizer over the model's parameters.
        scheduler: learning-rate scheduler, advanced once per batch.

    Returns:
        ``(accuracy, mean_loss)`` -- accuracy is a 0-dim tensor, mean_loss
        the NumPy mean of the per-batch losses.
    """
    model = model.train()

    batch_losses = []
    num_correct = 0
    num_seen = 0

    for batch in tqdm(data_loader):
        token_ids = batch["sequences"].to(device)
        masks = batch["attention_masks"].to(device)
        labels = batch["target_classes"].to(device)

        probabilities = model(input_ids=token_ids, attention_mask=masks)

        batch_loss = loss_fn(probabilities, labels)
        batch_loss.backward()
        # Clip before stepping to guard against exploding gradients.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        scheduler.step()
        # Gradients are cleared after the update, ready for the next batch.
        optimizer.zero_grad()

        batch_losses.append(batch_loss.detach().item())
        # A pair counts as positive only when its probability is strictly
        # above the threshold.
        hits = (probabilities > POSITIVE_SIMILARITY_THRESHOLD) == labels
        num_seen += token_ids.shape[0]
        num_correct += torch.sum(hits)

    accuracy = num_correct.double() / num_seen
    return accuracy, np.mean(batch_losses)

In [19]:
def eval_model(model, data_loader, loss_fn):
    """Evaluate ``model`` on ``data_loader`` without updating any weights.

    Args:
        model: network called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: iterable of dicts with the keys ``sequences``,
            ``attention_masks`` and ``target_classes`` (the batch layout
            produced by SequenceCollate).
        loss_fn: callable taking ``(outputs, targets)``.

    Returns:
        ``(accuracy, mean_loss)`` -- accuracy is a 0-dim tensor, mean_loss
        the NumPy mean of the per-batch losses.
    """
    model = model.eval()

    losses = []
    correct_predictions = 0
    all_predictions = 0

    # No gradients are needed for evaluation; this also saves memory.
    with torch.no_grad():
        for d in tqdm(data_loader):
            # These keys must match the batch dict built by the collate
            # function (and read by train_epoch); the previous names
            # ("input_ids", "attention_mask", "targets") are not present in
            # the batch and raised KeyError on the first validation batch.
            input_ids = d["sequences"].to(device)
            attention_masks = d["attention_masks"].to(device)
            targets = d["target_classes"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_masks
            )

            loss = loss_fn(outputs, targets)
            losses.append(loss.detach().item())

            # A pair counts as positive only when its probability is
            # strictly above the threshold.
            predicted_classes = outputs > POSITIVE_SIMILARITY_THRESHOLD
            all_predictions += input_ids.shape[0]
            correct_predictions += torch.sum(predicted_classes == targets)

    return correct_predictions.double() / all_predictions, np.mean(losses)

In [20]:
%%time

# Per-epoch metrics, keyed by 'train_acc', 'train_loss', 'val_acc', 'val_loss'.
history = defaultdict(list)
# Best validation accuracy seen so far; drives checkpointing below.
best_accuracy = 0

for epoch in range(NUM_EPOCHS):
    print(f'Epoch {epoch + 1}/{NUM_EPOCHS}')
    print('#' * 100)

    # One full optimisation pass over the training data.
    train_acc, train_loss = train_epoch(
        model,
        train_data_loader,    
        loss_fn, 
        optimizer,
        scheduler
    )
    print(f'Train loss {train_loss} accuracy {train_acc}')

    # Evaluate on the validation split after every epoch.
    val_acc, val_loss = eval_model(
        model,
        val_data_loader,
        loss_fn
    )

    print(f'Validation loss {val_loss} accuracy {val_acc}\n')

    # NOTE(review): the accuracies are stored exactly as returned, i.e. as
    # 0-dim tensors (on the training device); convert with .item() before
    # plotting or serialising -- confirm how history is consumed downstream.
    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    # Keep only the weights of the epoch with the strictly best validation
    # accuracy; the file is overwritten on every improvement.
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), 'best_model_state.bin')
        best_accuracy = val_acc

  0%|          | 0/441700 [00:00<?, ?it/s]Epoch 1/10
####################################################################################################
  0%|          | 3/441700 [00:01<73:15:16,  1.67it/s]


RuntimeError: CUDA out of memory. Tried to allocate 12.00 MiB (GPU 0; 4.00 GiB total capacity; 2.84 GiB already allocated; 9.35 MiB free; 2.85 GiB reserved in total by PyTorch)