In [102]:
import pandas as pd
import re
import torch
import os
from transformers import GPT2Tokenizer, GPT2LMHeadModel, set_seed, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import transformers
from torch.utils.data import Dataset, DataLoader

In [103]:
data = pd.read_csv("data/goodreads_books.csv", converters={'COLUMN_NAME': pd.eval})
data = data.sample(n = 1000)

In [104]:
data = data.filter(["title", "rating_count", "average_rating", "genre_and_votes"], axis=1)

In [105]:
def genre_break(genre):
    for index, item in enumerate(genre):
        new_item = re.sub('\s[0-9]+', '', item)
        genre[index] = new_item
    
    return genre

In [106]:
data["genre"] = data["genre_and_votes"].str.split(", ")
data.drop(axis=1, columns=["genre_and_votes"], inplace=True)

In [107]:
data.dropna(inplace=True, axis=0)

In [108]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 947 entries, 33901 to 36835
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           947 non-null    object 
 1   rating_count    947 non-null    int64  
 2   average_rating  947 non-null    float64
 3   genre           947 non-null    object 
dtypes: float64(1), int64(1), object(2)
memory usage: 37.0+ KB


In [109]:
data["genre"].apply(lambda item: genre_break(item))

33901    [Fantasy, Childrens-Middle Grade, Childrens, A...
35383                                            [Romance]
35418                [Contemporary, Retellings, Childrens]
27807    [Romance-Paranormal Romance, Paranormal-Vampir...
41157    [Travel, Nonfiction, Autobiography-Memoir, Cul...
                               ...                        
50474          [Classics, Mystery, Fiction, Short Stories]
11804    [Fantasy, European Literature-Czech Literature...
18063    [Fiction, Contemporary, European Literature-Ge...
11346                                          [Christian]
36835           [Short Stories, Fiction, Literary Fiction]
Name: genre, Length: 947, dtype: object

In [110]:
data

Unnamed: 0,title,rating_count,average_rating,genre
33901,Switch,2163,4.10,"[Fantasy, Childrens-Middle Grade, Childrens, A..."
35383,Until Forever,465,4.69,[Romance]
35418,Looking-Glass Girl,881,4.23,"[Contemporary, Retellings, Childrens]"
27807,Because Your Vampire Said So,4530,4.08,"[Romance-Paranormal Romance, Paranormal-Vampir..."
41157,Beyond the Sky and the Earth: A Journey Into B...,3383,4.15,"[Travel, Nonfiction, Autobiography-Memoir, Cul..."
...,...,...,...,...
50474,"The Complete Sherlock Holmes, Volume I",28906,4.49,"[Classics, Mystery, Fiction, Short Stories]"
11804,VlÃ¡dci strachu,663,4.23,"[Fantasy, European Literature-Czech Literature..."
18063,Vom Ende der Einsamkeit,9687,4.27,"[Fiction, Contemporary, European Literature-Ge..."
11346,What Gets You Through,23,4.43,[Christian]


## Fine-Tuning Set-Up

In [111]:
class BookTitles(Dataset):
    def __init__(self, content, gpt2_type="gpt2", max_length=256):

        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type, bos_token = '<startoftext>', eos_token = '<endoftext>', pad_token = '<pad>')
        self.input_ids = []

        for _, row in content.iterrows():
          tags = ""
          for item in row[3]:
              tags = tags + item + ', '
          prep = self.tokenizer.encode(f'<startoftext>Tags: {tags}\nTitle: {row[0]}<endoftext>', truncation = True, max_length = max_length, padding="max_length")
          self.input_ids.append(torch.tensor(prep))
        
    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, item):
        return self.input_ids[item]

In [112]:
def pack_tensor(new_tensor, packed_tensor, max_seq_len):
    if packed_tensor is None:
        return new_tensor, True, None
    if new_tensor.size()[1] + packed_tensor.size()[1] > max_seq_len:
        return packed_tensor, False, new_tensor
    else:
        packed_tensor = torch.cat([new_tensor, packed_tensor[:, 1:]], dim=1)
        return packed_tensor, True, None

In [113]:
def train(
    dataset, model, tokenizer,
    batch_size=16, epochs=4, lr=2e-5,
    max_seq_len=400, warmup_steps=200,
    gpt2_type="gpt2", output_dir=".", output_prefix="wreckgar",
    test_mode=False, save_model_on_epoch=False,
):
    acc_steps = 100
    device = torch.device("cuda")
    model = model.cuda()
    model.train()

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=-1
    )

    train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
    loss=0
    accumulating_batch_count = 0
    input_tensor = None

    for epoch in range(epochs):

        print(f"Training epoch {epoch}")
        print(loss)
        for idx, entry in tqdm(enumerate(train_dataloader)):
            (input_tensor, carry_on, remainder) = pack_tensor(entry, input_tensor, 768)

            if carry_on and idx != len(train_dataloader) - 1:
                continue

            input_tensor = input_tensor.to(device)
            outputs = model(input_tensor, labels=input_tensor)
            loss = outputs[0]
            loss.backward()

            if (accumulating_batch_count % batch_size) == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                model.zero_grad()

            accumulating_batch_count += 1
            input_tensor = None
        if save_model_on_epoch:
            torch.save(
                model.state_dict(),
                os.path.join(output_dir, f"{output_prefix}-{epoch}.pt"),
            )
    return model

In [114]:
train_set = BookTitles(data)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Model Set-Up

In [115]:
torch.manual_seed(32)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token = '<startoftext>', eos_token = '<endoftext>', pad_token = '<pad>')
model = GPT2LMHeadModel.from_pretrained('gpt2', pad_token_id = tokenizer.eos_token_id).cuda()
model.resize_token_embeddings(len(tokenizer))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embeding dimension will be 50260. This might induce some performance reduction as *Tensor Cores* will not be available. For more details  about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


Embedding(50260, 768)

In [116]:
model = train(train_set, model, tokenizer)

Training epoch 0
0


947it [12:13,  1.29it/s]


Training epoch 1
tensor(9.1137, device='cuda:0', grad_fn=<NllLossBackward0>)


947it [12:34,  1.26it/s]


Training epoch 2
tensor(5.3341, device='cuda:0', grad_fn=<NllLossBackward0>)


947it [12:42,  1.24it/s]


Training epoch 3
tensor(2.9886, device='cuda:0', grad_fn=<NllLossBackward0>)


947it [12:55,  1.22it/s]


In [121]:
text = "Tags: Romance, \nTitle: Hi"
encoded_input = tokenizer.encode(f'{text}', return_tensors='pt').to("cuda")
outputs = model.generate(encoded_input, max_length = 128, no_repeat_ngram_size = 2, num_beams = 5, early_stopping = True)

In [122]:
print(tokenizer.decode(outputs[0], skip_special_tokens = True))

Tags: Romance, 
Title: Hi,

I I,,
I, I, my.. I'm..,.,., my.,
I," I., I,"I'm, the." I.".,.",I," my.., the,".," I," I.," the my,"the",",the,",". I."I.,,".," my.","." the,".,, a, an," a,"
