### Setup

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.3-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 27.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 60.2 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 58.3 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.21.3


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
import torch
import nltk

import random
import glob
import os
import re

Device and hyperparameters:

In [None]:
# Prefer the GPU whenever PyTorch can see one; otherwise stay on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

print(device)

cuda


In [None]:
# Folder scanned for *.txt files; each file name becomes the title label of its sentences.
# The relative path points into a Google Drive folder; no mount call is visible in
# this notebook, so the drive presumably has to be mounted beforehand — TODO confirm.
dataset_path = './drive/MyDrive/Colab Notebooks/Lovecraft GPT-2/data'
# Abbreviations whose trailing ". " is masked in preprocess_text before sentence splitting.
dot_stopwords = {'Dr', 'Ms', 'Mr', 'Mrs', 'Prof', 'Inc', 'Fr', 'St'}
# Hugging Face checkpoint name; also used as the prefix of the saved checkpoint files.
model_name = 'gpt2-medium' #'distilgpt2'
# Directory where train() writes one state dict per epoch.
output_path = 'trained_models'

# Used by train() as the gradient-accumulation interval (the DataLoader yields one sample at a time).
batch_size = 16
# Number of passes over the training data.
epochs = 8
# Learning rate handed to AdamW in train().
learning_rate = 3e-4 # 3e-5
# NOTE(review): not referenced anywhere in the visible notebook — no scheduler is created.
warmup_steps = 100
# Upper bound, in tokens, for a single sample and for a packed training sequence in train().
max_seq_len = 400

### Without finetuning

In [None]:
def generate(prompt_text, model, tokenizer, n_seqs=1, min_length=16, max_length=32):
    """Sample continuations of ``prompt_text`` from ``model``.

    Args:
        prompt_text: text the generated sequences start with.
        model: object exposing a Hugging Face style ``generate`` method.
        tokenizer: tokenizer providing ``encode``, ``decode`` and ``eos_token_id``.
        n_seqs: number of sequences to return.
        min_length: forwarded to ``model.generate`` as ``min_length``.
        max_length: forwarded to ``model.generate`` as ``max_length``.

    Returns:
        list[str]: one decoded string per generated sequence.
    """
    encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors="pt")

    # The prompt is created on the CPU. Move it next to the model's weights so the
    # call also works for a model that was left on the GPU (e.g. right after train()).
    model_device = getattr(model, 'device', None)
    if model_device is not None:
        encoded_prompt = encoded_prompt.to(model_device)

    # Sampling with a 0.8 nucleus (top_k disabled) on top of 10 beams.
    output_sequences = model.generate(
        input_ids=encoded_prompt,
        max_length=max_length,
        min_length=min_length,
        temperature=0.8,
        top_k=0,
        top_p=0.8,
        num_beams=10,
        repetition_penalty=1.2,
        do_sample=True,
        num_return_sequences=n_seqs,
        # GPT-2 has no dedicated padding token, so EOS doubles as padding.
        pad_token_id=tokenizer.eos_token_id
    )

    return [tokenizer.decode(sequence.tolist()) for sequence in output_sequences]

In [None]:
# Stock pretrained tokenizer and weights, used for the baseline sample in the next cell.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

Downloading vocab.json:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/336M [00:00<?, ?B/s]

In [None]:
# Baseline: what the model produces for this prompt before any fine-tuning.
output = generate('Nastya Kuzina', model, tokenizer)
print(output[0])

Nastya Kuzina (BK)
: This is a fantastic win for me. The fact that I have the opportunity to run alongside these girls


### Prepare dataset

In [None]:
# Fetch NLTK's Punkt data and load the English sentence splitter that is later
# passed to load_data / preprocess_text.
nltk.download('punkt')
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
def preprocess_text(tokenizer, text, max_sentences=10, rng=None, stopwords=None):
    """Split ``text`` into sentences and return a random sample of them.

    Line breaks are flattened to spaces. Every ``"<abbr>. "`` occurrence is
    temporarily rewritten to ``"<abbr>._"`` so the sentence splitter does not
    see a sentence end after an abbreviation; the space is restored afterwards.

    Args:
        tokenizer: object with a ``tokenize(text)`` method returning a list of
            sentences.
        text: raw document text.
        max_sentences: how many shuffled sentences to keep; ``None`` keeps all.
        rng: object with a ``shuffle`` method (e.g. ``random.Random(seed)``) for
            reproducible sampling. Defaults to the global ``random`` module.
        stopwords: abbreviations to protect. Defaults to the module-level
            ``dot_stopwords``.

    Returns:
        list[str]: at most ``max_sentences`` sentences in shuffled order.
    """
    if stopwords is None:
        stopwords = dot_stopwords
    if rng is None:
        rng = random

    text = text.replace('\n', ' ')

    # Mask the space after each abbreviation so it cannot be taken for a sentence boundary.
    for stopword in stopwords:
        text = text.replace(stopword + '. ', stopword + '._')

    tokenized = tokenizer.tokenize(text)
    rng.shuffle(tokenized)

    sentences = []
    for sentence in tokenized[:max_sentences]:
        # Undo the masking on the sentences that are actually kept.
        for stopword in stopwords:
            sentence = sentence.replace(stopword + '._', stopword + '. ')

        sentences.append(sentence)

    return sentences

In [None]:
def preprocess_title(title):
    """Derive a readable title from a file path.

    Keeps only the file name, swaps underscores for spaces and removes every
    ``.txt`` occurrence.
    """
    file_name = os.path.basename(title)
    spaced = ' '.join(file_name.split('_'))
    return ''.join(spaced.split('.txt'))

In [None]:
def load_data(tokenizer, path):
    """Read one text file and return its sampled sentences with matching labels.

    Args:
        tokenizer: sentence splitter forwarded to ``preprocess_text``.
        path: path of the text file; its file name yields the title label.

    Returns:
        tuple[list[str], list[str]]: ``(labels, sentences)`` of equal length,
        where every label is the title derived from ``path``.
    """
    title = preprocess_title(path)

    # Explicit encoding: the platform default differs between machines, which would
    # make the same file decode differently (or fail) outside the original runtime.
    with open(path, 'r', encoding='utf-8') as file:
        text = file.read()

    sentences = preprocess_text(tokenizer, text)
    labels = [title] * len(sentences)

    return labels, sentences

In [None]:
class SentenceDataset(Dataset):
    """In-memory dataset holding one token-id tensor per (label, sentence) pair.

    Each sample is the text ``"<label>\\n<sentence>"`` run through
    ``tokenizer.encode``. ``max_length`` cuts the sentence by characters,
    before tokenization.
    """

    def __init__(self, x, y, tokenizer, max_length=1024):
        super().__init__()

        encoded = []
        for idx in range(len(y)):
            label = y[idx]
            sentence = x[idx]
            sample_text = f'{label}\n{sentence[:max_length]}'
            encoded.append(torch.tensor(tokenizer.encode(sample_text)))

        self.sentences = encoded

    def __len__(self):
        # Number of encoded samples.
        return len(self.sentences)

    def __getitem__(self, item):
        # Tensor of token ids for the requested sample.
        return self.sentences[item]

In [None]:
# glob returns files in filesystem order and preprocess_text samples sentences with
# the global `random` module, so sort the files and seed the generator to make
# repeated runs build the same dataset (and therefore the same train/test split).
random.seed(42)
books = sorted(glob.glob(f'{dataset_path}/*.txt'))
y = []
x = []

for book in books:
    labels, sentences = load_data(sentence_tokenizer, book)

    y.extend(labels)
    x.extend(sentences)

In [None]:
# Sanity check: number of .txt files found under dataset_path.
print(len(books))

102


In [None]:
# Hold out 20% of the (sentence, title) pairs; the fixed random_state keeps the
# split repeatable for a given input order.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
# GPT-2 tokenizer used to encode the samples (re-created from the same checkpoint name).
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
train_dataset = SentenceDataset(x_train, y_train, tokenizer)
# One sample per batch: train() concatenates samples and accumulates gradients itself.
train_data_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)

# NOTE(review): test_dataset is not used by any later cell in the visible notebook.
test_dataset = SentenceDataset(x_test, y_test, tokenizer)

In [None]:
# Total number of sampled sentences across all files.
# NOTE(review): preprocess_text keeps at most 10 sentences per file, so with the 102
# files reported above this cannot exceed 1020 — the recorded output below looks
# like it comes from an earlier version of the notebook; re-run to refresh it.
print(len(x))

2946


### Train

In [None]:
def train(model, output_path=output_path):
    """Fine-tune ``model`` on ``train_data_loader`` and checkpoint it after every epoch.

    Samples coming from the loader are concatenated ("packed") along the sequence
    axis until adding the next one would exceed ``max_seq_len``; each packed
    sequence is one forward/backward pass. Gradients are summed over
    ``batch_size`` packed sequences before every optimizer step, and whatever is
    left at the end of an epoch is applied before the checkpoint is written.

    Reads the module-level settings ``device``, ``learning_rate``, ``epochs``,
    ``batch_size``, ``max_seq_len``, ``model_name`` and ``train_data_loader``.

    Args:
        model: language model called as ``model(input_ids, labels=input_ids)``
            whose first output is the loss.
        output_path: directory receiving ``{model_name}-{epoch}.pt`` state dicts;
            created (including parents) when missing.
    """
    # makedirs copes with nested paths and with a directory that already exists.
    os.makedirs(output_path, exist_ok=True)

    model = model.to(device)
    model.train()

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    # Start from clean gradients even if the model was trained or evaluated before.
    optimizer.zero_grad()

    # Packing buffer. It lives outside the epoch loop, so a partially filled buffer
    # at the end of one epoch is continued in the next; the buffer left after the
    # final epoch is never trained on.
    tmp_items_tens = None
    for epoch in range(epochs):
        proc_seq_count = 0
        sum_loss = 0.0

        for item in tqdm(train_data_loader, total=len(train_data_loader)):
            # Samples longer than the limit are skipped (checked before the device copy).
            if item.size(1) > max_seq_len:
                continue

            item_tens = item.to(device)

            if tmp_items_tens is None:
                tmp_items_tens = item_tens
                continue

            if tmp_items_tens.size(1) + item_tens.size(1) <= max_seq_len:
                # NOTE(review): the slice drops the first token of every appended sample.
                # The samples do not visibly start with a special token, so this
                # presumably removes the first token of the title — confirm it is intended.
                tmp_items_tens = torch.cat([tmp_items_tens, item_tens[:, 1:]], dim=1)
                continue

            # Buffer is full: train on it and start a new buffer with the current sample.
            work_items_tens, tmp_items_tens = tmp_items_tens, item_tens

            loss = model(work_items_tens, labels=work_items_tens)[0]
            loss.backward()
            # .item() keeps the running total as a plain float instead of a device tensor.
            sum_loss += loss.item()

            # Count first, then test: the original tested before incrementing, which
            # fired an optimizer step after the very first sequence of every epoch.
            proc_seq_count += 1
            if proc_seq_count % batch_size == 0:
                optimizer.step()
                optimizer.zero_grad()

        # Apply gradients of an incomplete final group so they neither leak into the
        # next epoch nor get dropped before the checkpoint is saved.
        if proc_seq_count % batch_size != 0:
            optimizer.step()
            optimizer.zero_grad()

        print(f"Epoch {epoch+1} | Train loss: {sum_loss}")
        torch.save(model.state_dict(), f'{output_path}/{model_name}-{epoch+1}.pt')

In [None]:
# Start fine-tuning from freshly loaded pretrained weights.
model = GPT2LMHeadModel.from_pretrained(model_name)

In [None]:
# Fine-tune; a checkpoint is written to output_path after every epoch.
train(model)

  0%|          | 0/813 [00:00<?, ?it/s]

Epoch 1 | Train loss: 369.568359375


  0%|          | 0/813 [00:00<?, ?it/s]

Epoch 2 | Train loss: 295.7247314453125


  0%|          | 0/813 [00:00<?, ?it/s]

Epoch 3 | Train loss: 242.28204345703125


  0%|          | 0/813 [00:00<?, ?it/s]

Epoch 4 | Train loss: 200.42002868652344


  0%|          | 0/813 [00:00<?, ?it/s]

Epoch 5 | Train loss: 155.9810028076172


  0%|          | 0/813 [00:00<?, ?it/s]

Epoch 6 | Train loss: 117.1095199584961


  0%|          | 0/813 [00:00<?, ?it/s]

Epoch 7 | Train loss: 82.64701843261719


  0%|          | 0/813 [00:00<?, ?it/s]

Epoch 8 | Train loss: 60.59356689453125


### Evaluate

In [None]:
def generate(prompt_text, model, tokenizer, n_seqs=1, min_length=16, max_length=32):
    """Sample continuations of ``prompt_text`` from ``model``.

    Same as the earlier definition of ``generate`` except that ``num_beams`` is
    passed as ``None`` instead of 10; this definition replaces the earlier one
    from here on.

    Args:
        prompt_text: text the generated sequences start with.
        model: object exposing a Hugging Face style ``generate`` method.
        tokenizer: tokenizer providing ``encode``, ``decode`` and ``eos_token_id``.
        n_seqs: number of sequences to return.
        min_length: forwarded to ``model.generate`` as ``min_length``.
        max_length: forwarded to ``model.generate`` as ``max_length``.

    Returns:
        list[str]: one decoded string per generated sequence.
    """
    encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors="pt")

    # The prompt is created on the CPU. Move it next to the model's weights so the
    # call also works for a model that was left on the GPU (e.g. right after train()).
    model_device = getattr(model, 'device', None)
    if model_device is not None:
        encoded_prompt = encoded_prompt.to(model_device)

    # Sampling with a 0.8 nucleus (top_k disabled).
    output_sequences = model.generate(
        input_ids=encoded_prompt,
        max_length=max_length,
        min_length=min_length,
        temperature=0.8,
        # presumably lets the library fall back to the model's configured beam count — TODO confirm
        num_beams=None,
        top_k=0,
        top_p=0.8,
        repetition_penalty=1.2,
        do_sample=True,
        num_return_sequences=n_seqs,
        # GPT-2 has no dedicated padding token, so EOS doubles as padding.
        pad_token_id=tokenizer.eos_token_id
    )

    return [tokenizer.decode(sequence.tolist()) for sequence in output_sequences]

In [None]:
def process_output(output, string_length=64):
    """Soft-wrap ``output`` for display.

    A space is turned into a line break once the current line holds more than
    ``string_length`` characters, so words are never split and a line may run
    somewhat past ``string_length``. Line breaks already present in ``output``
    are kept and restart the count, so the line following them is not wrapped
    early.

    Args:
        output: text to wrap.
        string_length: number of characters a line must exceed before the next
            space becomes a line break.

    Returns:
        str: the wrapped text.
    """
    pieces = []
    line_length = 0

    for char in output:
        line_length += 1

        if char == '\n':
            # Existing line break: keep it and start counting a fresh line.
            pieces.append(char)
            line_length = 0
        elif char == ' ' and line_length > string_length:
            pieces.append('\n')
            line_length = 0
        else:
            pieces.append(char)

    # Join once instead of rebuilding the string for every character.
    return ''.join(pieces)

In [None]:
# Rebuild the architecture, then restore the fine-tuned weights.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# train() saves "{model_name}-{epoch}.pt" after each epoch, so the last checkpoint is numbered `epochs`.
checkpoint_path = f'{output_path}/{model_name}-{epochs}.pt'
# Generation below runs on the CPU, so load the weights straight into CPU memory
# instead of routing them through the training device first.
model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'))
model.eval()
model.to('cpu')

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout)

In [None]:
# Sample from the fine-tuned model (min_length=64, max_length=128) and wrap the text at about 64 characters.
output = generate('nameless', model, tokenizer, 1, 64, 128)
print(process_output(output[0], 64))

nameless City
The figures were startling and disquieting, for they
suggested a mammoth subterranean city whose height was over nine
hundred feet (200 m) above the present level of civilisation. Hoard
from Yuggoth  [Note: The figure in this legend is not to be found
anywhere else in the known world.] Beast in the Cave or John Lees
Adventure
And one night a mighty gulf was bridged, an ocean that
no man knows how deep; when all the dreams and nightmares that men
have ever known were swallowed up by another sea which none but him
has seen nor dared to face—the black abyss beyond which
