In [1]:
import os
import urllib.request

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from nltk.tokenize import word_tokenize
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer

In [2]:
# set seeds for reproducibility
SEED = 123
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

torch.backends.cudnn.deterministic = True

In [3]:
# book collection
BOOKS = {
    'TheOdyssey': "https://raw.githubusercontent.com/cetinsamet/data-science/main/data/book/TheOdyssey_Homer.txt",
    'PrideAndPrejudice': "https://raw.githubusercontent.com/cetinsamet/data-science/main/data/book/PrideAndPrejudice_JaneAusten.txt",
    'AJourneyToTheCentreOfTheEarth': "https://raw.githubusercontent.com/cetinsamet/data-science/main/data/book/JulesVerne_AJourneyToTheCentreOfTheEarth.txt",
}
BOOKS

{'TheOdyssey': 'https://raw.githubusercontent.com/cetinsamet/data-science/main/data/book/TheOdyssey_Homer.txt',
 'PrideAndPrejudice': 'https://raw.githubusercontent.com/cetinsamet/data-science/main/data/book/PrideAndPrejudice_JaneAusten.txt',
 'AJourneyToTheCentreOfTheEarth': 'https://raw.githubusercontent.com/cetinsamet/data-science/main/data/book/JulesVerne_AJourneyToTheCentreOfTheEarth.txt'}

In [4]:
# choose a book
BOOK = 'PrideAndPrejudice'

# get the book's url 
book_url = BOOKS[BOOK]
# extract the book's filename from the url
book_filename = os.path.basename(book_url) 

# download the book in local if it doesn't already exist
if not os.path.exists(book_filename):
    try:
        !wget $book_url
        print("File is succesfully downloaded.")
    except Exception as e:
        print(f"Could not download the book from {book_url}")
        print(e)
else:
    print("File has already been downloaded.")

--2024-04-11 18:17:45--  https://raw.githubusercontent.com/cetinsamet/data-science/main/data/book/PrideAndPrejudice_JaneAusten.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 703907 (687K) [text/plain]
Saving to: 'PrideAndPrejudice_JaneAusten.txt'


2024-04-11 18:17:46 (18.5 MB/s) - 'PrideAndPrejudice_JaneAusten.txt' saved [703907/703907]

File is succesfully downloaded.


In [5]:
# define hyperparameters
N_EPOCHS = 30
BATCH_SIZE = 128
SEQ_LEN = 200
LEARNING_RATE = 1e-4
EMBED_SIZE = 128
N_HEADS = 2
NUM_LAYERS = 1
BATCH_FIRST=True
NORM_FIRST=False
TEST_SIZE = 0.2
EARLY_STOPPING_STEP_SIZE = 5
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
BEST_MODEL_FILEPATH = 'best_model.pt'

DEVICE

'cuda'

In [6]:
def load_text(fp):
    with open(fp, mode='r') as infile:
        text = ''.join([row for row in infile])
    return text

# load text
text = load_text(book_filename)

# Get the number of characters in the text
text_len = len(text)
print(f'Number of characters in the text = {text_len}')
print("--------------\n")

# Print a sample from the text
idx = np.random.randint(low=0, high=text_len-SEQ_LEN)
print(f'Sample text:\n{text[idx:(idx + SEQ_LEN)]}')
print("--------------\n")

Number of characters in the text = 690654
--------------

Sample text:
 it
was all done very well. She had also to anticipate how her visit would
pass, the quiet tenor of their usual employments, the vexatious
interruptions of Mr. Collins, and the gaieties of their inter
--------------



In [7]:
# initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [8]:
# tokenize the whole book
tokens = tokenizer.tokenize(text)
# get the number of tokens
n_tokens = len(tokens)
print(f"There are {n_tokens} tokens in the text.")

# set vocabulary (set of unique tokens in the source text)
#vocab = sorted(list(set(tokens)))
# get the vocabulary size
vocab_size = len(tokenizer)
print(f"Vocabulary size = {vocab_size}")
print("--------------\n")

Token indices sequence length is longer than the specified maximum sequence length for this model (159555 > 512). Running this sequence through the model will result in indexing errors


There are 159555 tokens in the text.
Vocabulary size = 28996
--------------



In [9]:
# define the train and test data
test_len = int(n_tokens * TEST_SIZE)
train_data = tokens[:-test_len]
test_data = tokens[-test_len:]

# print the length of train and test data
print(f'Length of the train text = {len(train_data)}')
print(f'Length of the test text  = {len(test_data)}')

Length of the train text = 127644
Length of the test text  = 31911


In [10]:
# define the TextDataset as an instance of torch.nn.Dataset
class TextDataset(Dataset):
    def __init__(self, text, seq_len):
        super().__init__()
        self.text = text
        self.seq_len = seq_len

    def __len__(self):
        # define the number of samples in the dataset
        return len(self.text) - self.seq_len - 1

    def __getitem__(self, idx):
        # x = [c0, c1, ... cN]
        x = torch.tensor(
            [tokenizer.convert_tokens_to_ids(token) for token in self.text[idx:(idx + self.seq_len)]], 
            dtype=torch.long
        )
        # y = [c1, c1, ... c(N+1)]
        y = torch.tensor(
            [tokenizer.convert_tokens_to_ids(token) for token in self.text[(idx + 1):(idx + 1 + self.seq_len)]], 
            dtype=torch.long
        )
        return x, y

In [11]:
# initialize train and test datasets
train_dset = TextDataset(train_data, SEQ_LEN)
test_dset = TextDataset(test_data, SEQ_LEN)

# initialize train and test iterators
train_iterator = DataLoader(train_dset, BATCH_SIZE, shuffle=True, drop_last=True, pin_memory=True)
test_iterator = DataLoader(test_dset, BATCH_SIZE, shuffle=False, drop_last=False, pin_memory=True)

In [12]:
# define GPT network
class GPT(nn.Module):
    def __init__(self, vocab_size, embed_size, seq_len, num_layers, n_heads, batch_first, norm_first):
        super().__init__()
        self.seq_len = seq_len
        self.word_embedding = nn.Embedding(vocab_size, embed_size)
        self.positional_embedding = nn.Embedding(seq_len, embed_size)
        # define transformer encoder network
        encoder_layer = nn.TransformerEncoderLayer(embed_size, N_HEADS, batch_first=batch_first, norm_first=norm_first)
        self.encoder = torch.nn.TransformerEncoder(encoder_layer, num_layers)
        self.fc = nn.Linear(embed_size, vocab_size)
            
    def forward(self, x, is_causal=False):
        word_embed = self.word_embedding(x)
        pos_embed = self.positional_embedding(torch.arange(x.shape[1], dtype=torch.long).to(x.device))        
        x = word_embed + pos_embed
        if is_causal:
            mask = nn.Transformer.generate_square_subsequent_mask(self.seq_len).to(x.device)
        else: mask=None
        x = self.encoder(x, mask=mask, is_causal=is_causal)
        x = self.fc(x)
        return x

In [13]:
def compute_loss(logits, y):
    # get the logits dimensions for better readibility and understandibility
    B, S, C = logits.shape
    # compute loss
    loss = F.cross_entropy(logits.view(B*S, C), y.view(-1))
    return loss

def batch_loop(model, x, y, is_causal=False):
    # forward pass
    logits = model(x, is_causal=is_causal)
    # compute loss
    loss = compute_loss(logits, y)
    return loss

def train_epoch(model, iterator):
    # global variable
    global DEVICE
    # set model to training mode
    model.train()
    # initialize epoch loss
    loss_epoch = 0.0
    
    # iterate over batches
    for iter_idx, (x, y) in tqdm(enumerate(iterator), total=len(iterator), desc="Training Progress"):
        # carry tensors to available device
        x, y = x.to(DEVICE), y.to(DEVICE)
        # train batch and compute loss
        loss = batch_loop(model, x, y, is_causal=True)        
        # gradient update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # add batch loss to epoch loss sum
        loss_epoch += loss.item()
        
    # get the number of batches
    n_batch = len(iterator)
    # compute average loss
    loss_epoch_avg = loss_epoch / n_batch
    return loss_epoch_avg

def evaluate(model, iterator):
    # global variable
    global DEVICE
    # set model to evaluation mode
    model.eval()
    # initialize loss
    loss_sum = 0.0
    
    with torch.inference_mode():
        # iterate over batches
        for iter_idx, (x, y) in tqdm(enumerate(iterator), total=len(iterator), desc="Evaluation Progress"):
            # carry tensors to available device
            x, y = x.to(DEVICE), y.to(DEVICE)
            # train batch and compute loss
            loss = batch_loop(model, x, y)        
            # add batch loss to epoch loss sum
            loss_sum += loss.item()
        
    # get the number of batches
    n_batch = len(iterator)
    # compute average loss
    loss_avg = loss_sum / n_batch
    return loss_avg

In [14]:
def generate(model, text, max_length=100, temp=1.0):
    # global variables
    global DEVICE, tokenizer
    # set model to evaluation mode
    model.eval()
    input_token_ids = [tokenizer.convert_tokens_to_ids(word) for word in tokenizer.tokenize(text)]
    tokens = torch.tensor([input_token_ids], dtype=torch.long)
    generated_token_ids = input_token_ids
    with torch.inference_mode():
        for _ in range(max_length):
            logits = model(tokens)
            next_token_logits = logits[:,-1,:]
            next_token_id = torch.multinomial(next_token_logits.div(temp).exp(), num_samples=1)
            tokens = torch.cat((tokens, torch.tensor([[next_token_id]])), dim=1)
            generated_token_ids.append(next_token_id.item())
    
    return generated_token_ids

In [17]:
# initialize GPT model
model = GPT(
    vocab_size=vocab_size,
    embed_size=EMBED_SIZE,
    seq_len=SEQ_LEN,
    num_layers=NUM_LAYERS,
    n_heads=N_HEADS,
    batch_first=BATCH_FIRST,
    norm_first=NORM_FIRST,
).to(DEVICE)
model = torch.nn.DataParallel(model)

# initialize optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

best_test_loss = float('inf')
best_loss_streak = 0

# iterate over epochs
for epoch_idx in range(N_EPOCHS):
    train_loss = train_epoch(model, train_iterator)
    test_loss = evaluate(model, test_iterator)
    print(f"Epoch {epoch_idx:02} - Train Loss = {train_loss:.3f}\tTest Loss = {test_loss:.3f}")
    
    # save the current model as the best model if the current test loss is the least achieved 
    if test_loss < best_test_loss:
        # save the curent model's parameters as the best model parameters
        torch.save(model.state_dict(), BEST_MODEL_FILEPATH)
        # replace the best test loss with the current best loss
        best_test_loss = test_loss
        # reset early stoppping counter 
        best_streak_count = 0
        # display info
        print(f'The best model is found and saved. Current best test loss = {best_test_loss:.3f}')
        text = "A"
        generated_tokens = generate(model, text)
        print(tokenizer.decode(generated_tokens))
    else:
        # update early stoppping counter 
        best_streak_count += 1

    # check early stopping condition
    if best_streak_count == EARLY_STOPPING_STEP_SIZE:
        print(f"A better model has not been found in the last {EARLY_STOPPING_STEP_SIZE} epochs. Early stopping...")
        break
        
    print("--------------\n")

Training Progress: 100%|██████████| 995/995 [03:57<00:00,  4.19it/s]
Evaluation Progress: 100%|██████████| 248/248 [00:32<00:00,  7.71it/s]


Epoch 00 - Train Loss = 6.182	Test Loss = 5.154
The best model is found and saved. Current best test loss = 5.154


2024-04-11 18:35:08.567079: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-11 18:35:08.567190: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-11 18:35:08.684102: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


A tags place, Camera him, and considered Mr. they - - but aloud? Bennet ; and Mr. Aftercy of Mr. Collins? but What _zad will remained, and what to any one of equal evergreen was any thing Jane case ; by his friends, it is Mrs. But thaton, that. But her for them Miss Bingley! youju I dare too, Elizabeth, as speaking to make " " it in ladies, and that it had been at four. Goldman must
---------------

--------------



Training Progress: 100%|██████████| 995/995 [03:55<00:00,  4.23it/s]
Evaluation Progress: 100%|██████████| 248/248 [00:31<00:00,  7.90it/s]


Epoch 01 - Train Loss = 4.680	Test Loss = 4.768
The best model is found and saved. Current best test loss = 4.768
A Novel. Philips, and welcomed for the e time, was expected in silent away to his arrival of two on arguments, which the girls to make them are longnt, she in behalf by - - - on breathless himself to involve of each question, at least was agreed cannot general countenance. On on whose manner as an excellent are severe few days, " That is a manner from her into the honour of sinking to your surrounding! denying great deal for her. Gardiner ; shebred proved her
---------------

--------------



Training Progress: 100%|██████████| 995/995 [03:55<00:00,  4.23it/s]
Evaluation Progress: 100%|██████████| 248/248 [00:31<00:00,  7.86it/s]


Epoch 02 - Train Loss = 4.228	Test Loss = 4.703
The best model is found and saved. Current best test loss = 4.703
A younger excellent water of his cousin, however, it ; and that they were attempted to the endeavour to be, in this time to have been travelling that ever motive, whim impertinentmost stepr. She blushed. game from Londoneneds. In know and in all the right, and the garden, firm, it a good at her sister, and supertivat future feelings, when she had passed between the punctulate her. She was a clever - corner
---------------

--------------



Training Progress: 100%|██████████| 995/995 [03:55<00:00,  4.22it/s]
Evaluation Progress: 100%|██████████| 248/248 [00:31<00:00,  7.94it/s]


Epoch 03 - Train Loss = 3.975	Test Loss = 4.719
--------------



Training Progress: 100%|██████████| 995/995 [03:55<00:00,  4.22it/s]
Evaluation Progress: 100%|██████████| 248/248 [00:31<00:00,  7.90it/s]


Epoch 04 - Train Loss = 3.796	Test Loss = 4.768
--------------



Training Progress: 100%|██████████| 995/995 [03:56<00:00,  4.21it/s]
Evaluation Progress: 100%|██████████| 248/248 [00:31<00:00,  7.86it/s]


Epoch 05 - Train Loss = 3.658	Test Loss = 4.837
--------------



Training Progress: 100%|██████████| 995/995 [03:56<00:00,  4.21it/s]
Evaluation Progress: 100%|██████████| 248/248 [00:31<00:00,  7.95it/s]


Epoch 06 - Train Loss = 3.538	Test Loss = 4.927
--------------



Training Progress: 100%|██████████| 995/995 [03:54<00:00,  4.24it/s]
Evaluation Progress: 100%|██████████| 248/248 [00:31<00:00,  7.95it/s]

Epoch 07 - Train Loss = 3.419	Test Loss = 5.039
A better model has not been found in the last 5 epochs. Early stopping...



