In [1]:
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.nn as nn
import torch

from tqdm import tqdm
import numpy as np
import os

# set seeds for reproducability
seed = 123
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

In [2]:
# book collection
books = {
    'TheOdyssey': ("TheOdyssey_Homer.txt", "https://raw.githubusercontent.com/cetinsamet/data-science/main/data/nlp/TheOdyssey_Homer.txt"),
    'PrideAndPrejudice': ("PrideAndPrejudice_JaneAusten.txt", "https://raw.githubusercontent.com/cetinsamet/data-science/main/data/nlp/PrideAndPrejudice_JaneAusten.txt")
}

In [3]:
# choose the book
bookname = 'PrideAndPrejudice'

# get the book's filename and url 
book_filename, book_url = books[bookname] 

# download the book in local if it doesn't already exist
if not os.path.exists(book_filename):
    !wget $book_url

In [4]:
# define hyperparameters
n_epochs = 10
batch_size = 128
seq_len = 200
learning_rate = 1e-4
embed_size = 100
hidden_size = 100
num_layers = 1
device = 'cuda' if torch.cuda.is_available() else 'cpu'
best_model_filepath = 'best_model.pt'

In [5]:
def load_text(fp):
    with open(fp, mode='r') as infile:
        text = ''.join([row for row in infile])
    return text

# load text
text = load_text(book_filename)

# Get the number of characters in the text
text_len = len(text)
print(f'Number of characters in the text = {text_len}')
print("--------------\n")

# Print a sample from the text
idx = np.random.randint(low=0, high=text_len-seq_len)
print(f'Sample text:\n{text[idx:(idx + seq_len)]}')
print("--------------\n")

Number of characters in the text = 690654
--------------

Sample text:
 it
was all done very well. She had also to anticipate how her visit would
pass, the quiet tenor of their usual employments, the vexatious
interruptions of Mr. Collins, and the gaieties of their inter
--------------



In [6]:
# define vocabulary (set of unique characters in the source text)
vocab = sorted(list(set(text)))
# get the vocabulary size
vocab_size = len(vocab)

# define the mappings (as dictionary) of 
# 'a character (from the vocab) to a unique ID' ---> 'char2int'
# and
# 'a unique ID to a character (from the vocab)'  ---> 'int2char'
char2int = {c: idx for idx, c in enumerate(vocab)}
int2char = {idx: c for idx, c in enumerate(vocab)}

# define the encode() that encodes/converts a character to a unique ID 
def encode(character):
    global char2int
    return char2int[character]

# define the encode() that decodes/converts back a unique ID to a character
def decode(integer):
    global int2char
    return int2char[integer]

# print vocabulary as a single string 
print(f"Vocabulary: {''.join(vocab)}")
print(f"Vocabulary size = {vocab_size}")
print("--------------\n")

# print an example of how encode() and decode() operate
char = 'A'
print(f"char '{char}' | encode('{char}') = {encode(char)}")
print(f"char '{char}' | encode(decode('{char}')) = {decode(encode(char))}")
print("--------------\n")

Vocabulary: 
 !"&'()*,-.1234568:;?ABCDEFGHIJKLMNOPRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyzàê
Vocabulary size = 78
--------------

char 'A' | encode('A') = 22
char 'A' | encode(decode('A')) = A
--------------



In [7]:
# create a dataset for the data as an instance of torch.nn.Dataset class
class TextDataset(Dataset):
    def __init__(self, text, seq_len):
        super().__init__()
        self.text = text
        self.seq_len = seq_len

    def __len__(self):
        # define the number of samples in the dataset
        return len(self.text) - self.seq_len - 1

    def __getitem__(self, idx):
        # define the input and target samples
        x = torch.tensor([encode(char) for char in self.text[idx:(idx + self.seq_len)]], dtype=torch.long)
        y = torch.tensor([encode(char) for char in self.text[(idx + 1):(idx + 1 + self.seq_len)]], dtype=torch.long)
        return x, y

In [8]:
# define test set ratio
test_size = 0.2

# define the train and test data
train_data = text[:-(int(text_len * test_size))]
test_data = text[-(int(text_len * test_size)):]

# print the length of train and test data
print(f'Length of the train text = {len(train_data)}')
print(f'Length of the test text  = {len(test_data)}')

Length of the train text = 552524
Length of the test text  = 138130


In [9]:
# initialize train and test datasets
train_dset = TextDataset(train_data, seq_len)
test_dset = TextDataset(test_data, seq_len)

# initialize train and test iterators
train_iterator = DataLoader(train_dset, batch_size, shuffle=True, drop_last=True)
test_iterator = DataLoader(test_dset, batch_size, shuffle=False, drop_last=True)  # no need to shuffle the test dset

In [10]:
# define characterRNN module
class characterRNN(nn.Module):
    def __init__(self, input_size, embed_size, hidden_size, num_layers, output_size):
        super().__init__()  
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.embedding_layer = nn.Embedding(input_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden_state, cell_state):
        x = self.embedding_layer(x)
        x, (hidden_state, cell_state) = self.lstm(x, (hidden_state, cell_state))
        x = self.fc(x)
        return x, hidden_state, cell_state
    
    def init_hidden_and_cell(self, batch_size):
        # set the initial hidden and cell state tensors for LSTM training
        hidden_state = torch.zeros((self.num_layers, batch_size, hidden_size))
        cell_state = torch.zeros((self.num_layers, batch_size, hidden_size))
        return hidden_state, cell_state

In [11]:
def generate_text(model, start_char='\n', generated_text_len=200, temperature=0.8):
    
    global device
    
    # set model to train mode
    model.eval()
    
    # add start character to the beginning of the generated text sequence
    generated_text = start_char
    
    # operate under inference mode to avoid gradient computations and speed up the process
    with torch.inference_mode():
        # set the initial hidden and cell state tensors for LSTM training
        hidden_state, cell_state = model.init_hidden_and_cell(batch_size=1)
        hidden_state, cell_state = hidden_state.to(device), cell_state.to(device)
        
        # encode the start_char and reshape by adding a single batch dimensionality 
        x = torch.tensor(encode(start_char), dtype=torch.long).view(1, -1).to(device)
        
        # generate N characters, where N = generated_text_len
        for _ in range(generated_text_len):
            # predict the probabilities of the next character
            y, hidden_state, cell_state = model(x, hidden_state, cell_state)
            # select the next character from a multinomial distribution
            # (descreasing temperature value provides more flexibility for the next character selection and vice versa)
            x = torch.multinomial(y[0].div(temperature).exp(), num_samples=1)            
            # decode the next character (for human readibility) 
            next_char = decode(x.item())
            # add next character to the end of the generated text
            generated_text += next_char

    return generated_text

In [12]:
def evaluate(model, iterator):
    
    global device
    
    # set model to train mode
    model.eval()
    
    # operate under inference mode to avoid gradient computations and speed up the process
    with torch.inference_mode():
        # initialize loss
        loss_sum = 0.0
        
        # iterate over batches
        for x, y in tqdm(iterator):
            # carry tensors to selected device
            x, y = x.to(device), y.to(device)

            # set the initial hidden and cell state tensors for LSTM training
            hidden_state, cell_state = model.init_hidden_and_cell(batch_size)
            hidden_state, cell_state = hidden_state.to(device), cell_state.to(device)

            # forward pass (get logits, hidden and cell states for each characters in the sequence)
            logits, hidden_state, cell_state = model(x, hidden_state, cell_state)

            # compute loss (cross entropy) for each characters in the sequence (take average at the end)
            seq_len = len(x)
            loss = torch.mean(torch.stack([F.cross_entropy(logits[:,i,:], y[:,i]) for i in range(seq_len)]))
            
            # add batch loss to total loss sum
            loss_sum += loss.item()
    
    # compute avg loss
    loss_avg = loss_sum / len(iterator)
    return loss_avg

In [13]:
# initialize characterRNN model
model = characterRNN(
    input_size=vocab_size,
    embed_size=embed_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    output_size=vocab_size,
).to(device)

# initialize the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# init the best test loss as positive inifinity
best_test_loss = float('inf')

###### training loop ######
# iterate over epochs
for epoch_idx in range(1, n_epochs+1):
    # set model to train mode
    model.train()  
    
    loss_train = 0.0
    # iterate over batches        
    for iter_idx, (x, y) in tqdm(enumerate(train_iterator), total=len(train_iterator)):  
        # carry tensors to selected device
        x, y = x.to(device), y.to(device)
        
        # set the initial hidden and cell state tensors for LSTM training
        hidden_state, cell_state = model.init_hidden_and_cell(batch_size)
        hidden_state, cell_state = hidden_state.to(device), cell_state.to(device)
                
        # forward pass (get logits, hidden and cell states for each characters in the sequence)
        logits, hidden_state, cell_state = model(x, hidden_state, cell_state)
        
        # compute loss (cross entropy) for each characters in the sequence (take average at the end)
        loss = torch.mean(torch.stack([F.cross_entropy(logits[:,i,:], y[:,i]) for i in range(seq_len)]))
        
        # update gradients
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # add batch loss to total loss sum
        loss_train += loss.item()
        
    # at the end of each epoch, generate a text and display
    loss_train /= len(train_iterator)
    loss_test = evaluate(model, test_iterator)
    print(f"epoch {epoch_idx} || train loss = {loss_train:.3f}\ttest loss = {loss_test:.3f}\n")
    print(generate_text(model, generated_text_len=500))
    print("--------------\n")
    
    # save the current model as the best model if the current test loss is the least achieved 
    if loss_test < best_test_loss:
        # save the curent model's parameters as the best model parameters
        torch.save(model.state_dict(), best_model_filepath)
        # replace the best test loss with the current best loss
        best_test_loss = loss_test
        # display info
        print(f'The best model is found and saved. Current best test loss = {best_test_loss:.3f}')

100%|██████████| 4315/4315 [05:08<00:00, 13.99it/s]
100%|██████████| 1077/1077 [00:29<00:00, 37.13it/s]


epoch 1 || train loss = 2.191	test loss = 1.737


 day formering he fortur an septiaging the this genked at am my her parly to have pould was the could he vay it seepe veray of her itherd be inded, and, she to mucis mishise, as and a the not am that be hit the himsur presest to the frome the not been; turct the masted, and nation mut were of her earce, and with at it any wass insorqued at latt to porceref what be fist reated what companted the to the firged, ow not added to and Mr. Shan the
prast you for wha into must on of his self thean not D
--------------

The best model is found and saved. Current best test loss = 1.737


100%|██████████| 4315/4315 [05:06<00:00, 14.07it/s]
100%|██████████| 1077/1077 [00:28<00:00, 37.94it/s]


epoch 2 || train loss = 1.566	test loss = 1.490


the much and she such adsure by the
comforts.

Elizabets and or her felent with their considel the propite a sing happiness with proonaily sorment; but he will now him difficer's amaughter, there from of your beliest women of such
a drest Mr. Darcy the was such have their meanty to the some intlemed by she perfected Longhang, it was in a protage his ofling
by I not halment tonear of unders in the lotter's enought be had minder ffor the fatter reltance? I am intrommable been grated to the fupt, t
--------------

The best model is found and saved. Current best test loss = 1.490


100%|██████████| 4315/4315 [05:07<00:00, 14.04it/s]
100%|██████████| 1077/1077 [00:28<00:00, 37.57it/s]


epoch 3 || train loss = 1.403	test loss = 1.402


"To leave they were bring the should not be so litter to in it would be on to
acquaintancly that be
as regave, had allo--be. On that you should be rangerly were alone were excesfle. She had not concess and with of the smost were surporing but enough expressing might will be
are nothing of
those it was not feelings. "Wickham, For one that elacial sominned to his few to me in him of your against had near do her uspought I was not courre, of so the dinedness, and particers with being much and happy
--------------

The best model is found and saved. Current best test loss = 1.402


100%|██████████| 4315/4315 [05:08<00:00, 13.99it/s]
100%|██████████| 1077/1077 [00:29<00:00, 37.08it/s]


epoch 4 || train loss = 1.329	test loss = 1.360


     in his reformer; and if his considered; and she might have never being another proached by
     wo do you, it every such all Mr. Bingley was not be hart and
soon to just attending to Miss Darcy who had never neect misair, he saming a ratements, carming a wish in his his said, that she certable for the besher before they were cacking to say not have not how freh I they was expression of all the farteld them had pleasure at I gourey; there were certain that Mr. Darcy and Miss Bingley were som
--------------

The best model is found and saved. Current best test loss = 1.360


100%|██████████| 4315/4315 [05:07<00:00, 14.02it/s]
100%|██████████| 1077/1077 [00:28<00:00, 37.67it/s]


epoch 5 || train loss = 1.286	test loss = 1.334



to _he_ is as her wish to her carriate of her brother, and had not believe might and easily such must propsons; but I should have been seen by
the passed her us
persuaded the windle; but now had about from the parton, and with the take your mother, "I shall
not mach that they Longbourn than
be rememether so father to nothing to think it was each attentions of sister, that
I have such a mistach of its now quietiess. Lown Elizabeth, "they had been so mention with a little being so, he should have
--------------

The best model is found and saved. Current best test loss = 1.334


100%|██████████| 4315/4315 [05:06<00:00, 14.06it/s]
100%|██████████| 1077/1077 [00:28<00:00, 37.25it/s]


epoch 6 || train loss = 1.256	test loss = 1.317


was
me to all five must long, and, he added in Mr. Darcy, and her
so consent direful with his manner of the smile of her incontively; but I have been all to Longbourn," said
Lady Catherine's attanner afterwards to be afterwisely conceities on the presentation which Mr. Wickham had began at the pleased Mr. Darcy with a moment. I want from so many considering her farther to want of the wisely of powers to revery. He was present and bridged to make his soon to be at likely would no more."

Mr. Gard
--------------

The best model is found and saved. Current best test loss = 1.317


100%|██████████| 4315/4315 [05:09<00:00, 13.93it/s]
100%|██████████| 1077/1077 [00:28<00:00, 37.47it/s]


epoch 7 || train loss = 1.233	test loss = 1.306



and her resent that the subject of the whole procient, to go me, you will any often been undertoning to the charge of a lought, and particularly to speaks, they will not
suppose of she brother, and so beauty."

"My dear very short ender the last your confidial, he was not subseding them, know it over see. Mr. Collins had not?"

"We assured Elizabeth, "what answer the boths."

"No, last that I have in the kinding with them all the wishes of herself, replied speated. Sir Denny Mrs. Bennet have ju
--------------

The best model is found and saved. Current best test loss = 1.306


100%|██████████| 4315/4315 [05:07<00:00, 14.03it/s]
100%|██████████| 1077/1077 [00:28<00:00, 37.30it/s]


epoch 8 || train loss = 1.216	test loss = 1.298


wat
morning it. Her mother sent played to her, or a fancy of the first entuence which had nothing to be her agreeable for herself."

Elizabeth almost silent that contriety to respect, and from the country, had so firm, he had been glad to met the country stair propertly with her own on
event to deter of
some good
man
are nothing which was inhabour to make of with a consisting him, thought the door.

Mrs. Bennet,
the carriage had lively but the late on one on the room, when I
have love at Netherf
--------------

The best model is found and saved. Current best test loss = 1.298


100%|██████████| 4315/4315 [05:07<00:00, 14.04it/s]
100%|██████████| 1077/1077 [00:28<00:00, 37.73it/s]


epoch 9 || train loss = 1.202	test loss = 1.293


      for your facely bestone, I shall all the subject," said
Elizabeth, and the laws proves to answer."

"Perhaps would be as whose had a call of prover four sister in the unfier for the charms were the Parsonage, and all bestow, however, indeed brought or asking
much as she replied?"

"I do below who had then strike enjoying the good was next perios which the particulars the near mewn, there is a disportagly
she could not at least afterwards or accomplished with a rates to glad to her it. The 
--------------

The best model is found and saved. Current best test loss = 1.293


100%|██████████| 4315/4315 [05:08<00:00, 13.98it/s]
100%|██████████| 1077/1077 [00:29<00:00, 37.03it/s]


epoch 10 || train loss = 1.190	test loss = 1.289


     persuating Mr. Collins was a
character with a pride again that is concerned in the rest or a comparable regarded by pleasure might
have a laugh as pleasant written at restory, she may have been going to the first not to lear how your land; and, and subject, there concerned to Elizabeth himself that you into do not the so look of the difference,"
said Mrs.
Forster. The lost of her partly intention of many great could past as his very then worth, were made unavoid of the little partiality,
an
--------------

The best model is found and saved. Current best test loss = 1.289


In [17]:
### loading the best model

# reinitialize the model
best_model = characterRNN(
    input_size=vocab_size,
    embed_size=embed_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    output_size=vocab_size,
)
# load best model's parameters
best_model.load_state_dict(torch.load(best_model_filepath))
best_model = best_model.to(device)

In [18]:
### generating text with different temperature values 
### (descreasing temperature value provides more flexibility for the next character selection (tends to generate low-quality but more creative texts) and vice versa)

# define possible temperature values
temperatures = [1.0, 0.8, 0.5]

# iterate over temperature values
for temp in temperatures:
    # generate text
    generated_test = generate_text(best_model, generated_text_len=1000, temperature=temp)
    # display
    print(f'Temperature = {temp}')
    print(generated_test)
    print("--------------\n\n")

Temperature = 1.0

only, which as." Miss Bingley. Her Lady Catherine
and his niece. There is sister that I have all that we may look you, necessanot, she saw. Thought. It told it. Whilley next of summe, she took happiness that she were it was now again as much was being enterledness Sit astonishat
oming
trough probably to arved that he would not you case him tilling as to desirit example
as,
I wish a desire does pleasure in termities of the estates interest of glander way, convince.--What she deare, with for any deady often; and if could not going much following knew to be ideedial to have knows what I very from thinking one of this favoured to his regard to believe and perfect by elegance had
been one so approaching to see must only are
subject, I _mind_ exached on the
evowly."

"She is not satisf own many in the formpaking did so misuralice to play to her disposition with the
house, as her uncle are, all the whole interest
other give, by nead very attemon her ainder the same slighted