In [2]:
import os
import urllib.request

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [3]:
# seed every random number generator used in this notebook with the same value
SEED = 123
for _seed_fn in (
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_fn(SEED)

# request deterministic cuDNN behaviour for reproducibility
torch.backends.cudnn.deterministic = True

In [4]:
# book collection: short book title -> URL of the book's plain-text file
BOOKS = dict((
    ('TheOdyssey', "https://raw.githubusercontent.com/cetinsamet/data-science/main/data/book/TheOdyssey_Homer.txt"),
    ('PrideAndPrejudice', "https://raw.githubusercontent.com/cetinsamet/data-science/main/data/book/PrideAndPrejudice_JaneAusten.txt"),
    ('AJourneyToTheCentreOfTheEarth', "https://raw.githubusercontent.com/cetinsamet/data-science/main/data/book/JulesVerne_AJourneyToTheCentreOfTheEarth.txt"),
))
BOOKS

{'TheOdyssey': 'https://raw.githubusercontent.com/cetinsamet/data-science/main/data/book/TheOdyssey_Homer.txt',
 'PrideAndPrejudice': 'https://raw.githubusercontent.com/cetinsamet/data-science/main/data/book/PrideAndPrejudice_JaneAusten.txt',
 'AJourneyToTheCentreOfTheEarth': 'https://raw.githubusercontent.com/cetinsamet/data-science/main/data/book/JulesVerne_AJourneyToTheCentreOfTheEarth.txt'}

In [5]:
# choose a book
BOOK = 'PrideAndPrejudice'

# get the book's url
book_url = BOOKS[BOOK]
# extract the book's filename from the url
book_filename = os.path.basename(book_url)

# download the book locally if it doesn't already exist
if not os.path.exists(book_filename):
    # Download under a temporary name and rename only after the transfer has
    # finished, so an interrupted download never leaves a truncated file that
    # the existence check above would accept on the next run.
    partial_filename = book_filename + '.part'
    try:
        # urlretrieve raises on network/HTTP errors (unlike a `!wget` shell
        # escape, whose failure is invisible to a surrounding try/except)
        urllib.request.urlretrieve(book_url, partial_filename)
        os.replace(partial_filename, book_filename)
        print("File is succesfully downloaded.")
    except Exception as e:
        # best effort: report the problem and clean up the partial file
        if os.path.exists(partial_filename):
            os.remove(partial_filename)
        print(f"Could not download the book from {book_url}")
        print(e)
else:
    print("File has already been downloaded.")

--2024-04-10 13:08:01--  https://raw.githubusercontent.com/cetinsamet/data-science/main/data/book/PrideAndPrejudice_JaneAusten.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 703907 (687K) [text/plain]
Saving to: 'PrideAndPrejudice_JaneAusten.txt'


2024-04-10 13:08:01 (21.1 MB/s) - 'PrideAndPrejudice_JaneAusten.txt' saved [703907/703907]

File is succesfully downloaded.


In [6]:
# ---- hyperparameters ----

# model architecture (passed to characterRNN)
EMBED_SIZE = 100
HIDDEN_SIZE = 100
NUM_LAYERS = 1

# data: window length of each sample and fraction of the text held out for testing
SEQ_LEN = 200
TEST_SIZE = 0.2

# optimisation
N_EPOCHS = 30
BATCH_SIZE = 128
LEARNING_RATE = 1e-4
# number of consecutive epochs without a better test loss before training stops
EARLY_STOPPING_STEP_SIZE = 5

# where the parameters of the best model are saved
BEST_MODEL_FILEPATH = 'best_model.pt'

# run on the GPU when one is available, otherwise on the CPU
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

DEVICE

'cuda'

In [7]:
def load_text(fp, encoding='utf-8'):
    """Read a whole text file and return its content as one string.

    Args:
        fp: path of the text file to read.
        encoding: text encoding used to decode the file. It is passed
            explicitly (UTF-8 by default) so that non-ASCII characters are
            decoded the same way on every platform instead of depending on
            the locale's default encoding.

    Returns:
        The full content of the file as a single `str`.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid in `encoding`.
    """
    with open(fp, mode='r', encoding=encoding) as infile:
        return infile.read()

# read the chosen book into memory
text = load_text(book_filename)

# report how many characters the text contains
text_len = len(text)
print(f'Number of characters in the text = {text_len}')
print("--------------\n")

# show a randomly positioned excerpt of SEQ_LEN characters
idx = np.random.randint(low=0, high=text_len - SEQ_LEN)
sample_text = text[idx:idx + SEQ_LEN]
print(f'Sample text:\n{sample_text}')
print("--------------\n")

Number of characters in the text = 690654
--------------

Sample text:
 it
was all done very well. She had also to anticipate how her visit would
pass, the quiet tenor of their usual employments, the vexatious
interruptions of Mr. Collins, and the gaieties of their inter
--------------



In [8]:
# the vocabulary is every distinct character of the source text, in sorted order
vocab = sorted(set(text))
# number of distinct characters
vocab_size = len(vocab)

# show the vocabulary as one string, followed by its size
vocab_as_string = ''.join(vocab)
print(f"Vocabulary: {vocab_as_string}")
print(f"Vocabulary size = {vocab_size}")
print("--------------\n")

Vocabulary: 
 !"&'()*,-.1234568:;?ABCDEFGHIJKLMNOPRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyzàê
Vocabulary size = 78
--------------



In [9]:
# two lookup tables built from the position of each character in `vocab`:
#   char2int: character -> unique integer ID
#   int2char: unique integer ID -> character
char2int = dict(zip(vocab, range(len(vocab))))
int2char = dict(enumerate(vocab))

# encode(): convert a character to its unique integer ID
def encode(character):
    """Return the ID stored for `character` in the `char2int` table.

    Raises:
        KeyError: if `character` is not a key of `char2int`.
    """
    char_id = char2int[character]
    return char_id

# decode(): convert a unique integer ID back to its character
def decode(integer):
    """Return the character stored for `integer` in the `int2char` table.

    Raises:
        KeyError: if `integer` is not a key of `int2char`.
    """
    character = int2char[integer]
    return character

# demonstrate that decode() undoes encode() on a sample character
char = 'A'
char_id = encode(char)
print(f"char '{char}' | encode('{char}') = {char_id}")
print(f"char '{char}' | decode(encode('{char}')) = {decode(char_id)}")
print("--------------\n")

char 'A' | encode('A') = 22
char 'A' | decode(encode('A')) = A
--------------



In [10]:
# define the TextDataset as a subclass of torch.utils.data.Dataset
class TextDataset(Dataset):
    """Character-level dataset of (input, target) windows over a text.

    Sample `idx` is the pair `(x, y)` where `x` holds the encoded characters
    `text[idx : idx + seq_len]` and `y` holds the same window shifted one
    character to the right, so `y[t]` is the character that follows `x[t]`.

    Args:
        text: the source string.
        seq_len: number of characters in every `x` and `y`.
        encode_fn: optional callable mapping a single character to an integer
            ID. When omitted, the notebook-level `encode()` is used.
    """

    def __init__(self, text, seq_len, encode_fn=None):
        super().__init__()
        self.text = text
        self.seq_len = seq_len
        if encode_fn is None:
            encode_fn = encode
        # encode the whole text once; __getitem__ then only slices this tensor
        # instead of re-encoding seq_len characters in Python on every access
        self.encoded_text = torch.tensor(
            [encode_fn(char) for char in text],
            dtype=torch.long
        )

    def __len__(self):
        # A window starting at idx needs characters idx .. idx + seq_len
        # (inclusive, because of the shifted target), so the valid start
        # positions are 0 .. len(text) - seq_len - 1. Clamp at zero so a text
        # that is too short yields an empty dataset instead of a negative length.
        return max(0, len(self.text) - self.seq_len)

    def __getitem__(self, idx):
        """Return the `(x, y)` pair of 1-D `torch.long` tensors for sample `idx`.

        Both tensors are slices (views) of the pre-encoded text.

        Raises:
            IndexError: if `idx` is outside the dataset.
        """
        n_samples = len(self)
        if idx < 0:
            # support negative indices like a regular sequence
            idx += n_samples
        if not 0 <= idx < n_samples:
            raise IndexError(f"sample index out of range for a dataset of {n_samples} samples")

        # x = [c0, c1, ... c(N-1)]
        x = self.encoded_text[idx:(idx + self.seq_len)]
        # y = [c1, c2, ... cN]
        y = self.encoded_text[(idx + 1):(idx + 1 + self.seq_len)]
        return x, y

In [11]:
# define the train and test data: the last TEST_SIZE fraction of the text is
# held out for testing, everything before it is used for training.
# An explicit split index is used instead of negative slicing because
# text[:-0] is empty and text[-0:] is the whole text, which would silently
# invert the split whenever the test part rounds down to zero characters.
n_test_chars = int(text_len * TEST_SIZE)
split_idx = text_len - n_test_chars
train_data = text[:split_idx]
test_data = text[split_idx:]

# print the length of train and test data
print(f'Length of the train text = {len(train_data)}')
print(f'Length of the test text  = {len(test_data)}')

Length of the train text = 552524
Length of the test text  = 138130


In [12]:
# wrap the train and test texts into datasets of SEQ_LEN-character windows
train_dset = TextDataset(text=train_data, seq_len=SEQ_LEN)
test_dset = TextDataset(text=test_data, seq_len=SEQ_LEN)

# batch the datasets; only the training batches are shuffled, and the
# incomplete last training batch is dropped
train_iterator = DataLoader(
    train_dset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=True,
    pin_memory=True,
)
test_iterator = DataLoader(
    test_dset,
    batch_size=BATCH_SIZE,
    shuffle=False,  # no need to shuffle the test dset
    pin_memory=True,
)

In [13]:
# define characterRNN module
class characterRNN(nn.Module):
    """Embedding -> LSTM -> linear layer model over sequences of character IDs.

    Args:
        input_size: number of distinct IDs the embedding layer accepts.
        embed_size: size of each embedding vector.
        hidden_size: size of the LSTM hidden and cell states.
        num_layers: number of stacked LSTM layers.
        output_size: number of values produced per sequence position.
    """

    def __init__(self, input_size, embed_size, hidden_size, num_layers, output_size):
        super().__init__()
        # kept so that init_hidden_and_cell() can build correctly shaped states
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding_layer = nn.Embedding(num_embeddings=input_size, embedding_dim=embed_size)
        # batch_first=True: the LSTM works on (batch, sequence, feature) tensors
        self.lstm = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x, hidden_state, cell_state):
        """Run the model on a batch of ID sequences.

        Args:
            x: integer tensor of character IDs, batch dimension first.
            hidden_state: LSTM hidden state to start from.
            cell_state: LSTM cell state to start from.

        Returns:
            A tuple `(output, hidden_state, cell_state)`: the linear layer's
            output for every sequence position and the LSTM states after the
            last position.
        """
        embedded = self.embedding_layer(x)
        lstm_out, states = self.lstm(embedded, (hidden_state, cell_state))
        next_hidden, next_cell = states
        output = self.fc(lstm_out)
        return output, next_hidden, next_cell

    def init_hidden_and_cell(self, batch_size):
        """Return two separate all-zero float tensors `(hidden_state, cell_state)`,
        each of shape `(num_layers, batch_size, hidden_size)`."""
        state_shape = (self.num_layers, batch_size, self.hidden_size)
        return tuple(torch.zeros(state_shape, dtype=torch.float) for _ in range(2))

In [14]:
def generate_text(model, start_char='\n', generated_text_len=250, temperature=1.0):
    """Sample text from the model one character at a time.

    Args:
        model: a characterRNN-like module providing `init_hidden_and_cell()`
            and returning `(logits, hidden_state, cell_state)` when called.
        start_char: non-empty string the generation starts from. Usually a
            single character; a longer string is fed to the model as a prompt.
            Every character must be known to `encode()`.
        generated_text_len: number of characters to generate after `start_char`.
        temperature: positive scaling factor applied to the logits before
            sampling (decreasing it tends to generate more boring and
            predictable texts while increasing it brings more creativity and
            possibly lower quality texts).

    Returns:
        `start_char` followed by the `generated_text_len` sampled characters.

    Raises:
        ValueError: if `start_char` is empty or `temperature` is not positive.

    Note:
        The model is switched to eval mode and left in it.
    """
    if len(start_char) == 0:
        raise ValueError("start_char must contain at least one character")
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")

    # set model to eval mode
    model.eval()
    # run on whatever device the model's parameters live on
    device = next(model.parameters()).device

    # collect the pieces of the generated text, starting with the start character(s)
    generated_chars = [start_char]

    # operate under inference mode to avoid gradient computations and speed up the process
    with torch.inference_mode():
        # set the initial hidden and cell state tensors of the LSTM
        hidden_state, cell_state = model.init_hidden_and_cell(batch_size=1)
        hidden_state, cell_state = hidden_state.to(device), cell_state.to(device)

        # encode the start character(s) and add a single batch dimension -> shape (1, S)
        x = torch.tensor(
            [encode(char) for char in start_char],
            dtype=torch.long,
            device=device,
        ).view(1, -1)

        # generate N characters, where N = generated_text_len
        for _ in range(generated_text_len):
            # predict the logits of the next character
            logits, hidden_state, cell_state = model(x, hidden_state, cell_state)
            # turn the logits of the last position into probabilities; softmax
            # is used instead of a bare exp() because exp() of large
            # temperature-scaled logits can overflow to inf, which
            # torch.multinomial rejects
            probs = F.softmax(logits[0, -1] / temperature, dim=-1)
            # sample the next character ID and feed it back as a (1, 1) input
            x = torch.multinomial(probs, num_samples=1).view(1, 1)
            # decode the sampled ID (for human readability) and store it
            generated_chars.append(decode(x.item()))

    return ''.join(generated_chars)

In [15]:
def evaluate(model, iterator):
    """Compute the average cross-entropy loss of `model` over `iterator`.

    Args:
        model: a characterRNN-like module providing `init_hidden_and_cell()`
            and returning `(logits, hidden_state, cell_state)` when called.
        iterator: iterable of `(x, y)` batches of input and target IDs.

    Returns:
        The loss averaged over every target character seen. Averaging per
        character (rather than averaging the per-batch means) keeps a smaller
        final batch from being over-weighted.

    Raises:
        ValueError: if `iterator` yields no data.

    Note:
        The model is switched to eval mode and left in it.
    """
    # set model to eval mode
    model.eval()
    # run on whatever device the model's parameters live on
    device = next(model.parameters()).device

    # running sum of the losses and number of target characters they cover
    loss_sum = 0.0
    n_targets = 0

    # operate under inference mode to avoid gradient computations and speed up the process
    with torch.inference_mode():
        # iterate over batches
        for x, y in tqdm(iterator):
            # carry tensors to the model's device
            x, y = x.to(device), y.to(device)

            # set the initial hidden and cell state tensors of the LSTM
            batch_size = len(x)  # the last batch may be smaller than the others
            hidden_state, cell_state = model.init_hidden_and_cell(batch_size)
            hidden_state, cell_state = hidden_state.to(device), cell_state.to(device)

            # forward pass (get logits for each character in the sequence)
            logits, hidden_state, cell_state = model(x, hidden_state, cell_state)

            # sum (not mean) the loss so batches of different sizes combine correctly
            B, S, C = logits.shape  # B: batch size, S: seq length, C: number of output classes
            loss = F.cross_entropy(logits.reshape(B * S, C), y.reshape(-1), reduction='sum')

            loss_sum += loss.item()
            n_targets += B * S

    if n_targets == 0:
        raise ValueError("cannot evaluate on an empty iterator")

    # compute avg loss per target character
    loss_avg = loss_sum / n_targets
    return loss_avg

In [16]:
# initialize characterRNN model
model = characterRNN(
    input_size=vocab_size,
    embed_size=EMBED_SIZE,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    output_size=vocab_size,
).to(DEVICE)

# initialize the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# init the best test loss as positive infinity
best_test_loss = float('inf')

# number of consecutive epochs without a new best test loss (for early stopping)
best_streak_count = 0

###### training loop ######
# NOTE: the test loss drives both model selection and early stopping here
# iterate over epochs
for epoch_idx in range(1, N_EPOCHS+1):
    # set model to train mode (evaluate() and generate_text() switch it to eval mode)
    model.train()
    # initialize train loss
    loss_train = 0.0

    # iterate over batches
    for x, y in tqdm(train_iterator):
        # carry tensors to available device
        x, y = x.to(DEVICE), y.to(DEVICE)

        # set the initial hidden and cell state tensors for LSTM training;
        # size them from the actual batch (as evaluate() does) so the loop does
        # not depend on every batch having exactly BATCH_SIZE samples
        hidden_state, cell_state = model.init_hidden_and_cell(len(x))
        hidden_state, cell_state = hidden_state.to(DEVICE), cell_state.to(DEVICE)

        # forward pass (get logits, hidden and cell states for each character in the sequence)
        logits, hidden_state, cell_state = model(x, hidden_state, cell_state)

        # compute loss
        B, S, C = logits.shape  # B: batch size, S: seq length, C: number of output classes
        loss = F.cross_entropy(logits.view(B*S, C), y.view(-1))

        # clear old gradients, backpropagate and update the parameters
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # add batch loss to total loss sum
        loss_train += loss.item()

    # print epoch logs
    loss_train /= len(train_iterator)
    loss_test = evaluate(model, test_iterator)
    print(f"epoch {epoch_idx:02} || train loss = {loss_train:.3f}\ttest loss = {loss_test:.3f}", end='\n\n')

    # save the current model as the best model if the current test loss is the least achieved
    if loss_test < best_test_loss:
        # save the current model's parameters as the best model parameters
        torch.save(model.state_dict(), BEST_MODEL_FILEPATH)
        # replace the best test loss with the current test loss
        best_test_loss = loss_test
        # reset early stopping counter
        best_streak_count = 0
        # display info
        print(f'The best model is found and saved. Current best test loss = {best_test_loss:.3f}')
        # generate a text and display
        print("--------------")
        print(f"GENERATED TEXT:{generate_text(model, generated_text_len=250)}")
    else:
        # update early stopping counter
        best_streak_count += 1

    # check early stopping condition
    if best_streak_count >= EARLY_STOPPING_STEP_SIZE:
        print(f"A better model has not been found in the last {EARLY_STOPPING_STEP_SIZE} epochs. Early stopping...")
        break

    print("--------------\n")

100%|██████████| 4315/4315 [02:19<00:00, 30.82it/s]
100%|██████████| 1078/1078 [00:26<00:00, 40.83it/s]


epoch 01 || train loss = 2.191	test loss = 1.731

The best model is found and saved. Current best test loss = 1.731
--------------
GENERATED TEXT:
 dfy formred Pece frotur enterpxiafiage in this genk"
 a bying6 her parly terhard as friwe for courcut povay it seepe veray ofor uncosted be inded, a consusatbent is kinxidention, she the not am that be hit the
own the mishions," now poicweld not bee
--------------



100%|██████████| 4315/4315 [02:20<00:00, 30.79it/s]
100%|██████████| 1078/1078 [00:26<00:00, 40.85it/s]


epoch 02 || train loss = 1.566	test loss = 1.482

The best model is found and saved. Current best test loss = 1.482
--------------
GENERATED TEXT:

"Caus Mr. My was her feltation muty. My sade awe rading certon.
And
aboy Eliza'Ds. Haptly blatt poces, and awast before his prelits. I shanded the tole we could sowr.
Maribet, and, that you as tendist you for what amosighted porton; and becaalfied D
--------------



100%|██████████| 4315/4315 [02:19<00:00, 30.83it/s]
100%|██████████| 1078/1078 [00:26<00:00, 40.80it/s]


epoch 03 || train loss = 1.403	test loss = 1.393

The best model is found and saved. Current best test loss = 1.393
--------------
GENERATED TEXT:
though she implaisue?" repried it that be reaidely in is no of Mr. Bingley rition yis _make is the place to think you advances coplie mights. Colfelless their even in his time Mrs. Collins, there frieved young aways he mefut Miss Bingley means persua
--------------



100%|██████████| 4315/4315 [02:19<00:00, 31.03it/s]
100%|██████████| 1078/1078 [00:26<00:00, 40.96it/s]


epoch 04 || train loss = 1.329	test loss = 1.350

The best model is found and saved. Current best test loss = 1.350
--------------
GENERATED TEXT:

"After divession of something out sore. Mr. Darcy little they when Longbongestle found to sat me high illy," said Elizabeeth's nature out. Do yen had been disennetish herself, and ffather better's pleased? I have
no rirlieve Netherfied to the might 
--------------



100%|██████████| 4315/4315 [02:19<00:00, 30.92it/s]
100%|██████████| 1078/1078 [00:26<00:00, 40.57it/s]


epoch 05 || train loss = 1.286	test loss = 1.325

The best model is found and saved. Current best test loss = 1.325
--------------
GENERATED TEXT:
"To left the gentlence of the sured on
which
Sir William in it. Every toon my
amingation lift. The resolving, while he was
."
"I cannovect of the subject;. were alone, with your leave to my abse in the engagement. He was
of seeliss on it about hanced
--------------



100%|██████████| 4315/4315 [02:19<00:00, 30.87it/s]
100%|██████████| 1078/1078 [00:26<00:00, 41.12it/s]


epoch 06 || train loss = 1.256	test loss = 1.308

The best model is found and saved. Current best test loss = 1.308
--------------
GENERATED TEXT:
expectal a moment, by it
arresting any that Wickham propufe to reeboring impareshonouration as young munced no ill visit to know however, had settl; but
her wask turned to give her, carfually dounty.

Ane not be visithy hoods woman in quittain she se
--------------



100%|██████████| 4315/4315 [02:19<00:00, 30.84it/s]
100%|██████████| 1078/1078 [00:26<00:00, 41.16it/s]


epoch 07 || train loss = 1.233	test loss = 1.296

The best model is found and saved. Current best test loss = 1.296
--------------
GENERATED TEXT:
     in his reflected a said, and a sort, was sister might have never been going him was been
manner was
entaim?
And when she
business of beingsfordshire was an exaction of the thought perfectly provacomes uneasy, he verised Kent for your ratement wh
--------------



100%|██████████| 4315/4315 [02:19<00:00, 30.82it/s]
100%|██████████| 1078/1078 [00:26<00:00, 40.64it/s]


epoch 08 || train loss = 1.216	test loss = 1.288

The best model is found and saved. Current best test loss = 1.288
--------------
GENERATED TEXT:
     any face of
his faithship's daughter, I do not know, before I am supprriently
too humbles."
 

       *
  *    *         wou entreagery tonally
apolunt, delighted.

Buts,
in the good yest treated, andactio, a laious to judge in the head to Mary 
--------------



100%|██████████| 4315/4315 [02:20<00:00, 30.78it/s]
100%|██████████| 1078/1078 [00:26<00:00, 40.99it/s]


epoch 09 || train loss = 1.202	test loss = 1.282

The best model is found and saved. Current best test loss = 1.282
--------------
GENERATED TEXT:
Denny Jane was
     mother attended to
a much connection. She does, walk it. To vexations. I shall
be, propsed by under you were not head me. In more than her uswelly annot his father by thew hardless by the care, Jane as idea in welant sharedly seen
--------------



100%|██████████| 4315/4315 [02:19<00:00, 30.98it/s]
100%|██████████| 1078/1078 [00:26<00:00, 41.16it/s]


epoch 10 || train loss = 1.190	test loss = 1.278

The best model is found and saved. Current best test loss = 1.278
--------------
GENERATED TEXT:
     princing schies were daughters was a short enjoyment, and relate no
mealy
between they mode previded sisters. His
opening.

Mrs. Long had been more by the
present came his hand it
is describlibareshippenectly at an beharly in from his discrodure
--------------



100%|██████████| 4315/4315 [02:19<00:00, 31.02it/s]
100%|██████████| 1078/1078 [00:26<00:00, 40.43it/s]


epoch 11 || train loss = 1.180	test loss = 1.275

The best model is found and saved. Current best test loss = 1.275
--------------
GENERATED TEXT:
was
much
     flent which I conselievine that the tifferuded her,
she had been eefficient woman a longed, with gratumes them;
than her hand Mr. Bingley within
the pausitness
of conviction in London, indely.

"You know him without thising them that th
--------------



100%|██████████| 4315/4315 [02:22<00:00, 30.33it/s]
100%|██████████| 1078/1078 [00:26<00:00, 40.41it/s]


epoch 12 || train loss = 1.171	test loss = 1.273

The best model is found and saved. Current best test loss = 1.273
--------------
GENERATED TEXT:
DOarcy chose presertingly was nobody from a content, Mr. Darcy with a moment. Lady Catherine, and yessman is musif Prass of seconned to be-belicaking his sideful place now, the very farthew on every howlege, for the time
of Miss Lucas," conon the mos
--------------



100%|██████████| 4315/4315 [02:20<00:00, 30.68it/s]
100%|██████████| 1078/1078 [00:26<00:00, 41.20it/s]


epoch 13 || train loss = 1.164	test loss = 1.271

The best model is found and saved. Current best test loss = 1.271
--------------
GENERATED TEXT:

and her resentment, to have Macy was dinner had been, and give left every and often, and at little by yourself, who, and was desaid Wickham!--anstance was to making her sister remains!"

"She is still beauty."

"Could be hapbouth ove, really ask, an
--------------



100%|██████████| 4315/4315 [02:19<00:00, 31.00it/s]
100%|██████████| 1078/1078 [00:26<00:00, 40.56it/s]


epoch 14 || train loss = 1.157	test loss = 1.271

The best model is found and saved. Current best test loss = 1.271
--------------
GENERATED TEXT:
her guility, how had rather disally
     but even herself, he is to be breakfast carried of the surcits of
the young lady and nor that last taken, his incliable; it gave her an absence observed young.--But were as Miss Lucas and Mr. Bennet, "Such as

--------------



100%|██████████| 4315/4315 [02:18<00:00, 31.05it/s]
100%|██████████| 1078/1078 [00:26<00:00, 40.79it/s]


epoch 15 || train loss = 1.151	test loss = 1.270

The best model is found and saved. Current best test loss = 1.270
--------------
GENERATED TEXT:
wat
morning it!"

"And he asked yourself, and, or a friend.--He be was entering whether they bring in a home into ourselves from me appear Jane, my concern of procied, at at allowable to the care of his encoarmonts and herself that _I_ have the slace
--------------



100%|██████████| 4315/4315 [02:19<00:00, 30.93it/s]
100%|██████████| 1078/1078 [00:26<00:00, 40.53it/s]


epoch 16 || train loss = 1.145	test loss = 1.271

--------------



100%|██████████| 4315/4315 [02:19<00:00, 30.91it/s]
100%|██████████| 1078/1078 [00:26<00:00, 40.99it/s]


epoch 17 || train loss = 1.140	test loss = 1.271

--------------



100%|██████████| 4315/4315 [02:19<00:00, 30.95it/s]
100%|██████████| 1078/1078 [00:26<00:00, 40.80it/s]


epoch 18 || train loss = 1.135	test loss = 1.272

--------------



100%|██████████| 4315/4315 [02:20<00:00, 30.82it/s]
100%|██████████| 1078/1078 [00:26<00:00, 41.03it/s]


epoch 19 || train loss = 1.131	test loss = 1.273

--------------



100%|██████████| 4315/4315 [02:20<00:00, 30.74it/s]
100%|██████████| 1078/1078 [00:26<00:00, 40.32it/s]

epoch 20 || train loss = 1.127	test loss = 1.274

A better model has not been found in the last 5 epochs. Early stopping...





In [19]:
### loading the best model

# reinitialize the model with the same architecture that was trained
best_model = characterRNN(
    input_size=vocab_size,
    embed_size=EMBED_SIZE,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    output_size=vocab_size,
)
# load best model's parameters; map_location remaps the stored tensors to the
# current device, so a checkpoint saved on a GPU can also be loaded on a
# CPU-only machine
# NOTE: torch.load unpickles the file -- only load checkpoints you trust
best_model.load_state_dict(torch.load(BEST_MODEL_FILEPATH, map_location=DEVICE))
best_model = best_model.to(DEVICE)
print("Model is initialized and the best model parameters are loaded.")

Model is initialized and the best model parameters are loaded.


In [24]:
### sample text from the best model at several temperature values
### (decreasing temperature value tends to generate more boring and predictable texts while increasing it brings more creativity and possibly lower quality texts)

# temperature values to compare and the length of each generated text
temperatures = [0.25, 0.5, 1.0]
generated_text_len = 1000

# generate and display one text per temperature value
for temp in temperatures:
    generated_text = generate_text(
        best_model,
        temperature=temp,
        generated_text_len=generated_text_len,
    )
    # header, generated text and separator, each on its own line
    print(f'Temperature = {temp}', generated_text, "--------------\n\n", sep='\n')

Temperature = 0.25

     the other friends and a most attention of the considerable to be all that he had not the letter, and they were at the same particular could not be such a settled to the subject of the family as the proper of the same present of his family and some promised to the party. When they were an actively the concern in the subject of the particular of the rest of the family and to the present at the subject of the particulars of his sister and the manner of her attention of the subjects were some of the soon attention was all the subject of her father and happiness to any other feelings and a silence of the other particular for endeavour to the particular of the next morning to the construed to her than to be all the subject of the particulars of the morning was the present of the company of her attention of his sister to the particulars of the same promised to her house to be a few moments were at the subject of the present away to the probably and the house, and the 