In [15]:
import copy

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data


# Load the text corpus and lowercase it to shrink the vocabulary.
# The context manager closes the file handle even if reading fails.
with open("jokes.txt", 'r', encoding='utf-8') as corpus_file:
    file_contents = corpus_file.read().lower()

# Map every distinct character to an integer id (ids follow sorted order)
unique_chars = sorted(set(file_contents))
char_to_int = {char: integer for integer, char in enumerate(unique_chars)}

# Summarize the loaded data
total_chars = len(file_contents)
total_unique_chars = len(unique_chars)

# Number of characters the model sees before predicting the next one
sequence_length = 100

# Fail early with a clear message instead of silently building an empty dataset
if total_chars <= sequence_length:
    raise ValueError(
        "jokes.txt has %d characters; more than %d are needed to build a training window"
        % (total_chars, sequence_length)
    )

# Encode the corpus once, then slice windows out of the encoded list
# (avoids repeating the dictionary lookup for every character of every window).
encoded_text = [char_to_int[char] for char in file_contents]

# input_data[i] holds the ids of characters i .. i+sequence_length-1,
# output_data[i] holds the id of the character that follows that window.
input_data = [
    encoded_text[start:start + sequence_length]
    for start in range(total_chars - sequence_length)
]
output_data = encoded_text[sequence_length:]

# Calculate the total number of samples
total_samples = len(input_data)

# Convert into PyTorch tensors and reshape X to [samples, time steps, features]
X_input = torch.tensor(input_data, dtype=torch.float32).reshape(total_samples, sequence_length, 1)
# Scale the integer ids by the vocabulary size so inputs lie in [0, 1)
X_input = X_input / float(total_unique_chars)
y_output = torch.tensor(output_data)

class CharModel(nn.Module):
    """LSTM that predicts the next character from a sequence of scaled character ids.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of output logits (one per vocabulary entry). When
            ``None`` the notebook-level ``total_unique_chars`` is used, which
            is the behaviour callers relied on before this parameter existed.
    """

    def __init__(self, input_size=1, hidden_size=256, num_layers=1, output_size=None):
        super().__init__()
        if output_size is None:
            # Backward-compatible default: size the head from the global vocabulary
            output_size = total_unique_chars
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        self.dropout = nn.Dropout(0.2)
        self.linear = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return logits computed from the LSTM output at the last time step.

        ``x`` is indexed as [batch, time, feature] (the LSTM is batch_first).
        """
        x, _ = self.lstm(x)
        # keep only the output of the final time step
        x = x[:, -1, :]
        # dropout, then project to one logit per output class
        return self.linear(self.dropout(x))

# Training configuration
n_epochs = 10
batch_size = 128
model = CharModel()

optimizer = optim.Adam(model.parameters())
# reduction="sum" so per-batch losses can be added up to a whole-dataset total
loss_fn = nn.CrossEntropyLoss(reduction="sum")
loader = data.DataLoader(data.TensorDataset(X_input, y_output), shuffle=True, batch_size=batch_size)

stored_model = None
best_loss_init = np.inf
for epoch in range(n_epochs):
    model.train()
    for X_batch, y_batch in loader:
        y_pred = model(X_batch)
        loss = loss_fn(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Evaluation pass.
    # NOTE: this iterates the same loader that was used for training, so the
    # reported number is the training loss with dropout disabled, not a loss
    # on held-out data.
    model.eval()
    loss = 0.0
    with torch.no_grad():
        for X_batch, y_batch in loader:
            y_pred = model(X_batch)
            # .item() keeps the running total a plain Python float
            loss += loss_fn(y_pred, y_batch).item()

    if loss < best_loss_init:
        best_loss_init = loss
        # state_dict() hands back references to the live parameter tensors;
        # without a deep copy, later optimizer steps would overwrite the
        # "best" snapshot with the weights of the most recent epoch.
        stored_model = copy.deepcopy(model.state_dict())

    print("epoch %d: loss: %.4f" % (epoch, loss))

# Persist the best weights together with the vocabulary needed to decode them
torch.save([stored_model, char_to_int], "single-char-lstm-joke.pth")


# Text generation: seed the model with a prompt and extend it greedily.
# The seed is a (lowercase) excerpt from the training corpus.
prompt = "my friend told an out of place joke about police searches. but i don't think it wa"
print('Prompt: "%s"' % prompt)

# To generate from the checkpoint on disk instead, uncomment:
#stored_model, char_to_int = torch.load("single-char-lstm-joke.pth")
int_to_char = {integer: char for char, integer in char_to_int.items()}
model = CharModel()
model.load_state_dict(stored_model)
sample = [char_to_int[c] for c in prompt]

model.eval()
with torch.no_grad():
    for i in range(1000):
        # shape the id window as [1, time, 1] and scale it like the training data
        x = np.asarray(sample).reshape(1, len(sample), 1) / float(len(char_to_int))
        x = torch.tensor(x, dtype=torch.float32)
        prediction = model(x)
        # greedy decoding: take the id with the highest logit
        index = int(prediction.argmax())
        predicted_char = int_to_char[index]
        print(predicted_char, end="")
        # slide the window: drop the oldest id, add the one just predicted
        sample = sample[1:] + [index]

print("Stop.")

epoch 0: loss: 4186.0015
epoch 1: loss: 4049.6775
epoch 2: loss: 4029.7559
epoch 3: loss: 4026.6008
epoch 4: loss: 4025.7100
epoch 5: loss: 4024.7424
epoch 6: loss: 4023.9678
epoch 7: loss: 4024.9707
epoch 8: loss: 4022.4292
epoch 9: loss: 4022.1387
Prompt: "my friend told an out of place joke about police searches. but i don't think it wa"
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 

The above model did not perform well. The model only predicted " " as the next character. Consequently, the output looks empty. The training dataset itself was created by ChatGPT.

# Evaluation Snippet

In [16]:
def test_model(path:str = "", model_name:str = "", lowest_loss:str = ""):
    """Print 1000 greedily generated characters for a fixed prompt.

    Args:
        path: Checkpoint file to evaluate. Two layouts are understood:
            the list ``[state_dict, char_to_int]`` and the dict
            ``{'model_state_dict': ..., 'hyperparameters': ...}`` (optionally
            with a ``'char_to_int'`` entry). When empty, the notebook globals
            ``stored_model`` and ``char_to_int`` are used instead.
        model_name: Label printed in the BEGIN/END banner lines.
        lowest_loss: Value printed in the BEGIN banner; used for display only.
    """
    print(f"BEGIN - Model: {model_name}. Lowest Loss: {lowest_loss}")
    prompt = "my friend told an out of place joke about police searches. but i don't think it wa"
    print('Prompt: "%s"' % prompt)

    model_kwargs = {}
    if path:
        # NOTE(security): torch.load unpickles the file - only load trusted checkpoints.
        checkpoint = torch.load(path)
        if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
            state_dict = checkpoint['model_state_dict']
            hyperparameters = checkpoint.get('hyperparameters') or {}
            # Rebuild the network with the architecture it was trained with,
            # otherwise load_state_dict fails on mismatching tensor shapes.
            model_kwargs = {
                key: hyperparameters[key]
                for key in ('hidden_size', 'num_layers')
                if key in hyperparameters
            }
            # Fall back to the notebook vocabulary when the file carries none
            vocab = checkpoint.get('char_to_int', char_to_int)
        else:
            state_dict, vocab = checkpoint
    else:
        # No file given: evaluate the weights currently held in memory
        state_dict, vocab = stored_model, char_to_int

    int_to_char = {integer: char for char, integer in vocab.items()}
    model = CharModel(**model_kwargs)
    model.load_state_dict(state_dict)
    sample = [vocab[c] for c in prompt]
    vocab_size = float(len(vocab))

    model.eval()
    with torch.no_grad():
        for _ in range(1000):
            # format the id window as a [1, time, 1] float tensor, scaled like the training data
            x = np.reshape(sample, (1, len(sample), 1)) / vocab_size
            x = torch.tensor(x, dtype=torch.float32)
            prediction = model(x)
            # greedy decoding: pick the id with the highest logit
            index = int(prediction.argmax())
            print(int_to_char[index], end="")
            # slide the window: append the new id and drop the oldest one
            sample.append(index)
            sample = sample[1:]

    # terminate the generated line so the END banner starts on its own line
    print()
    print(f"END - Model: {model_name}")

# Model Tuning using Grid Search

In [17]:
# Search space: every combination of these values is trained and scored
hyperparameters_grid = {
    'hidden_sizes': [64, 128, 256],
    'num_layers': [1, 2, 4],
    'learning_rates': [0.001, 0.01, 0.1]
}

# Best configuration seen so far. Initialised up front so the save step below
# never hits an undefined name when no run improves on the initial loss.
best_model = None
best_hyperparameters = None
best_loss_optim = np.inf

# Identical for every configuration, so build them once outside the loops
loss_fn = nn.CrossEntropyLoss(reduction="sum")
loader = data.DataLoader(data.TensorDataset(X_input, y_output), shuffle=True, batch_size=batch_size)

print("###########################################")
for hidden_size in hyperparameters_grid["hidden_sizes"]:
    for num_layer in hyperparameters_grid["num_layers"]:
        for learning_rate in hyperparameters_grid["learning_rates"]:
            model = CharModel(input_size=1, hidden_size=hidden_size, num_layers=num_layer)
            optimizer = optim.Adam(model.parameters(), lr=learning_rate)

            for epoch in range(n_epochs):
                model.train()
                for X_batch, y_batch in loader:
                    y_pred = model(X_batch)
                    loss = loss_fn(y_pred, y_batch)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

            # Score the fully trained model once. Only the loss after the last
            # epoch is compared, so evaluating after every epoch was wasted work.
            # NOTE: this iterates the training loader, i.e. it is the training
            # loss with dropout disabled, not a loss on held-out data.
            model.eval()
            loss = 0.0
            with torch.no_grad():
                for X_batch, y_batch in loader:
                    loss += loss_fn(model(X_batch), y_batch).item()
            print(f"HYPERPARAMS: {hidden_size} | {num_layer} | {learning_rate}. LOSS: {loss}")

            # Keep this configuration if it beats the best loss so far.
            # Each configuration owns a fresh model that is not trained any
            # further, so holding its state_dict by reference is safe here.
            if loss < best_loss_optim:
                best_loss_optim = loss
                best_model = model.state_dict()
                best_hyperparameters = {
                    'hidden_size': hidden_size,
                    'num_layers': num_layer,
                    'learning_rate': learning_rate
                }
print("###########################################")

if best_model is None:
    raise RuntimeError("Grid search found no configuration that improved on the initial loss; nothing to save.")

# Store the vocabulary alongside the weights so the checkpoint can be decoded on its own
torch.save(
    {
        'model_state_dict': best_model,
        'hyperparameters': best_hyperparameters,
        'char_to_int': char_to_int,
    },
    "best_model.pth",
)

###########################################
HYPERPARAMS: 64 | 1 | 0.001. LOSS: 4024.072021484375
HYPERPARAMS: 64 | 1 | 0.01. LOSS: 3990.367431640625
HYPERPARAMS: 64 | 1 | 0.1. LOSS: 4106.58251953125
HYPERPARAMS: 64 | 2 | 0.001. LOSS: 4025.832763671875
HYPERPARAMS: 64 | 2 | 0.01. LOSS: 4000.3173828125
HYPERPARAMS: 64 | 2 | 0.1. LOSS: 4033.453369140625
HYPERPARAMS: 64 | 4 | 0.001. LOSS: 4025.082275390625
HYPERPARAMS: 64 | 4 | 0.01. LOSS: 4027.57666015625
HYPERPARAMS: 64 | 4 | 0.1. LOSS: 4030.952392578125
HYPERPARAMS: 128 | 1 | 0.001. LOSS: 4023.0400390625
HYPERPARAMS: 128 | 1 | 0.01. LOSS: 3931.471923828125
HYPERPARAMS: 128 | 1 | 0.1. LOSS: 4393.4921875
HYPERPARAMS: 128 | 2 | 0.001. LOSS: 4024.607666015625
HYPERPARAMS: 128 | 2 | 0.01. LOSS: 3945.140625
HYPERPARAMS: 128 | 2 | 0.1. LOSS: 4377.18798828125
HYPERPARAMS: 128 | 4 | 0.001. LOSS: 4025.356689453125
HYPERPARAMS: 128 | 4 | 0.01. LOSS: 4028.7197265625
HYPERPARAMS: 128 | 4 | 0.1. LOSS: 4530.85888671875
HYPERPARAMS: 256 | 1 | 0.001. LO

# Compare Initial Model with Optimized Model

In [18]:

# Checkpoint files written by the training cell and the grid-search cell
init_model_path = "single-char-lstm-joke.pth"
optim_model_path = "best_model.pth"

# Run the same generation test for both models, one after the other
test_model(
    path=init_model_path,
    model_name="Initial Model",
    lowest_loss=best_loss_init,
)
test_model(
    path=optim_model_path,
    model_name="Grid Search Optimized Model",
    lowest_loss=best_loss_optim,
)

BEGIN - Model: Initial Model. Lowest Loss: 4022.138671875
Prompt: "my friend told an out of place joke about police searches. but i don't think it wa"
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 

The loss is slightly lower after the grid search optimization. However, the model is still not usable, as the next predicted character remains " ". This is hardly surprising, as the space (" ") is the most common character in the training data set.

# Bayesian Hyperparameter Optimization

## Principle and Limits of Grid Search
In general, hyperparameter optimization aims to improve a model's performance. In the previous section, a grid-search-based approach was tested. The biggest downside of grid-based approaches is their low efficiency, since the number of combinations to evaluate grows quickly with the size of the hyperparameter search space. Furthermore, standard grid search does not adapt during the search; consequently, local minima are prone to be overlooked.

Bayesian hyperparameter optimization (BHO) takes a probabilistic approach to finding optimal hyperparameters. BHO keeps track of previous performance indicators and tries to create a probabilistic model that maps hyperparameters to likely model scores. This probabilistic model is called the "surrogate".

$$ P(score|hyperparameters) $$

In general, the surrogate model is (and should be) easier to optimize than the actual model of interest. The surrogate model can only have adequate knowledge of the hyperparameter space once enough hyperparameter sets have been evaluated. In regions of the hyperparameter space where few parameter sets have been tested, the variance of the underlying Gaussian process is high. However, as the hyperparameter space is not fully random, the model will be able to find promising spots and fine-tune the parameters in those regions more and more. A good BHO strikes a decent balance between exploration and exploitation.

## Sequential Model-Based Optimization

Sequential model-based optimization (SMBO) is a formalization of BHO with the following five key aspects:
1. Domain of hyperparameters over which to search (boundaries)
2. Objective function which takes in hyperparameters and outputs a score
3. The surrogate model of the objective function
4. Selection function which decides which hyperparameters to choose next
5. Memory consisting of past hyperparameter combinations and scores

Choosing a proper domain is crucial for BHO, as the possible range of the hyperparameters needs to be limited. Moreover, it is also suggested to define a probability distribution for those hyperparameters. For instance, it does not necessarily make sense to choose a normal distribution for the depth of a network; a log-normal distribution might make more sense. That way, shallow networks are preferred and the overall complexity is limited.

The objective function is the most computationally demanding part. Based on its evaluations, the surrogate function can be fitted. The surrogate represents a probabilistic model of the objective function, built from the previous evaluations. This response surface can be of very high dimensionality (number of hyperparameters / degrees of freedom). The selection function chooses the next hyperparameter combination; "expected improvement" is the most common selection criterion.