In [19]:
import time
from pprint import pprint

import matplotlib.pyplot as plt # for making figures
import pandas as pd
import torch
import torch.nn.functional as F
from torch import nn

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [20]:
# Display the installed PyTorch version so the results can be tied to a build.
torch.__version__

'2.0.0+cu118'

In [21]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [22]:
# Show which device was selected.
device

device(type='cuda')

In [23]:
# Seed later passed to torch.manual_seed and torch.Generator.manual_seed.
random_seed = 3

In [24]:
# Hyperparameter grid: one model is built and trained for every
# (context length, embedding size) combination of these two lists.
embedding_values = [5,10]
context_length_values = [5,7]

In [25]:
# Read the whole corpus into memory. The context manager closes the file
# handle as soon as the read finishes instead of leaving it to the garbage collector.
with open('shakespeare2.txt', 'r') as corpus_file:
    data = corpus_file.read()
data[:100]

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'

In [26]:
# Peek at the first ten characters of the corpus.
data[:10]

'First Citi'

In [27]:
# Character-level vocabulary: every distinct character of the corpus in sorted
# order, plus the two lookup tables (index -> character, character -> index).
unique_chars = sorted(set(data))
to_string = dict(enumerate(unique_chars))
to_int = {symbol: index for index, symbol in to_string.items()}
to_string, to_int

({0: '\n',
  1: ' ',
  2: '!',
  3: '$',
  4: '&',
  5: "'",
  6: ',',
  7: '-',
  8: '.',
  9: '3',
  10: ':',
  11: ';',
  12: '?',
  13: 'A',
  14: 'B',
  15: 'C',
  16: 'D',
  17: 'E',
  18: 'F',
  19: 'G',
  20: 'H',
  21: 'I',
  22: 'J',
  23: 'K',
  24: 'L',
  25: 'M',
  26: 'N',
  27: 'O',
  28: 'P',
  29: 'Q',
  30: 'R',
  31: 'S',
  32: 'T',
  33: 'U',
  34: 'V',
  35: 'W',
  36: 'X',
  37: 'Y',
  38: 'Z',
  39: 'a',
  40: 'b',
  41: 'c',
  42: 'd',
  43: 'e',
  44: 'f',
  45: 'g',
  46: 'h',
  47: 'i',
  48: 'j',
  49: 'k',
  50: 'l',
  51: 'm',
  52: 'n',
  53: 'o',
  54: 'p',
  55: 'q',
  56: 'r',
  57: 's',
  58: 't',
  59: 'u',
  60: 'v',
  61: 'w',
  62: 'x',
  63: 'y',
  64: 'z'},
 {'\n': 0,
  ' ': 1,
  '!': 2,
  '$': 3,
  '&': 4,
  "'": 5,
  ',': 6,
  '-': 7,
  '.': 8,
  '3': 9,
  ':': 10,
  ';': 11,
  '?': 12,
  'A': 13,
  'B': 14,
  'C': 15,
  'D': 16,
  'E': 17,
  'F': 18,
  'G': 19,
  'H': 20,
  'I': 21,
  'J': 22,
  'K': 23,
  'L': 24,
  'M': 25,
  'N': 26,
  'O'

In [28]:
def get_tensors(block_size, text=None, stoi=None):
    """Build next-character training pairs from a text.

    Every window of ``block_size`` consecutive characters becomes one row of
    ``X``; the character that immediately follows the window is the matching
    entry of ``Y``.

    Args:
        block_size: number of context characters per example (must be >= 1).
        text: string to slice into examples. Defaults to the notebook-level
            ``data`` corpus.
        stoi: mapping from character to integer index. Defaults to the
            notebook-level ``to_int`` table.

    Returns:
        ``(X, Y)`` where ``X`` is an int64 tensor of shape
        ``(len(text) - block_size, block_size)`` and ``Y`` is an int64 tensor
        of shape ``(len(text) - block_size,)``. When the text is not longer
        than ``block_size`` both tensors are empty but keep these dtypes and
        ranks.

    Raises:
        ValueError: if ``block_size`` is smaller than 1.
        KeyError: if ``text`` contains a character missing from ``stoi``.
    """
    if block_size < 1:
        raise ValueError(f"block_size must be >= 1, got {block_size}")
    if text is None:
        text = data
    if stoi is None:
        stoi = to_int

    # Encode the text once; all windows are then taken from this single tensor
    # instead of building one Python list per example.
    ids = torch.tensor([stoi[ch] for ch in text], dtype=torch.long)
    num_examples = ids.numel() - block_size
    if num_examples <= 0:
        return (
            torch.empty((0, block_size), dtype=torch.long),
            torch.empty((0,), dtype=torch.long),
        )

    # unfold yields len - block_size + 1 overlapping windows; the final window
    # has no following character, so it is dropped. The clone materialises the
    # overlapping view into an independent, contiguous tensor.
    X = ids.unfold(0, block_size, 1)[:num_examples].clone(
        memory_format=torch.contiguous_format
    )
    Y = ids[block_size:].clone()
    return X, Y


In [29]:
# Build the (X, Y) training tensors once per context length and report their shapes.
tensors_dict = {}
for context_length in context_length_values:
    inputs, targets = get_tensors(context_length)
    tensors_dict[context_length] = (inputs, targets)
    print(context_length, inputs.shape, targets.shape)

5 torch.Size([1115389, 5]) torch.Size([1115389])
7 torch.Size([1115387, 7]) torch.Size([1115387])


In [30]:
# One stand-alone embedding table per embedding size, with one row per vocabulary entry.
embedding_dict = {}
for embedding in embedding_values:
    table = nn.Embedding(len(to_string), embedding)
    embedding_dict[embedding] = table
    print(embedding, table.weight.shape)

5 torch.Size([65, 5])
10 torch.Size([65, 10])


In [31]:
class NextChar(nn.Module):
    """Character-level MLP that scores the next character from a fixed-size context.

    Each context index is embedded, the embeddings of one example are
    concatenated, and the result passes through two sine-activated linear
    layers followed by a linear projection to ``vocab_size`` logits.

    Args:
        block_size: number of context characters per example.
        vocab_size: number of embedding rows and of output logits.
        emb_dim: width of each character embedding.
        hidden_dims: indexable whose entries 0 and 1 are the widths of the
            first and second hidden layers.
    """

    def __init__(self, block_size, vocab_size, emb_dim, hidden_dims):
        super().__init__()
        first_width, second_width = hidden_dims[0], hidden_dims[1]
        # Attribute names double as state_dict keys (emb, lin1, lin2, lin3).
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.lin1 = nn.Linear(block_size * emb_dim, first_width)
        self.lin2 = nn.Linear(first_width, second_width)
        self.lin3 = nn.Linear(second_width, vocab_size)

    def forward(self, x):
        """Map an index tensor of shape (batch, block_size) to logits of shape (batch, vocab_size)."""
        batch = x.shape[0]
        # Concatenate the per-character embeddings of each example into one row.
        flat = self.emb(x).view(batch, -1)
        hidden = torch.sin(self.lin1(flat))
        hidden = torch.sin(self.lin2(hidden))
        return self.lin3(hidden)

In [32]:
# One untrained NextChar per (context_length, embedding) pair, each with two
# hidden layers of width 64 and placed on the selected device.
model_dict = {}
for context_length in context_length_values:
    for embedding in embedding_values:
        # nn.Module.to moves the parameters in place and returns the module itself.
        model_dict[(context_length, embedding)] = NextChar(
            context_length, len(to_string), embedding, [64, 64]
        ).to(device)

In [33]:
# Generate names from untrained model
# Seed a dedicated generator `g` and torch's global RNG with the same value.
g = torch.Generator().manual_seed(random_seed)
torch.manual_seed(random_seed)
def generate_name(model,sentence, itos, stoi, block_size, max_len=10):
    """Sample ``max_len`` characters from ``model`` as a continuation of ``sentence``.

    The last ``block_size`` characters of the prompt (left-padded with spaces
    when the prompt is shorter) form the initial context. At every step the
    model's logits for that context are sampled, the sampled character is
    appended to the output, and the context window slides forward by one.

    Args:
        model: callable mapping an index tensor of shape (1, block_size) to
            logits over the vocabulary.
        sentence: prompt text.
        itos: mapping from integer index to character.
        stoi: mapping from character to integer index.
        block_size: context length the model expects.
        max_len: number of characters to generate.

    Returns:
        The unmodified prompt followed by the generated characters.

    Raises:
        KeyError: if a context character (or its lower-case form) is missing
            from ``stoi``.
    """
    original_sentence = sentence
    if len(sentence) < block_size:
        sentence = " " * (block_size - len(sentence)) + sentence
    window = sentence[-block_size:]
    # Keep the prompt's case when the vocabulary can represent it; fall back to
    # lower case only for characters a lower-case-only vocabulary lacks.
    context = [stoi[ch] if ch in stoi else stoi[ch.lower()] for ch in window]

    # Run on whatever device the model currently lives on, so generation keeps
    # working after the model has been moved (for example to the CPU for saving).
    first_param = None
    if hasattr(model, "parameters"):
        first_param = next(iter(model.parameters()), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    generated = []
    # Sampling needs no gradients; skip building the autograd graph.
    with torch.no_grad():
        for _ in range(max_len):
            x = torch.tensor(context, device=model_device).view(1, -1)
            logits = model(x)
            ix = torch.distributions.categorical.Categorical(logits=logits).sample().item()
            generated.append(itos[ix])
            # Slide the window: drop the oldest index, append the new one.
            context = context[1:] + [ix]

    return original_sentence + "".join(generated)


In [34]:
# Train the model
#
# One independent run per (context_length, embedding) pair: AdamW on
# mini-batches taken in corpus order, after which the model is moved to the
# CPU and its state_dict is written to disk.

# Settings shared by every run.
batch_size = 4096
print_every = 100
loss_fn = nn.CrossEntropyLoss()

for context_length in context_length_values:
    for embedding in embedding_values:
        model = model_dict[(context_length, embedding)]
        X, Y = tensors_dict[context_length]
        # Copy the dataset to the training device once per run instead of
        # once per mini-batch; X and Y themselves are left where they were.
        X_device, Y_device = X.to(device), Y.to(device)
        opt = torch.optim.AdamW(model.parameters(), lr=0.01)
        elapsed_time = []  # wall-clock seconds taken by each epoch of this run
        # Mini-batch training
        for epoch in range(200):
            start_time = time.time()
            for i in range(0, X.shape[0], batch_size):
                x = X_device[i:i+batch_size]
                y = Y_device[i:i+batch_size]
                y_pred = model(x)
                loss = loss_fn(y_pred, y)
                loss.backward()
                opt.step()
                opt.zero_grad()
            end_time = time.time()
            elapsed_time.append(end_time - start_time)
            if epoch % print_every == 0:
                # `loss` is the loss of the epoch's final mini-batch only.
                print(f"context_length = {context_length}, dimension = {embedding},epoch = {epoch}, loss = {loss.item()}")

        # Move the weights to the CPU before saving the trained model.
        model = model.to('cpu')
        torch.save(model.state_dict(), f"context_{context_length}_embedding_{embedding}.pth")



context_length = 5, dimension = 5,epoch = 0, loss = 2.023308515548706
context_length = 5, dimension = 5,epoch = 100, loss = 1.697106957435608
context_length = 5, dimension = 10,epoch = 0, loss = 1.9821012020111084
context_length = 5, dimension = 10,epoch = 100, loss = 1.6859607696533203
context_length = 7, dimension = 5,epoch = 0, loss = 2.0991861820220947
context_length = 7, dimension = 5,epoch = 100, loss = 1.7360596656799316
context_length = 7, dimension = 10,epoch = 0, loss = 1.9347500801086426
context_length = 7, dimension = 10,epoch = 100, loss = 1.6435843706130981


Tuning knobs

1. Embedding size
2. MLP architecture (number and width of hidden layers)
3. Context length