In [1]:
import torch
import torch.nn as nn
from collections import defaultdict
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import tqdm
import math
from collections import defaultdict

  cpu = _conversion_method_template(device=torch.device("cpu"))


In [2]:
FILL_IN = "FILL_IN"

In [3]:
# Dictionaries, {idx -> ch} and {ch -> idx}
itos = defaultdict(int)
stoi = defaultdict(int)
# Number of characters used to predict the target character in the MLP Language Model 
block_size = 3
# Batch size used in MLP Language Model
batch_size = 32
# Embedding dimension, per character
d_model = 10
# Hidden dimension for RNN and also MLP Language Models 
d_h = 200
# define a device
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# START = START token
stoi['.'] = 0
itos[0] = '.'

# Loop over all names and create mappings itos and stoi mapping a unique character to a idx
for name in open('names.txt', 'r'):
    name = name.lower().strip()
    for char in name:
        if char not in stoi:
            idx = len(stoi)
            stoi[char] = idx
            itos[idx] = char

In [4]:
assert len(stoi) == len(itos)
vocab_size = len(stoi)
assert vocab_size == 27

In [5]:
stoi

defaultdict(int,
            {'.': 0,
             'e': 1,
             'm': 2,
             'a': 3,
             'o': 4,
             'l': 5,
             'i': 6,
             'v': 7,
             's': 8,
             'b': 9,
             'p': 10,
             'h': 11,
             'c': 12,
             'r': 13,
             't': 14,
             'y': 15,
             'n': 16,
             'g': 17,
             'z': 18,
             'f': 19,
             'd': 20,
             'u': 21,
             'k': 22,
             'w': 23,
             'q': 24,
             'x': 25,
             'j': 26})

## BiGram Language Model
- Implement the Bigram Language Model
- Get all the relevent counts, then get the train dataset Perplexity
- Use the class notes

In [6]:
# Using the formulas in class, loop over each name and get the parameters
c1 = defaultdict(int)
c2 = defaultdict(int)
for name in open('names.txt', 'r'):
    # Lowercase and remove any whitespace at the end
    name = name.lower().strip()
    # Pad with START = '.' and STOP = '.'
    name = '.' + name + '.'
    # Transform to integer
    name = [stoi[char] for char in name]
    # Get the counts for Bigrams and Unigrams
    for i in range(0, len(name)-1):
        c1[name[i]] += 1
        c2[(name[i], name[i+1])] += 1

In [7]:
# Get perplexity
sumneglogp = 0
T = 0
for name in open('names.txt', 'r'):
    # Get rid of white space and lowercase
    name = name.lower().strip()
    # Get the length of the word, without padding
    T += len(name)
    # Don't pad the STOP since we are not generating; pad with START
    name = '.' + name
    # Transform to integrs
    name = [stoi[char] for char in name]
    # Get the loss -log(p(name)); use that the log of the product is the sum of the logs
    for i in range(0, len(name)-1):
        sumneglogp += -math.log(c2[(name[i], name[i+1])] / c1[name[i]], 2)
# Print the Perplexity
# print('Preplexity: ', torch.pow(torch.tensor(sumneglogp / T ), 2).item()) <--- this was wrong
print('Perplexity: ', torch.pow(2, torch.tensor(sumneglogp / T )).item())

Perplexity:  13.24779987335205


In [8]:
# Generate a random word using this distributon
# When you generate STOP, terminate
name = '.'
while True:
    c = stoi[name[-1]]
    # Make the distribution from c to any other word other than START
    p = []
    for d in range(vocab_size):
        # Use the same indicies as the dictionary we set up
        # Populate p
        p.append(c2[(c, d)] / c1[c])
    # print(p)
    assert len(p) == vocab_size
    # Sample randmly from the probability using torch.Categorical
    categorical_dist = torch.distributions.Categorical(probs=torch.tensor(p))
    c = categorical_dist.sample()
    # Offset by 1 since we want indices [1, 2, ..., vocab_size]
    if c.item() == 0:
        break
    else:
        name += itos[c.item()]
print('Generated name: ' , name[1:])
            

Generated name:  jaierlynayran


## MLP Language Model

- Implement the MLP language model from below
- Look at page 7, Equation (1)
- https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf

In [9]:
x_data = []
y_data = []
for name in open('names.txt', 'r'):
    name = name.lower().strip()
    # Pad with block_size START tokens and 1 STOP token
    name = ''.join(block_size * ['.']) + name + '.'
    # Loop through name and get the (x, y) pairs
    # Add (x, y) to x_data and y_data and make sure you transform to characters
    # Make sure x_data and y_data have integers, use stoi
    data = [stoi[x] for x in name]
    for i in range(len(name) - block_size):
        x_data.append(data[i:i+block_size])
        y_data.append(data[i+block_size])

In [10]:
class MLPLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        # An embedding for each character; vocab_size of them
        self.e = nn.Embedding(vocab_size, d_model)
        # H; should take in block_size * d_model vector and output d_h
        self.H = nn.Linear(block_size * d_model, d_h)
        # U; should take in d_h vector and output vocab_size
        self.U = nn.Linear(d_h, vocab_size, bias=False)
        # W; for the skip connection, should take in block_size * d_model and output vocab_size
        self.W = nn.Linear(block_size * d_model, vocab_size)
        self.tanh = nn.Tanh()

    # x should be (batch_size, block_size)
    def forward(self, x):
        embed_x = self.e(x)
        embed_x = torch.flatten(embed_x, start_dim=1)
        x = self.H(embed_x)
        x = self.tanh(x)
        x = self.U(x)
        x = x + self.W(embed_x)
        return x

In [11]:
x_data[0], y_data[0]

([0, 0, 0], 1)

In [12]:
# Define a dataloader with x_data and y_data with batch_size
dataset = TensorDataset(torch.tensor(x_data, dtype=torch.long), torch.tensor(y_data, dtype=torch.long))
dl = DataLoader(dataset, batch_size=batch_size)

In [13]:
for xb, yb in dl:
    assert xb.shape == (batch_size, 3)
    assert yb.shape == (batch_size,)
    break

In [14]:
# Define the MLP model and the Adam optimizer learning rate 0.001
model = MLPLanguageModel().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
lossfn = torch.nn.CrossEntropyLoss()

In [15]:
total_loss = 0
total_ct = 0
total_epochs = 20

for _ in range(total_epochs):
    for xb, yb in dl:
        # Move the dataset to device
        xb = xb.to(device)
        yb = yb.to(device)
        # Zero the gradients
        optimizer.zero_grad()

        # Get the logits
        logits = model(xb)
                
        # Compute the loss
        loss = lossfn(logits, yb)

        # Get the new gradient
        loss.backward()

        # Clip the gradients to max norm 0.1
        # Use clid_grad_norm from torch
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.1)
        
        # Do a gradient update
        optimizer.step()

        # Get the loss for the batch and get the number of batches
        total_loss += loss.item()
        total_ct += 1

        # Print the loss
        if total_ct and total_ct % 500 == 0:
            print(total_loss / total_ct)
            total_loss = 0
            total_ct = 0

2.424802914381027
2.2805429265499115
2.3022407245635987
2.3120115604400633
2.336812123537064
2.32740464758873
2.3279603147506713
2.234252953290939
2.3639890880584717
2.356785969018936
2.4037855820655825
2.433285714626312
2.44852986240387
2.429318770647049
2.2293992881774902
2.1211539237499237
2.156935467720032
2.155583783388138
2.200127779483795
2.2304863834381106
2.237281819343567
2.213642749786377
2.2969074006080628
2.2804956331253052
2.315643363237381
2.350116686105728
2.3751796038150785
2.407296222686768
2.243400658607483
2.057163140296936
2.0970083482265474
2.10306525182724
2.1259765937328337
2.1799373745918276
2.2113619110584257
2.2060248572826384
2.225477044582367
2.236031531572342
2.2610913944244384
2.2986502966880797
2.331516325235367
2.361893581390381
2.3076548080444335
2.018446249008179
2.0408351941108704
2.07105778670311
2.077022607088089
2.136905805826187
2.181008327245712
2.190029492855072
2.2049903037548066
2.1986499705314637
2.2185363149642945
2.2688673341274264
2.28538

In [16]:
lossfn2 = torch.nn.CrossEntropyLoss(reduction='sum')
with torch.no_grad():
    # Get perplexity
    sumneglogp = 0
    T = 0
    for name in open('names.txt', 'r'):
        name = name.lower().strip()
        T += len(name)
        # Pad with block_size START tokens
        name = ''.join(block_size * ['.']) + name
        x_data = []
        y_data = []
        # Gather all the terms over the loss
        # Notice that we compute -log p(...abc)
        # Which is -log p(a | ...) - log p(b | a..) - log p(c | ba.)
        data = [stoi[x] for x in name]
        for i in range(len(name) - block_size):
            x_data.append(data[i:i+block_size])
            y_data.append(data[i+block_size])
        x_data = torch.tensor(x_data, dtype=torch.long).to(device)
        y_data = torch.tensor(y_data, dtype=torch.long).to(device)
        # Gather the loss over the name, for each term
        # You need to get the softmax loss for each term
        # Can either use the CrossEntropyLoss or do this manually
        # Compute the loss
        logits = model(x_data)

        # Use reduction "sum" so you don't need to worry about N
        loss = lossfn2(logits, y_data)

        # Change to log base 2
        loss *= 1/math.log(2)

        sumneglogp += loss

    # print('Preplexity: ', torch.pow(sumneglogp.clone().detach() / T , 2).item()) <--- this was wrong
    print('Perplexity: ', torch.pow(2, sumneglogp.clone().detach() / T).item())   

Perplexity:  10.64924430847168


In [17]:
# Generate a random word using this distributon
# When you generate STOP, terminate
name = ''.join(block_size * ['.'])
while True:
    # Get the idx
    c = torch.tensor([stoi[x] for x in name[-block_size:]], dtype=torch.long).unsqueeze(0).to(device)
    # Make the distribution from c to any other word other than START
    with torch.no_grad():
        p = model(c).squeeze()
    # Randomly sample from p a new character
    categorical_dist = torch.distributions.Categorical(logits=p)
    c = categorical_dist.sample()
    if c.item() == 0:
        break
    else:
        name += itos[c.item()]
print('Generated name: ' , name[block_size:])

Generated name:  gurame


## RNN Language Model
- For each name, run an RNN character by character
- Use the recursion x = Tanh()(Wh @ h + Wx @ x + bh + bx) and y = Softmax()(Wy h + by)
- Do not use the RNN Cell from PyTorch, do this manually as hinted below

In [18]:
class RNNLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        # Each token has an embedding of size vocab_size
        self.e = nn.Embedding(vocab_size, d_model)
        # Wh used to map hidden to hidden
        self.Wh = nn.Linear(d_h, d_h)
        self.Wx = nn.Linear(d_model, d_h)
        self.Wy = nn.Linear(d_h, vocab_size)
        self.tanh = nn.Tanh()

    def forward(self, x, h):
        # Run through to get the embedding for the token
        # The embedding per token is the feature vector x  we pass into the
        # Represent x as an embedding
        x = self.e(x)
        # Get the hidden state
        h = self.tanh(self.Wh(h) + self.Wx(x))
        # Get the logits we use to predict y
        z = self.Wy(h)
        # Return the z predicting y for the timestep we are at and the next hidden state
        return z, h

In [19]:
model = RNNLanguageModel().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
lossfn = torch.nn.CrossEntropyLoss()

In [20]:
total_loss = 0
total_ct = 0
total_epochs = 5

for _ in range(total_epochs):
    for name in open('names.txt', 'r'):
        name = name.lower().strip()
        # Add the start and end padding token
        name = '.' + name + '.'
        # name[:-1]
        x_data = torch.tensor([stoi[x] for x in name[:-1]], dtype=torch.long).to(device)
        # name[1:]
        y_data = torch.tensor([stoi[y] for y in name[1:]], dtype=torch.long).to(device)
        logits = []
        # Set the hidden state to random
        h = torch.rand((d_h)).to(device)
        # Zero the grad
        optimizer.zero_grad()
        
        # Loop through each token and get the new h and then pass it forward
        # Accumulate all the logits
        for x in x_data:
            z, h = model(x, h)
            logits.append(z)
        
        # Put all the logits into one tensor
        logits = torch.stack(logits)
                
        # Compute the loss
        loss = lossfn(logits, y_data)

        # Get the new gradient
        loss.backward()

        # Clip the gradients at max norm 0.1
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.1)
        
        # Do a gradient update
        optimizer.step()

        # Get the loss for the batch and get the number of batches
        total_loss += loss.item()
        total_ct += 1

        if total_ct and total_ct % 100 == 0:
            print(total_loss / total_ct)
            total_loss = 0
            total_ct = 0
        

3.1787993788719175
2.9261367177963256
2.623743758201599
2.5182638823986054
2.4591938173770904
2.459174566268921
2.287376394271851
2.2815901720523835
2.3412524843215943
2.3835724878311155
2.350095978975296
2.3221794927120207
2.3293706011772155
2.377852574586868
2.3243053936958313
2.2765778195858
2.365855212211609
2.3168812870979307
2.3706305658817293
2.291849093437195
2.4018107795715333
2.3130504024028777
2.3277452111244203
2.3775104904174804
2.350185858011246
2.303881766796112
2.3312344086170196
2.286299113035202
2.3128930127620695
2.2720192635059355
2.257389837503433
2.326676433086395
2.2619024825096132
2.2856195795536043
2.2998231017589568
2.2427855694293974
2.3336423230171204
2.3362000930309295
2.2741802537441256
2.267365732192993
2.282627214193344
2.3731164681911467
2.253259996175766
2.321033321619034
2.2926199615001677
2.246405313014984
2.289404674768448
2.2481949400901793
2.2359469890594483
2.2600600135326387
2.2900845885276793
2.325423084497452
2.264633911848068
2.32229703903198

In [21]:
lossfn2 = torch.nn.CrossEntropyLoss(reduction='sum')
with torch.no_grad():
    # Get perplexity
    sumneglogp = 0
    T = 0
    for name in open('names.txt', 'r'):
        name = name.lower().strip()
        T += len(name)
        name = '.' + name
        # Get the name from index 0 to -1 exclusive end
        x_data = name[:-1]
        # Get the y from index 1 to end inclusive end
        y_data = name[1:]
        # logits per token prediction
        logits = []
        # Initialize the h vector to random
        h = torch.rand((d_h)).to(device)
        # Loop over each chracter in the name and pass h and this into the RNN
        # Get the new logit
        for x in x_data:
            # Get the int for x
            x = torch.tensor(stoi[x], dtype=torch.long).to(device)
            # Get z and h
            z, h = model(x, h)
            # Append to logit
            logits.append(z)

        # Get all the logits for each character
        logits = torch.stack(logits)

        # Compute the loss across all characters
        loss = lossfn2(logits, torch.tensor([stoi[y] for y in y_data], dtype=torch.long).to(device))
                
        # Change to log base 2
        # log2(x) = ln(x) / ln(2)
        loss *= 1/math.log(2)

        sumneglogp += loss
        
    # sumneglogp is -log(p('.' + name1)) -log(p('.' + name2)) -log(p('.' + name3)) ...  
    # Divide by the appropriate term to get the answer we want 
    # print('Preplexity: ', torch.pow(sumneglogp.clone().detach() / T , 2).item()) <--- this was wrong
    print('Perplexity: ', torch.pow(2, sumneglogp.clone().detach() / T).item())
    

Perplexity:  12.176115989685059


In [22]:
# Generate a random word using this distributon
# Intialize the word with 
name = '.'
# Initialize h to random
h = torch.rand((d_h)).to(device)
while True:
    # Make c to an integer
    c = torch.tensor(stoi[name[-1]], dtype=torch.long).to(device)
    # Make the distribution from c to any other word other than START
    logits, h = model(c, h)
    # Get p; use Softmax
    p = torch.nn.functional.softmax(logits, dim=0)
    # Sample from p
    categorical_dist = torch.distributions.Categorical(probs=p)
    c = categorical_dist.sample()
    # If we generate '.', stop
    if c.item() == 0:
        break
    else:
        name += itos[c.item()]
print('Generated name: ' , name[1:])

Generated name:  zaled
