### Building Makemore part 5: making the model more complex

So we would like to complexify the model more:
1. We would like to take more characters as inputs. So the context would be larger
2. Becasue we are taking more characters and squashing them to a linear layers that is putting too much information all too quickly.
3. Instead what we would like to do is that we would like to gradually squash the information and build a deeper network that progressively squeezes the inputs and give out the predictions

So the starter code for part 5 is identical to part 3. The part 4 backpropagation  was a sort of an independent part. We would pick up where we left off in part 3.

In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go

In [2]:
with open("D://Datasets/names.txt", 'r') as file:
    names = file.read().splitlines()
names[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [3]:
torch.manual_seed(42)   # Here we set the global seed value

<torch._C.Generator at 0x21737596ef0>

In [4]:
vocabulary = sorted(list(set(''.join(names))))
chartoidx = {}
idxtochar = {}
chartoidx['.'] = 0   # Putting a special token to denote the start and the end of a sentence.
idxtochar[0] = '.'
for i,char in enumerate(vocabulary):
    chartoidx[char] = i+1
    idxtochar[i+1] = char

print(chartoidx)

{'.': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26}


In [None]:
block_size = 3 # This would be used to set the number of context letters that would be in the word.
# blank_context = idxtochar[0] * block_size  # This would produce the context consisting of only special character "."
# print(f"The blank context is {blank_context}")

# Now we would need to split the names into train, dev and test sets
train_names = names[:int(0.8*len(names))]
dev_names = names[int(0.8*len(names)): int(0.9*len(names))]
test_names = names[int(0.9*len(names)):]

# Now we need to define our xs and ys
X_train, y_train = [],[]
X_dev, y_dev = [],[]
X_test, y_test = [], []

# Now we need to add the data into our xs and ys:
for i,split in enumerate([train_names, dev_names, test_names]):
    for word in split:
        word = word+"."
        blank_context = [0] * block_size  # This would produce the context consisting of only special character "."
        # xs.append(blank_context)
        if i == 0:
            for ch in word:
                X_train.append(blank_context)
                y_train.append(chartoidx[ch])
                blank_context = blank_context[1:] + [chartoidx[ch]]
        if i == 1:
            for ch in word:
                X_dev.append(blank_context)
                y_dev.append(chartoidx[ch])
                blank_context = blank_context[1:] + [chartoidx[ch]]
        if i == 2:
            for ch in word:
                X_test.append(blank_context)
                y_test.append(chartoidx[ch])
                blank_context = blank_context[1:] + [chartoidx[ch]]
      
        # print(blank_context)
X_train = torch.tensor(X_train)
y_train = torch.tensor(y_train)
X_dev = torch.tensor(X_dev)
y_dev = torch.tensor(y_dev)
X_test = torch.tensor(X_test)
y_test = torch.tensor(y_test)  
print(len(X_train), len(y_train))
print(len(X_dev), len(y_dev))
print(len(X_test), len(y_test))

182778 182778
22633 22633
22735 22735


So in part 3 we also made an API of the layers which was very similar to the pytorch API. SO we will copy them here

In [8]:
# So we would convert our code into modules like the Linear Module and  the BatchNorm Module and then we would write the training loop pytorch style
class Linear():
    """ This class creates a Linear layer with the weights and biases and performs the calculations"""
    def __init__(self, fan_in, fan_out, bias = True):
        self.weight = torch.randn((fan_in, fan_out)) / (fan_in**0.5)
        self.bias = torch.zeros(fan_out) if bias else None
    
    def __call__(self, x):
        self.out = x @ self.weight
        if self.bias is not None:
            self.out += self.bias
        return self.out

    def parameters(self):
        return [self.weight, [] if self.bias is None else self.bias]
# ------------------------------------------------------------------------------------------------------------------------------------------------
class BatchNorm1D():
    """ This class creates a Batch Normalization layer"""
    def __init__(self, dim, epsilon = 1e-5, momentum = 0.1):
        self.epsilon = epsilon
        self.momentum = momentum
        self.training = True
        # Initializing learnable paramters
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)

        # Buffers trained with a momentum update
        self.running_mean = torch.zeros(dim)
        self.running_variance = torch.ones(dim)

    def __call__(self, x):
        if x.ndim == 2:
            dim = 0
        if x.ndim == 3:
            dim = (0,1)
        # Calculate the forward pass
        if self.training:
            xmean = x.mean(dim, keepdim=True)
            xvar = x.var(dim,keepdim = True, unbiased = True)
        else:
            xmean = self.running_mean
            xvar = self.running_variance

        xhat = (x-xmean)/ torch.sqrt(xvar+self.epsilon)
        self.out = self.gamma * xhat + self.beta

        # Now update the running mean buffers
        if self.training:
            with torch.no_grad():
                self.running_mean = (1-self.momentum) * self.running_mean + self.momentum * xmean
                self.running_variance = (1-self.momentum)* self.running_variance + self.momentum * xvar
        return self.out

    def parameters(self):
        return [self.gamma, self.beta]
# ---------------------------------------------------------------------------------------------------------------------------------------------------
class Tanh():
    def __call__(self,x):
        self.out = torch.tanh(x)
        return self.out
    def parameters(self):
        return []

# -------------------------------------------------------------------------------------------------------------------------------------------------------------------------
class Embedding():
    def __init__(self, num_embeddings, embedding_dim):
        self.weight = torch.randn((num_embeddings, embedding_dim))
        
    def __call__(self,IX):
        self.out = self.weight[IX]
        return self.out

    def parameters(self):
        return [self.weight]
#--------------------------------------------------------------------------------------------------------------------------------------------------------------------------
class Flatten():
    def __call__(self, X):
        self.out =  X.view(X.shape[0], -1)
        return self.out
    def parameters(self):
        return []

# --------------------------------------------------------------------------------------------------------------------------------------------------------------------------
class Sequential():
    def __init__(self, layers:list):
        self.layers = layers

    def __call__(self, x):
        for layer in self.layers:
            x = layer(x)
        self.out = x
        return self.out

    def parameters(self):
         return [p for layer in self.layers for p in layer.parameters()]
            

# ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
class FlattenConsecutive():
    def __init__(self, n):
        """ n is the how many consecutive tensors you want concatenated in the output"""
        self.n = n
    def __call__(self, x):
        B, T, C = x.shape   # x is a 3D tensor
        self.out = x.view(B, T//self.n, C*self.n)
        # Sometimes T==n it creates a 1 at dim = 1 in the self.out tensor. So we would remove the extra dimension by tensor.squeeze in that case
        if self.out.shape[1] == 1:
            self.out = torch.squeeze(self.out, dim=1)

        return self.out
    def parameters(self):
        return []
        

In [126]:
n_embed = 10
n_hidden = 100
block_size = 3

c = torch.randn(len(chartoidx), n_embed) # Instead of the embedding we would have the Embedding Layer in layers which we just added in this chapter

layers = [
    Linear(n_embed*block_size, n_hidden),BatchNorm1D(n_hidden), Tanh(),
    Linear(n_hidden, n_hidden),BatchNorm1D(n_hidden), Tanh(),
    Linear(n_hidden, n_hidden),BatchNorm1D(n_hidden), Tanh(),
    Linear(n_hidden, n_hidden),BatchNorm1D(n_hidden), Tanh(),
    Linear(n_hidden, n_hidden),BatchNorm1D(n_hidden), Tanh(),
    Linear(n_hidden, len(chartoidx)),BatchNorm1D(len(chartoidx))
]


with torch.no_grad():
    # Because the last layer is batchNorm we would not be making the softmax less confident. We would make gamma less confident
    layers[-1].gamma*= 0.1
    # If we remove the batchnorm layer then we would have to make the softmax less confident
    # Make the output softmax less confident
    # layers[-1].weight *= 0.1

    for layer in layers[:-1]:
        # apply the gain parameters of the kaiming initialization
        if isinstance(layer, Linear):
            layer.weight *= 5/3
# calculate the parameters and make the parameters requires_grad attribute = True
parameter_list = [c]+[p for layer in layers for p in layer.parameters()]

p = sum([x.nelement() for x in parameter_list])
print(p)
for p in parameter_list:
    p.requires_grad = True

47551


The optimization will not change so we it will be familiar

In [105]:
# First we would need to train the Neural Net and then we would need to get the values activation for plotting
data_dict ={'train':[X_train, y_train], 
            'dev':[X_dev, y_dev],
            'test':[X_test, y_test]}

epochs = 100000
batch_size = 32
lossi = []
update_data_ratios = []
# Now we would write the training loop
def training_loop(epochs, batch_size, data, is_val_set = False):
    if is_val_set == False:
        for epoch in range(0,epochs):
            # get random 32 indices to take from the training set
            ix = torch.randint(0, data_dict['train'][0].shape[0], size=(32,)) # returns 32 random integers between 0 and len(X_train)
            data = data_dict['train'][0][ix] # This would get the actual training data of shape (32,3) i.e (batch_size, block_size) 
            embeddings = c[data]  # This will get the embeddings of the each of the data i.e (32,3,10)

            # Now we would convert the embedding tensor to a shape that will be used by the model

            x = embeddings.view(embeddings.shape[0],-1) # This is the reshaped data that would be passed into the model

            # Now we would get the activations of the layers by inputting the data into our model
            for layer in layers:
                x = layer(x)
            loss = F.cross_entropy(x, data_dict['train'][1][ix])

            # Now do the backward pass and retain the gradients
            for layer in layers:
                layer.out.retain_grad()
            # zero out the gradients
            for p in parameter_list:
                p.grad = None
            
            # Do the backward propagation
            loss.backward()

            # Update the parameters
            lr = 0.1 if epoch<500000 else 0.01
            for p in parameter_list:
                p.data += -lr*p.grad
            
            with torch.no_grad():
                update_data_ratios.append([(lr*p.grad.std()/p.data.std()).log10().item() for p in parameter_list])

            lossi.append(loss.item())
            if epoch%10000 == 0:
                # lossi.append(loss.item())
                print(f"epochs: {epoch:}/{epochs:} | loss :{loss.item():.4f}")
            # if epoch == 1000:
                # break
    else:
        # When we want to evaluate the trained network we need to set the batch_norm layers to evaluate mode:
        for layer in layers:
            layer.training = False
        with torch.no_grad():
            val_ix = torch.randint(0, data_dict['dev'][0].shape[0], (32,))  # this would generate 32 random indexes which we can filter our dataset to train over a mini-batch
            val_embed = c[data_dict['dev'][0][val_ix]]   # (32,3,2)
            x = val_embed.view(val_embed.shape[0], -1)
            for layer in layers:
                x = layer(x)
            val_loss = F.cross_entropy(x, data_dict['dev'][1][val_ix])
        print(f"The Validation loss is :{val_loss}")     
training_loop(epochs = epochs,batch_size = batch_size, data = data_dict, is_val_set=False)

epochs: 0/100000 | loss :3.3016
epochs: 10000/100000 | loss :2.3855
epochs: 20000/100000 | loss :2.0238
epochs: 30000/100000 | loss :2.1849
epochs: 40000/100000 | loss :2.1669
epochs: 50000/100000 | loss :2.3394
epochs: 60000/100000 | loss :2.0242
epochs: 70000/100000 | loss :2.3119
epochs: 80000/100000 | loss :2.0425
epochs: 90000/100000 | loss :1.9030


In [106]:
lossi_copy = lossi.copy()

In [114]:
training_loop(epochs = 100,batch_size = 32, data = data_dict, is_val_set=True)

The Validation loss is :1.9105645418167114


1. The training loss is 2.0261 and the validation loss is 2.122467 when trained for 100000 epochs.

In [115]:
print(len(lossi_copy))

100000


In [116]:
fig = go.Figure()

fig.add_trace(go.Scatter(y  = lossi_copy, name = "Loss"))
fig.update_layout(title = 'Loss Per epoch')

But the problem here is that the graph is very unappealing as it is now so what we should do is that we should make the graph of the mean so we get a more representative values.

In [119]:
# So we will create a tensor of lossi floats and we are going to stretch it out in a row format.
lossi_copy = torch.tensor(lossi_copy).view(-1,500)
# So the lossi tensor would be 200 rows of 500 values  and now we can do the mean along the rows and this would make the shape of 200 rows
print(lossi_copy.shape)
# Now get the mean values of the lossi_copy tensor
lossi_copy = lossi_copy.mean(1)
fig = go.Figure()

fig.add_trace(go.Scatter(y  = lossi_copy.numpy(), name = "Loss"))
fig.update_layout(title = 'Loss Per epoch')

torch.Size([200, 500])



To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).



Now we can see that the loss graph is a better looking graph

We can also see that the problem our forward pass also has more of sort of hacky implementations of embedding layers and also the list of the layers instead of being a list we should make them into a more  concrete implementation with an implementation similar to the torch.Sequential. So we should impelement sequential and embedding layers also.We would also need to create a flattening class.

#### So after making seperate modules for the layers, we we would incorporate them into our network. This would also change the forward pass of our network

In [147]:
n_embed = 10
n_hidden = 100
block_size = 3

# c = torch.randn(len(chartoidx), n_embed) # Instead of the embedding we would have the Embedding Layer in layers which we just added in this chapter

layers = [
    Embedding(len(chartoidx), n_embed),
    Flatten(),
    Linear(n_embed*block_size, n_hidden),BatchNorm1D(n_hidden), Tanh(),
    Linear(n_hidden, n_hidden),BatchNorm1D(n_hidden), Tanh(),
    Linear(n_hidden, n_hidden),BatchNorm1D(n_hidden), Tanh(),
    Linear(n_hidden, n_hidden),BatchNorm1D(n_hidden), Tanh(),
    Linear(n_hidden, n_hidden),BatchNorm1D(n_hidden), Tanh(),
    Linear(n_hidden, len(chartoidx)),BatchNorm1D(len(chartoidx))
]


with torch.no_grad():
    # Because the last layer is batchNorm we would not be making the softmax less confident. We would make gamma less confident
    layers[-1].gamma*= 0.1
    # If we remove the batchnorm layer then we would have to make the softmax less confident
    # Make the output softmax less confident
    # layers[-1].weight *= 0.1

    for layer in layers[:-1]:
        # apply the gain parameters of the kaiming initialization
        if isinstance(layer, Linear):
            layer.weight *= 5/3
# calculate the parameters and make the parameters requires_grad attribute = True
parameter_list = [p for layer in layers for p in layer.parameters()]

p = sum([x.nelement() for x in parameter_list])
print(p)
for p in parameter_list:
    p.requires_grad = True

47551


In [150]:
epochs = 10000
batch_size = 32
lossi = []
update_data_ratios = []
# Now we would write the training loop
def training_loop(epochs, batch_size, data, is_val_set = False):
    if is_val_set == False:
        for epoch in range(0,epochs):
            # get random 32 indices to take from the training set
            ix = torch.randint(0, data_dict['train'][0].shape[0], size=(32,)) # returns 32 random integers between 0 and len(X_train)
            data = data_dict['train'][0][ix] # This would get the actual training data of shape (32,3) i.e (batch_size, block_size) 
            x = data.clone()

            # The getting the embeddings and the flattening part have been added straight to the model
            # embeddings = c[data]  # This will get the embeddings of the each of the data i.e (32,3,10)

            # # Now we would convert the embedding tensor to a shape that will be used by the model

            # x = embeddings.view(embeddings.shape[0],-1) # This is the reshaped data that would be passed into the model

            # # Now we would get the activations of the layers by inputting the data into our model
            for layer in layers:
                x = layer(x)
            loss = F.cross_entropy(x, data_dict['train'][1][ix])

            # Now do the backward pass and retain the gradients
            for layer in layers:
                layer.out.retain_grad()
            # zero out the gradients
            for p in parameter_list:
                p.grad = None
            
            # Do the backward propagation
            loss.backward()

            # Update the parameters
            lr = 0.1 if epoch<500000 else 0.01
            for p in parameter_list:
                p.data += -lr*p.grad
            
            with torch.no_grad():
                update_data_ratios.append([(lr*p.grad.std()/p.data.std()).log10().item() for p in parameter_list])

            lossi.append(loss.item())
            if epoch%10000 == 0:
                # lossi.append(loss.item())
                print(f"epochs: {epoch:}/{epochs:} | loss :{loss.item():.4f}")
            # if epoch == 1000:
                # break
    else:
        # When we want to evaluate the trained network we need to set the batch_norm layers to evaluate mode:
        for layer in layers:
            layer.training = False
        with torch.no_grad():
            val_ix = torch.randint(0, data_dict['dev'][0].shape[0], (32,))  # this would generate 32 random indexes which we can filter our dataset to train over a mini-batch
            val_data = data_dict['dev'][0][val_ix]   # (32,3,2)
            x = val_data.clone()
            # x = val_embed.view(val_embed.shape[0], -1) # Also removed the embeddings and the falttening operations here

            for layer in layers:
                x = layer(x)
            val_loss = F.cross_entropy(x, data_dict['dev'][1][val_ix])
        print(f"The Validation loss is :{val_loss}")     


In [151]:
# Check if the current model is training
training_loop(epochs = epochs,batch_size = batch_size, data = data_dict, is_val_set=False)

epochs: 0/10000 | loss :2.3676


In [152]:
# Check if the NN is doing the validation properly
training_loop(epochs = epochs,batch_size = batch_size, data = data_dict, is_val_set=True)

The Validation loss is :2.1817123889923096


In [156]:
lossi_copy= lossi.copy()

In [157]:
# So we will create a tensor of lossi floats and we are going to stretch it out in a row format.
lossi_copy= lossi.copy()
lossi_copy = torch.tensor(lossi_copy).view(-1,500)
# So the lossi tensor would be 200 rows of 500 values  and now we can do the mean along the rows and this would make the shape of 200 rows
print(lossi_copy.shape)
# Now get the mean values of the lossi_copy tensor
lossi_copy = lossi_copy.mean(1)
fig = go.Figure()

fig.add_trace(go.Scatter(y  = lossi_copy.numpy(), name = "Loss"))
fig.update_layout(title = 'Loss Per epoch')

torch.Size([20, 500])


#### Now we would introduce the concept of containers in pytorch which simply put is a list of layers than instead of passing the layers in a naked list we would put them inside the container. and then we would pass our input through the container which would pass the input through all the layers and gives an output result.

In [7]:
# So the code changes substantially
n_embed = 10
n_hidden = 100
block_size = 3

# c = torch.randn(len(chartoidx), n_embed) # Instead of the embedding we would have the Embedding Layer in layers which we just added in this chapter

model = Sequential([
    Embedding(len(chartoidx), n_embed),
    Flatten(),
    Linear(n_embed*block_size, n_hidden),BatchNorm1D(n_hidden), Tanh(),
    Linear(n_hidden, n_hidden),BatchNorm1D(n_hidden), Tanh(),
    Linear(n_hidden, n_hidden),BatchNorm1D(n_hidden), Tanh(),
    Linear(n_hidden, n_hidden),BatchNorm1D(n_hidden), Tanh(),
    Linear(n_hidden, n_hidden),BatchNorm1D(n_hidden), Tanh(),
    Linear(n_hidden, len(chartoidx)),BatchNorm1D(len(chartoidx))
])


# with torch.no_grad():
#     # Because the last layer is batchNorm we would not be making the softmax less confident. We would make gamma less confident
#     layers[-1].gamma*= 0.1
#     # If we remove the batchnorm layer then we would have to make the softmax less confident
#     # Make the output softmax less confident
#     # layers[-1].weight *= 0.1

#     for layer in layers[:-1]:
#         # apply the gain parameters of the kaiming initialization
#         if isinstance(layer, Linear):
#             layer.weight *= 5/3
# calculate the parameters and make the parameters requires_grad attribute = True
parameter_list = model.parameters()

p = sum([x.nelement() for x in parameter_list])
print(p)
for p in parameter_list:
    p.requires_grad = True

47551


In [10]:
# Then the forward pass also changes
data_dict ={'train':[X_train, y_train], 
            'dev':[X_dev, y_dev],
            'test':[X_test, y_test]}
            
epochs = 10000
batch_size = 32
lossi = []
update_data_ratios = []
# Now we would write the training loop
def training_loop(epochs, batch_size, data, is_val_set = False):
    if is_val_set == False:
        for epoch in range(0,epochs):
            # get random 32 indices to take from the training set
            ix = torch.randint(0, data_dict['train'][0].shape[0], size=(32,)) # returns 32 random integers between 0 and len(X_train)
            data = data_dict['train'][0][ix] # This would get the actual training data of shape (32,3) i.e (batch_size, block_size) 
            x = data.clone()

            # The getting the embeddings and the flattening part have been added straight to the model
            # embeddings = c[data]  # This will get the embeddings of the each of the data i.e (32,3,10)

            # # Now we would convert the embedding tensor to a shape that will be used by the model

            # x = embeddings.view(embeddings.shape[0],-1) # This is the reshaped data that would be passed into the model

            # # # Now we would get the activations of the layers by inputting the data into our model
            # for layer in layers:
            #     x = layer(x)
            x = model(x)
            loss = F.cross_entropy(x, data_dict['train'][1][ix])

            # # Now do the backward pass and retain the gradients
            # for layer in layers:
            #     layer.out.retain_grad()
            # zero out the gradients
            # for p in parameter_list:
            #     p.grad = None
            
            # Do the backward propagation
            loss.backward()

            # Update the parameters
            lr = 0.1 if epoch<500000 else 0.01
            for p in model.parameters():
                p.data += -lr*p.grad
            
            # with torch.no_grad():
            #     update_data_ratios.append([(lr*p.grad.std()/p.data.std()).log10().item() for p in parameter_list])

            lossi.append(loss.item())
            if epoch%10000 == 0:
                # lossi.append(loss.item())
                print(f"epochs: {epoch:}/{epochs:} | loss :{loss.item():.4f}")
            # if epoch == 1000:
                # break
    else:
        # When we want to evaluate the trained network we need to set the batch_norm layers to evaluate mode:
        for layer in model.layers:
            layer.training = False
        with torch.no_grad():
            val_ix = torch.randint(0, data_dict['dev'][0].shape[0], (32,))  # this would generate 32 random indexes which we can filter our dataset to train over a mini-batch
            val_data = data_dict['dev'][0][val_ix]   # (32,3,2)
            x = val_data.clone()
            # x = val_embed.view(val_embed.shape[0], -1) # Also removed the embeddings and the falttening operations here

            output = model(x)
            val_loss = F.cross_entropy(output, data_dict['dev'][1][val_ix])
        print(f"The Validation loss is :{val_loss}") 

In [11]:
# Now we would run the Neural Network with the model as a list passed to a Sequential Container.
training_loop(epochs = epochs,batch_size = batch_size, data = data_dict, is_val_set=False)

epochs: 0/10000 | loss :3.5161


So we can see that our Sequential class works as expected and it gives us the output we need.

#### The wavenet Model
1. This model is a better model than the previous MLP which was squashing too much information too quickly. What this model does is that it impelements a window which takes two characters at a time and puts them side by side and then gives out the activations for those two. So it sort of takes bigrams and then in the next layer it takes two of those bigrams and so on. So it gradually squashes the information to predict the output and we will see if it has an effect on the predictions or not.
2. We would be taking 8 characters of context to predict the 9th character.

In [18]:
block_size = 8 # This would be used to set the number of context letters that would be in the word.
# blank_context = idxtochar[0] * block_size  # This would produce the context consisting of only special character "."
# print(f"The blank context is {blank_context}")

# Now we would need to split the names into train, dev and test sets
train_names = names[:int(0.8*len(names))]
dev_names = names[int(0.8*len(names)): int(0.9*len(names))]
test_names = names[int(0.9*len(names)):]

# Now we need to define our xs and ys
X_train, y_train = [],[]
X_dev, y_dev = [],[]
X_test, y_test = [], []

# Now we need to add the data into our xs and ys:
for i,split in enumerate([train_names, dev_names, test_names]):
    for word in split:
        word = word+"."
        blank_context = [0] * block_size  # This would produce the context consisting of only special character "."
        # xs.append(blank_context)
        if i == 0:
            for ch in word:
                X_train.append(blank_context)
                y_train.append(chartoidx[ch])
                blank_context = blank_context[1:] + [chartoidx[ch]]
        if i == 1:
            for ch in word:
                X_dev.append(blank_context)
                y_dev.append(chartoidx[ch])
                blank_context = blank_context[1:] + [chartoidx[ch]]
        if i == 2:
            for ch in word:
                X_test.append(blank_context)
                y_test.append(chartoidx[ch])
                blank_context = blank_context[1:] + [chartoidx[ch]]
      
        # print(blank_context)
X_train = torch.tensor(X_train)
y_train = torch.tensor(y_train)
X_dev = torch.tensor(X_dev)
y_dev = torch.tensor(y_dev)
X_test = torch.tensor(X_test)
y_test = torch.tensor(y_test)  
print(X_train.shape, y_train.shape)
print(X_dev.shape, y_dev.shape)
print(X_test.shape, y_test.shape)

torch.Size([182778, 8]) torch.Size([182778])
torch.Size([22633, 8]) torch.Size([22633])
torch.Size([22735, 8]) torch.Size([22735])


In [19]:
for x,y in zip(X_train[:20], y_train[:20]):
    print(''.join(idxtochar[ix.item()] for ix in x), '-->', idxtochar[y.item()])

........ --> e
.......e --> m
......em --> m
.....emm --> a
....emma --> .
........ --> o
.......o --> l
......ol --> i
.....oli --> v
....oliv --> i
...olivi --> a
..olivia --> .
........ --> a
.......a --> v
......av --> a
.....ava --> .
........ --> i
.......i --> s
......is --> a
.....isa --> b


The context has been changed for the item to 8 characters to predict the 9th character.

1. Now to implement the model as a wavenet model we need to take two consecutive characters and pass them to the model. The character embeddings should be concatenated and fed into the model to generate the predictions. 
2. Lets take an example like the characters indexes are (1,2,3,4,5,6,7,8). So the way that they should be fed into the model is ((1,2),(3,4),(5,6),(7,8)).
Lets see a toy example for the dataset.

In [8]:
# Get 4 random integers
X_s = torch.randint(0,X_train.shape[0], (4,))

# Now get the data from the Dataset
Xb, yb = X_train[X_s], y_train[X_s]

print(Xb.shape)
Xb

torch.Size([4, 8])


tensor([[ 0,  0,  0,  0,  0, 13, 15, 18],
        [ 0,  0,  0,  0,  0,  0,  0,  1],
        [ 0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  0,  0, 13,  9]])

1. The above shape is 4 examples of n=8 context characters. which would be used to predict the 9th character. But for the input to the wavenet model we need to concatenate the consecutive indexes like (1,2) and (3,4). So The (4,8) shape would change to (2,2,8) which would preserve the overall shape and would also make input required for the model. 
2. So for the inputs they need to flattened and concatenated in the 2nd dimension. We can do this by doing the view operation which does this for us. So we would modify our flatten layer definition to Flatten consecutive and it will take another parameter which is the number of consecutive dimensions to be flattened.

In [9]:
# We will se that the view operation will do exactly what we want and concatenate the consecutive letters indexes.
Xb.view(2,2,8)

tensor([[[ 0,  0,  0,  0,  0, 13, 15, 18],
         [ 0,  0,  0,  0,  0,  0,  0,  1]],

        [[ 0,  0,  0,  0,  0,  0,  0,  0],
         [ 0,  0,  0,  0,  0,  0, 13,  9]]])

In [10]:
embedding = Embedding(len(chartoidx), 10)
embedding(Xb).shape
f = FlattenConsecutive(2)
f(embedding(Xb)).shape

torch.Size([4, 4, 20])

So our embedding layer takes in indexes of the words and gives out an embedding vector for each index which would be of the shape (m,context_length, embedding_dim). Here m would be the number of examples in a batch, context_length would be the number of preceeding characters used to predict the next character and embedding_dim will be embedding dimension. Then we flatten it and push the tensor into the Neural Network which matrix multiplies to generate 200 channels.

The matrix multiply operator in pytorch is very good and it can multiply not only two dimensional vectors with each other but also multiplies higher dimensional Vectors.

In [16]:
torch.randn((4,5)) @ torch.randn((5,4)) + torch.randn((4))   # A 2D matrix multiply with a broadcasting operation

tensor([[-1.6363,  6.3076,  3.7587,  1.4628],
        [-0.3605,  2.7366,  2.0290,  0.8838],
        [-0.6808, -0.4368,  0.0679,  1.2652],
        [-0.6717,  3.0158,  0.3834,  0.9630]])

In [20]:
x = torch.randn((4,2,5)) @ torch.randn((5,4)) + torch.randn((4))   # A higher dimensional matrix multiplication
print(x.shape)  

torch.Size([4, 2, 4])


A higher dimensional matrix multiply works only when the last dimension of the matrix matches the 1st dimension of the other matrix and does not depend upon the other dimensions. For eg. a (4,2,5) matrix can only be multiplied by (5,4) matrix i.e the matrix multiplication works only on the last dimension and the dimensions before it are left unchanged.

We can use this in our network now to matrix multiply groups of bigrams and we would like to process these groups in parallel. Do we would process these 4 bigram groups in individual examples and also over the actual batch dimension.

This would need a flatten consecutive layer and a we would also change the linear layer.

In [11]:
# So the code changes substantially
n_embed = 10
n_hidden = 68
block_size = 8
consecutive_size = 2

# c = torch.randn(len(chartoidx), n_embed) # Instead of the embedding we would have the Embedding Layer in layers which we just added in this chapter

model = Sequential([
    Embedding(len(chartoidx), n_embed),
    FlattenConsecutive(consecutive_size),
    Linear(n_embed*consecutive_size, n_hidden),BatchNorm1D(n_hidden), Tanh(),
    FlattenConsecutive(2),
    Linear(n_hidden*consecutive_size, n_hidden),BatchNorm1D(n_hidden), Tanh(),
    FlattenConsecutive(2),
    Linear(n_hidden*consecutive_size, n_hidden),BatchNorm1D(n_hidden), Tanh(),
    Linear(n_hidden, n_hidden),BatchNorm1D(n_hidden), Tanh(),
    Linear(n_hidden, n_hidden),BatchNorm1D(n_hidden), Tanh(),
    Linear(n_hidden, len(chartoidx))#,BatchNorm1D(len(chartoidx))
])


with torch.no_grad():
#     # Because the last layer is batchNorm we would not be making the softmax less confident. We would make gamma less confident
#     model.layers[-1].gamma*= 0.1
#     # If we remove the batchnorm layer then we would have to make the softmax less confident
#     # Make the output softmax less confident
    model.layers[-1].weight *= 0.1

#     for layer in layers[:-1]:
#         # apply the gain parameters of the kaiming initialization
#         if isinstance(layer, Linear):
#             layer.weight *= 5/3
# calculate the parameters and make the parameters requires_grad attribute = True
parameter_list = model.parameters()

p = sum([x.nelement() for x in parameter_list])
print(p)
for p in parameter_list:
    p.requires_grad = True

32257


In [12]:
# Get 4 random integers
X_s = torch.randint(0,X_train.shape[0], (4,))

# Now get the data from the Dataset
Xb, yb = X_train[X_s], y_train[X_s]
out = model(Xb)

In [13]:
# Lets inspect the layers of the model and their shapes
for layer in model.layers:
    print(layer.__class__.__name__,':',tuple(layer.out.shape))

Embedding : (4, 8, 10)
FlattenConsecutive : (4, 4, 20)
Linear : (4, 4, 68)
BatchNorm1D : (4, 4, 68)
Tanh : (4, 4, 68)
FlattenConsecutive : (4, 2, 136)
Linear : (4, 2, 68)
BatchNorm1D : (4, 2, 68)
Tanh : (4, 2, 68)
FlattenConsecutive : (4, 136)
Linear : (4, 68)
BatchNorm1D : (4, 68)
Tanh : (4, 68)
Linear : (4, 68)
BatchNorm1D : (4, 68)
Tanh : (4, 68)
Linear : (4, 68)
BatchNorm1D : (4, 68)
Tanh : (4, 68)
Linear : (4, 27)


In [14]:
# lets train the new model
# Then the forward pass also changes
data_dict ={'train':[X_train, y_train], 
            'dev':[X_dev, y_dev],
            'test':[X_test, y_test]}
            
epochs = 100000
batch_size = 32
lossi = []
update_data_ratios = []
# Now we would write the training loop
def training_loop(epochs, batch_size, data, is_val_set = False):
    if is_val_set == False:
        for epoch in range(0,epochs):
            # get random 32 indices to take from the training set
            ix = torch.randint(0, data_dict['train'][0].shape[0], size=(32,)) # returns 32 random integers between 0 and len(X_train)
            data = data_dict['train'][0][ix] # This would get the actual training data of shape (32,3) i.e (batch_size, block_size) 
            x = data.clone()
            # The getting the embeddings and the flattening part have been added straight to the model
            # embeddings = c[data]  # This will get the embeddings of the each of the data i.e (32,3,10)

            # # Now we would convert the embedding tensor to a shape that will be used by the model

            # x = embeddings.view(embeddings.shape[0],-1) # This is the reshaped data that would be passed into the model

            # # # Now we would get the activations of the layers by inputting the data into our model
            # for layer in layers:
            #     x = layer(x)
            out = model(x)
            loss = F.cross_entropy(out, data_dict['train'][1][ix])

            # # Now do the backward pass and retain the gradients
            # for layer in layers:
            #     layer.out.retain_grad()
            # zero out the gradients
            # for p in parameter_list:
            #     p.grad = None
            
            # Do the backward propagation
            loss.backward()

            # Update the parameters
            lr = 0.1 if epoch<500000 else 0.01
            for p in model.parameters():
                p.data += -lr*p.grad
            
            # with torch.no_grad():
            #     update_data_ratios.append([(lr*p.grad.std()/p.data.std()).log10().item() for p in parameter_list])

            lossi.append(loss.item())
            if epoch%10000 == 0:
                # lossi.append(loss.item())
                print(f"epochs: {epoch:}/{epochs:} | loss :{loss.item():.4f}")
            # if epoch == 1000:
            #     break
    else:
        # When we want to evaluate the trained network we need to set the batch_norm layers to evaluate mode:
        for layer in model.layers:
            layer.training = False
        with torch.no_grad():
            val_ix = torch.randint(0, data_dict['dev'][0].shape[0], (32,))  # this would generate 32 random indexes which we can filter our dataset to train over a mini-batch
            val_data = data_dict['dev'][0][val_ix]   # (32,3,2)
            x = val_data.clone()
            # x = val_embed.view(val_embed.shape[0], -1) # Also removed the embeddings and the falttening operations here

            output = model(x)
            val_loss = F.cross_entropy(output, data_dict['dev'][1][val_ix])
        print(f"The Validation loss is :{val_loss}") 

In [15]:
training_loop(epochs = epochs,batch_size = batch_size, data = data_dict, is_val_set=False)

epochs: 0/100000 | loss :3.3042
epochs: 10000/100000 | loss :1632.2870
epochs: 20000/100000 | loss :7094.0078
epochs: 30000/100000 | loss :6695.6665
epochs: 40000/100000 | loss :7129.4634
epochs: 50000/100000 | loss :11402.9648
epochs: 60000/100000 | loss :7932.8296
epochs: 70000/100000 | loss :5537.4106
epochs: 80000/100000 | loss :9695.0410
epochs: 90000/100000 | loss :14758.2354


In [16]:
lossi_copy= lossi.copy()
lossi_copy = torch.tensor(lossi_copy).view(-1,500)
# So the lossi tensor would be 200 rows of 500 values  and now we can do the mean along the rows and this would make the shape of 200 rows
print(lossi_copy.shape)
# Now get the mean values of the lossi_copy tensor
lossi_copy = lossi_copy.mean(1)
fig = go.Figure()

fig.add_trace(go.Scatter(y  = lossi_copy.numpy(), name = "Loss"))
fig.update_layout(title = 'Loss Per epoch')

torch.Size([200, 500])


1. So the problem that we are having due to which the loss in increasing is that the BatchNorm layer is not implemented to tackle 3D input (B,T,C) rather we had only implemented it to calculate 2D inputs.
2. So the problem stems when we are calcualting the mean of the input of the batchNorm, We input a (B,T,C) tensor and when the mean is calculated  on dim = 0 we get (1,T,C). So when the mean is broadcasted everything works without getting any error but we do not get the correct answer. The correct answer should be (1,1,C) which is the case for a 2D input we get a (B,C) tensor and when calculating the mean we get (1,C) output. So we will change the batchNorm layer implementation and do a BatchNorm2D layer which will be similar to BatchNorm1D but calculates other mean.

In [16]:
# So the code changes substantially
n_embed = 10
n_hidden = 200
block_size = 8
consecutive_size = 2

# c = torch.randn(len(chartoidx), n_embed) # Instead of the embedding we would have the Embedding Layer in layers which we just added in this chapter

model = Sequential([
    Embedding(len(chartoidx), n_embed),
    FlattenConsecutive(2),Linear(n_embed*2, n_hidden),BatchNorm1D(n_hidden), Tanh(),
    FlattenConsecutive(2),Linear(n_hidden*2, n_hidden),BatchNorm1D(n_hidden), Tanh(),
    FlattenConsecutive(2),Linear(n_hidden*2, n_hidden),BatchNorm1D(n_hidden), Tanh(),
    # Linear(n_hidden, n_hidden, bias = False),BatchNorm1D(n_hidden), Tanh(),
    # Linear(n_hidden, n_hidden),BatchNorm1D(n_hidden), Tanh(),
    Linear(n_hidden, len(chartoidx))#,BatchNorm1D(len(chartoidx))
])


with torch.no_grad():
#     # Because the last layer is batchNorm we would not be making the softmax less confident. We would make gamma less confident
    # model.layers[-1].gamma*= 0.1
#     # If we remove the batchnorm layer then we would have to make the softmax less confident
#     # Make the output softmax less confident
    model.layers[-1].weight *= 0.1

#     for layer in layers[:-1]:
#         # apply the gain parameters of the kaiming initialization
#         if isinstance(layer, Linear):
#             layer.weight *= 5/3
# calculate the parameters and make the parameters requires_grad attribute = True
parameter_list = model.parameters()
print([type(x) for x in parameter_list])

p = sum([x.nelement() for x in parameter_list])
print(p)
for p in parameter_list:
    p.requires_grad = True

[<class 'torch.Tensor'>, <class 'torch.Tensor'>, <class 'torch.Tensor'>, <class 'torch.Tensor'>, <class 'torch.Tensor'>, <class 'torch.Tensor'>, <class 'torch.Tensor'>, <class 'torch.Tensor'>, <class 'torch.Tensor'>, <class 'torch.Tensor'>, <class 'torch.Tensor'>, <class 'torch.Tensor'>, <class 'torch.Tensor'>, <class 'torch.Tensor'>, <class 'torch.Tensor'>]
171497


In [48]:
# Get 4 random integers
X_s = torch.randint(0,X_train.shape[0], (4,))

# Now get the data from the Dataset
Xb, yb = X_train[X_s], y_train[X_s]
out = model(Xb)

In [49]:
# Lets inspect the layers of the model and their shapes
for layer in model.layers:
    print(layer.__class__.__name__,':',tuple(layer.out.shape))

Embedding : (4, 8, 10)
FlattenConsecutive : (4, 4, 20)
Linear : (4, 4, 200)
BatchNorm2D : (4, 4, 200)
Tanh : (4, 4, 200)
FlattenConsecutive : (4, 2, 400)
Linear : (4, 2, 200)
BatchNorm2D : (4, 2, 200)
Tanh : (4, 2, 200)
FlattenConsecutive : (4, 400)
Linear : (4, 200)
BatchNorm1D : (4, 200)
Tanh : (4, 200)
Linear : (4, 200)
BatchNorm1D : (4, 200)
Tanh : (4, 200)
Linear : (4, 27)
BatchNorm1D : (4, 27)


In [20]:
# lets train the new model
# Then the forward pass also changes
data_dict ={'train':[X_train, y_train], 
            'dev':[X_dev, y_dev],
            'test':[X_test, y_test]}
            
epochs = 100000
batch_size = 32
lossi = []
update_data_ratios = []
# Now we would write the training loop
def training_loop(epochs, batch_size, data, is_val_set = False):
    if is_val_set == False:
        for epoch in range(0,epochs):
            # get random 32 indices to take from the training set
            ix = torch.randint(0, data_dict['train'][0].shape[0], size=(32,)) # returns 32 random integers between 0 and len(X_train)
            data = data_dict['train'][0][ix] # This would get the actual training data of shape (32,3) i.e (batch_size, block_size) 
            x = data.clone()
            # The getting the embeddings and the flattening part have been added straight to the model
            # embeddings = c[data]  # This will get the embeddings of the each of the data i.e (32,3,10)

            # # Now we would convert the embedding tensor to a shape that will be used by the model

            # x = embeddings.view(embeddings.shape[0],-1) # This is the reshaped data that would be passed into the model

            # # # Now we would get the activations of the layers by inputting the data into our model
            # for layer in layers:
            #     x = layer(x)
            out = model(x)
            loss = F.cross_entropy(out, data_dict['train'][1][ix])

            # # Now do the backward pass and retain the gradients
            # for layer in layers:
            #     layer.out.retain_grad()
            # zero out the gradients
            # for p in parameter_list:
            #     p.grad = None
            
            # Do the backward propagation
            loss.backward()

            # Update the parameters
            lr = 0.1 if epoch<500000 else 0.01
            for p in model.parameters():
                p.data += -lr*p.grad
            
            # with torch.no_grad():
            #     update_data_ratios.append([(lr*p.grad.std()/p.data.std()).log10().item() for p in parameter_list])

            lossi.append(loss.item())
            if epoch%10000 == 0:
                # lossi.append(loss.item())
                print(f"epochs: {epoch:}/{epochs:} | loss :{loss.item():.4f}")
            # if epoch == 1000:
            #     break
    else:
        # When we want to evaluate the trained network we need to set the batch_norm layers to evaluate mode:
        for layer in model.layers:
            layer.training = False
        with torch.no_grad():
            val_ix = torch.randint(0, data_dict['dev'][0].shape[0], (32,))  # this would generate 32 random indexes which we can filter our dataset to train over a mini-batch
            val_data = data_dict['dev'][0][val_ix]   # (32,3,2)
            x = val_data.clone()
            # x = val_embed.view(val_embed.shape[0], -1) # Also removed the embeddings and the falttening operations here

            output = model(x)
            val_loss = F.cross_entropy(output, data_dict['dev'][1][val_ix])
        print(f"The Validation loss is :{val_loss}") 

In [21]:
training_loop(epochs = epochs,batch_size = batch_size, data = data_dict, is_val_set=False)

epochs: 0/100000 | loss :3.3162
epochs: 10000/100000 | loss :8848.8838
epochs: 20000/100000 | loss :18210.4688
epochs: 30000/100000 | loss :24337.4453
epochs: 40000/100000 | loss :28440.7832


KeyboardInterrupt: 