In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go

In [2]:
with open("D://Datasets/names.txt", 'r') as file:
    names = file.read().splitlines()
names[:8]
torch.manual_seed(42)   # Here we set the global seed value
vocabulary = sorted(list(set(''.join(names))))
chartoidx = {}
idxtochar = {}
chartoidx['.'] = 0   # Putting a special token to denote the start and the end of a sentence.
idxtochar[0] = '.'
for i,char in enumerate(vocabulary):
    chartoidx[char] = i+1
    idxtochar[i+1] = char

print(chartoidx)
print(len(chartoidx))

{'.': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26}
27


In [3]:
# So we would convert our code into modules like the Linear Module and  the BatchNorm Module and then we would write the training loop pytorch style
class Linear():
    """ This class creates a Linear layer with the weights and biases and performs the calculations"""
    def __init__(self, fan_in, fan_out, bias = True):
        self.weight = torch.randn((fan_in, fan_out)) / (fan_in**0.5)
        self.bias = torch.zeros(fan_out) if bias else None
    
    def __call__(self, x):
        self.out = x @ self.weight
        if self.bias is not None:
            self.out += self.bias
        return self.out

    def parameters(self):
        return [self.weight, [] if self.bias == None else self.bias]
# ------------------------------------------------------------------------------------------------------------------------------------------------
class BatchNorm1D():
    """ This class creates a Batch Normalization layer"""
    def __init__(self, dim, epsilon = 1e-5, momentum = 0.1):
        self.epsilon = epsilon
        self.momentum = momentum
        self.training = True
        # Initializing learnable paramters
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)

        # Buffers trained with a momentum update
        self.running_mean = torch.zeros(dim)
        self.running_variance = torch.ones(dim)

    def __call__(self, X):
        if X.ndim == 2:
            dim = 0
        if X.ndim == 3:
            dim = (0,1)
        # Calculate the forward pass
        if self.training:
            xmean = X.mean(dim, keepdim=True)
            xvar = X.var(dim,keepdim = True, unbiased = True)
        else:
            xmean = self.running_mean
            xvar = self.running_variance

        xhat = (X-xmean)/ torch.sqrt(xvar+self.epsilon)
        self.out = self.gamma * xhat + self.beta

        # Now update the running mean buffers
        if self.training:
            with torch.no_grad():
                self.running_mean = (1-self.momentum) * self.running_mean + self.momentum * xmean
                self.running_variance = (1-self.momentum)* self.running_variance + self.momentum * xvar
        return self.out

    def parameters(self):
        return [self.gamma, self.beta]
# ---------------------------------------------------------------------------------------------------------------------------------------------------
class Tanh():
    def __call__(self,X):
        self.out = torch.tanh(X)
        return self.out
    def parameters(self):
        return []

# -------------------------------------------------------------------------------------------------------------------------------------------------------------------------
class Embedding():
    def __init__(self, num_embeddings, embedding_dim):
        self.weight = torch.randn((num_embeddings, embedding_dim))
        
    def __call__(self,IX):
        self.out = self.weight[IX]
        return self.out

    def parameters(self):
        return [self.weight]
#--------------------------------------------------------------------------------------------------------------------------------------------------------------------------
class Flatten():
    def __call__(self, X):
        self.out =  X.view(X.shape[0], -1)
        return self.out
    def parameters(self):
        return []

# --------------------------------------------------------------------------------------------------------------------------------------------------------------------------
class Sequential():
    def __init__(self, layers):
        self.layers = layers

    def __call__(self, X):
        for layer in self.layers:
            X = layer(X)
        self.out = X
        return self.out

    def parameters(self):
         return [p for layer in self.layers for p in layer.parameters()]
            

# ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
class FlattenConsecutive():
    def __init__(self, n):
        """ n is the how many consecutive tensors you want concatenated in the output"""
        self.n = n
    def __call__(self, X):
        B, T, C = tuple(X.shape)   # x is a 3D tensor
        X = X.view(B, T//self.n, C*self.n)
        # Sometimes T==n it creates a 1 at dim = 1 in the self.out tensor. So we would remove the extra dimension by tensor.squeeze in that case
        if X.shape[1] == 1:
            X = X.squeeze(dim=1)
        self.out = X
        return self.out
    def parameters(self):
        return []
        

In [4]:
block_size = 8 # This would be used to set the number of context letters that would be in the word.
# blank_context = idxtochar[0] * block_size  # This would produce the context consisting of only special character "."
# print(f"The blank context is {blank_context}")

# Now we would need to split the names into train, dev and test sets
train_names = names[:int(0.8*len(names))]
dev_names = names[int(0.8*len(names)): int(0.9*len(names))]
test_names = names[int(0.9*len(names)):]

# Now we need to define our xs and ys
X_train, y_train = [],[]
X_dev, y_dev = [],[]
X_test, y_test = [], []

# Now we need to add the data into our xs and ys:
for i,split in enumerate([train_names, dev_names, test_names]):
    for word in split:
        word = word+"."
        blank_context = [0] * block_size  # This would produce the context consisting of only special character "."
        # xs.append(blank_context)
        if i == 0:
            for ch in word:
                X_train.append(blank_context)
                y_train.append(chartoidx[ch])
                blank_context = blank_context[1:] + [chartoidx[ch]]
        if i == 1:
            for ch in word:
                X_dev.append(blank_context)
                y_dev.append(chartoidx[ch])
                blank_context = blank_context[1:] + [chartoidx[ch]]
        if i == 2:
            for ch in word:
                X_test.append(blank_context)
                y_test.append(chartoidx[ch])
                blank_context = blank_context[1:] + [chartoidx[ch]]
      
        # print(blank_context)
X_train = torch.tensor(X_train)
y_train = torch.tensor(y_train)
X_dev = torch.tensor(X_dev)
y_dev = torch.tensor(y_dev)
X_test = torch.tensor(X_test)
y_test = torch.tensor(y_test)  
print(X_train.shape, y_train.shape)
print(X_dev.shape, y_dev.shape)
print(X_test.shape, y_test.shape)

torch.Size([182778, 8]) torch.Size([182778])
torch.Size([22633, 8]) torch.Size([22633])
torch.Size([22735, 8]) torch.Size([22735])


In [20]:
# So the code changes substantially
n_embed = 10
n_hidden = 128
block_size = 8
consecutive_size = 2

model = Sequential([
    Embedding(len(chartoidx), n_embed),
    FlattenConsecutive(2),Linear(n_embed*2, n_hidden),BatchNorm1D(n_hidden), Tanh(),
    FlattenConsecutive(2),Linear(n_hidden*2, n_hidden),BatchNorm1D(n_hidden), Tanh(),
    FlattenConsecutive(2),Linear(n_hidden*2, n_hidden),BatchNorm1D(n_hidden), Tanh(),
    Linear(n_hidden, n_hidden),BatchNorm1D(n_hidden), Tanh(),
    Linear(n_hidden, n_hidden),BatchNorm1D(n_hidden), Tanh(),
    Linear(n_hidden, len(chartoidx)),BatchNorm1D(len(chartoidx))
])
# model = Sequential([
#     Embedding(len(chartoidx), n_embed),
#     Flatten(),
#     Linear(n_embed*block_size, n_hidden),BatchNorm1D(n_hidden), Tanh(),
#     Linear(n_hidden, n_hidden),BatchNorm1D(n_hidden), Tanh(),
#     # Linear(n_hidden, n_hidden),BatchNorm1D(n_hidden), Tanh(),
#     # Linear(n_hidden, n_hidden),BatchNorm1D(n_hidden), Tanh(),
#     # Linear(n_hidden, n_hidden),BatchNorm1D(n_hidden), Tanh(),
#     Linear(n_hidden, len(chartoidx))#,BatchNorm1D(len(chartoidx))
# ])

with torch.no_grad():
#     # Because the last layer is batchNorm we would not be making the softmax less confident. We would make gamma less confident
    model.layers[-1].gamma*= 0.1
#     # If we remove the batchnorm layer then we would have to make the softmax less confident
#     # Make the output softmax less confident
    model.layers[-2].weight *= 0.1

    for layer in model.layers[:-1]:
        # apply the gain parameters of the kaiming initialization
        if isinstance(layer, Linear):
            layer.weight *= 5/3
# calculate the parameters and make the parameters requires_grad attribute = True
parameter_list = model.parameters()
# print([type(x) for x in parameter_list])

p = sum([x.nelement() for x in parameter_list])
print(p)
for p in parameter_list:
    p.requires_grad = True

106591


In [27]:
for layer in model.layers:
    print(layer.__class__.__name__,':',tuple(layer.out.shape))

Embedding : (32, 8, 10)
Flatten : (32, 80)
Linear : (32, 200)
BatchNorm1D : (32, 200)
Tanh : (32, 200)
Linear : (32, 200)
BatchNorm1D : (32, 200)
Tanh : (32, 200)
Linear : (32, 200)
BatchNorm1D : (32, 200)
Tanh : (32, 200)
Linear : (32, 200)
BatchNorm1D : (32, 200)
Tanh : (32, 200)
Linear : (32, 200)
BatchNorm1D : (32, 200)
Tanh : (32, 200)
Linear : (32, 27)


In [83]:
data_dict ={'train':[X_train, y_train], 
            'dev':[X_dev, y_dev],
            'test':[X_test, y_test]}
            
epochs = 100000
batch_size = 32
lossi = []
update_data_ratios = []
# Now we would write the training loop
def training_loop(epochs, batch_size, data, is_val_set = False):
    if is_val_set == False:
        for epoch in range(0,epochs):
            # get random 32 indices to take from the training set
            ix = torch.randint(0, data_dict['train'][0].shape[0], size=(32,)) # returns 32 random integers between 0 and len(X_train)
            train_data = data_dict['train'][0][ix] # This would get the actual training data of shape (32,3) i.e (batch_size, block_size) 
            train_labels = data_dict['train'][1][ix]
            # print(batch_data.shape)
            # print(train_data.shape, train_labels.shape)
            out = model(train_data)
            # print(out.shape)
            loss = F.cross_entropy(out, train_labels)
            # print(loss, loss.shape)
            for layer in model.layers:
                layer.out.retain_grad()

            loss.backward()

            

            # Update the parameters
            lr = 0.1 if epoch<500000 else 0.01
            for p in model.parameters():
                p.data += -lr*p.grad

            lossi.append(loss.item())
            if epoch%10000 == 0:
                print(f"epochs: {epoch:}/{epochs:} | loss :{loss.item():.4f}")
            if epoch == 10000:
                break
    else:
        # When we want to evaluate the trained network we need to set the batch_norm layers to evaluate mode:
        for layer in model.layers:
            layer.training = False
        with torch.no_grad():
            val_ix = torch.randint(0, data_dict['dev'][0].shape[0], (32,))  # this would generate 32 random indexes which we can filter our dataset to train over a mini-batch
            val_data = data_dict['dev'][0][val_ix]   # (32,3,2)
            x = val_data.clone()
            # x = val_embed.view(val_embed.shape[0], -1) # Also removed the embeddings and the falttening operations here

            output = model(x)
            val_loss = F.cross_entropy(output, data_dict['dev'][1][val_ix])
        print(f"The Validation loss is :{val_loss}") 

In [84]:
training_loop(epochs = epochs,batch_size = batch_size, data = data_dict, is_val_set=False)

epochs: 0/100000 | loss :105.6316
epochs: 10000/100000 | loss :714.4097


In [7]:
# Get 4 random integers
X_s = torch.randint(0,X_train.shape[0], (32,))

# Now get the data from the Dataset
Xb, yb = X_train[X_s], y_train[X_s]
# embedding = Embedding(27,10)
# flatten = Flatten()
# batch = BatchNorm1D(200)
# ta = Tanh()
# e_Xb = embedding(Xb)
# f_Xb = flatten(e_Xb)
# print(f_Xb)
print(Xb.shape)
# print(Xb.shape)
# print(Xb)

torch.Size([32, 8])


In [41]:
activation = d.clone()
print(activation.shape)

torch.Size([32, 200])


In [23]:
from time import sleep

In [21]:
data_dict ={'train':[X_train, y_train], 
            'dev':[X_dev, y_dev],
            'test':[X_test, y_test]}
lossi = []
for i in range(100000):
    X_s = torch.randint(0,X_train.shape[0], (32,)) # returns 32 random integers between 0 and len(X_train)
    Xb, yb = X_train[X_s], y_train[X_s]
    logits = model(Xb)
    loss = F.cross_entropy(logits, yb)
    for layer in model.layers:
        layer.out.retain_grad()
    
    loss.backward()
    
    # print(loss)
    lr = 0.001 if i < 30000 else 0.0001
    for p in model.parameters():
        # print(p.shape)
        p.data += -lr*p.grad
    lossi.append(loss.item())
    # print(loss.item())
    if i%10000 == 0:
        print(f"epochs: {i:}/100000: | loss :{loss.item():.4f}")

epochs: 0/100000: | loss :3.3002
epochs: 10000/100000: | loss :2.1854
epochs: 20000/100000: | loss :2.5019
epochs: 30000/100000: | loss :2.8474
epochs: 40000/100000: | loss :2.0885
epochs: 50000/100000: | loss :2.8248
epochs: 60000/100000: | loss :2.2845
epochs: 70000/100000: | loss :2.7587
epochs: 80000/100000: | loss :2.4373
epochs: 90000/100000: | loss :3.0283


In [22]:
lossi_copy= lossi.copy()
lossi_copy = torch.tensor(lossi_copy).view(-1,500)
# So the lossi tensor would be 200 rows of 500 values  and now we can do the mean along the rows and this would make the shape of 200 rows
print(lossi_copy.shape)
# Now get the mean values of the lossi_copy tensor
lossi_copy = lossi_copy.mean(1)
fig = go.Figure()

fig.add_trace(go.Scatter(y  = lossi_copy.numpy(), name = "Loss"))
fig.update_layout(title = 'Loss Per epoch')

torch.Size([200, 500])


In [23]:
import plotly.graph_objects as go

fig = go.Figure()
for i,layer in enumerate(model.layers[:-1]):
    if isinstance(layer, Tanh):
        l = f"Layer {i}"
        d = layer.out.detach()
        
        hy,hx = torch.histogram(d, density=True)
        fig.add_trace(go.Scatter(x = hx[:-1], y=hy, name = l))
        print(f"{l} | mean = {d.mean()} | std = {d.std()} | % Saturated = {(d.abs()>0.97).float().mean() * 100} %")

fig.update_layout(legend=dict(
    yanchor="top",
    y=0.99,
    xanchor="center",
    x=0.50,
    title = 'Layers'
), title = "Tanh Activations")
fig.show()

Layer 4 | mean = 0.02272501401603222 | std = 0.9961785674095154 | % Saturated = 98.809814453125 %
Layer 8 | mean = 0.011339999735355377 | std = 0.9909374117851257 | % Saturated = 96.35009765625 %
Layer 12 | mean = -0.005315559916198254 | std = 0.9917203783988953 | % Saturated = 96.4111328125 %
Layer 15 | mean = 0.07069593667984009 | std = 0.9906419515609741 | % Saturated = 97.0947265625 %
Layer 18 | mean = -0.023318111896514893 | std = 0.9950968027114868 | % Saturated = 97.8515625 %


In [24]:
fig = go.Figure()
for i,layer in enumerate(model.layers[:-1]):
    if isinstance(layer, Tanh):
        l = f"Layer {i}"
        d = layer.out.grad.detach()
        print(f"{l} | mean = {d.mean()} | std = {d.std()}")
        hy,hx = torch.histogram(d, density=True)
        fig.add_trace(go.Scatter(x = hx[:-1], y=hy, name = l))

fig.update_layout(legend=dict(
    yanchor="top",
    y=0.99,
    xanchor="right",
    x=0.99,
    title = 'Layers'
), title = "Tanh Layer Gradients")

fig.show()

Layer 4 | mean = -2.518163455533795e-11 | std = 0.01995280385017395
Layer 8 | mean = 3.169020601490047e-12 | std = 0.010276095010340214
Layer 12 | mean = -9.485745522397337e-12 | std = 0.007046087179332972
Layer 15 | mean = -6.579625733138528e-12 | std = 0.0049035935662686825
Layer 18 | mean = 8.981260180007666e-12 | std = 0.004913329612463713


In [35]:
training_loop(epochs = epochs,batch_size = batch_size, data = data_dict, is_val_set=True)

The Validation loss is :44534.5078125
