## Imports

In [1]:
import pickle

In [2]:
import torch
import torch.nn as nn

In [3]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [4]:
import gc

#### Load the tensors

In [5]:
def load_pickle(f_name):
    with open(f_name,"rb") as f:
        obj = pickle.load(f)
    
    return obj

In [6]:
train_text_tensors = load_pickle("/kaggle/input/converting-text-to-tensors-ipynb/train_text_tensors.pkl")

In [7]:
test_text_tensors = load_pickle("/kaggle/input/converting-text-to-tensors-ipynb/test_text_tensors.pkl")

In [8]:
train_labels_tensors = load_pickle("/kaggle/input/converting-text-to-tensors-ipynb/train_labels_tensors")

In [9]:
test_labels_tensors = load_pickle("/kaggle/input/converting-text-to-tensors-ipynb/test_labels_tensors")

## Custom Dataset and DataLoaders

In [10]:
class TextLabelDataset(Dataset):
    def __init__(self,text_tensor,label_tensor):
        self.text_tensor = text_tensor
        self.label_tensor = label_tensor
    
    def __len__(self):
        return len(self.text_tensor)
    
    def __getitem__(self,idx):
        return self.text_tensor[idx],self.label_tensor[idx]

In [11]:
train_dataset = TextLabelDataset(train_text_tensors,train_labels_tensors)

In [12]:
del train_text_tensors
del train_labels_tensors
gc.collect

<function gc.collect(generation=2)>

In [13]:
test_dataset = TextLabelDataset(test_text_tensors,test_labels_tensors)

In [14]:
del test_text_tensors
del test_labels_tensors
gc.collect()

28

In [15]:
train_loader = DataLoader(train_dataset,batch_size=32,shuffle=True)

In [16]:
del train_dataset
gc.collect()

0

In [17]:
test_loader = DataLoader(test_dataset,batch_size=32,shuffle=True)

In [18]:
del test_dataset
gc.collect()

0

In [19]:
##having a sneak at one train batch

for idx,data in enumerate(train_loader):
    text_tensor,label_tensor = data
    
    print(f"Shape of text tensor {text_tensor.shape}")
    print(f"Shape of label tensor {label_tensor.shape}")
    break

Shape of text tensor torch.Size([32, 16788])
Shape of label tensor torch.Size([32, 3])


## Building a Custom Model using Model parallelism

In [20]:
class TextClassification(nn.Module):
    
    def __init__(self,vocab_size,embed_dim,hidden_dim):
        
        super().__init__()
    
        self.embed = nn.Embedding(vocab_size,hidden_dim,padding_idx=0)
        self.gelu1 = nn.GELU()
        self.linear = nn.Linear(hidden_dim,32)
        self.gelu2 = nn.GELU()
        self.final = nn.Linear(32,3)#since there are 6 classes in the dataset
    
    def forward(self,text_tensor):
        output = self.embed(text_tensor)
        output = self.gelu1(output)
        output = self.linear(output)
        output = self.gelu2(output)
        output = self.final(output)
        
        return output

In [21]:
model = TextClassification(vocab_size = 16772,embed_dim=512,hidden_dim=128)

In [22]:
loss = nn.CrossEntropyLoss()

In [23]:
## test forward pass for one batch
##having a sneak at one train batch

for idx,data in enumerate(train_loader):
    text_tensor,label = data
    text_tensor = text_tensor.to(torch.int)
    label = label.to(torch.long)
    output = model(text_tensor)
    
    loss_val = loss(output,label)
    
    print(loss_val.item())
    break

9.697684288024902


### Okay then! we can proceed with writing the training loop

##### But wait a min!! we still have bunch of things left to do

# Set the loss function:

Since we are dealing with a classification problem here , we will use cross entopy loss function
which basically calculates the entropy between the probability distribuition which is outputed by the model and the gold probability distribuition.

The more similar the probability distribution the less will the entropy i.e the loss

> Foumula for binary classification for a single instance
loss = -(p(gold output)*log(p(predicted output)) - (1-p(gold output))*log(1-p(predicted output)))

> The above can be be generalized for multiple classes
loss = -sum for each class(p(gold output)*log(p(predicted output)))


In [24]:
loss_fn = nn.CrossEntropyLoss()

# Set the optimizer

The optimizer decides how the gradient will be updated in backpropagation.Some of the commonly known optimizers are :

> 1) SGD : This is the same as mini batch gradient descent .w_new = W_old - grad_of_loss_wrt_w

> 2) SGD with momentum : Now SGD can get stuck it local minima,this borrows the intuition of momentum in physics, where the momentum due to previous gradient updates can push the parameter out of its local minima and lead towards global minima.  w_new = W_old + momentum, where momentum = function(grad_wrt_w_old, previous momentum)

> 3) Adagrad : Often times, using only momentum might be slow, as the gradients of many paprameters might tend to zero and not contirbute to gradient descent. Here we maintain the sums of squares of gradient and divide the learning rate of each paramater by the square root of the accualted sums of gradients for that particular parameter. Hence unlike momentum , different parameters will have different learning rates , which is better.

> 4) RMSProp : Since Adagrad uses squares of gradients, it can accumulate gradients too quickly and thus cause the learning rates to drop too rapidly. RMSProp thus handles the problem of Adagrad.

> 5) Adam : Adam basically combines the adavantages of momentum and rmsProp. With momentum it acheives the speed of convergence and with rmsprop every parameter gets to have its own learning rate.


In [25]:
#setting sgd with momentum
optimizer = torch.optim.SGD(model.parameters(),lr=0.001,momentum=0.9)


In [26]:
def training_for_one_epoch(dataloader,epoch_nos):
    '''
        One epoch will do forward pass and backward pass for the model for the entire dataset for one iteration.
        For convergence, we would have do forward and backward pass multiple times for the dataset

    '''
    loss_for_the_epoch = 0
    
    #iterate through the dataset in batches, i.e we will at a time load the chuncks(of size = batch size) of dataset
    for idx,data in enumerate(dataloader):
        
        input_tensors,label_tensors = data
        
        #refer to the notebook on autograd in the repository and recall how we calculate gradients.
        #optimizer will store the .grad value of all the parameters, hence at the start of our iteration, we will set the grads to 0
        #this is important, as we do not want to accumulate the gradients for each batch.
        #since we are doing sgd, we will update the gradients(via backprop) at the end of each batch and clear the gradients at the begining of each batch
        
        optimizer.zero_grad()
        
        #get the ouput of the model by doing forward pass
        model_output = model(input_tensors)
        
        #compute the loss
        loss = loss_fn(model_output,label_tensors)
        
        #now that we have the loss!, we would want to update all the parameters of our model by backprop
        #and recall this step from the autograd notebook as well
        #hence this step will calculate and update the .grad values of all the parameters in our model
        loss.backward()
        
        
        #and now that we have the gradient values of all our parameter, we would do gradient descent for the parameters. i.e upate the parameter values
        optimizer.step()
        
        loss_for_the_epoch += loss.item() 
        
        #print the loss for every 500th batch
        if i % 500 == 0:
            curr_loss = loss_for_the_epoch/500 # loss per batch
            print('  batch {} loss: {}'.format(i + 1, curr_loss))
    
    
    return loss_for_the_epoch/len(dataloader)
        
        
        
    

## Wuhunn!! now we can actually train our model