In [1]:
import torch 
import torch.nn as nn
from torch.nn import MultiheadAttention
from torch.optim import SGD,lr_scheduler
from torch.utils.data import DataLoader

import torchtext

import datasets
import numpy as np
from tqdm import tqdm 

from collections import defaultdict


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device=('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [3]:
train_data,test_data=datasets.load_dataset('imdb',
                                           split=['train','test'])

In [4]:
train_data.features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['neg', 'pos'], id=None)}

In [5]:
## we need to tokenize it 
tokenizer=torchtext.data.get_tokenizer('basic_english')

In [6]:
##
tokenizer(train_data[0]['text'])

['i',
 'rented',
 'i',
 'am',
 'curious-yellow',
 'from',
 'my',
 'video',
 'store',
 'because',
 'of',
 'all',
 'the',
 'controversy',
 'that',
 'surrounded',
 'it',
 'when',
 'it',
 'was',
 'first',
 'released',
 'in',
 '1967',
 '.',
 'i',
 'also',
 'heard',
 'that',
 'at',
 'first',
 'it',
 'was',
 'seized',
 'by',
 'u',
 '.',
 's',
 '.',
 'customs',
 'if',
 'it',
 'ever',
 'tried',
 'to',
 'enter',
 'this',
 'country',
 ',',
 'therefore',
 'being',
 'a',
 'fan',
 'of',
 'films',
 'considered',
 'controversial',
 'i',
 'really',
 'had',
 'to',
 'see',
 'this',
 'for',
 'myself',
 '.',
 'the',
 'plot',
 'is',
 'centered',
 'around',
 'a',
 'young',
 'swedish',
 'drama',
 'student',
 'named',
 'lena',
 'who',
 'wants',
 'to',
 'learn',
 'everything',
 'she',
 'can',
 'about',
 'life',
 '.',
 'in',
 'particular',
 'she',
 'wants',
 'to',
 'focus',
 'her',
 'attentions',
 'to',
 'making',
 'some',
 'sort',
 'of',
 'documentary',
 'on',
 'what',
 'the',
 'average',
 'swede',
 'thought',


In [7]:
def tokenize_text(example_data,tokenizer,max_length):
    tokens=tokenizer(example_data['text'])[:max_length]
    return {'tokens':tokens}

In [8]:
##applying map funtion
max_length=256
train_data=train_data.map(tokenize_text,
                          fn_kwargs={'tokenizer':tokenizer,
                                      'max_length':max_length})
test_data=test_data.map(tokenize_text,
                          fn_kwargs={'tokenizer':tokenizer,
                                      'max_length':max_length})


In [9]:
len(train_data[0]['tokens']) ##len will be <=256

256

In [10]:
##now apply vocabulary
min_frequency=5
specials=['unk','pad']

vocab=torchtext.vocab.build_vocab_from_iterator(train_data['tokens'],
                                                min_freq=min_frequency,
                                                specials=specials)

In [11]:
len(vocab)

24896

In [12]:
vocab.set_default_index(0)

In [13]:
vocab.lookup_indices(train_data[0]['tokens'])

[12,
 1242,
 12,
 220,
 0,
 44,
 61,
 362,
 1035,
 90,
 7,
 37,
 2,
 7142,
 15,
 3319,
 11,
 60,
 11,
 17,
 80,
 569,
 13,
 7558,
 3,
 12,
 99,
 508,
 15,
 38,
 80,
 11,
 17,
 24371,
 40,
 1095,
 3,
 16,
 3,
 10339,
 52,
 11,
 125,
 747,
 8,
 2389,
 14,
 644,
 4,
 1644,
 123,
 5,
 314,
 7,
 116,
 1121,
 3029,
 12,
 68,
 72,
 8,
 73,
 14,
 21,
 496,
 3,
 2,
 114,
 10,
 5778,
 195,
 5,
 182,
 3517,
 442,
 1306,
 726,
 5178,
 42,
 509,
 8,
 865,
 293,
 63,
 59,
 47,
 126,
 3,
 13,
 859,
 63,
 509,
 8,
 1157,
 51,
 11837,
 8,
 263,
 55,
 457,
 7,
 606,
 27,
 54,
 2,
 811,
 0,
 190,
 47,
 805,
 1045,
 1284,
 145,
 19,
 2,
 2353,
 331,
 6,
 1506,
 1284,
 13,
 2,
 2238,
 1530,
 3,
 13,
 215,
 2240,
 6937,
 6,
 1940,
 16876,
 7,
 18792,
 47,
 77,
 4485,
 27,
 2307,
 4,
 63,
 50,
 405,
 20,
 51,
 442,
 1537,
 4,
 6999,
 4,
 6,
 979,
 366,
 3,
 54,
 1150,
 75,
 47,
 12,
 220,
 0,
 10,
 15,
 1577,
 152,
 529,
 4,
 14,
 17,
 1121,
 9575,
 3,
 68,
 4,
 2,
 405,
 6,
 973,
 150,
 30,
 175,
 6,
 244,


In [14]:
## creating a function to vocabularize 

def vocabularize_tokens(example_data,vocab):
    ids=vocab.lookup_indices(example_data['tokens'])
    return {'ids':ids}

In [15]:
##mapping to train_data and test_data

train_data=train_data.map(vocabularize_tokens,
                          fn_kwargs={'vocab':vocab})


test_data=test_data.map(vocabularize_tokens,
                          fn_kwargs={'vocab':vocab})


In [16]:
train_data[0]['ids']

[12,
 1242,
 12,
 220,
 0,
 44,
 61,
 362,
 1035,
 90,
 7,
 37,
 2,
 7142,
 15,
 3319,
 11,
 60,
 11,
 17,
 80,
 569,
 13,
 7558,
 3,
 12,
 99,
 508,
 15,
 38,
 80,
 11,
 17,
 24371,
 40,
 1095,
 3,
 16,
 3,
 10339,
 52,
 11,
 125,
 747,
 8,
 2389,
 14,
 644,
 4,
 1644,
 123,
 5,
 314,
 7,
 116,
 1121,
 3029,
 12,
 68,
 72,
 8,
 73,
 14,
 21,
 496,
 3,
 2,
 114,
 10,
 5778,
 195,
 5,
 182,
 3517,
 442,
 1306,
 726,
 5178,
 42,
 509,
 8,
 865,
 293,
 63,
 59,
 47,
 126,
 3,
 13,
 859,
 63,
 509,
 8,
 1157,
 51,
 11837,
 8,
 263,
 55,
 457,
 7,
 606,
 27,
 54,
 2,
 811,
 0,
 190,
 47,
 805,
 1045,
 1284,
 145,
 19,
 2,
 2353,
 331,
 6,
 1506,
 1284,
 13,
 2,
 2238,
 1530,
 3,
 13,
 215,
 2240,
 6937,
 6,
 1940,
 16876,
 7,
 18792,
 47,
 77,
 4485,
 27,
 2307,
 4,
 63,
 50,
 405,
 20,
 51,
 442,
 1537,
 4,
 6999,
 4,
 6,
 979,
 366,
 3,
 54,
 1150,
 75,
 47,
 12,
 220,
 0,
 10,
 15,
 1577,
 152,
 529,
 4,
 14,
 17,
 1121,
 9575,
 3,
 68,
 4,
 2,
 405,
 6,
 973,
 150,
 30,
 175,
 6,
 244,


In [22]:
train_data.features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['neg', 'pos'], id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}

In [23]:
### creating tensors data

train_data=train_data.with_format(type='torch',columns=['ids','label'])
test_data=test_data.with_format(type='torch',column=['ids','label'])

In [24]:
type(train_data[0]['ids'])

torch.Tensor

In [25]:
##splitting train_data to train_val_data

train_val_data=train_data.train_test_split(0.25)
train_data=train_val_data['train']
val_data=train_val_data['test']

In [26]:
train_data.num_rows

14062

In [27]:
def get_collate_fnc(pad_idx):

    def collate_fnc(batch):
        batch_ids=[i['ids'] for i in batch]
        batch_label=[i['label'] for i in batch]
        batch_ids=torch.nn.utils.rnn.pad_sequence(batch_ids,padding_value=pad_idx,batch_first=True)
        batch={'ids':batch_ids,'label':batch_label}
        return batch
    
    return collate_fnc

In [28]:
def dataloader(dataset,batch_size,pad_idx,shuffle=False):
    collate_fnc=get_collate_fnc(pad_idx)
    dataloader=DataLoader(dataset=dataset,
                          batch_size=batch_size,
                          collate_fn=collate_fnc,
                          shuffle=shuffle)
    return dataloader

In [29]:
train_dataloader=dataloader(train_data,10,0,True)

In [30]:
for batch in train_dataloader:
    print(batch['ids'].shape)

torch.Size([10, 256])
torch.Size([10, 256])
torch.Size([10, 256])
torch.Size([10, 256])
torch.Size([10, 256])
torch.Size([10, 256])
torch.Size([10, 256])
torch.Size([10, 256])
torch.Size([10, 256])
torch.Size([10, 256])
torch.Size([10, 256])
torch.Size([10, 256])
torch.Size([10, 256])
torch.Size([10, 256])
torch.Size([10, 256])
torch.Size([10, 256])
torch.Size([10, 256])
torch.Size([10, 256])
torch.Size([10, 256])
torch.Size([10, 256])
torch.Size([10, 256])
torch.Size([10, 256])
torch.Size([10, 256])
torch.Size([10, 256])
torch.Size([10, 256])
torch.Size([10, 256])
torch.Size([10, 256])
torch.Size([10, 256])
torch.Size([10, 256])
torch.Size([10, 256])
torch.Size([10, 256])
torch.Size([10, 256])
torch.Size([10, 256])
torch.Size([10, 256])
torch.Size([10, 256])
torch.Size([10, 256])
torch.Size([10, 256])
torch.Size([10, 256])
torch.Size([10, 256])
torch.Size([10, 256])
torch.Size([10, 256])
torch.Size([10, 256])
torch.Size([10, 256])
torch.Size([10, 256])
torch.Size([10, 256])
torch.Size

In [31]:
###creating a mode model for sentiment analysis; however unlike in previous turtorial we will build TransformerEncoder from scratch, and use it inplace of LSTM

### Defining Embedding Class now 

class EmbeddingLayer(nn.Module):

    def __init__(self,vocab_size,embed_dim):
        super(EmbeddingLayer,self).__init__()
        self.vocab_size=vocab_size
        self.embed_dim=embed_dim
        self.emb_layer=nn.Embedding(num_embeddings=self.vocab_size,embedding_dim=self.embed_dim)

    def positional_encoding(self,seq_len, embed_dim):
        pos = torch.arange(seq_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2, dtype=torch.float32) * -(torch.log(torch.tensor(10000.0)) / embed_dim))
        pe = torch.zeros(seq_len, embed_dim)
        pe[:, 0::2] = torch.sin(pos * div_term)
        pe[:, 1::2] = torch.cos(pos * div_term)
        pe = pe.unsqueeze(0)  # Add batch dimension
        return pe
    
    def forward(self,x):
        x=self.emb_layer(x)
        seq_len=x.shape[1]
        pos_enc=self.positional_encoding(seq_len=seq_len,embed_dim=self.embed_dim).to(device=x.device)
        return x+pos_enc


In [32]:
class FFL(nn.Module): ## feedforward Layer
    def __init__(self,embed_dim,dff,dropout):
        super(FFL,self).__init__()
        self.ffl_layer=nn.Sequential(
            nn.Linear(in_features=embed_dim,out_features=dff),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(in_features=dff,out_features=embed_dim)
        )

    def forward(self,x):
        return self.ffl_layer(x)

In [33]:
class TransformerEncoder(nn.Module):

    def __init__(self,embed_dim,vocab_size,droput,num_heads):
        super(TransformerEncoder,self).__init__()
        ##initializing variables
        self.vocab_size=vocab_size
        self.embed_dim=embed_dim
        self.dff=self.embed_dim*4
        self.num_heads=num_heads
        self.dropout=droput
        
        ##nn.NN
        self.mha=nn.MultiheadAttention(embed_dim=self.embed_dim,num_heads=self.num_heads,dropout=self.dropout)
        self.layer_nomr1=nn.LayerNorm(normalized_shape=self.embed_dim)
        self.layer_nomr2=nn.LayerNorm(normalized_shape=self.embed_dim)

        ##Class Objects
        
        self.feedforward_layer=FFL(embed_dim=self.embed_dim,dff=self.dff,dropout=self.dropout)

    
    def forward(self,x):

        #x is of shape batch_size,seq_len,embed_dim; output from Embedding Layer

        
        mha_output,_=self.mha(x,x,x)
        
        normlized_1=self.layer_nomr1(x+mha_output)
        ffd_output=self.feedforward_layer(normlized_1)
        normlized_2=self.layer_nomr2(normlized_1+ffd_output)

        return normlized_2
        


        



In [34]:
num_heads=8
num_layers=6
### Creating Embedding Layer 
embed_dim=256
vocab_size=100
batch_size=8
num_tokens=10
encoder=TransformerEncoder(embed_dim=embed_dim,vocab_size=vocab_size,
                           num_heads=num_heads,droput=0.3)

encoder=encoder.to(device=device)

In [37]:
batch['ids'].shape

torch.Size([2, 256])

In [38]:
example_ids=batch['ids']

In [39]:
### defining Encoder with num_of_layers
class TransformerEncoderLayers(nn.Module):
    
    def __init__(self,num_layers,embed_dim,vocab_size,droput,num_heads):
        super(TransformerEncoderLayers,self).__init__()
        self.num_layers=num_layers
        self.embedding_layers=EmbeddingLayer(vocab_size=vocab_size,embed_dim=embed_dim)
        self.transformer_encoders=nn.ModuleList([TransformerEncoder(embed_dim,vocab_size,droput,num_heads)
                                                for i in range(self.num_layers)])
        
    def forward(self,x):

        x=self.embedding_layers(x)

        for i in range(self.num_layers):
            x=self.transformer_encoders[i](x)
        
        return x



In [40]:
### checking TransformerEncoder
## defining Encoder Parameter using baseline from Attention is all we need Paper
num_heads=8
num_layers=6
### Creating Embedding Layer 
embed_dim=256
vocab_size=100
batch_size=8
num_tokens=10

example_tensors=torch.rand(size=(batch_size,num_tokens)).to(dtype=torch.int64)
encoder_layers=TransformerEncoderLayers(num_layers=num_layers,embed_dim=embed_dim,vocab_size=vocab_size,
                           num_heads=num_heads,droput=0.3)

In [42]:
### Creating a random tensor  || Testing Custom Transformers encoder layers
batch_size=8
num_tokens=10

example_tensors=torch.rand(size=(batch_size,num_tokens)).to(dtype=torch.int64)

encoder_layers(example_tensors)


tensor([[[-1.6860,  0.0186, -0.5737,  ..., -1.3529, -1.2014,  1.4205],
         [-1.3166, -0.3840, -0.5542,  ..., -1.0626, -0.3838,  0.0923],
         [-1.3858, -0.9728, -0.6733,  ..., -0.7308, -0.3352,  0.6164],
         ...,
         [-1.4923,  0.5200, -0.9967,  ..., -1.8046, -1.0522,  0.1120],
         [-0.7978, -0.9199, -0.1573,  ..., -0.7895, -0.6820, -0.4502],
         [-1.5262, -1.2595, -0.5342,  ..., -0.9933, -0.8044,  0.8380]],

        [[-1.4762, -0.8589, -0.6190,  ..., -1.3217, -1.2633,  0.8759],
         [-1.5596, -0.6869,  0.0458,  ..., -1.0617, -1.4319,  0.5335],
         [-0.8491, -0.7903, -0.2657,  ..., -0.4611, -0.2816,  0.5894],
         ...,
         [-1.3151, -0.5618, -0.8633,  ..., -1.2598, -0.3456,  0.0689],
         [-1.0664, -0.6998, -0.1316,  ..., -1.4276, -0.5668, -0.1251],
         [-1.5471, -1.0636, -0.4235,  ..., -0.1274, -0.6342,  0.5107]],

        [[-1.2409, -0.2825, -0.8003,  ..., -1.4103, -1.0526,  0.8120],
         [-1.3440, -0.5117, -0.7935,  ..., -1

In [43]:
device

'cuda:0'

In [44]:
##tesitng in GPU cuda

encoder_layers.to(device=device)(example_tensors.to(device))

tensor([[[-1.4859e+00, -6.2361e-01, -6.1226e-01,  ..., -1.0399e+00,
          -1.4903e+00,  9.6051e-01],
         [-9.5148e-01, -4.6248e-01, -3.1253e-01,  ..., -8.7676e-01,
          -5.2341e-01,  5.0658e-01],
         [-1.5375e+00, -1.0660e+00, -1.1916e+00,  ..., -9.9766e-01,
          -1.0989e+00,  8.1634e-01],
         ...,
         [-6.2386e-01,  1.2346e-01, -5.1488e-01,  ..., -1.7369e+00,
          -7.0508e-01,  2.1727e-01],
         [-8.9607e-01, -9.3249e-01,  1.8323e-01,  ..., -1.3563e+00,
          -1.1281e+00, -6.2004e-02],
         [-1.2679e+00, -1.0861e+00, -6.4117e-01,  ..., -9.0980e-01,
          -6.6566e-01, -5.1932e-02]],

        [[-1.4801e+00, -4.6780e-01, -4.0603e-01,  ..., -1.8978e+00,
          -4.7852e-01,  8.7885e-01],
         [-1.5067e+00, -3.5352e-02, -2.0204e-01,  ..., -1.2274e+00,
          -1.8952e-01,  6.3198e-01],
         [-5.1526e-01, -7.0663e-01, -3.5956e-01,  ..., -9.1291e-01,
          -1.0132e+00,  8.3923e-01],
         ...,
         [-1.0508e+00,  4

In [None]:
## okay working 

In [45]:
### Now continue to build NBOW but first let us define the parameters
vocab_size=len(vocab)
embed_dim=512
output_dim=len(train_data.unique('label'))

## parameters for EncoderLayers
num_heads=8
num_layers=2
dropout=0.3

##
batch_size=128

Flattening the indices: 100%|██████████| 14062/14062 [00:05<00:00, 2724.55 examples/s]


In [47]:
embed_dim%num_heads

0

In [48]:
##def NBOW

class NBOW_Encoder(nn.Module):

    def __init__(self,num_layers,embed_dim,vocab_size,num_heads,dropout,output_dim):
        super(NBOW_Encoder,self).__init__()

        self.transformer_encoder_layer=TransformerEncoderLayers(num_layers=num_layers,embed_dim=embed_dim,vocab_size=vocab_size,
                           num_heads=num_heads,droput=dropout)
        self.linear=nn.Linear(embed_dim,output_dim)

    def forward(self,x):

        encoder_output=self.transformer_encoder_layer(x)
        flatten_output=encoder_output.mean(1)
        final_output=self.linear(flatten_output)
        return final_output

        

In [49]:
##checking model
ids=batch['ids'].to(device=device)
print(ids.shape)

torch.Size([2, 256])


In [50]:
model=NBOW_Encoder(num_layers=num_layers,embed_dim=embed_dim,vocab_size=vocab_size,
                           num_heads=num_heads,dropout=dropout,
                           output_dim=output_dim).to(device=device)

In [52]:
torch.argmax(model(ids),axis=1) ## working 

tensor([0, 0], device='cuda:0')

In [None]:
model=model.to(device=device)

In [None]:
# Define your model
num_layers = 6  # Example value
embed_dim = 512  # Example value
vocab_size = 10000  # Example value
dropout = 0.1  # Example value
num_heads = 8  # Example value

model = TransformerEncoderLayers(num_layers=num_layers, embed_dim=embed_dim, vocab_size=vocab_size, num_heads=num_heads,droput=dropout)

# Move the entire model to the CUDA device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model

In [59]:
def get_accuracy(pred_logits,true_label):
    pred_label=torch.argmax(pred_logits,axis=1)
    total_predictions=pred_label.eq(true_label).sum()
    accuracy=total_predictions/pred_label.shape[0]
    return accuracy

In [61]:
## testing 
pred_logits=torch.rand(size=(5,3))
true_label=torch.tensor(data=[0,1,0,2,2])

accuracy=get_accuracy(pred_logits,true_label)

print(accuracy)

tensor(0.2000)


In [62]:
train_dataloader=dataloader(dataset=train_data,batch_size=batch_size,pad_idx=0,shuffle=True)
val_dataloader=dataloader(dataset=val_data,batch_size=batch_size,pad_idx=0,shuffle=False)
test_dataloader=dataloader(dataset=test_data,batch_size=batch_size,pad_idx=0,shuffle=False)

In [63]:
criterion=nn.CrossEntropyLoss().to(device=device)
optimizer=SGD(model.parameters(),lr=0.001)

In [68]:
def train(data_loader, model, criterion, optimizer, device):
    model.train()
    epoch_losses = []
    epoch_accs = []
    for batch in tqdm(data_loader, desc="training..."):
        ids = batch["ids"].to(device)
        label = batch["label"].to(device)
        prediction = model(ids)
        loss = criterion(prediction, label)
        accuracy = get_accuracy(prediction, label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_losses.append(loss.item())
        epoch_accs.append(accuracy.item())
    return np.mean(epoch_losses), np.mean(epoch_accs)
def evaluate(data_loader, model, criterion, device):
    model.eval()
    epoch_losses = []
    epoch_accs = []
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="evaluating..."):
            ids = batch["ids"].to(device)
            label = batch["label"].to(device)
            prediction = model(ids)
            loss = criterion(prediction, label)
            accuracy = get_accuracy(prediction, label)
            epoch_losses.append(loss.item())
            epoch_accs.append(accuracy.item())
    return np.mean(epoch_losses), np.mean(epoch_accs)

In [69]:
import collections

In [70]:
### training the model
save_dire="../saved_model/nbow_transformer.pt"
n_epochs=25
best_valid_loss=float("inf")

metrics=collections.defaultdict(list)

for epoch in range(n_epochs):
    train_loss,train_acc=train(train_dataloader,device=device,
                                  model=model,criterion=criterion,
                                  optimizer=optimizer)
    
    val_loss,val_acc=evaluate(val_dataloader,device=device,
                                  model=model,criterion=criterion
                        )
    
    metrics['train_loss'].append(train_loss)
    metrics['train_acc'].append(train_acc)
    metrics['val_loss'].append(val_loss)
    metrics['val_acc'].append(val_acc)
    if val_loss<train_loss:
       
        best_valid_loss=val_loss
        
        torch.save(model.state_dict(),save_dire)

    print("Epoch Num: ", epoch)
    print('Train Loss: {:.3f}   | Train Acc: {:.3f}'.format(train_loss,train_acc))
    print('Val Loss: {:.3f}   | Val Acc: {:.3f}'.format(val_loss,val_acc))




# best Validation loss

model.load_state_dict(torch.load(save_dire))
best_val_loss,best_val_acc=evaluate(val_dataloader,device=device,
                                  model=model,criterion=criterion
                                )

print('Best Val Loss: {:.3f}   | Best Val Acc: {:.3f}'.format(best_val_loss,best_val_acc))



    

training...:   0%|          | 0/110 [00:00<?, ?it/s]


AttributeError: 'list' object has no attribute 'to'