In [21]:
import torch
from torch import nn
from torch.utils.data import Dataset,DataLoader

In [22]:
class Time2Vec(nn.Module):
    """Learnable time embedding with one affine and two periodic components.

    Every scalar of the input is expanded into ``3 * k`` features: ``k``
    affine features ``w * x + b``, ``k`` sine features
    ``sin(w_sin * x + b_sin)`` and ``k`` cosine features
    ``cos(w_cos * x + b_cos)``. All six weight/offset vectors have length
    ``k`` and are drawn from a standard normal distribution at construction.
    """

    # Learnable vectors, listed in the order in which they are registered.
    _PARAM_NAMES = ("w", "b", "w_sin", "b_sin", "w_cos", "b_cos")

    def __init__(self, k):
        super().__init__()
        self.k = k  # Number of features produced by each of the three components
        for name in self._PARAM_NAMES:
            # Assigning an nn.Parameter attribute registers it on the module.
            setattr(self, name, nn.Parameter(torch.randn(k)))

    def forward(self, x):
        """Map ``x`` of shape ``(*S)`` to an embedding of shape ``(*S, 3 * k)``."""
        t = x[..., None]  # Trailing singleton axis broadcasts against the (k,) vectors
        affine = self.w * t + self.b
        sine = torch.sin(self.w_sin * t + self.b_sin)
        cosine = torch.cos(self.w_cos * t + self.b_cos)
        # Feature layout along the last axis: [affine | sine | cosine]
        return torch.cat((affine, sine, cosine), dim=-1)

In [23]:
from CoRe_Dataloader6 import dataloader

# Both names are bound to the very same loader object, so any pass over
# `test_dl` iterates exactly the data that is used for training.
train_dl = test_dl = dataloader

In [24]:
# Import modules
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

n_features = 8 # Time2Vec size k; the embedding fed to the transformer is 3 * n_features wide
n_layers = 4 # Number of stacked transformer encoder layers
n_heads = 8 # Number of attention heads per encoder layer
hidden_size = 1024 # Width of the feed-forward sublayer inside each encoder layer
dropout_rate = 0.4 # Dropout probability used inside the encoder layers
n_classes = 19 # Output size of the final linear layer (one score per class)

class SelfAttentionPooling(nn.Module):
    """
    Attention-weighted pooling over the sequence axis.

    A single linear unit scores every time step; the scores are normalised
    with a softmax and used as weights for summing the time steps.

    Reference: Self-Attention Encoding and Pooling for Speaker Recognition
    https://arxiv.org/pdf/2008.01077v1.pdf
    """

    def __init__(self, input_dim):
        super(SelfAttentionPooling, self).__init__()
        self.W = nn.Linear(input_dim, 1)  # One scalar score per time step
        self.softmax = nn.functional.softmax

    def forward(self, batch_rep):
        """
        Pool a batch of sequences into one vector per sequence.

        input:
            batch_rep : size (N, T, H), N: batch size, T: sequence length, H: Hidden dimension

        return:
            size (N, H), the attention-weighted sum over the T axis
        """
        scores = self.W(batch_rep).squeeze(-1)  # (N, T)
        weights = self.softmax(scores, -1)  # Normalised over the T axis
        weights = weights.unsqueeze(-1)  # (N, T, 1) so it broadcasts over H
        weighted = batch_rep * weights
        return torch.sum(weighted, dim=1)

# Define classifier model 
class Classifier(nn.Module):
    """Sequence classifier: Time2Vec -> transformer encoder -> attention pooling -> linear head.

    Args:
        n_layers: number of stacked transformer encoder layers.
        n_features: Time2Vec size ``k``; the encoder width is ``3 * n_features``.
        n_heads: attention heads per encoder layer (must divide ``3 * n_features``).
        hidden_size: width of the feed-forward sublayer in each encoder layer.
        dropout_rate: dropout probability inside the encoder layers.
        n_classes: number of output classes.
    """

    def __init__(self , n_layers , n_features , n_heads , hidden_size , dropout_rate , n_classes):
        super(Classifier , self).__init__()
        d_model = n_features * 3  # Time2Vec emits k affine + k sine + k cosine features
        self.t2v = Time2Vec(n_features)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=hidden_size,
            dropout=dropout_rate,
            # The embedding and the pooling layer are batch-major (N, T, H).
            # With the default batch_first=False the encoder would read axis 0
            # as time and attend across the samples of a batch.
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)
        self.pooling = SelfAttentionPooling(d_model)  # Attention-weighted sum over time
        self.linear = nn.Linear(d_model, n_classes)  # Classification head (raw scores)

    def forward(self, x):
        """Return unnormalised class scores (logits) of shape ``(batch_size, n_classes)``.

        ``x`` is expected to be ``(batch_size, seq_len)``: Time2Vec appends a
        feature axis of width ``3 * n_features``, which is the width the
        encoder is built for.
        NOTE(review): an earlier comment described ``x`` as
        ``(batch_size, seq_len, n_features)`` - confirm against the dataloader.

        The scores are deliberately not passed through softmax:
        ``nn.CrossEntropyLoss`` applies log-softmax itself, so feeding it
        probabilities flattens the loss and its gradients. ``argmax``
        predictions are unaffected; apply ``softmax(dim=-1)`` on the result
        when probabilities are needed.
        """
        x = self.t2v(x)  # (batch_size, seq_len, 3 * n_features)
        x = self.transformer_encoder(x)  # Same shape, contextualised over time
        x = self.pooling(x)  # (batch_size, 3 * n_features)
        return self.linear(x)  # (batch_size, n_classes) logits

In [25]:
import gc
import wandb

# Collect unreachable Python objects first so that any CUDA tensors they still
# hold are handed back to PyTorch's caching allocator; only then release the
# cached, unused blocks. In the opposite order the memory freed by the
# collector would stay in the cache.
gc.collect()
torch.cuda.empty_cache()

In [26]:
# Build the network from the notebook-level hyperparameters and move it to the GPU.
model = Classifier(
    n_layers=n_layers,
    n_features=n_features,
    n_heads=n_heads,
    hidden_size=hidden_size,
    dropout_rate=dropout_rate,
    n_classes=n_classes,
)
model.to("cuda:0")  # nn.Module.to moves the parameters in place

# Per-class loss weights: every class weighs 1.0 except the ones overridden below.
class_weights = torch.ones(19)
class_weights[[2, 10, 14]] = 0.5
class_weights[16] = 0.1

# Weighted cross-entropy loss, with its weight buffer on the same device as the model.
loss_fn = nn.CrossEntropyLoss(weight=class_weights).to("cuda:0")

# AdamW optimizer; each StepLR step counts towards a x0.9 decay every 15 steps.
optimizer = optim.AdamW(model.parameters(), lr=1e-4)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=15, gamma=0.9)

In [27]:
# NOTE(review): the string is passed positionally; depending on the wandb
# version that binds to `job_type` or `entity`, not `project` - confirm the
# intent and pass it by keyword.
wandb.init("transformer-test-c2")
n_epochs = 1000

# Each epoch: one optimisation pass over train_dl, then an accuracy pass.
for epoch in range(n_epochs):
    model.train()  # Enable dropout for the optimisation pass
    epochloss = 0
    ldl = len(train_dl)
    for bnum,(batch_x,batch_y) in enumerate(train_dl):
        batch_x = batch_x.to("cuda:0").to(torch.float)
        # Only the first label column is used as the class index.
        batch_y = batch_y.to("cuda:0").to(torch.long)[:,0]
        optimizer.zero_grad() # Clear gradients left over from the previous batch

        output=model(batch_x).to(torch.float) # (batch_size, n_classes)
        # NOTE(review): nn.CrossEntropyLoss applies log-softmax itself; verify
        # that the model's forward does not already apply a softmax.
        loss=loss_fn(output,batch_y)
        loss.backward()
        optimizer.step()

        # .item() forces a device sync, so read the scalar once and reuse it.
        batch_loss = loss.item()
        print(f"Epoch {epoch} Batch {bnum}/{ldl} : Loss {batch_loss}",end = "\r",flush=True)
        epochloss+=batch_loss
        wandb.log({"batchloss":batch_loss})
    # The learning-rate schedule only advances during epochs 21..299.
    if epoch > 20 and epoch < 300: scheduler.step()
    epochloss/=ldl

    print(f"\nAverage epoch loss: {epochloss} with Learning rate {scheduler.get_last_lr()}")
    print("Evaluating with",end = " ")
    # Accuracy pass. It iterates train_dl again, so the number reported here is
    # accuracy on the training data.
    model.eval()  # Disable dropout so predictions are deterministic
    with torch.no_grad(): # No gradients are needed for evaluation
        accsum = []
        for bnum,(batch_x,batch_y) in enumerate(train_dl):
            # Same dtype handling as in the optimisation pass above.
            batch_x = batch_x.to("cuda:0").to(torch.float)
            batch_y = batch_y.to("cuda:0").to(torch.long)[:,0]
            output=model(batch_x) # (batch_size, n_classes)
            pred=torch.argmax(output,dim=-1) # Predicted class per sample
            accsum.append((pred==batch_y).float()) # 1.0 where the prediction is correct
        equals= torch.cat(accsum)
        means = torch.mean(equals)
        print("mean total accuracy:", means.item()*100,"%\n","-=<{|}>=-"*8,'\n')
        wandb.log({"accuracy":means.item()})
    
    # Evaluate the model on the validation data 

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33maashraychegu[0m ([33malabs[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch 0 Batch 38/71 : Loss 2.8646452426910495

In [None]:
# Accuracy over test_dl, using the same input/label handling as the
# per-epoch evaluation in the training cell.
model.eval()  # Disable dropout so predictions are deterministic
with torch.no_grad(): # No gradients are needed for evaluation
    accsum = []
    for bnum,(batch_x,batch_y) in enumerate(test_dl):
        batch_x = batch_x.to("cuda:0").to(torch.float)
        # Reduce the labels to the first column so they have one entry per
        # sample; comparing against the unreduced 2-D labels would broadcast
        # against `pred` instead of matching element by element.
        batch_y = batch_y.to("cuda:0").to(torch.long)[:,0]
        print(bnum)
        output=model(batch_x) # (batch_size, n_classes)
        pred=torch.argmax(output,dim=-1) # Predicted class per sample
        accsum.append((pred==batch_y).float()) # 1.0 where the prediction is correct
    equals= torch.cat(accsum) # Per-sample correctness over the whole loader
    means = torch.mean(equals) # Fraction of correct predictions
    print(means)

In [None]:
# Temporarily raise the summarisation threshold so that every element of
# `equals` is printed instead of an elided view.
torch.set_printoptions(threshold=500_000)
try:
    print(equals)
finally:
    # Restore the default print options even if printing fails.
    torch.set_printoptions(profile='default')