In [2]:
import torch
import numpy as np
import pandas as pd
from torch import Tensor, nn
from tqdm.notebook import tqdm
from torchaudio.models import Conformer
from torch.utils.data import Dataset, DataLoader
from transformers import Trainer, TrainingArguments
# Run on the first CUDA device when one is visible to torch, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

# ENCODER MODEL

In [2]:
class encoder_model(nn.Module) :
    """Token encoder: an embedding MLP followed by a torchaudio ``Conformer`` stack.

    Args:
        kernel_size: Passed to the Conformer as ``depthwise_conv_kernel_size``.
        num_channels: Width of the encoder. The embedding MLP widens each token
            from ``num_channels//4`` through ``num_channels//2`` to this size,
            which is also the Conformer ``input_dim``.
        num_layers: Number of Conformer layers.
        feed_forward: Passed to the Conformer as ``ffn_dim``.
        num_heads: Number of attention heads per Conformer layer.
        vocab_size: Number of rows in the token embedding table. Defaults to
            457, the value that was previously hard-coded.
    """

    def __init__(self, kernel_size: int, num_channels: int, num_layers: int, feed_forward: int, num_heads: int,
                 vocab_size: int = 457) :
        super().__init__()

        # Layer order inside the Sequential is unchanged so existing
        # state_dict keys (embedding.0, embedding.2, embedding.4) still match.
        self.embedding = nn.Sequential(nn.Embedding(vocab_size, num_channels//4), nn.ReLU(), nn.Linear(num_channels//4, num_channels//2),
                                        nn.ReLU(), nn.Linear(num_channels//2, num_channels))

        # Keyword arguments: the constructor parameters of this class are in a
        # different order than Conformer's, so positional passing is easy to
        # get silently wrong.
        self.encoder = Conformer(input_dim=num_channels, num_heads=num_heads, ffn_dim=feed_forward,
                                 num_layers=num_layers, depthwise_conv_kernel_size=kernel_size)

    def forward(self, input_ids, length, mask=None) :
        """Embed ``input_ids`` and run the result through the Conformer.

        Args:
            input_ids: Integer token ids fed to ``nn.Embedding``.
                Assumed to be (batch, time) -- TODO confirm against the dataset.
            length: Forwarded unchanged as the Conformer's second argument
                (its ``lengths`` input).
            mask: Optional multiplicative mask applied to the embeddings before
                the Conformer. It may be anything that broadcasts against the
                embedded tensor, or have exactly the shape of ``input_ids``, in
                which case a trailing axis is added so that it scales whole
                token vectors. When omitted, the embeddings are used as-is.

        Returns:
            The first element of the Conformer output (the encoded sequence);
            the second element is discarded.
        """
        embedded = self.embedding(input_ids)

        if mask is not None :
            # A per-token mask has one axis fewer than the embeddings; without
            # the extra axis it would either fail to broadcast or line up with
            # the channel axis instead of the token axis.
            if mask.shape == input_ids.shape :
                mask = mask.unsqueeze(-1)
            embedded = embedded * mask

        encoded, _ = self.encoder(embedded, length)

        return encoded

# DECODER MODEL

In [None]:
class decoder_model(nn.Module) :
    """Transformer decoder that attends over encoder output and emits 2 values per position.

    Args:
        kernel_size: Unused by this class; kept so the constructor signature
            matches ``encoder_model``.
        num_channels: Model width (``d_model``). The embedding MLP widens from
            ``num_channels//4`` through ``num_channels//2`` to this size, and
            the output head narrows back down in the reverse order.
        num_layers: Number of ``nn.TransformerDecoderLayer`` copies.
        feed_forward: ``dim_feedforward`` of each decoder layer.
        num_heads: Number of attention heads (``nhead``) per decoder layer.
        num_embeddings: Number of rows in the decoder embedding table.
            Defaults to 1, the value that was previously hard-coded; with a
            single row, index 0 is the only valid id in ``input_base``.
    """

    def __init__(self, kernel_size: int, num_channels: int, num_layers: int, feed_forward: int, num_heads: int,
                 num_embeddings: int = 1) :
        super().__init__()

        self.embedding = nn.Sequential(nn.Embedding(num_embeddings, num_channels//4), nn.ReLU(), nn.Linear(num_channels//4, num_channels//2),
                                        nn.ReLU(), nn.Linear(num_channels//2, num_channels))

        decoder_layer = nn.TransformerDecoderLayer(d_model=num_channels, nhead=num_heads,
                                                   dim_feedforward=feed_forward, batch_first=True)
        self.transformer_Decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)

        # NOTE(review): the first two Linear layers have no activation between
        # them, unlike the embedding stack above. Left as-is because inserting
        # one would change the outputs and shift the state_dict indices of the
        # later layers -- confirm whether this is intentional.
        self.result = nn.Sequential(nn.Linear(num_channels, num_channels//2), nn.Linear(num_channels//2, num_channels//4), nn.ReLU(),
                                    nn.Linear(num_channels//4, 2))

    def forward(self, encoder_output, input_base, base_mask=None) :
        """Decode ``input_base`` against ``encoder_output``.

        Args:
            encoder_output: Used as the decoder ``memory``. Because the layers
                are built with ``batch_first=True`` this is presumably
                (batch, source_len, num_channels) -- TODO confirm.
            input_base: Integer ids looked up in the decoder embedding table.
            base_mask: Optional mask forwarded as ``memory_mask``. ``None``
                (the default) applies no mask, which is also
                ``nn.TransformerDecoder``'s own default.
                NOTE(review): ``memory_mask`` is an attention mask over
                (target, source) positions; if this is meant to hide padded
                encoder positions per batch item, ``memory_key_padding_mask``
                is the argument for that -- confirm against the caller.

        Returns:
            Tensor whose last dimension has size 2 (the output head's width).
        """
        embedded = self.embedding(input_base)
        decoded = self.transformer_Decoder(embedded, encoder_output, memory_mask=base_mask)
        output = self.result(decoded)

        return output

In [None]:
class main_model(nn.Module) :
    def __init__(self, kernel_size: int, num_channels: int, num_layers: int, feed_forward: int, num_heads: int) :
        super().__init__()
        self.encoder = encoder_model(kernel_size, num_channels, num_layers, feed_forward, num_heads)
        self.decoder = decoder_model(kernel_size, num_channels, num_layers, feed_forward, num_heads)
        self.loss = nn.L1Loss()
    def forward(self, inputs, length, mask, labels=None) :
        encoded = self.encoder(inputs, length)

        for i in range(inputs.shape[1]) :
            decoded = self.decoder(encoded, inputs[:, i])