In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import Wav2Vec2Model, Wav2Vec2Config

class ResidualCNN(nn.Module):
    """Residual block made of two (LayerNorm -> Conv2d -> ReLU -> Dropout) stages.

    Layer normalisation is applied over axis 2 of the 4-D input (the axis is
    swapped to the last position for the normalisation and swapped back).
    The block input is added to the result of the second stage, so the two
    convolutions have to produce a tensor that can be added to the input.

    Args:
        in_channels: input channels of the first convolution.
        out_channels: output channels of both convolutions.
        kernel: integer kernel size; padding is ``kernel // 2``.
        stride: stride used by both convolutions.
        dropout: dropout probability applied after each ReLU.
        n_feats: size of axis 2 of the input, i.e. the normalised axis.
    """

    def __init__(self, in_channels, out_channels, kernel, stride, dropout, n_feats):
        super().__init__()
        half_kernel = kernel // 2
        self.cnn1 = nn.Conv2d(in_channels, out_channels, kernel, stride, padding=half_kernel)
        self.cnn2 = nn.Conv2d(out_channels, out_channels, kernel, stride, padding=half_kernel)
        self.layer_norm1 = nn.LayerNorm(n_feats)
        self.layer_norm2 = nn.LayerNorm(n_feats)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Run both stages on ``x`` and return the result plus ``x``."""
        stages = (
            (self.layer_norm1, self.cnn1),
            (self.layer_norm2, self.cnn2),
        )
        out = x
        for norm, conv in stages:
            # LayerNorm works on the last axis, so axis 2 is moved there and back.
            out = norm(out.transpose(2, 3)).transpose(2, 3)
            out = self.dropout(torch.relu(conv(out)))
        return out + x

class BidirectionalLSTM(nn.Module):
    """One bidirectional LSTM layer whose output sequence is layer-normalised and passed through dropout.

    Args:
        rnn_dim: size of the last axis of the input.
        hidden_size: hidden size per direction; the output's last axis is
            ``2 * hidden_size`` because both directions are concatenated.
        dropout: dropout probability applied to the normalised output.
        batch_first: forwarded to ``nn.LSTM``.
    """

    def __init__(self, rnn_dim, hidden_size, dropout, batch_first=True):
        super().__init__()
        self.BiLSTM = nn.LSTM(
            input_size=rnn_dim,
            hidden_size=hidden_size,
            num_layers=1,
            batch_first=batch_first,
            bidirectional=True,
        )
        self.layer_norm = nn.LayerNorm(2 * hidden_size)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        """Return the normalised, dropout-regularised LSTM output sequence (final states are discarded)."""
        sequence_out, _final_state = self.BiLSTM(x)
        return self.dropout(self.layer_norm(sequence_out))

class SpeechRecognitionModel(nn.Module):
    """Conv2d -> residual CNN stack -> BiLSTM stack -> Transformer encoder -> per-frame classifier.

    The input is a 4-D tensor with a single channel, laid out as
    ``(batch, 1, n_feats, time)``. The output is ``(batch, frames, n_class)``
    where ``frames`` is the time axis after the strided first convolution.

    Args:
        n_cnn_layers: number of ``ResidualCNN`` blocks.
        n_rnn_layers: number of stacked ``BidirectionalLSTM`` blocks (>= 1).
        rnn_dim: hidden size per LSTM direction; the sequence width after the
            LSTM stack is ``2 * rnn_dim`` and must be divisible by the 8
            attention heads.
        n_class: width of the final linear layer.
        n_feats: size of the feature axis (axis 2) of the input.
        stride: stride of the first convolution (both feature and time axes).
        dropout: dropout probability used throughout.

    Raises:
        ValueError: if ``n_rnn_layers`` is smaller than 1.
    """

    def __init__(self, n_cnn_layers, n_rnn_layers, rnn_dim, n_class, n_feats, stride=2, dropout=0.1):
        super(SpeechRecognitionModel, self).__init__()
        if n_rnn_layers < 1:
            # Without an LSTM layer the sequence stays rnn_dim wide, which cannot
            # feed the (2 * rnn_dim)-wide transformer and classifier below.
            raise ValueError("n_rnn_layers must be at least 1")

        # Feature-axis size after the first conv (kernel 3, padding 1):
        # floor((n_feats + 2*1 - 3) / stride) + 1. Equals n_feats // 2 for an
        # even n_feats with stride 2, and stays correct for other values.
        n_feats = (n_feats - 1) // stride + 1
        self.cnn = nn.Conv2d(1, 32, 3, stride=stride, padding=3 // 2)  # cnn for extracting hierarchical features
        self.rescnn_layers = nn.Sequential(*[
            ResidualCNN(32, 32, kernel=3, stride=1, dropout=dropout, n_feats=n_feats)
            for _ in range(n_cnn_layers)
        ])
        self.fully_connected = nn.Linear(n_feats * 32, rnn_dim)
        # A bidirectional layer emits 2 * rnn_dim features, so every layer after
        # the first must accept 2 * rnn_dim inputs.
        self.birnn_layers = nn.Sequential(*[
            BidirectionalLSTM(
                rnn_dim=rnn_dim if layer_idx == 0 else rnn_dim * 2,
                hidden_size=rnn_dim,
                dropout=dropout,
            )
            for layer_idx in range(n_rnn_layers)
        ])
        # nn.TransformerEncoder works on its own copies of this layer; the
        # attribute is kept so existing state_dict keys do not change.
        self.transformer_layer = nn.TransformerEncoderLayer(d_model=rnn_dim * 2, nhead=8)
        self.transformer = nn.TransformerEncoder(self.transformer_layer, num_layers=3)
        self.classifier = nn.Sequential(
            nn.Linear(rnn_dim * 2, rnn_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(rnn_dim, n_class)
        )

    def forward(self, x):
        """Map ``(batch, 1, n_feats, time)`` to ``(batch, frames, n_class)`` scores."""
        x = self.cnn(x)
        x = self.rescnn_layers(x)
        batch, channels, feats, frames = x.size()
        # Merge channel and feature axes; reshape also copes with non-contiguous input.
        x = x.reshape(batch, channels * feats, frames)
        x = x.transpose(1, 2)  # (batch, frames, channels * feats)
        x = self.fully_connected(x)
        x = self.birnn_layers(x)
        x = x.permute(1, 0, 2)  # Transformer expects (seq_len, batch_size, dim)
        x = self.transformer(x)
        x = x.permute(1, 0, 2)
        x = self.classifier(x)
        return x

# Architecture hyper-parameters
n_cnn_layers = 3
n_rnn_layers = 5
rnn_dim = 512
# NOTE(review): this counts the distinct elements of train_labels. It equals the
# number of unique characters only if train_labels is a flat sequence of
# characters - confirm against the cell that builds train_labels.
n_class = len(set(train_labels))
n_feats = 40  # Number of MFCCs
stride = 2
dropout = 0.1

model = SpeechRecognitionModel(
    n_cnn_layers=n_cnn_layers,
    n_rnn_layers=n_rnn_layers,
    rnn_dim=rnn_dim,
    n_class=n_class,
    n_feats=n_feats,
    stride=stride,
    dropout=dropout,
)


In [None]:
# AdamW updates every parameter registered on the model; the cross-entropy
# criterion is applied to flattened per-position logits in the training loop.
optimizer = optim.AdamW(params=model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# Custom collate function to handle variable lengths of input
def collate_fn(data):
    """Turn a list of ``(features, target)`` tensor pairs into two padded batch tensors.

    Features are zero-padded along their first axis, given a singleton axis at
    position 1, and then have axes 2 and 3 swapped. Targets are zero-padded
    along their first axis.

    NOTE(review): padded target positions hold 0, which cannot be told apart
    from a genuine label 0 - confirm that index 0 is reserved for padding.
    """
    feature_seqs, target_seqs = zip(*data)
    pad = torch.nn.utils.rnn.pad_sequence

    batch_inputs = pad(feature_seqs, batch_first=True)
    batch_inputs = batch_inputs.unsqueeze(1)
    batch_inputs = batch_inputs.transpose(2, 3)

    batch_targets = pad(target_seqs, batch_first=True)
    return batch_inputs, batch_targets

# Training hyper-parameters
num_epochs = 20  # number of full passes over train_loader
batch_size = 32  # samples per batch; passed to both DataLoaders in this notebook

from torch.utils.data import DataLoader, Dataset

class AudioDataset(Dataset):
    """Index-aligned pairing of feature arrays and label sequences, served as tensors.

    Args:
        features: indexable collection; item ``i`` is converted to a float32 tensor.
        labels: indexable collection; item ``i`` is converted to an int64 tensor.
    """

    def __init__(self, features, labels):
        self.features, self.labels = features, labels

    def __len__(self):
        """Number of samples, taken from ``features``."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return ``(feature_tensor, label_tensor)`` for sample ``idx``."""
        feature_tensor = torch.tensor(self.features[idx], dtype=torch.float32)
        label_tensor = torch.tensor(self.labels[idx], dtype=torch.long)
        return feature_tensor, label_tensor

train_dataset = AudioDataset(train_features, train_labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# NOTE(review): the criterion receives logits flattened to (batch * frames, classes)
# and labels flattened to (batch * label_len,). These only line up when the model
# emits exactly one frame per label position - confirm the labels are frame-aligned.
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    n_batches = 0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        # reshape (not view) so a non-contiguous tensor cannot break the flatten;
        # the class count is read from the logits themselves.
        loss = criterion(outputs.reshape(-1, outputs.size(-1)), labels.reshape(-1))
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        n_batches += 1
    # Report the mean of the per-batch losses rather than only the last batch's
    # value; an empty loader yields NaN instead of a NameError on `loss`.
    epoch_loss = running_loss / n_batches if n_batches else float('nan')
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')


In [None]:
# Preprocess adaptation data similarly
adapt_feature_list = []
adapt_label_list = []

# Iterate the two needed columns directly; the row index is not used.
for wav_id, transcription in zip(adapt_df['wav_id'], adapt_df['transcription']):
    file_path = f"{adapt_audio_files}/{wav_id}.wav"
    signal, sr = load_audio(file_path)
    mfccs = extract_features(signal, sr)
    adapt_feature_list.append(mfccs)
    adapt_label_list.append(transcription)

# np.array(list_of_arrays, dtype=object) does not reliably produce one entry per
# file: equally shaped items are merged into a single N-D object array, and items
# that share only their leading dimension raise a broadcasting ValueError.
# Allocating a 1-D object array and filling it element by element always keeps
# one feature matrix per slot.
adapt_features = np.empty(len(adapt_feature_list), dtype=object)
for slot, mfccs in enumerate(adapt_feature_list):
    adapt_features[slot] = mfccs

# NOTE(review): labels are stored exactly as read from the 'transcription' column.
# AudioDataset converts each label with torch.tensor(..., dtype=torch.long), which
# needs integer sequences - confirm the column is already encoded that way.
adapt_labels = np.array(adapt_label_list)

# Evaluate the model
adapt_dataset = AudioDataset(adapt_features, adapt_labels)
adapt_loader = DataLoader(adapt_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

model.eval()
total_correct = 0
total_samples = 0
with torch.no_grad():
    for inputs, labels in adapt_loader:
        outputs = model(inputs)
        # Index of the highest score along the class axis.
        predicted = outputs.argmax(dim=2)
        # NOTE(review): padded label positions (value 0 from collate_fn) are
        # compared and counted like real ones - confirm that this is intended.
        total_correct += (predicted == labels).sum().item()
        total_samples += labels.numel()

# An empty loader would otherwise divide by zero; report NaN in that case.
accuracy = total_correct / total_samples if total_samples else float('nan')
print(f'Accuracy: {accuracy * 100:.2f}%')
