In [2]:
import torch
import torchaudio
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torchaudio.transforms import MelSpectrogram, Resample, TimeMasking, FrequencyMasking, MFCC
from jiwer import wer
import numpy as np

In [3]:
def load_lists_from_files(audio_file_path='audio_files.txt', transcript_file_path='transcriptions.txt'):
    with open(audio_file_path, 'r') as af:
        audio_files = [line.strip() for line in af]

    with open(transcript_file_path, 'r') as tf:
        transcriptions = [line.strip() for line in tf]

    return audio_files, transcriptions

# Load the lists back from text files
audio_files, transcriptions = load_lists_from_files()

In [31]:
# Create a character-level vocabulary
vocab = set(''.join(transcriptions))
vocab = {char: idx for idx, char in enumerate(sorted(vocab))}
vocab['<blank>'] = len(vocab)
blank_token = vocab['<blank>']
idx_to_char = {idx: char for char, idx in vocab.items()}
print("Vocabulary:", vocab)
print("Index to Character Mapping:", idx_to_char)

# Function to convert transcription to numerical labels
def text_to_labels(text):
    return [vocab[char] for char in text]

# Function to convert labels to text
def labels_to_text(labels):
    return ''.join([idx_to_char[idx] for idx in labels if idx in idx_to_char])

Vocabulary: {' ': 0, "'": 1, 'A': 2, 'B': 3, 'C': 4, 'D': 5, 'E': 6, 'F': 7, 'G': 8, 'H': 9, 'I': 10, 'J': 11, 'K': 12, 'L': 13, 'M': 14, 'N': 15, 'O': 16, 'P': 17, 'Q': 18, 'R': 19, 'S': 20, 'T': 21, 'U': 22, 'V': 23, 'W': 24, 'X': 25, 'Y': 26, 'Z': 27, '<blank>': 28}
Index to Character Mapping: {0: ' ', 1: "'", 2: 'A', 3: 'B', 4: 'C', 5: 'D', 6: 'E', 7: 'F', 8: 'G', 9: 'H', 10: 'I', 11: 'J', 12: 'K', 13: 'L', 14: 'M', 15: 'N', 16: 'O', 17: 'P', 18: 'Q', 19: 'R', 20: 'S', 21: 'T', 22: 'U', 23: 'V', 24: 'W', 25: 'X', 26: 'Y', 27: 'Z', 28: '<blank>'}


In [5]:
# SpeechDataset class definition with spectrogram augmentation and MFCC
class SpeechDataset(Dataset):
    def __init__(self, audio_files, transcriptions, sample_rate=16000):
        self.audio_files = audio_files
        self.transcriptions = transcriptions
        self.sample_rate = sample_rate
        self.resample = Resample(orig_freq=sample_rate, new_freq=16000)
        self.melspec = MelSpectrogram(sample_rate=16000, n_mels=128)
        self.mfcc = MFCC(sample_rate=16000, n_mfcc=40)
        self.time_masking = TimeMasking(time_mask_param=30)
        self.freq_masking = FrequencyMasking(freq_mask_param=15)
    
    def __len__(self):
        return len(self.audio_files)
    
    def __getitem__(self, idx):
        try:
            waveform, sample_rate = torchaudio.load(self.audio_files[idx])
        except Exception as e:
            print(f"Error loading file {self.audio_files[idx]}: {e}")
            return None, None  # Return None for both to handle it later
        
        waveform = self.resample(waveform)
        mel_spec = self.melspec(waveform)
        mel_spec = self.time_masking(mel_spec)
        mel_spec = self.freq_masking(mel_spec)
        mfcc = self.mfcc(waveform)

         # Ensure mel_spec and mfcc have the same size along the concatenation dimension
        if mel_spec.size(2) > mfcc.size(2):
            mel_spec = mel_spec[:, :, :mfcc.size(2)]
        elif mfcc.size(2) > mel_spec.size(2):
            mfcc = mfcc[:, :, :mel_spec.size(2)]
        
        features = torch.cat((mel_spec, mfcc), dim=1)
        transcription = self.transcriptions[idx]
        return features.squeeze(0).transpose(0, 1), transcription  # Transpose to [seq_len, feature_dim]

In [6]:
# Function to pad sequences
def pad_sequence(batch):
    batch = [item for item in batch]  # Ensure [seq_len, feature_dim]
    batch = torch.nn.utils.rnn.pad_sequence(batch, batch_first=True)
    return batch

In [7]:
# Update the collate function to handle None entries
def collate_fn(batch):
    batch = [item for item in batch if item[0] is not None]  # Filter out None entries
    if len(batch) == 0:  # Handle the case where all items are None
        return None, None, None, None
    mel_specs = [item[0] for item in batch]
    transcriptions = [item[1] for item in batch]
    mel_specs_padded = pad_sequence(mel_specs)
    labels = [torch.tensor(text_to_labels(t)) for t in transcriptions]
    label_lengths = torch.tensor([len(label) for label in labels])
    labels_padded = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True)
    input_lengths = torch.tensor([mel_spec.size(0) for mel_spec in mel_specs_padded])
    return mel_specs_padded, labels_padded, input_lengths, label_lengths

In [8]:
# Create dataset and dataloader with the expanded dataset
dataset = SpeechDataset(audio_files, transcriptions)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)



In [9]:
# Define a simple model architecture
class EnhancedSpeechModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(EnhancedSpeechModel, self).__init__()
        self.conv1 = nn.Conv1d(input_dim, 64, kernel_size=3, stride=1, padding=1)
        self.relu = nn.ReLU()
        self.lstm = nn.LSTM(64, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)  # Bidirectional LSTM

    def forward(self, x):
        x = x.transpose(1, 2)  # Change to [batch_size, feature_dim, seq_len] for Conv1d
        x = self.conv1(x)
        x = self.relu(x)
        x = x.transpose(1, 2)  # Change back to [batch_size, seq_len, feature_dim] for LSTM
        x, _ = self.lstm(x)
        x = self.fc(x)
        return x

In [10]:
# Initialize model
input_dim = 128 + 40  # Number of mel bands + number of MFCCs
hidden_dim = 256
output_dim = len(vocab) + 1  # Output dimension based on the size of the vocabulary + 1 for the blank token
model = EnhancedSpeechModel(input_dim, hidden_dim, output_dim)

In [11]:
# Training with Early Stopping and Learning Rate Scheduler
def train_model(model, dataloader, num_epochs=50, learning_rate=0.001, patience=10):
    criterion = nn.CTCLoss(blank=output_dim - 1, zero_infinity=True)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=patience, factor=0.5, verbose=True)
    best_loss = float('inf')
    patience_counter = 0
    
    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0
        for batch in dataloader:
            if batch[0] is None:  # Skip if batch is None
                continue
            
            mel_specs, labels, input_lengths, label_lengths = batch
            
            if mel_specs.dim() != 3:
                print(f"Unexpected dimensions: {mel_specs.shape}")
                continue  # Skip this batch if dimensions are not as expected
            
            outputs = model(mel_specs)
            outputs = outputs.log_softmax(2)
            outputs = outputs.permute(1, 0, 2)  # (T, N, C) for CTCLoss

            loss = criterion(outputs, labels, input_lengths, label_lengths)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            epoch_loss += loss.item()
        
        epoch_loss /= len(dataloader)
        scheduler.step(epoch_loss)
        
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")
        
        # Early stopping
        if epoch_loss < best_loss:
            best_loss = epoch_loss
            patience_counter = 0
        else:
            patience_counter += 1
        
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break

In [12]:
# Train the model with the expanded dataset
train_model(model, dataloader)



Epoch [1/50], Loss: 2.4562
Epoch [2/50], Loss: 1.3594
Epoch [3/50], Loss: 1.2595
Epoch [4/50], Loss: 1.2204
Epoch [5/50], Loss: 1.2139
Epoch [6/50], Loss: 1.1954
Epoch [7/50], Loss: 1.2034
Epoch [8/50], Loss: 1.2099
Epoch [9/50], Loss: 1.6206
Epoch [10/50], Loss: 1.7128
Epoch [11/50], Loss: 1.4160
Epoch [12/50], Loss: 1.3478
Epoch [13/50], Loss: 1.3159
Epoch [14/50], Loss: 1.4160
Epoch [15/50], Loss: 1.7303
Epoch [16/50], Loss: 1.9384
Early stopping at epoch 16


In [13]:
# Save the trained model
torch.save(model, 'speechtotextmodel4_with_large_data.pth')

In [35]:
# Function to decode model outputs into text, handling the blank token
def decode_predictions(predictions, vocab):
    idx_to_char = {idx: char for char, idx in vocab.items()}
    blank_token = vocab['<blank>']
    print("Blank token index:", blank_token)
    decoded_output = []
    for prediction in predictions:
        pred_indices = torch.argmax(prediction, dim=-1)
        print("Prediction indices before clamping:", pred_indices)
        pred_indices = torch.clamp(pred_indices, 0, len(vocab) - 1)  # Clamp indices to valid range
        print("Prediction indices after clamping:", pred_indices)
        pred_text = ''.join([idx_to_char[idx.item()] for idx in pred_indices if idx.item() != blank_token])
        decoded_output.append(pred_text)
    return decoded_output

In [17]:
# Function to decode the model output into text
def decode_output(output):
    _, max_indices = torch.max(output, dim=-1)
    tokens = max_indices.unique_consecutive()
    decoded = ''.join([idx_to_char[idx.item()] for idx in tokens if idx.item() in idx_to_char])
    return decoded

In [18]:
# Calculate accuracy using Word Error Rate (WER)
def calculate_accuracy(model, dataloader):
    model.eval()
    predictions = []
    ground_truths = []
    
    with torch.no_grad():
        for batch in dataloader:
            if batch[0] is None:  # Skip if batch is None
                continue
            
            mel_specs, labels, input_lengths, label_lengths = batch
            
            outputs = model(mel_specs)
            outputs = outputs.log_softmax(2)
            outputs = outputs.permute(1, 0, 2)  # (T, N, C) for CTCLoss
            
            for i in range(outputs.size(1)):  # Iterate over batch
                decoded_output = decode_output(outputs[:, i, :])
                predictions.append(decoded_output)
                ground_truths.append(''.join([idx_to_char[idx.item()] for idx in labels[i] if idx.item() in idx_to_char]))
    
    wer_score = wer(ground_truths, predictions)
    accuracy = 1 - wer_score
    return accuracy


In [19]:
# Calculate accuracy
accuracy = calculate_accuracy(model, dataloader)
print(f"Model Accuracy: {accuracy:.4f}")

Model Accuracy: 0.0267


In [23]:
# Function to preprocess a single audio file
def preprocess_audio(audio_file, sample_rate=16000):
    resample = Resample(orig_freq=sample_rate, new_freq=16000)
    melspec = MelSpectrogram(sample_rate=16000, n_mels=128)
    mfcc = MFCC(sample_rate=16000, n_mfcc=40)
    time_masking = TimeMasking(time_mask_param=30)
    freq_masking = FrequencyMasking(freq_mask_param=15)
    
    try:
        waveform, sample_rate = torchaudio.load(audio_file)
    except Exception as e:
        print(f"Error loading file {audio_file}: {e}")
        return None
    
    waveform = resample(waveform)
    mel_spec = melspec(waveform)
    mel_spec = time_masking(mel_spec)
    mel_spec = freq_masking(mel_spec)
    mfcc_feat = mfcc(waveform)

    # Ensure mel_spec and mfcc have the same size along the concatenation dimension
    if mel_spec.size(2) > mfcc_feat.size(2):
        mel_spec = mel_spec[:, :, :mfcc_feat.size(2)]
    elif mfcc_feat.size(2) > mel_spec.size(2):
        mfcc_feat = mfcc_feat[:, :, :mel_spec.size(2)]

    features = torch.cat((mel_spec, mfcc_feat), dim=1)
    return features.squeeze(0).transpose(0, 1)  # Transpose to [seq_len, feature_dim]

In [36]:
# Function to transcribe a single audio file using the trained model
def transcribe_audio(model, audio_file, vocab):
    model.eval()
    with torch.no_grad():
        features = preprocess_audio(audio_file)
        if features is None:
            return None
        features = features.unsqueeze(0)  # Add batch dimension
        outputs = model(features)
        decoded_output = decode_predictions(outputs, vocab)
        return decoded_output[0]

In [37]:
# Example usage
audio_file = "31-121972-0000.wav"  # Replace with the path to your audio file
transcription = transcribe_audio(model, audio_file, vocab)
print(f"Transcription: {transcription}")

Blank token index: 28
Prediction indices before clamping: tensor([29, 29, 29,  ..., 29, 29, 29])
Prediction indices after clamping: tensor([28, 28, 28,  ..., 28, 28, 28])
Transcription: A TEF IGO O PSTAE GS  TN MD TTSS TON T  TETESS M  MEGETERRT   TO ONNGTSY  GO   B WIS  OS E S FSI
