In [5]:
import torch
import torchaudio
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torchaudio.transforms import MelSpectrogram, Resample
from jiwer import wer
import numpy as np

In [6]:
def load_lists_from_files(audio_file_path='audio_files.txt', transcript_file_path='transcriptions.txt'):
    with open(audio_file_path, 'r') as af:
        audio_files = [line.strip() for line in af]

    with open(transcript_file_path, 'r') as tf:
        transcriptions = [line.strip() for line in tf]

    return audio_files, transcriptions

# Load the lists back from text files
audio_files, transcriptions = load_lists_from_files()

In [7]:
# Create a character-level vocabulary
vocab = set(''.join(transcriptions))
vocab = {char: idx for idx, char in enumerate(sorted(vocab))}
idx_to_char = {idx: char for char, idx in vocab.items()}
blank_token_idx = len(vocab)
print(vocab)

# Function to convert transcription to numerical labels
def text_to_labels(text):
    return [vocab[char] for char in text]

# Function to convert labels to text
def labels_to_text(labels):
    return ''.join([idx_to_char[idx] for idx in labels if idx in idx_to_char])

{' ': 0, "'": 1, 'A': 2, 'B': 3, 'C': 4, 'D': 5, 'E': 6, 'F': 7, 'G': 8, 'H': 9, 'I': 10, 'J': 11, 'K': 12, 'L': 13, 'M': 14, 'N': 15, 'O': 16, 'P': 17, 'Q': 18, 'R': 19, 'S': 20, 'T': 21, 'U': 22, 'V': 23, 'W': 24, 'X': 25, 'Y': 26, 'Z': 27}


In [8]:
# SpeechDataset class definition
class SpeechDataset(Dataset):
    def __init__(self, audio_files, transcriptions, sample_rate=16000):
        self.audio_files = audio_files
        self.transcriptions = transcriptions
        self.sample_rate = sample_rate
        self.resample = Resample(orig_freq=sample_rate, new_freq=16000)
        self.melspec = MelSpectrogram(sample_rate=16000, n_mels=128)
    
    def __len__(self):
        return len(self.audio_files)
    
    def __getitem__(self, idx):
        try:
            waveform, sample_rate = torchaudio.load(self.audio_files[idx])
        except Exception as e:
            print(f"Error loading file {self.audio_files[idx]}: {e}")
            return None, None  # Return None for both to handle it later
        
        waveform = self.resample(waveform)
        mel_spec = self.melspec(waveform)
        transcription = self.transcriptions[idx]
        return mel_spec.squeeze(0).transpose(0, 1), transcription  # Transpose to [seq_len, feature_dim]

In [9]:
# Function to pad sequences
def pad_sequence(batch):
    batch = [item for item in batch]  # Ensure [seq_len, feature_dim]
    batch = torch.nn.utils.rnn.pad_sequence(batch, batch_first=True)
    return batch

In [10]:
# Update the collate function to handle None entries
def collate_fn(batch):
    batch = [item for item in batch if item[0] is not None]  # Filter out None entries
    if len(batch) == 0:  # Handle the case where all items are None
        return None, None, None, None
    mel_specs = [item[0] for item in batch]
    transcriptions = [item[1] for item in batch]
    mel_specs_padded = pad_sequence(mel_specs)
    labels = [torch.tensor(text_to_labels(t)) for t in transcriptions]
    label_lengths = torch.tensor([len(label) for label in labels])
    labels_padded = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True)
    input_lengths = torch.tensor([mel_spec.size(0) for mel_spec in mel_specs_padded])
    return mel_specs_padded, labels_padded, input_lengths, label_lengths

In [11]:
# Create dataset and dataloader with the expanded dataset
dataset = SpeechDataset(audio_files, transcriptions)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)



In [12]:
# Enhanced Model with Convolutional Layer
class EnhancedSpeechModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(EnhancedSpeechModel, self).__init__()
        self.conv1 = nn.Conv1d(input_dim, 64, kernel_size=3, stride=1, padding=1)
        self.relu = nn.ReLU()
        self.lstm = nn.LSTM(64, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)  # Bidirectional LSTM

    def forward(self, x):
        x = x.transpose(1, 2)  # Change to [batch_size, feature_dim, seq_len] for Conv1d
        x = self.conv1(x)
        x = self.relu(x)
        x = x.transpose(1, 2)  # Change back to [batch_size, seq_len, feature_dim] for LSTM
        x, _ = self.lstm(x)
        x = self.fc(x)
        return x

In [13]:
# Initialize and train the model
input_dim = 128  # Number of mel bands
hidden_dim = 256
output_dim = len(vocab) + 1  # Output dimension based on the size of the vocabulary + 1 for the blank token
model = EnhancedSpeechModel(input_dim, hidden_dim, output_dim)

In [14]:
# Training with Early Stopping and Learning Rate Scheduler
def train_model(model, dataloader, num_epochs=50, learning_rate=0.001, patience=10):
    criterion = nn.CTCLoss(blank=output_dim - 1, zero_infinity=True)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=patience, factor=0.5, verbose=True)
    best_loss = float('inf')
    patience_counter = 0
    
    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0
        for batch in dataloader:
            if batch[0] is None:  # Skip if batch is None
                continue
            
            mel_specs, labels, input_lengths, label_lengths = batch
            
            if mel_specs.dim() != 3:
                print(f"Unexpected dimensions: {mel_specs.shape}")
                continue  # Skip this batch if dimensions are not as expected
            
            outputs = model(mel_specs)
            outputs = outputs.log_softmax(2)
            outputs = outputs.permute(1, 0, 2)  # (T, N, C) for CTCLoss

            loss = criterion(outputs, labels, input_lengths, label_lengths)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            epoch_loss += loss.item()
        
        epoch_loss /= len(dataloader)
        scheduler.step(epoch_loss)
        
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")
        
        # Early stopping
        if epoch_loss < best_loss:
            best_loss = epoch_loss
            patience_counter = 0
        else:
            patience_counter += 1
        
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break

In [11]:
# Train the model with the expanded dataset
train_model(model, dataloader)



Epoch [1/100], Loss: 1.9692
Epoch [2/100], Loss: 1.5000
Epoch [3/100], Loss: 1.3373
Epoch [4/100], Loss: 1.2771
Epoch [5/100], Loss: 1.2087
Epoch [6/100], Loss: 1.1666
Epoch [7/100], Loss: 1.1351
Epoch [8/100], Loss: 1.1128
Epoch [9/100], Loss: 1.0979
Epoch [10/100], Loss: 1.0843
Epoch [11/100], Loss: 1.0693
Epoch [12/100], Loss: 1.0639
Epoch [13/100], Loss: 1.0554
Epoch [14/100], Loss: 1.0510
Epoch [15/100], Loss: 1.0436
Epoch [16/100], Loss: 1.1238
Epoch [17/100], Loss: 1.8289
Epoch [18/100], Loss: 1.5339
Epoch [19/100], Loss: 1.3607
Epoch [20/100], Loss: 1.3152
Epoch [21/100], Loss: 1.3320
Epoch [22/100], Loss: 1.2814
Epoch [23/100], Loss: 1.3036
Epoch [24/100], Loss: 1.3967


KeyboardInterrupt: 

In [15]:
# Function to decode the model output into text
def decode_output(output):
    _, max_indices = torch.max(output, dim=-1)
    tokens = max_indices.unique_consecutive()
    decoded = ''.join([idx_to_char[idx.item()] for idx in tokens if idx.item() in idx_to_char])
    return decoded

In [16]:
# Calculate accuracy using Word Error Rate (WER)
def calculate_accuracy(model, dataloader):
    model.eval()
    predictions = []
    ground_truths = []
    
    with torch.no_grad():
        for batch in dataloader:
            if batch[0] is None:  # Skip if batch is None
                continue
            
            mel_specs, labels, input_lengths, label_lengths = batch
            
            outputs = model(mel_specs)
            outputs = outputs.log_softmax(2)
            outputs = outputs.permute(1, 0, 2)  # (T, N, C) for CTCLoss
            
            for i in range(outputs.size(1)):  # Iterate over batch
                decoded_output = decode_output(outputs[:, i, :])
                predictions.append(decoded_output)
                ground_truths.append(''.join([idx_to_char[idx.item()] for idx in labels[i] if idx.item() in idx_to_char]))
    
    wer_score = wer(ground_truths, predictions)
    accuracy = 1 - wer_score
    return accuracy


In [14]:
# Calculate accuracy
accuracy = calculate_accuracy(model, dataloader)
print(f"Model Accuracy: {accuracy:.4f}")

Model Accuracy: 0.0905


In [15]:
# Save the trained model
torch.save(model, 'speechtotextmodel3_with_large_data.pth')

In [17]:
# Load the trained model
model_path = 'speechtotextmodel3_with_large_data.pth'  # Adjust this path as necessary
model = torch.load(model_path)
model.eval()

EnhancedSpeechModel(
  (conv1): Conv1d(128, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (relu): ReLU()
  (lstm): LSTM(64, 256, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=512, out_features=29, bias=True)
)

In [18]:
# Define the function to preprocess audio
def preprocess_audio(file_path, sample_rate=16000):
    waveform, sr = torchaudio.load(file_path)
    resample = Resample(orig_freq=sr, new_freq=sample_rate)
    waveform = resample(waveform)
    melspec = MelSpectrogram(sample_rate=sample_rate, n_mels=128)
    mel_spec = melspec(waveform)
    return mel_spec.squeeze(0).transpose(0, 1)  # Transpose to [seq_len, feature_dim]

In [19]:
# Function to decode the output
def labels_to_text1(labels, idx_to_char1):
    return ''.join([idx_to_char1[idx] for idx in labels if idx in idx_to_char1])

In [20]:
# Load vocabulary
vocab1 = {char: idx for idx, char in enumerate(" 'ABCDEFGHIJKLMNOPQRSTUVWXYZ")}
idx_to_char1 = {idx: char for char, idx in vocab1.items()}

In [23]:
# Paths to the uploaded audio files
audio_files = ["31-121972-0000.wav", "sample2.wav"]  # Adjust paths as necessary

In [24]:
# Process and predict for each audio file
transcriptions = []
for audio_file in audio_files:
    mel_spec = preprocess_audio(audio_file)
    mel_spec = mel_spec.unsqueeze(0)  # Add batch dimension
    with torch.no_grad():
        output = model(mel_spec)
    predicted_labels = torch.argmax(output, dim=-1).squeeze().tolist()
    transcription = labels_to_text(predicted_labels)
    transcriptions.append(transcription)
    print(f"Transcription for {audio_file}: {transcription}")

Transcription for 31-121972-0000.wav: IISOONIN TTSERTHETTY GES NNEENARD  ATI IFO CNNTATO  TSYN  EN  TONCENOFFE ANS OED  SHHMTCOFSIBHENINF WWESSONS CSSEM SSII
Transcription for sample2.wav: THE  SHE A  NOT OPANDMO EFOR FE SOIMEDIN Y  ANI TO  HEE DONS AHE  AS  COD APANTH O ARRSAT ASTISIMESAS ANERR AN ATORRISHHOL SADENSETIT


In [25]:
transcriptions

['IISOONIN TTSERTHETTY GES NNEENARD  ATI IFO CNNTATO  TSYN  EN  TONCENOFFE ANS OED  SHHMTCOFSIBHENINF WWESSONS CSSEM SSII',
 'THE  SHE A  NOT OPANDMO EFOR FE SOIMEDIN Y  ANI TO  HEE DONS AHE  AS  COD APANTH O ARRSAT ASTISIMESAS ANERR AN ATORRISHHOL SADENSETIT']