In [1]:
import torch
import torchaudio
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torchaudio.transforms import MelSpectrogram, Resample

In [2]:
# Audio clips to train on; each entry is paired by position with a transcript below.
audio_files = [
    "sample1.wav",
    "sample2.wav",
]

# Reference transcript for each clip, in the same order as audio_files.
transcriptions = [
    "IN SPITE OF UDOLPHO AND THE DRESSMAKER HOWEVER THE PARTY FROM PULTENEY STREET REACHED THE UPPER ROOMS IN VERY GOOD TIME THE THORPES AND JAMES MORLAND WERE THERE ONLY TWO MINUTES BEFORE THEM",
    "BUT SHE HAD NOT LOOKED ROUND LONG BEFORE SHE SAW HIM LEADING A YOUNG LADY TO THE DANCE AH HE HAS GOT A PARTNER I WISH HE HAD ASKED YOU SAID MISSUS ALLEN AND AFTER A SHORT SILENCE SHE ADDED",
]

In [3]:
class SpeechDataset(Dataset):
    """Pairs audio files with transcriptions and serves mel-spectrogram features.

    Each clip is mixed down to a single channel and resampled from the rate
    reported by ``torchaudio.load`` to ``TARGET_SAMPLE_RATE`` before the mel
    spectrogram is computed, so every item is a 2-D ``[seq_len, feature_dim]``
    tensor regardless of the file's channel count or sample rate.

    Args:
        audio_files: Sequence of paths passed to ``torchaudio.load``.
        transcriptions: Sequence of transcripts; ``transcriptions[i]`` is
            returned together with the features of ``audio_files[i]``.
        sample_rate: Sample rate the files are expected to have. The resampler
            for this rate is built once up front; a file that reports a
            different rate gets a resampler built for its real rate.
    """

    TARGET_SAMPLE_RATE = 16000  # rate at which the mel spectrogram is computed
    N_MELS = 128  # number of mel bands, i.e. the feature dimension of each frame

    def __init__(self, audio_files, transcriptions, sample_rate=16000):
        self.audio_files = audio_files
        self.transcriptions = transcriptions
        self.sample_rate = sample_rate
        self.resample = Resample(orig_freq=sample_rate, new_freq=self.TARGET_SAMPLE_RATE)
        self.melspec = MelSpectrogram(sample_rate=self.TARGET_SAMPLE_RATE, n_mels=self.N_MELS)
        # Resamplers for files whose real rate differs from `sample_rate`,
        # keyed by source rate so each one is only constructed once.
        self._extra_resamplers = {}

    def __len__(self):
        """Return the number of audio files."""
        return len(self.audio_files)

    def _resampler_for(self, source_rate):
        """Return a transform converting `source_rate` audio to the target rate."""
        if source_rate == self.sample_rate:
            return self.resample
        if source_rate not in self._extra_resamplers:
            self._extra_resamplers[source_rate] = Resample(
                orig_freq=source_rate, new_freq=self.TARGET_SAMPLE_RATE
            )
        return self._extra_resamplers[source_rate]

    def __getitem__(self, idx):
        """Load clip `idx` and return ``(mel_features, transcription)``."""
        waveform, file_rate = torchaudio.load(self.audio_files[idx])

        # Average the channels of multi-channel audio; otherwise squeeze(0)
        # below would leave a 3-D tensor and the transpose would swap the
        # wrong axes.
        if waveform.size(0) > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        # Resample from the rate the file actually has, not the rate assumed
        # at construction time.
        waveform = self._resampler_for(file_rate)(waveform)

        mel_spec = self.melspec(waveform)
        # Drop the channel axis and transpose to [seq_len, feature_dim].
        return mel_spec.squeeze(0).transpose(0, 1), self.transcriptions[idx]

In [4]:
# def pad_sequence(batch):
#     batch = [item.transpose(0, 1) for item in batch]
#     batch = torch.nn.utils.rnn.pad_sequence(batch, batch_first=True)
#     batch = batch.transpose(1, 2)  # Restore original dimensions
#     return batch

# def pad_sequence(batch):
#     batch = [item[0] for item in batch]
#     batch = torch.nn.utils.rnn.pad_sequence(batch, batch_first=True)
#     return batch

# def pad_sequence(batch):
#     batch = [item[0].transpose(0, 1) for item in batch]
#     batch = torch.nn.utils.rnn.pad_sequence(batch, batch_first=True)
#     return batch

def pad_sequence(batch):
    """Stack variable-length sequences into one batch-first padded tensor.

    Args:
        batch: Iterable of tensors, each expected as [seq_len, feature_dim].

    Returns:
        The tensor produced by ``torch.nn.utils.rnn.pad_sequence`` with
        ``batch_first=True``.
    """
    sequences = list(batch)
    return torch.nn.utils.rnn.pad_sequence(sequences, batch_first=True)

In [6]:
# def collate_fn(batch):
#     mel_specs = [item[0].squeeze(0) for item in batch]
#     transcriptions = [item[1] for item in batch]
#     mel_specs_padded = pad_sequence(mel_specs)
#     return mel_specs_padded, transcriptions

# def collate_fn(batch):
#     mel_specs = [item[0] for item in batch]
#     transcriptions = [item[1] for item in batch]
#     mel_specs_padded = pad_sequence(mel_specs)
#     return mel_specs_padded, transcriptions

def collate_fn(batch):
    """Collate (features, transcription) samples into one batch.

    Args:
        batch: Sequence of samples; element 0 of each sample is the feature
            tensor and element 1 is its transcription.

    Returns:
        A pair ``(padded_features, transcriptions)`` where the features have
        been combined by ``pad_sequence`` and the transcriptions are returned
        as a list in sample order.
    """
    features = []
    texts = []
    for sample in batch:
        features.append(sample[0])
        texts.append(sample[1])
    return pad_sequence(features), texts

In [7]:
# Define a simple model architecture
class SimpleSpeechModel(nn.Module):
    """LSTM over a batch-first sequence, followed by a linear layer on one frame per sequence.

    Args:
        input_dim: Feature size of each input frame.
        hidden_dim: Hidden-state size of the LSTM.
        output_dim: Size of the vector produced for each sequence.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x, lengths=None):
        """Run the LSTM and project one time step per sequence.

        Args:
            x: Input of shape [batch, seq_len, input_dim].
            lengths: Optional per-sequence count of real (unpadded) frames,
                as a 1-D tensor or sequence of ints with one entry per batch
                row, each in ``1..seq_len``. When given, the LSTM output at
                each sequence's last real frame is used, so padding appended
                after a short sequence does not influence its result. When
                omitted, the final time step of every row is used.

        Returns:
            Tensor of shape [batch, output_dim].

        Raises:
            ValueError: If `lengths` does not have one entry per batch row or
                contains a value outside ``1..seq_len``.
        """
        x, _ = self.lstm(x)

        if lengths is None:
            last_step = x[:, -1, :]
        else:
            lengths = torch.as_tensor(lengths, dtype=torch.long, device=x.device)
            if lengths.dim() != 1 or lengths.size(0) != x.size(0):
                raise ValueError("lengths must contain exactly one entry per batch row")
            if bool((lengths < 1).any()) or bool((lengths > x.size(1)).any()):
                raise ValueError("every length must be between 1 and seq_len")
            rows = torch.arange(x.size(0), device=x.device)
            # The LSTM is unidirectional, so its output at frame t depends only
            # on frames <= t; indexing at lengths - 1 ignores trailing padding.
            last_step = x[rows, lengths - 1]

        return self.fc(last_step)

In [8]:
# Initialize the model (training happens in a later cell)
# Seed the global RNG first so weight initialisation, and anything downstream
# that draws from the global RNG, repeats across Restart & Run All.
torch.manual_seed(0)
input_dim = 128  # Number of mel bands; must match the feature size of the dataset's frames
hidden_dim = 256
output_dim = 10  # Example output dimension, adjust based on your transcription encoding
model = SimpleSpeechModel(input_dim, hidden_dim, output_dim)

In [9]:
def train_model(model, dataloader, num_epochs=10, learning_rate=0.001):
    """Train `model` with Adam and cross-entropy loss on placeholder labels.

    NOTE: the transcriptions yielded by `dataloader` are not used yet. Each
    batch is given dummy class labels ``0, 1, 2, ...`` (wrapped modulo the
    number of model outputs), which reproduces ``[0, 1]`` for a batch of two.
    Replace this with a real encoding of the transcriptions.

    Args:
        model: Module mapping a 3-D feature batch to ``[batch, num_classes]`` scores.
        dataloader: Iterable yielding ``(mel_specs, transcriptions)`` batches.
        num_epochs: Number of passes over `dataloader`.
        learning_rate: Learning rate handed to Adam.

    After each epoch the mean loss over the batches that were processed is
    printed; if no batch was processed a notice is printed instead.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    model.train()  # make sure train-time behaviour is active

    for epoch in range(num_epochs):
        loss_sum = 0.0
        batches_seen = 0

        for mel_specs, _transcriptions in dataloader:
            if mel_specs.dim() != 3:
                print(f"Unexpected dimensions: {mel_specs.shape}")
                continue  # Skip this batch if dimensions are not as expected

            outputs = model(mel_specs)

            # Dummy labels sized to the actual batch, so batches of any size
            # work (a fixed two-element label tensor only fits batch_size=2).
            labels = torch.arange(outputs.size(0), device=outputs.device) % outputs.size(1)
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            loss_sum += loss.item()
            batches_seen += 1

        # Guard against an empty loader or every batch being skipped, where
        # there is no loss value to report.
        if batches_seen:
            print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss_sum / batches_seen:.4f}")
        else:
            print(f"Epoch [{epoch+1}/{num_epochs}], no batches were processed")

In [10]:
# Wrap the clip/transcript lists in a dataset, then batch it in shuffled pairs
# using collate_fn to combine the samples of each batch.
dataset = SpeechDataset(audio_files=audio_files, transcriptions=transcriptions)
dataloader = DataLoader(
    dataset=dataset,
    batch_size=2,
    shuffle=True,
    collate_fn=collate_fn,
)



In [11]:
# Run training with the function's default number of epochs and learning rate.
train_model(model=model, dataloader=dataloader)

Epoch [1/10], Loss: 2.2982
Epoch [2/10], Loss: 2.2673
Epoch [3/10], Loss: 2.2362
Epoch [4/10], Loss: 2.2031
Epoch [5/10], Loss: 2.1658
Epoch [6/10], Loss: 2.1202
Epoch [7/10], Loss: 2.0590
Epoch [8/10], Loss: 1.9649
Epoch [9/10], Loss: 1.7768
Epoch [10/10], Loss: 1.0158
