In [11]:
import os

def extract_phonemes_from_lpc(directory, output_file):
    """
    Extract all unique phonemes from .lpc files and save them to a file.
    
    Args:
        directory (str): Path to the directory containing .lpc files.
        output_file (str): Path to the output file (CSV or text) where phonemes will be saved.
    """
    # Initialize a set to store unique phonemes
    unique_phonemes = set()

    # Iterate through all .lpc files in the directory
    for filename in os.listdir(directory):
        if filename.endswith(".lpc"):
            file_path = os.path.join(directory, filename)
            with open(file_path, "r") as f:
                for line in f:
                    # Extract the phoneme-syllable pair (first column)
                    phoneme_syllable = line.strip().split()[0]
                    # Split by underscore and extract the phoneme
                    syllabe = list(phoneme_syllable.split('_')[0])
                    if len(syllabe) == 3:
                        unique_phonemes.add(syllabe[0])
                        unique_phonemes.add("".join(syllabe[1:]))
                    else:
                        for phoneme in syllabe:
                            unique_phonemes.add(phoneme)

    # Save the unique phonemes to the output file
    with open(output_file, "w") as f:
        for phoneme in sorted(unique_phonemes):
            f.write(f"{phoneme}\n")

    print(f"Saved {len(unique_phonemes)} unique phonemes to {output_file}")


# Example usage
lpc_directory = "/scratch2/bsow/Documents/ACSR/data/training_videos/lpc"
output_file = "/scratch2/bsow/Documents/ACSR/data/training_videos/phoneme_dictionary.txt"
extract_phonemes_from_lpc(lpc_directory, output_file)

Saved 36 unique phonemes to /scratch2/bsow/Documents/ACSR/data/training_videos/phoneme_dictionary.txt


In [10]:
import os
import pandas as pd
import numpy as np
import re
import csv
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

# Load CSV files from a directory based on a filename pattern
def load_csv_files(directory, filename_pattern, type="position"):
    files_data = {}
    for filename in os.listdir(directory):
        if filename_pattern in filename:
            df = pd.read_csv(os.path.join(directory, filename))
            df.dropna(inplace=True)
            base_name = filename.split(f'_{type}_')[1].split('.csv')[0]
            files_data[base_name] = df
    return files_data

# Load features from .npy files based on a filename pattern
def load_features(directory, filename_pattern):
    files_data = {}
    for filename in os.listdir(directory):
        if filename_pattern in filename:
            features = pd.read_csv(os.path.join(directory, filename))
            features.dropna(inplace=True)
            base_name = filename.split('_features')[0]
            files_data[base_name] = features
    return files_data

# Find corresponding phoneme files based on the base names of position filenames
def find_phoneme_files(directory, base_names):
    phoneme_files = {}
    for base_name in base_names:
        phoneme_file = os.path.join(directory, f'{base_name}.lpc')
        if os.path.exists(phoneme_file):
            phoneme_files[base_name] = phoneme_file
    return phoneme_files


In [20]:
import os
import re
import csv
import cv2
import numpy as np
import torch
import pandas as pd
import librosa
from praatio import textgrid as tgio
from tqdm import tqdm

# ==========================================================
# Helper Functions
# ==========================================================

def pad_sequences(sequences, max_length, pad_value=0):
    """
    Pad sequences to the maximum length.

    Args:
        sequences (list): List of sequences to pad.
        max_length (int): Maximum length to pad to.
        pad_value (int): Value to use for padding.

    Returns:
        np.ndarray: Padded sequences.
    """
    padded_sequences = []
    for seq in sequences:
        if len(seq) < max_length:
            padding = np.full((max_length - len(seq), seq.shape[1]), pad_value)
            padded_seq = np.vstack((seq, padding))
        else:
            padded_seq = seq[:max_length]
        padded_sequences.append(padded_seq)
    return np.array(padded_sequences)

def combine_sequences_with_padding(video_data):
    """
    Combine sequences with padding to ensure uniform length.

    Args:
        video_data (dict): Dictionary containing video data.

    Returns:
        tuple: Padded input sequences (X_student_hand_shape, X_student_hand_pos, X_student_lips, X_teacher) and padded labels (y).
    """
    max_length = max(len(video_data[video]["X_student_hand_shape"]) for video in video_data)
    
    # Pad hand shape features
    X_student_hand_shape_padded = [
        pad_sequences([video_data[video]["X_student_hand_shape"]], max_length)[0] for video in video_data
    ]
    
    # Pad hand position features
    X_student_hand_pos_padded = [
        pad_sequences([video_data[video]["X_student_hand_pos"]], max_length)[0] for video in video_data
    ]
    
    # Pad lip features
    X_student_lips_padded = [
        pad_sequences([video_data[video]["X_student_lips"]], max_length)[0] for video in video_data
    ]
    
    # Pad teacher features
    X_teacher_padded = [
        pad_sequences([video_data[video]["X_teacher"]], max_length)[0] for video in video_data
    ]
    
    # Pad labels
    y_padded = [
        video_data[video]["y"]
        + [phoneme_to_index[" "]] * (max_length - len(video_data[video]["y"]))
        for video in video_data
    ]
    
    return X_student_hand_shape_padded, X_student_hand_pos_padded, X_student_lips_padded, X_teacher_padded, y_padded

def compute_log_mel_spectrogram(audio_path, sr=16000, n_fft=400, hop_length=160, n_mels=161):
    """
    Compute the log-mel spectrogram for an audio file.

    Args:
        audio_path (str): Path to the audio file.
        sr (int): Sample rate.
        n_fft (int): FFT window size.
        hop_length (int): Hop length for STFT.
        n_mels (int): Number of mel bands.

    Returns:
        np.ndarray: Log-mel spectrogram of shape (num_frames, n_mels).
    """
    # Load audio
    audio, _ = librosa.load(audio_path, sr=sr)

    # Compute mel spectrogram
    mel_spectrogram = librosa.feature.melspectrogram(
        y=audio, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels
    )

    # Convert to log scale
    log_mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)

    # Transpose to (num_frames, n_mels)
    log_mel_spectrogram = log_mel_spectrogram.T

    return log_mel_spectrogram

def parse_textgrid(textgrid_path):
    """
    Parse a TextGrid file to extract phoneme-level intervals.

    Args:
        textgrid_path (str): Path to the TextGrid file.

    Returns:
        list: List of (start_time, end_time, phoneme) tuples.
    """
    tg = tgio.openTextgrid(textgrid_path, includeEmptyIntervals=False)
    phone_tier = tg.getTier("phones")
    return [(start, end, label) for start, end, label in phone_tier.entries]

def get_phoneme_labels_for_frames(phoneme_intervals, num_frames, fps):
    """
    Map phoneme intervals to video frames.

    Args:
        phoneme_intervals (list): List of (start_time, end_time, phoneme) tuples.
        num_frames (int): Total number of video frames.
        fps (int): Frame rate of the video.

    Returns:
        list: Phoneme labels for each frame.
    """
    phoneme_labels = []
    for frame_idx in range(num_frames):
        frame_time = frame_idx / fps
        phoneme = " "  # Default to silence/space
        for start, end, label in phoneme_intervals:
            if start <= frame_time < end:
                phoneme = label
                break
        phoneme_labels.append(phoneme)
    return phoneme_labels

# Load phoneme-to-index mapping
with open(
    r"/scratch2/bsow/Documents/ACSR/data/training_videos/phoneme_dictionary.txt", "r"
) as file:
    reader = csv.reader(file)
    vocabulary_list = [row[0] for row in reader]

phoneme_to_index = {phoneme: idx for idx, phoneme in enumerate(vocabulary_list)}
index_to_phoneme = {idx: phoneme for phoneme, idx in phoneme_to_index.items()}
phoneme_to_index[" "] = len(phoneme_to_index)
index_to_phoneme[len(index_to_phoneme)] = " "

def load_coordinates(directory, base_name):
    """
    Load pre-extracted coordinates from a CSV file.

    Args:
        directory (str): Directory containing the coordinate files.
        base_name (str): Base name of the video (e.g., 'sent_01').

    Returns:
        pd.DataFrame: DataFrame containing the coordinates.
    """
    file_path = os.path.join(directory, f"{base_name}_coordinates.csv")
    df = pd.read_csv(file_path)
    df.dropna(inplace=True)  # Drop rows with NaN values
    return df

def prepare_data_for_videos_no_sliding_windows(
    hand_position_data, phoneme_files, audio_dir, textgrid_dir, video_dir, coordinates_dir
):
    """
    Prepare data for all videos without sliding windows.

    Args:
        hand_position_data (dict): Dictionary of hand position data.
        phoneme_files (dict): Dictionary of phoneme file paths.
        audio_dir (str): Directory containing audio files.
        textgrid_dir (str): Directory containing TextGrid files.
        video_dir (str): Directory containing video files.
        coordinates_dir (str): Directory containing pre-extracted coordinate files.

    Returns:
        dict: Dictionary containing combined features, spectrograms, and phoneme indices.
    """
    all_videos_data = {}
    for base_name in hand_position_data:
        if base_name in phoneme_files:
            # Load pre-extracted coordinates
            coordinates_df = load_coordinates(coordinates_dir, base_name)
            if 'frame_number' not in coordinates_df.columns:
                raise ValueError(f"Coordinate file for {base_name} does not contain 'frame_number' column.")
            frame_numbers = coordinates_df['frame_number'].values

            # Separate coordinates into hand shape, hand position, and lip landmarks
            hand_shape_columns = [f"hand_x{i}" for i in range(21)] + [f"hand_y{i}" for i in range(21)] + [f"hand_z{i}" for i in range(21)]
            hand_pos_columns = ["hand_pos_x", "hand_pos_y", "hand_pos_z"]
            lip_columns = [f"lip_x{i}" for i in range(40)] + [f"lip_y{i}" for i in range(40)] + [f"lip_z{i}" for i in range(40)]

            X_student_hand_shape = coordinates_df[hand_shape_columns].to_numpy()
            X_student_hand_pos = coordinates_df[hand_pos_columns].to_numpy()
            X_student_lips = coordinates_df[lip_columns].to_numpy()

            # Load audio and compute spectrogram
            audio_path = os.path.join(audio_dir, f"{base_name}.wav")
            log_mel_spectrogram = compute_log_mel_spectrogram(audio_path)

            # Load TextGrid and get phoneme labels for each frame
            textgrid_path = os.path.join(textgrid_dir, f"{base_name}.TextGrid")
            phoneme_intervals = parse_textgrid(textgrid_path)

            # Get video FPS
            video_path = os.path.join(video_dir, f"{base_name}.mp4")
            cap = cv2.VideoCapture(video_path)
            fps = int(cap.get(cv2.CAP_PROP_FPS))
            cap.release()

            # Map phoneme labels to frames
            phoneme_labels = get_phoneme_labels_for_frames(phoneme_intervals, len(frame_numbers), fps)

            # Convert phoneme labels to indices
            phoneme_indices = [phoneme_to_index.get(phoneme, -1) for phoneme in phoneme_labels]

            # Combine features, spectrogram, and phoneme indices
            all_videos_data[base_name] = {
                "X_student_hand_shape": X_student_hand_shape,  # Hand shape coordinates
                "X_student_hand_pos": X_student_hand_pos,      # Hand position coordinates
                "X_student_lips": X_student_lips,              # Lip landmarks
                "X_teacher": log_mel_spectrogram,              # Audio features (log-mel spectrogram)
                "y": phoneme_indices,                          # Phoneme labels (shared)
            }
    return all_videos_data

In [21]:
# Directories
data_dir = r'/scratch2/bsow/Documents/ACSR/output/predictions'
phoneme_dir = r'/scratch2/bsow/Documents/ACSR/data/training_videos/lpc'
audio_dir = r'/scratch2/bsow/Documents/ACSR/data/training_videos/audio'
textgrid_dir = r'/scratch2/bsow/Documents/ACSR/data/training_videos/textgrids'
video_dir = r'/scratch2/bsow/Documents/ACSR/data/training_videos/videos'
coordinates_dir = r'/scratch2/bsow/Documents/ACSR/output/extracted_coordinates_old'

# Load position and shape data
hand_position_data = load_csv_files(data_dir, 'predictions_rf_position', type='position')

# Find phoneme files
base_names = hand_position_data.keys()
phoneme_files = find_phoneme_files(phoneme_dir, base_names)

# Prepare data
all_videos_data = prepare_data_for_videos_no_sliding_windows(
    hand_position_data, phoneme_files, audio_dir, textgrid_dir, video_dir, coordinates_dir
)

# Combine sequences with padding
X_student_hand_shape_padded, X_student_hand_pos_padded, X_student_lips_padded, X_teacher_padded, y_padded = combine_sequences_with_padding(all_videos_data)

# Convert to PyTorch tensors
X_student_hand_shape_tensor = torch.tensor(X_student_hand_shape_padded, dtype=torch.float32)
X_student_hand_pos_tensor = torch.tensor(X_student_hand_pos_padded, dtype=torch.float32)
X_student_lips_tensor = torch.tensor(X_student_lips_padded, dtype=torch.float32)
X_teacher_tensor = torch.tensor(X_teacher_padded, dtype=torch.float32)
y_tensor = torch.tensor(y_padded, dtype=torch.long)

# Final organized data
all_videos_data = {
    "X_student_hand_shape": X_student_hand_shape_tensor,  # Hand shape coordinates
    "X_student_hand_pos": X_student_hand_pos_tensor,      # Hand position coordinates
    "X_student_lips": X_student_lips_tensor,              # Lip landmarks
    "X_teacher": X_teacher_tensor,                        # Audio features (log-mel spectrogram)
    "y": y_tensor,                                        # Phoneme labels
}

Number of position files: 95
Number of shape files: 95
Number of phoneme files: 95


In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset

# Function to split data into training and validation sets
def train_val_split(data, train_ratio=0.9):
    """
    Split data into training and validation sets.

    Args:
        data (dict): Dictionary containing the dataset.
        train_ratio (float): Proportion of data to use for training.

    Returns:
        tuple: Two dictionaries for training and validation data.
    """
    num_samples = len(data['X_student_hand_shape'])
    split_idx = int(num_samples * train_ratio)
    
    # Randomize the data
    indices = torch.randperm(num_samples)
    
    # Split hand shape features
    X_student_hand_shape = data['X_student_hand_shape'][indices]
    X_student_hand_shape_train = X_student_hand_shape[:split_idx]
    X_student_hand_shape_val = X_student_hand_shape[split_idx:]
    
    # Split hand position features
    X_student_hand_pos = data['X_student_hand_pos'][indices]
    X_student_hand_pos_train = X_student_hand_pos[:split_idx]
    X_student_hand_pos_val = X_student_hand_pos[split_idx:]
    
    # Split lip features
    X_student_lips = data['X_student_lips'][indices]
    X_student_lips_train = X_student_lips[:split_idx]
    X_student_lips_val = X_student_lips[split_idx:]
    
    # Split teacher features
    X_teacher = data['X_teacher'][indices]
    X_teacher_train = X_teacher[:split_idx]
    X_teacher_val = X_teacher[split_idx:]
    
    # Split labels
    y = data['y'][indices]
    y_train = y[:split_idx]
    y_val = y[split_idx:]
    
    # Create train and validation data dictionaries
    train_data = {
        'X_student_hand_shape': X_student_hand_shape_train,
        'X_student_hand_pos': X_student_hand_pos_train,
        'X_student_lips': X_student_lips_train,
        'X_teacher': X_teacher_train,
        'y': y_train
    }
    val_data = {
        'X_student_hand_shape': X_student_hand_shape_val,
        'X_student_hand_pos': X_student_hand_pos_val,
        'X_student_lips': X_student_lips_val,
        'X_teacher': X_teacher_val,
        'y': y_val
    }
    
    return train_data, val_data


# Convert data to DataLoader format
def data_to_dataloader(data, batch_size=4, shuffle=True):
    """
    Convert data into PyTorch DataLoader format.

    Args:
        data (dict): Dictionary containing the dataset.
        batch_size (int): Batch size for the DataLoader.
        shuffle (bool): Whether to shuffle the data.

    Returns:
        DataLoader: PyTorch DataLoader object.
    """
    X_student_hand_shape_tensors = data['X_student_hand_shape']
    X_student_hand_pos_tensors = data['X_student_hand_pos']
    X_student_lips_tensors = data['X_student_lips']
    X_teacher_tensors = data['X_teacher']
    y_tensors = data['y']
    
    # Create a TensorDataset with inputs and labels
    dataset = TensorDataset(
        X_student_hand_shape_tensors,
        X_student_hand_pos_tensors,
        X_student_lips_tensors,
        X_teacher_tensors,
        y_tensors
    )
    
    # Create a DataLoader
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
    return dataloader


# Split data
train_data, val_data = train_val_split(all_videos_data)

# Prepare DataLoaders
train_loader = data_to_dataloader(train_data, batch_size=4, shuffle=True)
val_loader = data_to_dataloader(val_data, batch_size=4, shuffle=False)

print("Len of train dataset", len(train_data['X_student_hand_shape']))
print("Len of val dataset", len(val_data['X_student_hand_shape']))

# Check the DataLoader output
for batch_X_student_hand_shape, batch_X_student_hand_pos, batch_X_student_lips, batch_X_teacher, batch_y in train_loader:
    print("Batch X_student_hand_shape shape:", batch_X_student_hand_shape.shape)
    print("Batch X_student_hand_pos shape:", batch_X_student_hand_pos.shape)
    print("Batch X_student_lips shape:", batch_X_student_lips.shape)
    print("Batch X_teacher shape:", batch_X_teacher.shape)
    print("Batch y shape:", batch_y.shape)
    print(batch_y)
    break

# Pretraining of a deep speech model



In [16]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import librosa
import pandas as pd

class IPADataset(Dataset):
    def __init__(self, manifest_file, alphabet_file, sample_rate=16000, n_mels=80):
        self.manifest = pd.read_csv(manifest_file, header=None)
        with open(alphabet_file, "r") as f:
            self.alphabet = f.read().splitlines()
        self.sample_rate = sample_rate
        self.n_mels = n_mels

    def __len__(self):
        return len(self.manifest)

    def __getitem__(self, idx):
        audio_path, ipa_path = self.manifest.iloc[idx]
        
        # Load audio and compute mel spectrogram
        audio, _ = librosa.load(audio_path, sr=self.sample_rate)
        mel_spec = librosa.feature.melspectrogram(
            y=audio, sr=self.sample_rate, n_mels=self.n_mels
        )
        mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
        
        # Load IPA transcription
        with open(ipa_path, "r") as f:
            ipa = f.read().strip()
        
        # Convert IPA to indices
        ipa_indices = [self.alphabet.index(c) for c in ipa.split()]
        
        return torch.tensor(mel_spec, dtype=torch.float32), torch.tensor(ipa_indices, dtype=torch.long)

In [22]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class DeepSpeech2(nn.Module):
    def __init__(self, num_classes, hidden_size=1024, num_layers=5):
        super(DeepSpeech2, self).__init__()
        
        # 2D Convolutional Layers
        self.conv1 = nn.Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.conv2 = nn.Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        
        # Bidirectional LSTM Layers
        self.lstm = nn.LSTM(
            input_size=32,  # Output size of the last convolutional layer
            hidden_size=hidden_size,
            num_layers=num_layers,
            bidirectional=True,
            batch_first=True,
        )
        
        # Fully Connected Layer
        self.fc = nn.Linear(hidden_size * 2, num_classes)  # *2 for bidirectional LSTM
        
    def forward(self, x):
        # Input shape: (batch_size, 1, num_mel_bins, time_steps)
        
        # Convolutional Layers
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        
        # Reshape for LSTM: (batch_size, time_steps, num_features)
        x = x.permute(0, 3, 1, 2)  # Move time_steps to the second dimension
        x = x.reshape(x.size(0), x.size(1), -1)  # Flatten the last two dimensions
        
        # LSTM Layers
        x, _ = self.lstm(x)
        
        # Fully Connected Layer
        x = self.fc(x)
        
        # Output shape: (batch_size, time_steps, num_classes)
        return x

In [23]:
import torch
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    """
    Collate function to pad mel spectrograms and IPA indices to the same length.
    Args:
        batch: List of tuples (mel_spec, ipa_indices).
    Returns:
        Padded mel spectrograms and IPA indices.
    """
    # Separate mel spectrograms and IPA indices
    mel_specs, ipa_indices = zip(*batch)
    
    # Pad mel spectrograms to the same length
    mel_specs_padded = pad_sequence(mel_specs, batch_first=True, padding_value=0)
    
    # Pad IPA indices to the same length
    ipa_indices_padded = pad_sequence(ipa_indices, batch_first=True, padding_value=0)
    
    return mel_specs_padded, ipa_indices_padded

dataset = IPADataset(manifest_file="/scratch2/bsow/Documents/ACSR/data/train.csv", alphabet_file="/scratch2/bsow/Documents/ACSR/data/training_videos/phoneme_dictionary.txt")
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

In [24]:
import torch.optim as optim
from torch.nn import CTCLoss

# Initialize model, optimizer, and loss function
model = DeepSpeech2(num_classes=len(dataset.alphabet))
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = CTCLoss()

# Training loop
for epoch in range(10):  # Number of epochs
    for batch_idx, (mel_spec, ipa_indices) in enumerate(dataloader):
        # Forward pass
        outputs = model(mel_spec.unsqueeze(1))  # Add channel dimension
        outputs = outputs.permute(1, 0, 2)  # CTC expects (time, batch, num_classes)
        
        # Compute CTC loss
        input_lengths = torch.full((mel_spec.size(0),), outputs.size(0), dtype=torch.long)
        target_lengths = torch.tensor([len(ipa) for ipa in ipa_indices], dtype=torch.long)
        loss = criterion(outputs, ipa_indices, input_lengths, target_lengths)
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        print(f"Epoch [{epoch+1}/10], Batch [{batch_idx+1}/{len(dataloader)}], Loss: {loss.item()}")

RuntimeError: stack expects each tensor to be equal size, but got [80, 124] at entry 0 and [80, 210] at entry 1

In [27]:
from coqui_stt import Model

# Load the teacher model
teacher_model_path = "/scratch2/bsow/Documents/ACSR/data/models/kenlm.scorer"
alphabet_path = "/scratch2/bsow/Documents/ACSR/data/models/alphabet.txt"

# Initialize the teacher model
teacher_model = Model(teacher_model_path)
teacher_model.enableExternalScorer(teacher_model_path)

Training on cpu
Epoch 1/200, Loss: 67.96739664944735
Epoch 2/200, Loss: 8.29189978946339
Epoch 3/200, Loss: 5.486008535731923
Epoch 4/200, Loss: 4.9963651136918505
Epoch 5/200, Loss: 4.879518053748391
Epoch 6/200, Loss: 4.8767499273473565
Epoch 7/200, Loss: 4.869186011227694
Epoch 8/200, Loss: 4.850904573093761
Epoch 9/200, Loss: 4.860606930472634
Epoch 10/200, Loss: 4.8490895357998935
Epoch 11/200, Loss: 4.858245957981456
Epoch 12/200, Loss: 4.851324341513894
Epoch 13/200, Loss: 4.8643506223505195
Epoch 14/200, Loss: 4.844618840651079
Epoch 15/200, Loss: 4.849085742777044
Epoch 16/200, Loss: 4.834958033128218
Epoch 17/200, Loss: 4.854825366627086
Epoch 18/200, Loss: 4.831333550539884
Epoch 19/200, Loss: 4.827303019436923
Epoch 20/200, Loss: 4.841424270109697
Epoch 21/200, Loss: 4.84000284021551
Epoch 22/200, Loss: 4.82599156553095
Epoch 23/200, Loss: 4.825075734745372
Epoch 24/200, Loss: 4.829496361992576
Epoch 25/200, Loss: 4.845784143968062
Epoch 26/200, Loss: 4.8297295570373535
Epo

In [28]:
evaluate_model(model, val_loader, criterion)

Validation Loss: 5.295373439788818


In [29]:
def greedy_decoder(output, blank):
    """
    Decode model outputs using a greedy decoder.

    Args:
        output (torch.Tensor): Model outputs of shape (batch_size, sequence_length, num_classes).
        blank (int): Index of the blank token.

    Returns:
        list: List of decoded sequences.
    """
    arg_maxes = torch.argmax(output, dim=2)  # Get the most likely class for each time step
    decodes = []
    for args in arg_maxes:
        decode = []
        previous_idx = None
        for index in args:
            if index != blank and (previous_idx is None or index != previous_idx):
                decode.append(index.item())  # Append non-blank and non-repeated tokens
            previous_idx = index
        decodes.append(decode)
    return decodes


def decode_loader(model, loader, blank, index_to_phoneme):
    """
    Decode outputs for all batches in a DataLoader and return both decoded and true sequences.

    Args:
        model (torch.nn.Module): Trained model.
        loader (torch.utils.data.DataLoader): DataLoader containing input data and labels.
        blank (int): Index of the blank token.
        index_to_phoneme (dict): Mapping from indices to phonemes.

    Returns:
        tuple: (decoded_sequences, true_sequences), where:
            - decoded_sequences: List of decoded phoneme sequences.
            - true_sequences: List of true phoneme sequences.
    """
    model.eval()  # Set the model to evaluation mode
    all_decoded_sequences = []
    all_true_sequences = []

    with torch.no_grad():  # Disable gradient computation
        for X_batch, y_batch in loader:  # Iterate over batches (X_batch: inputs, y_batch: labels)
            X_batch = X_batch.to(device)  # Move inputs to device
            y_batch = y_batch.to(device)  # Move labels to device
            outputs = model(X_batch)  # Get model predictions
            decoded_phoneme_sequences = greedy_decoder(outputs, blank=blank)  # Decode outputs
            decoded_phonemes = [[index_to_phoneme[idx] for idx in sequence] for sequence in decoded_phoneme_sequences]  # Convert indices to phonemes
            all_decoded_sequences.extend(decoded_phonemes)  # Add to the list of decoded sequences

            # Convert true labels to phoneme sequences
            true_phoneme_sequences = [[index_to_phoneme[idx.item()] for idx in sequence if idx != blank and 
                                       index_to_phoneme[idx.item()] != " "] for sequence in y_batch]
            all_true_sequences.extend(true_phoneme_sequences)  # Add to the list of true sequences

    return all_decoded_sequences, all_true_sequences


# Example usage
blank_token = len(phoneme_to_index)  # Index of the blank token
decoded_train_sequences, true_train_sequences = decode_loader(model, train_loader, blank_token, index_to_phoneme)
decoded_val_sequences, true_val_sequences = decode_loader(model, val_loader, blank_token, index_to_phoneme)

# Print results
print("Decoded training phoneme sequences:", decoded_train_sequences)
print("True training phoneme sequences:", true_train_sequences)
print("Decoded validation phoneme sequences:", decoded_val_sequences)
print("True validation phoneme sequences:", true_val_sequences)



Decoded training phoneme sequences: [['la', 'p', 'ɥi', 'ʁ', 'wɑ', 'ɑ̃', 't'], ['pu', 'v', 'zɛ̃', 'ɛ', 'ʁə', 'pɛ̃', 'sa', 't'], ['mɔ̃', 'p', 'ʁɔ', 'ʁ', 't'], ['a', 't'], ['bɛ', 't', 'ʁ', 'pɛ̃', 'ɑ̃', 't'], ['dɛ', 't'], ['le', 'la', 'ka', 'do', 'pu', 'a', 't'], ['t'], ['sə', 'fɛ', 'lø', 'lø', 't'], ['la', 'p', 'lø', 'tɛ', 'də', 't'], ['i', 'p', 't'], ['pu', 'ɡo', 'n', 't', 't'], ['t'], ['i', 't'], ['pu', 'ʁa', 'ɡ', 'za', 'mɛ̃', 'ɛ', 'ma', 'nə', 't'], ['i', 't'], ['ʒə', 'a', 't'], ['t'], ['le', 'p', 'tɛ', 'kœ', 'ji', 'ʁɛ̃', 'də', 't'], ['a', 't'], ['ʁ'], ['le', 'nɔ', 'z', 'sɑ̃', 'lɛ', 'ze', 't'], ['dɛ', 't'], ['i', 't', 'ʁə', 'ze', 'na', 'tɛ', 'e'], ['la', 's', 'sɔ', 's', 't', 'ʁ', 't'], ['la', 'ya', 'sɔ', 'a', 't'], ['pu', 't'], ['le', 'ʁ'], ['lø', 't'], ['i', 'la', 'fɑ̃', 'də', 'sœ', 'ʁɔ̃', 'də', 't'], ['pu', 'zɔ̃', 'lɛ̃', 't'], ['la', 'də', 'sɔ̃', 'a', 'ti', 'a', 't'], ['la', 'də', 'sə', 't'], ['t'], ['a', 'p', 'də', 't'], ['i', 'a', 'i', 'di', 'fi', 't'], ['le', 'k', 't'], ['bɛ', 'kœ'

In [30]:
import jiwer

def calculate_per_with_jiwer(decoded_sequences, true_sequences):
    """
    Calculate the Phoneme Error Rate (PER) using jiwer.

    Args:
        decoded_sequences (list): List of decoded phoneme sequences.
        true_sequences (list): List of true phoneme sequences.

    Returns:
        float: Phoneme Error Rate (PER).
    """
    # Convert phoneme sequences to space-separated strings
    decoded_str = [" ".join(seq) for seq in decoded_sequences]
    true_str = [" ".join(seq) for seq in true_sequences]

    # Calculate PER using jiwer
    per = jiwer.wer(true_str, decoded_str)
    return per

# Example usage
train_per = calculate_per_with_jiwer(decoded_train_sequences, true_train_sequences)
val_per = calculate_per_with_jiwer(decoded_val_sequences, true_val_sequences)

print("Training PER (jiwer):", train_per, "1 - PER: ", 1 - train_per)
print("Validation PER (jiwer):", val_per, "1 - PER: ", 1 - val_per)

Training PER (jiwer): 0.84232868405094 1 - PER:  0.15767131594906003
Validation PER (jiwer): 0.9512195121951219 1 - PER:  0.04878048780487809
