In [1]:
import os

def extract_syllables_from_lpc(directory, output_file):
    """
    Extract all unique syllables from .lpc files and save them to a file.

    For every line of every ``*.lpc`` file in ``directory`` the first
    whitespace-separated token is taken, and everything from its first
    underscore onwards is dropped (``"la_1"`` -> ``"la"``). Blank or
    whitespace-only lines are skipped. The unique syllables are written to
    ``output_file`` in sorted order, one per line.

    Args:
        directory (str): Path to the directory containing .lpc files.
        output_file (str): Path to the output text file where syllables will be saved.
    """
    unique_syllables = set()

    for filename in os.listdir(directory):
        if not filename.endswith(".lpc"):
            continue
        file_path = os.path.join(directory, filename)
        # The .lpc files are read as UTF-8 elsewhere in this notebook; be
        # explicit here too so the result does not depend on the locale.
        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # Blank line: there is no first column to extract.
                    continue
                unique_syllables.add(fields[0].split('_')[0])

    # Save the unique syllables to the output file
    with open(output_file, "w", encoding="utf-8") as f:
        for syllable in sorted(unique_syllables):
            f.write(f"{syllable}\n")

    print(f"Saved {len(unique_syllables)} unique syllables to {output_file}")


# Example usage: build the syllable dictionary from every .lpc file in the
# training-video directory. Both paths are absolute and machine-specific;
# the output file is the one read back later as the vocabulary.
lpc_directory = "/scratch2/bsow/Documents/ACSR/data/training_videos/lpc"
output_file = "/scratch2/bsow/Documents/ACSR/data/training_videos/syllable_dictionary.txt"
extract_syllables_from_lpc(lpc_directory, output_file)

Saved 201 unique syllables to /scratch2/bsow/Documents/ACSR/data/training_videos/syllable_dictionary.txt


In [1]:
import os
import pandas as pd
import numpy as np
import re
import csv
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

# Load prediction CSV files from a directory, keyed by video base name
def load_csv_files(directory, filename_pattern, type="position"):
    """
    Read every CSV in ``directory`` whose name contains ``filename_pattern``.

    Rows containing any missing value are dropped. The dictionary key is the
    part of the filename between ``_{type}_`` and ``.csv``.

    Args:
        directory (str): Directory to scan.
        filename_pattern (str): Substring a filename must contain to be loaded.
        type (str): Token that precedes the base name in the filename.

    Returns:
        dict: Base name -> cleaned DataFrame.
    """
    marker = f'_{type}_'
    loaded = {}
    for entry in os.listdir(directory):
        if filename_pattern not in entry:
            continue
        frame = pd.read_csv(os.path.join(directory, entry)).dropna()
        key = entry.split(marker)[1].split('.csv')[0]
        loaded[key] = frame
    return loaded

# Load feature CSV files from a directory, keyed by video base name
def load_features(directory, filename_pattern):
    """
    Read every CSV in ``directory`` whose name contains ``filename_pattern``.

    Rows containing any missing value are dropped. The dictionary key is the
    part of the filename that precedes ``_features``.

    Args:
        directory (str): Directory to scan.
        filename_pattern (str): Substring a filename must contain to be loaded.

    Returns:
        dict: Base name -> cleaned feature DataFrame.
    """
    loaded = {}
    for entry in os.listdir(directory):
        if filename_pattern not in entry:
            continue
        table = pd.read_csv(os.path.join(directory, entry)).dropna()
        loaded[entry.split('_features')[0]] = table
    return loaded

# Map each base name to its .lpc phoneme file, keeping only files that exist
def find_phoneme_files(directory, base_names):
    """
    Locate the ``<base_name>.lpc`` file for each base name.

    Args:
        directory (str): Directory expected to contain the .lpc files.
        base_names (iterable): Base names to look up.

    Returns:
        dict: Base name -> path, for every base name whose file exists.
    """
    candidates = (
        (name, os.path.join(directory, f'{name}.lpc')) for name in base_names
    )
    return {name: path for name, path in candidates if os.path.exists(path)}


In [2]:
import os
import re
import csv
import numpy as np
import torch
import pandas as pd

# ==========================================================
# Helper Functions
# ==========================================================


def pad_sequences(sequences, max_length, pad_value=0):
    """
    Bring every sequence to exactly ``max_length`` rows.

    Shorter sequences get rows of ``pad_value`` appended; sequences that are
    already long enough are cut to their first ``max_length`` rows.

    Args:
        sequences (list): List of 2-D arrays (rows x features).
        max_length (int): Number of rows each output sequence must have.
        pad_value (int): Value used to fill the appended rows.

    Returns:
        np.ndarray: Array stacking the padded/truncated sequences.
    """
    def _fit(seq):
        missing = max_length - len(seq)
        if missing <= 0:
            return seq[:max_length]
        filler = np.full((missing, seq.shape[1]), pad_value)
        return np.vstack((seq, filler))

    return np.array([_fit(seq) for seq in sequences])


def extract_probabilities(data, columns):
    """
    Stack the selected columns of several DataFrames row-wise.

    Missing values are replaced by 0 before the columns are extracted.

    Args:
        data (list): List of DataFrames.
        columns (list): Names of the columns to extract from each DataFrame.

    Returns:
        np.ndarray: The selected columns of all DataFrames, concatenated along axis 0.
    """
    cleaned = (df.fillna(0) for df in data)
    blocks = [df[columns].to_numpy() for df in cleaned]
    return np.concatenate(blocks, axis=0)


def combine_sequences_with_padding(video_data):
    """
    Pad every video's inputs and labels to the length of the longest input.

    The target length is the largest ``len(entry["X"])`` over all videos.
    Inputs are padded through ``pad_sequences``; label lists are extended
    with the index of the " " entry of ``phoneme_to_index``.

    Args:
        video_data (dict): Video name -> {"X": 2-D array, "y": list of label indices}.

    Returns:
        tuple: (list of padded input arrays, list of padded label lists).
    """
    max_length = max(len(entry["X"]) for entry in video_data.values())

    X_padded = []
    for entry in video_data.values():
        X_padded.append(pad_sequences([entry["X"]], max_length)[0])

    y_padded = []
    for entry in video_data.values():
        labels = entry["y"]
        filler = [phoneme_to_index[" "]] * (max_length - len(labels))
        y_padded.append(labels + filler)

    return X_padded, y_padded


# ==========================================================
# Data Preparation Functions
# ==========================================================

# Load phoneme-to-index mapping
# The dictionary file is parsed with csv.reader and only the first field of
# each row is kept. A blank line would yield an empty row and make row[0]
# raise IndexError.
with open(
    r"/scratch2/bsow/Documents/ACSR/data/training_videos/syllable_dictionary.txt", "r"
) as file:
    reader = csv.reader(file)
    vocabulary_list = [row[0] for row in reader]


# Indices follow the order of the dictionary file.
phoneme_to_index = {phoneme: idx for idx, phoneme in enumerate(vocabulary_list)}
index_to_phoneme = {idx: phoneme for phoneme, idx in phoneme_to_index.items()}
# " " is appended as one extra entry after the vocabulary; it is used as the
# label-padding symbol by combine_sequences_with_padding. Each len() is taken
# before the insertion on the same line, so the order of these two statements
# relative to the dict construction above matters.
phoneme_to_index[" "] = len(phoneme_to_index)
index_to_phoneme[len(index_to_phoneme)] = " "


# Prepare data for all videos without sliding windows
def prepare_data_for_videos_no_sliding_windows(
    hand_position_data, hand_shape_data, phoneme_files, feature_files
):
    """
    Build the per-video model inputs and label sequences.

    A video is included when its base name is a key of ``hand_position_data``,
    ``phoneme_files`` and ``feature_files``. Its input matrix is the feature
    DataFrame without the 'fn_video' and 'frame_number' columns; its labels
    are the indices (via the module-level ``phoneme_to_index``) of the
    syllables listed in the .lpc file.

    The contents of ``hand_position_data`` and ``hand_shape_data`` are not
    used: merging the position/shape probabilities into the features is
    currently disabled. Only the keys of ``hand_position_data`` drive the
    iteration.

    Args:
        hand_position_data (dict): Base name -> position prediction DataFrame
            (only the keys are used).
        hand_shape_data (dict): Base name -> shape prediction DataFrame
            (unused; kept for interface compatibility).
        phoneme_files (dict): Base name -> path of the .lpc file.
        feature_files (dict): Base name -> feature DataFrame containing the
            'fn_video' and 'frame_number' columns.

    Returns:
        dict: Base name -> {"X": 2-D feature array, "y": list of label indices}.
            Syllables missing from ``phoneme_to_index`` are reported with a
            printed message and encoded as -1.

    Raises:
        ValueError: If a feature DataFrame has no 'frame_number' column.
    """
    all_videos_data = {}
    for base_name in hand_position_data:
        if base_name not in phoneme_files or base_name not in feature_files:
            continue

        feature_df = feature_files[base_name]

        # Ensure the feature DataFrame has a 'frame_number' column
        if 'frame_number' not in feature_df.columns:
            raise ValueError(f"Feature file for {base_name} does not contain 'frame_number' column.")

        # Drop the bookkeeping columns so only feature values remain
        feature_df_filtered = feature_df.drop(columns=['fn_video', 'frame_number'])

        # Read the syllable sequence: first space-separated token of each row,
        # truncated at its first underscore.
        with open(phoneme_files[base_name], "r", encoding="utf-8") as f:
            phoneme_sequence = [
                row[0].split(' ')[0].split('_')[0]
                for row in csv.reader(f)
                if row  # csv.reader yields [] for blank lines; skip them
            ]

        # Convert the syllable sequence to indices; unknown syllables become -1
        if any(phoneme not in phoneme_to_index for phoneme in phoneme_sequence):
            print(f"Unknown phoneme in {base_name}: {phoneme_sequence}")
        phoneme_indices = [
            phoneme_to_index.get(phoneme, -1) for phoneme in phoneme_sequence
        ]

        # Store the data
        all_videos_data[base_name] = {
            "X": feature_df_filtered.to_numpy(),  # Features for all frames
            "y": phoneme_indices,  # Phoneme indices
        }
    return all_videos_data

In [5]:
# Inspect the loaded hand-position predictions: a dict of DataFrames keyed by
# video base name. The variable is assigned in the "Main Script" cell, which
# must be executed before this one.
hand_position_data

{'sent_16':     frame_number  predicted_class  p_class_1  p_class_2  p_class_3  p_class_4  \
 0             17              1.0       0.16       0.03       0.73       0.08   
 1             22              1.0       0.87       0.00       0.01       0.00   
 2             26              1.0       0.00       0.00       0.00       0.00   
 3             31              1.0       0.00       0.00       0.00       0.00   
 4             37              1.0       0.00       0.00       0.00       0.00   
 5             45              1.0       0.00       0.00       0.00       0.00   
 6             55              3.0       0.00       0.00       0.00       0.00   
 7             60              3.0       0.00       0.00       0.00       0.00   
 8             67              3.0       0.00       0.00       0.00       0.00   
 9             77              5.0       0.00       0.00       0.00       0.00   
 10            97              1.0       0.00       0.00       0.00       0.00   
 11  

In [4]:
# ==========================================================
# Main Script
# ==========================================================

# Directories
data_dir = r'/scratch2/bsow/Documents/ACSR/output/predictions'
phoneme_dir = r'/scratch2/bsow/Documents/ACSR/data/training_videos/lpc'
feature_dir = r'/scratch2/bsow/Documents/ACSR/output/extracted_features'

# Optional cap on the number of videos. None uses every video; a small
# integer (e.g. 1) gives a quick debugging subset.
max_videos = None

# Load position and shape data
hand_position_data = load_csv_files(data_dir, 'predictions_rf_position', type='position')
hand_shape_data = load_csv_files(data_dir, 'predictions_rf_shape', type='shape')

# Select the videos once, by base name, so that position and shape data always
# describe the same videos. dict views cannot be sliced, hence the list.
base_names = [key for key in hand_position_data if key in hand_shape_data][:max_videos]
hand_position_data = {key: hand_position_data[key] for key in base_names}
hand_shape_data = {key: hand_shape_data[key] for key in base_names}

# Find phoneme files
phoneme_files = find_phoneme_files(phoneme_dir, base_names)

# Load feature
feature_files = load_features(feature_dir, 'features')

# Print the number of files found
print(f"Number of position files: {len(hand_position_data)}")
print(f"Number of shape files: {len(hand_shape_data)}")
print(f"Number of phoneme files: {len(phoneme_files)}")

# Prepare data
per_video_data = prepare_data_for_videos_no_sliding_windows(
    hand_position_data, hand_shape_data, phoneme_files, feature_files
)
if not per_video_data:
    raise RuntimeError(
        "No video has position, shape, phoneme and feature data; nothing to prepare."
    )
X_padded, y_padded = combine_sequences_with_padding(per_video_data)

# Ensure the inputs contain only numeric data. Stop here on failure: every
# later step needs the numeric array.
try:
    X_combined_numeric = np.array(X_padded, dtype=np.float32)
except ValueError as e:
    raise ValueError(f"Error converting X_combined to numeric: {e}") from e

# Combine all data into tensors
X_combined = torch.tensor(X_combined_numeric, dtype=torch.float32)
y_combined = torch.tensor(y_padded, dtype=torch.long)

# Final organized data
all_videos_data = {"X": X_combined, "y": y_combined}

Number of position files: 95
Number of shape files: 95
Number of phoneme files: 95


In [22]:
import torch
from torch.utils.data import DataLoader, TensorDataset

# Function to split data into training and validation sets
def train_val_split(data, train_ratio=0.9, seed=None):
    """
    Randomly split the samples into a training and a validation part.

    The input dictionary is left untouched, so calling the function again
    (e.g. re-running the cell) starts from the same data.

    Args:
        data (dict): {"X": tensor, "y": tensor} with matching first dimension.
        train_ratio (float): Fraction of samples assigned to the training set;
            the training size is ``int(num_samples * train_ratio)``.
        seed (int, optional): Seed for the shuffle. When None, the global
            torch random state is used.

    Returns:
        tuple: (train_data, val_data), each a dict with "X" and "y".
    """
    num_samples = len(data['X'])
    split_idx = int(num_samples * train_ratio)

    # A dedicated generator makes the split reproducible without touching the
    # global random state.
    generator = None
    if seed is not None:
        generator = torch.Generator().manual_seed(seed)
    indices = torch.randperm(num_samples, generator=generator)

    train_idx = indices[:split_idx]
    val_idx = indices[split_idx:]

    train_data = {
        'X': data['X'][train_idx],
        'y': data['y'][train_idx]
    }
    val_data = {
        'X': data['X'][val_idx],
        'y': data['y'][val_idx]
    }
    return train_data, val_data

# Wrap a {"X", "y"} tensor dictionary in a DataLoader
def data_to_dataloader(data, batch_size=4, shuffle=True):
    """
    Build a DataLoader yielding (inputs, labels) batches.

    Args:
        data (dict): {"X": input tensor, "y": label tensor}.
        batch_size (int): Number of samples per batch.
        shuffle (bool): Whether the loader reshuffles the samples.

    Returns:
        DataLoader: Loader over a TensorDataset pairing "X" with "y".
    """
    paired = TensorDataset(data['X'], data['y'])
    return DataLoader(paired, batch_size=batch_size, shuffle=shuffle)


# Split data
# train_val_split shuffles with torch.randperm, so the split changes between
# runs unless the torch random state is seeded beforehand.
train_data, val_data = train_val_split(all_videos_data)

# Prepare DataLoaders
# Only the training loader reshuffles its samples; validation order is fixed.
train_loader = data_to_dataloader(train_data, batch_size=4, shuffle=True)
val_loader = data_to_dataloader(val_data, batch_size=4, shuffle=False)

print("Len of train dataset", len(train_data['X']))
print("Len of val dataset", len(val_data['X']))

# Check the DataLoader output
# Sanity check on the first batch only (the loop breaks after one iteration).
for batch_X, batch_y in train_loader:
    print("Batch X shape:", batch_X.shape)
    print("Batch y shape:", batch_y.shape)
    print(batch_y)
    break

Len of train dataset 85
Len of val dataset 10
Batch X shape: torch.Size([4, 327, 28])
Batch y shape: torch.Size([4, 327])
tensor([[ 62,  10,  97,  ..., 201, 201, 201],
        [ 82,  26, 182,  ..., 201, 201, 201],
        [102, 171, 178,  ..., 201, 201, 201],
        [122, 114,  12,  ..., 201, 201, 201]])


In [27]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

# Model Definition
class CuedSpeechRNN(nn.Module):
    """
    Bidirectional LSTM followed by a per-time-step linear classifier.

    The classifier has ``output_dim + 1`` units; the extra unit is the CTC
    blank token. ``forward`` returns log-probabilities over those units.
    """

    def __init__(self, input_dim, output_dim, hidden_dim=48, num_layers=2):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            bidirectional=True,
            batch_first=True,
        )
        # Forward and backward hidden states are concatenated, hence 2 * hidden_dim.
        self.fc = nn.Linear(2 * hidden_dim, output_dim + 1)

    def forward(self, x):
        recurrent_out, _ = self.lstm(x)
        logits = self.fc(recurrent_out)
        return F.log_softmax(logits, dim=-1)

# Training Function
def train_model(model, train_loader, criterion, optimizer, num_epochs=50):
    """
    Train ``model`` with a CTC-style criterion and print the mean loss per epoch.

    Reads the module-level ``device`` and ``phoneme_to_index``. Every sample's
    input length is taken to be the full output sequence length; its target
    length is the number of labels different from the " " padding index.

    Args:
        model (nn.Module): Model returning (batch, time, classes) log-probabilities.
        train_loader (DataLoader): Yields (inputs, labels) batches.
        criterion: Loss called as criterion(log_probs, targets, input_lengths, target_lengths).
        optimizer: Optimizer over the model parameters.
        num_epochs (int): Number of passes over ``train_loader``.
    """
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for features, labels in train_loader:
            features = features.to(device)
            labels = labels.to(device)
            # A 2-D input is treated as a single sequence: add the batch dimension.
            if features.dim() == 2:
                features = features.unsqueeze(0)

            log_probs = model(features)

            # Every sample is assumed to span the whole output sequence.
            input_lengths = torch.full(
                (features.size(0),), log_probs.size(1), dtype=torch.long
            )
            # Target length = number of labels that are not padding.
            pad_index = phoneme_to_index[' ']
            target_lengths = torch.tensor(
                [len(row[row != pad_index]) for row in labels], dtype=torch.long
            )

            # The criterion expects time-major log-probabilities.
            loss = criterion(
                log_probs.transpose(0, 1), labels, input_lengths, target_lengths
            )
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            total_loss += loss.item()

        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader)}")

# Evaluation Function
def evaluate_model(model, val_loader, criterion):
    """
    Print the mean criterion value of ``model`` over ``val_loader``.

    Runs in eval mode with gradients disabled. Reads the module-level
    ``device`` and ``phoneme_to_index``; input and target lengths are derived
    exactly as in training (full output length, labels that are not " ").

    Args:
        model (nn.Module): Model returning (batch, time, classes) log-probabilities.
        val_loader (DataLoader): Yields (inputs, labels) batches.
        criterion: Loss called as criterion(log_probs, targets, input_lengths, target_lengths).
    """
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for features, labels in val_loader:
            features = features.to(device)
            labels = labels.to(device)
            # A 2-D input is treated as a single sequence: add the batch dimension.
            if features.dim() == 2:
                features = features.unsqueeze(0)

            log_probs = model(features)

            input_lengths = torch.full(
                (features.size(0),), log_probs.size(1), dtype=torch.long
            )
            pad_index = phoneme_to_index[' ']
            target_lengths = torch.tensor(
                [len(row[row != pad_index]) for row in labels], dtype=torch.long
            )

            batch_loss = criterion(
                log_probs.transpose(0, 1), labels, input_lengths, target_lengths
            )
            total_val_loss += batch_loss.item()

    print(f"Validation Loss: {total_val_loss/len(val_loader)}")

# Instantiate and Train Model
# input_dim is the size of the last axis of the prepared input tensor.
input_dim = X_combined.shape[-1]
# len(phoneme_to_index) already counts the " " padding entry; the model adds
# one further unit for the CTC blank, whose index is therefore
# len(phoneme_to_index) and is what CTCLoss is told below.
output_dim = len(phoneme_to_index)
model = CuedSpeechRNN(input_dim, output_dim)
criterion = nn.CTCLoss(blank=len(phoneme_to_index))
optimizer = optim.Adam(model.parameters(), lr=0.001)
# `device` is read as a global by train_model / evaluate_model / decode_loader,
# so it must be assigned before any of them is called.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Training on {device}")

# Train the model
train_model(model, train_loader, criterion, optimizer, num_epochs=200)

# Evaluate the model
evaluate_model(model, val_loader, criterion)

Training on cpu
Epoch 1/200, Loss: 67.96739664944735
Epoch 2/200, Loss: 8.29189978946339
Epoch 3/200, Loss: 5.486008535731923
Epoch 4/200, Loss: 4.9963651136918505
Epoch 5/200, Loss: 4.879518053748391
Epoch 6/200, Loss: 4.8767499273473565
Epoch 7/200, Loss: 4.869186011227694
Epoch 8/200, Loss: 4.850904573093761
Epoch 9/200, Loss: 4.860606930472634
Epoch 10/200, Loss: 4.8490895357998935
Epoch 11/200, Loss: 4.858245957981456
Epoch 12/200, Loss: 4.851324341513894
Epoch 13/200, Loss: 4.8643506223505195
Epoch 14/200, Loss: 4.844618840651079
Epoch 15/200, Loss: 4.849085742777044
Epoch 16/200, Loss: 4.834958033128218
Epoch 17/200, Loss: 4.854825366627086
Epoch 18/200, Loss: 4.831333550539884
Epoch 19/200, Loss: 4.827303019436923
Epoch 20/200, Loss: 4.841424270109697
Epoch 21/200, Loss: 4.84000284021551
Epoch 22/200, Loss: 4.82599156553095
Epoch 23/200, Loss: 4.825075734745372
Epoch 24/200, Loss: 4.829496361992576
Epoch 25/200, Loss: 4.845784143968062
Epoch 26/200, Loss: 4.8297295570373535
Epo

In [28]:
# Re-print the validation loss of the trained model (same call as at the end
# of the training cell).
evaluate_model(model, val_loader, criterion)

Validation Loss: 5.295373439788818


In [29]:
def greedy_decoder(output, blank):
    """
    Greedy (best-path) decoding of per-time-step class scores.

    For each sequence the highest-scoring class is taken at every time step;
    consecutive repeats are collapsed and blank tokens are removed.

    Args:
        output (torch.Tensor): Scores of shape (batch_size, sequence_length, num_classes).
        blank (int): Index of the blank token.

    Returns:
        list: One list of class indices per sequence in the batch.
    """
    best_paths = torch.argmax(output, dim=2).tolist()
    decodes = []
    for path in best_paths:
        collapsed = []
        last_label = None
        for label in path:
            # Keep a label only when it starts a new run and is not the blank.
            if label != last_label and label != blank:
                collapsed.append(label)
            last_label = label
        decodes.append(collapsed)
    return decodes


def decode_loader(model, loader, blank, index_to_phoneme):
    """
    Greedy-decode every batch of a DataLoader and collect the reference sequences.

    Reads the module-level ``device``. Reference sequences omit labels equal
    to ``blank`` and labels that map to " ".

    Args:
        model (torch.nn.Module): Trained model.
        loader (torch.utils.data.DataLoader): Yields (inputs, labels) batches.
        blank (int): Index of the blank token.
        index_to_phoneme (dict): Mapping from indices to phonemes.

    Returns:
        tuple: (decoded_sequences, true_sequences), where:
            - decoded_sequences: List of decoded phoneme sequences.
            - true_sequences: List of true phoneme sequences.
    """
    model.eval()
    all_decoded_sequences = []
    all_true_sequences = []

    with torch.no_grad():
        for inputs, labels in loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            # Predicted index sequences -> phoneme sequences
            for indices in greedy_decoder(model(inputs), blank=blank):
                all_decoded_sequences.append(
                    [index_to_phoneme[idx] for idx in indices]
                )

            # Reference label rows -> phoneme sequences without blank / padding
            for label_row in labels.tolist():
                phonemes = []
                for idx in label_row:
                    if idx != blank and index_to_phoneme[idx] != " ":
                        phonemes.append(index_to_phoneme[idx])
                all_true_sequences.append(phonemes)

    return all_decoded_sequences, all_true_sequences


# Example usage
# The blank index must be the same one given to nn.CTCLoss when training.
blank_token = len(phoneme_to_index)  # Index of the blank token
# The training loader shuffles, so the order of the training sequences differs
# between runs; decoded and true sequences stay aligned with each other.
decoded_train_sequences, true_train_sequences = decode_loader(model, train_loader, blank_token, index_to_phoneme)
decoded_val_sequences, true_val_sequences = decode_loader(model, val_loader, blank_token, index_to_phoneme)

# Print results
# Note: these print every sequence in full, which produces very long output.
print("Decoded training phoneme sequences:", decoded_train_sequences)
print("True training phoneme sequences:", true_train_sequences)
print("Decoded validation phoneme sequences:", decoded_val_sequences)
print("True validation phoneme sequences:", true_val_sequences)



Decoded training phoneme sequences: [['la', 'p', 'ɥi', 'ʁ', 'wɑ', 'ɑ̃', 't'], ['pu', 'v', 'zɛ̃', 'ɛ', 'ʁə', 'pɛ̃', 'sa', 't'], ['mɔ̃', 'p', 'ʁɔ', 'ʁ', 't'], ['a', 't'], ['bɛ', 't', 'ʁ', 'pɛ̃', 'ɑ̃', 't'], ['dɛ', 't'], ['le', 'la', 'ka', 'do', 'pu', 'a', 't'], ['t'], ['sə', 'fɛ', 'lø', 'lø', 't'], ['la', 'p', 'lø', 'tɛ', 'də', 't'], ['i', 'p', 't'], ['pu', 'ɡo', 'n', 't', 't'], ['t'], ['i', 't'], ['pu', 'ʁa', 'ɡ', 'za', 'mɛ̃', 'ɛ', 'ma', 'nə', 't'], ['i', 't'], ['ʒə', 'a', 't'], ['t'], ['le', 'p', 'tɛ', 'kœ', 'ji', 'ʁɛ̃', 'də', 't'], ['a', 't'], ['ʁ'], ['le', 'nɔ', 'z', 'sɑ̃', 'lɛ', 'ze', 't'], ['dɛ', 't'], ['i', 't', 'ʁə', 'ze', 'na', 'tɛ', 'e'], ['la', 's', 'sɔ', 's', 't', 'ʁ', 't'], ['la', 'ya', 'sɔ', 'a', 't'], ['pu', 't'], ['le', 'ʁ'], ['lø', 't'], ['i', 'la', 'fɑ̃', 'də', 'sœ', 'ʁɔ̃', 'də', 't'], ['pu', 'zɔ̃', 'lɛ̃', 't'], ['la', 'də', 'sɔ̃', 'a', 'ti', 'a', 't'], ['la', 'də', 'sə', 't'], ['t'], ['a', 'p', 'də', 't'], ['i', 'a', 'i', 'di', 'fi', 't'], ['le', 'k', 't'], ['bɛ', 'kœ'

In [30]:
import jiwer

def calculate_per_with_jiwer(decoded_sequences, true_sequences):
    """
    Compute the Phoneme Error Rate (PER) with ``jiwer.wer``.

    Each sequence is turned into one space-separated string, so every
    sequence element is scored as one "word". The true sequences are passed
    as the reference and the decoded sequences as the hypothesis.

    Args:
        decoded_sequences (list): List of decoded phoneme sequences.
        true_sequences (list): List of true phoneme sequences.

    Returns:
        float: Phoneme Error Rate (PER).
    """
    hypotheses = list(map(" ".join, decoded_sequences))
    references = list(map(" ".join, true_sequences))
    return jiwer.wer(references, hypotheses)

# Example usage
# Scores the sequences produced by decode_loader; the error rate is computed
# over whole syllable tokens, and "1 - PER" is printed next to it.
train_per = calculate_per_with_jiwer(decoded_train_sequences, true_train_sequences)
val_per = calculate_per_with_jiwer(decoded_val_sequences, true_val_sequences)

print("Training PER (jiwer):", train_per, "1 - PER: ", 1 - train_per)
print("Validation PER (jiwer):", val_per, "1 - PER: ", 1 - val_per)

Training PER (jiwer): 0.84232868405094 1 - PER:  0.15767131594906003
Validation PER (jiwer): 0.9512195121951219 1 - PER:  0.04878048780487809
