Abderrahmane Balah - 201932370 - Section 1

This notebook has code related to developing an Encoder-Decoder model (without attention). The second notebook has the encoder-decoder model adapted to become attention-based.

# ENCODER-DECODER for Sign Language Interpretation

# Data Preprocessing

In [4]:
import torch
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torchvision import transforms
from PIL import Image
import os

class SignLanguageDataset(Dataset):
    def __init__(self, root_dir, caption_mapping, vocab, transform=None, data=None):
        self.root_dir = root_dir
        self.caption_mapping = caption_mapping
        self.vocab = vocab
        self.transform = transform
        if(data is None):
          self.data = self._load_data()
        else:
          self.data = data

    def _load_data(self):
        data = []
        for folder_name in sorted(os.listdir(self.root_dir)):
            folder_path = os.path.join(self.root_dir, folder_name)
            for video_folder in os.listdir(folder_path):
                video_folder_path = os.path.join(folder_path, video_folder)
                frames = [os.path.join(video_folder_path, frame) for frame in sorted(os.listdir(video_folder_path))]
                caption = self.caption_mapping[folder_name]
                numericalized_caption = numericalize(caption, self.vocab)
                data.append((frames, numericalized_caption))
        return data

    def __getitem__(self, idx):
        frames_paths, numericalized_caption = self.data[idx]
        frames = [Image.open(frame_path).convert('RGB') for frame_path in frames_paths]

        if self.transform:
            frames = [self.transform(frame) for frame in frames]
        else:
            to_tensor = transforms.ToTensor()
            frames = [to_tensor(frame) for frame in frames]

        frames_tensor = torch.stack(frames)
        caption_tensor = torch.tensor(numericalized_caption)
        return frames_tensor, caption_tensor

    def __len__(self):
        return len(self.data)

    def subset(self, indices):
        subset_data = [self.data[i] for i in indices]
        return SignLanguageDataset(self.root_dir, self.caption_mapping, self.vocab, self.transform, subset_data)

def load_captions(captions_file):
    with open(captions_file, 'r', encoding='utf-8') as file:
        captions = file.read().splitlines()
    # Create a mapping from folder number to caption
    caption_mapping = {str(i).zfill(4): caption for i, caption in enumerate(captions, 1)}
    return caption_mapping

def build_vocab(captions):
    tokenized_captions = [caption.lower().split() for caption in captions]
    vocab = build_vocab_from_iterator(tokenized_captions, specials=['<unk>', '<pad>', '<sos>', '<eos>'])
    vocab.set_default_index(vocab['<unk>'])
    return vocab

def numericalize(caption, vocab):
    return [vocab['<sos>']] + [vocab[token] for token in caption.lower().split()] + [vocab['<eos>']]

def collate_fn(batch):
    # Separate frames and captions
    frames, captions = zip(*batch)
    # Pad the captions to the same length
    captions_padded = pad_sequence(captions, batch_first=True, padding_value=vocab['<eos>'])
    # Stack frames
    frames_stacked = torch.stack(frames)
    return frames_stacked, captions_padded

# Load the data with the collate function
def get_data_loader(root_dir, caption_mapping, vocab, batch_size, transform, shuffle=True, num_workers=4):
    dataset = SignLanguageDataset(root_dir=root_dir, caption_mapping=caption_mapping, vocab=vocab, transform=transform)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers, collate_fn=collate_fn)



In [15]:
caption_mapping = load_captions('../input/assignment3/Assignment03/groundTruth.txt')
all_captions = list(caption_mapping.values())
vocab = build_vocab(all_captions)

train_transform = transforms.Compose([
    transforms.Resize((256, 256)),  
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  
])
test_transform = transforms.Compose([
    transforms.Resize((256, 256)), 
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 
])

test_data_loader = get_data_loader(
    root_dir='../input/assignment3/Assignment03/test/',
    caption_mapping=caption_mapping,
    vocab=vocab,
    batch_size=1,
    transform=test_transform,
    shuffle=True,
    num_workers=2
)


In [18]:
from sklearn.model_selection import train_test_split

full_test_dataset = SignLanguageDataset(root_dir='../input/assignment3/Assignment03/train/',
                                        caption_mapping=caption_mapping,
                                        vocab=vocab,
                                        transform=train_transform)

captions = [str(full_test_dataset.data[i][1]) for i in range(len(full_test_dataset))]

train_indices, test_indices = train_test_split(range(len(full_test_dataset)),
                                               stratify=captions,
                                               test_size=0.2)  

train_dataset = full_test_dataset.subset(train_indices)
val_dataset = full_test_dataset.subset(test_indices)

train_data_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, num_workers=2, collate_fn=collate_fn)
val_data_loader = DataLoader(val_dataset, batch_size=1, shuffle=False, num_workers=2, collate_fn=collate_fn)

# Model Building

In [19]:
import torch
import torch.nn as nn
import torchvision.models as models
from torchvision.models.mobilenetv2 import MobileNet_V2_Weights

## Encoder

In [20]:
class Encoder(nn.Module):
    def __init__(self, hidden_size, num_layers=1, num_frames=80):
        super(Encoder, self).__init__()
        self.num_frames = num_frames

        # CNN Part to get the features of every frame
        mobilenet_weights = MobileNet_V2_Weights.DEFAULT
        mobilenet = models.mobilenet_v2(weights=mobilenet_weights)
        modules = list(mobilenet.children())[:-1]  # Remove the classifier layer to just take the features
        self.mobilenet = nn.Sequential(*modules)

        # Freeze the MobileNet weights
        for param in self.mobilenet.parameters():
            param.requires_grad = False

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))

        # LSTM Part
        self.lstm = nn.LSTM(mobilenet.last_channel*8*8, hidden_size, num_layers, batch_first=True)


    def forward(self, images):
        # images shape: (batch_size, num_frames, C, H, W)
        batch_size, _, C, H, W = images.size()
        assert self.num_frames == images.size(1), "Each video should have 80 frames"

        cnn_in = images.view(batch_size * self.num_frames, C, H, W)

        # Pass each frame through the CNN
        features = self.mobilenet(cnn_in)
        #convert it from (1280, 8, 8) into (1280*8*8)
        features = features.view(batch_size, self.num_frames, -1)

        # Pass the sequence of features through the LSTM
        lstm_out, (hidden_states, cell_states) = self.lstm(features)

        # Retrieve the final hidden state for the last time step
        final_hidden_states = hidden_states[-1]
        final_cell_states = cell_states[-1]
        return final_hidden_states, final_cell_states


## Decoder

In [21]:
import torch
import torch.nn as nn

class Decoder(nn.Module):
    def __init__(self, vocab, embed_size, hidden_size, vocab_size, dropout=0.5, max_seq_length=20, teacher_forcing_ratio = 1):
        super(Decoder, self).__init__()
        self.vocab = vocab
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.num_layers = 1
        self.num_directions = 1
        self.lstm = nn.LSTMCell(input_size=embed_size, hidden_size=hidden_size)
        self.fc_out = nn.Linear(in_features=hidden_size, out_features=vocab_size)
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size)
        self.dropout = nn.Dropout(dropout)
        self.max_seq_length = max_seq_length

    def forward(self, captions, hidden_state, cell_state, state = 'test'):
        batch_size = captions.size(0)
        hidden_state = hidden_state.view(batch_size, self.hidden_size)
        cell_state = cell_state.view(batch_size, self.hidden_size)

        max_forward_length = self.max_seq_length if state=='test' else captions.size(1)

        # Tensor to store decoder outputs
        outputs = torch.zeros(batch_size, max_forward_length, self.vocab_size).to(captions.device)

        input = captions[:, 0]

        for t in range(0, max_forward_length):
            captions_embed = self.embed(input)
            hidden_state, cell_state = self.lstm(captions_embed, (hidden_state, cell_state))
            output = self.fc_out(self.dropout(hidden_state))
            outputs[:, t, :] = output
            if self.training:
                # Pass the correct output for training purposes
                input = captions[:, t+1] if t+1<captions.size(1) else None
            else: 
                # Get the highest predicted word from the previous output for evaluation
                top1 = output.argmax(1)
                input = top1
        return outputs[:, :t, :]

## Validation Function

In [22]:
def validate_model(encoder, decoder, val_data_loader, criterion):
    encoder.eval()
    decoder.eval()
    total_val_loss = 0
    with torch.no_grad():
        for frames, captions in val_data_loader:
            frames, captions = frames.to(device), captions.to(device)
            hidden_state, cell_state = encoder(frames)
            outputs = decoder(captions[:, :], hidden_state, cell_state, state = 'val')
            loss = criterion(outputs.view(-1, outputs.size(2)), captions[:, 1:].contiguous().view(-1))
            total_val_loss += loss.item()
    return total_val_loss / len(val_data_loader)

## Train Loop

In [23]:
import torch
import torch.nn as nn
import torch.optim as optim

def train(encoder, decoder, encoder_filename, decoder_filename):
  encoder.to(device)
  decoder.to(device)

  # Define loss function and optimizers
  criterion = nn.CrossEntropyLoss(ignore_index=vocab['<pad>'])
  parameters = list(encoder.parameters()) + list(decoder.parameters())
  optimizer = optim.Adam(parameters, lr=0.0001)

  # Training loop
  num_epochs = 10  # Set the number of epochs
  best_val_loss = float('inf')
  for epoch in range(num_epochs):
      encoder.train()
      decoder.train()
      total_loss = 0

      for batch_idx, (frames, captions) in enumerate(train_data_loader):
          frames, captions = frames.to(device), captions.to(device)

          # Zero the gradients
          optimizer.zero_grad()

          # Forward pass through the encoder
          hidden_state, cell_state = encoder(frames)

          # Forward pass through the decoder
          outputs = decoder(captions[:, :], hidden_state, cell_state, state = 'train')

          # Calculate the loss
          loss = criterion(outputs.reshape(-1, outputs.size(2)), captions[:, 1:].contiguous().view(-1))
          total_loss += loss.item()

          # Backward pass and optimize
          loss.backward()
          optimizer.step()


          if batch_idx % 10 == 0:  # Print loss every 10 batches
              print(f"Epoch [{epoch+1}/{num_epochs}], Step [{batch_idx+1}/{len(train_data_loader)}], Loss: {loss.item():.4f}")

      # Print epoch loss
      print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_data_loader):.4f}')

      # Validation phase
      val_loss = validate_model(encoder, decoder, val_data_loader, criterion)
      print(f'Epoch [{epoch+1}/{num_epochs}], Validation Loss: {val_loss:.4f}')

      # Check for early stopping
      if val_loss < best_val_loss:
          print(f'Saving Model! {val_loss} < {best_val_loss}')
          best_val_loss = val_loss

          torch.save(encoder.state_dict(), encoder_filename)
          torch.save(decoder.state_dict(), decoder_filename)

Version 1: hidden_size = 256

In [20]:
# Initialize models
encoder = Encoder(hidden_size=256)
decoder = Decoder(vocab = vocab, embed_size=256, hidden_size=256, vocab_size=len(vocab), dropout=0.2)

# Check if CUDA is available and move models to GPU if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train(encoder, decoder, "encoder_v1.pth", "decoder_v1.pth")

Epoch [1/10], Step [1/107], Loss: 3.5619

Epoch [1/10], Step [11/107], Loss: 3.4287

Epoch [1/10], Step [21/107], Loss: 3.1737

Epoch [1/10], Step [31/107], Loss: 2.9002

Epoch [1/10], Step [41/107], Loss: 2.4725

Epoch [1/10], Step [51/107], Loss: 2.5141

Epoch [1/10], Step [61/107], Loss: 2.6058

Epoch [1/10], Step [71/107], Loss: 2.4384

Epoch [1/10], Step [81/107], Loss: 2.2886

Epoch [1/10], Step [91/107], Loss: 1.8492

Epoch [1/10], Step [101/107], Loss: 1.9550

Epoch [1/10], Loss: 2.7199

Epoch [1/10], Validation Loss: 2.6794

Saving Model! 2.679369380540937 < inf

Epoch [2/10], Step [1/107], Loss: 2.5408

Epoch [2/10], Step [11/107], Loss: 1.9782

Epoch [2/10], Step [21/107], Loss: 1.8790

Epoch [2/10], Step [31/107], Loss: 1.8405

Epoch [2/10], Step [41/107], Loss: 2.2384

Epoch [2/10], Step [51/107], Loss: 1.7083

Epoch [2/10], Step [61/107], Loss: 1.8776

Epoch [2/10], Step [71/107], Loss: 2.1819

Epoch [2/10], Step [81/107], Loss: 1.7495

Epoch [2/10], Step [91/107], Loss: 

Version 2: hidden_size = 512

In [21]:
# Initialize models
encoder = Encoder(hidden_size=512)
decoder = Decoder(vocab = vocab, embed_size=256, hidden_size=512, vocab_size=len(vocab), dropout=0.2)

# Check if CUDA is available and move models to GPU if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train(encoder, decoder, "encoder_v2.pth", "decoder_v2.pth")

Epoch [1/10], Step [1/107], Loss: 3.5421

Epoch [1/10], Step [11/107], Loss: 3.0934

Epoch [1/10], Step [21/107], Loss: 2.2285

Epoch [1/10], Step [31/107], Loss: 2.3400

Epoch [1/10], Step [41/107], Loss: 1.9373

Epoch [1/10], Step [51/107], Loss: 2.0582

Epoch [1/10], Step [61/107], Loss: 2.5282

Epoch [1/10], Step [71/107], Loss: 2.8516

Epoch [1/10], Step [81/107], Loss: 2.1324

Epoch [1/10], Step [91/107], Loss: 2.1633

Epoch [1/10], Step [101/107], Loss: 2.4900

Epoch [1/10], Loss: 2.4035

Epoch [1/10], Validation Loss: 2.4307

Saving Model! 2.4307285302153256 < inf

Epoch [2/10], Step [1/107], Loss: 1.5851

Epoch [2/10], Step [11/107], Loss: 1.4747

Epoch [2/10], Step [21/107], Loss: 1.3182

Epoch [2/10], Step [31/107], Loss: 1.6758

Epoch [2/10], Step [41/107], Loss: 2.2986

Epoch [2/10], Step [51/107], Loss: 1.4403

Epoch [2/10], Step [61/107], Loss: 1.6296

Epoch [2/10], Step [71/107], Loss: 1.7627

Epoch [2/10], Step [81/107], Loss: 2.1049

Epoch [2/10], Step [91/107], Loss:

Version 3: hidden_size = 1024

In [24]:
# Initialize models
encoder = Encoder(hidden_size=1024)
decoder = Decoder(vocab = vocab, embed_size=256, hidden_size=1024, vocab_size=len(vocab), dropout=0.2)

# Check if CUDA is available and move models to GPU if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train(encoder, decoder, "encoder_v3.pth", "decoder_v3.pth")

Downloading: "https://download.pytorch.org/models/mobilenet_v2-7ebf99e0.pth" to /root/.cache/torch/hub/checkpoints/mobilenet_v2-7ebf99e0.pth
100%|██████████| 13.6M/13.6M [00:00<00:00, 115MB/s] 


Epoch [1/10], Step [1/107], Loss: 3.5667
Epoch [1/10], Step [11/107], Loss: 3.1204
Epoch [1/10], Step [21/107], Loss: 2.6253
Epoch [1/10], Step [31/107], Loss: 2.9908
Epoch [1/10], Step [41/107], Loss: 1.7633
Epoch [1/10], Step [51/107], Loss: 2.9671
Epoch [1/10], Step [61/107], Loss: 2.1424
Epoch [1/10], Step [71/107], Loss: 1.5163
Epoch [1/10], Step [81/107], Loss: 1.6924
Epoch [1/10], Step [91/107], Loss: 1.8779
Epoch [1/10], Step [101/107], Loss: 1.6087
Epoch [1/10], Loss: 2.2485
Epoch [1/10], Validation Loss: 1.8915
Saving Model! 1.891468714330798 < inf
Epoch [2/10], Step [1/107], Loss: 1.1728
Epoch [2/10], Step [11/107], Loss: 1.8985
Epoch [2/10], Step [21/107], Loss: 1.0882
Epoch [2/10], Step [31/107], Loss: 1.1745
Epoch [2/10], Step [41/107], Loss: 1.4282
Epoch [2/10], Step [51/107], Loss: 1.1387
Epoch [2/10], Step [61/107], Loss: 1.1892
Epoch [2/10], Step [71/107], Loss: 1.3850
Epoch [2/10], Step [81/107], Loss: 0.7379
Epoch [2/10], Step [91/107], Loss: 1.2019
Epoch [2/10], St

Therefore, Version 3 is the best one, with 1024 hidden size.

## Testing on testing set

In [25]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder = Encoder(hidden_size=1024).to(device)
decoder = Decoder(vocab = vocab, embed_size=256, hidden_size=1024, vocab_size=len(vocab), dropout=0.2).to(device)

encoder.load_state_dict(torch.load('encoder_v3.pth',map_location=device))
decoder.load_state_dict(torch.load('decoder_v3.pth',map_location=device))

encoder.eval()
decoder.eval()

Decoder(
  (vocab): Vocab()
  (lstm): LSTMCell(256, 1024)
  (fc_out): Linear(in_features=1024, out_features=34, bias=True)
  (embed): Embedding(34, 256)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [28]:
def generate_caption(decoder, vocab, hidden_state, cell_state, device, max_length=20):
    start_token_idx = vocab.lookup_indices(['<sos>'])[0]  # Fetch the index for <sos>
    sequence = torch.tensor([[start_token_idx]], dtype=torch.long).to(device)

    with torch.no_grad():
        outputs = decoder(sequence, hidden_state, cell_state, state='test')

    # Convert output indices to words, excluding <sos> and <eos>
    predicted_indices = outputs.squeeze(0).argmax(dim=1).tolist()
    caption = indices_to_string(predicted_indices, vocab)

    return caption


def indices_to_string(indices, vocab):
    if isinstance(indices, torch.Tensor):
        indices = indices.tolist()

    # Convert each index to its corresponding word
    words = [vocab.get_itos()[idx] for idx in indices]

    # Exclude special tokens like <start>, <end>, and <pad> from the final sentence
    special_tokens = {'<sos>', '<eos>', '<pad>'}
    filtered_words = [word for word in words if word not in special_tokens]

    # Join the words to form a sentence
    sentence = ' '.join(filtered_words)

    return sentence



In [29]:
!pip install jiwer

Collecting jiwer
  Obtaining dependency information for jiwer from https://files.pythonhosted.org/packages/0d/4f/ee537ab20144811dd99321735ff92ef2b3a3230b77ed7454bed4c44d21fc/jiwer-3.0.3-py3-none-any.whl.metadata
  Downloading jiwer-3.0.3-py3-none-any.whl.metadata (2.6 kB)
Downloading jiwer-3.0.3-py3-none-any.whl (21 kB)
Installing collected packages: jiwer
Successfully installed jiwer-3.0.3


In [30]:
import jiwer

def calculate_wer(reference, hypothesis):
    return jiwer.wer(reference, hypothesis)

total_wer = 0
num_samples = 0

for frames, captions in test_data_loader:
    frames = frames.to(device)
    captions = captions.to(device)

    with torch.no_grad():
        hidden_states, cell_states = encoder(frames)

    hidden = hidden_states[0].unsqueeze(0)
    cell = cell_states[0].unsqueeze(0)

    true_caption_indices = captions[0]
    generated_caption = generate_caption(decoder, vocab, hidden, cell, device)

    # Convert true caption indices to string
    true_caption = indices_to_string(true_caption_indices, vocab)

    # Calculate and accumulate WER
    wer = calculate_wer(true_caption, generated_caption)
    total_wer += wer
    num_samples += 1

    print(f"True Caption: {true_caption}")
    print(f"Generated Caption: {generated_caption}")
    print(f"WER: {wer}\n")

# Calculate average WER across the test set
average_wer = total_wer / num_samples
print(f"Average WER on the Test Set: {average_wer}")




True Caption: السلام عليكم رحمة الله بركة
Generated Caption: السلام عليكم رحمة الله بركة
WER: 0.0

True Caption: جميع الصم العرب السامع
Generated Caption: السلام عليكم رحمة الله بركة
WER: 1.25

True Caption: الله اكبر
Generated Caption: لا شرك الله
WER: 1.5

True Caption: موضوع دراسة لغة الاشارة العربية
Generated Caption: كلمات اليوم متفرقة في الدين
WER: 1.0

True Caption: اليوم اقدم انتم برنامج اخر
Generated Caption: السلام عليكم رحمة الله بركة
WER: 1.0

True Caption: لا شرك الله
Generated Caption: كلمات اليوم متفرقة في الدين
WER: 1.6666666666666667

True Caption: ايضا كلمات عادية
Generated Caption: كلمات اليوم متفرقة في الدين
WER: 1.6666666666666667

True Caption: السلام عليكم رحمة الله بركة
Generated Caption: السلام عليكم رحمة الله بركة
WER: 0.0

True Caption: موضوع دراسة لغة الاشارة العربية
Generated Caption: كلمات اليوم متفرقة في الدين
WER: 1.0

True Caption: اليوم اقدم انتم برنامج اخر
Generated Caption: كلمات اليوم متفرقة في الدين
WER: 1.0

True Caption: اليوم اقدم انتم برنامج اخ

## Average WER on the Test Set: 1.0699494949494948

This appears to be very bad results. Although we are performing well on both the training and validation set, it seems the testing set is giving bad results. this could be due to the vast difference in distribution between train and test sets.

In [32]:
total_wer = 0
num_samples = 0

for frames, captions in val_data_loader:
    frames = frames.to(device)
    captions = captions.to(device)

    with torch.no_grad():
        hidden_states, cell_states = encoder(frames)

    hidden = hidden_states[0].unsqueeze(0)
    cell = cell_states[0].unsqueeze(0)

    true_caption_indices = captions[0]
    generated_caption = generate_caption(decoder, vocab, hidden, cell, device)

    # Convert true caption indices to string
    true_caption = indices_to_string(true_caption_indices, vocab)

    # Calculate and accumulate WER
    wer = calculate_wer(true_caption, generated_caption)
    total_wer += wer
    num_samples += 1

    print(f"True Caption: {true_caption}")
    print(f"Generated Caption: {generated_caption}")
    print(f"WER: {wer}\n")

# Calculate average WER across the test set
average_wer = total_wer / num_samples
print(f"Average WER on the Validation Set: {average_wer}")

True Caption: اليوم اقدم انتم برنامج اخر
Generated Caption: اليوم اقدم انتم برنامج اخر
WER: 0.0

True Caption: جميع الصم العرب السامع
Generated Caption: جميع الصم العرب السامع
WER: 0.0

True Caption: اسم الله
Generated Caption: اسم الله
WER: 0.0

True Caption: ايضا كلمات عادية
Generated Caption: ايضا كلمات عادية
WER: 0.0

True Caption: الحمد الله
Generated Caption: الحمد الله
WER: 0.0

True Caption: الله اكبر
Generated Caption: الله اكبر
WER: 0.0

True Caption: موضوع دراسة لغة الاشارة العربية
Generated Caption: موضوع دراسة لغة الاشارة العربية
WER: 0.0

True Caption: الحمد الله
Generated Caption: الحمد الله
WER: 0.0

True Caption: الله اكبر
Generated Caption: الله اكبر
WER: 0.0

True Caption: الله اكبر
Generated Caption: الله اكبر
WER: 0.0

True Caption: الله اكبر
Generated Caption: الله اكبر
WER: 0.0

True Caption: لا شرك الله
Generated Caption: لا شرك الله
WER: 0.0

True Caption: اليوم اقدم انتم برنامج اخر
Generated Caption: اليوم اقدم انتم برنامج اخر
WER: 0.0

True Caption: الله اكبر

## Average WER on the Validation Set: 0.004672897196261682

In [34]:
total_wer = 0
num_samples = 0

evaluation_train_data_loader = get_data_loader(
    root_dir='../input/assignment3/Assignment03/train/',
    caption_mapping=caption_mapping,
    vocab=vocab,
    batch_size=1,
    transform=train_transform,
    shuffle=True,
    num_workers=2
)
for frames, captions in evaluation_train_data_loader:
    frames = frames.to(device)
    captions = captions.to(device)

    with torch.no_grad():
        hidden_states, cell_states = encoder(frames)

    hidden = hidden_states[0].unsqueeze(0)
    cell = cell_states[0].unsqueeze(0)

    true_caption_indices = captions[0]
    generated_caption = generate_caption(decoder, vocab, hidden, cell, device)

    # Convert true caption indices to string
    true_caption = indices_to_string(true_caption_indices, vocab)

    # Calculate and accumulate WER
    wer = calculate_wer(true_caption, generated_caption)
    total_wer += wer
    num_samples += 1

    print(f"True Caption: {true_caption}")
    print(f"Generated Caption: {generated_caption}")
    print(f"WER: {wer}\n")

# Calculate average WER across the test set
average_wer = total_wer / num_samples
print(f"Average WER on the Train Set: {average_wer}")

True Caption: الحمد الله
Generated Caption: الحمد الله
WER: 0.0

True Caption: السلام عليكم رحمة الله بركة
Generated Caption: السلام عليكم رحمة الله بركة
WER: 0.0

True Caption: الحمد الله
Generated Caption: الحمد الله
WER: 0.0

True Caption: اسم الله
Generated Caption: اسم الله
WER: 0.0

True Caption: لا شرك الله
Generated Caption: لا شرك الله
WER: 0.0

True Caption: الله اكبر
Generated Caption: الله اكبر
WER: 0.0

True Caption: الله اكبر
Generated Caption: الله اكبر
WER: 0.0

True Caption: اسم الله
Generated Caption: اسم الله
WER: 0.0

True Caption: جميع الصم العرب السامع
Generated Caption: جميع الصم العرب السامع
WER: 0.0

True Caption: اليوم اقدم انتم برنامج اخر
Generated Caption: اليوم اقدم انتم برنامج اخر
WER: 0.0

True Caption: الله اكبر
Generated Caption: الله اكبر
WER: 0.0

True Caption: جميع الصم العرب السامع
Generated Caption: جميع الصم العرب السامع
WER: 0.0

True Caption: اليوم اقدم انتم برنامج اخر
Generated Caption: اليوم اقدم انتم برنامج اخر
WER: 0.0

True Caption: لا شرك 

## Average WER on the Train Set: 0.0009363295880149813