Abderrahmane Balah - 201932370 - Section 1

This notebook has code related to developing an Encoder-Decoder model (with attention). The second notebook has the encoder-decoder model without attention.

# Attention Based ENCODER-DECODER for Sign Language Interpretation

[20 points] Develop an encoder-decoder model with attention to recognize the sign video (video captioning)

#Unzipping Data

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
!unzip /content/drive/MyDrive/ICS471/Assignment3/Assignment03.zip -d /content

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: /content/Assignment03/train/0009/02_0009_(27_02_21_20_43_02)_c/02_0009_(27_02_21_20_43_02)_c_0023.jpg  
  inflating: /content/Assignment03/train/0009/02_0009_(27_02_21_20_43_02)_c/02_0009_(27_02_21_20_43_02)_c_0024.jpg  
  inflating: /content/Assignment03/train/0009/02_0009_(27_02_21_20_43_02)_c/02_0009_(27_02_21_20_43_02)_c_0025.jpg  
  inflating: /content/Assignment03/train/0009/02_0009_(27_02_21_20_43_02)_c/02_0009_(27_02_21_20_43_02)_c_0026.jpg  
  inflating: /content/Assignment03/train/0009/02_0009_(27_02_21_20_43_02)_c/02_0009_(27_02_21_20_43_02)_c_0027.jpg  
  inflating: /content/Assignment03/train/0009/02_0009_(27_02_21_20_43_02)_c/02_0009_(27_02_21_20_43_02)_c_0028.jpg  
  inflating: /content/Assignment03/train/0009/02_0009_(27_02_21_20_43_02)_c/02_0009_(27_02_21_20_43_02)_c_0029.jpg  
  inflating: /content/Assignment03/train/0009/02_0009_(27_02_21_20_43_02)_c/02_0009_(27_02_21_20_43_02)_c_0030.jpg  

# Data Preprocessing

In [1]:
import torch
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torchvision import transforms
from PIL import Image
import os

class SignLanguageDataset(Dataset):
    def __init__(self, root_dir, caption_mapping, vocab, transform=None, data=None):
        self.root_dir = root_dir
        self.caption_mapping = caption_mapping
        self.vocab = vocab
        self.transform = transform
        if(data is None):
          self.data = self._load_data()
        else:
          self.data = data

    def _load_data(self):
        data = []
        for folder_name in sorted(os.listdir(self.root_dir)):
            folder_path = os.path.join(self.root_dir, folder_name)
            for video_folder in os.listdir(folder_path):
                video_folder_path = os.path.join(folder_path, video_folder)
                frames = [os.path.join(video_folder_path, frame) for frame in sorted(os.listdir(video_folder_path))]
                caption = self.caption_mapping[folder_name]
                numericalized_caption = numericalize(caption, self.vocab)
                data.append((frames, numericalized_caption))
        return data

    def __getitem__(self, idx):
        frames_paths, numericalized_caption = self.data[idx]
        frames = [Image.open(frame_path).convert('RGB') for frame_path in frames_paths]

        if self.transform:
            frames = [self.transform(frame) for frame in frames]
        else:
            to_tensor = transforms.ToTensor()
            frames = [to_tensor(frame) for frame in frames]

        frames_tensor = torch.stack(frames)
        caption_tensor = torch.tensor(numericalized_caption)
        return frames_tensor, caption_tensor

    def __len__(self):
        return len(self.data)

    def subset(self, indices):
        subset_data = [self.data[i] for i in indices]
        return SignLanguageDataset(self.root_dir, self.caption_mapping, self.vocab, self.transform, subset_data)

def load_captions(captions_file):
    with open(captions_file, 'r', encoding='utf-8') as file:
        captions = file.read().splitlines()
    # Create a mapping from folder number to caption
    caption_mapping = {str(i).zfill(4): caption for i, caption in enumerate(captions, 1)}
    return caption_mapping

def build_vocab(captions):
    tokenized_captions = [caption.lower().split() for caption in captions]
    vocab = build_vocab_from_iterator(tokenized_captions, specials=['<unk>', '<pad>', '<sos>', '<eos>'])
    vocab.set_default_index(vocab['<unk>'])
    return vocab

def numericalize(caption, vocab):
    return [vocab['<sos>']] + [vocab[token] for token in caption.lower().split()] + [vocab['<eos>']]

def collate_fn(batch):
    # Separate frames and captions
    frames, captions = zip(*batch)
    # Pad the captions to the same length
    captions_padded = pad_sequence(captions, batch_first=True, padding_value=vocab['<eos>'])
    # Stack frames
    frames_stacked = torch.stack(frames)
    return frames_stacked, captions_padded

# Load the data with the collate function
def get_data_loader(root_dir, caption_mapping, vocab, batch_size, transform, shuffle=True, num_workers=4):
    dataset = SignLanguageDataset(root_dir=root_dir, caption_mapping=caption_mapping, vocab=vocab, transform=transform)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers, collate_fn=collate_fn)



In [2]:
caption_mapping = load_captions('/content/Assignment03/groundTruth.txt')
all_captions = list(caption_mapping.values())
vocab = build_vocab(all_captions)

train_transform = transforms.Compose([
    transforms.Resize((256, 256)),  
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  
])
test_transform = transforms.Compose([
    transforms.Resize((256, 256)), 
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  
])

# Test data loader
test_data_loader = get_data_loader(
    root_dir='/content/Assignment03/test/',
    caption_mapping=caption_mapping,
    vocab=vocab,
    batch_size=1,
    transform=test_transform,
    shuffle=True,
    num_workers=2
)


In [3]:
from sklearn.model_selection import train_test_split

full_test_dataset = SignLanguageDataset(root_dir='/content/Assignment03/train/',
                                        caption_mapping=caption_mapping,
                                        vocab=vocab,
                                        transform=train_transform)


captions = [str(full_test_dataset.data[i][1]) for i in range(len(full_test_dataset))]


train_indices, test_indices = train_test_split(range(len(full_test_dataset)),
                                               stratify=captions,
                                               test_size=0.2)  


train_dataset = full_test_dataset.subset(train_indices)
val_dataset = full_test_dataset.subset(test_indices)


train_data_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, num_workers=2, collate_fn=collate_fn)
val_data_loader = DataLoader(val_dataset, batch_size=1, shuffle=False, num_workers=2, collate_fn=collate_fn)

# Model Building

In [4]:
import torch
import torch.nn as nn
import torchvision.models as models
from torchvision.models.mobilenetv2 import MobileNet_V2_Weights
import torch.nn.functional as F

## Attention

In [5]:
import math

class Attention(nn.Module):
    def __init__(self):
        super(Attention, self).__init__()

    def forward(self, query, key, value):
        # Compute the dot product between query and key
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(query.size(-1))
        attn_weights = F.softmax(scores, dim=-1)
        context = torch.matmul(attn_weights, value)
        return context, attn_weights

## Encoder

In [6]:
class Encoder(nn.Module):
    def __init__(self, hidden_size, num_layers=1, num_frames=80):
        super(Encoder, self).__init__()
        self.num_frames = num_frames

        # CNN Part to get the features of every frame
        mobilenet_weights = MobileNet_V2_Weights.DEFAULT
        mobilenet = models.mobilenet_v2(weights=mobilenet_weights)
        modules = list(mobilenet.children())[:-1]  # Remove the classifier layer to just take the features
        self.mobilenet = nn.Sequential(*modules)

        # Freeze the MobileNet weights
        for param in self.mobilenet.parameters():
            param.requires_grad = False

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))

        # LSTM Part
        self.lstm = nn.LSTM(mobilenet.last_channel*8*8, hidden_size, num_layers, batch_first=True)


    def forward(self, images):
        # images shape: (batch_size, num_frames, C, H, W)
        batch_size, _, C, H, W = images.size()
        assert self.num_frames == images.size(1), "Each video should have 80 frames"

        cnn_in = images.view(batch_size * self.num_frames, C, H, W)

        # Pass each frame through the CNN
        features = self.mobilenet(cnn_in)
        #convert it from (1280, 8, 8) into (1280*8*8)
        features = features.view(batch_size, self.num_frames, -1)

        # Pass the sequence of features through the LSTM
        lstm_out, (hidden_states, cell_states) = self.lstm(features)

        # Retrieve the final hidden state for the last time step
        final_hidden_states = hidden_states[-1]
        final_cell_states = cell_states[-1]
        return lstm_out, final_hidden_states, final_cell_states


## Decoder

In [7]:

import torch
import torch.nn as nn

class Decoder(nn.Module):
    def __init__(self, vocab, embed_size, hidden_size, vocab_size, dropout=0.5, max_seq_length=20):
        super(Decoder, self).__init__()
        self.vocab = vocab
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.num_layers = 1
        self.num_directions = 1

        self.lstm = nn.LSTMCell(input_size=embed_size, hidden_size=hidden_size)
        self.fc_out = nn.Linear(in_features=hidden_size * 2, out_features=vocab_size)
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size)
        self.dropout = nn.Dropout(dropout)
        self.max_seq_length = max_seq_length
        self.attention = Attention()

    def forward(self, captions, encoder_outputs, hidden_state, cell_state, state = 'test'):
        batch_size = captions.size(0)
        hidden_state = hidden_state.view(batch_size, self.hidden_size)
        cell_state = cell_state.view(batch_size, self.hidden_size)

        max_forward_length = self.max_seq_length if state=='test' else captions.size(1)

        # Tensor to store decoder outputs
        outputs = torch.zeros(batch_size, max_forward_length, self.vocab_size).to(captions.device)

        input = captions[:, 0]

        for t in range(0, max_forward_length):
            captions_embed = self.embed(input)
            hidden_state, cell_state = self.lstm(captions_embed, (hidden_state, cell_state))

            # Apply attention
            context, _ = self.attention(hidden_state.unsqueeze(1), encoder_outputs, encoder_outputs)
            context = context.squeeze(1)

            # Combine context with LSTM output before final layer
            output = self.fc_out(self.dropout(torch.cat((hidden_state, context), dim=1)))
            outputs[:, t, :] = output

            if self.training:
                # Pass the correct output for training purposes
                input = captions[:, t+1] if t+1<captions.size(1) else None
            else:
                # Get the highest predicted word from the previous output for evaluation
                top1 = output.argmax(1)
                input = top1
        return outputs[:, :t, :]



## Validation Function

In [8]:
def validate_model(encoder, decoder, val_data_loader, criterion):
    encoder.eval()
    decoder.eval()
    total_val_loss = 0
    with torch.no_grad():
        for frames, captions in val_data_loader:
            frames, captions = frames.to(device), captions.to(device)
            encoder_output, hidden_state, cell_state = encoder(frames)
            outputs = decoder(captions[:, :],encoder_output, hidden_state, cell_state, state = 'val')
            loss = criterion(outputs.reshape(-1, outputs.size(2)), captions[:, 1:].contiguous().view(-1))
            total_val_loss += loss.item()
    return total_val_loss / len(val_data_loader)

## Train Loop

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim

def train(encoder, decoder, encoder_filename, decoder_filename):
  encoder.to(device)
  decoder.to(device)

  # Define loss function and optimizers
  criterion = nn.CrossEntropyLoss(ignore_index=vocab['<pad>'])
  parameters = list(encoder.parameters()) + list(decoder.parameters())
  optimizer = optim.Adam(parameters, lr=0.0001)

  # Training loop
  num_epochs = 10  
  best_val_loss = float('inf')
  for epoch in range(num_epochs):
      encoder.train()
      decoder.train()
      total_loss = 0

      for batch_idx, (frames, captions) in enumerate(train_data_loader):
          frames, captions = frames.to(device), captions.to(device)

          # Zero the gradients
          optimizer.zero_grad()


          # Forward pass through the encoder
          encoder_output, hidden_state, cell_state = encoder(frames)

          # Forward pass through the decoder
          outputs = decoder(captions[:, :], encoder_output, hidden_state, cell_state, state = 'train')

          # Calculate the loss
          loss = criterion(outputs.reshape(-1, outputs.size(2)), captions[:, 1:].contiguous().view(-1))
          total_loss += loss.item()

          # Backward pass and optimize
          loss.backward()
          optimizer.step()


          if batch_idx % 10 == 0:  # Print loss every 10 batches
              print(f"Epoch [{epoch+1}/{num_epochs}], Step [{batch_idx+1}/{len(train_data_loader)}], Loss: {loss.item():.4f}")

      # Print epoch loss
      print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_data_loader):.4f}')

      # Validation phase
      val_loss = validate_model(encoder, decoder, val_data_loader, criterion)
      print(f'Epoch [{epoch+1}/{num_epochs}], Validation Loss: {val_loss:.4f}')

      # Check for early stopping
      if val_loss < best_val_loss:
          print(f'Saving Model! {val_loss} < {best_val_loss}')
          best_val_loss = val_loss

          torch.save(encoder.state_dict(), encoder_filename)
          torch.save(decoder.state_dict(), decoder_filename)

Version 1: hidden_size = 256

In [11]:
# Initialize models
encoder = Encoder(hidden_size=256)
decoder = Decoder(vocab = vocab, embed_size=256, hidden_size=256, vocab_size=len(vocab), dropout=0.2)

# Check if CUDA is available and move models to GPU if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train(encoder, decoder, "encoder_v1.pth", "decoder_v1.pth")

Epoch [1/10], Step [1/107], Loss: 3.5177
Epoch [1/10], Step [11/107], Loss: 2.5982
Epoch [1/10], Step [21/107], Loss: 2.6639
Epoch [1/10], Step [31/107], Loss: 2.5630
Epoch [1/10], Step [41/107], Loss: 2.1911
Epoch [1/10], Step [51/107], Loss: 2.1477
Epoch [1/10], Step [61/107], Loss: 2.6897
Epoch [1/10], Step [71/107], Loss: 3.0765
Epoch [1/10], Step [81/107], Loss: 2.8686
Epoch [1/10], Step [91/107], Loss: 1.9804
Epoch [1/10], Step [101/107], Loss: 2.2476
Epoch [1/10], Loss: 2.4816
Epoch [1/10], Validation Loss: 2.4800
Saving Model! 2.4799630374551933 < inf
Epoch [2/10], Step [1/107], Loss: 1.5750
Epoch [2/10], Step [11/107], Loss: 1.5585
Epoch [2/10], Step [21/107], Loss: 2.1051
Epoch [2/10], Step [31/107], Loss: 2.1921
Epoch [2/10], Step [41/107], Loss: 2.0132
Epoch [2/10], Step [51/107], Loss: 1.9613
Epoch [2/10], Step [61/107], Loss: 1.6413
Epoch [2/10], Step [71/107], Loss: 2.2427
Epoch [2/10], Step [81/107], Loss: 2.2462
Epoch [2/10], Step [91/107], Loss: 2.2049
Epoch [2/10], S

Version 2: hidden_size = 512

In [12]:
# Initialize models
encoder = Encoder(hidden_size=512)
decoder = Decoder(vocab = vocab, embed_size=256, hidden_size=512, vocab_size=len(vocab), dropout=0.2)

# Check if CUDA is available and move models to GPU if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train(encoder, decoder, "encoder_v2.pth", "decoder_v2.pth")

Epoch [1/10], Step [1/107], Loss: 3.4986
Epoch [1/10], Step [11/107], Loss: 2.6621
Epoch [1/10], Step [21/107], Loss: 2.5829
Epoch [1/10], Step [31/107], Loss: 1.9167
Epoch [1/10], Step [41/107], Loss: 2.6469
Epoch [1/10], Step [51/107], Loss: 2.4583
Epoch [1/10], Step [61/107], Loss: 2.2745
Epoch [1/10], Step [71/107], Loss: 1.7546
Epoch [1/10], Step [81/107], Loss: 1.7199
Epoch [1/10], Step [91/107], Loss: 1.7483
Epoch [1/10], Step [101/107], Loss: 1.8920
Epoch [1/10], Loss: 2.2728
Epoch [1/10], Validation Loss: 2.1358
Saving Model! 2.1358425695205403 < inf
Epoch [2/10], Step [1/107], Loss: 1.4494
Epoch [2/10], Step [11/107], Loss: 1.9173
Epoch [2/10], Step [21/107], Loss: 1.3251
Epoch [2/10], Step [31/107], Loss: 1.4270
Epoch [2/10], Step [41/107], Loss: 1.8062
Epoch [2/10], Step [51/107], Loss: 1.1685
Epoch [2/10], Step [61/107], Loss: 1.6185
Epoch [2/10], Step [71/107], Loss: 1.2748
Epoch [2/10], Step [81/107], Loss: 1.1987
Epoch [2/10], Step [91/107], Loss: 1.7111
Epoch [2/10], S

Version 3: hidden_size = 1024

In [13]:
# Initialize models
encoder = Encoder(hidden_size=1024)
decoder = Decoder(vocab = vocab, embed_size=256, hidden_size=1024, vocab_size=len(vocab), dropout=0.2)

# Check if CUDA is available and move models to GPU if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train(encoder, decoder, "encoder_v3.pth", "decoder_v3.pth")

Epoch [1/10], Step [1/107], Loss: 3.5390
Epoch [1/10], Step [11/107], Loss: 2.0261
Epoch [1/10], Step [21/107], Loss: 2.0792
Epoch [1/10], Step [31/107], Loss: 2.0749
Epoch [1/10], Step [41/107], Loss: 1.8912
Epoch [1/10], Step [51/107], Loss: 1.9288
Epoch [1/10], Step [61/107], Loss: 2.6319
Epoch [1/10], Step [71/107], Loss: 1.6559
Epoch [1/10], Step [81/107], Loss: 1.8259
Epoch [1/10], Step [91/107], Loss: 1.3396
Epoch [1/10], Step [101/107], Loss: 1.0012
Epoch [1/10], Loss: 2.1245
Epoch [1/10], Validation Loss: 1.7460
Saving Model! 1.746014816181682 < inf
Epoch [2/10], Step [1/107], Loss: 1.3160
Epoch [2/10], Step [11/107], Loss: 1.4942
Epoch [2/10], Step [21/107], Loss: 0.9742
Epoch [2/10], Step [31/107], Loss: 1.4279
Epoch [2/10], Step [41/107], Loss: 0.7168
Epoch [2/10], Step [51/107], Loss: 1.2136
Epoch [2/10], Step [61/107], Loss: 1.3586
Epoch [2/10], Step [71/107], Loss: 0.7061
Epoch [2/10], Step [81/107], Loss: 0.9100
Epoch [2/10], Step [91/107], Loss: 0.9165
Epoch [2/10], St

Therefore, Version 3 is the best one, with 1024 hidden size.

In [16]:
# Initialize models
encoder = Encoder(hidden_size=1024)
decoder = Decoder(vocab = vocab, embed_size=256, hidden_size=1024, vocab_size=len(vocab), dropout=0.2)

# Check if CUDA is available and move models to GPU if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train(encoder, decoder, "encoder_v3.pth", "decoder_v3.pth")

Epoch [1/10], Step [1/107], Loss: 3.5185
Epoch [1/10], Step [11/107], Loss: 2.4386
Epoch [1/10], Step [21/107], Loss: 2.2330
Epoch [1/10], Step [31/107], Loss: 2.5829
Epoch [1/10], Step [41/107], Loss: 2.0005
Epoch [1/10], Step [51/107], Loss: 1.4852
Epoch [1/10], Step [61/107], Loss: 1.9690
Epoch [1/10], Step [71/107], Loss: 1.6024
Epoch [1/10], Step [81/107], Loss: 1.5708
Epoch [1/10], Step [91/107], Loss: 2.3873
Epoch [1/10], Step [101/107], Loss: 1.5814
Epoch [1/10], Loss: 2.0646
Epoch [1/10], Validation Loss: 1.6553
Saving Model! 1.655275051281831 < inf
Epoch [2/10], Step [1/107], Loss: 1.5466
Epoch [2/10], Step [11/107], Loss: 1.0578
Epoch [2/10], Step [21/107], Loss: 1.3068
Epoch [2/10], Step [31/107], Loss: 0.9122
Epoch [2/10], Step [41/107], Loss: 0.8101
Epoch [2/10], Step [51/107], Loss: 0.5381
Epoch [2/10], Step [61/107], Loss: 1.4579
Epoch [2/10], Step [71/107], Loss: 0.7305
Epoch [2/10], Step [81/107], Loss: 0.7090
Epoch [2/10], Step [91/107], Loss: 0.8425
Epoch [2/10], St

# Testing on testing set

In [17]:
encoder = Encoder(hidden_size=1024).to(device)
decoder = Decoder(vocab = vocab, embed_size=256, hidden_size=1024, vocab_size=len(vocab), dropout=0.2).to(device)

encoder.load_state_dict(torch.load('encoder_v3.pth'))
decoder.load_state_dict(torch.load('decoder_v3.pth'))

encoder.eval()
decoder.eval()

Decoder(
  (vocab): Vocab()
  (lstm): LSTMCell(256, 1024)
  (fc_out): Linear(in_features=2048, out_features=34, bias=True)
  (embed): Embedding(34, 256)
  (dropout): Dropout(p=0.2, inplace=False)
  (attention): Attention()
)

In [18]:
def generate_caption(decoder, vocab, encoder_outputs, hidden_state, cell_state, device, max_length=20):
    start_token_idx = vocab.lookup_indices(['<sos>'])[0]  # Fetch the index for <sos>
    sequence = torch.tensor([[start_token_idx]], dtype=torch.long).to(device)

    with torch.no_grad():
        outputs = decoder(sequence, encoder_outputs, hidden_state, cell_state, state='test')

    predicted_indices = outputs.squeeze(0).argmax(dim=1).tolist()
    caption = indices_to_string(predicted_indices, vocab)

    return caption



def indices_to_string(indices, vocab):
    if isinstance(indices, torch.Tensor):
        indices = indices.tolist()

    words = [vocab.get_itos()[idx] for idx in indices]

    # Exclude special tokens like <start>, <end>, and <pad> from the final sentence
    special_tokens = {'<sos>', '<eos>', '<pad>'}
    filtered_words = [word for word in words if word not in special_tokens]

    # Join the words to form a sentence
    sentence = ' '.join(filtered_words)

    return sentence



## Testing set results

In [12]:
!pip install jiwer

Collecting jiwer
  Downloading jiwer-3.0.3-py3-none-any.whl (21 kB)
Collecting rapidfuzz<4,>=3 (from jiwer)
  Downloading rapidfuzz-3.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.0.3 rapidfuzz-3.5.2


In [19]:
import jiwer

def calculate_wer(reference, hypothesis):
    return jiwer.wer(reference, hypothesis)

total_wer = 0
num_samples = 0

for frames, captions in test_data_loader:
    frames = frames.to(device)
    captions = captions.to(device)

    with torch.no_grad():
        encoder_output, hidden_states, cell_states = encoder(frames)

    hidden = hidden_states[0].unsqueeze(0)
    cell = cell_states[0].unsqueeze(0)

    true_caption_indices = captions[0]
    generated_caption = generate_caption(decoder, vocab, encoder_output, hidden, cell, device)

    # Convert true caption indices to string
    true_caption = indices_to_string(true_caption_indices, vocab)

    # Calculate and accumulate WER
    wer = calculate_wer(true_caption, generated_caption)
    total_wer += wer
    num_samples += 1

    print(f"True Caption: {true_caption}")
    print(f"Generated Caption: {generated_caption}")
    print(f"WER: {wer}\n")

# Calculate average WER across the test set
average_wer = total_wer / num_samples
print(f"Average WER on the Test Set: {average_wer}")




True Caption: اليوم اقدم انتم برنامج اخر
Generated Caption: اليوم اقدم انتم برنامج اخر
WER: 0.0

True Caption: اسم الله
Generated Caption: الحمد الله
WER: 0.5

True Caption: الله اكبر
Generated Caption: كلمات اليوم متفرقة في الدين
WER: 2.5

True Caption: ايضا كلمات عادية
Generated Caption: كلمات اليوم متفرقة في الدين
WER: 1.6666666666666667

True Caption: اليوم اقدم انتم برنامج اخر
Generated Caption: اليوم اقدم انتم برنامج اخر
WER: 0.0

True Caption: السلام عليكم رحمة الله بركة
Generated Caption: السلام عليكم رحمة الله بركة
WER: 0.0

True Caption: موضوع دراسة لغة الاشارة العربية
Generated Caption: موضوع دراسة لغة الاشارة العربية
WER: 0.0

True Caption: كلمات اليوم متفرقة في الدين
Generated Caption: كلمات اليوم متفرقة في الدين
WER: 0.0

True Caption: كلمات اليوم متفرقة في الدين
Generated Caption: كلمات اليوم متفرقة في الدين
WER: 0.0

True Caption: لا شرك الله
Generated Caption: اليوم اقدم انتم برنامج
WER: 1.3333333333333333

True Caption: السلام عليكم رحمة الله بركة
Generated Caption: ا

### Average WER on the Test Set: 0.9325757575757574
The reasoning behind this is that it could be due to the heavy difference between the train and test datasets. We noticed that the validation loss was actually very good, but all of a sudden the test has a very bad accuracy.

## Validation set results

In [20]:
total_wer = 0
num_samples = 0
# Test data loader
evaluation_train_data_loader = get_data_loader(
    root_dir='/content/Assignment03/train/',
    caption_mapping=caption_mapping,
    vocab=vocab,
    batch_size=1,
    transform=train_transform,
    shuffle=True,
    num_workers=2
)
for frames, captions in val_data_loader:
    frames = frames.to(device)
    captions = captions.to(device)

    with torch.no_grad():
        encoder_output, hidden_states, cell_states = encoder(frames)

    hidden = hidden_states[0].unsqueeze(0)
    cell = cell_states[0].unsqueeze(0)

    true_caption_indices = captions[0]
    generated_caption = generate_caption(decoder, vocab, encoder_output, hidden, cell, device)

    # Convert true caption indices to string
    true_caption = indices_to_string(true_caption_indices, vocab)

    # Calculate and accumulate WER
    wer = calculate_wer(true_caption, generated_caption)
    total_wer += wer
    num_samples += 1

    print(f"True Caption: {true_caption}")
    print(f"Generated Caption: {generated_caption}")
    print(f"WER: {wer}\n")

# Calculate average WER across the test set
average_wer = total_wer / num_samples
print(f"Average WER on the Validation Set: {average_wer}")

True Caption: الله اكبر
Generated Caption: الله اكبر
WER: 0.0

True Caption: جميع الصم العرب السامع
Generated Caption: جميع الصم العرب السامع
WER: 0.0

True Caption: جميع الصم العرب السامع
Generated Caption: جميع الصم العرب السامع
WER: 0.0

True Caption: السلام عليكم رحمة الله بركة
Generated Caption: السلام عليكم رحمة الله بركة
WER: 0.0

True Caption: ايضا كلمات عادية
Generated Caption: ايضا كلمات عادية
WER: 0.0

True Caption: الحمد الله
Generated Caption: الحمد الله
WER: 0.0

True Caption: السلام عليكم رحمة الله بركة
Generated Caption: السلام عليكم رحمة الله بركة
WER: 0.0

True Caption: موضوع دراسة لغة الاشارة العربية
Generated Caption: موضوع دراسة لغة الاشارة العربية
WER: 0.0

True Caption: جميع الصم العرب السامع
Generated Caption: جميع الصم العرب السامع
WER: 0.0

True Caption: اليوم اقدم انتم برنامج اخر
Generated Caption: اليوم اقدم انتم برنامج اخر
WER: 0.0

True Caption: الحمد الله
Generated Caption: الحمد الله
WER: 0.0

True Caption: اسم الله
Generated Caption: اسم الله
WER: 0.0



### Average WER on the Validation Set: 0.009345794392523364

## Training set results


In [21]:
total_wer = 0
num_samples = 0
# Test data loader
evaluation_train_data_loader = get_data_loader(
    root_dir='/content/Assignment03/train/',
    caption_mapping=caption_mapping,
    vocab=vocab,
    batch_size=1,
    transform=train_transform,
    shuffle=True,
    num_workers=2
)
for frames, captions in evaluation_train_data_loader:
    frames = frames.to(device)
    captions = captions.to(device)

    with torch.no_grad():
        encoder_output, hidden_states, cell_states = encoder(frames)

    hidden = hidden_states[0].unsqueeze(0)
    cell = cell_states[0].unsqueeze(0)

    true_caption_indices = captions[0]
    generated_caption = generate_caption(decoder, vocab, encoder_output, hidden, cell, device)

    # Convert true caption indices to string
    true_caption = indices_to_string(true_caption_indices, vocab)

    # Calculate and accumulate WER
    wer = calculate_wer(true_caption, generated_caption)
    total_wer += wer
    num_samples += 1

    print(f"True Caption: {true_caption}")
    print(f"Generated Caption: {generated_caption}")
    print(f"WER: {wer}\n")

# Calculate average WER across the test set
average_wer = total_wer / num_samples
print(f"Average WER on the Train Set: {average_wer}")

True Caption: الله اكبر
Generated Caption: الله اكبر
WER: 0.0

True Caption: كلمات اليوم متفرقة في الدين
Generated Caption: كلمات اليوم متفرقة في الدين
WER: 0.0

True Caption: لا شرك الله
Generated Caption: لا شرك الله
WER: 0.0

True Caption: موضوع دراسة لغة الاشارة العربية
Generated Caption: موضوع دراسة لغة الاشارة العربية
WER: 0.0

True Caption: اسم الله
Generated Caption: اسم الله
WER: 0.0

True Caption: جميع الصم العرب السامع
Generated Caption: جميع الصم العرب السامع
WER: 0.0

True Caption: الله اكبر
Generated Caption: الله اكبر
WER: 0.0

True Caption: كلمات اليوم متفرقة في الدين
Generated Caption: كلمات اليوم متفرقة في الدين
WER: 0.0

True Caption: السلام عليكم رحمة الله بركة
Generated Caption: السلام عليكم رحمة الله بركة
WER: 0.0

True Caption: لا شرك الله
Generated Caption: لا شرك الله
WER: 0.0

True Caption: كلمات اليوم متفرقة في الدين
Generated Caption: كلمات اليوم متفرقة في الدين
WER: 0.0

True Caption: لا شرك الله
Generated Caption: لا شرك الله
WER: 0.0

True Caption: السلام

### Average WER on the Train Set: 0.0018726591760299626
