In [None]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torchvision.transforms.functional import resize
from torch.utils.data import DataLoader, Dataset, random_split
from PIL import Image
import os
import glob
import torchvision.io
import cv2
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

In [None]:
class LipReadingDataset(Dataset):
    """Video-clip dataset laid out as ``root_dir/<class>/<phase>/*.mp4``.

    Each item is a ``(frames, label)`` pair: ``frames`` is a tensor whose
    first axis indexes the video frames, and ``label`` is the integer position
    of the clip's class name inside ``self.classes``.

    Args:
        root_dir: Directory that contains one sub-directory per class.
        phase: Name of the sub-directory inside every class directory to read
            clips from (e.g. ``'train'`` or ``'test'``).
        transform: Optional callable applied to every decoded RGB frame.
        fixed_frame_count: Number of frames every returned clip has. Longer
            clips are truncated, shorter clips are zero-padded at the end.
            Pass ``None`` to return clips at their decoded length.
        max_classes: Keep only the first ``max_classes`` class directories
            (in sorted name order).
    """

    def __init__(self, root_dir, phase='train', transform=None, fixed_frame_count=29, max_classes=500):
        self.root_dir = root_dir
        self.phase = phase
        self.transform = transform
        self.fixed_frame_count = fixed_frame_count

        # Only directories can be classes; a stray file in root_dir would make
        # the os.listdir(class_dir) call below fail.
        class_names = sorted(
            entry for entry in os.listdir(root_dir)
            if os.path.isdir(os.path.join(root_dir, entry))
        )
        self.classes = class_names[:max_classes]
        # Constant-time label lookup instead of list.index() on every item.
        self.class_to_idx = {name: index for index, name in enumerate(self.classes)}

        self.files = []
        for cls in self.classes:
            class_dir = os.path.join(root_dir, cls, phase)
            # os.listdir() order is arbitrary; sorting keeps sample indices
            # stable between runs.
            for file in sorted(os.listdir(class_dir)):
                if file.endswith('.mp4'):
                    self.files.append((os.path.join(class_dir, file), cls))

    def __len__(self):
        """Return the number of clips found for the selected classes/phase."""
        return len(self.files)

    def __getitem__(self, idx):
        """Decode clip ``idx`` and return ``(frames, label_index)``.

        Raises:
            RuntimeError: If no frame could be decoded from the video file.
        """
        video_path, label = self.files[idx]
        frames = self.load_video(video_path)
        if not frames:
            raise RuntimeError(f'No frames could be decoded from {video_path}')

        if self.transform:
            frames = [self.transform(frame) for frame in frames]

        # torch.stack only accepts tensors; without a transform the frames are
        # still the arrays returned by load_video.
        frames = [
            frame if isinstance(frame, torch.Tensor) else torch.as_tensor(frame)
            for frame in frames
        ]
        clip = self._fit_length(torch.stack(frames))

        return clip, self.class_to_idx[label]

    def _fit_length(self, clip):
        """Truncate or zero-pad ``clip`` along axis 0 to ``fixed_frame_count``."""
        target = self.fixed_frame_count
        if target is None or clip.shape[0] == target:
            return clip
        if clip.shape[0] > target:
            return clip[:target]
        padding = clip.new_zeros((target - clip.shape[0], *clip.shape[1:]))
        return torch.cat([clip, padding], dim=0)

    def load_video(self, video_path):
        """Read every frame of ``video_path`` and return them as a list of RGB arrays."""
        cap = cv2.VideoCapture(video_path)
        frames = []
        try:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break
                # OpenCV decodes to BGR; convert to RGB channel order.
                frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        finally:
            # Release the capture handle even if decoding raises.
            cap.release()
        return frames

In [None]:
# Function to split the dataset
def split_dataset(dataset, train_ratio=0.8, seed=None):
    """Randomly split ``dataset`` into a training and a validation subset.

    Args:
        dataset: Any object supporting ``len()`` and indexing.
        train_ratio: Fraction of samples assigned to the training subset; the
            training size is ``int(train_ratio * len(dataset))`` and the
            validation subset receives the remainder.
        seed: Optional integer. When given, the split is drawn from a
            dedicated generator seeded with it, so the same split is produced
            on every call. When ``None`` the global torch RNG is used.

    Returns:
        The ``[train_subset, val_subset]`` list produced by ``random_split``.

    Raises:
        ValueError: If ``train_ratio`` is not within ``[0, 1]``.
    """
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(f'train_ratio must be within [0, 1], got {train_ratio}')

    train_size = int(train_ratio * len(dataset))
    val_size = len(dataset) - train_size

    if seed is None:
        return random_split(dataset, [train_size, val_size])
    generator = torch.Generator().manual_seed(seed)
    return random_split(dataset, [train_size, val_size], generator=generator)

# Function to evaluate the model
def evaluate_model(model, data_loader, device):
    """Return the classification accuracy of ``model`` over ``data_loader``.

    The model is switched to evaluation mode and run without gradient
    tracking. For every batch the predicted class is the index of the largest
    output along dimension 1; predictions and labels are gathered on the CPU
    and scored with ``accuracy_score``.

    Args:
        model: Callable module mapping a batch of videos to per-class scores.
        data_loader: Iterable yielding ``(videos, labels)`` batches.
        device: Device the batches are moved to before the forward pass.

    Returns:
        The accuracy computed by ``accuracy_score``.
    """
    model.eval()
    targets, predictions = [], []

    with torch.no_grad():
        for clips, batch_labels in data_loader:
            clips = clips.to(device)
            batch_labels = batch_labels.to(device)
            scores = model(clips)
            batch_preds = scores.max(dim=1).indices
            targets.extend(batch_labels.cpu().numpy())
            predictions.extend(batch_preds.cpu().numpy())

    return accuracy_score(targets, predictions)

def plot_metrics(train_losses, val_accuracies, train_accuracies):
    """Show training loss, validation accuracy and training accuracy per epoch.

    Draws one 18x5 figure with three side-by-side panels. The x axis of every
    panel is the 1-based epoch number, whose range is derived from
    ``len(train_losses)``.

    Args:
        train_losses: Per-epoch training loss values.
        val_accuracies: Per-epoch validation accuracies.
        train_accuracies: Per-epoch training accuracies.
    """
    epochs = range(1, len(train_losses) + 1)

    # (series, y-axis label, title and legend text) in left-to-right order.
    panels = (
        (train_losses, 'Loss', 'Training Loss'),
        (val_accuracies, 'Accuracy', 'Validation Accuracy'),
        (train_accuracies, 'Accuracy', 'Training Accuracy'),
    )

    plt.figure(figsize=(18, 5))
    for position, (series, y_label, caption) in enumerate(panels, start=1):
        plt.subplot(1, 3, position)
        plt.plot(epochs, series, label=caption)
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.title(caption)
        plt.legend()
        plt.grid(True)

    plt.tight_layout()
    plt.show()

In [None]:
# Define the model architecture using VGG
class LipReadingModel(nn.Module):
    """VGG16 frame encoder followed by a bidirectional LSTM classifier.

    Every frame of a clip is passed through the convolutional part of VGG16,
    batch-normalised, rectified and pooled to a ``FEATURE_GRID x FEATURE_GRID``
    map of 512 channels. The flattened per-frame features form a sequence that
    a 2-layer bidirectional LSTM consumes; the LSTM output at the final time
    step is mapped to class scores by a linear layer.

    Args:
        num_classes: Number of output classes.
    """

    # Side length of the per-frame spatial grid handed to the LSTM. The LSTM
    # input size (512 * 2 * 2) is derived from this value.
    FEATURE_GRID = 2

    def __init__(self, num_classes=500):
        super(LipReadingModel, self).__init__()
        # VGG16 as feature extractor
        self.vgg = torchvision.models.vgg16(weights='DEFAULT')

        # forward() only calls self.vgg.features, so the replaced classifier
        # head is never executed.
        self.vgg.classifier = nn.Identity()

        self.bn1 = nn.BatchNorm2d(512)

        # RNN for sequence modeling
        self.rnn = nn.LSTM(
            input_size=512 * self.FEATURE_GRID ** 2,
            hidden_size=256,
            num_layers=2,
            batch_first=True,
            bidirectional=True,
            dropout=0.5,
        )

        # Fully connected layer for classification
        self.fc = nn.Linear(256 * 2, num_classes)  # bidirectional doubles the output features

    def forward(self, x):
        """Classify a batch of clips.

        Args:
            x: Tensor of shape ``(batch, timesteps, C, H, W)``.

        Returns:
            Tensor of shape ``(batch, num_classes)`` with unnormalised scores.
        """
        batch_size, timesteps, C, H, W = x.size()
        # reshape (not view) so non-contiguous inputs are accepted as well.
        frames = x.reshape(batch_size * timesteps, C, H, W)
        feats = self.vgg.features(frames)

        feats = self.bn1(feats)
        feats = torch.nn.functional.relu(feats)

        # Pool to the grid size the LSTM was built for. When the feature map
        # already has that size this is an identity operation; for other frame
        # sizes it prevents a shape mismatch at the LSTM input.
        feats = torch.nn.functional.adaptive_avg_pool2d(feats, self.FEATURE_GRID)

        sequence = feats.reshape(batch_size, timesteps, -1)  # Flatten for LSTM
        r_out, _ = self.rnn(sequence)

        # Classify from the LSTM output at the final time step.
        return self.fc(r_out[:, -1, :])

In [None]:
# Hyperparameters grid
num_epochs = 20
# NOTE(review): the grid previously listed 0.0001 twice, which trained the same
# configuration twice. If a third value (e.g. 0.00001) was intended, add it here.
learning_rates = [0.001, 0.0001]
batch_sizes = [8, 16]

# A run is abandoned once the 0-based epoch index exceeds EARLY_STOP_MIN_EPOCH
# while validation accuracy is still below EARLY_STOP_VAL_ACC.
EARLY_STOP_MIN_EPOCH = 5
EARLY_STOP_VAL_ACC = 0.06

# Seed the torch RNG so the train/val split, weight init of the new layers and
# DataLoader shuffling are repeatable.
SEED = 42
torch.manual_seed(SEED)

# Best result found so far across the whole grid
best_val_accuracy = 0
best_model = None
best_hyperparams = None

# Directories and device setup
root_dir = '/kaggle/input/lrw_25_words/preprocessed_25_1000'
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# DataParallel over GPUs 0 and 1 is only possible when both exist.
use_data_parallel = torch.cuda.device_count() >= 2

# Define transformations
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.ToTensor()
])

# Dataset and DataLoader
train_val_dataset = LipReadingDataset(root_dir, 'train', transform=transform, max_classes=25)
train_dataset, val_dataset = split_dataset(train_val_dataset, train_ratio=0.95)
test_dataset = LipReadingDataset(root_dir, 'test', transform=transform, max_classes=25)

# These do not depend on the grid point, so build them once.
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)
num_classes = len(train_val_dataset.classes)
criterion = nn.CrossEntropyLoss()

# Per-epoch curves of the run that produced the best validation accuracy
best_train_loss = []
best_val_acc = []
best_train_acc = []

# Hyperparameter tuning
for learning_rate in learning_rates:
    for batch_size in batch_sizes:
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        model = LipReadingModel(num_classes=num_classes)
        if use_data_parallel:
            model = nn.DataParallel(model, device_ids=[0, 1])  # For 2 GPUs
        model.to(device)
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

        # Collect and plot metrics
        train_losses = []
        val_accuracies = []
        train_accuracies = []
        # True once any epoch of this run beats the global best.
        run_set_new_best = False

        # Training loop
        for epoch in range(num_epochs):
            model.train()
            epoch_loss = 0.0
            progress_bar = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}')

            all_labels = []
            all_preds = []

            for videos, labels in progress_bar:
                videos = videos.to(device)
                labels = labels.to(device)
                outputs = model(videos)
                loss = criterion(outputs, labels)
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()
                epoch_loss += loss.item()
                progress_bar.set_postfix(loss=loss.item())

                _, preds = torch.max(outputs, 1)
                all_labels.extend(labels.cpu().numpy())
                all_preds.extend(preds.cpu().numpy())

            avg_loss = epoch_loss / len(train_loader)
            train_accuracy = accuracy_score(all_labels, all_preds)
            val_accuracy = evaluate_model(model, val_loader, device)

            train_losses.append(avg_loss)
            val_accuracies.append(val_accuracy)
            train_accuracies.append(train_accuracy)

            print(f'Learning Rate: {learning_rate}, Batch Size: {batch_size}, Epoch [{epoch + 1}/{num_epochs}], Avg Loss: {avg_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Validation Accuracy: {val_accuracy:.4f}')

            scheduler.step()  # Step the scheduler

            # Track the best model based on validation accuracy. This runs
            # before the early-stop check so the current epoch is never skipped.
            if val_accuracy > best_val_accuracy:
                best_val_accuracy = val_accuracy
                # state_dict() returns references to the live parameters, which
                # later optimizer steps overwrite in place; keep a detached CPU
                # copy so the snapshot really is the best epoch.
                best_model = {
                    name: tensor.detach().cpu().clone()
                    for name, tensor in model.state_dict().items()
                }
                best_hyperparams = (learning_rate, batch_size)
                run_set_new_best = True

            if epoch > EARLY_STOP_MIN_EPOCH and val_accuracy < EARLY_STOP_VAL_ACC:
                break

        # Keep the curves of the run that owns the best snapshot, regardless of
        # how its final epoch scored.
        if run_set_new_best:
            best_train_loss = train_losses
            best_val_acc = val_accuracies
            best_train_acc = train_accuracies
        plot_metrics(train_losses, val_accuracies, train_accuracies)

In [None]:
# Parameter count of the most recently built model.
total_params = sum(p.numel() for p in model.parameters())
print(f"Number of parameters: {total_params}")

# Without a snapshot there is nothing meaningful to save or evaluate.
if best_model is None:
    raise RuntimeError('No run improved the validation accuracy; there is no best model to save.')

# Save the best model after all epochs
best_model_path = '/kaggle/working/best_vgg16_lstm_25Class_model.pth'
torch.save(best_model, best_model_path)
print(f"Best model saved to {best_model_path} with learning rate: {best_hyperparams[0]} and batch size: {best_hyperparams[1]}")

# Evaluate on test set
model.load_state_dict(best_model)
test_accuracy = evaluate_model(model, test_loader, device)
print(f'Test Accuracy: {test_accuracy:.4f}')

# Plot metrics for the best model. The best-run curves are used when they were
# recorded; otherwise fall back to the curves of the last run.
if best_train_loss:
    plot_metrics(best_train_loss, best_val_acc, best_train_acc)
else:
    plot_metrics(train_losses, val_accuracies, train_accuracies)