In [1]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torchvision import models
import torchvision.transforms as transforms
from torchvision.transforms.functional import resize
from torch.utils.data import DataLoader, Dataset
from PIL import Image
import os
import glob
import torchvision.io
import cv2
from tqdm.notebook import tqdm

In [2]:
# Custom dataset class for Lip Reading
class LipReadingDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.video_paths = []
        self.labels = []
        self.classes = sorted(os.listdir(root_dir))
        if '.DS_Store' in self.classes:
            self.classes.remove('.DS_Store')
        
        for label, class_name in enumerate(self.classes):
            class_dir = os.path.join(root_dir, class_name, 'train')
            video_files = glob.glob(os.path.join(class_dir, "*.mp4"))
            self.video_paths.extend(video_files)
            self.labels.extend([label] * len(video_files))

        # Debug prints
        print(f"Found {len(self.video_paths)} videos across {len(self.classes)} classes.")
        if len(self.video_paths) == 0:
            print("No videos found. Please check the dataset directory structure and paths.")

    def __len__(self):
        return len(self.video_paths)

    def __getitem__(self, idx):
        video_path = self.video_paths[idx]
        label = self.labels[idx]
        frames = self.load_video_frames(video_path)

        if self.transform:
            frames = self.transform(frames)

        return frames, label

    def load_video_frames(self, video_path):
        frames = []
        cap = cv2.VideoCapture(video_path)
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)  # Convert frame to grayscale
            frame = Image.fromarray(frame)  # Convert numpy array to PIL Image
            frames.append(frame)
        cap.release()
        return frames

In [3]:
# Transform for video frames
class ToTensor:
    def __call__(self, frames):
        tensor = torch.stack([transforms.ToTensor()(frame) for frame in frames])  
        return tensor

In [4]:
# Path to the processed_selected_mp4_files directory
root_dir = './processed_selected_mp4_files'  # Update this path

# Dataset and DataLoader
transform = ToTensor()
train_dataset = LipReadingDataset(root_dir, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=10, shuffle=True, collate_fn=lambda x: collate_fn(x))

# Collate function to handle variable-length sequences
def collate_fn(batch):
    videos, labels = zip(*batch)
    max_len = max(len(video) for video in videos)
    padded_videos = []
    for video in videos:
        pad_size = max_len - len(video)
        # Ensure the tensor dimensions match for concatenation

        padded_video = torch.cat([video, torch.zeros((pad_size, video.shape[1], video.shape[2], video.shape[3]))], dim=0)  # Add padding for grayscale frames
        
        # Resize video to (max_len, 1, 224, 224)
        resized_video = torch.stack([resize(frame, (224, 224)) for frame in padded_video])
        padded_videos.append(resized_video)
        # padded_videos.append(padded_video)
    return torch.stack(padded_videos), torch.tensor(labels)

Found 20 videos across 2 classes.


In [5]:
class CNNFeatureExtractor(nn.Module):
    def __init__(self):
        super(CNNFeatureExtractor, self).__init__()
        self.cnn = models.resnet18(weights="DEFAULT")
        self.cnn.conv1 = nn.Conv2d(1, self.cnn.conv1.out_channels, kernel_size=self.cnn.conv1.kernel_size, 
                                   stride=self.cnn.conv1.stride, padding=self.cnn.conv1.padding, bias=False)
        self.cnn = nn.Sequential(*list(self.cnn.children())[:-2])  # Remove the classification layers

    def forward(self, x):
        batch_size, seq_length, c, h, w = x.size()
        x = x.view(batch_size * seq_length, c, h, w)
        features = self.cnn(x)
        features = features.view(batch_size, seq_length, -1)
        return features


In [6]:
class CNNLSTM(nn.Module):
    def __init__(self, cnn_model, hidden_dim, num_classes, num_layers=1):
        super(CNNLSTM, self).__init__()
        self.cnn = cnn_model
        self.lstm = nn.LSTM(input_size=512 * 7 * 7, hidden_size=hidden_dim, num_layers=num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim*2, num_classes)

    def forward(self, x):
        with torch.no_grad():
            cnn_features = self.cnn(x)
        lstm_out, _ = self.lstm(cnn_features)
        output = self.fc(lstm_out[:, -1, :])  # Use the output from the last LSTM cell
        return output


In [7]:
def train(model, dataloader, criterion, optimizer, num_epochs):
    model.train()
    for epoch in range(num_epochs):
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


num_classes = len(train_dataset.classes)  # Automatically get the number of classes
learning_rate = 0.001
num_epochs = 20
batch_size = 10

cnn_model = CNNFeatureExtractor().to(device)
model = CNNLSTM(cnn_model, hidden_dim=256, num_classes=len(train_dataset.classes)).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

dataloader = train_loader

train(model, dataloader, criterion, optimizer, num_epochs=num_epochs)

  return F.conv2d(input, weight, bias, self.stride,


Epoch 1/20, Loss: 1.5678398609161377
Epoch 2/20, Loss: 0.6310233473777771
Epoch 3/20, Loss: 0.5345368385314941
Epoch 4/20, Loss: 0.4503503739833832
Epoch 5/20, Loss: 0.4365369379520416
Epoch 6/20, Loss: 0.3823126256465912
Epoch 7/20, Loss: 0.3784024715423584
Epoch 8/20, Loss: 0.3330995440483093
Epoch 9/20, Loss: 0.2866179645061493
Epoch 10/20, Loss: 0.2673974335193634
Epoch 11/20, Loss: 0.21023885905742645
Epoch 12/20, Loss: 0.2609831988811493
Epoch 13/20, Loss: 0.21097242832183838
Epoch 14/20, Loss: 0.16426411271095276
Epoch 15/20, Loss: 0.14461857080459595
Epoch 16/20, Loss: 0.11699937283992767
Epoch 17/20, Loss: 0.09704429656267166
Epoch 18/20, Loss: 0.11190438270568848
Epoch 19/20, Loss: 0.0853935033082962
Epoch 20/20, Loss: 0.0870797410607338
