In [1]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torchvision.transforms.functional import resize
from torch.utils.data import DataLoader, Dataset
from PIL import Image
import os
import glob
import torchvision.io
import cv2
from tqdm.notebook import tqdm

In [2]:
# Custom dataset class for Lip Reading
class LipReadingDataset(Dataset):
    """Video classification dataset laid out as ``root_dir/<class>/<split>/*.mp4``.

    Every non-hidden sub-directory of ``root_dir`` is one class; its integer
    label is the directory's position in the sorted class list. Each item is
    the list of grayscale frames of one video (optionally passed through
    ``transform``) together with that label.

    Args:
        root_dir: Directory holding one sub-directory per class.
        transform: Optional callable applied to the list of PIL frames.
        split: Name of the per-class sub-directory that videos are read from.
    """

    def __init__(self, root_dir, transform=None, split='train'):
        self.root_dir = root_dir
        self.transform = transform
        self.video_paths = []
        self.labels = []
        # Only real, non-hidden directories are classes. Stray entries such as
        # '.DS_Store', '.ipynb_checkpoints' or a README would otherwise be
        # counted as classes and shift every label after them.
        self.classes = sorted(
            entry for entry in os.listdir(root_dir)
            if not entry.startswith('.')
            and os.path.isdir(os.path.join(root_dir, entry))
        )

        for label, class_name in enumerate(self.classes):
            class_dir = os.path.join(root_dir, class_name, split)
            # glob returns files in filesystem order; sort so that a given
            # index always refers to the same video.
            video_files = sorted(glob.glob(os.path.join(class_dir, "*.mp4")))
            self.video_paths.extend(video_files)
            self.labels.extend([label] * len(video_files))

        # Debug prints
        print(f"Found {len(self.video_paths)} videos across {len(self.classes)} classes.")
        if len(self.video_paths) == 0:
            print("No videos found. Please check the dataset directory structure and paths.")

    def __len__(self):
        """Return the number of videos found."""
        return len(self.video_paths)

    def __getitem__(self, idx):
        """Return ``(frames, label)`` for the video at position ``idx``."""
        video_path = self.video_paths[idx]
        label = self.labels[idx]
        frames = self.load_video_frames(video_path)

        if self.transform:
            frames = self.transform(frames)

        return frames, label

    def load_video_frames(self, video_path):
        """Decode every frame of ``video_path`` into a grayscale PIL image.

        Returns:
            List of PIL images in decoding order; empty if nothing could be read.
        """
        frames = []
        cap = cv2.VideoCapture(video_path)
        try:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)  # Convert frame to grayscale
                frames.append(Image.fromarray(frame))  # numpy array -> PIL Image
        finally:
            # Release the capture handle even if a frame conversion raises.
            cap.release()
        return frames

In [3]:
# Transform for video frames
class ToTensor:
    """Callable that turns a sequence of frames into one stacked tensor."""

    def __call__(self, frames):
        # Convert each frame on its own, then stack along a new leading axis.
        to_tensor = transforms.ToTensor()
        converted = []
        for frame in frames:
            converted.append(to_tensor(frame))
        return torch.stack(converted)

In [4]:
# Location of the processed_selected_mp4_files directory
root_dir = './processed_selected_mp4_files'  # Update this path

# Build the dataset, then wrap it in a shuffling DataLoader
transform = ToTensor()
train_dataset = LipReadingDataset(root_dir, transform=transform)
train_loader = DataLoader(
    train_dataset,
    batch_size=10,
    shuffle=True,
    # The lambda defers the lookup of `collate_fn` until a batch is built;
    # the function itself is defined further down in this cell.
    collate_fn=lambda batch: collate_fn(batch),
)

# Collate function to handle variable-length sequences
def collate_fn(batch, size=(224, 224)):
    """Batch variable-length videos by resizing frames and zero-padding in time.

    Args:
        batch: Sequence of ``(video, label)`` pairs, where each ``video`` is a
            4-D tensor indexed as (frames, channels, height, width).
        size: Target ``(height, width)`` that every frame is resized to.

    Returns:
        A tuple ``(videos, labels)``. ``videos`` stacks the clips, each padded
        with zero frames up to the longest clip in the batch, into one tensor
        of shape (batch, max_frames, channels, *size). ``labels`` is a tensor
        built from the labels in the batch.
    """
    videos, labels = zip(*batch)
    max_len = max(len(video) for video in videos)
    padded_videos = []
    for video in videos:
        # Resize the real frames in one batched call *before* padding, so no
        # work is spent interpolating all-zero padding frames.
        if len(video) > 0:
            resized_video = resize(video, size)
        else:
            resized_video = video.new_zeros((0, video.shape[1], *size))

        pad_size = max_len - len(video)
        if pad_size > 0:
            # new_zeros inherits dtype and device from the clip, so the
            # concatenation never mixes dtypes or devices.
            padding = resized_video.new_zeros((pad_size,) + tuple(resized_video.shape[1:]))
            resized_video = torch.cat([resized_video, padding], dim=0)
        padded_videos.append(resized_video)
    return torch.stack(padded_videos), torch.tensor(labels)


Found 20 videos across 2 classes.


In [5]:
class LipReadingModel(nn.Module):
    """Per-frame CNN encoder followed by an LSTM and a linear classifier.

    Args:
        num_classes: Number of output logits.
        input_size: ``(height, width)`` of the frames the model will receive.
            It sizes the LSTM input: the two 2x2 max-pools shrink each spatial
            dimension by a factor of 4 and the last conv has 64 channels.
        hidden_size: Width of the LSTM hidden state.
    """

    def __init__(self, num_classes, input_size=(224, 224), hidden_size=256):
        super().__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        height, width = input_size
        # With the defaults this is 64 * 56 * 56, the previously hard-coded size.
        feature_dim = 64 * (height // 4) * (width // 4)
        self.lstm = nn.LSTM(feature_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x, lengths=None):
        """Classify a batch of clips.

        Args:
            x: Tensor of shape (batch, seq_len, channels, height, width).
            lengths: Optional per-clip count of real (unpadded) frames. When
                given, the logits are computed from the LSTM output at each
                clip's last real frame instead of the last (possibly padding)
                timestep.

        Returns:
            Tensor of shape (batch, num_classes) holding the class logits.
        """
        batch_size, seq_len, c, h, w = x.size()
        # reshape rather than view: view raises on non-contiguous inputs
        # (for example a transposed or sliced batch).
        x = x.reshape(batch_size * seq_len, c, h, w)
        x = self.cnn(x)
        x = x.reshape(batch_size, seq_len, -1)
        x, _ = self.lstm(x)
        if lengths is None:
            last_step = x[:, -1, :]
        else:
            last_index = torch.as_tensor(lengths, dtype=torch.long, device=x.device)
            # Clamp so that a zero or oversized length still indexes a valid timestep.
            last_index = last_index.clamp(min=1, max=seq_len) - 1
            last_step = x[torch.arange(batch_size, device=x.device), last_index]
        return self.fc(last_step)


In [6]:
def train_model(model, train_loader, criterion, optimizer, num_epochs, device):
    """Train ``model`` for ``num_epochs`` passes over ``train_loader``.

    Args:
        model: Module called on each batch of videos.
        train_loader: Iterable yielding ``(videos, labels)`` batches.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepped once per batch.
        num_epochs: Number of passes over ``train_loader``.
        device: Device that each batch is moved to before the forward pass.

    Returns:
        List with the average batch loss of every epoch, in order. An epoch in
        which the loader yielded no batches contributes NaN.
    """
    epoch_losses = []
    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0.0
        num_batches = 0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}", unit="batch")
        for videos, labels in progress_bar:
            videos = videos.to(device)
            labels = labels.to(device)

            outputs = model(videos)
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Read the scalar once and reuse it for both the total and the bar.
            batch_loss = loss.item()
            epoch_loss += batch_loss
            num_batches += 1
            progress_bar.set_postfix(loss=batch_loss)

        # Count batches actually seen so an empty loader cannot divide by zero.
        avg_loss = epoch_loss / num_batches if num_batches else float('nan')
        epoch_losses.append(avg_loss)
        print(f'Epoch [{epoch + 1}/{num_epochs}], Average Loss: {avg_loss:.4f}')
    return epoch_losses

# Training configuration
# Seed the global RNG before the model is built so that weight initialisation
# is repeatable from one run of this cell to the next.
SEED = 42
torch.manual_seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
num_classes = len(train_dataset.classes)
learning_rate = 0.001
num_epochs = 20

# Initialize model, loss, and optimizer
model = LipReadingModel(num_classes=num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
train_model(model, train_loader, criterion, optimizer, num_epochs, device)

Epoch 1/20:   0%|          | 0/2 [00:00<?, ?batch/s]

Epoch [1/20], Average Loss: 1.5201


Epoch 2/20:   0%|          | 0/2 [00:00<?, ?batch/s]

Epoch [2/20], Average Loss: 1.6781


Epoch 3/20:   0%|          | 0/2 [00:00<?, ?batch/s]

Epoch [3/20], Average Loss: 1.4613


Epoch 4/20:   0%|          | 0/2 [00:00<?, ?batch/s]

Epoch [4/20], Average Loss: 1.2949


Epoch 5/20:   0%|          | 0/2 [00:00<?, ?batch/s]

Epoch [5/20], Average Loss: 1.1270


Epoch 6/20:   0%|          | 0/2 [00:00<?, ?batch/s]

Epoch [6/20], Average Loss: 0.9771


Epoch 7/20:   0%|          | 0/2 [00:00<?, ?batch/s]

Epoch [7/20], Average Loss: 0.8446


Epoch 8/20:   0%|          | 0/2 [00:00<?, ?batch/s]

Epoch [8/20], Average Loss: 0.7657


Epoch 9/20:   0%|          | 0/2 [00:00<?, ?batch/s]

Epoch [9/20], Average Loss: 0.7221


Epoch 10/20:   0%|          | 0/2 [00:00<?, ?batch/s]

Epoch [10/20], Average Loss: 0.7023


Epoch 11/20:   0%|          | 0/2 [00:00<?, ?batch/s]

Epoch [11/20], Average Loss: 0.6941


Epoch 12/20:   0%|          | 0/2 [00:00<?, ?batch/s]

Epoch [12/20], Average Loss: 0.7162


Epoch 13/20:   0%|          | 0/2 [00:00<?, ?batch/s]

Epoch [13/20], Average Loss: 0.7335


Epoch 14/20:   0%|          | 0/2 [00:00<?, ?batch/s]

Epoch [14/20], Average Loss: 0.7467


Epoch 15/20:   0%|          | 0/2 [00:00<?, ?batch/s]

Epoch [15/20], Average Loss: 0.7481


Epoch 16/20:   0%|          | 0/2 [00:00<?, ?batch/s]

Epoch [16/20], Average Loss: 0.7422


Epoch 17/20:   0%|          | 0/2 [00:00<?, ?batch/s]

Epoch [17/20], Average Loss: 0.7278


Epoch 18/20:   0%|          | 0/2 [00:00<?, ?batch/s]

Epoch [18/20], Average Loss: 0.7176


Epoch 19/20:   0%|          | 0/2 [00:00<?, ?batch/s]

Epoch [19/20], Average Loss: 0.7094


Epoch 20/20:   0%|          | 0/2 [00:00<?, ?batch/s]

Epoch [20/20], Average Loss: 0.7011
