In [1]:
import cv2
import mediapipe as mp
import os


In [2]:
# Initialize Mediapipe Face Mesh
mp_face_mesh = mp.solutions.face_mesh.FaceMesh(max_num_faces=1)


In [3]:
def extract_lips(video_path, output_dir):
    cap = cv2.VideoCapture(video_path)
    frame_count = 0

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Convert frame to RGB for Mediapipe
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = mp_face_mesh.process(rgb_frame)

        if results.multi_face_landmarks:
            # Extract lip landmarks
            landmarks = results.multi_face_landmarks[0]
            h, w, _ = frame.shape
            lip_landmarks = [(int(lm.x * w), int(lm.y * h)) for lm in landmarks.landmark[61:81]]
            
            # Crop the lip region
            min_x = min([x for x, y in lip_landmarks])
            max_x = max([x for x, y in lip_landmarks])
            min_y = min([y for x, y in lip_landmarks])
            max_y = max([y for x, y in lip_landmarks])
            lip_roi = frame[min_y:max_y, min_x:max_x]

            # Resize the cropped region
            if lip_roi.size > 0:
                lip_roi_resized = cv2.resize(lip_roi, (100, 50))
                # Save the frame
                frame_path = os.path.join(output_dir, f"frame_{frame_count:04d}.jpg")
                cv2.imwrite(frame_path, lip_roi_resized)
                frame_count += 1

    cap.release()


Model Architecture

In [5]:
pip install torch torchvision torchaudio


Collecting torch
  Downloading torch-2.5.1-cp39-cp39-win_amd64.whl.metadata (28 kB)
Collecting torchvision
  Downloading torchvision-0.20.1-cp39-cp39-win_amd64.whl.metadata (6.2 kB)
Collecting torchaudio
  Downloading torchaudio-2.5.1-cp39-cp39-win_amd64.whl.metadata (6.5 kB)
Collecting filelock (from torch)
  Downloading filelock-3.17.0-py3-none-any.whl.metadata (2.9 kB)
Collecting networkx (from torch)
  Downloading networkx-3.2.1-py3-none-any.whl.metadata (5.2 kB)
Collecting jinja2 (from torch)
  Using cached jinja2-3.1.5-py3-none-any.whl.metadata (2.6 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting sympy==1.13.1 (from torch)
  Downloading sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy==1.13.1->torch)
  Downloading mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Downloading torch-2.5.1-cp39-cp39-win_amd64.whl (203.0 MB)
   ---------------------------------------- 0.0/203.0 MB ? e

In [23]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam

# Define the LipReadingModel
class LipReadingModel(nn.Module):
    def __init__(self, vocab_size):
        super(LipReadingModel, self).__init__()
        
        # Conv3d for processing the sequence of frames
        self.conv1 = nn.Conv3d(1, 32, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
        self.conv2 = nn.Conv3d(32, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
        
        # LSTM layer to capture temporal relationships
        self.lstm = nn.LSTM(input_size=64 * 100 * 50, hidden_size=128, num_layers=2, batch_first=True)
        
        # Output layer
        self.fc = nn.Linear(128, vocab_size)

    def forward(self, x):
        # Apply Conv3d layers
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        
        # Flatten the spatial dimensions for LSTM
        x = x.view(x.size(0), x.size(2), -1)  # [batch_size, sequence_length, features]
        
        # LSTM to process the sequence
        x, _ = self.lstm(x)
        
        # Fully connected layer
        x = self.fc(x)
        
        return x

# Dummy Dataset Class (to load preprocessed frames and text sequences)
class LipReadingDataset(Dataset):
    def __init__(self, frames, texts):
        self.frames = frames  # List of frame sequences
        self.texts = texts  # List of corresponding transcriptions
    
    def __len__(self):
        return len(self.frames)
    
    def __getitem__(self, idx):
        return torch.tensor(self.frames[idx]), torch.tensor(self.texts[idx])

# Example data (replace with actual data)
frames = [torch.randn(30, 1, 100, 50)]  # List of sequences of lip images (e.g., 30 frames)
texts = [[1, 2, 3]]  # Corresponding transcriptions (encoded as integers)

# Create dataset and dataloader
dataset = LipReadingDataset(frames, texts)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

# Define model, optimizer, and loss function
vocab_size = 40  # Example vocab size (can be expanded)
model = LipReadingModel(vocab_size)

optimizer = Adam(model.parameters(), lr=0.001)
ctc_loss = nn.CTCLoss()

# Training loop
for epoch in range(10):  # Example 10 epochs
    for frames, texts in dataloader:
        optimizer.zero_grad()
        
        # Reshape frames to [batch_size, channels, frames, height, width]
        frames = frames.view(frames.size(0), 1, 30, 100, 50)
        
        output = model(frames)
        
        # Calculate lengths for CTC loss
        input_lengths = torch.full((frames.size(0),), output.size(1), dtype=torch.long)
        target_lengths = torch.tensor([len(text) for text in texts], dtype=torch.long)
        
        # Compute CTC loss
        loss = ctc_loss(output.permute(1, 0, 2), texts, input_lengths, target_lengths)
        loss.backward()
        optimizer.step()
        
    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")


  return torch.tensor(self.frames[idx]), torch.tensor(self.texts[idx])


Epoch 1, Loss: -4.896500110626221
Epoch 2, Loss: -3.7988059520721436
Epoch 3, Loss: -3.0832996368408203
Epoch 4, Loss: -2.4976933002471924
Epoch 5, Loss: -1.7474242448806763
Epoch 6, Loss: -0.8763757348060608
Epoch 7, Loss: 0.07741797715425491
Epoch 8, Loss: 1.066719889640808
Epoch 9, Loss: 2.006410598754883
Epoch 10, Loss: 2.7438583374023438


In [25]:
import cv2
import torch

# Load trained model (make sure you save and load your model appropriately)
model.eval()

# Capture video from webcam
cap = cv2.VideoCapture(0)  # Change to video file if needed

sequence = []

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Preprocess frame to extract lip region (using the same preprocessing as above)
    lip_roi = preprocess_frame(frame)  # Implement preprocessing to extract lip region
    sequence.append(lip_roi)
    
    if len(sequence) == 30:  # Process after 30 frames
        input_tensor = torch.tensor(sequence).unsqueeze(0).float()  # Add batch dimension
        
        # Get predictions from model
        with torch.no_grad():
            prediction = model(input_tensor)
        
        # Decode the prediction (use a decoder to convert logits to text)
        predicted_text = decode_prediction(prediction)  # Implement decoding logic
        
        print("Predicted Text:", predicted_text)
        
        sequence = []  # Reset for next sequence

cap.release()
cv2.destroyAllWindows()


In [26]:
# Test the webcam
test_webcam()


NameError: name 'test_webcam' is not defined