In [2]:
import os
import cv2
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
from PIL import Image
import matplotlib.pyplot as plt
import time
from tqdm.notebook import tqdm
import glob
import torchvision.io
from sklearn.metrics import accuracy_score, top_k_accuracy_score
import copy
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


In [2]:
class CNN(nn.Module):
    def __init__(self, num_classes=500):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv3d(1, 64, kernel_size=(3, 3, 3), padding=1)
        self.bn1 = nn.BatchNorm3d(64)
        self.pool1 = nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2))

        self.conv2 = nn.Conv3d(64, 128, kernel_size=(3, 3, 3), padding=1)
        self.bn2 = nn.BatchNorm3d(128)
        self.pool2 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))

        self.conv3a = nn.Conv3d(128, 256, kernel_size=(3, 3, 3), padding=1)
        self.bn3a = nn.BatchNorm3d(256)
        self.conv3b = nn.Conv3d(256, 256, kernel_size=(3, 3, 3), padding=1)
        self.bn3b = nn.BatchNorm3d(256)
        self.pool3 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))

        self.conv4a = nn.Conv3d(256, 512, kernel_size=(3, 3, 3), padding=1)
        self.bn4a = nn.BatchNorm3d(512)
        self.conv4b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=1)
        self.bn4b = nn.BatchNorm3d(512)

        self.conv5a = nn.Conv3d(512, 512, kernel_size=(3, 3, 3))
        self.bn5a = nn.BatchNorm3d(512)
        self.conv5b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3))
        self.bn5b = nn.BatchNorm3d(512)
        self.pool4 = nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(2, 2, 2))

        self.flatten = nn.Flatten()

        self.fc6 = nn.Linear(512 * 2 * 2 * 2, 4096)
        self.fc7 = nn.Linear(4096, 1024)
        self.fc8 = nn.Linear(1024, num_classes)
        
        self.dropout = nn.Dropout(p=0.3)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.relu(self.bn1(self.conv1(x)))
        x = self.pool1(x)

        x = self.relu(self.bn2(self.conv2(x)))
        x = self.pool2(x)

        x = self.relu(self.bn3a(self.conv3a(x)))
        x = self.relu(self.bn3b(self.conv3b(x)))
        x = self.pool3(x)

        x = self.relu(self.bn4a(self.conv4a(x)))
        x = self.relu(self.bn4b(self.conv4b(x)))

        x = self.relu(self.bn5a(self.conv5a(x)))
        x = self.relu(self.bn5b(self.conv5b(x)))
        x = self.pool4(x)

        x = self.flatten(x)
        x = self.relu(self.fc6(x))
        x = self.dropout(x)
        x = self.relu(self.fc7(x))
        x = self.dropout(x)
        x = self.fc8(x)
        x = self.softmax(x)
        return x

In [3]:
class LipReadingDataset(Dataset):
    def __init__(self, root_dir, phase='train', transform=None, fixed_frame_count=29, max_classes=500):
        self.root_dir = root_dir
        self.phase = phase
        self.transform = transform
        self.classes = sorted(os.listdir(root_dir))[:max_classes]
        self.files = []
        self.fixed_frame_count = fixed_frame_count
        
        for cls in self.classes:
            class_dir = os.path.join(root_dir, cls, phase)
            for file in os.listdir(class_dir):
                if file.endswith('.mp4'):
                    self.files.append((os.path.join(class_dir, file), cls))
        
    def __len__(self):
        return len(self.files)
        
    def __getitem__(self, idx):
        video_path, label = self.files[idx]
        frames = self.load_video(video_path)
        
        if self.transform:
            frames = [self.transform(frame) for frame in frames]
        
        label = self.classes.index(label)
        frames = torch.stack(frames)
        frames = frames.permute(1, 0, 2, 3)
        
        return frames, label
    
    def load_video(self, video_path):
        cap = cv2.VideoCapture(video_path)
        frames = []
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            frames.append(frame)
        cap.release()
        return frames

In [4]:
batch_size = 32
root_dir = '/kaggle/input/processed-25-lrw/preprocessed_25'
transform = transforms.Compose([
    transforms.ToTensor()
])
train_dataset = LipReadingDataset(root_dir, 'train', transform=transform)
test_dataset = LipReadingDataset(root_dir, 'test', transform=transform)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [5]:
def combine_batch_and_frames(in_vid):
    batch_size, channels, frames, x, y = in_vid.shape
    in_vid = in_vid.transpose(1,2)
    return in_vid.reshape(batch_size * frames, channels, x ,y)

class ResBlock(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(ResBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.downsample = nn.Sequential(
                                nn.Conv2d(in_channels, out_channels, kernel_size=1),
                                nn.BatchNorm2d(out_channels),
                            )
        self.relu = nn.ReLU()
        
    
    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        residual = self.downsample(x)
        out += residual
        out = self.relu(out)
        return out
        
class ResNet(nn.Module):
    def __init__(self):
        super(ResNet, self).__init__()
        self.layer1 = ResBlock(64, 64)
        self.layer2 = ResBlock(64, 128)
        self.layer3 = ResBlock(128, 256)
        self.layer4 = ResBlock(256, 512)
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        
    def forward(self, x):
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        return x

class Chomp1d(nn.Module):
    def __init__(self, chomp_size):
        super(Chomp1d, self).__init__()
        self.chomp_size = chomp_size

    def forward(self, x):
        return x[:, :, :-self.chomp_size].contiguous()
        
class TemporalBlock(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride, dilation, padding, dropout=0.1):
        super(TemporalBlock, self).__init__()
        self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_size, stride=stride, padding=padding, dilation=dilation)
        self.chomp1 = Chomp1d(padding)
        self.bn1 = nn.BatchNorm1d(out_channels)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout)

        self.conv2 = nn.Conv1d(out_channels, out_channels, kernel_size, stride=stride, padding=padding, dilation=dilation)
        self.chomp2 = Chomp1d(padding)
        self.bn2 = nn.BatchNorm1d(out_channels)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(dropout)

        self.net = nn.Sequential(
            self.conv1, self.chomp1, self.bn1, self.relu1, self.dropout1,
            self.conv2, self.chomp2, self.bn2, self.relu2, self.dropout2
        )
        self.downsample = nn.Conv1d(in_channels, out_channels, 1) if in_channels != out_channels else None
        self.relu = nn.ReLU()

    def forward(self, x):
        out = self.net(x)
        res = x if self.downsample is None else self.downsample(x)
        return self.relu(out + res)

class TemporalConvNet(nn.Module):
    def __init__(self, num_inputs, num_channels, kernel_size=2, dropout=0.1):
        super(TemporalConvNet, self).__init__()
        layers = []
        num_levels = len(num_channels)
        for i in range(num_levels):
            dilation_size = 2 ** i
            in_channels = num_inputs if i == 0 else num_channels[i-1]
            out_channels = num_channels[i]
            layers += [TemporalBlock(in_channels, out_channels, kernel_size, stride=1, dilation=dilation_size,
                                     padding=(kernel_size-1) * dilation_size, dropout=dropout)]

        self.network = nn.Sequential(*layers)

    def forward(self, x):
        return self.network(x)

class LipReading(nn.Module):
    def __init__(self):
        super(LipReading, self).__init__()
        self.frontend3D = nn.Sequential(
                        nn.Conv3d(1, 64, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=(2, 3, 3), bias=False),
                        nn.BatchNorm3d(64),
                        nn.ReLU(),
                        nn.MaxPool3d( kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1)))
        
        self.resnet = ResNet()
        self.tcn = TemporalConvNet(512, [32, 64, 128], kernel_size=3, dropout=0.3)
        self.linear = nn.Linear(128, 25)
        
    
    def forward(self, x):
        batch_size = x.shape[0]
        x = self.frontend3D(x)
        new_frames = x.shape[2]
        x = combine_batch_and_frames(x)
        x = self.resnet(x)
        x = x.view(batch_size, new_frames, x.size(1))
        x = x.transpose(1,2)
        x = self.tcn(x)
        x = torch.mean(x, dim=2)
        x = self.linear(x)
        return x



In [None]:
#
#3dto2dtensor changes shape from (batch, channels, frames, x, y) to
# (batch * frames, channels, x, y)
# this shape is sent to resnet
# resnet runs conv2d's on this shape (idk why conv3x3 is a 2d conv really...)
#basic block is a conv2d, batch norm, relu, conv2d, batch norm, rlu
#resnet is 4 make_layers, where each make_layer is a loop making x 
# number of layers with given channels
# layers is a len 4 array of 2's, so each make_layer makes 2 layers
# final is a block of 512 channels, idrk the x, y
# adaptiveavgpool takes avg of x, y; gets 512 channels of 1x1
# finally, rehsapes to (batch*frames, channels) and returns
# now, back in Lipreading module, we reshape back to (batch, frames, channels)
# this, we reshape into (batch, channels, frames) to run our 1d conv on
# TCN iterates across frames e.g. 
#time = 1 to 25. kernel size of 3 goes 1 to 3, 2 to 4, 3 to 5, etc. 
# In addition, it takes all 512 channels and dot products to create a single value
# which becomes the single element in the output
# There are out_channel channels in the output, so with proper padding,
# we still have 25 frames, but each input channel is used for every output channel
# and we have out_channel output_channels

#Essentially, each conv1d is a weight matrix of size (in_channels, kernel_size)
# and we have out_channels number of these weight matrices

#Then, we have n amount of these conv1d things in sequence, altering
#dilation, padding, and stride, so that each subsequent conv1d accesses
# a wider area, ultimately returning (batch size, output_channels, frames) shape
# this, we average over all the frames to get (batch_size, output_channels)
# finally, we pass this into a linear layer to get our final scores

#What we have is the same frontend conv3d
#A miniature resnet with 2 64, 2 128, and 2 256 layers
# resulting in a (batch * frames, 256) output from resnet
#then 4 tcn layers



In [7]:
def evaluate_model(model, test_loader, device, criterion):
    test_loss = 0
    correct = 0
    total = 0
    model.eval()  
    with torch.no_grad():  
        for batch_idx, (inputs, targets) in enumerate(test_loader):
            inputs, targets = inputs.to(device), targets.to(device)  
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            test_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

    test_loss /= len(test_loader)
    accuracy = 100. * correct / total
    model.train()
    return test_loss, accuracy

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
num_classes = len(train_dataset.classes)
learning_rate = 0.001
weight_decay = 0.001
num_epochs = 20

model = LipReading()
model = nn.DataParallel(model)
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

best_test_accuracy = 0
best_model = None

train_losses = []
test_losses = []
test_accuracies = []

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}')
    for videos, labels in progress_bar:
        videos = videos.to(device)
        labels = labels.to(device)
        outputs = model(videos)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        epoch_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())

    avg_loss = epoch_loss / len(train_loader)
    test_loss, test_accuracy = evaluate_model(model, test_loader, device, criterion)
    if test_accuracy > best_test_accuracy:
        best_test_accuracy = test_accuracy
        best_model = copy.deepcopy(model.state_dict())
    test_losses.append(test_loss)
    train_losses.append(avg_loss)
    test_accuracies.append(test_accuracy)
    print(f'Epoch [{epoch + 1}/{num_epochs}], Avg Loss: {avg_loss:.4f}, Test Loss: {test_loss: .4f} Testing Accuracy: {test_accuracy:.4f}')
    
    scheduler.step()  

    
model_save_path = f'/kaggle/working/tcn_model25_less_channels_Acc{best_test_accuracy:2.0f}.pth'
torch.save(best_model, model_save_path)
print(f"Model saved to {model_save_path}. Accuracy: {best_test_accuracy}")



In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
criterion = nn.CrossEntropyLoss()

model = LipReading()
model = nn.DataParallel(model)
model.load_state_dict(torch.load('/kaggle/working/tcn_model25Acc77.pth'))
model.to(device)
test_loss, test_accuracy = evaluate_model(model, test_loader, device, criterion)
print(test_loss, test_accuracy)

0.7780043211502906 77.14285714285714


In [17]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
criterion = nn.CrossEntropyLoss()

model = CNN(num_classes=25)
model = nn.DataParallel(model) #For 2 GPUs
model.load_state_dict(torch.load('/kaggle/input/cnn_25/pytorch/1/1/cnn_model_25.pth'))
model.to(device)
test_loss, test_accuracy = evaluate_model(model, test_loader, device, criterion)
print(test_loss, test_accuracy)

2.617603448721079 67.59183673469387


In [16]:
def create_confusion_matrix(model, data_loader, device, criterion, path, classes):
    test_loss = 0
    correct = 0
    total = 0
    all_preds = []
    all_targets = []

    model.eval()  
    with torch.no_grad():  
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device) 
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            test_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

            all_preds.extend(predicted.cpu().numpy())
            all_targets.extend(targets.cpu().numpy())

    test_loss /= len(test_loader)
    accuracy = 100. * correct / total

    print(f'Accuracy: {accuracy:.2f}%')
    print(f'Test Loss: {test_loss:.6f}')

    cm = confusion_matrix(all_targets, all_preds)

    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classes)
    disp.plot(cmap=plt.cm.Blues)
    plt.xticks(rotation=90)
    plt.savefig(path)
    plt.show()

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
criterion = nn.CrossEntropyLoss()
classes = sorted(os.listdir('/kaggle/input/processed-25-lrw/preprocessed_25'))

model = CNN(num_classes=25)
model = nn.DataParallel(model) #For 2 GPUs
model.load_state_dict(torch.load('/kaggle/input/cnn_25/pytorch/1/1/cnn_model_25.pth'))
model.to(device)
create_confusion_matrix(model, test_loader, device, criterion, '/kaggle/working/cnn_confusion_mat', classes)


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
criterion = nn.CrossEntropyLoss()
classes = sorted(os.listdir('/kaggle/input/processed-25-lrw/preprocessed_25'))


model = LipReading()
model = nn.DataParallel(model) #For 2 GPUs
model.load_state_dict(torch.load('/kaggle/working/tcn_model25Acc77.pth'))
model.to(device)
create_confusion_matrix(model, test_loader, device, criterion, '/kaggle/working/tcn_confusion_mat', classes)
