## Webcam test

In [4]:
import time
import cv2
from cvzone.HandTrackingModule import HandDetector
from cvzone.PoseModule import PoseDetector


def webcum():
    """
    Smoke-test the webcam together with the cvzone hand and pose detectors.

    Opens capture device 0 and, in a loop, grabs a frame, runs both detectors
    on it, prints the detected hands and the pose landmark list, then sleeps
    for two seconds. The loop ends when the camera stops delivering frames or
    when it is interrupted (e.g. Ctrl+C / kernel interrupt); the capture
    device is released in both cases.
    """
    cap = cv2.VideoCapture(0)
    hands_detector = HandDetector()
    pose_detector = PoseDetector()
    try:
        while True:
            success, img = cap.read()
            if not success:
                # No frame was delivered (camera missing, busy or unplugged);
                # passing the empty frame to the detectors would only crash.
                break

            hands, _ = hands_detector.findHands(img)
            print(hands)

            # findPose has to run before findPosition can report landmarks
            pose_detector.findPose(img)
            lmList, _ = pose_detector.findPosition(img, bboxWithHands=False)
            print(lmList)
            print()

            # Throttle the output so it stays readable
            time.sleep(2)
            # To preview the frames, uncomment:
            # cv2.imshow("Webcam", img)
            # cv2.waitKey(1)
    finally:
        # Free the camera even when the loop is interrupted
        cap.release()

## Dataset

In [8]:
import json
import os
import torch
from torch.utils.data import Dataset, DataLoader
from cvzone.HandTrackingModule import HandDetector
from cvzone.PoseModule import PoseDetector
import cv2

In [38]:
class ASLDataset(Dataset):
    """
    A custom dataset class for loading American Sign Language (ASL) videos and their corresponding labels.

    Every video is decoded once, at construction time. Each frame inside the
    annotated action range is converted into a flat list of landmark
    coordinates: 21 points for each hand and 33 pose points, 3 values per
    point (225 values per frame).

    Args:
        video_folder (str): Path to the folder containing the ASL videos.
        name_with_label (dict): A dictionary mapping video names (without the
            '.mp4' extension) to [label, start_frame, end_frame].
        classes (list): A list of class names.
        transform (callable, optional): A function/transform to apply to the frames of the videos.

    Attributes:
        video_folder (str): Path to the folder containing the ASL videos.
        name_with_label (dict): A dictionary mapping video names to their labels.
        videos_names (list): A list of video names.
        classes (list): A list of class names.
        transform (callable, optional): A function/transform to apply to the frames of the videos.
        hands_detector (HandDetector): An instance of the HandDetector class for hand detection.
        pose_detector (PoseDetector): An instance of the PoseDetector class for pose detection.
        max_frame_count (int): Largest number of collected frames among the loaded videos.
        video_tensor_sequences (list): One tensor of frame points per video,
            in the same order as videos_names.
    """

    # Layout of one frame vector: [right hand | left hand | pose]
    COORDS_PER_POINT = 3
    HAND_POINTS = 21
    POSE_POINTS = 33
    HAND_SIZE = HAND_POINTS * COORDS_PER_POINT
    POSE_OFFSET = 2 * HAND_SIZE
    FRAME_SIZE = POSE_OFFSET + POSE_POINTS * COORDS_PER_POINT

    def __init__(self, video_folder, name_with_label, classes, transform=None):
        self.video_folder = video_folder
        self.name_with_label = name_with_label
        self.videos_names = list(self.name_with_label.keys())
        self.classes = classes
        self.transform = transform
        self.hands_detector = HandDetector()
        self.pose_detector = PoseDetector()
        self.max_frame_count = 0

        # Load videos
        self.video_tensor_sequences = self._load_all_videos()

    def _load_all_videos(self):
        """
        Loads all videos in advance.

        Also updates max_frame_count with the longest collected sequence.

        Returns:
            list: One tensor per video, built from the per-frame point lists.
        """
        total = len(self.videos_names)
        video_tensor_sequences = []

        for index, name in enumerate(self.videos_names):
            print(f"{index} out of {total}")
            frames_points = self._load_video_points(name)
            self.max_frame_count = max(self.max_frame_count, len(frames_points))
            # Convert the list of frames to a PyTorch tensor
            video_tensor_sequences.append(torch.tensor(frames_points))

        # Without this return the constructor would store None
        return video_tensor_sequences

    def _load_video_points(self, name):
        """
        Collects the landmark points of every frame in the action range of one video.

        Args:
            name (str): Video name, a key of name_with_label.

        Returns:
            list: One list of FRAME_SIZE values per collected frame.
        """
        video_path = self.video_folder + '/' + name + '.mp4'
        annotation = self.name_with_label[name]
        start_frame, end_frame = annotation[1], annotation[2]

        frames_points = []
        frame_cnt = 0
        # Open the video file using OpenCV
        video = cv2.VideoCapture(video_path)
        try:
            while video.isOpened():
                ret, frame = video.read()
                if not ret:
                    break
                # Frames are counted from 1
                frame_cnt += 1
                if frame_cnt > end_frame:
                    # Nothing after the action range is used, stop decoding
                    break
                if frame_cnt >= start_frame:
                    frames_points.append(self._extract_points(frame))
        finally:
            # Release the video object even if a detector raises
            video.release()
        return frames_points

    def _extract_points(self, frame):
        """
        Runs the hand and pose detectors on one frame.

        Args:
            frame: Frame as returned by cv2.VideoCapture.read().

        Returns:
            list: FRAME_SIZE values; slots of undetected landmarks stay 0.
        """
        # Perform any necessary preprocessing on the frame
        if self.transform is not None:
            frame = self.transform(frame)

        points = [0] * self.FRAME_SIZE

        # Recognize hands: hands typed 'Left' go to the second hand slot,
        # everything else to the first one
        hands, _ = self.hands_detector.findHands(frame)
        for hand in hands:
            offset = self.HAND_SIZE if hand.get('type') == 'Left' else 0
            for j, landmark in enumerate(hand.get('lmList')):
                for k in range(self.COORDS_PER_POINT):
                    points[offset + j * self.COORDS_PER_POINT + k] = landmark[k]

        # Recognize the pose and collect points
        self.pose_detector.findPose(frame)
        lmList, _ = self.pose_detector.findPosition(frame, bboxWithHands=False)
        for i, landmark in enumerate(lmList):
            # Element 0 is skipped (presumably the landmark id - TODO confirm
            # against the installed cvzone version)
            for j in range(1, self.COORDS_PER_POINT + 1):
                points[self.POSE_OFFSET + i * self.COORDS_PER_POINT + j - 1] = landmark[j]
        return points

    def __len__(self):
        """
        Returns the number of videos in the dataset.

        Returns:
            int: The number of videos in the dataset.
        """
        return len(self.videos_names)

    def __getitem__(self, index):
        """
        Retrieves a video and its corresponding label from the dataset.

        Args:
            index (int): The index of the video to retrieve.

        Returns:
            tuple: A tuple containing the video frames as a PyTorch tensor and the label.
        """
        name = self.videos_names[index]
        return self.video_tensor_sequences[index], self.name_with_label[name][0]


In [39]:
def read_preprocess_json(json_name, videos_root):
    """
    Read json and separate videos on predefined subsets(train, val, test). Check for existence of videos.

    Only '.mp4' files found in videos_root are considered; a video is kept
    when its file name without the extension is a key of the json. Entries
    whose subset is not 'train', 'val' or 'test' are ignored.

    :param json_name: path or name of json file in format {'video_name': {'subset': 'train', 'action': [class_num,
    start_frame, end_frame]}} (keys carry no file extension)
    :param videos_root: root folder of all videos
    :return: train, validation and test dictionaries in format {'video_name': [class_num, start_frame, end_frame]}
    """
    # Context manager so the file handle is closed right after parsing
    with open(json_name) as json_file:
        videos = json.load(json_file)

    subsets = {'train': {}, 'val': {}, 'test': {}}
    for file_name in os.listdir(videos_root):
        name, extension = os.path.splitext(file_name)
        if extension != '.mp4':
            # Blindly cutting four characters would match unrelated files
            continue
        info = videos.get(name)
        if info is None:
            continue
        target = subsets.get(info['subset'])
        if target is not None:
            target[name] = info['action']
    return subsets['train'], subsets['val'], subsets['test']

In [40]:
def read_classes(path):
    """
    Read the class list file into a dictionary.

    Every non-blank line is expected to hold a class number and a class name
    separated by a tab.

    :param path: path to the tab-separated class list file
    :return: dictionary in format {class_num: class_name}
    """
    classes = dict()
    with open(path, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Blank (e.g. trailing) lines carry no class, skip them
                continue
            fields = line.split('\t')
            classes[int(fields[0])] = fields[1]
    return classes

In [41]:
# Build the train, validation and test datasets.
# Every ASLDataset decodes all of its videos while it is constructed, so this
# cell prints one progress line per video.
train, val, test = read_preprocess_json('wlasl_dataset/output.json', 'wlasl_dataset/videos')
for subset in (train, val):
    print(len(subset))

classes = read_classes('wlasl_dataset/wlasl_class_list.txt')
# The generator is consumed in order, so the datasets are built train -> val -> test
train_dataset, val_dataset, test_dataset = (
    ASLDataset('wlasl_dataset/videos', subset, classes) for subset in (train, val, test)
)

# Longest collected frame sequence of each dataset
for dataset in (train_dataset, val_dataset, test_dataset):
    print(dataset.max_frame_count)

32
6
0 out of 32
1 out of 32
2 out of 32
3 out of 32
4 out of 32
5 out of 32
6 out of 32
7 out of 32
8 out of 32
9 out of 32
10 out of 32
11 out of 32
12 out of 32
13 out of 32
14 out of 32
15 out of 32
16 out of 32
17 out of 32
18 out of 32
19 out of 32
20 out of 32
21 out of 32
22 out of 32
23 out of 32
24 out of 32
25 out of 32
26 out of 32
27 out of 32
28 out of 32
29 out of 32
30 out of 32
31 out of 32
0 out of 6
1 out of 6
2 out of 6
3 out of 6
4 out of 6
5 out of 6
0 out of 7
1 out of 7
2 out of 7
3 out of 7
4 out of 7
5 out of 7
6 out of 7
103
73
101


In [None]:
# Create dataloaders

batch_size = 4


def _pad_collate(batch):
    """
    Merge (clip, label) samples of different lengths into one batch.

    Clips are converted to float and zero-padded at the end up to the longest
    clip of the batch; labels are stacked into a single tensor.

    NOTE(review): assumes every clip is a 2-D (frames, features) tensor and
    every label is a plain number - a clip with no collected frames would not
    fit this layout; confirm against the dataset contents.
    """
    from torch.nn.utils.rnn import pad_sequence

    clips, labels = zip(*batch)
    padded = pad_sequence([clip.float() for clip in clips], batch_first=True)
    return padded, torch.tensor(labels)


# DataLoader requires the dataset argument; only the training data is shuffled
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=_pad_collate)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=_pad_collate)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=_pad_collate)

## Model

In [2]:
import torch.nn as nn

class SequenceModel(nn.Module):
    """
    Sequence classifier: three stacked LSTM layers followed by a small MLP head.

    The input is a batch-first tensor of shape (batch, frames, input_size).
    The output has shape (batch, num_classes) and holds class probabilities
    (softmax over the class dimension), computed from the last time step of
    the LSTM stack.

    Parameter
    ---------
    input_size : int
      Number of features per frame. The default (1662) is the value that was
      hard-coded before; ASLDataset in this file produces 225 values per
      frame, so pass input_size=225 for that data.
    num_classes : int
      Number of output classes (default 32, the previously hard-coded value).

    NOTE(review): the head ends with Softmax; if the loss is
    nn.CrossEntropyLoss (which expects raw logits) the softmax is applied
    twice - confirm against the criterion actually used.
    """

    def __init__(self, input_size: int = 1662, num_classes: int = 32) -> None:
        super().__init__()
        # Kept as a Sequential container so parameter names stay lstm.0, lstm.2,
        # lstm.4; it is iterated manually in forward() because nn.LSTM returns
        # a tuple that the following layer cannot take as input.
        self.lstm = nn.Sequential(
            nn.LSTM(input_size, 64, batch_first=True),
            nn.ReLU(),
            nn.LSTM(64, 128, batch_first=True),
            nn.ReLU(),
            nn.LSTM(128, 64, batch_first=True),
            nn.ReLU()
        )
        self.linear = nn.Sequential(
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, num_classes),
            nn.Softmax(dim=1),
        )

    def forward(self, x):
        for layer in self.lstm:
            if isinstance(layer, nn.LSTM):
                # nn.LSTM returns (output, (h_n, c_n)); only the output sequence
                # is passed on
                x, _ = layer(x)
            else:
                x = layer(x)

        # Keep the last time step only: (batch, frames, 64) -> (batch, 64),
        # so that Softmax(dim=1) normalizes over classes, not over frames
        x = x[:, -1, :]
        x = self.linear(x)

        return x

## Training

In [None]:
from sklearn.metrics import f1_score, accuracy_score
from tqdm import tqdm


def train_model(model: nn.Module, epochs: int, criterion, optimizer):
    """
    Function that trains model using number of epochs, loss function, optimizer.
    After every epoch evaluates the model on the test dataloader and prints
    train and test loss together with the macro f1 score.

    Relies on the module-level names train_dataloader, train_dataset,
    test_dataloader and device.

    Parameter
    ---------
    model : nn.Module
      Model to train.
    epochs: int
      Number of train epochs
    criterion
      The loss function from pytorch
    optimizer
      The optimizer from pytorch
    """

    # Train the model
    for epoch in range(epochs):
        model.train()

        # Predicted and true classes collected over the epoch for the f1 score
        predicted_train = []
        true_train = []

        train_loss = 0.0

        bar = tqdm(train_dataloader)
        iterations = 0

        for inputs, outputs in bar:
            # Zero the gradients
            optimizer.zero_grad()

            # Forward pass
            predictions = model(inputs.to(device))
            loss = criterion(predictions, outputs.to(device))
            train_loss += loss.item()

            # Backward pass
            loss.backward()
            optimizer.step()

            # Predicted class = index of the largest score of every sample
            predicted_train.extend(
                predictions.detach().flatten(start_dim=1).argmax(dim=1).tolist()
            )
            if outputs.dim() > 1:
                # One-hot / score encoded targets
                true_train.extend(outputs.flatten(start_dim=1).argmax(dim=1).tolist())
            else:
                # Targets already are class indices; argmax of a scalar
                # would always give 0
                true_train.extend(int(label) for label in outputs.tolist())
            iterations += 1
            bar.set_postfix(({"loss": f"{train_loss/(iterations*train_dataloader.batch_size)}"}))

        # Computing loss
        # NOTE(review): the sum of per-batch losses is divided by the number
        # of samples here, while test_model divides by the number of batches;
        # confirm which normalization is intended for the criterion in use.
        train_loss /= len(train_dataset)
        # Computing f1 score
        train_f1 = f1_score(true_train, predicted_train, average="macro")

        # Printing information in the end of train loop
        # test_model also returns the accuracy; it is not reported here
        test_loss, test_f1, *_ = test_model(model, criterion, test_dataloader)
        print(f"Epoch {epoch+1} train (loss: {train_loss:.4f}, f1 score: {train_f1:.4f}) test (loss: {test_loss:.4f}, f1 score: {test_f1:.4f})")


def test_model(model: nn.Module, criterion, test_dataloader: DataLoader):
    """
    Function that evaluates model on specified dataloader
    by specified loss function.

    Relies on the module-level name device.

    Parameter
    ---------
    model : nn.Module
      Model to evaluate.
    criterion
      The loss function from pytorch
    test_dataloader: DataLoader
      The dataset for testing model

    Returns
    -------
    float: loss of model on given dataset (averaged over batches)
    float: macro f1 score of model on given dataset
    float: accuracy of model on given dataset
    """

    model.eval()

    # Test loss value, accumulated as a plain float
    test_loss = 0.0

    # Lists for calculation f1 score
    predicted_test = []
    true_test = []

    with torch.no_grad():
        for inputs, outputs in test_dataloader:

            # Forward pass
            predictions = model(inputs.to(device))
            test_loss += criterion(predictions, outputs.to(device)).item()

            # Predicted class = index of the largest score of every sample
            predicted_test.extend(
                predictions.flatten(start_dim=1).argmax(dim=1).tolist()
            )
            if outputs.dim() > 1:
                # One-hot / score encoded targets
                true_test.extend(outputs.flatten(start_dim=1).argmax(dim=1).tolist())
            else:
                # Targets already are class indices; argmax of a scalar
                # would always give 0
                true_test.extend(int(label) for label in outputs.tolist())

    # Computation of test loss (skipped for an empty dataloader to avoid
    # dividing by zero)
    num_batches = len(test_dataloader)
    if num_batches:
        test_loss /= num_batches

    # Computation of f1 score
    test_f1 = f1_score(true_test, predicted_test, average="macro")
    test_accuracy = accuracy_score(true_test, predicted_test)
    return test_loss, test_f1, test_accuracy
