# **Image Processing**

In [41]:
import cv2
import numpy as np

In [42]:
def capture_frames(video_path):
    video = cv2.VideoCapture(video_path)
    video_length = int(video.get(cv2.CAP_PROP_FRAME_COUNT)) - 1

    count = 0
    frames = []
    while video.isOpened():
        ret, frame = video.read()
        if not ret:
            continue
        frames.append(frame)
        count += 1
        if count > (video_length - 1):
            video.release()
    video.release()
    return np.array(frame)

In [43]:
def get_frames(video_path, num_frames=15, dim=(512, 512)):
    frames = capture_frames(video_path)
    video_length = len(frames)
    steps = video_length/num_frames
    count = 0
    new_frames = []
    while count < video_length:
        frame = frames[int(count)]
        frame = cv2.resize(frame, dim)
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        new_frames.append(frame)
        count += steps
    
    return np.array(new_frames[:num_frames])


# **Datasets**

In [44]:
import torch
from torch.utils.data import Dataset
import pandas as pd

# from image_processing import get_frames

In [65]:
class SignsFrames(Dataset):
    def __init__(self, 
                data_path,
                label_path,
                video_type="color",
                n_frames=16,
                img_size=(512, 512),
                n_channels=3,
                n_classes=226,
                transform=None,
                target_transform=None):
        super(SignsFrames, self).__init__()
        self.n_frames = n_frames
        self.img_size = img_size
        self.n_channels = n_channels
        self.transform = transform
        self.target_transform = target_transform
        labels_file = pd.read_csv(label_path, names=["signerX_sampleY", "sample_id"])
        self.labels = labels_file.iloc[:, 1]
        self.samples = data_path + labels_file.iloc[:, 0] + "_" + video_type + ".mp4"

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        np_frames = get_frames(self.samples[idx], num_frames=self.n_frames, dim=self.img_size)
        frames = torch.empty((self.n_frames, self.n_channels, *self.img_size))
        if self.transform:
            for i in range(self.n_frames):
                frames[i] = self.transform(np_frames[i])
        if self.target_transform:
            label = self.target_transform(self.labels[idx])

        (f, c, h, w) = frames.size()
        frames = frames.view(c, f, h, w)
        
        return frames, label


# **Models**

## ***Common***

In [76]:
import torch
import pickle
from sklearn.metrics import accuracy_score

In [61]:
def train_epoch(model, dataloader, loss_fn, optimizer, epoch, device, log_interval):
    model = model.to(device)
    model.train()
    size = len(dataloader.dataset)
    losses = []
    all_label = []
    all_pred = []

    for batch_idx, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        optimizer.zero_grad()
        pred = model(X)
        if isinstance(pred, list):
            pred = pred[0]

        loss = loss_fn(pred, y.squeeze())
        losses.append(loss.item())

        prediction = torch.max(pred, 1)[1]

        loss.backward()
        optimizer.step()

        if (batch_idx + 1) % log_interval == 0:
            loss, current = loss.item(), batch_idx * len(X)
            print(f"loss: {loss:>7f}  |  [{current:5d}/{size:>5d}]")

        training_loss = sum(losses) / len(losses)
        print("Average Training Loss of Epoch {}: {:.6f}".format(epoch + 1, training_loss))

In [79]:
def val_epoch(model, criterion, dataloader, device, epoch, phase='Train', exp_name = None):
    model.eval()
    losses = []
    all_label = []
    all_pred = []
    score_frag = []
    with torch.no_grad():
        for batch_idx, data in enumerate(dataloader):
            # get the inputs and labels
            inputs_clips, labels = data['data'].to(device), data['label'].to(device)
            # forward
            outputs_clips = []
            for i_clip in range(inputs_clips.size(1)):
                inputs = inputs_clips[:,i_clip,:,:]
                outputs_clips.append(model(inputs))
                # if isinstance(outputs, list):
                #     outputs = outputs[0]
            outputs = torch.mean(torch.stack(outputs_clips, dim=0), dim=0)
            if phase == 'Test':
                score_frag.append(outputs.data.cpu().numpy())
            # compute the loss
            loss = criterion(outputs, labels.squeeze())
            losses.append(loss.item())
            # collect labels & prediction
            prediction = torch.max(outputs, 1)[1]
            all_label.extend(labels.squeeze())
            all_pred.extend(prediction)
            if phase == 'Test':
                score = np.concatenate(score_frag)

        # Compute the average loss & accuracy
        validation_loss = sum(losses)/len(losses)
        all_label = torch.stack(all_label, dim=0)
        all_pred = torch.stack(all_pred, dim=0)
        validation_acc = accuracy_score(all_label.squeeze().cpu().data.squeeze().numpy(), all_pred.cpu().data.squeeze().numpy())

    if phase == 'Test':
        with open('./results/{}/results_epoch{:03d}_{}.pkl'.format(exp_name, epoch+1, validation_acc), 'wb') as f:
            score_dict = dict(zip(dataloader.dataset.sample_names, score))
            pickle.dump(score_dict, f)
    return validation_loss

## ***Models***

#### **1. Conv3D**

In [48]:
import sys
import os
import inspect

import torch
import torch.nn as nn
import torchvision

In [49]:
current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
sys.path.insert(0, current_dir)

In [50]:
def convert_relu_to_swish(model: nn.Module):
    for child_name, child in model.named_children():
        if isinstance(child, nn.ReLU):
            setattr(model, child_name, nn.SiLU(True))
        else:
            convert_relu_to_swish(child)


In [51]:
class Swish(nn.Module):
    def __init(self):
        super().__init__()

    def forward(self, x):
        return x.mult_(torch.sigmoid(x))

In [52]:
class r2plus1d_18(nn.Module):
    def __init__(self, pretrained=True, n_classes=226, dropout_p=0.5):
        super(r2plus1d_18, self).__init__()
        self.pretrained = pretrained
        self.n_classes = n_classes

        model = torchvision.models.video.r2plus1d_18(pretrained=self.pretrained)
        modules = list(model.children())[:-1]
        self.r2plus1d_18 = nn.Sequential(*modules)
        convert_relu_to_swish(self.r2plus1d_18)
        self.fc1 = nn.Linear(model.fc.in_features, self.n_classes)
        self.dropout = nn.Dropout(dropout_p, inplace=True)

    def forward(self, x):
        (b, f, c, h, w) = x.size()
        x = x.view(b, c, f, h, w)

        out = self.r2plus1d_18(x)
        out = out.flatten(1)
        out = self.dropout(out)
        out = self.fc1(out)

        return out

# Main

In [66]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor, Lambda
from collections import OrderedDict

# from dataset import SignsFrames
# from models.Conv3D import r2plus1d_18
# from models.common import train_epoch

In [67]:
def get_lr(optimizer):
    for param_group in optimizer.param_groups:
        return param_group["lr"]

In [68]:
exp_name = "rgb_final"
model_path = f"checkpoint/{exp_name}"
if not os.path.exists(model_path):
    os.mkdir(model_path)
if not os.path.exists(os.path.join("results", exp_name)):
    os.mkdir(os.path.join("results", exp_name))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device.type

'cuda'

In [69]:
x = torch.zeros((3, 3)).to(device)
x.device

device(type='cuda', index=0)

In [70]:
data_path = "../Datasets/AUTSL/val/"
label_path = "../Datasets/AUTSL/labels/val_labels.csv"
n_classes = 226
epochs = 2
batch_size = 2
learning_rate = 1e-3
weight_decay = 1e-4
log_interval = 80
sample_size = 128
drop_p = 0.0
hidden1, hidden2 = 512, 256

params = {
    "data_path": data_path,
    "label_path": label_path,
    "video_type": "color",
    "n_frames": 10,
    "img_size": (128, 128),
    "n_channels": 3,
    "n_classes": n_classes,
    "transform": ToTensor(),
    "target_transform": Lambda(lambda y: torch.zeros(n_classes, dtype=torch.float).scatter(0, torch.tensor(y), value=1))
}

In [72]:
autsl_dt = SignsFrames(**params)
frames, label = autsl_dt[0]
print(type(frames))
print(frames.size())
train_dl = DataLoader(autsl_dt, batch_size=batch_size, shuffle=True)

<class 'torch.Tensor'>
torch.Size([3, 10, 128, 128])


In [80]:
model = torchvision.models.video.r2plus1d_18(pretrained=True, progress=True)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

for epoch in range(epochs):
    torch.cuda.empty_cache()
    val_epoch(model, loss_fn, train_dl, device, epoch)


TypeError: list indices must be integers or slices, not str

In [34]:
model = r2plus1d_18(pretrained=True, n_classes=500)
checkpoint = torch.load("pretrained/slr_resnet2d+1.pth")
new_state_dict = OrderedDict()
for k, v in checkpoint.items():
    name = k[7:]
    new_state_dict[name] = v
model.load_state_dict(new_state_dict)
model.fc1 = nn.Linear(model.fc1.in_features, n_classes)

model = model.to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=0.1, patience=10)

In [None]:
for epoch in range(epochs):
    print("lr: ", get_lr(optimizer))
    train_epoch(model, train_dl, loss_fn, optimizer, epoch, device, log_interval)

    torch.save(model.state_dict(), os.path.join(model_path, f"sign_resnet2d+1_epoch{epoch + 1:03d}.pth"))
    

In [40]:
import torch
print(torch.cuda.memory_summary(device=device, abbreviated=False))

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  250395 KB |  376175 KB |  500686 KB |  250290 KB |
|       from large pool |  239396 KB |  360080 KB |  478687 KB |  239291 KB |
|       from small pool |   10999 KB |   16644 KB |   21999 KB |   10999 KB |
|---------------------------------------------------------------------------|
| Active memory         |  250395 KB |  376175 KB |  500686 KB |  250290 KB |
|       from large pool |  239396 KB |  360080 KB |  478687 KB |  239291 KB |
|       from small pool |   10999 KB |   16644 KB |   21999 KB |   10999 KB |
|---------------------------------------------------------------

In [38]:
torch.cuda.empty_cache()