# Task 3: Understand human gesture and body language based on your own built dataset and model

## 1. Do literature search on dataset building and other deep learning based models applied on gesture recognition. Comment on their applications and benefits.

## 2. In the earlier two tasks, you have learned how to do the gesture classification task using the given dataset. Now, you need to collect data by yourself and build your own dataset. The dataset is not limited to gestures. Postures and behavior are encouraged. Please place your data referring to the format of the given dataset. For good performance, the number of data in each class is recommended over 50. For the number of classes, it is better to have more than 3.

## 3. Design your own neural network architecture. Fully connected or convolutional layers used in the first two tasks is acceptable. But you are encouraged to learn more deep learning models and achieve it as possible as you can.

## 3. Write down the problems you encountered during the experiment, the solutions, and your experiences.

In [108]:
%matplotlib inline
import cv2
import numpy as np
import os
import itertools
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.utils.data as utils_data
from torchvision import datasets, transforms
from PIL import Image

In [109]:
path = './own/images'
path_processed = './own_processed/images'


# -------------------images processing--------------
for mainDir, subDir, fileList in os.walk(path):
    for file in fileList:
        currentPath = os.path.join(mainDir, file)
        if file != '.DS_Store':
            original = cv2.imread(filename=currentPath)
            processedImage = cv2.resize(original, (96,96))

            new_mainDir = path_processed + mainDir.split(path)[-1]
            if not os.path.exists(new_mainDir):
                os.makedirs(new_mainDir)
            cv2.imwrite(os.path.join(new_mainDir, file), processedImage)

# -----------------label generation----------------
label_path = './own_processed/labels'
if not os.path.exists(label_path):
    os.makedirs(label_path)

files = os.listdir(path)
index = 0
for i, file in enumerate(files):
    if file != '.DS_Store':
        subclass_label_path = os.path.join(label_path, file + '.txt')
        with open(subclass_label_path, 'w') as f:
            f.write('#label\n')
        for _ in range(len(os.listdir(os.path.join(path_processed, file)))):
            with open(subclass_label_path, 'a') as f:
                f.write('{:d}\n'.format(index))
        index = index + 1
    f.close()

In [110]:
Image = []
path_images = './own_processed/images'
for mainDir, subDir, fileList in os.walk(path_images):
    for file in fileList:
        currentPath = os.path.join(mainDir, file)
        Image.append(cv2.resize(cv2.imread(currentPath), (96, 96)))
Image = np.array(Image)
Image = np.transpose(Image, (0, 3, 1, 2))
dataset_size, C, H, W = Image.shape
# for FCNN model, the image need to be stretched into one dimension: (b, h, w)->(b, h*w)
print (Image.shape)


Label = []
path_labels = './own_processed/labels'
for file in os.listdir(path_labels):
    Label.append(np.loadtxt(os.path.join(path_labels, file)))
Label = np.array(list(itertools.chain.from_iterable(Label)))
num_classes = int(np.max(Label))+1
print(num_classes)

(206, 3, 96, 96)
4


In [111]:
device = torch.device("mps")

In [112]:
dataset = utils_data.TensorDataset(torch.Tensor(Image), torch.LongTensor(Label))
split_ratio = 0.8
train_size = int(split_ratio * dataset_size)
test_size = dataset_size - train_size
train_set, test_set = utils_data.random_split(dataset, [train_size, test_size])
train_loader = utils_data.DataLoader(dataset=train_set, batch_size=32, shuffle=True)
test_loader = utils_data.DataLoader(dataset=test_set, batch_size=32, shuffle=True)
print('Data is ready!')

Data is ready!


In [113]:
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(64)  # Batch Normalization
        self.relu1 = nn.ReLU()
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.dropout1 = nn.Dropout(0)  # Dropout
        
        self.conv2 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(64)  # Batch Normalization
        self.relu2 = nn.ReLU()
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.dropout2 = nn.Dropout(0)  # Dropout
        
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1)
        self.bn3 = nn.BatchNorm2d(128)  # Batch Normalization
        self.relu3 = nn.ReLU()
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.dropout3 = nn.Dropout(0)  # Dropout
        
        self.conv4 = nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1)
        self.bn4 = nn.BatchNorm2d(128)  # Batch Normalization
        self.relu4 = nn.ReLU()
        #self.pool4 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.dropout4 = nn.Dropout(0)  # Dropout

        self.fc = nn.Linear(128 * 12 * 12, 512)  # Adjust based on final spatial dimensions
        self.relu5 = nn.ReLU()
        self.dropout5 = nn.Dropout(0)  # Dropout
        
        self.fc2 = nn.Linear(512, 256)  # Adjust based on final spatial dimensions
        self.relu6 = nn.ReLU()
        self.dropout6 = nn.Dropout(0)  # Dropout
        
    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu1(x)
        x = self.pool1(x)
        x = self.dropout1(x)

        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu2(x)
        x = self.pool2(x)
        x = self.dropout2(x)

        x = self.conv3(x)
        x = self.bn3(x)
        x = self.relu3(x)
        x = self.pool3(x)
        x = self.dropout3(x)
        
        x = self.conv4(x)
        x = self.bn4(x)
        x = self.relu4(x)
        #x = self.pool3(x)
        x = self.dropout4(x)

        x = x.view(x.size(0), -1)
        x = self.fc(x)
        x = self.relu5(x)
        x = self.dropout5(x)
        
        x = self.fc2(x)
        x = self.relu5(x)
        x = self.dropout6(x)
        return x

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

class CNNRNN(nn.Module):
    def __init__(self, cnn, rnn):
        super(CNNRNN, self).__init__()
        self.cnn = cnn
        self.rnn = rnn

    def forward(self, x):
        features = []
        for frame in x:
            frame = frame.unsqueeze(0)  # Add batch dimension
            features.append(self.cnn(frame))
        features = torch.stack(features, dim=0)  # Stack features along time axis
        out = self.rnn(features)
        return out


In [114]:
cnn = CNN()
rnn = RNN(input_size=256, hidden_size=256, num_layers=2, num_classes=4)
model = CNNRNN(cnn, rnn).to(device)
# if torch.cuda.is_available():
#     model = model.cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.0001)
loss_func = nn.CrossEntropyLoss()

In [115]:
best_accuracy = 0
test1 = np.zeros(500)
loss1 = np.zeros(500)
for epoch in range(500):
    running_loss = 0.0
    train_acc = 0.0
    for step, (batch_image, batch_label) in enumerate(train_loader):
        model.train()
        #device = torch.device("mps")
        batch_image, batch_label = batch_image.to(device), batch_label.to(device)
        #batch_output = batch_output.to(device)
        batch_output = model(batch_image)
        batch_loss = loss_func(batch_output, batch_label)

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

        # train accuracy
        _, train_predicted = torch.max(batch_output.data, 1)
        train_acc += (train_predicted == batch_label).sum().item()

    train_acc /= train_size
    running_loss /= (step+1)

    # ----------test----------
    model.eval()
    test_acc = 0.0
    for test_image, test_label in test_loader:
        #device = torch.device("mps")
        test_image, test_label = test_image.to(device), test_label.to(device)
        test_output = model(test_image)
        _, predicted = torch.max(test_output.data, 1)
        test_acc += (predicted == test_label).sum().item()
    test_acc /= test_size

    print('epoch={:d}\ttrain loss={:.6f}\ttrain accuracy={:.3f}\ttest accuracy={:.3f}'.format(
        epoch, running_loss, train_acc, test_acc))
    test1[epoch] = test_acc
    loss1[epoch] = running_loss

    if test_acc >= best_accuracy:
        torch.save(model.state_dict(), './RNN_model.pkl')
        best_accuracy = test_acc
plt.figure(figsize=(6, 5))
plt.plot(test1,'red')
plt.figure(figsize=(6, 5))
plt.plot(loss1,'red')

epoch=0	train loss=1.364523	train accuracy=0.299	test accuracy=0.500
epoch=1	train loss=1.028064	train accuracy=0.616	test accuracy=0.548
epoch=2	train loss=0.791059	train accuracy=0.689	test accuracy=0.857
epoch=3	train loss=0.582236	train accuracy=0.841	test accuracy=0.762
epoch=4	train loss=0.443226	train accuracy=0.835	test accuracy=0.881
epoch=5	train loss=0.316845	train accuracy=0.896	test accuracy=0.881
epoch=6	train loss=0.310204	train accuracy=0.890	test accuracy=0.905
epoch=7	train loss=0.312252	train accuracy=0.878	test accuracy=0.905
epoch=8	train loss=0.279663	train accuracy=0.902	test accuracy=0.762
epoch=9	train loss=0.245221	train accuracy=0.902	test accuracy=0.905
epoch=10	train loss=0.233355	train accuracy=0.921	test accuracy=0.857
epoch=11	train loss=0.241722	train accuracy=0.915	test accuracy=0.619
epoch=12	train loss=0.286038	train accuracy=0.890	test accuracy=0.857
epoch=13	train loss=0.243379	train accuracy=0.921	test accuracy=0.905
epoch=14	train loss=0.223471	t


KeyboardInterrupt

