In [2]:
import itertools
import torch
from scipy import ndimage as ndimage
from sklearn.utils import shuffle
import time
import math

In [2]:
# From paper
try:
    from tensorboardX import SummaryWriter
except:
    class SummaryWriter():
        def __init__(self):
            pass
        def add_scalar(self, tag, scalar_value, global_step=None, walltime=None):
            pass

In [8]:
import glob
import numpy
import pickle
from scipy import ndimage as ndimage
from sklearn.model_selection import train_test_split
""""
    Data set citation: 
    Dynamic Hand Gesture Recognition using Skeleton-based Features ,
    Quentin De Smedt, Hazem Wannous and Jean-Philippe Vandeborre, 
    2016 IEEE Conference on Computer Vision and Pattern Recognition Workshops (CVPRW).
    Download from http://www-rech.telecom-lille.fr/DHGdataset/ and unzip into ./dataset_shrec
""""
def resize_gestures(input_gestures, final_length=100):
    output_gestures = numpy.array([numpy.array([ndimage.zoom(x_i.T[j], final_length / len(x_i), mode='reflect') for j in range(numpy.size(x_i, 1))]).T for x_i in input_gestures])
    return output_gestures


def load_gestures(dataset, root, version_x, resize_gesture_to_length):
    if version_x == '3D':
            pattern = root + '/gesture_*/finger_*/subject_*/essai_*/skeletons_world.txt'
    else:
            pattern = root + '/gesture_*/finger_*/subject_*/essai_*/skeletons_image.txt'

    gestures_filenames = sorted(glob.glob(pattern))
    gestures = [numpy.genfromtxt(f) for f in gestures_filenames]
    if resize_gesture_to_length is not None:
        gestures = resize_gestures(gestures, final_length=resize_gesture_to_length)
#     print(filename.split('/') for filename in gestures_filenames)
    labels_14 = [int(filename.split('/')[-5].split('_')[1]) for filename in gestures_filenames]
    labels_28 = [int(filename.split('/')[-4].split('_')[1]) for filename in gestures_filenames]
    labels_28 = [labels_14[idx_gesture] if n_fingers_used == 1 else 14 + labels_14[idx_gesture] for idx_gesture, n_fingers_used in enumerate(labels_28)]
    return gestures, labels_14, labels_28


def write_data(data, filepath):
    with open(filepath, 'wb') as output_file:
        pickle.dump(data, output_file)


# def load_data(filepath):
#     file = open(filepath, 'rb')
#     data = pickle.load(file, encoding='latin1')  #could be'utf8'
#     file.close()
#     return data['x_train'], data['x_test'], data['y_train_14'], data['y_train_28'], data['y_test_14'], data['y_test_28']

In [18]:
gestures, labels_14, labels_28 = load_gestures(dataset='shrec',
                                               root='dataset_shrec',
                                               version_x='3D',
                                               resize_gesture_to_length=100)
# Split the dataset into train and test sets if you want:
x_train, x_test, y_train_14, y_test_14, y_train_28, y_test_28 = train_test_split(gestures, labels_14, labels_28, test_size=0.15)

# print(len(x_train), len(y_train_14), len(y_train_28), len(x_test), len(y_test_14), len(y_test_28) )

# Save the dataset
data = {
    'x_train': x_train,
    'x_test': x_test,
    'y_train_14': y_train_14,
    'y_train_28': y_train_28,
    'y_test_14': y_test_14,
    'y_test_28': y_test_28
}
write_data(data, filepath='dhg_data.pckl')

In [10]:
def load_data(filepath='dhg_data.pckl'):
    file = open(filepath, 'rb')
    data = pickle.load(file, encoding='latin1') 
    file.close()
    return data['x_train'], data['x_test'], data['y_train_14'], data['y_train_28'], data['y_test_14'], data['y_test_28']

In [11]:
def resize_sequences_length(x_train, x_test, final_length=100):
    x_train = numpy.array([numpy.array([ndimage.zoom(x_i.T[j], final_length / len(x_i), mode='reflect') for j in range(numpy.size(x_i, 1))]).T for x_i in x_train])
    x_test  = numpy.array([numpy.array([ndimage.zoom(x_i.T[j], final_length / len(x_i), mode='reflect') for j in range(numpy.size(x_i, 1)) ]).T for x_i in x_test])
    return x_train, x_test

In [12]:
def shuffle_dataset(x_train, x_test, y_train_14, y_train_28, y_test_14, y_test_28):
    x_train, y_train_14, y_train_28 = shuffle(x_train, y_train_14, y_train_28)
    x_test,  y_test_14,  y_test_28  = shuffle(x_test,  y_test_14,  y_test_28)
    return x_train, x_test, y_train_14, y_train_28, y_test_14, y_test_28

In [13]:
def preprocess_data(x_train, x_test, y_train_14, y_train_28, y_test_14, y_test_28):
    x_train, x_test, y_train_14, y_train_28, y_test_14, y_test_28 = shuffle_dataset(x_train, x_test, y_train_14, y_train_28, y_test_14, y_test_28)
    x_train, x_test = resize_sequences_length(x_train, x_test, final_length=100)
    return x_train, x_test, y_train_14, y_train_28, y_test_14, y_test_28

In [14]:
def convert_to_pytorch_tensors(x_train, x_test, y_train_14, y_train_28, y_test_14, y_test_28):
    # as numpy
    y_train_14, y_train_28, y_test_14, y_test_28 = numpy.array(y_train_14), numpy.array(y_train_28), numpy.array(y_test_14), numpy.array(y_test_28)
    
    # -- REQUIRED by the pytorch loss function implementation --
    # Remove 1 to all classes items (1-14 => 0-13 and 1-28 => 0-27)
    y_train_14, y_train_28, y_test_14, y_test_28 = y_train_14 - 1, y_train_28 - 1, y_test_14 - 1, y_test_28 - 1
    
    # as torch
    x_train, x_test = torch.from_numpy(x_train), torch.from_numpy(x_test)
    y_train_14, y_train_28, y_test_14, y_test_28 = torch.from_numpy(y_train_14), torch.from_numpy(y_train_28), torch.from_numpy(y_test_14), torch.from_numpy(y_test_28)

    # -- REQUIRED by the pytorch loss function implementation --
    # correct the data type (for the loss function used)
    x_train, x_test = x_train.type(torch.FloatTensor), x_test.type(torch.FloatTensor)
    y_train_14, y_train_28, y_test_14, y_test_28 = y_train_14.type(torch.LongTensor), y_train_28.type(torch.LongTensor), y_test_14.type(torch.LongTensor), y_test_28.type(torch.LongTensor)
    
    return x_train, x_test, y_train_14, y_train_28, y_test_14, y_test_28

In [15]:
# -------------
# Misc.
# -------------
def batch(tensor, batch_size=32):
    """Return a list of (mini) batches"""
    tensor_list = []
    length = tensor.shape[0]
    i = 0
    while True:
        if (i + 1) * batch_size >= length:
            tensor_list.append(tensor[i * batch_size: length])
            return tensor_list
        tensor_list.append(tensor[i * batch_size: (i + 1) * batch_size])
        i += 1


def time_since(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '{:02d}m {:02d}s'.format(int(m), int(s))


def get_accuracy(model, x, y_ref):
    """Get the accuracy of the pytorch model on a batch"""
    acc = 0.
    model.eval()
    with torch.no_grad():
        predicted = model(x)
        _, predicted = predicted.max(dim=1)
        acc = 1.0 * (predicted == y_ref).sum().item() / y_ref.shape[0]

    return acc

In [4]:
class HandGestureNet(torch.nn.Module):
    """
    citation:
    ------------
        @inproceedings{devineau2018deep,
            title={Deep learning for hand gesture recognition on skeletal data},
            author={Devineau, Guillaume and Moutarde, Fabien and Xi, Wang and Yang, Jie},
            booktitle={2018 13th IEEE International Conference on Automatic Face \& Gesture Recognition (FG 2018)},
            pages={106--113},
            year={2018},
            organization={IEEE}
        }
    """
    
    def __init__(self, n_channels=66, n_classes=14, dropout_probability=0.2):

        super(HandGestureNet, self).__init__()
        
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.dropout_probability = dropout_probability

        # Layers ----------------------------------------------
        self.all_conv_high = torch.nn.ModuleList([torch.nn.Sequential(
            torch.nn.Conv1d(in_channels=1, out_channels=8, kernel_size=7, padding=3),
            torch.nn.ReLU(),
            torch.nn.AvgPool1d(2),

            torch.nn.Conv1d(in_channels=8, out_channels=4, kernel_size=7, padding=3),
            torch.nn.ReLU(),
            torch.nn.AvgPool1d(2),

            torch.nn.Conv1d(in_channels=4, out_channels=4, kernel_size=7, padding=3),
            torch.nn.ReLU(),
            torch.nn.Dropout(p=self.dropout_probability),
            torch.nn.AvgPool1d(2)
        ) for joint in range(n_channels)])

        self.all_conv_low = torch.nn.ModuleList([torch.nn.Sequential(
            torch.nn.Conv1d(in_channels=1, out_channels=8, kernel_size=3, padding=1),
            torch.nn.ReLU(),
            torch.nn.AvgPool1d(2),

            torch.nn.Conv1d(in_channels=8, out_channels=4, kernel_size=3, padding=1),
            torch.nn.ReLU(),
            torch.nn.AvgPool1d(2),

            torch.nn.Conv1d(in_channels=4, out_channels=4, kernel_size=3, padding=1),
            torch.nn.ReLU(),
            torch.nn.Dropout(p=self.dropout_probability),
            torch.nn.AvgPool1d(2)
        ) for joint in range(n_channels)])

        self.all_residual = torch.nn.ModuleList([torch.nn.Sequential(
            torch.nn.AvgPool1d(2),
            torch.nn.AvgPool1d(2),
            torch.nn.AvgPool1d(2)
        ) for joint in range(n_channels)])

        self.fc = torch.nn.Sequential(
            torch.nn.Linear(in_features=9 * n_channels * 12, out_features=1936),  # <-- 12: depends of the sequences lengths (cf. below)
            torch.nn.ReLU(),
            torch.nn.Linear(in_features=1936, out_features=n_classes)
        )

        # Initialization --------------------------------------
        # Xavier init
        for module in itertools.chain(self.all_conv_high, self.all_conv_low, self.all_residual):
            for layer in module:
                if layer.__class__.__name__ == "Conv1d":
                    torch.nn.init.xavier_uniform_(layer.weight, gain=torch.nn.init.calculate_gain('relu'))
                    torch.nn.init.constant_(layer.bias, 0.1)

        for layer in self.fc:
            if layer.__class__.__name__ == "Linear":
                torch.nn.init.xavier_uniform_(layer.weight, gain=torch.nn.init.calculate_gain('relu'))
                torch.nn.init.constant_(layer.bias, 0.1)

    def forward(self, input):

        # Work on each channel separately
        all_features = []

        for channel in range(0, self.n_channels):
            input_channel = input[:, :, channel]

            # Add a dummy (spatial) dimension for the time convolutions
            # Conv1D format : (batch_size, n_feature_maps, duration)
            input_channel = input_channel.unsqueeze(1)

            high = self.all_conv_high[channel](input_channel)
            low = self.all_conv_low[channel](input_channel)
            ap_residual = self.all_residual[channel](input_channel)

            # Time convolutions are concatenated along the feature maps axis
            output_channel = torch.cat([
                high,
                low,
                ap_residual
            ], dim=1)
            all_features.append(output_channel)

        # Concatenate along the feature maps axis
        all_features = torch.cat(all_features, dim=1)
        
        # Flatten for the Linear layers
        all_features = all_features.view(-1, 9 * self.n_channels * 12)  # <-- 12: depends of the initial sequence length (100).
        # If you have shorter/longer sequences, you probably do NOT even need to modify the modify the network architecture:
        # resampling your input gesture from T timesteps to 100 timesteps will (surprisingly) probably actually work as well!

        # Fully-Connected Layers
        output = self.fc(all_features)

        return output

In [25]:
# Load the dataset
x_train, x_test, y_train_14, y_train_28, y_test_14, y_test_28 = load_data()

# Shuffle sequences and resize sequences
x_train, x_test, y_train_14, y_train_28, y_test_14, y_test_28 = preprocess_data(x_train, x_test, y_train_14, y_train_28, y_test_14, y_test_28)

# Convert to pytorch variables
x_train, x_test, y_train_14, y_train_28, y_test_14, y_test_28 = convert_to_pytorch_tensors(x_train, x_test, y_train_14, y_train_28, y_test_14, y_test_28)

In [26]:
model = HandGestureNet(n_channels=66, n_classes=14)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [28]:
def train(model, criterion, optimizer,
          x_train, y_train, x_test, y_test,
          force_cpu=False, num_epochs=5):
    
    # from the paper
    if torch.cuda.is_available() and not force_cpu:
        device = torch.device("cuda")
    elif torch.has_mps:
        device = torch.device('mps')
    else: 
        device = torch.device("cpu")
        
    model = model.to(device)
    x_train, y_train, x_test, y_test = x_train.to(device), y_train.to(device), x_test.to(device), y_test.to(device)
    
    # (bonus) log accuracy values to visualize them in tensorboard:
    writer = SummaryWriter()
    
    # Prepare all mini-batches
    x_train_batches = batch(x_train)
    y_train_batches = batch(y_train)
    
    # Training starting time
    start = time.time()

    print('[INFO] Started to train the model.')
    print('Training the model on {}.'.format('GPU' if (device == torch.device('cuda') or device == torch.device('mps')) else 'CPU'))
    
    for ep in range(num_epochs):

        # Ensure we're still in training mode
        model.train()

        current_loss = 0.0

        for idx_batch, train_batches in enumerate(zip(x_train_batches, y_train_batches)):

            # get a mini-batch of sequences
            x_train_batch, y_train_batch = train_batches

            # zero the gradient parameters
            optimizer.zero_grad()

            # forward
            outputs = model(x_train_batch)

            # backward + optimize
            # backward
            loss = criterion(outputs, y_train_batch)
            loss.backward()
            # optimize
            optimizer.step()
            # for an easy access
            current_loss += loss.item()
        
        train_acc = get_accuracy(model, x_train, y_train)
        test_acc = get_accuracy(model, x_test, y_test)
        
        writer.add_scalar('data/accuracy_train', train_acc, ep)
        writer.add_scalar('data/accuracy_test', test_acc, ep)
        print('Epoch #{:03d} | Time elapsed : {} | Loss : {:.4e} | Accuracy_train : {:.4e} | Accuracy_test : {:.4e}'.format(
                ep + 1, time_since(start), current_loss, train_acc, test_acc))

    print('[INFO] Finished training the model. Total time : {}.'.format(time_since(start)))

In [29]:
# Please adjust the training epochs count, and the other hyperparams (lr, dropout, ...), for a non-overfitted training according to your own needs.
# tip: use tensorboard to display the accuracy (see cells above for tensorboard usage)

num_epochs = 20

train(model=model, criterion=criterion, optimizer=optimizer,
      x_train=x_train, y_train=y_train_14, x_test=x_test, y_test=y_test_14,
      num_epochs=num_epochs)

[INFO] Started to train the model.
Training the model on CPU.
Epoch #001 | Time elapsed : 02m 35s | Loss : 3.3115e+02 | Accuracy_train : 3.2689e-01 | Accuracy_test : 3.1667e-01
Epoch #002 | Time elapsed : 05m 09s | Loss : 1.2053e+02 | Accuracy_train : 5.1639e-01 | Accuracy_test : 4.4524e-01
Epoch #003 | Time elapsed : 07m 54s | Loss : 8.6311e+01 | Accuracy_train : 7.2605e-01 | Accuracy_test : 6.4762e-01
Epoch #004 | Time elapsed : 10m 48s | Loss : 6.3097e+01 | Accuracy_train : 7.9328e-01 | Accuracy_test : 7.1667e-01
Epoch #005 | Time elapsed : 13m 30s | Loss : 5.0130e+01 | Accuracy_train : 8.3025e-01 | Accuracy_test : 7.4048e-01
Epoch #006 | Time elapsed : 16m 35s | Loss : 4.3560e+01 | Accuracy_train : 8.5042e-01 | Accuracy_test : 7.8333e-01
Epoch #007 | Time elapsed : 19m 18s | Loss : 3.7319e+01 | Accuracy_train : 8.7437e-01 | Accuracy_test : 8.1667e-01
Epoch #008 | Time elapsed : 22m 09s | Loss : 3.2202e+01 | Accuracy_train : 9.0420e-01 | Accuracy_test : 8.4524e-01
Epoch #009 | Time 

In [30]:
torch.save(model.state_dict(), 'gesture_pretrained_model.pt')

In [5]:
model = HandGestureNet(n_channels=66, n_classes=14)
model.load_state_dict(torch.load('gesture_pretrained_model.pt'))
# model.eval()


HandGestureNet(
  (all_conv_high): ModuleList(
    (0-65): 66 x Sequential(
      (0): Conv1d(1, 8, kernel_size=(7,), stride=(1,), padding=(3,))
      (1): ReLU()
      (2): AvgPool1d(kernel_size=(2,), stride=(2,), padding=(0,))
      (3): Conv1d(8, 4, kernel_size=(7,), stride=(1,), padding=(3,))
      (4): ReLU()
      (5): AvgPool1d(kernel_size=(2,), stride=(2,), padding=(0,))
      (6): Conv1d(4, 4, kernel_size=(7,), stride=(1,), padding=(3,))
      (7): ReLU()
      (8): Dropout(p=0.2, inplace=False)
      (9): AvgPool1d(kernel_size=(2,), stride=(2,), padding=(0,))
    )
  )
  (all_conv_low): ModuleList(
    (0-65): 66 x Sequential(
      (0): Conv1d(1, 8, kernel_size=(3,), stride=(1,), padding=(1,))
      (1): ReLU()
      (2): AvgPool1d(kernel_size=(2,), stride=(2,), padding=(0,))
      (3): Conv1d(8, 4, kernel_size=(3,), stride=(1,), padding=(1,))
      (4): ReLU()
      (5): AvgPool1d(kernel_size=(2,), stride=(2,), padding=(0,))
      (6): Conv1d(4, 4, kernel_size=(3,), stride=

In [6]:
import keras.utils as image
import numpy as np
from tensorflow.keras.applications.resnet50 import preprocess_input
# make predictions
# with torch.no_grad():
# #     demo_gesture_batch = torch.randn(32, 100, 66)
# #     img = image.load_img('test1.jpg', target_size=(224,224))
# #     image_array = image.img_to_array(img)
# #     img_batch = np.expand_dims(image_array, axis=0)
# #     img_preprocessed = preprocess_input(img_batch)
# #     predictions = model(32,100,66)
# #     _, predictions = predictions.max(dim=1)
#     print("Predicted gesture classes: {}".format(predictions))

dict = {
    0: "Grab",
    1: "Tap",
    2:"Expand",
    3:"Pinch",
    4:"Rotation Clockwis",
    5:"Rotation Counter Clockwise",
    6:"Swipe Right",
    7:"Swipe Left",
    8:"Swipe Up",
    9:"Swipe Down",
    10:"Swipe X",
    11:"Swipe +",
    12:"Swipe V",
    13:"Shake"
}
with torch.no_grad():
    demo_gesture_batch = torch.Tensor(32, 100, 66)
#     print(demo_gesture_batch)
    predictions = model(demo_gesture_batch)
    _, predictions = predictions.max(dim=1)
    l = predictions.tolist()
    for k in l:
        print("detected :", dict[k])

detected : Grab
detected : Grab
detected : Grab
detected : Grab
detected : Grab
detected : Rotation Clockwis
detected : Rotation Clockwis
detected : Rotation Clockwis
detected : Rotation Clockwis
detected : Rotation Clockwis
detected : Rotation Clockwis
detected : Rotation Clockwis
detected : Rotation Clockwis
detected : Rotation Clockwis
detected : Rotation Clockwis
detected : Rotation Clockwis
detected : Rotation Clockwis
detected : Rotation Clockwis
detected : Rotation Clockwis
detected : Rotation Clockwis
detected : Rotation Clockwis
detected : Rotation Clockwis
detected : Rotation Clockwis
detected : Rotation Clockwis
detected : Rotation Clockwis
detected : Rotation Clockwis
detected : Rotation Clockwis
detected : Rotation Clockwis
detected : Rotation Clockwis
detected : Rotation Clockwis
detected : Rotation Clockwis
detected : Rotation Clockwis


In [7]:
# pip install 'pillow<7.0.0'

Note: you may need to restart the kernel to use updated packages.


The system cannot find the file specified.


In [28]:
from PIL import Image
import torchvision.transforms as transforms

# Read the image
image = Image.open('opencv_frame_0.png')

# Define a transform to convert the image to tensor
transform = transforms.ToTensor()

# Convert the image to PyTorch tensor
tensor = transform(image)
predictions = model(tensor)
_, predictions = predictions.max(dim=1)
ll = predictions.tolist()
for k in ll:
    print("detected :", dict[k])

print( "detected as:", dict[max(set(ll), key=ll.count)])
# print the converted image tensor
# print(tensor)

detected : Tap
detected : Rotation Counter Clockwise
detected : Shake
detected : Tap
detected : Shake
detected : Tap
detected : Rotation Counter Clockwise
detected : Shake
detected : Tap
detected : Shake
detected : Tap
detected : Rotation Counter Clockwise
detected : Shake
detected : Tap
detected : Shake
detected as: Tap
