In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import math
import os

In [18]:
device = 'cpu'

In [37]:
class PositionalEncoding(nn.Module):
    def __init__(self, dim_model, dropout_p, max_len):
        super().__init__()
        # Modified version from: https://pytorch.org/tutorials/beginner/transformer_tutorial.html
        # max_len determines how far the position can have an effect on a token (window)
        
        # Info
        self.dropout = nn.Dropout(dropout_p)
        
        # Encoding - From formula
        pos_encoding = torch.zeros(max_len, dim_model)
        positions_list = torch.arange(0, max_len, dtype=torch.float).view(-1, 1) # 0, 1, 2, 3, 4, 5
        division_term = torch.exp(torch.arange(0, dim_model, 2).float() * (-math.log(10000.0)) / dim_model) # 1000^(2i/dim_model)
        
        # PE(pos, 2i) = sin(pos/1000^(2i/dim_model))
        pos_encoding[:, 0::2] = torch.sin(positions_list * division_term)
        
        # PE(pos, 2i + 1) = cos(pos/1000^(2i/dim_model))
        pos_encoding[:, 1::2] = torch.cos(positions_list * division_term)
        
        # Saving buffer (same as parameter without gradients needed)
        pos_encoding = pos_encoding.unsqueeze(0).transpose(0, 1)
        self.register_buffer("pos_encoding",pos_encoding)
        
    def forward(self, token_embedding: torch.tensor) -> torch.tensor:
        # Residual connection + pos encoding
        return self.dropout(token_embedding + self.pos_encoding[:token_embedding.size(0), :])

In [49]:
class Transformer(nn.Module):
    """
    Model from "A detailed guide to Pytorch's nn.Transformer() module.", by
    Daniel Melchor: https://medium.com/p/c80afbc9ffb1/
    """
    # Constructor
    def __init__(
        self,
        num_tokens=10, # number of ASL signs
        dim_model=74, # number of features in pose vector
        num_heads=2,
        num_encoder_layers=6,
        num_decoder_layers=6,
        dim_feedforward=2048,
        dropout_p=0.1 # default dropout
    ):
        super().__init__()

        # INFO
        self.model_type = "Transformer"
        self.dim_model = dim_model

        # LAYERS
        self.positional_encoder = PositionalEncoding(
            dim_model=dim_model, dropout_p=dropout_p, max_len=5000
        )
        self.transformer = nn.Transformer(
            d_model=dim_model, # plus one for positional encoding
            nhead=num_heads,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward
        )
        self.out = nn.Linear(dim_model, num_tokens)

    def forward(self, src, tgt):
        # Src size must be (batch_size, src sequence length)
        # Tgt size must be (batch_size, tgt sequence length, dim_model)
        src = self.positional_encoder(src)

        # we permute to obtain size (sequence length, batch_size, dim_model),b
        src = src.permute(1, 0, 2)
        #tgt = tgt.permute(1, 0, 2) # should be size (1, batch_size, dim_model)

        # Transformer blocks - Out size = (sequence length, batch_size, num_tokens)
        transformer_out = self.transformer(src, tgt)
        out = self.out(transformer_out)

        return out

In [66]:
def read_feature_file(file):
    df = pd.read_csv(file)
    max_frame = max(df['frame_number'])
    frames = []
    for i in range(max_frame):
        df_coord = df[df['frame_number'] == i].drop(['frame_number', 'landmark'], axis=1)
        if len(df_coord) == 0:
            continue
        frames.append(sum(df_coord.values.tolist(), [])[:-1])
    return frames

def transform(features):
    return torch.tensor(features)

def target_transform(label):
    one_hot = [0 for i in range(10)]
    one_hot[label] = 1
    return torch.tensor(one_hot, dtype = torch.float64)

In [9]:
import json
import os

class VideoDataset(Dataset):
    def __init__(self, annotations_file, features_dir, transform=None, target_transform=None):
        self.labels = pd.read_csv(annotations_file).to_dict(orient="split")['data']
        self.features_dir = features_dir
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        video_id, label = self.labels[idx]
        path = self.features_dir + "/" + str(video_id) + ".csv"
        features = read_feature_file(path)
        if self.transform:
            features = self.transform(features)
        if self.target_transform:
            label = self.target_transform(label)
        return features, label

In [77]:
DIM_MODEL = 74 # REPLACE WITH NUMBER OF FEATURES
CLASS_QUERY = torch.zeros(1, DIM_MODEL)

training_data = VideoDataset("train_annotations.csv", "./features", transform=transform, target_transform=target_transform, )
test_data = VideoDataset("test_annotations.csv", "./features", transform=transform, target_transform=target_transform)
train_dataloader = DataLoader(training_data, batch_size=1, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=1, shuffle=True)

In [65]:
def train_loop(model, opt, loss_fn, dataloader):
    """
    Method from "A detailed guide to Pytorch's nn.Transformer() module.", by
    Daniel Melchor: https://medium.com/@danielmelchor/a-detailed-guide-to-pytorchs-nn-transformer-module-c80afbc9ffb1
    """
    
    model.train()
    total_loss = 0
    softmax = torch.nn.Softmax(dim=1)
    
    for batch in dataloader:
        X, y = batch[0], batch[1]
        X, y = X.to(device), y.to(device)

        y_input = torch.stack([CLASS_QUERY], dim=0) # i have no idea if this is right
        y_expected = y

        # Standard training except we pass in y_input
        pred = model(X, y_input) 

        # Permute pred to have batch size first again
        pred = pred.permute(1, 2, 0)[:, :, 0]   
        pred = softmax(pred)
        loss = loss_fn(pred, y_expected)

        opt.zero_grad()
        loss.backward()
        opt.step()
    
        total_loss += loss.detach().item()
        
    return total_loss / len(dataloader)

In [71]:
def validation_loop(model, loss_fn, dataloader):
    """
    Method from "A detailed guide to Pytorch's nn.Transformer() module.", by
    Daniel Melchor: https://medium.com/@danielmelchor/a-detailed-guide-to-pytorchs-nn-transformer-module-c80afbc9ffb1
    """
    
    model.eval()
    total_loss = 0
    softmax = torch.nn.Softmax(dim=1)
    
    with torch.no_grad():
        for batch in dataloader:
            X, y = batch[0], batch[1]
            X, y = X.to(device), y.to(device)

            y_input = torch.stack([CLASS_QUERY], dim=0) # i have no idea if this is right
            y_expected = y

            # Standard training except we pass in y_input
            pred = model(X, y_input) 

            # Permute pred to have batch size first again
            pred = pred.permute(1, 2, 0)[:, :, 0]   
            pred = softmax(pred)
            loss = loss_fn(pred, y_expected)
            total_loss += loss.detach().item()
        
    return total_loss / len(dataloader)

In [72]:
def fit(model, opt, loss_fn, train_dataloader, val_dataloader, epochs):
    """
    Method from "A detailed guide to Pytorch's nn.Transformer() module.", by
    Daniel Melchor: https://medium.com/@danielmelchor/a-detailed-guide-to-pytorchs-nn-transformer-module-c80afbc9ffb1
    """
    
    # Used for plotting later on
    train_loss_list, validation_loss_list = [], []
    
    print("Training and validating model")
    for epoch in range(epochs):
        print("-"*25, f"Epoch {epoch + 1}","-"*25)
        
        train_loss = train_loop(model, opt, loss_fn, train_dataloader)
        train_loss_list += [train_loss]
        
        validation_loss = validation_loop(model, loss_fn, val_dataloader)
        validation_loss_list += [validation_loss]
        
        print(f"Training loss: {train_loss:.4f}")
        print(f"Validation loss: {validation_loss:.4f}")
        print()
        
    return train_loss_list, validation_loss_list

model = Transformer()
loss_fn = torch.nn.CrossEntropyLoss()
opt = torch.optim.SGD(model.parameters(), lr=0.001)
train_loss_list, validation_loss_list = fit(model, opt, loss_fn, train_dataloader, val_dataloader, 10)



Training and validating model
------------------------- Epoch 1 -------------------------
Training loss: 1.5839
Validation loss: 1.4780

------------------------- Epoch 2 -------------------------
Training loss: 1.4874
Validation loss: 1.4705

------------------------- Epoch 3 -------------------------
Training loss: 1.4777
Validation loss: 1.4679

------------------------- Epoch 4 -------------------------
Training loss: 1.4737
Validation loss: 1.4665

------------------------- Epoch 5 -------------------------
Training loss: 1.4714
Validation loss: 1.4655

------------------------- Epoch 6 -------------------------
Training loss: 1.4696
Validation loss: 1.4649

------------------------- Epoch 7 -------------------------
Training loss: 1.4686
Validation loss: 1.4645

------------------------- Epoch 8 -------------------------
Training loss: 1.4675
Validation loss: 1.4641

------------------------- Epoch 9 -------------------------
Training loss: 1.4669
Validation loss: 1.4638

-------

In [75]:
def predict(model, input_sequence):
    """
    Method from "A detailed guide to Pytorch's nn.Transformer() module.", by
    Daniel Melchor: https://medium.com/@danielmelchor/a-detailed-guide-to-pytorchs-nn-transformer-module-c80afbc9ffb1
    """
    model.eval()
    
    y_input = torch.stack([CLASS_QUERY], dim=0)

    pred = model(input_sequence, y_input)
        
    next_item = pred.topk(1)[1].view(-1)[-1].item() # num with highest probability

    return next_item

In [76]:
correct = 0
for batch in test_dataloader:
    pred = predict(model, batch[0])
    if pred == batch[0]:
        correct += 1
print correct / len(test_dataloader)

0
