In [1]:
import os
from datasets.rwth import RWTHSignDataset

import torch
import torch.nn as nn

import torchvision.transforms as transforms
from torch.utils.data import DataLoader

from models.convlstm import ConvLSTM

%load_ext autoreload
%autoreload 2

In [2]:
class RWTHFrameStack(RWTHSignDataset):
    def __init__(self, wsize = 12, stride = 7, **kwargs):
        super(RWTHFrameStack, self).__init__(**kwargs)
        self.wsize = wsize
        self.stride = stride
        
    def __getitem__(self, idx):
        self.frames, _ = super(RWTHFrameStack, self).__getitem__(idx)
        flen = len(self.frames)
        if flen:
            self.framestack = [self.frames[i:i+self.wsize] \
                           for i in range(0, flen - self.wsize, self.stride)]
            self.framestack = [torch.stack(f) for f in self.framestack]
            return torch.stack(self.framestack)
        else:
            self.framestack = torch.Tensor([])
            return self.framestack
        
    def __len__(self):
        return len(self.framestack)

In [3]:
preprocess = preprocess = transforms.Compose([
    transforms.Resize((257, 257)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

In [4]:
basedir = "/home/reverie/datasets/phoenix2014-release/phoenix-2014-multisigner"
csvf = "annotations/manual/train.corpus.csv"

csvf = os.path.join(basedir, csvf)
signdir = os.path.join(basedir, "features/fullFrame-210x260px/train/")
rwthfs = RWTHFrameStack(csv_file = csvf, sign_dir=signdir, transform=preprocess)

In [5]:
t = rwthfs.__getitem__(18)

In [6]:
t.shape

torch.Size([14, 12, 3, 257, 257])

In [7]:
#import h5py
#from tqdm import tqdm

            

In [8]:
class ConvAutoencoder(nn.Module):
    def __init__(self):
        super(ConvAutoencoder, self).__init__()
        # Encoder
        self.conv1 = nn.Conv2d(3, 128, kernel_size=11, stride=4, padding=1) 
        self.conv2 = nn.Conv2d(128, 64, kernel_size=5, stride=2, padding=1)
        self.clstm1 = ConvLSTM((31, 31), 64, \
                               kernel_size=(3, 3), hidden_dim=64, num_layers=1, \
                               batch_first=True)#, return_all_layers=True)
        # Bottleneck
        self.bottleneck = ConvLSTM((31, 31), 64, \
                                   kernel_size=(3, 3), hidden_dim=32, num_layers=1, \
                                   batch_first = True)#, return_all_layers=True)
        
        # Decoder
        self.clstm2 = ConvLSTM((31, 31), 32, \
                               kernel_size=(3, 3), hidden_dim=64, num_layers=1, \
                               batch_first = True)#, return_all_layers=True)
        self.deconv1 = nn.ConvTranspose2d(64, 128, kernel_size=5, stride=2, padding=1)
        self.deconv2 = nn.ConvTranspose2d(128, 3, kernel_size=11, stride=4, padding=1)
    
    def encode(self, x):
        """
        Encode including bottleneck.
        """
        self.batch_size, self.num_frames = x.shape[0], x.shape[1]
        x = x.view(-1, x.shape[2], x.shape[3], x.shape[4])
        x = self.conv1(x)
        x = self.conv2(x)
        #breakpoint()
        x = x.view(-1, self.num_frames, x.shape[1], x.shape[2], x.shape[3])
        #breakpoint()
        x, _ = self.clstm1(x)
        x, _ = self.bottleneck(x[0])
        return x[0]
    
    def decode(self, x):
        x, _ = self.clstm2(x)
        x = x[0]
        x = x.view(-1, x.shape[2], x.shape[3], x.shape[4])
        breakpoint()
        x = self.deconv1(x)
        x = self.deconv2(x)
        x = x.view(self.batch_size, self.num_frames, x.shape[1], x.shape[2], x.shape[3])
        return x
    
    def forward(self, x):
        """
        x is a video sequence of size T x 3 x 256 x 256
        """
        x = self.encode(x)
        x = self.decode(x)
        return x        
        

In [9]:
cae = ConvAutoencoder()

In [10]:
cae = cae.cuda()

In [11]:
t.shape

torch.Size([14, 12, 3, 257, 257])

In [12]:
t = cae.forward(t.cuda())

> <ipython-input-8-5b4d696367b5>(42)decode()
-> x = self.deconv1(x)
(Pdb) c


In [13]:
t.shape

torch.Size([14, 12, 3, 257, 257])

In [14]:
#nn.ConvTranspose2d