In [37]:
import torch
import torch.nn as nn
from torch.autograd import Variable

In [38]:
class Flatten(nn.Module):
    def forward(self, x):
        out_x = x.transpose(1, 2)
        out_x = out_x.contiguous()
        dims = out_x.size()
        out_x = out_x.view(dims[0], dims[1], dims[2]*dims[3]*dims[4])
        return out_x

class LipNet(nn.Module):
    def __init__(self, hidden_size=256, vocab_size=28, n_layers=1, in_channels=1):
        super(LipNet, self).__init__()
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.in_channels = in_channels
        self.conv1 = nn.Conv3d(in_channels=self.in_channels, out_channels=32, kernel_size=(3, 5, 5), 
                               stride=(1, 2, 2), padding=(1, 2, 2))
        self.pooling = nn.MaxPool3d((1, 2, 2))
        self.batchnorm1 = nn.BatchNorm3d(32)
        self.conv2 = nn.Conv3d(in_channels=32, out_channels=64, kernel_size=(3, 5, 5), 
                               stride=(1, 2, 2), padding=(1, 2, 2))
        self.batchnorm2 = nn.BatchNorm3d(64)
        self.conv3 = nn.Conv3d(in_channels=64, out_channels=96, kernel_size=(3, 3, 3), 
                               stride=(1, 2, 2), padding=(1, 1, 1))
        self.batchnorm3 = nn.BatchNorm3d(96)
        self.flat = Flatten()
        self.gru1 = nn.GRU(input_size=96, hidden_size=hidden_size, num_layers=self.n_layers, 
                           bidirectional=True, batch_first=True)
        self.dense1 = nn.Linear(512, 28)
        self.softmax = nn.Softmax(dim=2)
    def forward(self, input, hidden):
        output = self.conv1(input)
        output = self.pooling(output)
        output = self.conv2(output)
        output = self.pooling(output)
        output = self.conv3(output)

        output = self.pooling(output)
        output = self.flat(output)
        output, hidden = self.gru1(output, hidden)
        output = self.dense1(output)
        #print(output.size())
        output = self.softmax(output)
        return output, hidden
    def init_hidden(self, batch_size):
        return Variable(torch.zeros(2, batch_size, self.hidden_size))

In [39]:
ln = LipNet()
hidden = ln.init_hidden(1)

In [41]:
a = torch.Tensor(1, 1, 75, 50, 100)

In [42]:
a = a.zero_()

In [43]:
#a.view(1, 3, 75*5000)

In [44]:
test_fuck = Variable(a)

In [46]:
ln(test_fuck, hidden)

(Variable containing:
 (0 ,.,.) = 
 1.00000e-02 *
   3.3759  3.6246  3.4595  ...   3.5942  3.5296  3.5146
   3.3747  3.6099  3.4315  ...   3.6024  3.5313  3.5329
   3.3762  3.6023  3.4185  ...   3.6078  3.5310  3.5441
            ...             ⋱             ...          
   3.3956  3.5546  3.3931  ...   3.6289  3.5439  3.5675
   3.4094  3.5355  3.3869  ...   3.6390  3.5493  3.5664
   3.4403  3.4998  3.3749  ...   3.6572  3.5567  3.5620
 [torch.FloatTensor of size 1x75x28], Variable containing:
 ( 0 ,.,.) = 
 
 Columns 0 to 8 
    0.0116  0.0214 -0.0078  0.0019 -0.0451  0.0476  0.0336 -0.0120  0.0069
 
 Columns 9 to 17 
    0.0408  0.0355  0.0540  0.0368  0.0726 -0.0092 -0.0157  0.0362  0.0592
 
 Columns 18 to 26 
    0.0019 -0.0474 -0.0481 -0.0154 -0.0135  0.0476 -0.0887 -0.0228  0.0597
 
 Columns 27 to 35 
   -0.0627  0.0537  0.0001 -0.0054  0.0166  0.0733 -0.0076 -0.0079  0.0295
 
 Columns 36 to 44 
    0.0469  0.0128 -0.0708 -0.0088  0.0632 -0.0205  0.0250  0.0269 -0.0182
 
 Colum

In [233]:
from skimage.io import imread

In [234]:
FRAME_PATH = "/media/artem/data/Dataset/faces/"

In [235]:
from os import path
import os
from tqdm import tqdm

In [None]:
def fixname(s):
    return s.split('_')[2]

speakers = {}
for s in tqdm(os.listdir(FRAME_PATH)):
    PATH = path.join(FRAME_PATH, s)
    speakers[s] = {}
    for folder in os.listdir(PATH):
        PATH2 = path.join(PATH, folder)
        speakers[s][fixname(folder)] = []
        for filename in os.listdir(PATH2):
            speakers[s][fixname(folder)].append(imread(path.join(PATH2, filename)))

 88%|████████▊ | 29/33 [08:49<01:13, 18.27s/it]

In [8]:
WORD_PATH = "/media/artem/data/WLAS/gridcorpus/"

def fixname(s):
    return s.split('.')[0]

word_alignments = {}
for s in tqdm(os.listdir(WORD_PATH)):
    PATH = path.join(WORD_PATH, s, "align")
    word_alignments[s] = {}
    for filename in os.listdir(PATH):
        word_alignments[s][fixname(filename)] = []
        with open(path.join(PATH, filename)) as ftr:
            for line in ftr:
                l1, l2, w = line.split()
                l1 = round(int(l1) / 1000) - 1
                l2 = round(int(l2) / 1000) + 1
                word_alignments[s][fixname(filename)].append((w, l1, l2))

  0%|          | 0/3 [00:00<?, ?it/s]


FileNotFoundError: [Errno 2] No such file or directory: '/media/artem/data/WLAS/gridcorpus/video/align'

In [65]:
word_alignments['s1']['sbbu1s']

[('sil', -1, 12),
 ('set', 10, 21),
 ('blue', 19, 26),
 ('by', 24, 29),
 ('u', 27, 33),
 ('one', 31, 39),
 ('soon', 37, 49),
 ('sil', 47, 75)]

In [153]:
import numpy as np

MAX_WORDS = 8
MAX_FRAMES = 8
for_valida = ["s5", "s14"]

def encode_words(s):
    res = []
    for word, _, _ in s:
        if word == 'sil':
            res.append(0)
        else:
            #print(word, s)
            res.extend(ord(a) - ord('a') + 1 for a in word)
            res.append(0)
    if s[-1][0] != 'sil':
        res.pop()
    return res

def generate_XY(speakers, word_alignments, words_lengths=(1, 2), frame_length=24, drop_rate=0.8):
    X, Y = [], []
    for s in speakers.keys():
        if s in for_valida:
            continue
        for vid in speakers[s].keys():
            if len(speakers[s][vid]) == 75 and vid in word_alignments[s] and np.random.rand() > drop_rate:
                length = np.random.choice(np.arange(*words_lengths)) 
                pos = np.random.choice(len(word_alignments[s][vid]) - length + 1)
                l, r = word_alignments[s][vid][pos][1], word_alignments[s][vid][pos + length - 1][2]
                l = max(0, l)
                if (r - l > frame_length):
                    continue
                X.append(speakers[s][vid][l:r])
                Y.append(encode_words(word_alignments[s][vid][pos:pos+length]))
    return X, Y

In [154]:
X, Y = generate_XY(speakers, word_alignments, drop_rate=0.99)

In [155]:
def add_zeros(X):
    max_len = max(len(x) for x in X)
    return np.array([x + [np.zeros((120, 120)) for i in range(max_len - len(x))] for x in X])

def iterate_batch(X, Y, batch_size=32):
    ind = np.arange(len(X))
    np.random.shuffle(ind)
    X = [X[i] for i in ind]
    Y = [Y[i] for i in ind]
    for i in range(0, len(X), batch_size):
        yield add_zeros(X[i:i+batch_size]), Y[i:i+batch_size]