In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable

In [3]:
class LipNet(nn.Module):
    """Unfinished skeleton of a 3D-convolution + GRU network.

    Only the layer attributes are declared; `forward` is still a stub, so
    the way the layers are meant to be chained is not defined in this class.

    NOTE(review): as written the constructor is expected to fail, because
    the GRU and Linear layers are created without their required size
    arguments (see the inline notes below).
    """
    def __init__(self):
        super(LipNet, self).__init__()
        # Shared max-pooling layer; kernel is 1 along the first pooled dim and 2 along the last two.
        self.pooling = nn.MaxPool3d((1, 2, 2))
        # Three Conv3d + BatchNorm3d pairs; channel widths go 1 -> 32 -> 64 -> 96.
        self.conv1 = nn.Conv3d(1, 32, (3, 5, 5), stride=(1, 2, 2), padding=(1, 2, 2))
        self.batchnorm1 = nn.BatchNorm3d(32)
        self.conv2 = nn.Conv3d(32, 64, (3, 5, 5), stride=(1, 2, 2), padding=(1, 2, 2))
        self.batchnorm2 = nn.BatchNorm3d(64)
        # NOTE(review): kernel is (3, 3, 3) here but padding is still (1, 2, 2) as for the
        # 5x5 kernels above, so the last two dims are over-padded - confirm this is intended.
        self.conv3 = nn.Conv3d(64, 96, (3, 3, 3), stride=(1, 2, 2), padding=(1, 2, 2))
        self.batchnorm3 = nn.BatchNorm3d(96)
        # NOTE(review): nn.GRU requires input_size and hidden_size; neither is passed, so
        # these two lines are expected to raise TypeError until the sizes are filled in.
        self.gru1 = nn.GRU(num_layers=2, bidirectional=True, batch_first=True)
        self.gru2 = nn.GRU(num_layers=2, bidirectional=True, batch_first=True)
        # NOTE(review): nn.Linear requires in_features and out_features; both are missing.
        self.dense1 = nn.Linear()
        self.dense2 = nn.Linear()
    def forward(self, input):
        """Placeholder forward pass: not implemented, currently returns None."""
        pass

In [4]:
from skimage.io import imread

In [29]:
# Root directory of the frame images. The loading cell walks it as
# FRAME_PATH/<speaker>/<folder>/<image file>.
# NOTE(review): hard-coded absolute path; adjust for the local machine.
FRAME_PATH = "/datadrive/faces/"

In [30]:
from os import path
import os
from tqdm import tqdm

In [45]:
def fixname(s):
    """Return the third underscore-separated field of `s` (index 2).

    Raises IndexError when `s` has fewer than three such fields.
    NOTE: a later cell defines another `fixname` with different behavior,
    which replaces this one once that cell has run.
    """
    fields = s.split('_')
    return fields[2]

# Load every frame image into memory as speakers[<speaker>][<video id>] -> list of frames.
speakers = {}
for s in tqdm(os.listdir(FRAME_PATH)):
    PATH = path.join(FRAME_PATH, s)
    speakers[s] = {}
    for folder in os.listdir(PATH):
        PATH2 = path.join(PATH, folder)
        # os.listdir returns entries in arbitrary order, but the frame lists are later
        # sliced by frame index, so the files must be read in a stable temporal order.
        # Sorting by (length, name) also orders unpadded numeric suffixes correctly
        # (e.g. "..._9" before "..._10"); presumes the frame files of one video share
        # a common prefix and extension - verify against the data.
        frame_files = sorted(os.listdir(PATH2), key=lambda name: (len(name), name))
        speakers[s][fixname(folder)] = [
            imread(path.join(PATH2, filename)) for filename in frame_files
        ]

100%|██████████| 33/33 [07:09<00:00, 13.02s/it]


In [64]:
# Root directory of the word-alignment files. The loop below reads
# WORD_PATH/<speaker>/align/<file>, where each line holds "start end word".
# NOTE(review): hard-coded absolute home-directory path; adjust for the local machine.
WORD_PATH = "/home/mavrandr/gridcorpus/words/"

def fixname(s):
    """Return the part of `s` before its first '.' (all of `s` if there is none).

    NOTE: this redefines the `fixname` from the frame-loading cell above.
    """
    stem, _, _ = s.partition('.')
    return stem

# Build word_alignments[<speaker>][<utterance id>] -> list of (word, start, end),
# one tuple per line of the alignment file, in file order.
word_alignments = {}
for s in tqdm(os.listdir(WORD_PATH)):
    PATH = path.join(WORD_PATH, s, "align")
    per_speaker = word_alignments[s] = {}
    for filename in os.listdir(PATH):
        # Register the (initially empty) list before opening the file.
        segments = per_speaker[fixname(filename)] = []
        with open(path.join(PATH, filename)) as ftr:
            for line in ftr:
                l1, l2, w = line.split()
                # Timestamps are divided by 1000 and rounded, then widened by one
                # unit on each side (so the first start can come out as -1).
                l1 = round(int(l1) / 1000) - 1
                l2 = round(int(l2) / 1000) + 1
                segments.append((w, l1, l2))

100%|██████████| 34/34 [00:02<00:00, 12.83it/s]


In [65]:
# Spot-check one utterance: a list of (word, start, end) tuples.
word_alignments['s1']['sbbu1s']

[('sil', -1, 12),
 ('set', 10, 21),
 ('blue', 19, 26),
 ('by', 24, 29),
 ('u', 27, 33),
 ('one', 31, 39),
 ('soon', 37, 49),
 ('sil', 47, 75)]

In [153]:
import numpy as np

# NOTE(review): MAX_WORDS and MAX_FRAMES are not referenced by any cell visible here.
MAX_WORDS = 8
MAX_FRAMES = 8
# Speaker ids that generate_XY skips (presumably held out for validation - confirm).
for_valida = ["s5", "s14"]

def encode_words(s):
    """Encode a sequence of aligned words as a flat list of integer labels.

    Parameters
    ----------
    s : sequence of (word, start, end) tuples
        Only the word is used; the other two fields are ignored.

    Returns
    -------
    list of int
        Each character `a` of a non-'sil' word becomes ``ord(a) - ord('a') + 1``
        (1..26 for lowercase a-z), followed by a 0 after the word. A 'sil'
        entry contributes a single 0. If the last entry is not 'sil', the
        trailing 0 is removed. An empty input yields an empty list.
    """
    if len(s) == 0:
        # Nothing to encode; without this guard the s[-1] lookup below raises IndexError.
        return []
    res = []
    for word, _, _ in s:
        if word == 'sil':
            res.append(0)
        else:
            res.extend(ord(a) - ord('a') + 1 for a in word)
            res.append(0)
    # Drop the separator that follows the final word unless the sequence ends in silence.
    if s[-1][0] != 'sil':
        res.pop()
    return res

def generate_XY(speakers, word_alignments, words_lengths=(1, 2), frame_length=24, drop_rate=0.8):
    """Randomly sample (frame clip, encoded words) training pairs.

    For every speaker not listed in the module-level `for_valida`, each video
    that has exactly 75 frames and an alignment entry is kept with probability
    of roughly ``1 - drop_rate``. For a kept video, a run of consecutive
    aligned words is chosen at random and the frames spanning it are cut out.

    Parameters
    ----------
    speakers : dict
        ``speakers[speaker][video_id]`` -> list of frames.
    word_alignments : dict
        ``word_alignments[speaker][video_id]`` -> list of (word, start, end).
    words_lengths : tuple
        Arguments passed to ``np.arange``; the number of words per sample is
        drawn from that range. The upper bound is exclusive, so the default
        ``(1, 2)`` always yields single words.
    frame_length : int
        Samples spanning more than this many frames are discarded.
    drop_rate : float
        A video is used only when a uniform draw in [0, 1) exceeds this value.

    Returns
    -------
    (X, Y) : tuple of lists
        ``X[i]`` is a list of frames and ``Y[i]`` the matching label list
        produced by `encode_words`.
    """
    X, Y = [], []
    for s in speakers.keys():
        if s in for_valida:
            continue
        for vid, frames in speakers[s].items():
            if len(frames) != 75 or vid not in word_alignments[s]:
                continue
            # The random draw comes last, exactly as in the original `and` chain, so
            # the random stream is only consumed for otherwise eligible videos.
            if not (np.random.rand() > drop_rate):
                continue
            alignment = word_alignments[s][vid]
            length = np.random.choice(np.arange(*words_lengths))
            if length > len(alignment):
                # Fewer aligned words than requested (e.g. an empty alignment file):
                # np.random.choice would raise ValueError on a non-positive range.
                continue
            pos = np.random.choice(len(alignment) - length + 1)
            # Start of the first chosen word (clamped, since starts can be -1) and
            # end of the last chosen word.
            l = max(0, alignment[pos][1])
            r = alignment[pos + length - 1][2]
            if r - l > frame_length:
                continue
            X.append(frames[l:r])
            Y.append(encode_words(alignment[pos:pos + length]))
    return X, Y

In [154]:
# drop_rate=0.99: a video is used only when a uniform draw exceeds 0.99,
# so roughly 1% of the eligible videos are sampled.
X, Y = generate_XY(speakers, word_alignments, drop_rate=0.99)

In [155]:
def add_zeros(X, frame_shape=None):
    """Pad every clip in `X` with all-zero frames up to the longest clip's length.

    Parameters
    ----------
    X : sequence of clips
        Each clip is a sequence of equally-shaped frame arrays.
    frame_shape : tuple, optional
        Shape of the zero frames used for padding. When omitted it is taken
        from the first frame found in `X`, falling back to ``(120, 120)`` if
        every clip is empty.

    Returns
    -------
    numpy.ndarray
        Array built from the padded clips; an empty array when `X` is empty.
    """
    if len(X) == 0:
        # max() over an empty batch would raise ValueError.
        return np.array([])
    max_len = max(len(x) for x in X)
    if frame_shape is None:
        # Infer the pad shape from the data instead of assuming 120x120 frames.
        frame_shape = next((np.shape(x[0]) for x in X if len(x) > 0), (120, 120))
    return np.array([
        # list(x) so tuple/array clips are concatenated with the padding, not broadcast-added.
        list(x) + [np.zeros(frame_shape) for _ in range(max_len - len(x))]
        for x in X
    ])

def iterate_batch(X, Y, batch_size=32):
    """Yield shuffled mini-batches of (padded clips, labels).

    The samples are shuffled once with a single random permutation applied to
    both `X` and `Y`, then emitted in consecutive chunks of at most
    `batch_size`. Each chunk of clips is padded by `add_zeros`; the labels
    are yielded as a plain list slice.
    """
    order = np.arange(len(X))
    np.random.shuffle(order)
    shuffled_x = [X[k] for k in order]
    shuffled_y = [Y[k] for k in order]
    total = len(shuffled_x)
    for start in range(0, total, batch_size):
        stop = start + batch_size
        yield add_zeros(shuffled_x[start:stop]), shuffled_y[start:stop]