In [107]:
import numpy as np
import tensorflow as tf
import os
import tqdm
import glob
import sys
import matplotlib.pyplot as plt
import pickle
import multiprocessing
import itertools
import random
from sklearn.model_selection import train_test_split
%matplotlib inline

In [6]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [19]:
# Restore the pickled `file2nums` mapping (looked up by clip file name further down).
# NOTE: pickle.load can execute arbitrary code -- only use it on trusted files.
with open("../data/lip_reading/file2nums.pickle", "rb") as nums_file:
    file2nums = pickle.load(nums_file)

In [92]:
# Read the alignment file; every line is stripped and split on whitespace
# into a list of tokens.
with open("../data/lip_reading/new_alig", "r") as alig_file:
    phonemes = [line.strip().split() for line in alig_file]

In [94]:
# One phone symbol per line; the position in the list is the phone's integer id.
with open("../data/lip_reading/2_features/phones", "r") as phones_file:
    id2phone = [line.strip() for line in phones_file]

In [96]:
# Inverse of `id2phone`: phone symbol -> its index in the list.
phone2id = {phone: idx for idx, phone in enumerate(id2phone)}

In [98]:
# Map each alignment entry's first token (the file name) to the list of
# integer ids of the phone symbols that follow it on the same line.
file2phomenes = {}
for utt_name, *symbols in phonemes:
    file2phomenes[utt_name] = [phone2id[symbol] for symbol in symbols]

In [102]:
def _standardize_frame(frame):
    """Return ``frame`` shifted to zero mean and scaled by its sample std (ddof=1).

    When the standard deviation is not strictly positive (constant frame, or a
    single-element frame where the ddof=1 estimate is undefined) only the mean
    is removed, so no division by zero / NaN is introduced.
    """
    centered = frame - np.mean(frame)
    std = np.std(frame, ddof=1)
    if std > 0:
        return centered / std
    return centered


def align_frames(n, initial_video, timesteps):
    """Resample a video to ``n`` steps and standardise every selected frame.

    The timestamps are divided by 10 and rounded, which puts them on the same
    integer axis as the output index ``i``.  A pointer walks through the
    frames: it moves on to the next frame as soon as ``i`` reaches the midpoint
    between the current and the next rounded timestamp (at most one frame per
    output step).  The selected frame is transposed and standardised.

    Parameters
    ----------
    n : int
        Number of output steps (the caller passes the MFCC frame count).
    initial_video : np.ndarray
        3-D array indexed ``(frame, d1, d2)``.
    timesteps : array-like
        One timestamp per video frame.  Presumably milliseconds with a 10 ms
        feature hop, given the division by 10 -- TODO confirm.

    Returns
    -------
    np.ndarray
        Float array of shape ``(n, initial_video.shape[2], initial_video.shape[1])``.

    Raises
    ------
    AssertionError
        If the number of frames and the number of timestamps differ.
    """
    # Accept plain lists as well as arrays (a list cannot be divided by 10).
    timesteps = np.asarray(timesteps, dtype=float)
    assert initial_video.shape[0] == len(timesteps), \
        "initial_video and timesteps must describe the same number of frames"
    timesteps_new = np.round(timesteps / 10)
    new_video = np.zeros((n, initial_video.shape[2], initial_video.shape[1]))
    pointer_cur = 0
    normalized = None  # standardised version of frame `pointer_cur`, computed lazily
    for i in range(n):
        if pointer_cur + 1 < len(timesteps_new):
            next_step = timesteps_new[pointer_cur + 1]
            # Midpoint between the current and the next frame, rounded towards
            # the next frame.
            midpoint = next_step - np.floor((next_step - timesteps_new[pointer_cur]) / 2)
            if midpoint <= i:
                pointer_cur += 1
                normalized = None
        if normalized is None:
            # Standardise once per source frame instead of once per output step.
            normalized = _standardize_frame(initial_video[pointer_cur].T)
        new_video[i] = normalized
    return new_video

In [103]:
# Build one synchronised .npz per training clip: the MFCC features, the video
# resampled to the MFCC length by `align_frames`, the landmarks, the digit
# labels parsed from the file name and the phoneme ids.
mfcc_dir = "../data/lip_reading/2_features/mfcc/train/"
sync_dir = "../data/lip_reading/synchronized/train/"
# np.savez does not create missing directories, so make sure the target exists.
os.makedirs(sync_dir, exist_ok=True)

# Sorted so that the processing order does not depend on the filesystem.
# NOTE: pickle.load can execute arbitrary code -- only use it on trusted files.
for file in tqdm.tqdm(sorted(glob.glob("../data/lip_reading/1_video_lips/train_*/*.pickle"))):
    file_only_name = os.path.splitext(os.path.basename(file))[0]
    # The third "_"-separated field of the file name is a digit string;
    # every digit becomes one integer label.
    labels = [int(digit) for digit in file_only_name.split("_")[2]]
    # Clips without an entry in `file2nums` are skipped.
    if file_only_name not in file2nums:
        continue
    assert file2nums[file_only_name] == labels
    # Looked up before the expensive loading/alignment so that a missing
    # alignment fails early (still a KeyError, as before).
    cur_phomenes = file2phomenes[file_only_name]
    mfcc = np.load(os.path.join(mfcc_dir, file_only_name + ".npy"))
    with open(file, "rb") as f:
        # The clip pickle holds four consecutive objects, read in this order.
        current_video = pickle.load(f)
        landmarks = pickle.load(f)
        timesteps = pickle.load(f)
        frames = pickle.load(f)  # not used by this cell
    full_video = align_frames(mfcc.shape[0], current_video, timesteps)
    file_out = os.path.join(sync_dir, file_only_name + ".npz")
    np.savez(file_out, mfcc=mfcc, video=full_video, landmarks=landmarks,
             labels=np.array(labels), phonemes=cur_phomenes)

100%|██████████| 9435/9435 [05:56<00:00, 26.47it/s]


In [114]:
# glob returns paths in arbitrary, filesystem-dependent order; sort them so
# everything derived from this list (e.g. a seeded split) is reproducible.
files = sorted(glob.glob("../data/lip_reading/synchronized/train/*.npz"))

In [116]:
# Hold out 10% of the clips for validation.  The fixed random_state makes the
# split identical on every run, so the persisted file lists can be regenerated.
files_train, files_val = train_test_split(files, test_size=0.1, random_state=42)

In [117]:
# Persist both halves of the split as pickles in the working directory.
for split_path, split_files in (("train_files.pickle", files_train),
                                ("val_files.pickle", files_val)):
    with open(split_path, "wb") as split_out:
        pickle.dump(split_files, split_out, pickle.HIGHEST_PROTOCOL)