In [1]:
import numpy as np
import tensorflow as tf
import os
import tqdm
import glob
import sys
import matplotlib.pyplot as plt
import pickle
import multiprocessing
import itertools
import random
from sklearn.model_selection import train_test_split
%matplotlib inline

In [2]:
# Enable IPython's autoreload extension; mode 2 re-imports edited modules
# before each cell execution so local .py changes are picked up live.
%load_ext autoreload
%autoreload 2

In [3]:
# Load the pickled `file2nums` lookup. Later cells index it by a clip's base
# file name and compare the value with the digit list parsed from that name.
# NOTE(review): pickle.load can execute arbitrary code - only use trusted files.
with open("../data/lip_reading/file2nums.pickle", "rb") as pickle_in:
    file2nums = pickle.load(pickle_in)

In [4]:
# Read the alignment file and turn every line into its list of
# whitespace-separated tokens. A later cell treats the first token as the
# file name and the remaining tokens as phone symbols.
with open("../data/lip_reading/new_alig", "r") as alignment_in:
    phonemes = [line.strip().split() for line in alignment_in]

In [5]:
# Phone inventory: one stripped line per entry, so the list index of an entry
# serves as its integer id.
with open("../data/lip_reading/2_features/phones", "r") as phones_in:
    id2phone = [line.strip() for line in phones_in]

In [6]:
# Inverse of `id2phone`: phone symbol -> position in the inventory list.
# (If a symbol were listed twice, the later position wins.)
phone2id = {phone: idx for idx, phone in enumerate(id2phone)}

In [7]:
# Map each aligned file name to its sequence of integer phone ids.
# Every entry of `phonemes` is [file name, phone, phone, ...]; an unknown
# phone symbol raises KeyError from the `phone2id` lookup.
file2phomenes = {
    filename: [phone2id[phone] for phone in phones]
    for filename, *phones in phonemes
}

In [9]:
def align_frames(n, initial_video, timesteps):
    """Stretch a video onto ``n`` output steps and standardise every frame.

    For each output index ``i`` in ``range(n)`` a source frame is selected by
    walking a pointer through the (scaled) frame timestamps: the pointer moves
    to the next frame once ``i`` reaches the midpoint between the current and
    the next timestamp. The pointer advances by at most one frame per output
    step. The selected frame is transposed and standardised to zero mean and
    unit sample standard deviation (``ddof=1``).

    Parameters
    ----------
    n : int
        Number of output steps (callers pass the number of MFCC rows).
    initial_video : numpy.ndarray
        Array indexed as ``(frame, a, b)``; each frame must be 2-D.
    timesteps : array-like
        One timestamp per video frame. Values are divided by 10 and rounded
        before being compared with the output index.
        NOTE(review): presumably milliseconds against a 10 ms feature hop -
        confirm with the feature extraction settings.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n, b, a)`` (frames are transposed).

    Raises
    ------
    AssertionError
        If the number of frames differs from the number of timestamps.
    """
    # Accept plain lists/tuples as well as arrays (a list would fail on `/ 10`).
    timesteps = np.asarray(timesteps, dtype=float)
    assert initial_video.shape[0] == len(timesteps)
    timesteps_new = np.round(timesteps / 10)
    new_video = np.zeros((n, initial_video.shape[2], initial_video.shape[1]))
    pointer_cur = 0
    cached_pointer = None  # index of the frame held in `normalized`
    normalized = None
    for i in range(n):
        if pointer_cur + 1 < len(timesteps_new):
            next_step = timesteps_new[pointer_cur + 1]
            half_gap = np.floor((next_step - timesteps_new[pointer_cur]) / 2)
            # Switch to the next frame from the midpoint between timestamps on.
            if next_step - half_gap <= i:
                pointer_cur += 1
        if pointer_cur != cached_pointer:
            # Standardise each source frame once, not once per repeated step.
            frame = initial_video[pointer_cur].T
            centered = frame - np.mean(frame)
            scale = np.std(frame, ddof=1)
            if np.isfinite(scale) and scale > 0:
                normalized = centered / scale
            else:
                # Constant frame: 0/0 would fill the output with NaNs; the
                # centred frame is all zeros, so emit zeros instead.
                normalized = np.zeros_like(centered, dtype=float)
            cached_pointer = pointer_cur
        new_video[i] = normalized
    return new_video

In [11]:
# Build one synchronized .npz per training clip, holding standardised MFCC and
# filterbank features, the video stretched to the MFCC length, the landmarks,
# the digit labels and the phone-id sequence.
for file in tqdm.tqdm(glob.glob("../data/lip_reading/1_video_lips/train_*/*.pickle")):
    file_only_name = os.path.splitext(os.path.basename(file))[0]
    # The third "_"-separated field of the file name is read character by
    # character as the integer labels.
    labels = file_only_name.split("_")[2]
    labels = list(map(int, labels))
    try:
        labels_rec = file2nums[file_only_name]
        # A mismatch between file name and transcription aborts the run.
        assert labels_rec == labels
    except KeyError:
        # Clips without an entry in file2nums are skipped.
        print("no transcription for file", file_only_name)
        continue
    mfcc_file = os.path.join("../data/lip_reading/2_features/mfcc/train/", file_only_name + ".npy")
    mfcc = np.load(mfcc_file)
    # Standardise each column with statistics taken over axis 0.
    mfcc = (mfcc - mfcc.mean(axis=0)) / mfcc.std(axis=0)

    fbanks_file = os.path.join("../data/lip_reading/2_features/fbank/train/", file_only_name + ".npy")
    fbanks = np.load(fbanks_file)
    fbanks = (fbanks - fbanks.mean(axis=0)) / fbanks.std(axis=0)

    # NOTE(review): pickle.load can execute arbitrary code - trusted files only.
    # The clip pickle holds four consecutive objects; `frames` is not used below.
    with open(file, "rb") as f:
        current_video = pickle.load(f)
        landmarks = pickle.load(f)
        timesteps = pickle.load(f)
        frames = pickle.load(f)
    full_video = align_frames(mfcc.shape[0], current_video, timesteps)
    file_out = os.path.join("../data/lip_reading/synchronized/train/", file_only_name + ".npz")
    cur_phomenes = file2phomenes[file_only_name]
    # Cut every per-step stream to the length of the phone sequence.
    # NOTE(review): assumes one phone id per feature row and an alignment no
    # longer than the features - confirm against the aligner output.
    full_video = full_video[:len(cur_phomenes)]
    mfcc = mfcc[:len(cur_phomenes)]
    # Fixed: this used to slice `mfcc`, so the saved "fbanks" array was a copy
    # of the MFCCs and the filterbank features were silently dropped.
    fbanks = fbanks[:len(cur_phomenes)]
    np.savez(file_out, mfcc=mfcc, fbanks=fbanks, video=full_video, landmarks=landmarks,
             labels=np.array(labels), phonemes=cur_phomenes)

  2%|▏         | 164/9435 [00:06<05:45, 26.81it/s]

no transcription for file F0637_01_9840153267_Android_SM


  9%|▊         | 804/9435 [00:29<04:46, 30.15it/s]

no transcription for file F0248_01_8327940651_Android_SM


  9%|▉         | 857/9435 [00:31<04:31, 31.54it/s]

no transcription for file F0343_01_7639215084_Android_SM


 10%|█         | 983/9435 [00:36<04:38, 30.31it/s]

no transcription for file F0637_01_5963184720_Android_SM
no transcription for file F0580_01_6523709841_Android_SM


 13%|█▎        | 1249/9435 [00:47<04:16, 31.86it/s]

no transcription for file F0248_01_1893642075_Android_SM


 14%|█▎        | 1278/9435 [00:49<06:50, 19.85it/s]

no transcription for file F0248_01_8207145963_Android_SM


 14%|█▍        | 1309/9435 [00:50<05:07, 26.41it/s]

no transcription for file F0211_02_9810643752_Android_SM


 19%|█▉        | 1779/9435 [01:09<04:19, 29.46it/s]

no transcription for file F0211_03_1067453829_Android_SM


 21%|██        | 1938/9435 [01:17<07:17, 17.15it/s]

no transcription for file F0580_01_7862593104_Android_SM


 24%|██▍       | 2304/9435 [01:32<03:43, 31.94it/s]

no transcription for file F0211_03_3581479260_Android_SM


 25%|██▍       | 2351/9435 [01:34<05:29, 21.52it/s]

no transcription for file F0211_02_6028371954_Android_SM


 28%|██▊       | 2615/9435 [01:45<05:16, 21.52it/s]

no transcription for file F0275_01_4702591863_Android_SM


 28%|██▊       | 2624/9435 [01:45<08:18, 13.66it/s]

no transcription for file F0283_01_3607981452_Android_SM


 31%|███       | 2881/9435 [01:57<06:46, 16.14it/s]

no transcription for file F0211_02_6702385914_Android_SM


 32%|███▏      | 3037/9435 [02:03<04:55, 21.63it/s]

no transcription for file M0262_01_8794263051_Android_SM


 32%|███▏      | 3056/9435 [02:04<03:53, 27.35it/s]

no transcription for file M0342_01_7639215084_Android_SM


 33%|███▎      | 3151/9435 [02:07<03:16, 32.04it/s]

no transcription for file M0165_01_8567201493_iPhone_iphone6
no transcription for file M0107_01_0198654273_Android_SM


 39%|███▉      | 3679/9435 [02:30<03:25, 27.97it/s]

no transcription for file M0273_01_0921685374_Android_SM


 41%|████      | 3875/9435 [02:39<03:03, 30.32it/s]

no transcription for file M0342_01_2065387194_Android_SM


 41%|████      | 3888/9435 [02:39<03:19, 27.80it/s]

no transcription for file M0165_01_4695230871_iPhone_iphone6


 44%|████▍     | 4163/9435 [02:51<03:47, 23.14it/s]

no transcription for file M0262_01_0921685374_Android_SM


 44%|████▍     | 4193/9435 [02:52<03:01, 28.84it/s]

no transcription for file M0107_01_6851047923_Android_SM


 47%|████▋     | 4409/9435 [03:02<03:03, 27.45it/s]

no transcription for file M0140_01_4012579836_Android_SM


 51%|█████     | 4773/9435 [03:18<03:49, 20.31it/s]

no transcription for file M0190_03_83971_iPhone_iphone6


 52%|█████▏    | 4912/9435 [03:24<02:39, 28.40it/s]

no transcription for file M0100_01_4027851963_Android_htc


 52%|█████▏    | 4950/9435 [03:25<02:48, 26.57it/s]

no transcription for file M0273_01_4702591863_Android_SM


 58%|█████▊    | 5441/9435 [03:46<02:14, 29.63it/s]

no transcription for file M0100_01_1698075324_Android_htc


 58%|█████▊    | 5462/9435 [03:46<02:04, 31.80it/s]

no transcription for file M0153_02_3752809461_Android_SM


 59%|█████▊    | 5532/9435 [03:49<02:10, 29.94it/s]

no transcription for file M0100_01_5794182036_Android_htc
no transcription for file M0107_01_0362491785_Android_SM


 59%|█████▉    | 5607/9435 [03:52<02:15, 28.29it/s]

no transcription for file M0103_03_4532091867_iPhone_iphone6


 61%|██████    | 5731/9435 [03:57<02:37, 23.54it/s]

no transcription for file M0116_01_8521607493_Android_SM


 63%|██████▎   | 5985/9435 [04:08<02:23, 24.06it/s]

no transcription for file M026_01_20914_Android_SM


 65%|██████▍   | 6119/9435 [04:14<02:09, 25.51it/s]

no transcription for file M0342_01_4857312906_Android_SM


 65%|██████▍   | 6131/9435 [04:15<01:51, 29.65it/s]

no transcription for file M0190_03_6382074195_iPhone_iphone6


 66%|██████▌   | 6181/9435 [04:16<01:48, 29.86it/s]

no transcription for file M0291_01_3607981452_Android_SM


 66%|██████▌   | 6205/9435 [04:17<01:49, 29.53it/s]

no transcription for file M0614_01_7231908456_Android_SM


 67%|██████▋   | 6296/9435 [04:21<01:43, 30.24it/s]

no transcription for file M0756_01_2546710893_Android_SM


 67%|██████▋   | 6367/9435 [04:24<01:54, 26.69it/s]

no transcription for file M0760_01_3658079142_iPhone_iphone6


 71%|███████   | 6693/9435 [04:38<01:40, 27.26it/s]

no transcription for file M0717_01_9831657024_Android_SM


 72%|███████▏  | 6772/9435 [04:41<01:34, 28.21it/s]

no transcription for file M0612_02_4796280531_Android_SM


 73%|███████▎  | 6911/9435 [04:46<01:30, 27.80it/s]

no transcription for file M0756_01_1859634720_Android_SM


 73%|███████▎  | 6918/9435 [04:47<02:48, 14.98it/s]

no transcription for file M0622_01_8326501794_Android_SM


 75%|███████▍  | 7072/9435 [04:53<01:38, 23.98it/s]

no transcription for file M0614_01_8326501794_Android_SM


 75%|███████▌  | 7079/9435 [04:53<01:35, 24.70it/s]

no transcription for file M0760_01_6287915430_iPhone_iphone6


 80%|████████  | 7554/9435 [05:14<01:00, 30.98it/s]

no transcription for file M0612_01_6523709841_Android_SM


 81%|████████  | 7612/9435 [05:16<01:36, 18.95it/s]

no transcription for file M0487_03_4617392580_iPhone_iphone6


 82%|████████▏ | 7732/9435 [05:22<02:05, 13.57it/s]

no transcription for file M0622_01_7231908456_Android_SM


 82%|████████▏ | 7759/9435 [05:23<01:07, 24.72it/s]

no transcription for file M0746_03_7154386902_iPhone_iphone6


 84%|████████▍ | 7934/9435 [05:30<00:50, 29.63it/s]

no transcription for file M0568_01_5987236041_Android_SM


 85%|████████▍ | 7983/9435 [05:31<00:46, 31.01it/s]

no transcription for file M0717_01_23456_Android_SM


 88%|████████▊ | 8304/9435 [05:45<00:46, 24.08it/s]

no transcription for file M0746_03_4617392580_iPhone_iphone6


 90%|████████▉ | 8480/9435 [05:52<00:27, 35.23it/s]

no transcription for file M0762_01_1859634720_Android_SM


 90%|█████████ | 8508/9435 [05:53<00:26, 35.13it/s]

no transcription for file M0610_01_5903264178_Android_SM


 90%|█████████ | 8529/9435 [05:54<00:28, 32.31it/s]

no transcription for file M0746_01_3658079142_iPhone_iphone6


 91%|█████████ | 8578/9435 [05:56<00:39, 21.54it/s]

no transcription for file M0612_02_5804163729_Android_SM


 92%|█████████▏| 8689/9435 [06:00<00:25, 29.52it/s]

no transcription for file M0756_01_1269805473_Android_SM


 93%|█████████▎| 8756/9435 [06:03<00:21, 31.99it/s]

no transcription for file M0746_03_8526973014_iPhone_iphone6


 93%|█████████▎| 8814/9435 [06:05<00:19, 31.18it/s]

no transcription for file M0476_01_6287915430_iPhone_iphone6


 96%|█████████▌| 9053/9435 [06:15<00:14, 26.74it/s]

no transcription for file M0717_01_4698057312_Android_SM


 98%|█████████▊| 9276/9435 [06:25<00:06, 25.03it/s]

no transcription for file M0612_01_7862593104_Android_SM


100%|█████████▉| 9422/9435 [06:31<00:00, 26.36it/s]

no transcription for file M0717_01_1758490263_Android_SM


100%|█████████▉| 9432/9435 [06:31<00:00, 23.59it/s]

no transcription for file M0487_03_7154386902_iPhone_iphone6


100%|██████████| 9435/9435 [06:31<00:00, 24.63it/s]


In [114]:
# Collect every synchronized training archive. glob.glob already returns a
# list, in whatever order the file system yields it.
files = glob.glob("../data/lip_reading/synchronized/train/*.npz")

In [116]:
# Hold out 10% of the synchronized files for validation.
# Sorting the paths and fixing random_state makes the split reproducible
# across kernel restarts; glob order and the default shuffle are otherwise
# arbitrary, so the pickled train/val lists could not be regenerated.
files_train, files_val = train_test_split(sorted(files), test_size=0.1, random_state=42)

In [117]:
# Persist both halves of the split next to the notebook so that other
# notebooks/scripts can reuse exactly the same file lists.
for split_path, split_files in (
    ("train_files.pickle", files_train),
    ("val_files.pickle", files_val),
):
    with open(split_path, "wb") as f:
        pickle.dump(split_files, f, pickle.HIGHEST_PROTOCOL)

In [14]:
# Test-split counterpart of the training synchronisation loop: writes one
# .npz per test clip with standardised MFCC/filterbank features, the video
# stretched to the MFCC length, the landmarks and the digit labels.
# No cross-check against file2nums and no phone-based truncation is done here.
test_pickles = glob.glob("../data/lip_reading/1_video_lips/test/*.pickle")
for file in tqdm.tqdm(test_pickles):
    file_only_name, _ext = os.path.splitext(os.path.basename(file))
    # Each character of the third "_"-separated name field becomes one label.
    labels = [int(digit) for digit in file_only_name.split("_")[2]]

    # Input and output locations for this clip.
    mfcc_file = os.path.join("../data/lip_reading/2_features/mfcc/test/", file_only_name + ".npy")
    fbanks_file = os.path.join("../data/lip_reading/2_features/fbank/test/", file_only_name + ".npy")
    file_out = os.path.join("../data/lip_reading/synchronized/test/", file_only_name + ".npz")

    # Column-wise standardisation with statistics taken over axis 0.
    mfcc = np.load(mfcc_file)
    mfcc = (mfcc - mfcc.mean(axis=0)) / mfcc.std(axis=0)

    fbanks = np.load(fbanks_file)
    fbanks = (fbanks - fbanks.mean(axis=0)) / fbanks.std(axis=0)

    # The clip pickle stores four consecutive objects, read in this order.
    with open(file, "rb") as f:
        current_video, landmarks, timesteps, frames = (pickle.load(f) for _ in range(4))

    full_video = align_frames(mfcc.shape[0], current_video, timesteps)
    np.savez(
        file_out,
        mfcc=mfcc,
        fbanks=fbanks,
        video=full_video,
        landmarks=landmarks,
        labels=np.array(labels),
    )


  0%|          | 0/2909 [00:00<?, ?it/s][A
  0%|          | 1/2909 [00:00<06:05,  7.97it/s][A
  0%|          | 3/2909 [00:00<05:04,  9.56it/s][A
  0%|          | 5/2909 [00:00<04:20, 11.13it/s][A
  0%|          | 7/2909 [00:00<03:48, 12.70it/s][A
  0%|          | 9/2909 [00:00<03:44, 12.94it/s][A
  0%|          | 13/2909 [00:00<03:08, 15.37it/s][A
  1%|          | 16/2909 [00:00<02:41, 17.91it/s][A
  1%|          | 19/2909 [00:01<02:36, 18.46it/s][A
  1%|          | 23/2909 [00:01<02:13, 21.58it/s][A
  1%|          | 26/2909 [00:01<02:37, 18.35it/s][A
  1%|          | 29/2909 [00:01<02:25, 19.75it/s][A
  1%|          | 32/2909 [00:01<02:27, 19.48it/s][A
  1%|          | 35/2909 [00:01<02:39, 18.01it/s][A
  1%|▏         | 37/2909 [00:01<02:43, 17.57it/s][A
  1%|▏         | 40/2909 [00:02<02:25, 19.71it/s][A
  1%|▏         | 43/2909 [00:02<02:14, 21.24it/s][A
  2%|▏         | 46/2909 [00:02<02:24, 19.86it/s][A
  2%|▏         | 49/2909 [00:02<02:18, 20.63it/s][A
  2%|▏