In [129]:
import os
import numpy as np
import lmdb
import pickle

from tqdm import tqdm

import torch
from torch.utils.data import Dataset
from torchvision import datasets
from collections import namedtuple

import custom_datasets as cds

from dataset import ImageFileDataset, CodeRow

In [75]:
# Copy the VQVAE2 helper modules next to this notebook so they can be imported
# by their bare module names (`dataset`, `custom_datasets`).
# NOTE(review): the import cell at the top of the notebook already imports both
# modules, so on a fresh kernel this copy has to run before that cell.
!cp VQVAE2/dataset.py .
!cp VQVAE2/custom_datasets.py .

In [16]:
# Filesystem locations used by the rest of the notebook, keyed by a short name.
paths = {
    'dataset': 'How2Sign-Keypoints',
    'lmdb': 'VQVAE2/codes',
}
# The label CSV files sit in a sub-directory of the dataset folder.
paths['labels'] = paths['dataset'] + '/Labels'

# Sanity check: report whether each configured location exists on disk.
for key, location in paths.items():
    print(f'{key} path exists: {os.path.exists(location)}')

dataset path exists: True
lmdb path exists: True
labels path exists: True


In [62]:
# Open the existing LMDB database read-only and without file locking.
# The handle is kept open because later cells read from `env` again.
env = lmdb.open(
    paths['lmdb'],
    readonly=True,
    lock=False,
    max_readers=32,
    readahead=False,
    meminit=False,
)

# The record count is stored as UTF-8 text under the key b'length'.
with env.begin(write=False) as txn:
    raw_length = txn.get(b'length')
    lmdb_length = int(raw_length.decode('utf-8'))

    print(f'{lmdb_length} items in lmdb database')

# Show which label files are available.
print(f'listing labels directory:\n{os.listdir(paths["labels"])}')

1003760 items in lmdb database
listing labels directory:
['how2sign_train.csv', 'how2sign_realigned_test.csv', 'how2sign_test.csv', 'how2sign_realigned_train.csv', 'how2sign_val.csv', 'how2sign_realigned_val.csv']


In [101]:
# Load the training transcript table. Rows are tab-separated and the first
# line is the header.
with open(os.path.join(paths['labels'], 'how2sign_train.csv')) as r:
    label_lines = r.read().splitlines()

# Split the header on tabs, exactly like the data rows, so header positions
# always line up with row positions (a bare .split() would break on any
# column name containing a space).
heading = label_lines[0].split('\t')
label_rows = [line.split('\t') for line in label_lines[1:]]
heading2index = {x: i for i, x in enumerate(heading)}

print(f'{len(label_rows)} rows in labels (before dicting up)')

# Resolve the needed column positions once instead of once per row.
name_col = heading2index['SENTENCE_NAME']
sentence_col = heading2index['SENTENCE']
start_col = heading2index['START']
end_col = heading2index['END']

# Map SENTENCE_NAME -> [SENTENCE, START, END]. Duplicate sentence names
# overwrite each other; the before/after counts printed here expose that.
labels = {
    fields[name_col]: [fields[sentence_col], fields[start_col], fields[end_col]]
    for fields in label_rows
}
sentence_names = list(labels)

print(f'{len(labels)} rows in labels (after dicting up)')
print()
print(heading)
print()
# Preview up to ten entries; slicing avoids an IndexError on short files.
for i, index in enumerate(sentence_names[:10]):
    print(f'{i+1}. {index}\t{labels[index][0]}')

31165 rows in labels (before dicting up)
31165 rows in labels (after dicting up)

['VIDEO_ID', 'VIDEO_NAME', 'SENTENCE_ID', 'SENTENCE_NAME', 'START', 'END', 'SENTENCE']

1. --7E2sU6zP4_10-5-rgb_front	And I call them decorative elements because basically all they're meant to do is to enrich and color the page.
2. --7E2sU6zP4_11-5-rgb_front	So they don't really have much of a symbolic meaning other than maybe life is richer, life is beautiful, but they've become so beautifully stylized and so you find them in different illuminative being rendered in very different ways.
3. --7E2sU6zP4_12-5-rgb_front	Now this is very, this is actually an insert of a kind of an envelope for stationary, and this is a very Italian design.
4. --7E2sU6zP4_13-5-rgb_front	This is all the you know, take off on the idea of the acanthus leaf.
5. --7E2sU6zP4_5-5-rgb_front	It's almost has a feathery like posture to it.
6. --7E2sU6zP4_6-5-rgb_front	And so, it's used in architecture as a decorative element in architect

In [104]:
# Maps each stored row's filename to its `top` and `bottom` fields.
labels_mapper = {}

# Not read anywhere in the visible cells; kept so the notebook namespace is unchanged.
index_found = 0

# Records are stored under the keys "0" .. str(lmdb_length - 1) as pickled rows.
# NOTE(review): unpickling requires the row class to be importable in this
# kernel, and pickle should only ever be used on trusted data.
with env.begin(write=False) as txn:
    for position in range(lmdb_length):
        record = pickle.loads(txn.get(str(position).encode('utf-8')))
        labels_mapper[record.filename] = dict(top=record.top, bottom=record.bottom)
        

In [125]:
# Presumably frames per second; not referenced in the visible cells.
fps = 30

# sentence name -> list of per-frame entries, and sentence name -> list of
# the matching numeric frame suffixes, both filled in the same order.
labels_condensed = {}
labels_indices = {}

# Visit the per-frame keys in ascending order of their trailing "_<number>"
# suffix, so every per-sentence list ends up sorted by that number.
ordered_frame_keys = sorted(labels_mapper, key=lambda name: int(name.split('_')[-1]))

for frame_key in ordered_frame_keys:
    prefix, frame_number = frame_key.rsplit('_', 1)
    # Drop the '_right' marker so the name has the same form as the label keys.
    sentence_name = prefix.replace('_right', '')
    labels_condensed.setdefault(sentence_name, []).append(labels_mapper[frame_key])
    labels_indices.setdefault(sentence_name, []).append(int(frame_number))

In [126]:
# Count the condensed sentence names that also have a transcript entry.
condensed_names = set(labels_condensed)
common = len(condensed_names & set(labels))

print(f'{common} common out of {len(set(labels_condensed.keys()))}')

# Spot-check one sentence: show its key and the frame numbers collected for it.
k = list(labels_condensed)[1]
print(f'example key: {k}')

print()
print(labels_indices[k])

7250 common out of 7250
example key: dIhOvhzskUg_6-8-rgb_front

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 

In [131]:
# Upper bound on the size of the output database: 100 GiB.
map_size = 100 * 1024 * 1024 * 1024

# Output database holding one record per sentence.
lmdb_env = lmdb.open('video_codes', map_size=map_size)

# One record: the per-frame codes of a sentence, its [SENTENCE, START, END]
# label entry, the frame numbers of the codes, and the sentence name.
# NOTE(review): pickle stores this class by name, so any reader has to define
# or import a compatible `CodeRowVideos` before unpickling.
CodeRowVideos = namedtuple('CodeRowVideos', ['code', 'labels', 'indices', 'filename'])

with lmdb_env.begin(write=True) as txn:
    pbar = tqdm(labels_condensed.keys())

    # Records are keyed "0", "1", ... in iteration order of `labels_condensed`.
    for index, key in enumerate(pbar):
        row = CodeRowVideos(
            code=labels_condensed[key],
            labels=labels[key],
            indices=labels_indices[key],
            filename=key,
        )

        txn.put(str(index).encode('utf-8'), pickle.dumps(row))
        pbar.set_description(f'inserted: {index}')

    # Store the number of records, not the last index. The reading code in
    # this notebook iterates `range(length)`, so writing the last index would
    # silently drop the final record. Using len() also keeps this line valid
    # when there is nothing to insert (the loop variable would be undefined).
    txn.put('length'.encode('utf-8'), str(len(labels_condensed)).encode('utf-8'))

inserted: 7249: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7250/7250 [01:46<00:00, 68.31it/s]


In [None]:
# Copy the notebook file into the SLP utils directory.
# NOTE(review): the destination is an absolute, user-specific path, so this
# cell only works on the original author's machine; confirm before re-running.
!cp process_data_for_pixelsnail.ipynb /home2/bipasha31/python_scripts/CurrentWork/SLP/utils