In [16]:
import glob
import os
import re
import json
import numpy as np

os.listdir('.')

['how2sign_train.csv',
 'train.txt',
 'common.txt',
 'all.txt',
 'how2sign_val.csv',
 '.ipynb_checkpoints',
 'how2sign_realigned_train.csv',
 '.how2sign_realigned_train.csv.swp',
 'how2sign_realigned_test.csv',
 'test.txt',
 'how2sign_realigned_val.csv',
 'create_labels_from_csv.ipynb',
 'val.txt',
 'how2sign_test.csv']

In [26]:
def extract_data(path):
    def isempty(x):
        return x.isspace() or x ==""
    content = open(path).read().splitlines()
    
    content = [x.split('\t')[-1] for x in content[1:]]
    content = [x.replace('-', ' ') for x in content]
    content = [re.sub('[^a-zA-Z\d\s]', '', x) for x in content]
    
    content = [[y.lower() for y in x.split(' ') if not isempty(y)] for x in content]
    sizes = [len(x) for x in content]
    content = [x for y in content for x in y]
    
    return list(set(content)), sizes
    
train, train_sizes = extract_data('how2sign_train.csv')
test, test_sizes = extract_data('how2sign_test.csv')
val, val_sizes = extract_data('how2sign_val.csv')

print(f'Examples: {train[0]}, {train[1]}, {test[3]}, {test[5]}, {val[60]}, {val[100]}')
print(f'No. of tokens: train: {len(train)}, test: {len(test)}, val: {len(val)}')

common_tokens = list(set(train).intersection(set(test).intersection(val)))
all_tokens = list(set(train + val + test))
new_test_tokens = len(all_tokens) - len(train)

print()
print(f'No. of tokens when everything is merged: {len(all_tokens)}')
print(f'Common tokens: {len(common_tokens)}')
print(f'Additional tokens outside of train: {new_test_tokens} ({round(new_test_tokens/len(train) * 100, 2)}%)')
print()

data = {'train': train, 'test': test, 'val': val, 'all': all_tokens, 'common': common_tokens}
for key in data:
    with open(f'{key}.txt', 'w') as w:
        json.dump(data[key], w)
        print(f'Wrote {key} to {key}.txt')

Examples: q, voyage, eyes, granular, advance, just
No. of tokens: train: 15364, test: 3664, val: 3208

No. of tokens when everything is merged: 16212
Common tokens: 1670
Additional tokens outside of train: 848 (5.52%)

Wrote train to train.txt
Wrote test to test.txt
Wrote val to val.txt
Wrote all to all.txt
Wrote common to common.txt


In [62]:
!cp create_labels_from_csv.ipynb /home2/bipasha31/python_scripts/CurrentWork/SLP/utils

In [60]:
def extract_timestamps(path):
    def get_ts(x, idx):
        return float(x.split('\t')[-idx])
        
    content = open(path).read().splitlines()
    
    fps = 30
    
    content = np.array([int((get_ts(x, 2) - get_ts(x, 3))*fps) for x in content[1:]])
    print(f'content: {content}, shape: {content.shape}, min:{content.min():.2f}, max:{content.max():.2f}, mean: {content.mean():.2f}')
    
    return content

train = extract_timestamps('how2sign_train.csv')
test = extract_timestamps('how2sign_test.csv')
val = extract_timestamps('how2sign_val.csv')

print()
sizes = np.array(train_sizes)
print(f'sizes: {sizes}, shape: {sizes.shape}, min: {sizes.min():.2f}, max:{sizes.max():.2f}, mean: {sizes.mean():.2f}, std: {sizes.std():.2f}')

print()
bound = 10
valid_num = (sizes < bound).sum()
print(f'Valid rows below {bound} words: {valid_num} ({round((valid_num/len(sizes))*100, 2)}%)')
bounded_train = train[sizes<bound]
print(f'content: {bounded_train}, shape: {bounded_train.shape}, min: {bounded_train.min():.2f}, max: {bounded_train.max():.2f}, mean: {bounded_train.mean():.2f}')

print()
bound = 256
valid_num = (train < bound).sum()
print(f'Valid rows below {bound} frames: {valid_num} ({round((valid_num/len(train))*100, 2)}%)')
bounded_sizes = sizes[train<bound]
print(f'content: {bounded_sizes}, shape: {bounded_sizes.shape}, min: {bounded_sizes.min():.2f}, max: {bounded_sizes.max():.2f}, mean: {bounded_sizes.mean():.2f}')


content: [342 530 327 ... 150 159 251], shape: (31165,), min:3.00, max:2418.00, mean: 170.91
content: [ 10 247 239 ... 260 536 203], shape: (2357,), min:4.00, max:1118.00, mean: 173.00
content: [235  41 214 ... 101 120 213], shape: (1741,), min:3.00, max:1596.00, mean: 169.54

sizes: [20 39 24 ... 12 13 25], shape: (31165,), min: 1.00, max:195.00, mean: 17.69, std: 12.65

Valid rows below 10 words: 8690 (27.88%)
content: [199  14  47 ...  35  54  53], shape: (8690,), min: 3.00, max: 1087.00, mean: 59.67

Valid rows below 256 frames: 25073 (80.45%)
content: [15  9 14 ... 12 13 25], shape: (25073,), min: 1.00, max: 47.00, mean: 13.35


In [87]:
a = np.array([2, 3, 5, 6, 7, 10, 12, 15])

b = {}
min, max = a.min(), a.max()
p = -1
for i in range(max):
    if i+1 in a:
        p += 1
    b[i+1] = 0 if p == -1 else p

for key in sorted(b.keys()):
    print(f'{key}: {b[key]}')

1: 0
2: 0
3: 1
4: 1
5: 2
6: 3
7: 4
8: 4
9: 4
10: 5
11: 5
12: 6
13: 6
14: 6
15: 7
