## Imports

In [16]:
from signlens.params import *
from signlens.preprocessing import data, preprocess
from utils import plot_landmarks, model_utils

# reload automatically python functions outside notebook
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Test load_data_subset_csv

In [17]:
# Subset parameters for the unbalanced load (balanced=False), restricted to
# n_classes = 10 and n_frames = 100. The names are kept as notebook globals.
random_state, frac = 42, 1
balanced, noface = False, True
n_frames, n_classes = 100, 10

data1 = data.load_data_subset_csv(
    random_state=random_state,
    frac=frac,
    balanced=balanced,
    n_frames=n_frames,
    noface=noface,
    n_classes=n_classes,
)

[34mLoading data subset from train_train.csv[0m
    ℹ️ Filtered sequences with missing frames. Size reduced from 77228 to 77228 (100.00%)
    ℹ️ Filtered on n_frames = 100. Size reduced from 77228 to 68935 (89.26%)
    ℹ️ Filtered on n_classes = 10. Size reduced from 68935 to 2801 (4.06%)
✅ Loaded 2801 rows (3.63% of the original 77228 rows) from the dataset.


In [18]:
# Subset parameters for the balanced load (balanced=True), restricted to
# n_classes = 10 and n_frames = 100. The names are kept as notebook globals.
random_state, frac = 42, 1
balanced, noface = True, True
n_frames, n_classes = 100, 10

data1 = data.load_data_subset_csv(
    random_state=random_state,
    frac=frac,
    balanced=balanced,
    n_frames=n_frames,
    noface=noface,
    n_classes=n_classes,
)

[34mLoading data subset from train_train.csv[0m
    ℹ️ Filtered sequences with missing frames. Size reduced from 77228 to 77228 (100.00%)
    ℹ️ Filtered on n_frames = 100. Size reduced from 77228 to 68935 (89.26%)
    ℹ️ Filtered on n_classes = 10. Size reduced from 68935 to 2801 (4.06%)
    ⚠️ Total size smaller than requested, with 263 per sign instead of 280
    ℹ️ Balanced data, with average of 263.0 elements per class. Size reduced from 2801 to 2630 (93.90%)
✅ Loaded 2630 rows (3.41% of the original 77228 rows) from the dataset.


In [19]:
# Balanced load with n_classes = 10 and n_frames = 100.
# NOTE(review): these arguments (including random_state) are identical to the
# preceding balanced load. If this is meant as a reproducibility check, store
# the result under a different name and compare the two; otherwise the cell
# looks like an accidental duplicate -- confirm.
random_state, frac = 42, 1
balanced, noface = True, True
n_frames, n_classes = 100, 10

data1 = data.load_data_subset_csv(
    random_state=random_state,
    frac=frac,
    balanced=balanced,
    n_frames=n_frames,
    noface=noface,
    n_classes=n_classes,
)

[34mLoading data subset from train_train.csv[0m
    ℹ️ Filtered sequences with missing frames. Size reduced from 77228 to 77228 (100.00%)
    ℹ️ Filtered on n_frames = 100. Size reduced from 77228 to 68935 (89.26%)
    ℹ️ Filtered on n_classes = 10. Size reduced from 68935 to 2801 (4.06%)
    ⚠️ Total size smaller than requested, with 263 per sign instead of 280
    ℹ️ Balanced data, with average of 263.0 elements per class. Size reduced from 2801 to 2630 (93.90%)
✅ Loaded 2630 rows (3.41% of the original 77228 rows) from the dataset.


## Test glossary

In [20]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# imports cell at the top so all dependencies are visible in one place.
from signlens.preprocessing import glossary

# Show the glossary returned by load_glossary() as the cell output.
glossary.load_glossary()

Unnamed: 0,sign
0,book
1,drink
2,before
3,chair
4,go
...,...
233,wake
236,weus
247,yucky
248,zebra


## Test train_test_split

In [21]:
# Load a balanced 20% subset of train.csv (all 250 classes, n_frames = 100)
# and display the returned table as the cell output.
data.load_data_subset_csv(
    csv_path=TRAIN_CSV_PATH,
    frac=0.2,
    balanced=True,
    noface=False,
    n_classes=250,
    n_frames=100,
    random_state=42,
)

[34mLoading data subset from train.csv[0m
    ℹ File with frames already exists, loaded matching 'sequence_id' rows.
    ℹ️ Filtered sequences with missing frames. Size reduced from 94477 to 94461 (99.98%)
    ℹ️ Filtered on n_frames = 100. Size reduced from 94461 to 86168 (91.22%)
    ℹ️ Filtered on n_classes = 250. Size reduced from 86168 to 86168 (100.00%)
    ℹ️ Balanced data, with average of 68.9 elements per class. Size reduced from 86168 to 17233 (20.00%)
✅ Loaded 17233 rows (18.24% of the original 94477 rows) from the dataset.


Unnamed: 0,path,participant_id,sequence_id,sign,file_path,n_frames,n_frames2
0,train_landmark_files/2044/2194440747.parquet,2044,2194440747,balloon,/home/bfrisque/code/benoitfrisque/signlens/raw...,20,20
1,train_landmark_files/62590/2328635078.parquet,62590,2328635078,now,/home/bfrisque/code/benoitfrisque/signlens/raw...,8,8
2,train_landmark_files/30680/693401678.parquet,30680,693401678,can,/home/bfrisque/code/benoitfrisque/signlens/raw...,30,30
3,train_landmark_files/61333/3596198077.parquet,61333,3596198077,puzzle,/home/bfrisque/code/benoitfrisque/signlens/raw...,11,11
4,train_landmark_files/29302/1292281039.parquet,29302,1292281039,haveto,/home/bfrisque/code/benoitfrisque/signlens/raw...,40,40
...,...,...,...,...,...,...,...
17228,train_landmark_files/36257/3963006948.parquet,36257,3963006948,open,/home/bfrisque/code/benoitfrisque/signlens/raw...,10,10
17229,train_landmark_files/22343/33610143.parquet,22343,33610143,bee,/home/bfrisque/code/benoitfrisque/signlens/raw...,47,47
17230,train_landmark_files/18796/2519274145.parquet,18796,2519274145,flower,/home/bfrisque/code/benoitfrisque/signlens/raw...,95,95
17231,train_landmark_files/55372/1479664042.parquet,55372,1479664042,hot,/home/bfrisque/code/benoitfrisque/signlens/raw...,6,6


In [22]:
# Rebuild the train/test split. With force_rewrite=True the call logs that it
# overwrites the existing train and test data, so re-running this cell
# replaces the files that the cells below read.
data.unique_train_test_split(force_rewrite=True)

[31mForce rewrite is enabled. Overwriting the existing train and test data.[0m
[34m[1m
Creating unique test set with test_size = 0.2[0m
[34mLoading data subset from train.csv[0m
    ℹ File with frames already exists, loaded matching 'sequence_id' rows.
    ℹ️ Filtered sequences with missing frames. Size reduced from 94477 to 94461 (99.98%)
    ℹ️ Filtered on n_frames = 100. Size reduced from 94461 to 86168 (91.22%)
    ℹ️ Filtered on n_classes = 250. Size reduced from 86168 to 86168 (100.00%)
    ℹ️ Balanced data, with average of 68.9 elements per class. Size reduced from 86168 to 17233 (20.00%)
✅ Loaded 17233 rows (18.24% of the original 94477 rows) from the dataset.
[34m[1m
Creating training set[0m
[34mLoading data subset from train.csv[0m
    ℹ File with frames already exists, loaded matching 'sequence_id' rows.
    ℹ️ Filtered sequences with missing frames. Size reduced from 94477 to 94461 (99.98%)
    ℹ️ Filtered on n_classes = 250. Size reduced from 94461 to 94461 (100.00%)

In [23]:
# Load the whole test split (frac=1) without a class filter (n_classes=None)
# and display it.
test_data = data.load_data_subset_csv(
    csv_path=TRAIN_TEST_CSV_PATH,
    frac=1,
    n_classes=None,
)
test_data

[34mLoading data subset from train_test.csv[0m
    ℹ️ Filtered sequences with missing frames. Size reduced from 17233 to 17233 (100.00%)
    ℹ️ Filtered on n_frames = 100. Size reduced from 17233 to 17233 (100.00%)
✅ Loaded 17233 rows (100.00% of the original 17233 rows) from the dataset.


Unnamed: 0,path,participant_id,sequence_id,sign,n_frames,n_frames2,file_path
0,train_landmark_files_noface/30680/713038551.pa...,30680,713038551,home,32,32,/home/bfrisque/code/benoitfrisque/signlens/raw...
1,train_landmark_files_noface/30680/1759628346.p...,30680,1759628346,gum,15,15,/home/bfrisque/code/benoitfrisque/signlens/raw...
2,train_landmark_files_noface/29302/876462514.pa...,29302,876462514,balloon,24,24,/home/bfrisque/code/benoitfrisque/signlens/raw...
3,train_landmark_files_noface/36257/622978617.pa...,36257,622978617,bye,100,100,/home/bfrisque/code/benoitfrisque/signlens/raw...
4,train_landmark_files_noface/37055/4248049582.p...,37055,4248049582,cat,25,25,/home/bfrisque/code/benoitfrisque/signlens/raw...
...,...,...,...,...,...,...,...
17228,train_landmark_files_noface/37779/852339409.pa...,37779,852339409,finish,15,15,/home/bfrisque/code/benoitfrisque/signlens/raw...
17229,train_landmark_files_noface/26734/332985931.pa...,26734,332985931,gift,23,23,/home/bfrisque/code/benoitfrisque/signlens/raw...
17230,train_landmark_files_noface/36257/1678443086.p...,36257,1678443086,bye,18,18,/home/bfrisque/code/benoitfrisque/signlens/raw...
17231,train_landmark_files_noface/26734/3253596579.p...,26734,3253596579,shower,29,29,/home/bfrisque/code/benoitfrisque/signlens/raw...


## Test encoding

In [24]:
from signlens.preprocessing.preprocess import encode_labels, decode_labels

### Test with all labels

In [25]:
# Encode the labels of the full test split using all 250 classes, then
# display the encoded array.
n_classes = 250
test_data = data.load_data_subset_csv(
    csv_path=TRAIN_TEST_CSV_PATH,
    frac=1,
    n_classes=n_classes,
)
y = test_data["sign"]
y_encoded = encode_labels(y, num_classes=n_classes)
y_encoded

[34mLoading data subset from train_test.csv[0m
    ℹ️ Filtered sequences with missing frames. Size reduced from 17233 to 17233 (100.00%)
    ℹ️ Filtered on n_frames = 100. Size reduced from 17233 to 17233 (100.00%)
    ℹ️ Filtered on n_classes = 250. Size reduced from 17233 to 17233 (100.00%)
✅ Loaded 17233 rows (100.00% of the original 17233 rows) from the dataset.


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [26]:
# Shape of the encoded labels; the recorded run shows (rows loaded, n_classes).
y_encoded.shape

(17233, 250)

In [27]:
# Decode the encoded labels back into sign names.
# NOTE(review): `proba` is presumably the score of the selected class for each
# row -- confirm against decode_labels; it is not used in the visible cells.
y_decoded, proba = decode_labels(y_encoded)

In [28]:
# Preview only the first decoded labels: displaying the whole list (one entry
# per loaded row) floods the notebook without adding information. The full
# list is still compared against the original labels in the next cell.
y_decoded[:10]

['police',
 'open',
 'if',
 'pencil',
 'child',
 'grass',
 'noisy',
 'store',
 'donkey',
 'boat']

In [29]:
# Round-trip check: decoding the encoded labels must give back the original
# labels. The result is asserted, not just displayed, so that a regression
# stops a Restart & Run All instead of showing an easy-to-miss `False`.
labels_round_trip_ok = list(y) == y_decoded
assert labels_round_trip_ok, "decode_labels(encode_labels(y)) did not reproduce the original labels"
labels_round_trip_ok

True

### Test with fewer classes

In [30]:
# Encode the labels again with the test split restricted to 50 classes, then
# display the encoded array.
n_classes = 50
test_data = data.load_data_subset_csv(
    csv_path=TRAIN_TEST_CSV_PATH,
    frac=1,
    n_classes=n_classes,
)
y = test_data["sign"]
y_encoded = encode_labels(y, num_classes=n_classes)
y_encoded

[34mLoading data subset from train_test.csv[0m
    ℹ️ Filtered sequences with missing frames. Size reduced from 17233 to 17233 (100.00%)
    ℹ️ Filtered on n_frames = 100. Size reduced from 17233 to 17233 (100.00%)
    ℹ️ Filtered on n_classes = 50. Size reduced from 17233 to 3450 (20.02%)
✅ Loaded 3450 rows (20.02% of the original 17233 rows) from the dataset.


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [35]:
# Shape of the encoded labels; the recorded run shows (rows loaded, n_classes).
y_encoded.shape

(3450, 50)

In [36]:
# Decode the encoded labels of the reduced-class subset back into sign names.
# NOTE(review): `proba` is presumably the score of the selected class for each
# row -- confirm against decode_labels; it is not used in the visible cells.
y_decoded, proba = decode_labels(y_encoded)

In [38]:
# Round-trip check for the reduced-class subset: decoding the encoded labels
# must give back the original labels. The result is asserted, not just
# displayed, so that a regression stops a Restart & Run All.
labels_round_trip_ok = list(y) == y_decoded
assert labels_round_trip_ok, "decode_labels(encode_labels(y)) did not reproduce the original labels"
labels_round_trip_ok

True