## Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
import sklearn

import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation

from pathlib import Path
from IPython.display import Image, display, Video, HTML
from ipywidgets import interact, widgets

from signlens.params import *
from signlens.preprocessing import data, preprocess
from utils import plot_landmarks, model_utils

# Automatically reload imported project modules (e.g. signlens, utils) before each
# cell executes, so edits to their .py files take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

2024-03-23 12:14:29.892079: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Fetch data

In [177]:
# Subset-selection settings for the unbalanced load.
random_state = 42
frac = 1
balanced = False
n_frames = 100
noface = True
n_classes = 10

# Unbalanced subset: with balanced=False the loader log shows only the n_frames and
# n_classes filters being applied (no per-class balancing step).
# The result gets its own descriptive name so it is still available after the next
# cell, which re-assigns `data1`.
# NOTE(review): the loader's final summary line for this call reports 89.26% of the
# original rows, but 2801 / 77228 is about 3.63% -- check the percentage computed in
# data.load_data_subset_csv when balanced=False.
data_unbalanced = data.load_data_subset_csv(
    random_state=random_state,
    frac=frac,
    balanced=balanced,
    n_frames=n_frames,
    noface=noface,
    n_classes=n_classes,
)
data1 = data_unbalanced  # backward-compatible alias for code that still reads `data1`

[34mLoading data subset from train_train.csv[0m
    ℹ️ Filtered sequences with missing frames. Size reduced from 77228 to 77228 (100.00%)
    ℹ️ Filtered on n_frames = 100. Size reduced from 77228 to 68935 (89.26%)
    ℹ️ Filtered on n_classes = 10. Size reduced from 68935 to 2801 (4.06%)
✅ Loaded 2801 rows (89.26% of the original 77228 rows) from the dataset.


In [178]:
# Same subset-selection settings as the previous load, except balanced=True.
random_state = 42
frac = 1
balanced = True
n_frames = 100
noface = True
n_classes = 10

# Balanced subset: with balanced=True the loader log shows an extra balancing step
# that reduces the data to an equal number of sequences per sign.
# The result is bound to a descriptive name; `data1` is kept as an alias so that
# any later cell reading `data1` keeps seeing this (balanced) subset.
data_balanced = data.load_data_subset_csv(
    random_state=random_state,
    frac=frac,
    balanced=balanced,
    n_frames=n_frames,
    noface=noface,
    n_classes=n_classes,
)
data1 = data_balanced  # backward-compatible alias for code that still reads `data1`

[34mLoading data subset from train_train.csv[0m
    ℹ️ Filtered sequences with missing frames. Size reduced from 77228 to 77228 (100.00%)
    ℹ️ Filtered on n_frames = 100. Size reduced from 77228 to 68935 (89.26%)
    ℹ️ Filtered on n_classes = 10. Size reduced from 68935 to 2801 (4.06%)
    ⚠️ Total size smaller than requested, with 263 per sign instead of 280
    ℹ️ Balanced data, with average of 263.0 elements per class. Size reduced from 2801 to 2630 (93.90%)
✅ Loaded 2630 rows (3.41% of the original 77228 rows) from the dataset.


In [144]:
# NOTE(review): `data2` is not assigned in any cell above; this cell was executed
# at In [144], earlier than the loads at In [177]/[178], so it relies on leftover
# kernel state and will raise NameError on Restart Kernel -> Run All.
# TODO: restore the cell that defines `data2`, or remove this cell.
data2

Unnamed: 0,path,participant_id,sequence_id,sign,n_frames,n_frames2,file_path
0,train_landmark_files/29302/3139450686.parquet,29302,3139450686,book,6,6,/home/bfrisque/code/benoitfrisque/signlens/raw...
1,train_landmark_files/18796/4028026583.parquet,18796,4028026583,book,6,6,/home/bfrisque/code/benoitfrisque/signlens/raw...
2,train_landmark_files/28656/3236765669.parquet,28656,3236765669,book,203,203,/home/bfrisque/code/benoitfrisque/signlens/raw...
3,train_landmark_files/2044/3779884319.parquet,2044,3779884319,book,11,11,/home/bfrisque/code/benoitfrisque/signlens/raw...
4,train_landmark_files/49445/1531731799.parquet,49445,1531731799,book,40,40,/home/bfrisque/code/benoitfrisque/signlens/raw...
...,...,...,...,...,...,...,...
12145,train_landmark_files/61333/1185031115.parquet,61333,1185031115,red,10,10,/home/bfrisque/code/benoitfrisque/signlens/raw...
12146,train_landmark_files/34503/1281552174.parquet,34503,1281552174,red,11,11,/home/bfrisque/code/benoitfrisque/signlens/raw...
12147,train_landmark_files/29302/3975162469.parquet,29302,3975162469,red,28,28,/home/bfrisque/code/benoitfrisque/signlens/raw...
12148,train_landmark_files/18796/2763858681.parquet,18796,2763858681,red,20,20,/home/bfrisque/code/benoitfrisque/signlens/raw...


In [101]:
# NOTE(review): this import would normally live in the import cell at the top of
# the notebook; it is kept here so the cell still runs on its own.
from signlens.preprocessing import glossary

# Load the sign glossary, then show it as this cell's output.
sign_glossary = glossary.load_glossary()
sign_glossary

Unnamed: 0,sign
0,book
1,drink
2,before
3,chair
4,go
...,...
233,wake
236,weus
247,yucky
248,zebra


In [179]:
# Build the unique test set and the training set via the project's data module
# (the log below reports "Creating unique test set" and "Creating training set").
# NOTE(review): force_rewrite=True presumably regenerates and overwrites any existing
# split files on every run -- confirm that is intended before re-running this cell.
# NOTE(review): the load cells above read train_train.csv, which looks like an output
# of this split; if so, this cell must run before them on a fresh kernel -- TODO confirm.
data.unique_train_test_split(force_rewrite=True)

[34m[1m
Creating unique test set with test_size = 0.2[0m
[34mLoading data subset from train.csv[0m
    ℹ File with frames already exists, loaded matching 'sequence_id' rows.
    ℹ️ Filtered sequences with missing frames. Size reduced from 94477 to 94461 (99.98%)
    ℹ️ Filtered on n_frames = 100. Size reduced from 94461 to 86168 (91.22%)
    ℹ️ Filtered on n_classes = 250. Size reduced from 86168 to 86168 (100.00%)
    ℹ️ Balanced data, with average of 68.9 elements per class. Size reduced from 86168 to 17233 (20.00%)
✅ Loaded 17233 rows (18.24% of the original 94477 rows) from the dataset.
[34m[1m
Creating training set[0m
[34mLoading data subset from train.csv[0m
    ℹ File with frames already exists, loaded matching 'sequence_id' rows.
    ℹ️ Filtered sequences with missing frames. Size reduced from 94477 to 94461 (99.98%)
    ℹ️ Filtered on n_classes = 250. Size reduced from 94461 to 94461 (100.00%)
✅ Loaded 94461 rows (99.98% of the original 94477 rows) from the dataset.
