In [1]:
import pandas as pd
from pathlib import Path

from signlens.params import *
from signlens.preprocessing import data, preprocess
from utils import plot_landmarks


# reload automatically python functions outside notebook
%load_ext autoreload
%autoreload 2

2024-03-23 08:44:13.589711: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Data from landmarks

In [27]:
data_landmarks = data.load_data_subset_csv(frac=1, n_classes=250, n_frames=None, noface=False, csv_path=TRAIN_CSV_PATH)

[34mLoading data subset from train.csv[0m
✅ File with frames already exists, loaded matching 'sequence_id' rows.
✅ Filtered sequences with missing frames. Size reduced from 94477 to 94461 (99.98%)
✅ Filtered on n_classes = 250. Size reduced from 94461 to 94461 (100.00%)
✅ Loaded 94461 rows (99.98% of the original 94477 rows) from the dataset.


In [28]:
gloss_landmarks = pd.DataFrame(data_landmarks['sign'].unique())\
            .rename(columns={0:'sign'})
gloss_landmarks['sign_lower'] = gloss_landmarks['sign'].str.lower()
gloss_landmarks.sort_values(by='sign', inplace=True, ignore_index=True)
gloss_landmarks


Unnamed: 0,sign,sign_lower
0,TV,tv
1,after,after
2,airplane,airplane
3,all,all
4,alligator,alligator
...,...,...
245,yesterday,yesterday
246,yourself,yourself
247,yucky,yucky
248,zebra,zebra


## Data from videos

In [29]:
WLASL_DIR = Path('../../raw_data/WLASL')
WLASL_JSON_PATH = WLASL_DIR/'WLASL_v0.3.json'
WLASL_VIDEO_DIR = WLASL_DIR/'videos'

data_videos = pd.read_json(WLASL_JSON_PATH)

In [31]:
gloss_videos = pd.DataFrame(data_videos['gloss'].unique())\
            .rename(columns={0:'sign_videos'})
gloss_videos['sign_videos_lower'] = gloss_videos['sign_videos'].str.lower()
gloss_videos


Unnamed: 0,sign_videos,sign_videos_lower
0,book,book
1,drink,drink
2,computer,computer
3,before,before
4,chair,chair
...,...,...
1995,washington,washington
1996,waterfall,waterfall
1997,weigh,weigh
1998,wheelchair,wheelchair


## Merge

In [39]:
# Merge df with videos and df from landmarks to take only the matchin elements (the sign videos should be sorted by most used asl signs)
merged_df = gloss_videos.merge(gloss_landmarks, left_on='sign_videos_lower', right_on='sign_lower').reset_index()
most_used_signs = merged_df.sign
most_used_signs

0        book
1       drink
2      before
3       chair
4          go
        ...  
196      gift
197      lamp
198     penny
199     stuck
200        TV
Name: sign, Length: 201, dtype: object

In [43]:
# take the missing signs from the landmarks glossary to append them
missing_signs = gloss_landmarks[~gloss_landmarks.sign.isin(most_used_signs)].sign
missing_signs.head()

12       backyard
34    callonphone
41          cheek
43           chin
51         cowboy
Name: sign, dtype: object

In [44]:
missing_signs.shape

(49,)

In [45]:
all_signs = pd.concat([most_used_signs, missing_signs])
all_signs

0        book
1       drink
2      before
3       chair
4          go
        ...  
233      wake
236      weus
247     yucky
248     zebra
249    zipper
Name: sign, Length: 250, dtype: object

In [46]:
glossary_csv_path = TRAIN_DATA_DIR + '/glossary.csv'
glossary_csv_path 

'/home/bfrisque/code/benoitfrisque/signlens/raw_data/asl-signs/glossary.csv'

In [51]:
all_signs.to_csv(glossary_csv_path, index=True)

In [54]:
pd.read_csv(glossary_csv_path, index_col=0)

Unnamed: 0,sign
0,book
1,drink
2,before
3,chair
4,go
...,...
233,wake
236,weus
247,yucky
248,zebra
