## Configuration

In [54]:
class CFG:
    """Settings shared by the loading, fold-splitting and TFRecord-writing cells."""

    # --- input location -----------------------------------------------------
    # Prefix prepended to train.csv, the label-map JSON and every parquet path.
    data_path = ""

    # --- parquet layout -----------------------------------------------------
    # Number of landmark rows stored for each frame.
    ROWS_PER_FRAME = 543

    # --- cross-validation ---------------------------------------------------
    # Number of folds produced by the splitter.
    n_fold = 5
    # Truthy: stratify on sign and group by participant_id; falsy: stratify on sign only.
    fold_group = 1

    # --- TFRecord sharding --------------------------------------------------
    # Number of examples per TFRecord shard; also embedded in the shard filename.
    sequence_length = 512
    

## Import Libraries

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm import tqdm
import json
import os

from sklearn.model_selection import StratifiedGroupKFold, StratifiedKFold

## Utilities

In [56]:


def load_relevant_data_subset_with_imputation(pq_path, fill_value=1):
    """Load the x/y/z landmark coordinates of one parquet file with NaNs filled in.

    Args:
        pq_path: Path of a landmark parquet file holding CFG.ROWS_PER_FRAME
            consecutive rows per frame.
        fill_value: Number substituted for missing coordinates. Defaults to 1,
            the value this notebook used before the parameter existed.

    Returns:
        np.ndarray of dtype float32 and shape (n_frames, CFG.ROWS_PER_FRAME, 3).

    Raises:
        ValueError: If the number of rows is not a multiple of CFG.ROWS_PER_FRAME.
    """
    data_columns = ['x', 'y', 'z']
    data = pd.read_parquet(pq_path, columns=data_columns)
    # pd.read_parquet(pq_path).head() =>
    #    frame     row_id  type  landmark_index         x         y         z
    # 0     20  20-face-0  face               0  0.494400  0.380470 -0.030626
    # 1     20  20-face-1  face               1  0.496017  0.350735 -0.057565
    # 2     20  20-face-2  face               2  0.500818  0.359343 -0.030283

    # Integer arithmetic instead of int(len / rows): no float round-trip, and a
    # truncated file is reported here instead of as an opaque reshape error.
    n_frames, leftover = divmod(len(data), CFG.ROWS_PER_FRAME)
    if leftover:
        raise ValueError(
            f"{pq_path}: {len(data)} rows is not a multiple of "
            f"ROWS_PER_FRAME={CFG.ROWS_PER_FRAME}"
        )
    # e.g. len(data), n_frames => 12489, 23

    # Non-mutating fill; the original used replace(..., inplace=True).
    data = data.fillna(fill_value)

    # (n_rows, 3) DataFrame -> (n_frames, ROWS_PER_FRAME, 3) ndarray, e.g. (23, 543, 3)
    coords = data.values.reshape(n_frames, CFG.ROWS_PER_FRAME, len(data_columns))
    return coords.astype(np.float32)

def load_relevant_data_subset(pq_path):
    """Load the x/y/z landmark coordinates of one parquet file, leaving NaNs as-is.

    Args:
        pq_path: Path of a landmark parquet file holding CFG.ROWS_PER_FRAME
            consecutive rows per frame.

    Returns:
        np.ndarray of dtype float32 and shape (n_frames, CFG.ROWS_PER_FRAME, 3).

    Raises:
        ValueError: If the number of rows is not a multiple of CFG.ROWS_PER_FRAME.
    """
    data_columns = ['x', 'y', 'z']
    data = pd.read_parquet(pq_path, columns=data_columns)

    # Integer arithmetic instead of int(len / rows): no float round-trip, and a
    # truncated file is reported here instead of as an opaque reshape error.
    n_frames, leftover = divmod(len(data), CFG.ROWS_PER_FRAME)
    if leftover:
        raise ValueError(
            f"{pq_path}: {len(data)} rows is not a multiple of "
            f"ROWS_PER_FRAME={CFG.ROWS_PER_FRAME}"
        )

    coords = data.values.reshape(n_frames, CFG.ROWS_PER_FRAME, len(data_columns))
    return coords.astype(np.float32)

def read_dict(file_path):
    """Read a JSON file and return the decoded object.

    Args:
        file_path: Path of the JSON file; a leading "~" is expanded to the
            user's home directory.

    Returns:
        The object produced by json.load (a dict for the label map used in
        this notebook).
    """
    path = os.path.expanduser(file_path)
    # Explicit UTF-8: without it open() falls back to the platform's locale
    # encoding, which can mis-decode non-ASCII JSON on some systems.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

## Load data

In [8]:
# Read the training index and the sign -> class-id map that ship with the dataset.
train = pd.read_csv(f"{CFG.data_path}train.csv")
raw_label_map = read_dict(f"{CFG.data_path}sign_to_prediction_index_map.json")
# raw_label_map => {'TV': 0, 'after': 1, 'airplane': 2, 'all': 3, 'alligator': 4, ...., 'zebra': 248, 'zipper': 249}

# Lower-case the sign names so lookups do not depend on capitalisation,
# then build the reverse (class-id -> sign) table from the lower-cased keys.
label_index = {name.lower(): idx for name, idx in raw_label_map.items()}
index_label = {idx: name for name, idx in label_index.items()}
# index_label => {0: 'tv', 1: 'after', 2: 'airplane', 3: 'all', 4: 'alligator', ...., 248: 'zebra', 249: 'zipper'}

train["label"] = train["sign"].map(lambda name: label_index[name.lower()])
print(train.shape)
# train.head() =>
#                                             path  participant_id  sequence_id  sign  label
# 0  train_landmark_files/26734/1000035562.parquet           26734   1000035562  blow     25
# 1  train_landmark_files/28656/1000106739.parquet           28656   1000106739  wait    232
# 2   train_landmark_files/16069/100015657.parquet           16069    100015657  cloud    48

# Choose the splitter: grouped by participant_id, or stratified on the sign only.
if CFG.fold_group:
    print('FOLD SPLIT USING GROUPS')
    split = StratifiedGroupKFold(n_splits=CFG.n_fold, shuffle=True, random_state=42)
    fold_groups = train.participant_id
else:
    print('FOLD SPLIT ONLY ON SIGN')
    split = StratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=42)
    fold_groups = None

# Each row receives the number of the fold in which it is held out.
for k, (_, test_idx) in enumerate(split.split(train, train.sign, groups=fold_groups)):
    train.loc[test_idx, 'fold'] = k

# The column is created by partial .loc assignment, so cast it back to integers.
train["fold"] = train["fold"].astype(int)
display(train.groupby('fold').size())

(94477, 5)
FOLD SPLIT USING GROUPS


fold
0    23866
1    16691
2    18582
3    16229
4    19109
dtype: int64

In [10]:
# Preview the first rows to confirm that the `label` and `fold` columns were added.
train.head()

Unnamed: 0,path,participant_id,sequence_id,sign,label,fold
0,train_landmark_files/26734/1000035562.parquet,26734,1000035562,blow,25,0
1,train_landmark_files/28656/1000106739.parquet,28656,1000106739,wait,232,2
2,train_landmark_files/16069/100015657.parquet,16069,100015657,cloud,48,2
3,train_landmark_files/25571/1000210073.parquet,25571,1000210073,bird,23,1
4,train_landmark_files/62590/1000240708.parquet,62590,1000240708,owie,164,3


## Create TF-Record

In [42]:
def create_record(coo, sn, pid, sid):
    """Serialize one sample into the bytes of a tf.train.Example.

    Args:
        coo: Raw bytes stored under the "coordinates" feature.
        sn: Integer stored under the "sign" feature.
        pid: Integer stored under the "participant_id" feature.
        sid: Integer stored under the "sequence_id" feature.

    Returns:
        The serialized tf.train.Example as a bytes string.
    """
    def _int64_feature(value):
        # Wrap a single integer in an Int64List feature.
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

    feature_map = {
        "coordinates": tf.train.Feature(bytes_list=tf.train.BytesList(value=[coo])),
        "sign": _int64_feature(sn),
        "participant_id": _int64_feature(pid),
        "sequence_id": _int64_feature(sid),
    }
    example = tf.train.Example(features=tf.train.Features(feature=feature_map))
    return example.SerializeToString()
    

def decode_tfrec(record_bytes):
    """Parse one serialized example back into coordinates and sign.

    Only the 'coordinates' and 'sign' features are read; any other feature
    stored in the record is not returned.

    Args:
        record_bytes: A serialized tf.train.Example.

    Returns:
        dict with
          'coordinates': float32 tensor reshaped to (-1, CFG.ROWS_PER_FRAME, 3)
          'sign': int64 scalar tensor
    """
    feature_spec = {
        'coordinates': tf.io.FixedLenFeature([], tf.string),
        'sign': tf.io.FixedLenFeature([], tf.int64),
    }
    parsed = tf.io.parse_single_example(record_bytes, feature_spec)

    # The coordinates were written as a flat float32 byte buffer.
    flat_coords = tf.io.decode_raw(parsed['coordinates'], tf.float32)
    return {
        'coordinates': tf.reshape(flat_coords, (-1, CFG.ROWS_PER_FRAME, 3)),
        'sign': parsed['sign'],
    }

In [86]:
# Write every fold into GZIP-compressed TFRecord shards of at most
# CFG.sequence_length examples each.
records_dir = "islr_records"
os.makedirs(records_dir, exist_ok=True)  # make sure the output directory exists before opening writers
writer_options = tf.io.TFRecordOptions(compression_type="GZIP")

for fold in train.fold.unique():
    df = train[train.fold == fold]
    # Ceiling division: flooring here silently dropped the last
    # len(df) % CFG.sequence_length rows of every fold.
    lps = (len(df) + CFG.sequence_length - 1) // CFG.sequence_length

    for k in range(lps):
        start = k * CFG.sequence_length
        stop = min(start + CFG.sequence_length, len(df))
        # Full shards keep the "-<CFG.sequence_length>" suffix; a trailing
        # partial shard is labelled with the number of records it really holds.
        save_path = f"{records_dir}/fold{fold}-{k}-{stop - start}.tfrecords"

        # tf.io.TFRecordWriter writes records to a TFRecords file, generally
        # consumed later through tf.data.
        with tf.io.TFRecordWriter(save_path, options=writer_options) as file_writer:
            for i in tqdm(range(start, stop)):
                row = df.iloc[i]  # look the row up once instead of once per field
                path = f"{CFG.data_path}{row.path}"
                # NaNs are stored as-is; use load_relevant_data_subset_with_imputation
                # here to write NaN-free coordinates instead.
                coordinates = load_relevant_data_subset(path)
                # coordinates.shape => (n_frames, 543, 3), e.g. (23, 543, 3);
                # flattened so the raw float32 buffer can be stored as bytes.
                coordinates = coordinates.reshape(-1)
                sign = int(row.label)
                file_writer.write(
                    create_record(
                        coordinates.tobytes(),
                        sign,
                        int(row.participant_id),
                        int(row.sequence_id),
                    )
                )

100%|████████████████████████████████████████████████████████████████████████████████| 512/512 [00:12<00:00, 42.00it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 512/512 [00:16<00:00, 30.26it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 512/512 [00:18<00:00, 28.00it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 512/512 [00:17<00:00, 29.08it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 512/512 [00:17<00:00, 29.67it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 512/512 [00:16<00:00, 30.35it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 512/512 [00:16<00:00, 30.36it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 512/512 [00:16<00:00, 31.50it/s]
100%|███████████████████████████████████

100%|████████████████████████████████████████████████████████████████████████████████| 512/512 [00:18<00:00, 27.83it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 512/512 [00:18<00:00, 27.42it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 512/512 [00:20<00:00, 25.18it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 512/512 [00:18<00:00, 27.47it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 512/512 [00:18<00:00, 27.83it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 512/512 [00:18<00:00, 27.70it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 512/512 [00:18<00:00, 27.59it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 512/512 [00:19<00:00, 26.94it/s]
100%|███████████████████████████████████

In [85]:
# Summarize the column dtypes and non-null counts of the fold-annotated frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94477 entries, 0 to 94476
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   path            94477 non-null  object
 1   participant_id  94477 non-null  int64 
 2   sequence_id     94477 non-null  int64 
 3   sign            94477 non-null  object
 4   label           94477 non-null  int64 
 5   fold            94477 non-null  int32 
dtypes: int32(1), int64(3), object(2)
memory usage: 4.0+ MB
