In [1]:
# This file is a modified version of the original:
# https://www.kaggle.com/code/darraghdog/asl-fingerspelling-preprocessing-train

In [2]:
from tqdm import tqdm
import multiprocessing as mp
import pandas as pd
import numpy as np
import os
import shutil
import argparse
import json

In [3]:
class args:
    """Static settings for this notebook, read as class attributes (``args.<name>``)."""

    # --- inputs ---
    # CSV that is loaded into the `train` frame.
    train_df = '/kaggle/input/asl-fingerspelling/train.csv'
    # Directory holding one `<file_id>.parquet` landmark file per file id.
    input_dir = '/kaggle/input/asl-fingerspelling/train_landmarks'

    # --- outputs ---
    # Root folder that receives the exported .npy files (note the trailing slash).
    output_dir = './train_landmarks_npy/'

    # --- execution ---
    # Number of worker processes handed to the multiprocessing pool.
    n_cores = 4

In [5]:
# Load the training index CSV. As the sample below shows, each row pairs a
# sequence_id with the parquet file (path / file_id) that stores it, plus the
# participant_id and the target phrase.
train = pd.read_csv(args.train_df)
# train.head() => 
#   	path                            	file_id 	sequence_id 	participant_id  	phrase
# 0 	train_landmarks/5414471.parquet 	5414471 	1816796431  	217             	3 creekhouse
# 1 	train_landmarks/5414471.parquet 	5414471 	1816825349  	107             	scales/kuhaylah
# 2 	train_landmarks/5414471.parquet 	5414471 	1816909464  	1               	1383 william lanier
# 3 	train_landmarks/5414471.parquet 	5414471 	1816967051  	63              	988 franklin lane
# 4 	train_landmarks/5414471.parquet 	5414471 	1817123330  	89              	6920 northeast 661st road

In [None]:

# Full landmark column list, in the order the index constants below rely on:
#   face_0..face_467             -> positions   0..467
#   left_hand_0..left_hand_20    -> positions 468..488
#   pose_0..pose_32              -> positions 489..521
#   right_hand_0..right_hand_20  -> positions 522..542
all_cols = np.array([
    f'{prefix}_{i}'
    for prefix, count in (('face', 468), ('left_hand', 21), ('pose', 33), ('right_hand', 21))
    for i in range(count)
])
# len(all_cols) => 543


# ---- landmark index groups (every value is a position inside all_cols) ----

NOSE = [1, 2, 98, 327]
LNOSE = [98]
RNOSE = [327]

LIP = [
    0, 61, 185, 40, 39, 37, 267, 269,
    270, 409, 291, 146, 91, 181, 84, 17,
    314, 405, 321, 375, 78, 191, 80, 81,
    82, 13, 312, 311, 310, 415, 95, 88,
    178, 87, 14, 317, 402, 318, 324, 308,
]
LLIP = [
    84, 181, 91, 146, 61, 185, 40, 39, 37,
    87, 178, 88, 95, 78, 191, 80, 81, 82,
]
RLIP = [
    314, 405, 321, 375, 291, 409, 270, 269, 267,
    317, 402, 318, 324, 308, 415, 310, 311, 312,
]

POSE = [500, 502, 504, 501, 503, 505, 512, 513]
LPOSE = [513, 505, 503, 501]
RPOSE = [512, 504, 502, 500]

LARMS = [501, 503, 505, 507, 509, 511]
RARMS = [500, 502, 504, 506, 508, 510]

REYE = [
    33, 7, 163, 144, 145, 153, 154, 155,
    133, 246, 161, 160, 159, 158, 157, 173,
]
LEYE = [
    263, 249, 390, 373, 374, 380, 381, 382,
    362, 466, 388, 387, 386, 385, 384, 398,
]

# Both hands are kept whole: all 21 left_hand_* / right_hand_* positions.
LHAND = list(range(468, 489))
RHAND = list(range(522, 543))

# Landmarks that get exported, concatenated group by group in this fixed order.
# Group sizes (LIP, LHAND, RHAND, NOSE, REYE, LEYE, LARMS, RARMS) =>
#   40, 21, 21, 4, 16, 16, 6, 6
POINT_LANDMARKS = [
    idx
    for group in (LIP, LHAND, RHAND, NOSE, REYE, LEYE, LARMS, RARMS)
    for idx in group
]
# sorted(POINT_LANDMARKS) => [0, 1, 2, 7, 13, 14, 17, 33, 37, 39, 40, 61, 78, 80, 81, ..., 538, 539, 540, 541, 542]

# Names of the kept landmarks, in POINT_LANDMARKS order.
kept_cols = all_cols[POINT_LANDMARKS]

n_landmarks = kept_cols.shape[0]
# n_landmarks => 130

# Coordinate columns to export: every x_* name first, then every y_*, then every z_*.
kept_cols_xyz = np.array([f'{axis}_{name}' for axis in ('x', 'y', 'z') for name in kept_cols])


# Distinct file ids in order of first appearance in the training index.
file_ids = pd.unique(train['file_id'])

# Root folder for everything this notebook writes.
TARGET_FOLDER = args.output_dir

In [6]:
# Show the distinct file ids; each one is later used to locate `<file_id>.parquet`.
# len(file_ids) => 68
file_ids

array([   5414471,  105143404,  128822441,  149822653,  152029243,
        169560558,  175396851,  234418913,  296317215,  349393104,
        388576474,  425182931,  433948159,  450474571,  474255203,
        495378749,  522550314,  527708222,  532011803,  546816846,
        566963657,  568753759,  614661748,  638508439,  649779897,
        654436541,  683666742,  871280215,  882979387,  933868835,
        939623093, 1019715464, 1021040628, 1098899348, 1099408314,
       1133664520, 1134756332, 1255240050, 1320204318, 1341528257,
       1358493307, 1365275733, 1365772051, 1405046009, 1448136004,
       1497621680, 1552432300, 1557244878, 1562234637, 1643479812,
       1647220008, 1662742697, 1664666588, 1726141437, 1785039512,
       1865557033, 1880177496, 1905462118, 1906357076, 1920330615,
       1967755728, 1969985709, 1997878546, 2026717426, 2036580525,
       2072296290, 2072876091, 2118949241])

In [14]:
def do_one(file_id):
    """Export every sequence of one landmark parquet file as its own .npy array.

    Reads ``{args.input_dir}/{file_id}.parquet``, keeps only the columns listed
    in ``kept_cols_xyz`` and writes one array per ``sequence_id`` to
    ``<TARGET_FOLDER>/<file_id>/<sequence_id>.npy``.

    Args:
        file_id: Identifier of the parquet file to convert; it also names the
            output sub-folder.

    Returns:
        None. The function is called for its side effect of writing files.
    """
    # os.path.join keeps the paths valid whether or not TARGET_FOLDER ends
    # with a path separator.
    out_dir = os.path.join(TARGET_FOLDER, str(file_id))
    os.makedirs(out_dir, exist_ok=True)

    # reset_index() makes `sequence_id` an ordinary column, so afterwards:
    # df.columns =>
    # Index(['sequence_id', 'frame', 'x_face_0', 'x_face_1', 'x_face_2', 'x_face_3',
    #        ...
    #        'z_right_hand_19', 'z_right_hand_20'],
    #       dtype='object', length=1631)
    df = pd.read_parquet(f'{args.input_dir}/{file_id}.parquet').reset_index()

    # A single groupby pass replaces one full boolean-mask scan per sequence.
    # sort=False yields the sequences in order of first appearance and keeps
    # the row order inside each sequence.
    for sequence_id, df_seq in df.groupby('sequence_id', sort=False):
        vals = df_seq[kept_cols_xyz].values
        # vals.shape => (n_rows_of_sequence, 390)   (130 landmarks * 3 axes)
        np.save(os.path.join(out_dir, f'{sequence_id}.npy'), vals)

In [6]:
# do_one(5414471)

In [12]:
# Create the output root before any worker writes into it. exist_ok=True makes
# the call safe to repeat and avoids the check-then-create race of a separate
# os.path.exists() test.
os.makedirs(args.output_dir, exist_ok=True)
#shutil.copy(args.train_df, args.output_dir + '../')
#shutil.copy('/kaggle/input/asl-fingerspelling/character_to_prediction_index.json', args.output_dir + '../')

In [15]:
# multiprocessing.freeze_support()
# Convert all parquet files with a pool of args.n_cores worker processes;
# tqdm advances as results come back in file_ids order.
with mp.Pool(args.n_cores) as p:
    res = list(tqdm(p.imap(do_one,file_ids), total=len(file_ids)))

# Store the exported column order next to the data, once as JSON and once as a
# numpy array. os.path.join keeps both paths valid whether or not TARGET_FOLDER
# ends with a path separator.
selected_columns_dict = {"selected_columns": kept_cols_xyz.tolist()}

with open(os.path.join(TARGET_FOLDER, 'inference_args.json'), "w") as f:
    json.dump(selected_columns_dict, f)

np.save(os.path.join(TARGET_FOLDER, 'columns.npy'), kept_cols_xyz)

  0%|          | 0/68 [00:36<?, ?it/s]


KeyboardInterrupt: 