In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from tqdm.notebook import tqdm
import os
import json
import torch.nn as nn
import torch


# Kaggle competition slug; the download step and the zip file name are derived from it.
competition = "asl-signs"

# Apply the ggplot look to every matplotlib figure drawn in this notebook.
plt.style.use('ggplot')

In [2]:
# Resolve the directory holding the competition files.
# On Kaggle (detected through KAGGLE_KERNEL_RUN_TYPE) the data is read from ../input/<competition>;
# anywhere else it is downloaded once into ~/.data/<competition> and unzipped in place.
iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
if iskaggle:
    # Use the competition slug itself, not the literal string 'competition'.
    path = Path('..') / 'input' / competition
else:
    # Imported lazily: these are only needed for the off-Kaggle download.
    import zipfile
    import kaggle

    path = Path.home() / '.data' / competition
    if not path.exists():
        # parents=True so a missing ~/.data directory does not abort the first run.
        path.mkdir(parents=True, exist_ok=True)
        kaggle.api.competition_download_cli(competition, path=path)
        # Context manager guarantees the archive handle is closed after extraction.
        with zipfile.ZipFile(path / f'{competition}.zip') as archive:
            archive.extractall(path)

# Labeled Data

In [3]:
# Mapping from sign name to the integer class index expected by the competition.
sign_labels = json.loads((path / 'sign_to_prediction_index_map.json').read_text())

In [4]:
# Sequence-level metadata, indexed by "<participant_id>_<sequence_id>" (index name: idx).
train = pd.read_csv(path / 'train_with_meta.csv')
train = train.set_index(
    (train['participant_id'].astype(str) + '_' + train['sequence_id'].astype(str)).rename('idx')
)
train.head()

Unnamed: 0_level_0,path,participant_id,sequence_id,sign,cnt_partial_nulls,cnt_partial_nulls_by_frame,total_frames,face,left_hand,pose,right_hand
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
26734_1000035562,train_landmark_files/26734/1000035562.parquet,26734,1000035562,blow,0.0,0.0,23.0,23.0,0.0,23.0,11.0
28656_1000106739,train_landmark_files/28656/1000106739.parquet,28656,1000106739,wait,0.0,0.0,11.0,11.0,0.0,11.0,2.0
16069_100015657,train_landmark_files/16069/100015657.parquet,16069,100015657,cloud,0.0,0.0,105.0,105.0,28.0,105.0,0.0
25571_1000210073,train_landmark_files/25571/1000210073.parquet,25571,1000210073,bird,0.0,0.0,12.0,12.0,0.0,12.0,12.0
62590_1000240708,train_landmark_files/62590/1000240708.parquet,62590,1000240708,owie,0.0,0.0,18.0,18.0,0.0,18.0,18.0


# Competition Data Loader

In [5]:
ROWS_PER_FRAME = 543  # number of landmarks per frame


def load_relevant_data_subset(pq_path):
    """Read the x/y/z columns of a landmark parquet file as a 3-D float32 array.

    Parameters
    ----------
    pq_path : path-like
        Location of the parquet file, passed straight to ``pd.read_parquet``.

    Returns
    -------
    np.ndarray
        Array of shape ``(n_frames, ROWS_PER_FRAME, 3)`` with dtype float32, where
        ``n_frames`` is the row count divided by ``ROWS_PER_FRAME`` (rounded down).
    """
    data_columns = ['x', 'y', 'z']
    coords = pd.read_parquet(pq_path, columns=data_columns).to_numpy()
    n_frames = len(coords) // ROWS_PER_FRAME
    frames = coords.reshape(n_frames, ROWS_PER_FRAME, len(data_columns))
    return frames.astype(np.float32)

# Find the first record whose first frame has only null values for both the left and the right hand

In [142]:
# Scan the training records in order and stop at the first one whose first frame
# has no coordinates at all for either hand. `record_id` stays None when no record
# qualifies, instead of silently holding the id of the last record inspected.
n = len(train)
record_id = None
for idx, rel_path in tqdm(train['path'].items(), total=n):
    # Only these columns are needed for the check; skipping the rest keeps each read cheap.
    frames = pd.read_parquet(path / rel_path, columns=['frame', 'type', 'x', 'y', 'z'])

    first_frame_mask = frames['frame'] == frames['frame'].min()
    hands_mask = frames['type'].isin(['left_hand', 'right_hand'])
    first_frame_hands = frames.loc[first_frame_mask & hands_mask, ['x', 'y', 'z']]

    if first_frame_hands.isna().all().all():
        # The index of `train` is already "<participant_id>_<sequence_id>".
        record_id = idx
        break

record_id

  0%|          | 0/94477 [00:00<?, ?it/s]

'49445_1001499433'

# Data Loader

In [10]:
class ASLRecord:
    """One training sequence together with its landmark preprocessing steps.

    Parameters
    ----------
    record_id : str
        Index label of the sequence in the notebook-level ``train`` frame
        ("<participant_id>_<sequence_id>").

    Attributes
    ----------
    record_id : str
        The id passed to the constructor.
    record : pd.Series
        Metadata row for the sequence, looked up in ``train``.
    landmarks : pd.DataFrame
        Raw landmark rows read from the parquet file named in ``record['path']``.
    """

    def __init__(self, record_id):
        self.record_id = record_id
        record = train.loc[self.record_id, :]
        # A duplicated index label would yield a frame; keep the first matching row.
        if isinstance(record, pd.DataFrame):
            record = record.iloc[0]
        self.record = record

        self.landmarks = pd.read_parquet(path / self.record['path'])

    @property
    def hand(self):
        """Return 'left_hand' or 'right_hand', whichever metadata column is larger.

        The columns are presumably per-hand counts of frames with data — confirm
        against how ``train_with_meta.csv`` was built. Ties resolve to 'left_hand'.
        """
        # The metadata row has object dtype (mixed columns); cast so idxmax is well defined.
        frame_counts = self.record[['left_hand', 'right_hand']].astype(float)
        return frame_counts.idxmax()

    def transform_landmarks(self):
        """Run the preprocessing steps on ``self.landmarks`` and return the result.

        ``self.landmarks`` itself is left unmodified.
        """
        landmarks = self.filter_types(self.landmarks)
        landmarks = self.flip_left_hand(landmarks)
        return landmarks

    def filter_types(self, landmarks):
        """Keep only the rows whose ``type`` is 'pose' or the selected hand."""
        landmark_types = ['pose', self.hand]
        type_mask = landmarks['type'].isin(landmark_types)
        return landmarks.loc[type_mask, :]

    def flip_left_hand(self, landmarks):
        """Negate x and y of the left-hand rows when the left hand is the selected one.

        Returns the input unchanged for right-handed records; otherwise returns a
        modified copy so the caller's frame is never written to.
        """
        if self.hand != 'left_hand':
            return landmarks
        # Work on a copy: the input is typically a slice of the raw landmark frame.
        landmarks = landmarks.copy()
        left_hand_mask = landmarks['type'] == 'left_hand'
        landmarks.loc[left_hand_mask, ['x', 'y']] = -landmarks.loc[left_hand_mask, ['x', 'y']]
        return landmarks

#### load landmarks

In [13]:
# Work through the pipeline on one fixed record (its sign is 'cloud').
# To try a random record of a given sign instead:
#   sample = train.loc[train['sign'] == 'cloud', :].sample(n=1, random_state=73)
sample = train.loc['16069_100015657']

# Raw landmark rows for the chosen sequence.
landmarks = pd.read_parquet(path / sample['path'])
landmarks

Unnamed: 0,frame,row_id,type,landmark_index,x,y,z
0,103,103-face-0,face,0,0.437886,0.437599,-0.051134
1,103,103-face-1,face,1,0.443258,0.392901,-0.067054
2,103,103-face-2,face,2,0.443997,0.409998,-0.042990
3,103,103-face-3,face,3,0.435256,0.362771,-0.039492
4,103,103-face-4,face,4,0.443780,0.381762,-0.068013
...,...,...,...,...,...,...,...
57010,207,207-right_hand-16,right_hand,16,,,
57011,207,207-right_hand-17,right_hand,17,,,
57012,207,207-right_hand-18,right_hand,18,,,
57013,207,207-right_hand-19,right_hand,19,,,


#### determine which hand has more frames

In [170]:
# Pick the hand with the larger value in the metadata row.
# `sample` is a single row (a Series), so idxmax must run along its only axis;
# squeeze() also collapses a one-row DataFrame, and the float cast avoids an
# object-dtype reduction on the mixed-type metadata row.
hand = sample[['left_hand', 'right_hand']].astype(float).squeeze().idxmax()
hand

'left_hand'

#### filter for hand with more frames and pose

In [171]:
# Keep the pose rows plus the rows of the selected hand; everything else is dropped.
landmark_types = ['pose', hand]
type_mask = landmarks['type'].isin(landmark_types)
landmarks = landmarks[type_mask]

# Sanity check: number of distinct frames left per landmark type.
landmarks.groupby('type')['frame'].nunique()

type
left_hand    105
pose         105
Name: frame, dtype: int64

#### flip left hand if necessary

In [172]:
# Negate x and y of the left-hand rows so left-handed sequences share the
# right-hand orientation. Note: re-running this cell flips the values back.
if hand == 'left_hand':
    left_hand_mask = landmarks['type'] == 'left_hand'
    # `landmarks` is a filtered slice of the raw frame; copy before writing so the
    # assignment is guaranteed to land (and raises no SettingWithCopyWarning).
    landmarks = landmarks.copy()
    landmarks.loc[left_hand_mask, ['x', 'y']] = -landmarks.loc[left_hand_mask, ['x', 'y']]

#### drop all pose landmarks except necessary ones

In [173]:
# Pose landmark indices to keep.
# NOTE(review): presumably hips/shoulders/elbows/wrists in MediaPipe Pose numbering — confirm.
pose_landmark_indices = [23, 11, 13, 15,
                         24, 12, 14, 16]

# Keep every row of the selected hand, plus only the listed pose landmarks.
mask = landmarks['type'].eq(hand) | (
    landmarks['type'].eq('pose') & landmarks['landmark_index'].isin(pose_landmark_indices)
)
landmarks = landmarks[mask]

# Sanity check: distinct landmark indices left per type.
landmarks.groupby('type')['landmark_index'].nunique()

type
left_hand    21
pose          8
Name: landmark_index, dtype: int64

#### interpolate values

In [174]:
# Inspect landmark 0 of the selected hand across the first frames, before interpolation.
mask = landmarks['type'].eq(hand) & landmarks['landmark_index'].eq(0)
landmarks[mask].head(10)

Unnamed: 0,frame,row_id,type,landmark_index,x,y,z
468,103,103-left_hand-0,left_hand,0,-0.90385,-0.686351,-2.993881e-07
1011,104,104-left_hand-0,left_hand,0,-0.885604,-0.680159,-3.012174e-07
1554,105,105-left_hand-0,left_hand,0,-0.871571,-0.672559,-3.034679e-07
2097,106,106-left_hand-0,left_hand,0,-0.868469,-0.668436,-2.915684e-07
2640,107,107-left_hand-0,left_hand,0,,,
3183,108,108-left_hand-0,left_hand,0,-0.896639,-0.687552,4.655252e-07
3726,109,109-left_hand-0,left_hand,0,,,
4269,110,110-left_hand-0,left_hand,0,,,
4812,111,111-left_hand-0,left_hand,0,-0.85857,-0.617604,1.73176e-07
5355,112,112-left_hand-0,left_hand,0,-0.875549,-0.618447,1.150485e-07


In [175]:
# Linearly interpolate missing x/y/z values along the frame axis, separately for
# every (type, landmark_index) track.
dims = ['type', 'landmark_index']
coords = ['x', 'y', 'z']

# Sorting by (frame, type, landmark_index) puts each track in frame order and is
# also the row order the later reshape relies on.
landmarks = landmarks.set_index(['frame', 'type', 'landmark_index']).sort_index()

# transform() always returns a result aligned to the input index; groupby.apply can
# prepend the group keys (pandas >= 2.0), which would break the column assignment.
landmarks[coords] = (landmarks.groupby(level=dims)[coords]
                     .transform(lambda track: track.interpolate())
                    )
landmarks = landmarks.reset_index()

# Same peek as before interpolation: landmark 0 of the selected hand.
mask = (landmarks.type == hand) & (landmarks.landmark_index == 0)
landmarks.loc[mask, :].head(10)

Unnamed: 0,frame,type,landmark_index,row_id,x,y,z
0,103,left_hand,0,103-left_hand-0,-0.90385,-0.686351,-2.993881e-07
29,104,left_hand,0,104-left_hand-0,-0.885604,-0.680159,-3.012174e-07
58,105,left_hand,0,105-left_hand-0,-0.871571,-0.672559,-3.034679e-07
87,106,left_hand,0,106-left_hand-0,-0.868469,-0.668436,-2.915684e-07
116,107,left_hand,0,107-left_hand-0,-0.882554,-0.677994,8.69784e-08
145,108,left_hand,0,108-left_hand-0,-0.896639,-0.687552,4.655252e-07
174,109,left_hand,0,109-left_hand-0,-0.883949,-0.664236,3.680755e-07
203,110,left_hand,0,110-left_hand-0,-0.871259,-0.64092,2.706257e-07
232,111,left_hand,0,111-left_hand-0,-0.85857,-0.617604,1.73176e-07
261,112,left_hand,0,112-left_hand-0,-0.875549,-0.618447,1.150485e-07


#### drop na

In [176]:
# Show the interpolated frame (the notebook truncates the rendering to the first and last rows).
landmarks

Unnamed: 0,frame,type,landmark_index,row_id,x,y,z
0,103,left_hand,0,103-left_hand-0,-0.903850,-0.686351,-2.993881e-07
1,103,left_hand,1,103-left_hand-1,-0.812678,-0.626594,4.002348e-03
2,103,left_hand,2,103-left_hand-2,-0.728513,-0.588752,-6.392229e-03
3,103,left_hand,3,103-left_hand-3,-0.651862,-0.586241,-2.300651e-02
4,103,left_hand,4,103-left_hand-4,-0.588590,-0.591355,-3.957435e-02
...,...,...,...,...,...,...,...
3040,207,pose,14,207-pose-14,-0.000111,0.984456,-5.291088e-01
3041,207,pose,15,207-pose-15,0.920563,0.557810,-1.808138e+00
3042,207,pose,16,207-pose-16,-0.023938,1.331330,-1.068886e+00
3043,207,pose,23,207-pose-23,0.716362,1.332831,-3.693836e-02


In [178]:
# Rows that still have a missing coordinate after interpolation.
na_mask = landmarks[['x', 'y', 'z']].isna().any(axis=1)
# Frames containing at least one such row.
na_frames = landmarks.loc[na_mask, 'frame'].unique().tolist()

# Drop those frames entirely so every remaining frame has a full set of landmarks.
na_frames_mask = landmarks['frame'].isin(na_frames)
landmarks = landmarks[~na_frames_mask]

# Number of frames that survive.
landmarks['frame'].nunique()

105

#### filter for only first [25] frames

#### reshape values into frames

In [179]:
# Turn the long table into a (frame, landmark, coordinate) array.
# Relies on the rows being ordered by frame and on every frame holding the same landmarks.
n_frames = landmarks['frame'].nunique()
n_landmarks = len(landmarks[['type', 'landmark_index']].drop_duplicates())
landmarks = landmarks[['x', 'y', 'z']].to_numpy().reshape(n_frames, n_landmarks, 3)
landmarks.shape

(105, 29, 3)

In [180]:
# Flatten each frame to one feature vector and insert a length-1 axis at position 1,
# giving (n_frames, 1, n_features).
landmarks = landmarks.reshape(n_frames, -1)[:, np.newaxis, :]
landmarks.shape

(105, 1, 87)

In [181]:
# Float32 tensor view of the landmark array, ready to feed to the network.
t = torch.from_numpy(landmarks).to(torch.float32)

# Creating the network

In [182]:
class RNN(nn.Module):
    """Single-step recurrent cell built from two linear layers.

    Each step concatenates the current input with the previous hidden state and
    feeds the result to both layers: ``i2h`` produces the next hidden state and
    ``i2o`` followed by log-softmax produces the per-class log-probabilities.

    Parameters
    ----------
    input_size : int
        Number of features per time step.
    hidden_size : int
        Width of the hidden state.
    output_size : int
        Number of output classes.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()

        self.hidden_size = hidden_size

        # Both layers consume the concatenated [input, hidden] vector.
        n_combined = input_size + hidden_size
        self.i2h = nn.Linear(n_combined, hidden_size)
        self.i2o = nn.Linear(n_combined, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Advance one time step; returns ``(log_probs, next_hidden)``."""
        combined = torch.cat([input, hidden], dim=1)
        next_hidden = self.i2h(combined)
        log_probs = self.softmax(self.i2o(combined))
        return log_probs, next_hidden

    def initHidden(self):
        """Return an all-zero hidden state of shape ``(1, hidden_size)``."""
        return torch.zeros((1, self.hidden_size))

    
# Size the network from the data: one output per sign, one input per flattened
# landmark coordinate of a frame.
n_categories = len(sign_labels)
input_size = landmarks.shape[-1]
n_hidden = 128
# Use the computed input_size rather than a hard-coded 87 so the model stays in
# sync if the landmark selection above changes.
rnn = RNN(input_size, n_hidden, n_categories)

In [183]:
# Smoke test: push the first frame through the untrained network with a zero hidden state.
input_ = t[0]
hidden = torch.zeros((1, n_hidden))

output, next_hidden = rnn(input_, hidden)
(output, next_hidden)

(tensor([[-5.5894, -5.4058, -5.3417, -5.6006, -5.8240, -5.0637, -5.3456, -5.2896,
          -5.3976, -5.8262, -5.4935, -5.6289, -5.4381, -5.6393, -5.6580, -5.5651,
          -5.7112, -5.8029, -5.1383, -5.5868, -5.8601, -5.8673, -5.1613, -5.4638,
          -5.5701, -5.6110, -5.1036, -5.3559, -5.6054, -5.0678, -5.8407, -5.6069,
          -5.7044, -5.4494, -5.3268, -5.4290, -5.4359, -4.8492, -5.3329, -5.5252,
          -5.2975, -5.8166, -5.2127, -5.8265, -5.4571, -5.1762, -5.8186, -5.0081,
          -5.7258, -5.8300, -5.3736, -5.4473, -5.4402, -5.6560, -5.4799, -5.2937,
          -5.7398, -5.6416, -5.4367, -5.0709, -5.6134, -5.1219, -5.2797, -5.6966,
          -5.2607, -5.8485, -5.8670, -5.3501, -5.5460, -5.7203, -5.2430, -6.0076,
          -5.4949, -5.6375, -6.2089, -5.3503, -5.5782, -5.5460, -5.4479, -5.7929,
          -5.8593, -5.6136, -5.2465, -5.8910, -5.4977, -5.5413, -5.7424, -5.8996,
          -5.7615, -5.2673, -5.8692, -5.5273, -5.2324, -5.5683, -5.9701, -5.6533,
          -6.050

# Training

## Preparing the Network

In [184]:
# Reverse lookup: class index -> sign name.
sign_labels_inv = dict(zip(sign_labels.values(), sign_labels.keys()))

In [185]:
def categoryFromOutput(output):
    """Translate a network output into ``(sign_name, class_index)``.

    Takes the index of the largest value in the first row of ``output`` and looks
    the corresponding sign name up in the notebook-level ``sign_labels_inv`` map.
    """
    _, best_indices = output.topk(1)
    category_i = best_indices[0].item()
    sign_name = sign_labels_inv[category_i]
    return sign_name, category_i

# Decode the untrained network's output for the sample frame into (sign name, class index).
print(categoryFromOutput(output))

('carrot', 37)


## Training the Network

In [186]:
def train_one_record(category_tensor, landmarks_tensor):
    """Run one plain-SGD training step on a single sequence.

    The sequence is fed to the notebook-level ``rnn`` one frame at a time; the loss
    is computed from the output of the last frame only, using the notebook-level
    ``criterion``, and the parameters are updated with ``learning_rate``.

    Parameters
    ----------
    category_tensor : torch.Tensor
        Target class index, in the shape ``criterion`` expects for one sample.
    landmarks_tensor : torch.Tensor
        Sequence whose first axis is time; each slice along it is passed to ``rnn``.

    Returns
    -------
    tuple
        ``(output, loss)`` where ``output`` is the network output for the last
        frame (computed before the update) and ``loss`` is a Python float.

    Raises
    ------
    ValueError
        If the sequence contains no frames, since there is no output to score.
    """
    if landmarks_tensor.size(0) == 0:
        raise ValueError('landmarks_tensor must contain at least one frame')

    hidden = rnn.initHidden()

    rnn.zero_grad()

    for frame in landmarks_tensor:
        output, hidden = rnn(frame, hidden)

    loss = criterion(output, category_tensor)
    loss.backward()

    # Manual SGD step: p <- p - learning_rate * grad. Done under no_grad so the
    # in-place update is not tracked by autograd.
    with torch.no_grad():
        for p in rnn.parameters():
            # Parameters that did not take part in the loss have no gradient.
            if p.grad is not None:
                p.add_(p.grad, alpha=-learning_rate)

    return output, loss.item()

In [195]:
# Negative log-likelihood loss; pairs with the LogSoftmax output layer of the RNN.
# Both names are read as globals by train_one_record, so this cell must run before it is called.
criterion = nn.NLLLoss()
learning_rate = 0.005 # If you set this too high, it might explode. If too low, it might not learn

In [196]:
# Target class for the sample record. `sample` is a single metadata row (a Series),
# so its sign is read by key, the same way its path is read when loading landmarks.
sign_idx = sign_labels[sample['sign']]
category_tensor = torch.tensor([sign_idx], dtype=torch.long)

# One SGD step on the sample sequence; shows the pre-update output and the loss.
output, loss = train_one_record(category_tensor, t)
output, loss

(tensor([[-5.5232, -5.2619, -5.3445, -5.4194, -5.9100, -5.0852, -5.4301, -5.4576,
          -5.4191, -5.7611, -5.4190, -5.7401, -5.4913, -5.5051, -5.7752, -5.4804,
          -5.6500, -5.8049, -5.2098, -5.6500, -5.7285, -5.8559, -5.1742, -5.4735,
          -5.6378, -5.8003, -5.2513, -5.3858, -5.6089, -4.9929, -5.8531, -5.5081,
          -5.7392, -5.6099, -5.3243, -5.2845, -5.4441, -4.8316, -5.3344, -5.4918,
          -5.3481, -5.6928, -5.2664, -5.9864, -5.4494, -5.3089, -5.7555, -5.1302,
          -5.5694, -5.7502, -5.1991, -5.3975, -5.3705, -5.6784, -5.5866, -5.1510,
          -5.6411, -5.3955, -5.4678, -5.1863, -5.7339, -5.4093, -5.3133, -5.8244,
          -5.1101, -6.0080, -5.7383, -5.3812, -5.4127, -5.7971, -5.2081, -5.8999,
          -5.4943, -5.5199, -6.1898, -5.3097, -5.6290, -5.5954, -5.5532, -5.7289,
          -5.8330, -5.7020, -5.4065, -6.0114, -5.4673, -5.6993, -5.7046, -5.9451,
          -5.9300, -5.2900, -5.8698, -5.6828, -5.1874, -5.5981, -5.7674, -5.5925,
          -6.045

# Error Analysis

In [237]:
# NOTE(review): `model` and `test_dataloader` are not defined in any visible cell of this
# notebook (the network built above is named `rnn`) — confirm where they come from.
with torch.no_grad():
    for X, y in test_dataloader:
        # `pred` is overwritten on every batch, so only the last batch's output is kept.
        pred = model(X)

In [238]:
# Predict the whole test set in one forward pass and keep the arg-max class per row.
with torch.no_grad():
    X = test_dataloader.dataset.X
    y = test_dataloader.dataset.y
    pred = model(X).argmax(dim=1)

In [239]:
# Put the true labels and the predicted class indices side by side.
# NOTE(review): `y_test` is not defined in any visible cell; later cells read a `label`
# column from this frame, so y_test presumably provides one — confirm.
preds = pd.DataFrame(y_test).assign(predicted=pred)

In [240]:
# Reverse lookup: class index -> sign name.
sign_labels_inverse = dict(zip(sign_labels.values(), sign_labels.keys()))

In [241]:
# Replace class indices with sign names, then flag the rows where the prediction matches.
# The keyword order matters: `correct` is evaluated after both columns have been mapped.
preds = preds.assign(
    label=lambda df: df['label'].map(sign_labels_inverse),
    predicted=lambda df: df['predicted'].map(sign_labels_inverse),
    correct=lambda df: df['label'].eq(df['predicted']).astype(int),
)

In [242]:
# Per true sign: share of its rows that were predicted correctly, best first, saved to CSV.
(preds.groupby('label')
 .agg(correct=('correct', 'sum'), total=('label', 'count'))
 .assign(accuracy=lambda df: df['correct'] / df['total'])
 .sort_values('accuracy', ascending=False)
 .to_csv('accuracy_by_word_actual.csv')
)

In [243]:
# Per predicted sign: share of the rows given that prediction that were correct, best first, saved to CSV.
(preds.groupby('predicted')
 .agg(correct=('correct', 'sum'), total=('label', 'count'))
 .assign(accuracy=lambda df: df['correct'] / df['total'])
 .sort_values('accuracy', ascending=False)
 .to_csv('accuracy_by_word_predicted.csv')
)

# Tensorflow Conversion

In [244]:
# %pip installs into the environment of the running kernel (a bare !pip may target a different Python).
# onnx-tf is pinned to the version this notebook was last run with.
%pip install -q onnx-tf==1.10.0
# NOTE(review): no tflite-runtime version is recorded in the notebook output — pin it once known.
%pip install -q tflite-runtime

Defaulting to user installation because normal site-packages is not writeable
Collecting onnx-tf
  Downloading onnx_tf-1.10.0-py3-none-any.whl (226 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.1/226.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
Collecting tensorflow-addons
  Downloading tensorflow_addons-0.19.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting typeguard>=2.7
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, tensorflow-addons, onnx-tf
[0mSuccessfully installed onnx-tf-1.10.0 tensorflow-addons-0.19.0 typeguard-2.13.3

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3[0m[39;49m -> [0m[32;49m23.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m

# Submissions

In [None]:
# Inference snippet for a converted TFLite model; this cell has never been executed.
# NOTE(review): `model_path`, `REQUIRED_SIGNATURE`, `KernelEvalException` and `frames`
# are not defined in any visible cell — they must be provided before this can run.
import tflite_runtime.interpreter as tflite
interpreter = tflite.Interpreter(model_path)

# Names of the signatures exported with the model.
found_signatures = list(interpreter.get_signature_list().keys())

if REQUIRED_SIGNATURE not in found_signatures:
    raise KernelEvalException('Required input signature not found.')

# NOTE(review): the runner name is hard-coded while the check above uses
# REQUIRED_SIGNATURE — confirm the two are meant to be the same value.
prediction_fn = interpreter.get_signature_runner("serving_default")
output = prediction_fn(inputs=frames)
# Index of the highest-scoring entry in the "outputs" tensor.
sign = np.argmax(output["outputs"])