In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from tqdm.notebook import tqdm
import os
import json
from ipywidgets import widgets
import plotly.express as px
import plotly.graph_objects as go

plt.style.use("ggplot")

In [2]:
competition = 'asl-signs'

iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
if iskaggle:
    path = Path('..') / 'input'  / 'competition'
    # !pip install -Uqq fastai
else:
    import zipfile, kaggle
    path = Path.home() / '.data' / 'asl-signs'
    if not path.exists():
        path.mkdir(exist_ok=True)
        kaggle.api.competition_download_cli(competition, path=path)
        zipfile.ZipFile(path / f'{competition}.zip').extractall(path)

In [3]:
with open(path / 'sign_to_prediction_index_map.json') as f:
    sign_labels = json.load(f)

In [4]:
train = (pd.read_csv(path / 'train.csv')
         .assign(idx=lambda x: x.participant_id.astype(str) + '_' + x.sequence_id.astype(str))
         .set_index('idx')
        )
train.head()

Unnamed: 0_level_0,path,participant_id,sequence_id,sign
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
26734_1000035562,train_landmark_files/26734/1000035562.parquet,26734,1000035562,blow
28656_1000106739,train_landmark_files/28656/1000106739.parquet,28656,1000106739,wait
16069_100015657,train_landmark_files/16069/100015657.parquet,16069,100015657,cloud
25571_1000210073,train_landmark_files/25571/1000210073.parquet,25571,1000210073,bird
62590_1000240708,train_landmark_files/62590/1000240708.parquet,62590,1000240708,owie


In [5]:
COORD_COLS = ['x', 'y', 'z']

# Are Any Rows Partially Populated?

one of x, y or z has a value while another has a missing value

In [6]:
n = len(train)
for i in tqdm(range(n)):
    record = train.iloc[i]
    record_id = str(record['participant_id']) + '_' + str(record['sequence_id'])

    pq_path = path / record['path']
    try:
        landmarks = pd.read_parquet(pq_path)
    except OSError:
            train.loc[record_id, 'cnt_partial_nulls'] = np.nan
    else:
        cnt_partial_nulls = (~landmarks.loc[:, COORD_COLS].isna().all(axis=1) &
                             landmarks.loc[:, COORD_COLS].isna().any(axis=1)).sum()
        train.loc[record_id, 'cnt_partial_nulls'] = cnt_partial_nulls    

  0%|          | 0/94477 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
train.cnt_partial_nulls.value_counts()

## Answer

Yes, `x`, `y` & `z` either all have values or none do

# Are all landmarks whenever any landmark is populated in a frame by type?

We know that sometimes a frame only contains a certain type (e.g. there's a left hand but no right hand).

However; if there's a type (e.g. a left hand) in the frame, does that mean all landmarks have data?

In [26]:
n = len(train)
for i in tqdm(range(n)):
    record = train.iloc[i]
    record_id = str(record['participant_id']) + '_' + str(record['sequence_id'])

    pq_path = path / record['path']
    try:
        landmarks = pd.read_parquet(pq_path)
    except OSError:
            pass
    else: 
        cnt_partial_nulls_by_frame = (landmarks.assign(has_coords=lambda x: (~x[COORD_COLS].isna()).all(axis=1))
                                      .loc[:, ['frame', 'type', 'landmark_index', 'has_coords']]
                                      .groupby(['type', 'frame'])
                                      .apply(lambda g: (~g.has_coords.all() & g.has_coords.any()).sum())
                                      ).sum()
        train.loc[record_id, 'cnt_partial_nulls_by_frame'] = cnt_partial_nulls_by_frame  

  0%|          | 0/94477 [00:00<?, ?it/s]

In [29]:
train.cnt_partial_nulls_by_frame.value_counts()

0.0    94477
Name: cnt_partial_nulls_by_frame, dtype: int64

## Answer

Yes, whenever a frame has a type in it (e.g. a right hand), all landmarks have data.

# Frames Summary

We know there are a different number of frames by type in each record. How many frames are in each on average?

How many frames per type (e.g. left hand, right hand, pose and face)?

In [57]:
n = len(train)
for i in tqdm(range(n)):
    record = train.iloc[i]
    record_id = str(record['participant_id']) + '_' + str(record['sequence_id'])

    pq_path = path / record['path']
    try:
        landmarks = pd.read_parquet(pq_path)
    except OSError:
            pass
    else:
        train.loc[record_id, 'total_frames'] = landmarks.frame.nunique()
        
        type_frame_cnts = (landmarks.assign(has_coords=lambda x: ~x[COORD_COLS].isna().all(axis=1))
         .loc[:, ['frame', 'type', 'landmark_index', 'has_coords']]
         .groupby(['type', 'frame'], as_index=False)
         [['has_coords']]
         .all()
         .groupby('type')
         .has_coords
         .sum()
         .rename('cnt_frames')
        )


        train.loc[record_id, ['face', 'left_hand', 'pose', 'right_hand']] = type_frame_cnts.sort_index().tolist() 

  0%|          | 0/94477 [00:00<?, ?it/s]

# Save Meta Data

In [62]:
train.to_csv(path / 'train_with_meta.csv')

# Sign Summary

In [9]:
train = pd.read_csv(path / 'train_with_meta.csv')

In [20]:
POSES = ['face', 'left_hand', 'pose', 'right_hand']
g = train.groupby(['sign'])
medians = (g[POSES + ['total_frames']]
           .median()
           .rename(columns=lambda x: x + '_median')
          )

means = (g[POSES + ['total_frames']]
         .mean()
         .rename(columns=lambda x: x + '_mean')
        )

percent_zeros = (g[POSES]
                 .apply(lambda g: (g <= 0).sum() / g.size)
                 .rename(columns=lambda x: x + '_percent_zeros')
                )

frames = [medians, means, percent_zeros]
sign_summary = (pd.concat(frames, axis=1)
                .sort_index(axis=1)
               )

sign_summary.to_excel('Sign Summary.xlsx')

sign_summary

Unnamed: 0_level_0,face_mean,face_median,face_percent_zeros,left_hand_mean,left_hand_median,left_hand_percent_zeros,pose_mean,pose_median,pose_percent_zeros,right_hand_mean,right_hand_median,right_hand_percent_zeros,total_frames_mean,total_frames_median
sign,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
TV,36.755844,22.0,0.0,13.641558,0.0,0.135714,36.875325,22.0,0.0,10.981818,3.0,0.106494,36.875325,22.0
after,30.181556,16.0,0.0,7.521614,0.0,0.140490,30.391931,16.0,0.0,6.613833,2.0,0.095821,30.391931,16.0
airplane,43.633588,27.0,0.0,15.526718,0.0,0.138677,43.954198,27.0,0.0,15.290076,7.0,0.098601,43.954198,27.0
all,35.981865,20.0,0.0,8.227979,0.0,0.137953,36.502591,21.0,0.0,9.839378,4.0,0.092617,36.502591,21.0
alligator,45.297436,26.0,0.0,12.433333,0.0,0.135256,46.207692,27.0,0.0,11.466667,4.0,0.101923,46.207692,27.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yesterday,39.346734,23.0,0.0,11.723618,0.0,0.148241,39.459799,23.0,0.0,15.472362,10.0,0.094849,39.459799,23.0
yourself,34.924675,18.0,0.0,11.046753,0.0,0.138961,35.062338,18.0,0.0,10.301299,4.0,0.099351,35.062338,18.0
yucky,37.377309,23.0,0.0,11.709763,0.0,0.141821,37.440633,23.0,0.0,13.195251,5.0,0.094327,37.440633,23.0
zebra,38.354667,23.0,0.0,9.458667,0.0,0.140000,38.397333,23.0,0.0,12.762667,5.0,0.092000,38.397333,23.0


# Participant Summary


In [23]:
POSES = ['face', 'left_hand', 'pose', 'right_hand']
g = train.groupby(['participant_id'])
medians = (g[POSES + ['total_frames']]
           .median()
           .rename(columns=lambda x: x + '_median')
          )

means = (g[POSES + ['total_frames']]
         .mean()
         .rename(columns=lambda x: x + '_mean')
        )

percent_zeros = (g[POSES]
                 .apply(lambda g: (g <= 0).sum() / g.size)
                 .rename(columns=lambda x: x + '_percent_zeros')
                )

frames = [medians, means, percent_zeros]
sign_summary = (pd.concat(frames, axis=1)
                .sort_index(axis=1)
               )

sign_summary.to_excel('Sign Summary.xlsx')

sign_summary

Unnamed: 0_level_0,face_mean,face_median,face_percent_zeros,left_hand_mean,left_hand_median,left_hand_percent_zeros,pose_mean,pose_median,pose_percent_zeros,right_hand_mean,right_hand_median,right_hand_percent_zeros,total_frames_mean,total_frames_median
participant_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2044,15.608108,15.0,0.0,0.005613,0.0,0.249168,15.613098,15.0,0.0,11.269231,11.0,0.0,15.613098,15.0
4718,36.810517,21.0,0.000143,0.179766,0.0,0.241426,36.967705,21.0,0.0,19.520149,11.0,0.003572,36.967705,21.0
16069,44.900165,25.0,0.0,25.479167,15.0,0.000103,44.906766,25.0,0.0,0.088078,0.0,0.246493,44.906766,25.0
18796,34.588806,14.0,0.0,0.012564,0.0,0.249072,34.631354,14.0,0.0,23.170474,11.0,0.0,34.631354,14.0
22343,43.026085,26.0,0.0,23.368399,16.0,0.005131,43.04597,26.0,0.0,0.556767,0.0,0.239309,43.04597,26.0
25571,19.16119,12.0,0.0,0.127296,0.0,0.247865,19.17956,12.0,0.0,12.384994,9.0,0.001488,19.17956,12.0
26734,45.113613,30.0,0.0,0.023755,0.0,0.24623,45.117538,30.0,0.0,28.730428,22.0,0.000103,45.117538,30.0
27610,53.675789,30.0,0.000175,25.045614,16.0,0.045322,55.287719,31.0,0.0,4.993918,0.0,0.20117,55.287719,31.0
28656,51.42735,30.0,0.0,3.58799,0.0,0.228468,51.430638,30.0,0.0,30.722989,16.0,0.010574,51.430638,30.0
29302,37.285684,33.0,0.0,5.1241,0.0,0.128653,37.289072,33.0,0.0,15.077298,12.0,0.002594,37.289072,33.0
