In [2]:
import os
import re
import json
import numpy as np
import pandas as pd
from glob import glob
from tqdm import tqdm
import matplotlib.pyplot as plt

import tensorflow as tf
print(tf.__version__)
import tensorflow_io as tfio
print(tfio.__version__)

import einops
from tensorflow.keras import layers
from tensorflow.keras import models

import wandb
from wandb.keras import WandbMetricsLogger
from wandb.keras import WandbModelCheckpoint

2.10.0
0.27.0


In [4]:
import tensorflow as tf

gpus = tf.config.list_physical_devices('GPU')
if gpus:
  try:
    # Currently, memory growth needs to be the same across GPUs
    for gpu in gpus:
      tf.config.experimental.set_memory_growth(gpu, True)
    logical_gpus = tf.config.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Memory growth must be set before GPUs have been initialized
    print(e)

1 Physical GPUs, 1 Logical GPUs


In [5]:
data_path = "../data/tfrecord_heatmaps"


def natural_keys(text):
    ""
    def atoi(text):
        return int(text) if text.isdigit() else text
    
    return [atoi(c) for c in re.split(r'(\d+)', text)]

tfrecords = sorted(glob(f"{data_path}/*.tfrec"), key=natural_keys)

In [6]:
import json

with open("../data/sign_to_prediction_index_map.json") as f:
    data = json.load(f)
id2label = {v:k for k, v in data.items()}

In [7]:
from argparse import Namespace

configs = Namespace(
    batch_size = 64,
    epochs = 15,
    learning_rate = 1e-3,
    label_smoothing=0.2,
    num_steps=0.8,
)

In [8]:
train_tfrecords, valid_tfrecords = tfrecords[:20], tfrecords[20:]
print(len(train_tfrecords), len(valid_tfrecords))

20 4


In [22]:
def parse_sequence(serialized_sequence):
    return tf.io.parse_tensor(
        serialized_sequence,
        out_type=tf.float16,
    )


def parse_tfrecord_fn(example):
    feature_description = {
        "n_frames": tf.io.FixedLenFeature([], tf.float32),
        "frames": tf.io.FixedLenFeature([], tf.string),
        "label": tf.io.FixedLenFeature([], tf.int64),
    }

    return tf.io.parse_single_example(example, feature_description)


def preprocess_frames(frames):
    """This is where different preprocessing logics will be experimented."""
#     frames = (frames - tf.reduce_min(frames))/(tf.reduce_max(frames)-tf.reduce_min(frames))
    frames = tf.cast(frames, dtype=tf.float32)
    frames = tf.transpose(frames, (0,3,2,1))

    return frames


def parse_data(example):
    # Parse Frames
    frames = tf.reshape(parse_sequence(example["frames"]), shape=(28, 61, 32, 32))
    frames = preprocess_frames(frames)

    # Parse Labels
    label = tf.one_hot(example["label"], depth=250)

    return frames, label

In [23]:
AUTOTUNE = tf.data.AUTOTUNE

train_ds = tf.data.TFRecordDataset(train_tfrecords)
valid_ds = tf.data.TFRecordDataset(valid_tfrecords)

trainloader = (
    train_ds
    .shuffle(configs.batch_size*4)
    .map(parse_tfrecord_fn, num_parallel_calls=AUTOTUNE)
    .map(parse_data, num_parallel_calls=AUTOTUNE)
    .batch(configs.batch_size)
    .prefetch(AUTOTUNE)
)

validloader = (
    valid_ds
    .map(parse_tfrecord_fn, num_parallel_calls=AUTOTUNE)
    .map(parse_data, num_parallel_calls=AUTOTUNE)
    .batch(configs.batch_size)
    .prefetch(AUTOTUNE)
)

In [24]:
sample, label = next(iter(trainloader))
sample.shape

TensorShape([64, 28, 32, 32, 61])

In [25]:
from tensorflow import keras


class Conv2Plus1D(keras.layers.Layer):
  def __init__(self, filters, kernel_size, padding):
    """
      A sequence of convolutional layers that first apply the convolution operation over the
      spatial dimensions, and then the temporal dimension. 
    """
    super().__init__()
    self.seq = keras.Sequential([  
        # Spatial decomposition
        layers.Conv3D(filters=filters,
                      kernel_size=(1, kernel_size[1], kernel_size[2]),
                      padding=padding),
        # Temporal decomposition
        layers.Conv3D(filters=filters, 
                      kernel_size=(kernel_size[0], 1, 1),
                      padding=padding)
        ])

  def call(self, x):
    return self.seq(x)


class ResidualMain(keras.layers.Layer):
  """
    Residual block of the model with convolution, layer normalization, and the
    activation function, ReLU.
  """
  def __init__(self, filters, kernel_size):
    super().__init__()
    self.seq = keras.Sequential([
        Conv2Plus1D(filters=filters,
                    kernel_size=kernel_size,
                    padding='same'),
        layers.LayerNormalization(),
        layers.ReLU(),
        Conv2Plus1D(filters=filters, 
                    kernel_size=kernel_size,
                    padding='same'),
        layers.LayerNormalization()
    ])

  def call(self, x):
    return self.seq(x)


class Project(keras.layers.Layer):
  """
    Project certain dimensions of the tensor as the data is passed through different 
    sized filters and downsampled. 
  """
  def __init__(self, units):
    super().__init__()
    self.seq = keras.Sequential([
        layers.Dense(units),
        layers.LayerNormalization()
    ])

  def call(self, x):
    return self.seq(x)


def add_residual_block(inputs, filters, kernel_size):
  """
    Add residual blocks to the model. If the last dimensions of the input data
    and filter size does not match, project it such that last dimension matches.
  """
  out = ResidualMain(filters, 
                     kernel_size)(inputs)

  res = inputs
  # Using the Keras functional APIs, project the last dimension of the tensor to
  # match the new filter size
  if out.shape[-1] != inputs.shape[-1]:
    res = Project(out.shape[-1])(res)

  return layers.add([res, out])


class ResizeHeatmap(tf.keras.layers.Layer):
  def __init__(self, height, width):
    super().__init__()
    self.height = height
    self.width = width
    self.resizing_layer = layers.Resizing(self.height, self.width)

  def call(self, video):
    """
      Use the einops library to resize the tensor.  

      Args:
        video: Tensor representation of the video, in the form of a set of frames.

      Return:
        A downsampled size of the video according to the new height and width it should be resized to.
    """
    # b stands for batch size, t stands for time, h stands for height, 
    # w stands for width, and c stands for the number of channels.
    old_shape = einops.parse_shape(video, 'b t h w c')
    images = einops.rearrange(video, 'b t h w c -> (b t) h w c')
    images = self.resizing_layer(images)
    videos = einops.rearrange(
        images, '(b t) h w c -> b t h w c',
        t = old_shape['t'])
    return videos

In [31]:
HEIGHT = 32
WIDTH = 32


def Conv3DModel():
    inputs = layers.Input(shape=(28,32,32,61))
    x = inputs

    x = Conv2Plus1D(filters=64, kernel_size=(1, 7, 7), padding='same')(x)
    x = layers.BatchNormalization()(x)
    x = layers.ReLU()(x)

    # Block 2
    x = add_residual_block(x, 64, (1, 7, 7))
    x = ResizeHeatmap(HEIGHT // 2, WIDTH // 2)(x)

    # Block 3
    x = add_residual_block(x, 128, (1, 7, 7))
    x = ResizeHeatmap(HEIGHT // 4, WIDTH // 4)(x)

    # Block 4
    x = add_residual_block(x, 256, (1, 7, 7))
    x = ResizeHeatmap(HEIGHT // 8, WIDTH // 8)(x)

    x = layers.GlobalAveragePooling3D()(x)
    x = layers.Dense(250)(x)

    model = keras.Model(inputs, x)
    
    return model

In [32]:
tf.keras.backend.clear_session()

model = Conv3DModel()
model.summary(expand_nested=False)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 28, 32, 32,  0           []                               
                                 61)]                                                             
                                                                                                  
 conv2_plus1d (Conv2Plus1D)     (None, 28, 32, 32,   195520      ['input_1[0][0]']                
                                64)                                                               
                                                                                                  
 batch_normalization (BatchNorm  (None, 28, 32, 32,   256        ['conv2_plus1d[0][0]']           
 alization)                     64)                                                           

In [33]:
total_steps = 1231*configs.epochs
decay_steps = total_steps*configs.num_steps

cosine_decay_scheduler = tf.keras.optimizers.schedules.CosineDecay(
    initial_learning_rate = configs.learning_rate,
    decay_steps = decay_steps,
    alpha=0.1
)

In [34]:
model.compile(
    tf.keras.optimizers.Adam(learning_rate=cosine_decay_scheduler),
    tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.2),
    metrics=["acc"]
)

In [30]:
run = wandb.init(
    project="kaggle-asl",
    job_type="train_poseconv3d",
    config=configs,
)

[34m[1mwandb[0m: Currently logged in as: [33mayush-thakur[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [35]:
earlystopper = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=8,
    verbose=0,
    mode="auto",
    restore_best_weights=True,
)

callbacks = [
    earlystopper,
    WandbMetricsLogger(log_freq=2),
    WandbModelCheckpoint(
        filepath=f"model",
        save_best_only=True,
    ),
]

model.fit(
    trainloader,
    epochs=configs.epochs,
    validation_data=validloader,
    callbacks=callbacks
)



Epoch 1/15


2023-04-28 00:22:53.768589: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8200


    559/Unknown - 532s 932ms/step - loss: 6.5127 - acc: 0.0035

KeyboardInterrupt: 

In [33]:
eval_loss, eval_acc = model.evaluate(validloader)
wandb.log({"eval_loss": eval_loss, "eval_acc": eval_acc})



In [34]:
run.config.update(configs)

In [35]:
run.finish()

VBox(children=(Label(value='869.765 MB of 869.765 MB uploaded (0.823 MB deduped)\r'), FloatProgress(value=1.0,…

0,1
batch/acc,▁▁▃▄▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇████████████████
batch/batch_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
batch/learning_rate,█████▇▇▇▇▇▆▆▆▅▅▅▄▄▄▃▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
batch/loss,█▇▅▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch/acc,▁▃▄▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇████████████
epoch/epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
epoch/learning_rate,████▇▇▇▆▆▅▅▅▄▄▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁
epoch/loss,█▆▅▄▄▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch/val_acc,▁▃▅▆▆▆▇▇▇▇▇▇▇█████████████████
epoch/val_loss,█▅▄▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
batch/acc,0.79102
batch/batch_step,18478.0
batch/learning_rate,0.0001
batch/loss,2.7087
epoch/acc,0.791
epoch/epoch,29.0
epoch/learning_rate,0.0001
epoch/loss,2.70877
epoch/val_acc,0.6518
epoch/val_loss,3.08889
