In [None]:
import os
import json
import numpy as np
import pandas as pd
from glob import glob
from tqdm import tqdm
from collections import Counter
import matplotlib.pyplot as plt

import tensorflow as tf
print(tf.__version__)
import tensorflow_io as tfio
print(tfio.__version__)

from tensorflow.keras import layers
from tensorflow.keras import models

In [None]:
# Training index: one row per sample file.
df = pd.read_csv("train.csv")

# Mapping from sign name to class id.
with open("sign_to_prediction_index_map.json") as f:
    label2id = json.load(f)

# Replace every sign name by its class id (unknown names raise KeyError).
df["sign"] = df["sign"].apply(label2id.__getitem__)
df.head()

In [None]:
# Arrays of sample file paths and their class ids.
paths = df["path"].values
labels = df["sign"].values

# presumably the number of rows per frame (543 is also the divisor used further down) — TODO confirm
NUM_REPS = 543

# Peek at the first sample only; slicing to one element replaces an early `break`.
for path, label in zip(paths[:1], labels[:1]):
    print(path, label)
    frames_df = pd.read_parquet(path)[["x", "y", "z"]]
    print(len(frames_df))

In [None]:
tfrecords_dir = "data/tfrecords"

num_samples = 4096
# Ceiling division: a trailing partial group of samples still needs its own record.
num_tfrecords = (len(df) + num_samples - 1) // num_samples

# Create the TFRecords output folder on first use.
if not os.path.exists(tfrecords_dir):
    os.makedirs(tfrecords_dir)

In [None]:
# Show the computed number of TFRecord files (also used as the fold count below).
print(num_tfrecords)

In [None]:
from sklearn.model_selection import StratifiedKFold

In [None]:
# One stratified fold per TFRecord file, without shuffling.
skf = StratifiedKFold(n_splits=num_tfrecords, shuffle=False, random_state=None)

# Maps fold number -> row indices of that fold's held-out part.
stratified_labels = {}

for i, (_, test_index) in enumerate(skf.split(paths, labels)):
    stratified_labels[i] = test_index

    print(f"Fold {i}:")
    print(f"  Test:  index={test_index}")

In [None]:
# Inspect the mapping built above: fold number -> array of row indices.
stratified_labels

In [None]:
# Load a single sample: the first file of the first fold.
# One-element slices stand in for the early exits of a break/break loop.
for k, v in list(stratified_labels.items())[:1]:
    split_paths, split_labels = paths[v], labels[v]

    for path, label in zip(split_paths[:1], split_labels[:1]):
        # Flat float32 array with one row per stored (x, y, z) triple.
        frames = pd.read_parquet(path)[["x", "y", "z"]].values.astype(np.float32)
        # True division: the frame count is kept as a float.
        n_frames = len(frames) / 543

In [None]:
# Frame count of the loaded sample (row count / 543; a float because of true division).
n_frames

In [None]:
# Shape of the flat coordinate array: (rows, 3) for the x, y, z columns.
frames.shape

In [None]:
# Label of the loaded sample (a value produced by the label2id mapping).
label

In [None]:
# View the flat rows as (n_frames, 543, 3); the result is only displayed, not stored.
frames.reshape(int(n_frames), 543, 3)

In [None]:
# Serialize the coordinate array into a single string tensor.
st = tf.io.serialize_tensor(frames)

In [None]:
# Wrap the serialized bytes in a BytesList feature (same construction as bytes_feature below).
tf.train.Feature(bytes_list=tf.train.BytesList(value=[st.numpy()]))

In [None]:
# Round-trip check: decode the serialized tensor back to float32.
pt = tf.io.parse_tensor(st, out_type=tf.float32)

In [None]:
# Decoded values, for comparison with the original `frames` array.
pt.numpy()

In [None]:
# Original array, for comparison with the decoded values above.
frames

In [None]:
def float_feature(value):
    """Wrap a single float / double in a ``tf.train.Feature`` holding a float_list."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)


def int64_feature(value):
    """Wrap a single bool / enum / int / uint in a ``tf.train.Feature`` holding an int64_list."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)


def float_sequence(sequence):
    """Build a ``tf.train.FeatureList`` with one float_list feature per element.

    Every element of ``sequence`` must provide ``tolist()``; its values become
    one ``FloatList``.
    """
    step_features = []
    for step in sequence:
        step_values = tf.train.FloatList(value=step.tolist())
        step_features.append(tf.train.Feature(float_list=step_values))
    return tf.train.FeatureList(feature=step_features)


def bytes_feature(value):
    """Returns a bytes_list from a string / byte.

    Args:
        value: Raw ``bytes``, a ``str`` (encoded as UTF-8), or a tensor-like
            object exposing ``.numpy()`` (for example the eager string tensor
            returned by ``tf.io.serialize_tensor``).

    Returns:
        tf.train.Feature: A feature whose ``bytes_list`` holds the single value.
    """
    # Tensors must be unwrapped first: BytesList stores plain bytes, not tensors.
    if hasattr(value, "numpy"):
        value = value.numpy()
    if isinstance(value, str):
        value = value.encode("utf-8")
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


def serialize_sequence(sequence):
    """Encode a multidimensional tensor as a serialized string tensor."""
    serialized = tf.io.serialize_tensor(sequence)
    return serialized


def parse_sequence(serialized_sequence):
    """Decode a serialized tensor back into a float32 tensor."""
    target_dtype = tf.float32
    decoded = tf.io.parse_tensor(serialized_sequence, out_type=target_dtype)
    return decoded


def create_example(n_frames, sequence, label):
    """Build a ``tf.train.Example`` for one sample.

    Args:
        n_frames: Number of frames in the sample; stored as a float feature.
        sequence: Coordinate data of the sample; serialized with
            ``serialize_sequence`` and stored as a bytes feature.
        label: Class id; stored as an int64 feature.

    Returns:
        tf.train.Example: Example with the features ``"n_frames"``,
        ``"frames"`` and ``"label"``.
    """
    feature = {
        "n_frames": float_feature(n_frames),
        # Serialize the argument, not a notebook-level `frames` variable.
        "frames": bytes_feature(serialize_sequence(sequence)),
        "label": int64_feature(label),
    }

    return tf.train.Example(features=tf.train.Features(feature=feature))


# def create_example(n_frame, seq, label):
#     sequence_features = tf.train.FeatureLists(feature_list={"frames": float_sequence(frames)})

#     context_features = tf.train.Features(feature = {
#         "n_frames": float_feature(n_frames),
#         "label": int64_feature(label),
#     })

#     example = tf.train.SequenceExample(context=context_features, feature_lists=sequence_features)
    
#     return example

# sequence_features = {
#   "frames": tf.io.FixedLenSequenceFeature([], dtype=tf.float32)
# }

# context_features = {
#   "n_frames": tf.io.FixedLenFeature([], tf.float32),
#   "label": tf.io.FixedLenFeature([], tf.int64),
# }


# context, sequence = tf.io.parse_single_sequence_example(
#     example,
#     context_features=context_features, 
#     sequence_features=sequence_features
# )


def parse_tfrecord_fn(example):
    """Parse one serialized ``tf.train.Example`` and decode its contents.

    Args:
        example: Serialized example holding the features ``"n_frames"``
            (float32 scalar), ``"frames"`` (serialized tensor stored as a
            string) and ``"label"`` (int64 scalar).

    Returns:
        dict: The parsed features under their original keys, plus two decoded
        entries:

        * ``"frames_tensor"``: float32 tensor reshaped to ``(n_frames, 543, 3)``.
        * ``"label_one_hot"``: one-hot encoding of the label with depth 250.
    """
    feature_description = {
        "n_frames": tf.io.FixedLenFeature([], tf.float32),
        "frames": tf.io.FixedLenFeature([], tf.string),
        "label": tf.io.FixedLenFeature([], tf.int64),
    }
    example = tf.io.parse_single_example(example, feature_description)

    # The frame count is stored as a float feature, but a reshape dimension
    # must be an integer, so cast before building the target shape.
    n_frames = tf.cast(example["n_frames"], tf.int32)

    # Raw entries stay untouched so code that decodes them by hand keeps
    # working; decoded tensors are added under separate keys.
    example["frames_tensor"] = tf.reshape(
        parse_sequence(example["frames"]), shape=(n_frames, 543, 3)
    )
    example["label_one_hot"] = tf.one_hot(example["label"], depth=250)

    return example

In [None]:
# Build a tf.train.Example from the sample loaded earlier (n_frames, frames, label) and display it.
example = create_example(
    n_frames,
    frames,
    label
)
example

In [None]:
with tf.io.TFRecordWriter(
    tfrecords_dir + "sample.tfrec"
) as writer:
    example = create_example(
        n_frames,
        frames,
        label
    )
    writer.write(example.SerializeToString())

In [None]:
tfrec_path = tfrecords_dir + "sample.tfrec"
tfrec_path

In [None]:
# Open the sample TFRecord file as a dataset of raw serialized records.
raw_dataset = tf.data.TFRecordDataset(tfrec_path)
raw_dataset

In [None]:
# Parse the first record of the dataset.
example = parse_tfrecord_fn(next(iter(raw_dataset)))

In [None]:
# Frame count stored in the record (parsed as a float32 scalar).
example["n_frames"]

In [None]:
# One-hot encode the stored label over 250 classes.
tf.one_hot(example["label"], depth=250)

In [None]:
# Decode the stored coordinates and restore the (n_frames, 543, 3) layout.
# "n_frames" is parsed as float32, while a reshape dimension must be an
# integer, so it is cast to int32 first.
tf.reshape(
    parse_sequence(example["frames"]),
    shape=(tf.cast(example["n_frames"], tf.int32), 543, 3),
)

In [None]:
# Count the NaN entries in the decoded coordinates.
np.isnan(parse_sequence(example["frames"]).numpy()).sum()

In [None]:
# Path of the TFRecord file for stratified split 1 inside `tfrecords_dir`.
# NOTE(review): the visible cells that follow read `tfrec_path`, not `trp` — confirm which file is intended.
trp = tfrecords_dir+f"/stratified_split_{1}.tfrec"

In [None]:
# Re-open the TFRecord file at `tfrec_path` as a dataset of raw records.
# NOTE(review): `trp` is assigned in the preceding cell but is not used here — confirm which file is intended.
raw_dataset = tf.data.TFRecordDataset(tfrec_path)
raw_dataset

In [None]:
# Parse the first record of the re-opened dataset.
example = parse_tfrecord_fn(next(iter(raw_dataset)))

In [None]:
# Display the parsed record.
example