# In [9]:
import os
import cv2
import numpy as np
import tensorflow as tf
import random

def _bytes_feature(value):
    """Wrap a single string / bytes value in a ``tf.train.Feature`` bytes list."""
    tensor_type = type(tf.constant(0))
    if isinstance(value, tensor_type):
        # BytesList won't unpack a string from an EagerTensor, so extract
        # the underlying value first.
        value = value.numpy()
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)

def _int64_feature(value):
    """Wrap a single bool / enum / int / uint in a ``tf.train.Feature`` int64 list."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

def serialize_example(image, label):
    """Serialize one (image, label) pair into a ``tf.train.Example`` string.

    Args:
        image: Value stored under the ``'image'`` key via ``_bytes_feature``.
        label: Value stored under the ``'label'`` key via ``_int64_feature``.

    Returns:
        The serialized ``tf.train.Example`` as returned by
        ``SerializeToString()``.
    """
    features = tf.train.Features(
        feature={
            'image': _bytes_feature(image),
            'label': _int64_feature(label),
        }
    )
    return tf.train.Example(features=features).SerializeToString()


def split_data(video_paths, labels, train_ratio=0.8):
    """Shuffle paired videos/labels and split them into train and test sets.

    The inputs are never modified: shuffling happens on a private list of
    ``(path, label)`` pairs, so the caller's sequences keep their order and
    any sequence type (list, tuple, ...) is accepted. Shuffling uses the
    module-level ``random`` generator; seed it beforehand for a
    reproducible split.

    Args:
        video_paths: Sequence of video paths.
        labels: Sequence of labels, paired positionally with ``video_paths``.
            If the lengths differ, the extra items of the longer sequence
            are ignored (``zip`` semantics).
        train_ratio: Fraction of the pairs assigned to the training set.
            The training size is ``int(n_pairs * train_ratio)``.

    Returns:
        ``((train_video_paths, train_labels), (test_video_paths, test_labels))``
        where each element is a new list. Empty input yields four empty
        lists.
    """
    # Keep each path attached to its label while shuffling.
    pairs = list(zip(video_paths, labels))
    random.shuffle(pairs)

    train_size = int(len(pairs) * train_ratio)
    train_pairs = pairs[:train_size]
    test_pairs = pairs[train_size:]

    # Comprehensions (rather than zip(*pairs)) stay well-defined when a
    # partition -- or the whole input -- is empty.
    train_video_paths = [path for path, _ in train_pairs]
    train_labels = [label for _, label in train_pairs]
    test_video_paths = [path for path, _ in test_pairs]
    test_labels = [label for _, label in test_pairs]

    return (train_video_paths, train_labels), (test_video_paths, test_labels)

def video_to_tfrecord(video_paths, labels, tfrecord_file):
    """Write every frame of every video to a TFRecord file as JPEG examples.

    Each frame read from a video is JPEG-encoded and written as one record
    built by ``serialize_example``, paired with the label of the video it
    came from.

    Args:
        video_paths: Iterable of video file paths passed to
            ``cv2.VideoCapture``.
        labels: Iterable of labels, paired positionally with
            ``video_paths``.
        tfrecord_file: Path of the TFRecord file to create.

    Notes:
        Videos that cannot be opened and frames that fail to encode are
        skipped with a warning instead of producing empty/corrupt records.
    """
    import warnings  # local import: only needed for the diagnostics below

    with tf.io.TFRecordWriter(tfrecord_file) as writer:
        for video_path, label in zip(video_paths, labels):
            cap = cv2.VideoCapture(video_path)
            try:
                if not cap.isOpened():
                    warnings.warn(
                        f"Could not open video, skipping: {video_path}",
                        stacklevel=2,
                    )
                    continue

                while True:
                    success, frame = cap.read()
                    if not success:
                        break

                    # Encode the decoded frame directly. The previous
                    # float32 "/ 255.0" followed by "* 255" round trip
                    # cancelled itself out before encoding and only cost
                    # two full-frame float copies; the record stores JPEG
                    # bytes, so any [0, 1] normalization has to happen
                    # when the records are decoded.
                    encoded_ok, buffer = cv2.imencode('.jpg', frame)
                    if not encoded_ok:
                        warnings.warn(
                            f"Failed to JPEG-encode a frame of {video_path}; "
                            "frame skipped",
                            stacklevel=2,
                        )
                        continue

                    writer.write(serialize_example(buffer.tobytes(), label))
            finally:
                # Release the capture even if encoding or writing raises.
                cap.release()

def create_dataset(source_dir, train_tfrecord_file, test_tfrecord_file, train_ratio=0.8):
    """Build train and test TFRecord files from a directory of class folders.

    ``source_dir`` is expected to contain one sub-directory per class, each
    holding that class's ``.mp4`` videos. Class labels are the indices of
    the class directory names in sorted order, so the name -> label mapping
    is reproducible across runs and machines (``os.listdir`` itself returns
    entries in arbitrary order). Only directories receive a label; stray
    files in ``source_dir`` no longer consume label indices.

    Args:
        source_dir: Root directory containing one sub-directory per class.
        train_tfrecord_file: Output path of the training TFRecord file.
        test_tfrecord_file: Output path of the testing TFRecord file.
        train_ratio: Fraction of videos assigned to the training set,
            forwarded to ``split_data``.
    """
    # Only directories are classes; sort for a deterministic label mapping.
    class_names = sorted(
        entry
        for entry in os.listdir(source_dir)
        if os.path.isdir(os.path.join(source_dir, entry))
    )
    class_labels = {class_name: i for i, class_name in enumerate(class_names)}

    video_paths = []
    labels = []

    for class_name, class_label in class_labels.items():
        class_dir = os.path.join(source_dir, class_name)
        for video_name in sorted(os.listdir(class_dir)):
            # Match the extension case-insensitively so ".MP4" files are
            # not silently dropped.
            if video_name.lower().endswith('.mp4'):
                video_paths.append(os.path.join(class_dir, video_name))
                labels.append(class_label)

    # Split the data into training and testing
    (train_video_paths, train_labels), (test_video_paths, test_labels) = split_data(
        video_paths, labels, train_ratio
    )

    # Create TFRecord for training and testing datasets
    video_to_tfrecord(train_video_paths, train_labels, train_tfrecord_file)
    video_to_tfrecord(test_video_paths, test_labels, test_tfrecord_file)

# Example usage: build the train/test TFRecord files for the videos_v3 set.
WSL_PATH = './project/movinet'
source_dir = os.path.join(WSL_PATH, 'videos_v3')
TRAIN_DATASET_NAME = os.path.join(WSL_PATH, 'videov3_train_dataset.tfrecord')
TEST_DATASET_NAME = os.path.join(WSL_PATH, 'videov3_test_dataset.tfrecord')
create_dataset(
    source_dir,
    train_tfrecord_file=TRAIN_DATASET_NAME,
    test_tfrecord_file=TEST_DATASET_NAME,
    train_ratio=0.8,
)
