# Dataset Iteration Notebook

## Setup Python environment with Anaconda.

To create a conda environment from an environment.yml file, you can use the following bash command:

```bash
conda env create -f environment.yml
```


## Setup .env file

It is necessary to setup a .dotenv file for connection with Label Studio and Docker. The .env file must have the following keys:

```bash
# Label Studio environment variables for API
LABEL_STUDIO_URL=foo
LABEL_STUDIO_API_KEY=foo
LABEL_STUDIO_CONTAINER_ID=foo
LABEL_STUDIO_CONTAINER_DATA_DIR=fo
LABEL_STUDIO_DOWNLOAD_DIR=fo
LABEL_STUDIO_PROJECT_ID=foo
```

- `LABEL_STUDIO_URL`: The URL of the Label Studio instance for API communication. Example: "localhost:8000"
- `LABEL_STUDIO_API_KEY`: The API key used for authentication with the Label Studio instance.
- `LABEL_STUDIO_CONTAINER_ID`: The ID of the Docker container used by Label Studio.
- `LABEL_STUDIO_CONTAINER_DATA_DIR`: The directory path where the container stores data. Example: "/label-studio/data/media/upload/"
- `LABEL_STUDIO_DOWNLOAD_DIR`: The directory path where downloaded files are stored. Example: "./data/lsvideos/"
- `LABEL_STUDIO_PROJECT_ID`: The ID of the project in Label Studio. Example: "6"

These keys are used to configure the connection and interaction between the interpreter and the Label Studio instance for recovering and loading data.

## Imports

### Changing to repository's root directory

In [8]:
import os
import sys

import dotenv

os.chdir(os.getcwd().split("test")[0])
print(f"cwd: {os.getcwd()}")
dotenv.load_dotenv()
sys.path.append(os.getenv("PACKAGEPATH"))

cwd: /home/andrems2305/bioma-cow-breathing


### Main Imports

In [9]:
print(os.getenv("HOME"))

/home/andrems2305


In [10]:
import argparse
from datetime import datetime
from functools import partial
import os
import random

import dotenv
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision.models.video import R2Plus1D_18_Weights
from torchvision.transforms.v2 import Compose, InterpolationMode, Normalize, Resize
from torchvision.io import write_video
from torchvision.transforms._presets import VideoClassification
from tqdm import tqdm

import config
from datasets import VideoDataset

## Constants and arguments

In [11]:
# Load environment variables from .dotenv
dotenv.load_dotenv(dotenv_path=".env", verbose=True, override=True)
# Set torch precision
torch.set_float32_matmul_precision("high")
# Set constants
# Set argument constants
LABEL_STUDIO_URL: str = os.getenv("LABEL_STUDIO_URL")
LABEL_STUDIO_API_KEY: str = os.getenv("LABEL_STUDIO_API_KEY")
LABEL_STUDIO_CONTAINER_ID: str = os.getenv("LABEL_STUDIO_CONTAINER_ID")
LABEL_STUDIO_CONTAINER_DATA_DIR: str = os.getenv("LABEL_STUDIO_CONTAINER_DATA_DIR")
LABEL_STUDIO_DOWNLOAD_DIR: str = os.getenv("LABEL_STUDIO_DOWNLOAD_DIR")
LABEL_STUDIO_PROJECT_ID: str = os.getenv("LABEL_STUDIO_PROJECT_ID")
TARGET_FPS: float = 5.0
SAMPLE_SIZE: int = 16
HOP_LENGTH: int = 8
FILTER_TASK_IDS: list | None = None
BBOX_TRANSFORM: bool = False
BBOX_TRANSFORM_CORNERS: bool = False
DOWNLOAD_VIDEOS: bool = True
DOWNLOAD_VIDEOS_OVERWRITE: bool = False
VERBOSE: bool = True
MODEL_DIR: str = "models/"
LOG_DIR: str = "logs/"
NUM_WORKERS: int = 1
BATCH_SIZE: int = 16
OPTIMIZER: str = "adamw"
LEARNING_RATE: float = 0.001
WEIGHT_DECAY: float = 0.01
MAX_EPOCHS: int = 1000
PATIENCE: int = 8
SEED: int = 42
MODEL_NAME: str = "r2plus1d18" + "_regression"
PRETRAINED: bool = True
# Set random seed
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x74f3b4411410>

### Important: Change bounding box parameter values to change data loading methodology

In [12]:
BBOX_TRANSFORM: bool = True # ! Adjust this variable
BBOX_TRANSFORM_CORNERS: bool = True # ! Adjust this variable

In [13]:
# Create directories if they don't exist
if not os.path.exists(LABEL_STUDIO_DOWNLOAD_DIR):
    os.makedirs(LABEL_STUDIO_DOWNLOAD_DIR, exist_ok=True)
if not os.path.exists(
    LOG_DIR,
):
    os.makedirs(LOG_DIR, exist_ok=True)
if not os.path.exists(MODEL_DIR):
    os.makedirs(MODEL_DIR, exist_ok=True)

## Load dataset and dataloaders

In [14]:
# Get transforms from weights
if BBOX_TRANSFORM:
    # Use r2plus1d18 transforms, without center crop
    transform = partial(
        VideoClassification, crop_size=(112, 112), resize_size=(112, 112)
    )()
else:
    # Use r2plus1d18 default transforms
    transform = R2Plus1D_18_Weights.DEFAULT.transforms()
# Load dataset
dataset = VideoDataset(
    url=LABEL_STUDIO_URL,
    api_key=LABEL_STUDIO_API_KEY,
    project_id=int(LABEL_STUDIO_PROJECT_ID),
    data_dir=LABEL_STUDIO_DOWNLOAD_DIR,
    container_id=LABEL_STUDIO_CONTAINER_ID,
    container_data_dir=LABEL_STUDIO_CONTAINER_DATA_DIR,
    fps=TARGET_FPS,
    sample_size=SAMPLE_SIZE,
    hop_length=HOP_LENGTH,
    filter_task_ids=FILTER_TASK_IDS,
    bbox_transform=BBOX_TRANSFORM,
    bbox_transform_corners=BBOX_TRANSFORM_CORNERS,
    download_videos=DOWNLOAD_VIDEOS,
    download_videos_overwrite=DOWNLOAD_VIDEOS_OVERWRITE,
    classification=False,
    prune_invalid=False,
    transform=transform,
    target_transform=lambda x: torch.tensor(x).unsqueeze(0),
    verbose=VERBOSE,
)
# Get task ids
task_ids = dataset.annotations["id"].unique()
# Split task ids into train and test
train_task_ids, test_task_ids = train_test_split(
    task_ids, test_size=0.2, random_state=SEED
)
# Split dataset into train and test
# del dataset
train_dataset = VideoDataset(
    url=LABEL_STUDIO_URL,
    api_key=LABEL_STUDIO_API_KEY,
    project_id=int(LABEL_STUDIO_PROJECT_ID),
    data_dir=LABEL_STUDIO_DOWNLOAD_DIR,
    container_id=LABEL_STUDIO_CONTAINER_ID,
    container_data_dir=LABEL_STUDIO_CONTAINER_DATA_DIR,
    fps=TARGET_FPS,
    sample_size=SAMPLE_SIZE,
    hop_length=HOP_LENGTH,
    filter_task_ids=train_task_ids,
    bbox_transform=BBOX_TRANSFORM,
    bbox_transform_corners=BBOX_TRANSFORM_CORNERS,
    download_videos=DOWNLOAD_VIDEOS,
    download_videos_overwrite=DOWNLOAD_VIDEOS_OVERWRITE,
    classification=False,
    prune_invalid=True,
    transform=transform,
    target_transform=lambda x: torch.tensor(x).unsqueeze(0),
    verbose=VERBOSE,
)
test_dataset = VideoDataset(
    url=LABEL_STUDIO_URL,
    api_key=LABEL_STUDIO_API_KEY,
    project_id=int(LABEL_STUDIO_PROJECT_ID),
    data_dir=LABEL_STUDIO_DOWNLOAD_DIR,
    container_id=LABEL_STUDIO_CONTAINER_ID,
    container_data_dir=LABEL_STUDIO_CONTAINER_DATA_DIR,
    fps=TARGET_FPS,
    sample_size=SAMPLE_SIZE,
    hop_length=HOP_LENGTH,
    filter_task_ids=test_task_ids,
    bbox_transform=BBOX_TRANSFORM,
    bbox_transform_corners=BBOX_TRANSFORM_CORNERS,
    download_videos=DOWNLOAD_VIDEOS,
    download_videos_overwrite=DOWNLOAD_VIDEOS_OVERWRITE,
    classification=False,
    prune_invalid=True,
    transform=transform,
    target_transform=lambda x: torch.tensor(x).unsqueeze(0),
    verbose=VERBOSE,
)
print(f"[{datetime.now()}]: Loaded datasets")
# Create dataloaders
train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=NUM_WORKERS,
    pin_memory=True,
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
    pin_memory=True,
)
print(f"[{datetime.now()}]: Created data loaders")
# Create model
# Set dataloaders (for generic use)
train_dataloaders: list[DataLoader] = [train_dataloader]
test_dataloaders: list[DataLoader] = [test_dataloader]

[2024-05-14 10:54:04.155997]: Loaded datasets
[2024-05-14 10:54:04.156238]: Created data loaders


## Dataset iteration methods

### Get a sample from the dataset and save it to .mp4

In [15]:
# Get a sample from the dataset and save it
train_dataset.transform = None
sample, _ = train_dataset[0]
print(f"[{datetime.now()}]: Got sample")
# Save sample as video
write_video(
    os.path.join("sample.mp4"),
    # sample.permute(1, 2, 3, 0),
    sample.permute(0, 2, 3, 1),
    fps=TARGET_FPS,
)
print(f"[{datetime.now()}]: Saved sample as video {os.path.join('sample.mp4')}")

[2024-05-14 10:54:05.174049]: Got sample
[2024-05-14 10:54:05.488286]: Saved sample as video sample.mp4


### Iterate through dataset and save samples to output folder

In [24]:
output_folder: str = "data/output"
# Create directories if they don't exist
if not os.path.exists(output_folder):
    os.makedirs(output_folder, exist_ok=True)
for idx, (sample, target) in enumerate(tqdm(dataset)):
    task_id = dataset.samples.iloc[idx].task_id
    segment_id = dataset.samples.iloc[idx].segment_id
    sample_id = dataset.samples.iloc[idx].sample_id
    write_video(
        os.path.join(output_folder, f"{task_id}_{segment_id}_{sample_id}.mp4"),
        # [C, T, H, W] -> [T, H, W, C]
        sample.permute(1, 2, 3, 0),
        # sample.permute(0, 2, 3, 1),
        fps=TARGET_FPS,
    )

  2%|▏         | 379/15469 [04:49<3:12:20,  1.31it/s]


### Iterate through entire dataloaders to test functionality

In [17]:
# Iterate through train and test datasets with dataloaders
break_early: bool = True
for train_dataloader, test_dataloader in zip(train_dataloaders, test_dataloaders):
    print(f"[{datetime.now()}]: Got dataloaders")
    # Iterate through samples in dataloader
    print(f"[{datetime.now()}]: Iterating through train dataloader")
    for i, (x, y) in enumerate(train_dataloader):
        print(f"[{datetime.now()}]: Got batch {i}")
        # Print shapes
        print(f"x.shape: {x.shape}")
        print(f"y.shape: {y.shape}")
        if break_early:
            break
    print(f"[{datetime.now()}]: Finished iterating through train dataloader")
    print(f"[{datetime.now()}]: Iterating through test dataloader")
    for i, (x, y) in enumerate(test_dataloader):
        print(f"[{datetime.now()}]: Got batch {i}")
        # Print shapes
        print(f"x.shape: {x.shape}")
        print(f"y.shape: {y.shape}")
        if break_early:
            break
    print(f"[{datetime.now()}]: Finished iterating through test dataloader")

[2024-05-14 11:14:53.193357]: Got dataloaders
[2024-05-14 11:14:53.193405]: Iterating through train dataloader


RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/andrems2305/anaconda3/envs/torch/lib/python3.11/site-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
    data = fetcher.fetch(index)
           ^^^^^^^^^^^^^^^^^^^^
  File "/home/andrems2305/anaconda3/envs/torch/lib/python3.11/site-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
    return self.collate_fn(data)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/home/andrems2305/anaconda3/envs/torch/lib/python3.11/site-packages/torch/utils/data/_utils/collate.py", line 277, in default_collate
    return collate(batch, collate_fn_map=default_collate_fn_map)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/andrems2305/anaconda3/envs/torch/lib/python3.11/site-packages/torch/utils/data/_utils/collate.py", line 144, in collate
    return [collate(samples, collate_fn_map=collate_fn_map) for samples in transposed]  # Backwards compatibility.
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/andrems2305/anaconda3/envs/torch/lib/python3.11/site-packages/torch/utils/data/_utils/collate.py", line 144, in <listcomp>
    return [collate(samples, collate_fn_map=collate_fn_map) for samples in transposed]  # Backwards compatibility.
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/andrems2305/anaconda3/envs/torch/lib/python3.11/site-packages/torch/utils/data/_utils/collate.py", line 121, in collate
    return collate_fn_map[elem_type](batch, collate_fn_map=collate_fn_map)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/andrems2305/anaconda3/envs/torch/lib/python3.11/site-packages/torch/utils/data/_utils/collate.py", line 173, in collate_tensor_fn
    out = elem.new(storage).resize_(len(batch), *list(elem.size()))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: Trying to resize storage that is not resizable
