<a href="https://colab.research.google.com/github/NVIDIA/synthda/blob/main/synthda_slowfast.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mount Drive

In [None]:
from google.colab import drive
# Mount the user's Google Drive into the Colab filesystem at /content/MyDrive.
# NOTE(review): this import only exists inside Google Colab; outside Colab this
# cell presumably has to be skipped — confirm.
drive.mount('/content/MyDrive')

# Installation

In [None]:
# Pinned environment: torch 1.11.0 / torchvision 0.12.0 built for CUDA 11.3,
# taken from the extra PyTorch wheel index.
# NOTE(review): a later cell in this notebook reinstalls torch/torchvision from
# the cu124 index without version pins, so only one of the two install cells
# should presumably be run per session — confirm which one is intended.
!pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio===0.11.0 --extra-index-url https://download.pytorch.org/whl/cu113
!pip install 'iopath'

# pytorchvideo / fvcore are pinned to the versions listed here.
!pip install pytorchvideo==0.1.5 fvcore==0.1.5.post20221221
!pip install tensorboard
!pip install setuptools==59.5.0
!pip install torchinfo

# Libraries

In [None]:
import os
import glob
import cv2
import gc
import json
import math
import random
import numpy as np
import pandas as pd
from tqdm import tqdm

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from torch.utils.tensorboard import SummaryWriter
import torch
import torchvision
import pytorchvideo
import torchinfo

from pytorchvideo.models.hub import c2d_r50, i3d_r50, slow_r50, slowfast_r50
from pytorchvideo.models.hub import x3d_m
from torchvision.transforms.functional import normalize, crop, hflip
from torchvision.transforms._functional_video import center_crop
#from pytorchvideo.transforms.functional import short_side_scale

In [None]:
# Upgrades pytorchvideo past the 0.1.5 pin used in the installation cell.
# NOTE(review): pytorchvideo has already been imported by the "Libraries" cell
# at this point, so a kernel restart is presumably needed for the upgrade to
# take effect — confirm.
!pip install --upgrade pytorchvideo
# The pytorchvideo import below is left disabled; the next cell defines a local
# short_side_scale() replacement instead.
#from pytorchvideo.transforms.functional import short_side_scale

In [None]:
def short_side_scale(video: torch.Tensor, size: int):
    """
    Resize a video so that its shorter spatial side equals `size`, keeping the
    aspect ratio.

    Args:
        video (torch.Tensor): tensor whose last two dimensions are (H, W),
            e.g. (T, C, H, W) or (C, T, H, W).
        size (int): target length of the shorter spatial side.
    Returns:
        torch.Tensor: `video` with only its last two dimensions resized.
    """
    from torchvision.transforms.functional import resize

    # Only the trailing (H, W) dimensions matter, so the layout of the leading
    # dimensions does not need to be known (or be exactly 4-D).
    height, width = video.shape[-2:]

    # resize() expects the target as [new_height, new_width]; keep that order
    # regardless of which side is the short one so portrait clips are not
    # squashed into a landscape aspect ratio.
    if height <= width:
        new_height, new_width = size, int(size * width / height)
    else:
        new_height, new_width = int(size * height / width), size

    return resize(video, [new_height, new_width])

In [None]:
# Alternative, unpinned environment: latest torch/torchvision/torchaudio wheels
# from the CUDA 12.4 index plus the remaining dependencies.
# NOTE(review): this overrides the pinned torch==1.11.0+cu113 install from the
# "Installation" cell and runs after torch has been imported; a kernel restart
# is presumably required afterwards — confirm which environment is intended.
!pip install --upgrade pip
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
!pip install iopath fvcore pytorchvideo tensorboard setuptools torchinfo opencv-python seaborn numpy Pillow scikit-learn

# Helper Functions

In [None]:
# function to create directories
def create_dir(target_dir):
    """
    Create `target_dir` (including missing parents) if it does not exist yet.

    Creation stays best-effort: filesystem errors are reported with a warning
    instead of being raised. Unlike a bare `except`, non-filesystem exceptions
    such as KeyboardInterrupt are no longer swallowed.

    Args:
        target_dir (str): directory path to create.
    """
    try:
        # exist_ok=True also covers the race where another process creates the
        # directory between an existence check and the creation call.
        os.makedirs(target_dir, exist_ok=True)
    except OSError as err:
        print(f"Warning: could not create directory {target_dir!r}: {err}")

In [None]:
def write_action(action_filepath,action_name,action_id,action_type,tabsize=2):
    """
    Append one `label { ... }` entry to a label-map text file.

    Args:
        action_filepath (str): file to append to (created if missing).
        action_name: value written as the quoted `name` field.
        action_id: value written as the `label_id` field.
        action_type: value written as the `label_type` field.
        tabsize (int): number of spaces each leading tab is expanded to.
    """
    # Each field starts with a tab that is expanded to `tabsize` spaces on write.
    fields = (
        '\tname: "{}"'.format(action_name),
        '\tlabel_id: {}'.format(action_id),
        '\tlabel_type: {}'.format(action_type),
    )

    with open(action_filepath, 'a') as action_file:
        action_file.write('label {\n')
        for field in fields:
            action_file.write(field.expandtabs(tabsize))
            action_file.write('\n')
        action_file.write('}\n')

In [None]:
# function to get videoinfo
def get_videoinfo(videofile):
    """
    Read basic metadata of a video through OpenCV.

    Args:
        videofile (str): path of the video to inspect.
    Returns:
        dict with keys 'no_images' (frame count as int), 'fps',
        'frameSize' ((width, height) as ints) and 'fourcc' (decoded codec tag).
    Raises:
        AssertionError: if the capture cannot be opened.
    """
    capture = cv2.VideoCapture(videofile)
    assert capture.isOpened(), 'Cannot capture source'

    frame_total = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
    frame_rate = capture.get(cv2.CAP_PROP_FPS)
    width = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
    codec_tag = decode_fourcc(int(capture.get(cv2.CAP_PROP_FOURCC)))

    capture.release()

    return {
        'no_images': frame_total,
        'fps': frame_rate,
        'frameSize': (width, height),
        'fourcc': codec_tag,
    }

# function to decode fourcc
def decode_fourcc(cc):
    """
    Turn a numeric FOURCC code into its four-character string.

    The four bytes are read from least to most significant, one character each.
    """
    code = int(cc)
    letters = []
    for shift in (0, 8, 16, 24):
        letters.append(chr((code >> shift) & 0xFF))
    return "".join(letters)

In [None]:
# function to split df into train and test sets
def split(split_data,split_target,split_size,split_label_1,split_label_2,random_state=297):
    """
    Stratified split of a dataframe into two labelled folds.

    Args:
        split_data (pd.DataFrame): data to split; must contain `split_target`.
        split_target: column used as the stratification target.
        split_size: passed to `train_test_split` as `test_size`.
        split_label_1: value written to the 'fold' column of the first part.
        split_label_2: value written to the 'fold' column of the second part.
        random_state (int): seed forwarded to `train_test_split`
            (defaults to 297, the value that used to be hard-coded).
    Returns:
        (split_df, X_train, X_test): the concatenation of both parts and the
        two parts themselves; each carries the target column and a 'fold' column.
    """
    # separate features and target
    X = split_data.drop(columns=[split_target])
    y = split_data[split_target]

    # split the dataset
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=split_size, random_state=random_state, stratify=y)

    # Work on explicit copies so adding columns never writes into (or warns
    # about) frames derived from the caller's data.
    X_train = X_train.copy()
    X_train[split_target] = y_train
    X_train['fold'] = split_label_1

    X_test = X_test.copy()
    X_test[split_target] = y_test
    X_test['fold'] = split_label_2

    # concat
    split_df = pd.concat([X_train,X_test])

    return split_df, X_train, X_test

In [None]:
# convert video into images
def vid_to_img(videofile,img_dest_dir):
    """
    Decode a video and save every frame as `<index>.jpg` in `img_dest_dir`.

    Args:
        videofile (str): path of the video to decode.
        img_dest_dir (str): output directory (created if missing).
    Returns:
        list[str]: paths of the written images, in frame order.
    Raises:
        AssertionError: if the video cannot be opened.
    """
    # create img_dest_dir
    os.makedirs(img_dest_dir,exist_ok=True)

    # read video and save images
    img_paths = []
    stream = cv2.VideoCapture(videofile)
    assert stream.isOpened(), 'Cannot capture source'

    try:
        datalen = int(stream.get(cv2.CAP_PROP_FRAME_COUNT))

        for i in tqdm(range(datalen),total=datalen):
            grabbed, frame = stream.read()

            # The reported frame count is only an upper bound for this loop:
            # stop at the first frame that cannot be decoded instead of handing
            # an empty frame to cv2.imwrite.
            if not grabbed or frame is None:
                print('Warning: stopped after {} of {} frames for {}'.format(i, datalen, videofile))
                break

            img_path = os.path.join(img_dest_dir,'{}.jpg'.format(i))
            cv2.imwrite(img_path,frame)
            img_paths.append(img_path)
    finally:
        # always free the capture, even if decoding or writing fails
        stream.release()

    return img_paths

In [None]:
def create_frames_dir(in_df_video_info,in_frames_dir):
    """
    Extract the frames of every listed video into `<in_frames_dir>/<vid_id>/`.

    Args:
        in_df_video_info (pd.DataFrame): one row per video with at least the
            columns 'vid_id' and 'video_path'.
        in_frames_dir (str): root directory for the extracted frames.
    Returns:
        pd.DataFrame with columns ['vid_id', 'frame_id', 'training_frame_path'],
        one row per extracted frame.
    Raises:
        AssertionError: if a vid_id matches more than one row.
    """
    out_columns = ['vid_id','frame_id','training_frame_path']

    # Rows are collected in a list and turned into a DataFrame once at the end;
    # concatenating inside the loop copies the whole frame for every image.
    records = []

    # obtain video_ids
    video_ids = list(in_df_video_info['vid_id'])

    # copy images into frames_dir
    for i, video_id in enumerate(video_ids):
        df_video = in_df_video_info.loc[in_df_video_info['vid_id']==video_id]

        if df_video.empty:
            print(f"Warning: No video found with vid_id {video_id}. Skipping...")
            continue

        assert len(df_video) == 1

        print('Extracting images for video {} of {}'.format(i+1,len(video_ids)))
        img_dest_dir = os.path.join(in_frames_dir,str(video_id))
        os.makedirs(img_dest_dir,exist_ok=True)

        # obtain video_images
        image_paths = vid_to_img(df_video['video_path'].iloc[0],img_dest_dir)

        for image_path in image_paths:
            # the frame id is the image file name without its extension
            frame_id = int(os.path.basename(image_path).split('.')[0])
            records.append((video_id, frame_id, image_path))

    return pd.DataFrame(records, columns=out_columns)

In [None]:
def create_frame_lists_csv(in_df):
    """
    List the extracted frames of every video relative to FRAMES_DIR.

    Args:
        in_df (pd.DataFrame): one row per video; only the 'vid_id' column is read.
    Returns:
        pd.DataFrame with columns ['vid_id', 'frame_id', 'rel_path'], one row per
        '*.jpg' found in `<FRAMES_DIR>/<vid_id>/`. Frames of a video appear in
        sorted-path (lexicographic) order.
    """
    out_columns = ['vid_id','frame_id','rel_path']

    # Collect rows first and build the DataFrame once; concatenating per image
    # is quadratic in the number of frames.
    records = []

    for i, row_i in tqdm(in_df.iterrows(),total=len(in_df)):

        # obtain video info
        vid_id = row_i['vid_id']

        # obtain images
        dst_folder = os.path.join(FRAMES_DIR,str(vid_id))
        video_images = sorted(glob.glob(str(os.path.join(dst_folder,'*.jpg'))))

        for image in video_images:
            image_name = os.path.basename(image)
            frame_id = int(image_name.split('.')[0])
            rel_path = str(os.path.join(str(vid_id),image_name))
            records.append((vid_id, frame_id, rel_path))

    return pd.DataFrame(records, columns=out_columns)

In [None]:
def create_annotations_csv(in_df):
    """
    Build per-frame annotations for every video from its extracted frames.

    Args:
        in_df (pd.DataFrame): one row per video with the columns 'vid_id',
            'action_id' and 'fps'.
    Returns:
        pd.DataFrame with columns ['vid_id', 'frame_id', 'frame_timestamp',
        'action_label'], one row per '*.jpg' found in `<FRAMES_DIR>/<vid_id>/`.
        `frame_timestamp` is frame_id / fps and every frame of a video gets the
        video's action_id.
    """
    out_columns = ['vid_id', 'frame_id',
                   'frame_timestamp',
                   'action_label']

    # Collect rows first and build the DataFrame once; concatenating per image
    # is quadratic in the number of frames.
    records = []

    for i, row_i in tqdm(in_df.iterrows(),total=len(in_df)):

        # obtain necessary video info
        vid_id = row_i['vid_id']
        action_id = row_i['action_id']
        video_fps = row_i['fps']

        # obtain images
        dst_folder = os.path.join(FRAMES_DIR,str(vid_id))
        video_images = sorted(glob.glob(str(os.path.join(dst_folder,'*.jpg'))))

        for image in video_images:
            # the frame id is the last '_'-separated token of the file stem
            image_name = os.path.basename(image)
            frame_id = int(image_name.split('.')[0].split('_')[-1])
            frame_timestamp = frame_id / video_fps
            records.append((vid_id, frame_id, frame_timestamp, action_id))

    return pd.DataFrame(records, columns=out_columns)

In [None]:
def clip_sampler_random(in_last_clip_end_time,in_is_last_clip,in_frame_ids):
    """
    Randomly samples a clip of CLIP_DURATION frames from a video.

    The third return value doubles as a clip counter: it goes
    None -> 1 -> 2 -> 3 -> 4 -> 'Yes', so five clips are drawn per video before
    the caller is told to move on.

    Args:
        in_last_clip_end_time: end of the previously sampled clip (unused here;
            kept so both clip samplers share one signature)
        in_is_last_clip (None | int | str): value returned as `out_is_last_clip`
            by the previous call for this video, or None for the first clip
        in_frame_ids (list): list of all frame_ids for video
    Returns:
        out_clip_start (int): start index of the sampled clip
        out_clip_end (int): end index (exclusive) of the sampled clip
        out_is_last_clip (int | str): updated counter, or 'Yes' once the last
            clip of this video has been sampled
    """

    # Latest start that still leaves CLIP_DURATION frames; clamped at 0, so a
    # video shorter than CLIP_DURATION still yields a clip starting at 0 whose
    # end lies past the available frames.
    max_possible_clip_start = max(len(in_frame_ids) - CLIP_DURATION, 0)

    out_clip_start = random.randint(0, max_possible_clip_start)
    out_clip_end = out_clip_start + CLIP_DURATION

    if in_is_last_clip is None:
        out_is_last_clip = 1
    elif isinstance(in_is_last_clip, int) and in_is_last_clip <= 3:
        out_is_last_clip = in_is_last_clip + 1
    else:
        out_is_last_clip = 'Yes'

    return out_clip_start, out_clip_end, out_is_last_clip

def clip_sampler_uniform(in_last_clip_end_time,in_is_last_clip,in_frame_ids):
    """
    Samples consecutive, non-overlapping clips of CLIP_DURATION frames.

    Args:
        in_last_clip_end_time (int): end index (exclusive) of the previous clip
            of this video; ignored for the first clip
        in_is_last_clip (None | str): None for the first clip of a video,
            otherwise the flag returned by the previous call
        in_frame_ids (list): list of all frame_ids for video
    Returns:
        out_clip_start (int): start index of the clip
        out_clip_end (int): end index (exclusive) of the clip
        out_is_last_clip (str): 'Yes' if no further full clip fits after this
            one, else 'No'
    """

    if in_is_last_clip is None:
        out_clip_start = 0
        out_clip_end = CLIP_DURATION
    else:
        out_clip_start = in_last_clip_end_time
        out_clip_end = out_clip_start + CLIP_DURATION

    # Another clip fits as long as its end does not exceed the number of
    # frames, so only a strictly larger end marks this clip as the last one.
    # This yields len(in_frame_ids) // CLIP_DURATION clips for videos that are
    # at least one clip long.
    if out_clip_end + CLIP_DURATION > len(in_frame_ids):
        out_is_last_clip = 'Yes'
    else:
        out_is_last_clip = 'No'

    return out_clip_start, out_clip_end, out_is_last_clip

In [None]:
def RandomShortSideScale(in_video_frames,in_min_size,in_max_size):
    """
    Scale the short side of a video to a size drawn uniformly at random from
    [in_min_size, in_max_size] (both inclusive).
    """
    # torch.randint excludes its upper bound, hence the +1
    sampled = torch.randint(in_min_size, in_max_size + 1, (1,))
    target_size = sampled.item()
    return short_side_scale(in_video_frames, target_size)

In [None]:
def read_label_map(in_label_map_file):
    """
    Read label map and class ids.

    A class is registered when its `id:` / `label_id:` line is reached and is
    given the most recently seen `name:` value. Leading whitespace is ignored,
    so files written with any indentation width are accepted.

    Args:
    in_label_map_file (str): Path to a .pbtxt containing class id's and class names
    Returns:
    out_label_map (dict): A dictionary mapping class id to the associated class names.
    out_class_ids (set): A set of integer unique class id's
    """
    out_label_map = {}
    out_class_ids = set()
    name = ""
    with open(in_label_map_file, "r") as f:
        for line in f:
            stripped = line.strip()
            if stripped.startswith("name:"):
                # the name is the text between the first pair of double quotes
                name = stripped.split('"')[1]
            elif stripped.startswith("id:") or stripped.startswith("label_id:"):
                class_id = int(stripped.split(" ")[-1])
                out_label_map[class_id] = name
                out_class_ids.add(class_id)
    return out_label_map, out_class_ids

def load_image_lists(in_frame_paths_file, in_video_path_prefix):
    """
    Load the frame paths of every video from a whitespace-separated listing.

    Args:
    in_frame_paths_file (str): Path to a file in which every line has the form
        <video_name frame_id rel_path>
    in_video_path_prefix (str): Path prepended to each relative frame path to
        obtain the full frame path
    Returns:
    out_image_paths_list (dict): maps each video name to a dict
        {frame_id (int): joined frame path}, with the frame ids of every video
        inserted in ascending order.
    Raises:
    AssertionError: if a line does not split into exactly three fields.
    """
    frames_by_video = {}

    with open(in_frame_paths_file, "r") as listing:
        for raw_line in listing:
            fields = raw_line.split()
            assert len(fields) == 3
            video_key, frame_field, rel_path = fields

            video_frames = frames_by_video.setdefault(video_key, {})
            frame_no = int(frame_field)
            video_frames[frame_no] = os.path.join(in_video_path_prefix, rel_path)

    # rebuild every per-video dict so that its keys are ordered by frame id
    return {
        video_key: {frame_no: video_frames[frame_no] for frame_no in sorted(video_frames)}
        for video_key, video_frames in frames_by_video.items()
    }

def load_and_parse_labels_csv(in_frame_labels_file,in_allowed_class_ids=None):
    """
    Parse a per-frame labels .csv file.

    Args:
    in_frame_labels_file (str): Path to a comma-separated file whose lines have the form
        <video_name, frame_id, frame_timestamp, action_label>
    in_allowed_class_ids (set): class ids to keep. If None, every row is kept.
    Returns:
    out_labels_dict (dict): {video_name: {frame_id (int): info}} where info is
        {
            'frame_timestamp': timestamp of the frame (float)
            'labels': action label of the frame (int); -1 when the label field is empty
        }
    Raises:
    AssertionError: if a line does not have exactly four comma-separated fields.
    """
    parsed = {}
    with open(in_frame_labels_file, "r") as labels_file:
        for raw_line in labels_file:
            fields = raw_line.strip().split(",")
            assert len(fields) == 4

            video_key = fields[0]
            frame_no = int(fields[1])
            timestamp = float(fields[2])
            action = int(fields[3]) if fields[3] != "" else -1

            # keep the row unless a whitelist is given and excludes its label
            if in_allowed_class_ids is None or action in in_allowed_class_ids:
                entry = parsed.setdefault(video_key, {}).setdefault(frame_no, {})
                entry["frame_timestamp"] = timestamp
                entry["labels"] = action

    return parsed

def read_kinetics_data_from_csv(in_frame_paths_file,in_frame_labels_file,in_video_path_prefix,in_label_map_file=None):
    """
    Combine the frame listing and the per-frame labels into one lookup.

    Args:
        in_frame_paths_file (str): Path to a file containing relative paths
            to all the frames in the video. Each line in the file is of the form
                <video_name frame_id rel_path>
        in_frame_labels_file (str): Path to the file containing labels per frame.
            Each line is of the form
                <video_name, frame_id, frame_timestamp, action_label>
        in_video_path_prefix (str): Path prepended to each relative frame path
            to get the full frame path.
        in_label_map_file (str): Path to a .pbtxt containing class id's and class names.
            If given, only label rows whose class id appears in the label map are loaded.
    Returns:
        out_labeled_frame_paths (dict): {video_name: {frame_id: info}} where info is
            {
                'frame_timestamp': timestamp of the frame
                'labels': action label of the frame
                'frame_path': location of the frame image
            }
    Raises:
        KeyError: if a listed frame has no entry in the loaded labels.
    """
    allowed_class_ids = None
    if in_label_map_file is not None:
        allowed_class_ids = read_label_map(in_label_map_file)[1]

    paths_by_video = load_image_lists(in_frame_paths_file, in_video_path_prefix)
    labels_by_video = load_and_parse_labels_csv(in_frame_labels_file,in_allowed_class_ids=allowed_class_ids)

    combined = {}
    for video_key, frame_paths in paths_by_video.items():
        video_entry = {}
        combined[video_key] = video_entry
        for frame_no, frame_path in frame_paths.items():
            # the label record is extended in place with the frame location
            record = labels_by_video[video_key][frame_no]
            record["frame_path"] = frame_path
            video_entry[frame_no] = record

    return combined

In [None]:
def thwc_to_cthw(in_tensor):
    """
    Reorder a (time, height, width, channel) tensor to (channel, time, height, width).
    """
    channel_first_order = (3, 0, 1, 2)
    return in_tensor.permute(*channel_first_order)

def load_clip_frames(in_clip_start,in_clip_end,in_frames_info_dict):
    '''
    Read the frames in [in_clip_start, in_clip_end) from disk and stack them.

    Args:
        in_clip_start (int): first frame id of the clip
        in_clip_end (int): frame id one past the last frame of the clip
        in_frames_info_dict (dictionary): maps frame_id to a dict of the form
            {
                'frame_path': location of the frame image
                'frame_timestamp': timestamp of the frame
                'labels': action label of the frame
            }
    Returns:
        out_clip_dict (dictionary):
            {
                'video': tensor of the RGB frames, permuted by thwc_to_cthw
                'labels': list with one single-element list [label] per frame
            }
    Raises:
        KeyError: if a frame id in the range is missing from in_frames_info_dict.
        AssertionError: if the clip does not contain exactly CLIP_DURATION frames.
    '''
    rgb_frames = []
    per_frame_labels = []

    for frame_id in range(in_clip_start,in_clip_end):
        frame_info = in_frames_info_dict[frame_id]
        frame_path, frame_label = frame_info['frame_path'], frame_info['labels']

        # OpenCV loads images as BGR; convert to RGB before stacking
        rgb_frames.append(cv2.cvtColor(cv2.imread(frame_path), cv2.COLOR_BGR2RGB))
        per_frame_labels.append([frame_label])

    # a clip must contain exactly CLIP_DURATION frames
    assert len(rgb_frames) == CLIP_DURATION

    # stack along a new leading time axis, then move channels first
    clip_tensor = thwc_to_cthw(torch.as_tensor(np.stack(rgb_frames)))

    return {"video": clip_tensor,
            "labels": per_frame_labels}

In [None]:
from torch.utils.data import DistributedSampler
import torch.distributed as dist

class MultiProcessSampler(DistributedSampler):
    """
    Thin wrapper around torch's DistributedSampler that also works when no
    distributed process group has been initialised.

    Without an initialised process group (e.g. a plain single-process notebook
    run) the sampler behaves as a single replica (num_replicas=1, rank=0)
    instead of raising during construction. All other DistributedSampler
    defaults are kept.
    NOTE(review): with those defaults the wrapped object's own ordering is
    presumably not used — indices are generated (and by default shuffled) by
    DistributedSampler itself; confirm this is intended.
    """

    def __init__(self, dataset):
        """
        Args:
          dataset: sized object (anything supporting len()) to draw indices for
        """
        if dist.is_available() and dist.is_initialized():
            # distributed run: world size and rank come from the process group
            super().__init__(dataset)
        else:
            # single-process fallback: one replica that sees every index
            super().__init__(dataset, num_replicas=1, rank=0)

In [None]:
class KineticsVideoDataset(torch.utils.data.IterableDataset):
    """
    KineticsVideoDataset handles the storage, loading, decoding and clip sampling for a video dataset.
    Each video is given as a dictionary of per-frame records that point to image files on disk.
    """

    def __init__(self,in_labeled_frame_paths,in_clip_sampler,in_video_sampler,in_transform_fn=None):
        """
        Args:
          in_labeled_frame_paths: A dictionary {vid_id: {frame_id: info}} where info is a dict of the form,
            {
                'frame_path': location of video frame
                'frame_timestamp': timestamp of the frame
                'labels': action label of the frame
            }
          in_clip_sampler (Callable): Called as
            in_clip_sampler(last_clip_end, is_last_clip, frame_ids) and expected to return
            (clip_start, clip_end, is_last_clip); the video is released once 'Yes' is returned.
          in_video_sampler (Type[torch.utils.data.Sampler]): Sampler class for the videos.
            It is instantiated here with in_labeled_frame_paths as its data source.
          in_transform_fn (Callable): This callable is evaluated on the sample dictionary before
            it is returned. It can be used for user defined preprocessing and
            augmentations on the clips. The sample format is described in __next__()
        """

        # Initialize inputs
        self._MAX_CONSECUTIVE_FAILURES = 10
        self._labeled_videos = in_labeled_frame_paths
        self._clip_sampler = in_clip_sampler

        # Deal with video sampler
        # If a RandomSampler is used we need to pass in a custom random generator that ensures all PyTorch multiprocess workers have the same random seed.
        self._video_sampler_random_generator = None
        if in_video_sampler == torch.utils.data.RandomSampler:
            self._video_sampler_random_generator = torch.Generator()
            self._video_sampler = in_video_sampler(self._labeled_videos, generator=self._video_sampler_random_generator)
        else:
            self._video_sampler = in_video_sampler(self._labeled_videos)

        self._transform = in_transform_fn

        # Initialize other variables needed
        self._video_sampler_iter = None  # Initialized on first call at self.__next__()

        # Depending on the clip sampler type, we may want to sample multiple clips
        # from one video. In that case, we keep the stored video, label and previous sampled
        # clip time in these variables.
        self._loaded_video = None
        self._loaded_clip = None
        self._last_clip_end_time = None
        self._is_last_clip = None

    @property
    def num_videos(self):
        """
        Returns: Number of videos in dataset
        """
        return len(self._video_sampler)

    @property
    def num_clips(self):
        """
        Returns: Number of clips in dataset, or None when the clip sampler is
            neither clip_sampler_random nor clip_sampler_uniform.
        """

        if self._clip_sampler == clip_sampler_random:
            # the random sampler draws five clips per video
            return len(self._video_sampler) * 5
        elif self._clip_sampler == clip_sampler_uniform:
            total_clips_count = 0
            for frames_info_dict in self._labeled_videos.values():
                total_clips_count += len(frames_info_dict) // CLIP_DURATION
            return total_clips_count

        # unknown sampler: the number of clips cannot be derived here
        return None

    def __iter__(self):
        self._video_sampler_iter = None  # Reset video sampler

        # If we're in a PyTorch DataLoader multiprocessing context, we need to use the same seed for each worker's RandomSampler generator.
        # The workers at each __iter__ call are created from the unique value: worker_info.seed - worker_info.id, which we can use for this seed.
        worker_info = torch.utils.data.get_worker_info() #  If worker_info is None, then this is single-process data loading
        if self._video_sampler_random_generator is not None and worker_info is not None:
            base_seed = worker_info.seed - worker_info.id
            self._video_sampler_random_generator.manual_seed(base_seed)

        return self

    def __next__(self):
        """
        Retrieves the next sample based on the video sampler and clip sampling strategy.
        Returns: A dictionary with the following format (or whatever the
        transform function returns for it).
        sample_dict = {"vid_id": id of video,
                      "clip_index": clip_index (frame_id of first frame in clip),
                      "video": video frames,
                      "labels": labels
                        }
        Raises:
            StopIteration: when the video sampler is exhausted.
            RuntimeError: if no video could be selected within _MAX_CONSECUTIVE_FAILURES tries.
        """

        # Setup MultiProcessSampler here - after PyTorch DataLoader workers are spawned
        if not self._video_sampler_iter:
            self._video_sampler_iter = iter(MultiProcessSampler(self._video_sampler))

        # try to load next sample for _MAX_CONSECUTIVE_FAILURES
        for i_try in range(self._MAX_CONSECUTIVE_FAILURES):

            # Reuse previously stored video if there are still clips to be sampled from the last loaded video
            if self._loaded_video:
                vid_id, frame_ids, frames_info_dict = self._loaded_video
            else:
                # StopIteration raised here is intentionally not caught: it ends the iteration
                video_idx = next(self._video_sampler_iter)
                try:
                    # get info for frames (dictionary of dictionary)
                    # Key - frame_id (int):
                    # Value - {'frame_timestamp': float,
                    #'labels': action_id (int),
                    #'frame_path': frame path (str)}

                    vid_id = list(self._labeled_videos.keys())[video_idx]
                    frames_info_dict = self._labeled_videos[vid_id]
                    frame_ids = sorted(frames_info_dict)

                    self._loaded_video = (vid_id, frame_ids, frames_info_dict)

                except Exception:
                    # Skip videos that cannot be looked up, but let
                    # KeyboardInterrupt / SystemExit propagate.
                    continue

            # subsample video for clips
            clip_start,clip_end,is_last_clip = self._clip_sampler(self._last_clip_end_time,self._is_last_clip,frame_ids)

            # load the next clip
            self._loaded_clip = load_clip_frames(clip_start,clip_end,frames_info_dict)
            self._last_clip_end_time = clip_end
            self._is_last_clip = is_last_clip

            # store necessary outputs
            sample_dict = {"vid_id": vid_id,
                           "clip_index": clip_start,
                           "video": self._loaded_clip['video'],
                           "labels": self._loaded_clip['labels']}

            # carry out transformation
            if self._transform is not None:
                sample_dict = self._transform(sample_dict)

            # Close the loaded video if last clip and reset parameters
            if is_last_clip=='Yes':
                self._loaded_video = None
                self._loaded_clip = None
                self._last_clip_end_time = None
                self._is_last_clip = None

                # Force garbage collection so the released clip data is freed promptly.
                gc.collect()

            # return sample_dict as next sample
            return sample_dict

        # raise error after running through i_tries
        else:
            raise RuntimeError(f"Failed to load video after {self._MAX_CONSECUTIVE_FAILURES} retries.")


In [None]:
# helper function to create Kinetics dataset
def create_kinetics_dataset(in_frame_paths_file,in_frame_labels_file,in_video_path_prefix,in_clip_sampler,in_video_sampler,
                       in_label_map_file=None,in_transform_fn=None):
    """
    Read the frame listing and labels from disk and wrap them in a KineticsVideoDataset.

    Args:
        in_frame_paths_file (str): Path to a file containing relative paths
            to all the frames in the video. Each line in the file is of the form
                <video_name frame_id rel_path>
        in_frame_labels_file (str): Path to the file containing labels per frame;
            passed on unchanged to read_kinetics_data_from_csv.
        in_video_path_prefix (str): Path prepended to each relative frame
            path to get the full frame path.
        in_clip_sampler (Callable): Defines how clips should be sampled from each video.
        in_video_sampler (Type[torch.utils.data.Sampler]): Sampler class for the videos.
        in_label_map_file (str): Path to a .pbtxt containing class id's and class names.
            If not defined, no label map is loaded and labels are not filtered by class id.
        in_transform_fn (Optional[Callable]): This callable is evaluated on each sample before it is returned.
            If transform is None, the samples are returned as they are.
    Returns:
        KineticsVideoDataset built from the loaded frame information.
    """
    dataset_kwargs = {
        "in_labeled_frame_paths": read_kinetics_data_from_csv(
            in_frame_paths_file,in_frame_labels_file,in_video_path_prefix,in_label_map_file),
        "in_clip_sampler": in_clip_sampler,
        "in_video_sampler": in_video_sampler,
        "in_transform_fn": in_transform_fn,
    }
    return KineticsVideoDataset(**dataset_kwargs)

In [None]:
def temporal_subsample(in_video_frames,in_labels,num_samples,temporal_dim=-3):
    '''
    Uniformly subsamples num_samples indices from the temporal dimension of the video.
    When num_samples is larger than the size of temporal dimension of the video,
    frames are repeated (nearest-lower index of an equispaced grid).
    Args:
        in_video_frames (torch tensor): A video tensor with a temporal dimension
        in_labels (list | np.ndarray | torch tensor): labels corresponding to each
            video frame, indexed along their first dimension
        num_samples (int): The number of equispaced samples to be selected
        temporal_dim (int): dimension of temporal to perform temporal subsample
    Returns:
        out_video_frames (torch tensor): the selected frames
        out_labels: the labels of the selected frames; a torch tensor if
            in_labels was one, otherwise a NumPy array
    '''

    t = in_video_frames.shape[temporal_dim]
    assert num_samples > 0 and t > 0

    # equispaced positions in [0, t-1], truncated to integer frame indices
    indices = torch.linspace(0,t-1,num_samples)
    indices =  torch.clamp(indices,0,t-1).long()

    # Carry out sampling
    out_video_frames = torch.index_select(in_video_frames,temporal_dim,indices)

    if isinstance(in_labels, torch.Tensor):
        out_labels = in_labels[indices]
    else:
        # Index with a NumPy array rather than the tensor itself: plain lists
        # cannot be indexed by a tensor, and a single-element tensor may be
        # taken as a scalar index, which would drop the frame dimension.
        out_labels = np.asarray(in_labels)[indices.cpu().numpy()]

    return out_video_frames,out_labels

def Normalize(in_video_frames,in_mean,in_std):
    """
    Normalize a (C, T, H, W) video with the given mean and std.

    The first two dimensions are swapped before and after the call to
    `normalize`, so that it receives the video as (T, C, H, W).
    """
    swap_first_two = (1, 0, 2, 3)

    frames_tchw = in_video_frames.permute(*swap_first_two)  # CTHW to TCHW
    frames_tchw = normalize(frames_tchw, mean=in_mean, std=in_std)

    return frames_tchw.permute(*swap_first_two)  # TCHW to CTHW


def ShortSideScale(
    x: torch.Tensor,
    in_size: int) -> torch.Tensor:
    """
    Determines the shorter spatial dim of the video (i.e. width or height) and scales
    it to the given size. To maintain aspect ratio, the longer side is then scaled
    accordingly. Bilinear interpolation is used.
    Args:
        x (torch.Tensor): A video tensor of shape (C, T, H, W) and type torch.float32.
        in_size (int): The size the shorter side is scaled to.
    Returns:
        An x-like Tensor with scaled spatial dims.
    Raises:
        AssertionError: if x is not 4-dimensional or not float32.
    """
    assert len(x.shape) == 4
    assert x.dtype == torch.float32
    c, t, h, w = x.shape

    if w < h:
        new_h = int(math.floor((float(h) / w) * in_size))
        new_w = in_size
    else:
        new_h = in_size
        new_w = int(math.floor((float(w) / h) * in_size))

    # interpolate() resizes only the last two dims of a 4-D input, so the
    # (C, T) dims are left untouched.
    return torch.nn.functional.interpolate(x, size=(new_h, new_w), mode="bilinear", align_corners=False)

In [None]:
# for data transformations
def transform_fn(single_input):
    """
    Preprocess one sample dictionary produced by the dataset (record level).

    Args:
        single_input (dict): sample with the keys "vid_id", "clip_index",
            "video" and "labels".
    Returns:
        tuple (vid_id, clip_index, video, labels) where `video` is the
        transformed clip — a list [slow_pathway, fast_pathway] when MODEL is
        "slowfast", a single tensor otherwise — and `labels` is a LongTensor.
    """

    video_index = single_input["vid_id"]
    clip_index = single_input["clip_index"]
    video = single_input["video"]

    # convert labels to arrays
    labels = np.array(single_input["labels"])

    # Sample frames together with labels
    video, labels = temporal_subsample(video,labels,NUM_FRAMES,temporal_dim=-3)

    # Scale pixel values by 1/255 (e.g. [0, 255] --> [0, 1])
    video = video.float()
    video = video / 255.0

    # Normalize images by mean and std
    video = Normalize(video, in_mean=np.array(MEAN, dtype=np.float32), in_std=np.array(STD, dtype=np.float32))

    if MODEL == "x3d_m":
        # Deterministic spatial pipeline: short side scale + center crop
        video = ShortSideScale(video,in_size=SIDE_SIZE)
        video = center_crop(video, (CROP_SIZE,CROP_SIZE))
    else:
        # Random spatial augmentation: scale jitter, random crop, horizontal flip.
        # RandomCrop / RandomHorizontalFlip are expected to be defined elsewhere
        # in the notebook.
        video = RandomShortSideScale(video,in_min_size=MIN_SIDE_SIZE,in_max_size=MAX_SIDE_SIZE)
        video = RandomCrop(video,in_crop_size=CROP_SIZE)
        video = RandomHorizontalFlip(video,in_p=0.5)

    # Incase of slowfast, generate both pathways
    if MODEL == "slowfast":
        fast_pathway = video

        # The slow pathway keeps video.shape[1] // SLOWFAST_ALPHA equispaced
        # frames of the fast pathway (dim 1 is indexed as the temporal dim).
        slow_indices = torch.linspace(0, video.shape[1] - 1, video.shape[1] // SLOWFAST_ALPHA).long()
        slow_pathway = torch.index_select(video,1,slow_indices)

        video = [slow_pathway, fast_pathway]

    return video_index, clip_index, video, torch.from_numpy(np.array(labels)).type(torch.LongTensor)

In [None]:
# collate function
def collate_fn(batch):
    """
    Merge per-clip samples into a single batch returned as a 4-tuple.

    Each element of `batch` is either ``(video, labels)`` or
    ``(name, clip_index, video, labels)``. For 2-tuples the position of the
    clip inside the batch is used as a placeholder for both name and clip index.
    When MODEL == "slowfast", `video` is the pair [slow, fast] and the two
    pathways are stacked separately.

    Returns:
        tuple: (video_names, clip_indexes, videos, labels) where `videos` is a
        stacked tensor (or [slow_batch, fast_batch] for SlowFast) and `labels`
        is the row-wise stack (torch.vstack) of the per-clip label tensors.

    Raises:
        ValueError: if MODEL is not one of c2d, i3d, slow, slowfast.
    """
    if MODEL not in ("c2d", "i3d", "slow", "slowfast"):
        raise ValueError("collate_fn does not support MODEL={!r}".format(MODEL))

    two_pathways = MODEL == "slowfast"

    video_names_merged, clip_indexes_merged, labels_merged = [], [], []
    single_merged, slow_merged, fast_merged = [], [], []

    for clip_i, clip in enumerate(batch):

        if len(clip) == 2:
            video, labels = clip
            name, clip_index = clip_i, clip_i  # placeholders: no metadata in a 2-tuple
        else:
            name, clip_index, video, labels = clip

        video_names_merged.append(name)
        clip_indexes_merged.append(clip_index)
        labels_merged.append(labels)

        if two_pathways:
            slow_merged.append(video[0])
            fast_merged.append(video[1])
        else:
            single_merged.append(video)

    if two_pathways:
        videos_merged = [torch.stack(slow_merged), torch.stack(fast_merged)]
    else:
        videos_merged = torch.stack(single_merged)
    labels_merged = torch.vstack(labels_merged)

    return video_names_merged, clip_indexes_merged, videos_merged, labels_merged

In [None]:
# added collate function for tuples for synthda
def dict_collate_fn(batch):
    """
    Convert dataset tuples into the dict expected by the training loop.

    Each item is either ``(video, label)`` or ``(name, clip_idx, video, label)``.
    Handles single-path inputs (a tensor, or anything torch.tensor accepts)
    and SlowFast two-path inputs (a list ``[slow, fast]`` of tensors).

    Per-frame labels of any shape are collapsed to one class id per clip
    (the most frequent value), so the returned label tensor is always [B].

    Returns
    -------
    dict(video=Tensor or list[Tensor], label=LongTensor of shape [B])

    Raises
    ------
    ValueError
        If the batch mixes single-path and SlowFast items.
    """
    single_path, slow_path, fast_path, labels = [], [], [], []

    for item in batch:
        # item is either (video, label) or (name, clip_idx, video, label)
        video, label = (item[0], item[1]) if len(item) == 2 else (item[2], item[3])

        if torch.is_tensor(video):
            # already a tensor
            single_path.append(video)
        elif isinstance(video, list) and torch.is_tensor(video[0]):
            # SlowFast: keep the slow and fast tensors in separate lists
            slow_path.append(video[0])
            fast_path.append(video[1])
        else:
            # plain Python list / ndarray
            single_path.append(torch.tensor(video))

        # collapse per-frame labels -> single class id (also for 1-D label vectors)
        if not torch.is_tensor(label):
            label = torch.tensor(label)
        if label.ndim >= 1:
            label = torch.mode(label.reshape(-1), 0).values
        labels.append(label.long())

    # -------- convert videos to batched tensor(s) ----------------------
    if slow_path and single_path:
        raise ValueError("batch mixes single-path and SlowFast [slow, fast] videos")

    if slow_path:
        videos = [torch.stack(slow_path).float(), torch.stack(fast_path).float()]
    else:
        videos = torch.stack(single_path).float()

    labels = torch.stack(labels).long()        # [B]

    return {"video": videos, "label": labels}

In [None]:
# helper function to create dataloader
def create_dataloader(dataloader_type):
    '''
    Build the DataLoader for one data split.

    dataloader_type: train, val, test (any other value raises KeyError)
    '''

    # Frame list and per-frame annotations of the requested split
    frame_paths_file = os.path.join(FRAME_PATHS_FOLDER, "{}.tsv".format(dataloader_type))
    frame_labels_file = os.path.join(FRAME_LABELS_FOLDER, "{}_predicted_boxes.csv".format(dataloader_type))

    # Training draws random clips; val and test share the uniform clip sampler
    clip_samplers = {"train": clip_sampler_random}
    clip_samplers.update(dict.fromkeys(("val", "test"), clip_sampler_uniform))

    split_dataset = create_kinetics_dataset(
        in_frame_paths_file=frame_paths_file,
        in_frame_labels_file=frame_labels_file,
        in_video_path_prefix=VIDEO_PATH_PREFIX,
        in_clip_sampler=clip_samplers[dataloader_type],
        in_video_sampler=torch.utils.data.RandomSampler,
        in_label_map_file=LABEL_MAP_FILE,
        in_transform_fn=transform_fn,
    )

    # Batches are assembled by dict_collate_fn, i.e. delivered as {"video": ..., "label": ...}
    return torch.utils.data.DataLoader(
        split_dataset,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS,
        collate_fn=dict_collate_fn,
    )

In [None]:
# helper function to freeze model weights
def freeze_weights(in_model):
    """
    Disable gradients for every parameter whose name does not contain "proj".

    The model is modified in place and also returned.
    """
    assert in_model is not None

    # Parameters with "proj" in their name are left exactly as they are
    body_params = (param for name, param in in_model.named_parameters() if "proj" not in name)
    for param in body_params:
        param.requires_grad = False

    return in_model

In [None]:
# create model
def create_model(in_n_classes, model_type="c2d", freeze_body=False):

    """
      Load a pretrained video model and give it a fresh classification head.

      in_n_classes (int): The number of classes to be predicted
      model_type (str): The type of model to load. Default: c2d, Others: i3d, slow, slowfast
      freeze_body (bool): When True, freeze_weights() is applied before the head is replaced

      Returns the model (in train mode, moved to the selected device) and that device.
    """

    # model_type -> (builder of the pretrained network, index of the block that owns `proj`)
    registry = {
        "c2d": (lambda: c2d_r50(pretrained=True), 6),
        "i3d": (lambda: i3d_r50(pretrained=True), 6),
        "slow": (lambda: slow_r50(pretrained=True), 5),
        "slowfast": (lambda: slowfast_r50(pretrained=True), 6),
    }

    if model_type not in registry:
        raise Exception("Please check that type of model is either c2d, i3d, slow or slowfast")

    build_pretrained, model_last_layer = registry[model_type]

    # load pretrained weights and switch to train mode
    video_model = build_pretrained().train()

    # freeze body layers
    if freeze_body:
        video_model = freeze_weights(video_model)

    # Swap the prediction head for a Linear layer with in_n_classes outputs
    head_block = video_model.blocks[model_last_layer]
    head_block.proj = torch.nn.Linear(head_block.proj.in_features, in_n_classes)

    # Use the GPU when one is available, otherwise fall back to the CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device:", device)

    return video_model.to(device), device


In [None]:
# helper functions to write and load arguments
def write_args(params, output_path):
    """Write `params` to `output_path` as UTF-8 JSON indented by 4 spaces."""
    with open(output_path, "w", encoding = "UTF-8") as handle:
        handle.write(json.dumps(params, indent=4))

def load_args(path):
    """Read the UTF-8 JSON file at `path` and return the decoded object."""
    with open(path, "r", encoding = "UTF-8") as handle:
        return json.loads(handle.read())

In [None]:
# helper function to get predictions
def get_predictions(in_batch, in_model, in_device):
    """
    Run the model on one batch.

    Accepted batch layouts:
      • dict   {"video": tensor|list[tensor], "label": tensor}
      • tuple  (video_idx, clip_idx, video, label)

    Returns
    -------
    logits, labels   (labels are moved to `in_device` and flattened to 1-D)

    Raises
    ------
    ValueError if MODEL is "slowfast" but the video is not a list.
    """
    # 1. pull video and labels out of the batch
    if isinstance(in_batch, dict):
        video, labels = in_batch["video"], in_batch["label"]
    else:
        _video_idx, _clip_idx, video, labels = in_batch

    # 2. move the video (or both pathways) to the device
    if MODEL != "slowfast":
        video = video.to(in_device, non_blocking=True)
    elif isinstance(video, list):
        video = [pathway.to(in_device, non_blocking=True) for pathway in video]
    else:
        raise ValueError("SlowFast expects a list [slow, fast]")

    # 3. labels to the device, flattened
    labels = labels.to(in_device, non_blocking=True).view(-1)

    # 4. forward pass
    return in_model(video), labels

# helper function to calculate model loss
def get_loss(in_labels_pred, in_labels):
    """
    Cross-entropy loss between clip logits and their target classes.

    Args:
        in_labels_pred (Tensor): logits of shape (B, n_classes).
        in_labels (LongTensor): flattened labels holding either one label per
            clip (B,) or several labels per clip laid out clip after clip
            (B * k,). In the latter case the first label of each clip is the
            target, which for B == 1 is exactly `in_labels[0]`.

    Returns:
        Tensor: scalar loss averaged over the batch.
    """
    n_clips = in_labels_pred.shape[0]

    # One row of labels per clip; the first entry of each row is that clip's target
    clip_targets = in_labels.reshape(n_clips, -1)[:, 0]

    # Class-index targets give the same value as one-hot probability targets,
    # but work for any batch size
    return torch.nn.functional.cross_entropy(in_labels_pred, clip_targets)

In [None]:
# helper function for validation
def validate(in_model, in_dataloader, in_device):
    """
    Evaluate the model on every batch of `in_dataloader`.

    One class is predicted per clip. When a batch carries several labels per
    clip (flattened clip after clip), each clip's prediction is repeated for
    all of its labels so predictions and labels stay aligned for any batch size.

    Returns:
        tuple: (average loss per batch, accuracy in %, macro F1 score in %).

    Raises:
        ValueError: if the dataloader yields no batches, or the number of
            labels in a batch is not a multiple of the number of clips.

    The model is switched to eval mode for the pass and to train mode afterwards.
    """

    print("Validating model...")

    in_model.eval() # To turn off dropout / use running batch-norm statistics
    total_loss, total_batch = 0, 0
    all_labels_pred, all_labels = [],[]
    n_iter = int(np.ceil(in_dataloader.dataset.num_clips / in_dataloader.batch_size))

    # No backward pass happens here, so gradients are neither computed nor stored
    pbar = tqdm(total=n_iter)
    with torch.no_grad():

        for batch in in_dataloader:

            logits, labels = get_predictions(batch, in_model, in_device)

            loss = get_loss(logits, labels)

            # argmax over the logits (softmax is monotonic, so it is not needed here)
            clip_pred = torch.argmax(logits, dim=-1).reshape(-1)

            if clip_pred.numel() == 0 or labels.numel() % clip_pred.numel() != 0:
                raise ValueError("Cannot align {} labels with {} clip predictions".format(labels.numel(), clip_pred.numel()))

            # Repeat each clip's prediction once per label of that clip
            labels_pred = clip_pred.repeat_interleave(labels.numel() // clip_pred.numel())

            all_labels_pred.extend(labels_pred.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

            total_loss += loss.item()
            total_batch += 1

            pbar.update(1)

    pbar.close()
    in_model.train()

    if total_batch == 0:
        raise ValueError("validate() received a dataloader that yielded no batches")

    avg_loss = total_loss / total_batch
    acc = accuracy_score(all_labels, all_labels_pred) * 100
    f_score = f1_score(all_labels, all_labels_pred, average = "macro") * 100

    return avg_loss, acc, f_score

In [None]:
# helper function to save model weights
def save_model(save_folder, in_model, save_name):
    """
    Save the model's state_dict as `<save_folder>/<save_name>.pth`.

    When the model exposes a `module` attribute (presumably a parallel
    wrapper), the state_dict of that inner module is saved instead.
    """
    weights_owner = in_model.module if hasattr(in_model, "module") else in_model
    checkpoint_file = os.path.join(save_folder, "{}.pth".format(save_name))
    torch.save(weights_owner.state_dict(), checkpoint_file)

In [None]:
def train(in_model, in_optimizer, in_dataloaders, in_model_save_dir, in_log_dir, in_device):
    """
    Train the model for N_EPOCHS epochs and checkpoint the best weights.

    Args:
        in_model: model to optimise.
        in_optimizer: optimizer bound to the model's parameters.
        in_dataloaders (dict): must provide the "train" and "val" dataloaders.
        in_model_save_dir (str): folder that receives the checkpoints.
        in_log_dir (str): TensorBoard log directory.
        in_device: device forwarded to get_predictions / validate.

    After each epoch the train and val splits are scored with validate(),
    the metrics are logged, and the weights are saved as best_model_loss,
    best_model_acc and best_model_f_score whenever the corresponding
    validation metric is at least as good as the best seen so far.
    """

    # for data logging
    writer = SummaryWriter(log_dir=in_log_dir)

    try:
        # inf instead of a fixed cap, so the first epoch always sets the loss baseline
        best_loss, best_acc, best_f_score = float("inf"), 0, 0

        train_loader = in_dataloaders["train"]
        print("num_clips:", train_loader.dataset.num_clips)
        print("batch_size:", train_loader.batch_size)  # Should be an integer

        n_iter = int(np.ceil(train_loader.dataset.num_clips / train_loader.batch_size))
        for epoch in range(N_EPOCHS):

            print('Epoch {} of {}'.format(epoch+1,N_EPOCHS))

            # Single pass per epoch: the shape report is printed for the very
            # batch that is trained on, instead of in a separate full pass
            for i, batch in tqdm(enumerate(train_loader), total=n_iter):

                if isinstance(batch, dict):
                    if isinstance(batch["video"], list):                     # SlowFast
                        vid_shapes = [tuple(t.shape) for t in batch["video"]]
                    else:                                                    # single-path
                        vid_shapes = tuple(batch["video"].shape)
                    print(f"\nBatch {i}: video {vid_shapes}, label {batch['label'].shape}")

                in_model.train()
                in_optimizer.zero_grad()

                labels_pred, labels = get_predictions(batch, in_model, in_device)
                loss = get_loss(labels_pred, labels)

                # Backprop here
                loss.backward()
                in_optimizer.step()

            # Calculate metrics after every epoch
            avg_train_loss, train_acc, train_f_score = validate(in_model, in_dataloaders["train"], in_device)
            writer.add_scalar("Loss/train", avg_train_loss, epoch)
            writer.add_scalar("Accuracy/train", train_acc, epoch)
            writer.add_scalar("FScore/train", train_f_score, epoch)

            avg_val_loss, val_acc, val_f_score = validate(in_model, in_dataloaders["val"], in_device)
            writer.add_scalar("Loss/val", avg_val_loss, epoch)
            writer.add_scalar("Accuracy/val", val_acc, epoch)
            writer.add_scalar("FScore/val", val_f_score, epoch)

            if avg_val_loss <= best_loss:
                best_loss = avg_val_loss
                writer.add_scalar("Saved_models/val_loss", best_loss, epoch)
                save_model(in_model_save_dir, in_model, "best_model_loss")

            if val_acc >= best_acc:
                best_acc = val_acc
                writer.add_scalar("Saved_models/val_acc", best_acc, epoch)
                save_model(in_model_save_dir, in_model, "best_model_acc")

            if val_f_score >= best_f_score:
                best_f_score = val_f_score
                writer.add_scalar("Saved_models/val_f_score", best_f_score, epoch)
                save_model(in_model_save_dir, in_model, "best_model_f_score")
    finally:
        # Flush and release the event file even when training is interrupted
        writer.flush()
        writer.close()

In [None]:
def temporal_subsample(in_video_frames,in_labels,num_samples,temporal_dim=-3):
    '''
    Pick num_samples equispaced positions along the temporal dimension and
    return the frames and labels found at those positions.
    When num_samples exceeds the number of available frames, positions repeat
    (fractional positions are truncated to integer frame indices).
    Args:
        in_video_frames (torch tensor): A video tensor with a temporal dimension
        in_labels (list): labels corresponding to each video frame
        num_samples (int): The number of equispaced samples to be selected
        temporal_dim (int): dimension along which frames are selected
    Returns:
        (subsampled video tensor, list of the matching labels)
    '''

    n_frames = in_video_frames.shape[temporal_dim]
    assert num_samples > 0 and n_frames > 0

    # Equispaced positions in [0, n_frames - 1], truncated to integer indices
    last_frame = n_frames - 1
    picks = torch.linspace(0, last_frame, num_samples).clamp(0, last_frame).long()

    # Select the frames first, then gather the label of every selected frame
    out_video_frames = torch.index_select(in_video_frames, temporal_dim, picks)
    out_labels = []
    for frame_no in picks.tolist():
        out_labels.append(in_labels[frame_no])

    return out_video_frames,out_labels

def get_dimensions(in_tensor):
    """
    Return [channels, height, width] read from the trailing dimensions.

    A 2-D tensor counts as a single-channel image; fewer than two
    dimensions raises TypeError.
    """
    if in_tensor.ndim < 2:
        raise TypeError("Tensor is not a torch image")

    height, width = in_tensor.shape[-2:]
    if in_tensor.ndim == 2:
        channels = 1
    else:
        channels = in_tensor.shape[-3]

    return [channels, height, width]

def Normalize(in_video_frames,in_mean,in_std):
    """
    Apply `normalize` with the given mean/std to a 4-D video tensor.

    The first two axes are swapped before the call (CTHW to TCHW) and
    swapped back afterwards, so the result has the layout of the input.
    """
    frames_tchw = in_video_frames.permute(1, 0, 2, 3)  # CTHW to TCHW
    frames_tchw = normalize(frames_tchw, mean=in_mean, std=in_std)

    return frames_tchw.permute(1, 0, 2, 3)  # TCHW to CTHW

def RandomShortSideScale(in_video_frames,in_min_size,in_max_size):
    """Scale the video with short_side_scale to a random size drawn from [in_min_size, in_max_size]."""
    # randint excludes its upper bound, hence the + 1 to make in_max_size reachable
    drawn = torch.randint(in_min_size, in_max_size + 1, (1,))
    return short_side_scale(in_video_frames, drawn.item())

def RandomCrop(in_video_frames,in_crop_size):
    """
    Crop a random square window of side `in_crop_size` from the frames.

    Raises ValueError when the window does not fit. When the frames already
    have exactly the requested size, no random numbers are drawn.
    """
    th = tw = int(in_crop_size)
    _, h, w = get_dimensions(in_video_frames)

    if h < th or w < tw:
        raise ValueError(f"Required crop size {(th, tw)} is larger than input image size {(h, w)}")

    top, left = 0, 0
    if not (w == tw and h == th):
        # vertical offset is drawn first, then the horizontal one
        top = torch.randint(0, h - th + 1, size=(1,)).item()
        left = torch.randint(0, w - tw + 1, size=(1,)).item()

    return crop(in_video_frames, top, left, th, tw)

def RandomHorizontalFlip(in_video_frames,in_p=0.5):
    """Return the frames passed through hflip with probability `in_p`, otherwise unchanged."""
    flip_now = torch.rand(1) < in_p
    return hflip(in_video_frames) if flip_now else in_video_frames

In [None]:
# helper function for getting model predictions
def obtain_model_predictions(in_model, in_dataloader, in_device):
    """
    Run the model over `in_dataloader` and tabulate predictions against labels.

    One class is predicted per clip. When a batch carries several labels per
    clip (flattened clip after clip), each clip's prediction is repeated for
    all of its labels so both columns stay aligned for any batch size.

    Returns:
        pandas.DataFrame with the columns 'label', 'prediction' and 'result'
        ('correct' when label and prediction match, otherwise 'incorrect').

    Raises:
        ValueError: if the labels of a batch cannot be split evenly over its clips.

    The model is switched to eval mode for the pass and to train mode afterwards.
    """

    print("Obtaining predictions...")

    in_model.eval() # To turn off dropout / use running batch-norm statistics
    out_all_labels_pred, out_all_labels = [],[]
    n_iter = int(np.ceil(in_dataloader.dataset.num_clips / in_dataloader.batch_size))

    # No backward pass happens here, so gradients are neither computed nor stored
    pbar = tqdm(total=n_iter)
    with torch.no_grad():

        for batch in in_dataloader:

            logits, labels = get_predictions(batch, in_model, in_device)

            # argmax over the logits (softmax is monotonic, so it is not needed here)
            clip_pred = torch.argmax(logits, dim=-1).reshape(-1)

            if clip_pred.numel() == 0 or labels.numel() % clip_pred.numel() != 0:
                raise ValueError("Cannot align {} labels with {} clip predictions".format(labels.numel(), clip_pred.numel()))

            # Repeat each clip's prediction once per label of that clip
            labels_pred = clip_pred.repeat_interleave(labels.numel() // clip_pred.numel())

            out_all_labels_pred.extend(labels_pred.cpu().tolist())
            out_all_labels.extend(labels.cpu().tolist())

            pbar.update(1)

    pbar.close()
    in_model.train()

    # store relevant outputs as df
    assert len(out_all_labels) == len(out_all_labels_pred)

    out_df_model_preds = pd.DataFrame({'label': out_all_labels,
                                       'prediction': out_all_labels_pred})

    # Vectorised comparison instead of a row-by-row loop
    is_correct = out_df_model_preds['label'] == out_df_model_preds['prediction']
    out_df_model_preds['result'] = np.where(is_correct, 'correct', 'incorrect')

    return out_df_model_preds

def obtain_other_model_predictions(in_model, in_dataloader, in_device, in_other_to_model_id_dict):
    """
    Same as obtain_model_predictions, for a dataset whose label ids differ
    from the ids the model predicts.

    Args:
        in_model, in_dataloader, in_device: forwarded to obtain_model_predictions.
        in_other_to_model_id_dict (dict): maps the dataset's label ids onto the
            model's ids. Labels missing from the mapping become NaN and are
            therefore reported as 'incorrect'.

    Returns:
        pandas.DataFrame with the columns 'label' (remapped), 'prediction'
        and 'result' ('correct' / 'incorrect').
    """

    # The inference loop is shared with obtain_model_predictions
    out_df_model_preds = obtain_model_predictions(in_model, in_dataloader, in_device)

    # Translate the labels into the model's id space, then re-evaluate the match
    out_df_model_preds['label'] = out_df_model_preds['label'].map(in_other_to_model_id_dict)

    is_correct = out_df_model_preds['label'] == out_df_model_preds['prediction']
    out_df_model_preds['result'] = np.where(is_correct, 'correct', 'incorrect')

    return out_df_model_preds

In [None]:
def per_class_accuracy(in_df_results,in_target_class):
    """
    Summarise correct / incorrect counts per class, plus an 'Overall' row.

    Args:
        in_df_results (DataFrame): needs a 'result' column holding 'correct' or
            'incorrect' and the column named by `in_target_class`.
        in_target_class (str): column whose distinct values define the classes.

    Returns:
        DataFrame with the columns [in_target_class, 'action', 'total',
        'correct', 'incorrect', 'per_correct', 'per_incorrect']: one row per
        class (sorted) followed by the 'Overall' row. 'action' is looked up in
        ACTION_ID_DICT; percentages are strings such as '66.67 %'.
    """

    def _percent(part, whole):
        return '{} %'.format(round(part/whole*100,2))

    def _record(class_value, total, correct, incorrect):
        # every row must be either correct or incorrect
        assert correct + incorrect == total
        return {in_target_class: class_value,
                'total': total,
                'correct': correct,
                'incorrect': incorrect,
                'per_correct': _percent(correct, total),
                'per_incorrect': _percent(incorrect, total)}

    column_order = [in_target_class,'action','total','correct','incorrect','per_correct','per_incorrect']

    # Collect plain records and build the frame once (no concat inside the loop)
    records = []
    for target_class in sorted(set(in_df_results[in_target_class])):
        class_results = in_df_results.loc[in_df_results[in_target_class]==target_class, 'result']
        records.append(_record(target_class,
                               len(class_results),
                               int((class_results=='correct').sum()),
                               int((class_results=='incorrect').sum())))

    out_class_accuracy = pd.DataFrame(records, columns=[c for c in column_order if c != 'action'])

    # get action
    out_class_accuracy['action'] = out_class_accuracy[in_target_class].map(ACTION_ID_DICT)
    out_class_accuracy = out_class_accuracy[column_order]

    # determine overall
    overall = _record('Overall',
                      int(out_class_accuracy['total'].sum()),
                      int(out_class_accuracy['correct'].sum()),
                      int(out_class_accuracy['incorrect'].sum()))
    overall['action'] = 'Overall'

    return pd.concat([out_class_accuracy, pd.DataFrame([overall], columns=column_order)],
                     ignore_index=True)

In [None]:
def generate_confusion_data(in_df_match,in_label_name,in_pred_name,remove_correct=False):
    """
    Build long-format confusion-matrix data.

    Args:
        in_df_match (DataFrame): one row per sample.
        in_label_name (str): column holding the true class.
        in_pred_name (str): column holding the predicted class.
        remove_correct (bool): when True, the diagonal (label == prediction)
            is reported as 0 so only the confusions remain.

    Returns:
        DataFrame with the columns ['label', 'prediction', 'number',
        'percentage']: one row per (label, prediction) pair for every label
        that occurs at least once. 'percentage' is relative to the number of
        samples of that label, rounded to two decimals.
    """

    # every class seen as a label or as a prediction
    unique_classes = sorted(set(in_df_match[in_label_name]) | set(in_df_match[in_pred_name]))

    # Collect plain records and build the frame once (no concat inside the loop)
    records = []
    for unique_label in unique_classes:

        label_preds = in_df_match.loc[in_df_match[in_label_name]==unique_label, in_pred_name]
        label_total = float(len(label_preds))

        # classes that only ever occur as predictions have no row of their own
        if label_total == 0:
            continue

        for unique_pred in unique_classes:

            if remove_correct and unique_label == unique_pred:
                pred_total = 0.0
            else:
                pred_total = float((label_preds == unique_pred).sum())

            records.append({'label': unique_label,
                            'prediction': unique_pred,
                            'number': pred_total,
                            'percentage': round(pred_total/label_total*100,2)})

    return pd.DataFrame(records, columns=['label','prediction','number','percentage'])

# Variables

In [None]:
from pathlib import Path
# path variables
pri_dir = '/content/MyDrive/MyDrive/autosynthda/action-recognition/'
raw_data_dir = os.path.join(pri_dir, 'raw_data')
checkpoint_dir = os.path.join(pri_dir, 'checkpoint')

training_data_dir = os.path.join(pri_dir, 'training_data')
FRAMES_DIR = os.path.join(training_data_dir, 'frames')
FRAME_PATHS_FOLDER = os.path.join(training_data_dir, 'frame_lists')
FRAME_LABELS_FOLDER = os.path.join(training_data_dir, 'annotations')
LABEL_MAP_FILE = os.path.join(FRAME_LABELS_FOLDER,"action_list.pbtxt")
VIDEO_PATH_PREFIX = FRAMES_DIR + '/'

# Print directory paths
print("Raw Videos Directory:", raw_data_dir)
print("Frames Directory:", FRAMES_DIR)
print("Annotations Directory:", FRAME_LABELS_FOLDER)
print("Training Data Directory:", training_data_dir)
print("Checkpoint Directory:", checkpoint_dir)

!ls /content/MyDrive/MyDrive/autosynthda/action-recognition/
pri_dir      = Path("/content/MyDrive/MyDrive/autosynthda/action-recognition")
raw_data_dir = pri_dir / "raw_data"

for p in (pri_dir, raw_data_dir):
    p = Path(p)                       # ← guarantees a Path instance
    if not p.exists():
        raise FileNotFoundError(f"{p} does not exist.")
    if not p.is_dir():
        raise NotADirectoryError(f"{p} exists but is not a directory.")

pri_dir      = Path("/content/MyDrive/MyDrive/autosynthda/action-recognition")
raw_data_dir = pri_dir / "raw_data"            # this folder has train/val/test inside
FOLDS        = ["train", "val", "test"]

print("Root project dir :", pri_dir)
print("Raw videos dir   :", raw_data_dir)


In [None]:

# model variables
DATA_TYPE = "kinetics"
DATASET_NAME = "kinetics"
# MODEL is read by transform_fn, collate_fn and get_predictions to pick the single-path or SlowFast code path
MODEL = "slowfast" #c2d, i3d, slow, slowfast
FREEZE_BODY = True

# transform variables
# Bounds of the random short-side scale applied in transform_fn
MIN_SIDE_SIZE = 256
MAX_SIDE_SIZE = 320
# NOTE(review): 244 looks like it could be a typo for the more common 224 - confirm it is intentional
CROP_SIZE = 244
# Per-channel mean / std passed to Normalize in transform_fn
MEAN = [0.45, 0.45, 0.45]
STD = [0.225, 0.225, 0.225]

# transform_fn keeps NUM_FRAMES // SLOWFAST_ALPHA frames for the slow pathway
SLOWFAST_ALPHA = 4
# NOTE(review): NUM_FRAMES is only assigned for the four models listed here; any other
# MODEL value leaves it undefined (or stale from an earlier run of this cell)
if MODEL in ["c2d","i3d","slow"]:
    NUM_FRAMES = 16
elif MODEL == "slowfast":
    NUM_FRAMES = 32
CLIP_SAMPLING_RATE = 1
CLIP_DURATION = (NUM_FRAMES * CLIP_SAMPLING_RATE)

# FPS = 30
# NOTE(review): this second assignment overrides the one above, so CLIP_SAMPLING_RATE
# has no effect on CLIP_DURATION; the two only agree while CLIP_SAMPLING_RATE == 1
CLIP_DURATION = NUM_FRAMES # clip duration is in terms of num of frames

N_CLASSES = 2 # Change the number of classes here

# dataloader variables
NUM_WORKERS = 0
BATCH_SIZE = 1

# training variables
N_EPOCHS = 3
LEARNING_RATE = 1e-3


In [None]:
# set the random seed
# One shared value seeds Python's `random`, NumPy and PyTorch; the random transforms
# in this notebook (RandomShortSideScale, RandomCrop, RandomHorizontalFlip) draw from torch
RANDOM_SEED = 17

random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Prep Training Data

## Define Actions

In [None]:
# define action type
action_type_dict = {'adl':'human',
                    'fall':'human'
                    }
# id -> action name, derived from the insertion order of action_type_dict
# so the two mappings cannot drift apart
ACTION_ID_DICT = dict(enumerate(action_type_dict))


# start from an empty action file (mode 'w' truncates an existing file)
action_filepath = os.path.join(FRAME_LABELS_FOLDER,'action_list.pbtxt')
with open(action_filepath, 'w') as file:
  pass


# write one entry per action, ids following the order above
for action_id, action_name in ACTION_ID_DICT.items():
  write_action(action_filepath,action_name,action_id,action_type_dict[action_name],tabsize=2)

# check
with open(action_filepath, 'r') as action_file:
    content = action_file.read()
    print(content)

## Obtain Videos [Random Split]

In [None]:
# obtain video information
#df_videoinfo = pd.DataFrame()
#vid_id = 0

#action_names = glob.glob(os.path.join(raw_data_dir,"*"))
#action_names = [os.path.basename(action_name) for action_name in action_names]

#for action_name in action_names:
 # action_dir = os.path.join(raw_data_dir,action_name)
 # video_paths = glob.glob(os.path.join(action_dir,"*"))
 # for video_path in video_paths:
 #   videoinfo = get_videoinfo(video_path)
 #   videoinfo['action_name'] = action_name
 #   videoinfo['video_path'] = video_path
 #   videoinfo['vid_id'] = vid_id
 #   videoinfo['action_id'] = list(action_type_dict).index(action_name)

  #  df_dict = pd.DataFrame.from_dict([videoinfo])
  #  df_videoinfo = pd.concat([df_videoinfo,df_dict],ignore_index=True)

  #  vid_id += 1

# rearrange columns
#df_videoinfo = df_videoinfo[['vid_id','video_path','fps','no_images','frameSize','action_name','action_id']]
#df_videoinfo

In [None]:
# remove videos that do not have more than NUM_FRAMES frames
#df_videoinfo = df_videoinfo.loc[df_videoinfo['no_images']>NUM_FRAMES]
#df_videoinfo.reset_index(inplace=True,drop=True)
#df_videoinfo

In [None]:
# check number of videos
#for action_name in action_names:
  #num_vids = len(df_videoinfo[df_videoinfo['action_name']==action_name])
  #print(action_name,num_vids)

## Create Train Val Test Split [Random Split]

In [None]:
# train val test split
#split_df, X_train, X_test = split(df_videoinfo,'action_id',0.2,'train','test')
#split_df, X_train, X_val = split(X_train,'action_id',0.2,'train','val')

# create new df_videoinfo
#df_videoinfo = pd.concat([X_train,X_val,X_test],ignore_index=True)
#df_videoinfo = df_videoinfo[['vid_id','video_path','fps','no_images','frameSize','action_name','action_id','fold']]

#df_videoinfo.sort_values(by=['vid_id'],axis=0,inplace=True)
#df_videoinfo.reset_index(inplace=True,drop=True)
#df_videoinfo

In [None]:
# save
#df_videoinfo.to_csv(os.path.join(training_data_dir,'video_info.csv'),index=0)

## Obtain Videos [Fixed Split for AutoSynthDa]

In [None]:
from pathlib import Path
import pandas as pd

# Folder name of each action -> class id (an unexpected folder name raises KeyError)
ACTION_NAME_TO_ID = {"adl": 0, "fall": 1}

# One dict per video, walking raw_data_dir/<fold>/<action>/<video>
rows, vid_id = [], 0
for fold in FOLDS:
    # iterdir() / glob() yield entries in arbitrary order; sorting makes the
    # vid_id assignment reproducible from run to run
    for action_dir in sorted((raw_data_dir / fold).iterdir()):
        if not action_dir.is_dir():
            continue
        action_name = action_dir.name
        for video_path in sorted(action_dir.glob("*")):
            info = get_videoinfo(str(video_path))
            info.update({
                "vid_id"     : vid_id,
                "video_path" : str(video_path),   # ← contains the fold
                "action_name": action_name,
                "action_id"  : ACTION_NAME_TO_ID[action_name],
                "fold"       : fold
            })
            rows.append(info)
            vid_id += 1



#df_videoinfo = df_videoinfo[['vid_id','video_path','fps','no_images','frameSize','action_name','action_id']]

## Create Train Val Test Split [Fixed Split for AutoSynthDa]

In [None]:
#df_videoinfo = (pd.DataFrame(df_rows)
                  #.loc[lambda d: d["no_images"] > NUM_FRAMES]   # keep long enough
                  #.reset_index(drop=True)
                  #.loc[:, ['vid_id', 'video_path', 'fps', 'no_images',
                          #'frameSize', 'action_name', 'action_id', 'fold']])

FOLDS = ["train", "val", "test"]          # the three constant splits

# ── 1.  keep only videos with enough frames, *per fold* ────────────────
# The filter runs once over all rows (every fold at the same time). The comparison is
# strict, so a video with exactly NUM_FRAMES frames is dropped as well.
# NOTE(review): confirm `>` rather than `>=` is intended.
# vid_id keeps the value assigned while scanning, so ids are not contiguous after filtering.
df_videoinfo = (pd.DataFrame(rows)
                  .loc[lambda d: d["no_images"] > NUM_FRAMES]
                  .reset_index(drop=True))

# NOTE: from here on training_data_dir is a Path; the Variables cell defined it as a str
# via os.path.join. The os.path.join(training_data_dir, ...) calls below accept both.
pri_dir            = Path("/content/MyDrive/MyDrive/autosynthda/action-recognition")
training_data_dir  = pri_dir / "training_data"         # ← now a Path object

# Persist the table; the following sections re-read it from this CSV
df_videoinfo.to_csv(training_data_dir / "video_info.csv", index=False)

# Spot check: the first few video paths of every fold
for fold in FOLDS:
    print(df_videoinfo.query("fold == @fold")["video_path"].head().tolist())

In [None]:
#action_names = glob.glob(os.path.join(raw_data_dir,"*"))
#action_names = [os.path.basename(action_name) for action_name in action_names]
#for action_name in action_names:
  #num_vids = len(df_videoinfo[df_videoinfo['action_name']==action_name])
  #print(action_name,num_vids)


df_videoinfo
#print(df_videoinfo["video_path"].head().tolist())


## Count videos and double check folders

In [None]:
# number of rows (i.e. videos) for every (class, split) pair
videos_per_group = df_videoinfo.groupby(["action_name", "fold"]).size()

# one row per class, one column per split; combinations without videos count as 0
summary = videos_per_group.unstack(fill_value=0)
summary = summary.assign(total=summary.sum(1))   # total per class
summary = summary.sort_index()                   # alphabetical by class name

print(summary)        # nice table view

## Create Frames_dir [all]

In [None]:
# read video info
print(training_data_dir)
df_videoinfo = pd.read_csv(os.path.join(training_data_dir,'video_info.csv'))

print(df_videoinfo["video_path"].head().tolist())
df_videoinfo

In [None]:
from pathlib import Path

# Collect every listed video path that does not exist on disk.
bad = list(filter(lambda p: not Path(p).exists(), df_videoinfo["video_path"]))

# Report how many are missing and show the first few offenders.
print(f"{len(bad)=}")
print(bad[:5])

In [None]:
# Build the frame table by passing the video table and FRAMES_DIR to
# create_frames_dir (helper defined elsewhere in the notebook); the result is
# rendered as the cell output.
df_frame_info = create_frames_dir(df_videoinfo, FRAMES_DIR)
df_frame_info

In [None]:
# Persist the frame table next to video_info.csv, without the row index.
frame_info_csv = os.path.join(training_data_dir, 'frame_info.csv')
df_frame_info.to_csv(frame_info_csv, index=False)

## Prep frame_lists_dir

In [None]:
# Reload the video table from disk and render it as the cell output.
video_info_csv = os.path.join(training_data_dir, 'video_info.csv')
df_videoinfo = pd.read_csv(video_info_csv)
df_videoinfo

In [None]:
# Frame-list table for the train split (written out as train.tsv by the save cell).
train_df = df_videoinfo.loc[df_videoinfo['fold'].eq('train')]
df_train = create_frame_lists_csv(train_df)
df_train

In [None]:
# Frame-list table for the val split (written out as val.tsv by the save cell).
val = df_videoinfo.loc[df_videoinfo['fold'].eq('val')]
df_val = create_frame_lists_csv(val)
df_val

In [None]:
# Frame-list table for the test split (written out as test.tsv by the save cell).
test = df_videoinfo.loc[df_videoinfo['fold'].eq('test')]
df_test = create_frame_lists_csv(test)
df_test

In [None]:
# save: one headerless, tab-separated frame list per split
for split_name, split_frames in (("train", df_train), ("val", df_val), ("test", df_test)):
    split_tsv = os.path.join(FRAME_PATHS_FOLDER, f"{split_name}.tsv")
    split_frames.to_csv(split_tsv, index=False, header=False, sep="\t")

## Prep annotations_dir

In [None]:
# Reload the video table from disk before building the annotation tables.
video_info_csv = os.path.join(training_data_dir, 'video_info.csv')
df_videoinfo = pd.read_csv(video_info_csv)

# Peek at the first few paths, then render the table as the cell output.
print(df_videoinfo["video_path"].head().tolist())
df_videoinfo

In [None]:
# Annotation table for the train split (saved as train_predicted_boxes.csv by the save cell).
train_df = df_videoinfo.loc[df_videoinfo['fold'].eq('train')]
df_train = create_annotations_csv(train_df)
df_train

In [None]:
# Annotation table for the val split (saved as val_predicted_boxes.csv by the save cell).
val = df_videoinfo.loc[df_videoinfo['fold'].eq('val')]
df_val = create_annotations_csv(val)
df_val

In [None]:
# Annotation table for the test split (saved as test_predicted_boxes.csv by the save cell).
test = df_videoinfo.loc[df_videoinfo['fold'].eq('test')]
df_test = create_annotations_csv(test)
df_test

In [None]:
# save: one headerless CSV of predicted boxes per split
for split_name, split_boxes in (("train", df_train), ("val", df_val), ("test", df_test)):
    boxes_csv = os.path.join(FRAME_LABELS_FOLDER, f"{split_name}_predicted_boxes.csv")
    split_boxes.to_csv(boxes_csv, index=False, header=False)

# Train

## Create Dataloaders

In [None]:
# Build the train and val dataloaders (train first, then val) and group them
# under their split names.
train_dataloader, val_dataloader = create_dataloader("train"), create_dataloader("val")

current_dataloaders = dict(train=train_dataloader, val=val_dataloader)

## Create Model

In [None]:
# create_model returns a pair, unpacked here as the model and the device.
current_model, current_device = create_model(
    N_CLASSES,
    model_type=MODEL,
    freeze_body=FREEZE_BODY,
)

# Adam optimizer over all of the model's parameters (no loss is created in this cell).
current_optimizer = torch.optim.Adam(params=current_model.parameters(), lr=LEARNING_RATE)

In [None]:
# unit tests for loaders
def _test_loader(dl, name=""):
    batch = next(iter(dl))

    # Base checks
    assert isinstance(batch, dict), f"{name}: expected dict, got {type(batch)}"
    assert "video" in batch and "label" in batch, f"{name}: missing keys {batch.keys()}"

    video = batch["video"]
    label = batch["label"]

    # Handle single-path vs SlowFast (2-path)
    if isinstance(video, list):
        assert len(video) == 2, f"{name}: expected 2 tensors for SlowFast, got {len(video)}"
        slow, fast = video
        assert slow.dtype == torch.float32 and fast.dtype == torch.float32, f"{name}: video dtype not float32"
        vid_shape = [tuple(slow.shape), tuple(fast.shape)]
    else:
        assert video.dtype == torch.float32, f"{name}: video dtype not float32"
        vid_shape = tuple(video.shape)

    assert label.dtype in (torch.int64, torch.long), f"{name}: label dtype not int64/long"

    print(f"✓ {name} loader OK — video {vid_shape}, label {label.shape}")

# Run the smoke test on both loaders, train first.
for split_name, split_loader in (("train", train_dataloader), ("val", val_dataloader)):
    _test_loader(split_loader, split_name)

In [None]:
#check what is inside each loader (debugging)

import torch, inspect

sample_batch = next(iter(current_dataloaders["train"]))

print("type(batch) :", type(sample_batch))
print("batch length:", len(sample_batch))

# A dict batch has to be walked as (key, value) pairs: enumerating the dict
# itself only yields the key strings, so the tensors would never be inspected.
if isinstance(sample_batch, dict):
    batch_entries = sample_batch.items()
else:
    batch_entries = enumerate(sample_batch)

for idx, item in batch_entries:
    if torch.is_tensor(item):
        print(f"[{idx}] tensor  shape={tuple(item.shape)}  dtype={item.dtype}")
    elif isinstance(item, (list, tuple)) and item and all(torch.is_tensor(t) for t in item):
        # A list/tuple of tensors (e.g. the two-pathway video entry): report each one.
        for sub_idx, sub_item in enumerate(item):
            print(f"[{idx}][{sub_idx}] tensor  shape={tuple(sub_item.shape)}  dtype={sub_item.dtype}")
    else:
        # Anything else: show its type and a short repr (long values are elided).
        print(f"[{idx}] {type(item)} → {item if len(str(item)) < 100 else '...'}")

## Model Training

In [None]:
from datetime import datetime

# One timestamped run folder under checkpoint_dir
current_date_time = datetime.now().strftime("%m-%d-%Y-%H-%M-%S")
current_output_dir = os.path.join(checkpoint_dir, current_date_time)
os.makedirs(current_output_dir, exist_ok=True)

# Sub-folders of the run folder, set up through the notebook's create_dir helper
model_save_dir = os.path.join(current_output_dir, "saved_models")
model_log_dir = os.path.join(current_output_dir, "logs")
model_evaluation_dir = os.path.join(current_output_dir, "model_evaluation")
for run_subdir in (model_save_dir, model_log_dir, model_evaluation_dir):
    create_dir(run_subdir)

# Location of the JSON file holding this run's parameters
args_file = os.path.join(current_output_dir, "args.json")

In [None]:
# Snapshot of the run configuration, written to args_file for later reference.
params = {
    # run bookkeeping
    "args_file": args_file,
    "current_date_time": current_date_time,
    "random_seed": RANDOM_SEED,
    # output locations
    "current_output_dir": current_output_dir,
    "model_save_dir": model_save_dir,
    "model_log_dir": model_log_dir,
    "model_evaluation_dir": model_evaluation_dir,
    # input locations
    "frame_paths_folder": FRAME_PATHS_FOLDER,
    "frame_labels_folder": FRAME_LABELS_FOLDER,
    "label_map_file": LABEL_MAP_FILE,
    "video_path_prefix": VIDEO_PATH_PREFIX,
    # data / model / preprocessing settings
    "data_type": DATA_TYPE,
    "dataset_name": DATASET_NAME,
    "model": MODEL,
    "freeze_body": FREEZE_BODY,
    "min_side_size": MIN_SIDE_SIZE,
    "max_side_size": MAX_SIDE_SIZE,
    "crop_size": CROP_SIZE,
    "mean": MEAN,
    "std": STD,
    # clip sampling
    "num_frames": NUM_FRAMES,
    "clip_sampling_rate": CLIP_SAMPLING_RATE,
    "clip_duration": CLIP_DURATION,
    "n_classes": N_CLASSES,
    # loader settings
    "num_workers": NUM_WORKERS,
    "batch_size": BATCH_SIZE,
    # optimisation settings
    "n_epochs": N_EPOCHS,
    "learning_rate": LEARNING_RATE,
}

write_args(params, args_file)

In [None]:
import os
import torch.distributed as dist

# Single-process "distributed" setup: one rank, rendezvous on localhost.
os.environ["RANK"] = "0"  # Rank of this process
os.environ["WORLD_SIZE"] = "1"  # Total number of processes
os.environ["MASTER_ADDR"] = "127.0.0.1"  # Master node IP (localhost for single-node)
os.environ["MASTER_PORT"] = "29500"  # Any free port

# NCCL needs CUDA devices, so fall back to gloo on a CPU-only runtime.
dist_backend = "nccl" if torch.cuda.is_available() else "gloo"

# Initializing the default process group twice raises, so guard the call to
# keep this cell safe to re-run within the same kernel.
if not dist.is_initialized():
    dist.init_process_group(backend=dist_backend)

In [None]:
# Note: it is normal for the number of clips to exceed the number of videos you passed in.
# The dataset treats every (start-time, end-time) pair as a separate sample,
# so a single long video can contribute multiple training steps.

In [None]:
# Run training: hands the model, its optimizer and the {"train", "val"}
# dataloaders to train(), together with the run's model-save and log
# directories and the device returned by create_model.
train(current_model, current_optimizer, current_dataloaders, model_save_dir, model_log_dir, current_device)

# Test

## Create Test Dataloader

In [None]:
# create the dataloader for the "test" split
test_dataloader = create_dataloader("test")

## Loading Saved Model

In [None]:
# Name of the split being evaluated; the evaluation cells embed this value in
# the names of the files they write (predictions_*, accuracy_*, confusion_*).
evaluation_target = 'test'

In [None]:
import torch.nn as nn

In [None]:
# Loss object; it is not used inside this cell.
# NOTE(review): presumably read as a global by an evaluation helper — confirm.
criterion = nn.CrossEntropyLoss()

# Fresh model instance to load the saved weights into.
current_model, current_device = create_model(N_CLASSES, model_type=MODEL, freeze_body=FREEZE_BODY)

# NOTE(review): hard-coded checkpoint of one specific past run — update this
# path when evaluating a different run.
checkpoint_path = "/content/MyDrive/MyDrive/autosynthda/action-recognition/checkpoint/06-22-2025-12-52-04/saved_models/best_model_acc.pth"

# map_location remaps the stored tensors onto the device in use, so weights
# saved from a GPU run also load on a CPU-only runtime.
# Assumes current_device is a torch.device or a device string — TODO confirm.
model_weights = torch.load(checkpoint_path, map_location=current_device)
current_model.load_state_dict(model_weights)

# Switch to inference mode.
current_model.eval()


## Obtain Model Predictions

In [None]:
# obtain model predictions
df_model_preds = obtain_model_predictions(current_model, test_dataloader, current_device)

# save — format only the file name, so braces in the directory path can never
# be mistaken for format fields
predictions_csv = os.path.join(model_evaluation_dir, "predictions_{}.csv".format(evaluation_target))
df_model_preds.to_csv(predictions_csv, index=False)

# list the columns available to the evaluation cells
print(df_model_preds.columns.tolist())

In [None]:
import pandas as pd

# Read back the predictions file of this run and evaluation target, so the
# metrics below describe the predictions that were just generated; a
# hard-coded path to an older run would silently report stale numbers.
csv_path = os.path.join(model_evaluation_dir, "predictions_{}.csv".format(evaluation_target))

df = pd.read_csv(csv_path)

# accuracy  = fraction of rows where prediction matches the ground-truth label
accuracy = (df["label"] == df["prediction"]).mean()

# 0-1 loss  = 1 − accuracy
zero_one_loss = 1 - accuracy

print(f"Accuracy       : {accuracy:.4f}")
print(f"misclassifying loss: {zero_one_loss:.4f}")

## Evaluate Model

In [None]:
# calculate per class accuracy
df_class_accuracy = per_class_accuracy(df_model_preds,'label')

# save — format only the file name, so braces in the directory path can never
# be mistaken for format fields
accuracy_csv = os.path.join(model_evaluation_dir, "accuracy_{}.csv".format(evaluation_target))
df_class_accuracy.to_csv(accuracy_csv, index=False)

df_class_accuracy

In [None]:
# generate confusion matrix (rows: 'label', columns: 'prediction', cells: 'percentage')
df_confusion_data = generate_confusion_data(df_model_preds,'label','prediction',remove_correct=True)
df_heatmap = df_confusion_data.pivot(index='label',columns='prediction',values='percentage')

# save — keep the index: after the pivot it holds the 'label' of every row,
# and dropping it would leave the rows of the CSV unidentifiable.  Only the
# file name is formatted, so braces in the directory path cannot interfere.
confusion_csv = os.path.join(model_evaluation_dir, "confusion_{}.csv".format(evaluation_target))
df_heatmap.to_csv(confusion_csv)

df_heatmap

## Output to .txt file for export to main AutoSynthDa pipeline

In [None]:
# --------------------------------------------------------------------
# 1.  User-defined "weight" to embed in the file names
# --------------------------------------------------------------------
weight_val = 0.4                       # <= set this to 0.2, 0.3, … as needed
print("Columns in df_class_accuracy:", df_class_accuracy.columns.tolist())


# --------------------------------------------------------------------
# 2.  Where to save them: a timestamped folder under txt_dir
# --------------------------------------------------------------------
txt_dir = '/content/MyDrive/MyDrive/autosynthda/action-recognition/results/'        # or any other folder you prefer

# A dedicated variable is used for the results folder: assigning to
# `current_output_dir` here would overwrite the training-run directory that
# the model-training cells store under that name.
timestamp          = datetime.now().strftime("%m-%d-%Y-%H-%M-%S")
results_output_dir = os.path.join(txt_dir, timestamp)

os.makedirs(results_output_dir, exist_ok=True)

# --------------------------------------------------------------------
# 3.  File paths
# --------------------------------------------------------------------
acc_txt_path  = os.path.join(results_output_dir, f"acc_{weight_val}.txt")
loss_txt_path = os.path.join(results_output_dir, f"loss_{weight_val}.txt")

# --------------------------------------------------------------------
# 4.  Write per-class accuracies
# --------------------------------------------------------------------
with open(acc_txt_path, "w") as f:
    for _, row in df_class_accuracy.iterrows():
        cls = row["action"]

        # "per_correct" may carry a trailing "%": such values are divided by
        # 100, anything else is parsed as a plain float and written as-is.
        acc_str = str(row["per_correct"]).strip()
        acc_val = (float(acc_str.rstrip("%").strip()) / 100.0
                   if acc_str.endswith("%") else float(acc_str))

        f.write(f"acc_{weight_val}_{cls} = {acc_val:.4f}\n")

# --------------------------------------------------------------------
# 5.  Write overall loss (the 0-1 loss computed by the accuracy cell)
# --------------------------------------------------------------------
with open(loss_txt_path, "w") as f:
    f.write(f"loss_weight_{weight_val} = {zero_one_loss:.4f}\n")

print("Saved:")
print(" •", acc_txt_path)
print(" •", loss_txt_path)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# display: draw the heatmap on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(16, 9))
sns.heatmap(df_heatmap, annot=True, fmt='g', cmap='inferno', vmax=100, linewidths=4, ax=ax)
ax.set_title('Incorrect Percentage (%)')

# save — format only the file name, so braces in the directory path can never
# be mistaken for format fields
confusion_jpg = os.path.join(model_evaluation_dir, "confusion_matrix_{}.jpg".format(evaluation_target))
fig.savefig(confusion_jpg)
plt.show()