<a href="https://colab.research.google.com/github/ZsofiaK/masterthesis/blob/main/Implementation/Splitting_clips.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Splitting fish videos into clips

In [1]:
# Copying fish data from Drive

from google.colab import drive
drive.mount('/content/drive')

import shutil

source_folder = "/content/drive/My Drive/UvA/M Thesis/Data/Fish data"
destination_folder = "/content/Fish data"

# Copy the folder to the local Colab disk.
# dirs_exist_ok=True makes the cell safe to re-run in the same session:
# without it, copytree raises FileExistsError once the destination exists.
shutil.copytree(source_folder, destination_folder, dirs_exist_ok=True)

Mounted at /content/drive


'/content/Fish data'

In [2]:
import subprocess
import os
import pandas as pd
from IPython.display import clear_output

# Names of source videos that could not be (fully) processed. The list is
# filled in by split_videos_to_clips and reported in a later cell, because
# warnings printed during processing are wiped by clear_output.
unvalid_videos = []

def split_videos_to_clips(source_folder:str, output_folder:str, clip_length:int=2):
  '''
  Splits videos into an odd number of clips of a specified length.
  The clips are centred in the video, so the middle clip coincides with the
  middle of the video.
  A metadata file (clips.csv) is written to output_folder, where the label of
  the middle clip is 1, 0 otherwise. Only clips that ffmpeg wrote successfully
  are listed.

  Requires the ffprobe and ffmpeg executables to be available on the PATH.

  Side effect: videos whose duration cannot be read, that are too short for
  a single clip, or for which ffmpeg fails are appended to the module-level
  list unvalid_videos.

  :param source_folder: the folder of the original videos. Only files whose
    name ends with '.MP4' are processed.
  :param output_folder: directory where the output should be saved. The clips
    will be saved in a folder named Clips within this directory.
  :param clip_length: the length of the desired clips in seconds.
  :raises ValueError: if clip_length is not positive.
  '''

  if clip_length <= 0:
    raise ValueError(f'clip_length must be positive, got {clip_length}')

  # Ensure output directories exist
  clips_folder = os.path.join(output_folder, "Clips")
  os.makedirs(clips_folder, exist_ok=True)

  # os.listdir returns entries in arbitrary order; sorting keeps the
  # video index -> file mapping (and thus the clip names) reproducible.
  video_files = sorted(f for f in os.listdir(source_folder) if f.endswith('.MP4'))
  metadata = []

  for video_idx, video_file in enumerate(video_files, start=1):
    video_path = os.path.join(source_folder, video_file)

    # Get video duration (ffprobe prints the bare number of seconds)
    cmd = ['ffprobe', '-v', 'error', '-show_entries',
           'format=duration', '-of',
           'default=noprint_wrappers=1:nokey=1', video_path]

    result = subprocess.run(cmd, stdout=subprocess.PIPE, text=True)

    try:
      duration = float(result.stdout)

    except ValueError:
      # Empty or non-numeric ffprobe output: the file could not be read.
      print(f'WARNING: Unvalid file {video_file}')

      unvalid_videos.append(video_file)

      continue

    # Calculate number of clips and adjust if necessary to be odd
    total_clips = int(int(duration) // clip_length)
    if total_clips % 2 == 0: total_clips -= 1

    if total_clips < 1:
      # The video is shorter than a single clip; nothing can be cut from it.
      print(f'WARNING: {video_file} is too short for a {clip_length}s clip')

      unvalid_videos.append(video_file)

      continue

    # Calculate total duration covered by the clips to center them in the video
    total_clips_duration = total_clips * clip_length

    # Calculate the start time offset to center clips in the video
    start_offset = (duration - total_clips_duration) / 2

    middle_clip_idx = (total_clips // 2) + 1

    for clip_idx in range(1, total_clips + 1):
      start_time = start_offset + (clip_idx - 1) * clip_length

      clip_name = f"video{video_idx}_clip{clip_idx}.mp4"

      clip_path = os.path.join(clips_folder, clip_name)

      label = 1 if clip_idx == middle_clip_idx else 0

      # Use ffmpeg to cut the clip without re-encoding.
      # The start time is written in fixed-point notation so that very small
      # offsets are never rendered in scientific notation (e.g. '1e-05').
      # NOTE(review): with '-c copy' ffmpeg cuts on keyframes, so the actual
      # clip boundaries may deviate from the requested times - confirm this
      # is acceptable for the data.
      cmd = ['ffmpeg', '-ss', f'{start_time:.6f}', '-t', str(clip_length),
             '-i', video_path, '-c', 'copy', '-y', clip_path]
      completed = subprocess.run(cmd)

      if completed.returncode != 0:
        # Do not list a clip in the metadata if ffmpeg failed to write it.
        print(f'WARNING: ffmpeg failed for {clip_name} from {video_file}')

        if video_file not in unvalid_videos:
          unvalid_videos.append(video_file)

        continue

      # Save metadata
      metadata.append([clip_name, video_file, round(start_time, 2), label])

      clear_output(wait=True)
      print(f"Processed {clip_name} from {video_file}")

  clear_output(wait=True)
  print('All clips processed.')

  # Save metadata to CSV
  metadata_df = pd.DataFrame(metadata, columns=["clip", "origin_video", "timestamp", "label"])
  metadata_df.to_csv(os.path.join(output_folder, 'clips.csv'), index=False)
  print('Metadata CSV saved.')


In [3]:
# Split the locally copied videos; the clips and clips.csv end up under
# the output folder.
source_folder_path = '/content/Fish data'
output_folder_path = '/content/Fish clips'

split_videos_to_clips(
    source_folder=source_folder_path,
    output_folder=output_folder_path,
)

All clips processed.
Metadata CSV saved.


In [4]:
# Summarise which videos, if any, were flagged while splitting.
if not unvalid_videos:
  print('All videos could be processed.')

else:
  print('The following videos could not be processed:')

  # One file name per line.
  print(*unvalid_videos, sep='\n')

All videos could be processed.


In [5]:
# Copying clips to drive.
source_folder = '/content/Fish clips'

destination_folder = '/content/drive/My Drive/UvA/M Thesis/Data/Fish clips'

# The Drive folder persists between sessions, so a plain copytree raises
# FileExistsError on every run after the first. dirs_exist_ok=True merges
# into the existing folder instead: files with the same name are overwritten,
# files that only exist on Drive are left untouched.
shutil.copytree(source_folder, destination_folder, dirs_exist_ok=True)

'/content/drive/My Drive/UvA/M Thesis/Data/Fish clips'