<a href="https://colab.research.google.com/github/ZsofiaK/masterthesis/blob/main/Implementation/Train_test_splitting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setting up train and test sets

## Preliminaries

In [33]:
# Lookup table: short dataset identifier -> name of the dataset's folder.
dataset_dict = {
    'fishClips' : 'Fish clips',
    'AK-fish' : 'AK fish',
}

In [34]:
# Dataset to split; must be one of the keys of `dataset_dict`.
dataset_name = 'AK-fish'

seed = 23   # Fixed seed so the pseudo-random split is reproducible.

test_size = 0.2   # Fraction of the data assigned to the test set.

# Folder name belonging to the chosen dataset.
dataset_dir = dataset_dict[dataset_name]

In [3]:
# Mount Google Drive at /content/drive so that the dataset folder stored
# there can be read from (and written back to) within this Colab runtime.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [35]:
# Local working copy of the dataset inside the runtime (copy destination).
data_dir = f"/content/{dataset_dir}"

# Location of the dataset on the mounted Drive (copy source).
drive_dir = f"/content/drive/My Drive/UvA/M Thesis/Data/{dataset_dir}"

In [36]:
# Copy the dataset folder from Drive to the local runtime.
# `dirs_exist_ok=True` makes this cell safe to re-run: without it, copytree
# raises FileExistsError as soon as `data_dir` exists from an earlier run.
# Files already present locally are overwritten by the versions on Drive.
import shutil
shutil.copytree(drive_dir, data_dir, dirs_exist_ok=True)

'/content/AK fish'

## Splitting dataset

In [37]:
import pandas as pd

# Load the clip metadata table from the local copy of the dataset.
# Later cells use its 'video' and 'label' columns to build the split.
clips_df = pd.read_csv(f'{data_dir}/clips.csv')

In [38]:
# As the AK fish dataset already has pre-assigned train and test sets,
# store these as original (og) sets. In the thesis experiments, the newly
# created random test set will be used.
#
# The rename is skipped when an 'og_type' column is already present. This
# notebook writes clips.csv back to the location it was read from, so on a
# later run the file contains both 'og_type' (original split) and 'type'
# (random split). Renaming again would turn the random split into a second,
# duplicate 'og_type' column.

if dataset_name == 'AK-fish' and 'og_type' not in clips_df.columns:
  clips_df = clips_df.rename(columns={'type' : 'og_type'})

In [39]:
# Preview the first rows of the clips table as a quick sanity check.
clips_df.head()

Unnamed: 0,S/N,video,og_type,list_animal_action,label,nr_frames,fps,length
0,11,AAFWRPDI,train,"[('Fish', 'Swimming')]",0,96,24.0,4.0
1,18,AAJNFNXN,train,"[('Fish', 'Swimming'), ('Orca', 'Swimming')]",0,72,24.0,3.0
2,42,AATNWZAA,train,"[('Fish', 'Swimming'), ('Fish', 'Sensing')]",0,168,24.0,7.0
3,97,ACFCJUPH,train,"[('Catfish', 'Swimming')]",0,29,24.0,1.208333
4,124,ADBJMPIF,train,"[('Sardine', 'Fleeing'), ('Sea Lion', 'Eating'...",1,56,24.0,2.333333


In [40]:
# Create split.
# Produces X_train / X_test (clip identifiers from the 'video' column) and
# y_train / y_test (the matching values from the 'label' column).
from sklearn.model_selection import train_test_split

if dataset_name == 'fishClips':
  # This dataset has to be handled differently as we must ensure that all clips
  # from the same original video go to the same split. The split is therefore
  # drawn over the original video numbers, and each clip follows its video.

  nr_og_videos = 44   # Number of videos in the original dataset

  video_numbers = list(range(1, nr_og_videos + 1))

  train_video_nrs, test_video_nrs = train_test_split(
      video_numbers, test_size=test_size, random_state=seed)

  # Set for constant-time membership checks in the loop below.
  train_video_set = set(train_video_nrs)

  X_train = []
  X_test = []
  y_train = []
  y_test = []

  for video, label in zip(clips_df['video'], clips_df['label']):
    # The original video number is parsed from the part of the clip name
    # before the first underscore, after stripping the 'video' prefix.
    video_nr = int(video.split('_')[0].replace('video', ''))

    if video_nr in train_video_set:
      X_train.append(video)
      y_train.append(label)

    else:
      # Every clip whose video is not in the training selection is a test clip.
      X_test.append(video)
      # Fixed: test labels were previously appended to y_train, which left
      # y_test empty and y_train misaligned with X_train.
      y_test.append(label)

else:
  # Clips can be split independently; stratify on the label so both splits
  # keep the same class proportions.
  X = clips_df['video']
  y = clips_df['label']

  X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=test_size, stratify=y, random_state=seed)

In [41]:
# Record the new split in the clips dataframe: start with an empty 'type'
# column, then label each clip according to the set its video ended up in.
clips_df['type'] = None

for split_label, split_videos in (('train', X_train), ('test', X_test)):
  clips_df.loc[clips_df['video'].isin(split_videos), 'type'] = split_label

In [48]:
# Persist the updated clips table: write it to the local dataset copy first,
# then copy that file over the clips.csv stored on Drive.
local_clips_csv = f'{data_dir}/clips.csv'

clips_df.to_csv(local_clips_csv, index=False)

shutil.copy(local_clips_csv, f'{drive_dir}/clips.csv')

'/content/drive/My Drive/UvA/M Thesis/Data/AK fish/clips.csv'