<a href="https://colab.research.google.com/github/ZsofiaK/masterthesis/blob/main/Implementation/Auxilliary/Setting_up_AK_sample.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setting up AK sample
This notebook collects a representative sample from the Animal Kingdom dataset, which is used to test MARINE against other models benchmarked on this dataset (MSQNet, CARe).

## Preliminaries

In [15]:
# Set up notebook parameters
dataset_dir = '/content/drive/MyDrive/UvA/M Thesis/Data/AK sample'    # Location of the sample dataset directory
videos_source = '/content/drive/My Drive/UvA/M Thesis/Data/Animal_Kingdom_shortcut/action_recognition/dataset/video.tar.gz'   # Location of the AK dataset.
max_sample_size = 1000    # Maximum size of the representative sample
seed = 42   # Random seed for reproducability.
AK_clips = f'{dataset_dir}/AK_AR_metadata.xlsx'   # Path to the original AK metadata file
clips_save_path = f'{dataset_dir}/clips.csv'    # Path to save sample metadata file

In [2]:
# Mount Drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Load dataset metadata

In [3]:
# Read original dataset metadata.
import pandas as pd
from sklearn.model_selection import train_test_split

full_og_clips_df = pd.read_excel(AK_clips, sheet_name='AR')

# Extract relevant columns from the original metadata sheet.
og_clips_df = full_og_clips_df[['video_id', 'type', 'labels']]

# Convert labels from comma-separated strings to lists.
og_clips_df['labels'] = og_clips_df['labels'].apply(lambda x: x.split(','))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  og_clips_df['labels'] = og_clips_df['labels'].apply(lambda x: x.split(','))


## Take random sample from dataset

In [4]:
AK_sample, _ = train_test_split(og_clips_df, stratify=og_clips_df['type'], \
                                   test_size=1-(max_sample_size/len(og_clips_df)), \
                                   random_state=seed)

In [5]:
# Confirm sample sizes.
print('Final sample sizes')
print('TOTAL:', len(AK_sample))
print('Train:', len(AK_sample[AK_sample['type']=='train']))
print('Test:', len(AK_sample[AK_sample['type']=='test']))

Final sample sizes
TOTAL: 1000
Train: 797
Test: 203


## Confirm distribution of sample
Chi-squared test to confirm that distribution of labels in the sample is identical to that in Animal Kingdom.

`NOTE:` it is important to realize that due to very small frequencies, some labels may be dropped from the sample (these are shown later). Distribution is tested only for the labels which remain.



In [7]:
# Save labels and counts from original and sample dataset.
from collections import Counter

og_labels = og_clips_df.explode('labels')['labels'].tolist()
sample_labels = AK_sample.explode('labels')['labels'].tolist()

counts_og = dict(Counter(og_labels))
counts_sample = dict(Counter(sample_labels))

In [9]:
from scipy.stats import chi2_contingency
import numpy as np

# Create list of unique labels in sample.
labels = list(counts_sample.keys())

# Prepare values for each sample, aligning by labels
og_counts = [counts_og[label] for label in labels]
sample_counts = [counts_sample[label] for label in labels]

# Perform the chi-squared test
chi2, p, dof, expected = chi2_contingency([og_counts, sample_counts])

print(f"Chi-squared Statistic: {chi2}, p-value: {p}")
print(f'Difference in distribution is significant: {p < 0.05}')

Chi-squared Statistic: 85.13653364520921, p-value: 0.6806998907509849
Difference in distribution is significant: False


In [11]:
dropped_labels = [(label, count) for label, count in counts_og.items() \
                  if label not in counts_sample]

if len(dropped_labels) > 0:
  print('Warning:', len(dropped_labels), 'labels dropped.')
  print(f'Maximum dropped count: {max([item[1] for item in dropped_labels])}')

  # print('LABEL \t ORIGINAL COUNT')
  # for label, count in sorted(dropped_labels, key=lambda x: x[1], reverse=True):
  #   print(f'{label} \t {count}')

Maximum dropped count: 76


## Save files
Save metadata file to Drive.

In [23]:
# Change video column name for consistency with other datasets.
AK_sample = AK_sample.rename(columns={'video_id': 'video'})

# Save metadata file.
AK_sample.to_csv(clips_save_path, index=False)

Copy sample AK videos to sample directory.

In [17]:
# Creating video directory in runtime.
import os

runtime_dest_folder = '/content/Videos'

# Create the destination folder if it does not exist
os.makedirs(runtime_dest_folder, exist_ok=True)

In [18]:
# Extract videos to runtime.
import tarfile

with tarfile.open(videos_source, 'r:gz') as file:
    file.extractall(path=runtime_dest_folder)

    print("Extraction completed.")

Extraction completed.


In [19]:
# Copy relevant videos to drive.

# Setting up directory for relevant videos.
drive_clips_folder = f'{dataset_dir}/Clips'

# Create the destination folder if it does not exist
os.makedirs(drive_clips_folder, exist_ok=True)

In [21]:
# Loop through the video IDs and move each relevant video file to Drive directory
import shutil
from IPython.display import clear_output

not_found_videos = 0

progress = 0

nr_videos = len(AK_sample)

for video_id in AK_sample['video_id']:
    source_file = os.path.join(f'{runtime_dest_folder}/video', f'{video_id}.mp4')

    destination_file = os.path.join(drive_clips_folder, f'{video_id}.mp4')

    # Check if the file exists before trying to copy.
    if os.path.exists(source_file):
        # Check if file had already been copied to Drive.
        if not os.path.exists(destination_file):
          shutil.copy(source_file, destination_file)

    else:
        not_found_videos += 1

    progress += 1
    clear_output(wait=True)
    print(f'Number of videos: {nr_videos}')
    print(f'Progress: {progress / nr_videos * 100:.2f}%')

Number of videos: 1000
Progress: 100.00%


In [22]:
# Checking if all relevant videos were found.
if not_found_videos > 0:
  print(f'WARNING: {len(not_found_videos)} videos not found.')

else:
  print('Successfully copied all sample videos!')

Successfully copied all sample videos!
