In [42]:
import os
import random
from sklearn.model_selection import StratifiedKFold
from collections import Counter

In [43]:
def read_master_file(file_path):
    """Read a whitespace-separated ``<clip> <label>`` annotation file.

    Blank lines are skipped. On every other line the last whitespace-separated
    field is parsed as the integer label and everything before it is taken as
    the clip identifier, so clip names containing spaces are kept intact.

    Args:
        file_path: Path of the annotation file to read.

    Returns:
        A ``(clips, labels)`` tuple of two parallel lists: clip identifiers
        (str) and their labels (int), in file order.

    Raises:
        ValueError: If a non-blank line has no separate label field or the
            label is not an integer. The message names the file and line.
    """
    clips = []
    labels = []
    with open(file_path, 'r') as f:
        for line_no, line in enumerate(f, start=1):
            entry = line.strip()
            if not entry:
                # A blank line (e.g. a trailing newline at EOF) carries no data.
                continue

            # Split from the right so only the final field is treated as the label.
            fields = entry.rsplit(None, 1)
            if len(fields) != 2:
                raise ValueError(
                    f"{file_path}:{line_no}: expected '<clip> <label>', got {entry!r}"
                )

            clip, label_text = fields
            try:
                label = int(label_text)
            except ValueError:
                raise ValueError(
                    f"{file_path}:{line_no}: label {label_text!r} is not an integer"
                ) from None

            clips.append(clip)
            labels.append(label)
    return clips, labels

def write_annotation_file(file_path, clips, labels):
    """Write one ``<clip> <label>`` line per pair to ``file_path``.

    The file is opened in ``'w'`` mode, so any existing content is replaced.
    ``clips`` and ``labels`` are paired with ``zip``, so output stops at the
    shorter of the two.
    """
    with open(file_path, 'w') as out:
        out.writelines(
            f"{clip} {label}\n" for clip, label in zip(clips, labels)
        )

def balance_data(clips, labels, n_samples, rng=None):
    """Draw a shuffled subset with the same number of positive and negative clips.

    Clips labelled ``1`` are positives and clips labelled ``0`` are negatives;
    any other label is ignored. The number drawn per class is ``n_samples``
    capped at the size of the smaller class, so the request can never exceed
    what is available.

    Args:
        clips: Sequence of clip identifiers.
        labels: Sequence of labels parallel to ``clips``.
        n_samples: Desired number of clips per class.
        rng: Optional ``random.Random``-like object providing ``sample`` and
            ``shuffle``. Defaults to the module-level ``random`` generator;
            pass a seeded ``random.Random`` for reproducible output.

    Returns:
        A ``(clips, labels)`` pair of parallel tuples. Both are empty when
        nothing can be drawn (for example when one class is missing).
    """
    if rng is None:
        rng = random

    positive = [(c, l) for c, l in zip(clips, labels) if l == 1]
    negative = [(c, l) for c, l in zip(clips, labels) if l == 0]

    # Cap at the smaller class so sample() is never asked for more items than exist.
    per_class = min(n_samples, len(positive), len(negative))

    balanced = rng.sample(positive, per_class) + rng.sample(negative, per_class)
    rng.shuffle(balanced)

    if not balanced:
        # zip(*[]) yields nothing, which callers cannot unpack into two values.
        return (), ()

    balanced_clips, balanced_labels = zip(*balanced)
    return balanced_clips, balanced_labels

def create_cross_validation_files(master_file, output_dir, num_folds=4, val_splits=5, seed=42):
    """Write balanced train/val/test annotation files for stratified K-fold CV.

    The master annotation file is split into ``num_folds`` stratified folds.
    For each fold, the held-out part becomes the test set and the remainder is
    split again into train and validation sets. Both parts are class-balanced
    with ``balance_data`` before being written to
    ``<output_dir>/fold_<k>/{train,val,test}.txt`` (``k`` starts at 1).

    Args:
        master_file: Path to the ``<clip> <label>`` master annotation file.
        output_dir: Directory under which the ``fold_<k>`` directories are created.
        num_folds: Number of outer cross-validation folds.
        val_splits: Number of inner splits; the first one is used, so the
            validation set is ``1 / val_splits`` of the balanced train+val
            data (the default 5 gives an 80-20 split).
        seed: Seed for both the scikit-learn splitters and the ``random``
            module used by ``balance_data``. When not ``None`` the global
            ``random`` generator is reseeded so repeated runs write identical
            files; pass ``None`` to leave it untouched.
    """
    clips, labels = read_master_file(master_file)

    if seed is not None:
        # balance_data samples with the global `random` module; without this the
        # balanced subsets differ on every run even though the splitters are seeded.
        random.seed(seed)

    skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=seed)
    # The inner splitter does not depend on the fold, so build it once.
    train_val_skf = StratifiedKFold(n_splits=val_splits, shuffle=True, random_state=seed)

    for fold, (train_val_idx, test_idx) in enumerate(skf.split(clips, labels), start=1):
        fold_dir = os.path.join(output_dir, f"fold_{fold}")
        os.makedirs(fold_dir, exist_ok=True)

        # Create balanced test set, sized by the rarer class in the held-out part.
        test_clips = [clips[i] for i in test_idx]
        test_labels = [labels[i] for i in test_idx]
        n_test_samples = min(Counter(test_labels).values())
        test_clips, test_labels = balance_data(test_clips, test_labels, n_test_samples)

        # Create balanced train+val set the same way.
        train_val_clips = [clips[i] for i in train_val_idx]
        train_val_labels = [labels[i] for i in train_val_idx]
        n_train_val_samples = min(Counter(train_val_labels).values())
        train_val_clips, train_val_labels = balance_data(
            train_val_clips, train_val_labels, n_train_val_samples
        )

        # Split train+val into train and val using only the first inner split.
        train_idx, val_idx = next(train_val_skf.split(train_val_clips, train_val_labels))

        train_clips = [train_val_clips[i] for i in train_idx]
        train_labels = [train_val_labels[i] for i in train_idx]
        val_clips = [train_val_clips[i] for i in val_idx]
        val_labels = [train_val_labels[i] for i in val_idx]

        # Write annotation files
        write_annotation_file(os.path.join(fold_dir, "train.txt"), train_clips, train_labels)
        write_annotation_file(os.path.join(fold_dir, "val.txt"), val_clips, val_labels)
        write_annotation_file(os.path.join(fold_dir, "test.txt"), test_clips, test_labels)

In [44]:
# Inputs and outputs both live under ./annotations, relative to the current working directory.
annotations_dir = os.path.join(os.getcwd(), "annotations")
master_file = os.path.join(annotations_dir, "aidan_allclips_annotations.txt")
output_dir = os.path.join(annotations_dir, "4fold_CV")

In [45]:
# Generate the per-fold train/val/test annotation files under output_dir.
create_cross_validation_files(master_file=master_file, output_dir=output_dir)

#### Ensure the files were created properly

In [46]:
import os

def count_clips(file_path):
    """Count the clips in a ``<clip> <label>`` annotation file by label.

    The label is the last whitespace-separated field of each line. Blank
    lines are skipped. A label of ``1`` counts as positive and ``0`` as
    negative; a line with any other label is included in the total only, so
    ``total != positive + negative`` signals unexpected labels in the file.

    Args:
        file_path: Path of the annotation file to inspect.

    Returns:
        A ``(total, positive, negative)`` tuple of ints.
    """
    total = 0
    positive = 0
    negative = 0
    with open(file_path, 'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank line: not a clip entry.
                continue
            total += 1
            # Compare the label field itself; a suffix test on the whole line
            # would also match labels such as "11" or a bare name like "clip_001".
            label = fields[-1]
            if label == '1':
                positive += 1
            elif label == '0':
                negative += 1
    return total, positive, negative

def check_annotation_files(base_dir, num_folds=4):
    """Print per-split clip counts for every fold directory under ``base_dir``.

    For each fold ``k`` in ``1..num_folds`` the files
    ``<base_dir>/fold_<k>/{train,val,test}.txt`` are examined. Existing files
    are summarised with ``count_clips``; missing ones are reported as not found.

    Args:
        base_dir: Directory containing the ``fold_<k>`` sub-directories.
        num_folds: Number of folds to check. Defaults to 4.
    """
    for fold in range(1, num_folds + 1):
        fold_dir = os.path.join(base_dir, f"fold_{fold}")
        print(f"\nFold {fold}:")

        for split in ['train', 'val', 'test']:
            file_path = os.path.join(fold_dir, f"{split}.txt")
            if os.path.exists(file_path):
                total, positive, negative = count_clips(file_path)
                print(f"  {split.capitalize()}:")
                print(f"    Total clips: {total}")
                print(f"    Positive clips: {positive}")
                print(f"    Negative clips: {negative}")
            else:
                print(f"  {split.capitalize()} file not found!")

In [47]:
# Summarise the clip counts of the annotation files written to output_dir.
check_annotation_files(base_dir=output_dir)


Fold 1:
  Train:
    Total clips: 1278
    Positive clips: 639
    Negative clips: 639
  Val:
    Total clips: 320
    Positive clips: 160
    Negative clips: 160
  Test:
    Total clips: 534
    Positive clips: 267
    Negative clips: 267

Fold 2:
  Train:
    Total clips: 1278
    Positive clips: 639
    Negative clips: 639
  Val:
    Total clips: 320
    Positive clips: 160
    Negative clips: 160
  Test:
    Total clips: 534
    Positive clips: 267
    Negative clips: 267

Fold 3:
  Train:
    Total clips: 1280
    Positive clips: 640
    Negative clips: 640
  Val:
    Total clips: 320
    Positive clips: 160
    Negative clips: 160
  Test:
    Total clips: 532
    Positive clips: 266
    Negative clips: 266

Fold 4:
  Train:
    Total clips: 1280
    Positive clips: 640
    Negative clips: 640
  Val:
    Total clips: 320
    Positive clips: 160
    Negative clips: 160
  Test:
    Total clips: 532
    Positive clips: 266
    Negative clips: 266
