In [8]:
import os
import random
from sklearn.model_selection import StratifiedKFold
from collections import defaultdict, Counter

In [9]:
# Master annotation list (read by read_master_file as one "<clip> <label>"
# entry per line) and the root folder that receives one fold_<n> sub-folder
# per cross-validation fold. Both are resolved against the current working
# directory.
master_file = os.path.join(os.getcwd(), 'annotations', 'aidan_allclips_annotations.txt')
output_dir = os.path.join(os.getcwd(), 'annotations', 'Uncropped_LOPO_CV')

In [10]:
import os
import random
from collections import defaultdict
from sklearn.model_selection import train_test_split

def read_master_file(file_path):
    """Read a master annotation file into parallel clip and label lists.

    Every non-blank line must contain exactly two whitespace-separated
    fields: a clip path followed by an integer label. Blank lines (for
    example a trailing empty line) are skipped.

    Args:
        file_path: Path of the annotation text file to read.

    Returns:
        A ``(clips, labels)`` tuple of two lists in file order; ``labels``
        holds ``int`` values.

    Raises:
        ValueError: If a non-blank line does not have exactly two fields or
            its label is not an integer.
    """
    clips = []
    labels = []
    with open(file_path, 'r') as f:
        for line in f:
            entry = line.strip()
            if not entry:
                # Nothing to parse; the two-value unpack below would raise
                # on an empty field list.
                continue
            clip, label = entry.split()
            clips.append(clip)
            labels.append(int(label))
    return clips, labels

def write_annotation_file(file_path, clips, labels):
    """Write clip/label pairs to ``file_path`` as ``"<clip> <label>"`` lines.

    The file is created or truncated. Pairs are formed with ``zip``, so the
    output stops at the shorter of the two sequences.
    """
    with open(file_path, 'w') as out:
        out.writelines(f"{path} {lbl}\n" for path, lbl in zip(clips, labels))

def balance_data(clips, labels):
    """Undersample so that label-1 and label-0 clips are equally represented.

    Clips whose label is neither 0 nor 1 are dropped. Sampling and shuffling
    use the module-level ``random`` generator, so results are only
    reproducible if the caller seeds it.

    Args:
        clips: Sequence of clip identifiers.
        labels: Sequence of labels, parallel to ``clips``.

    Returns:
        A ``(clips, labels)`` pair of tuples in shuffled order, each holding
        ``2 * min(#positive, #negative)`` items. Both tuples are empty when
        either class is missing, so ``a, b = balance_data(...)`` always
        unpacks (unpacking an empty ``zip(*[])`` would raise ``ValueError``).
    """
    pairs = list(zip(clips, labels))
    positive = [pair for pair in pairs if pair[1] == 1]
    negative = [pair for pair in pairs if pair[1] == 0]

    n_samples = min(len(positive), len(negative))
    if n_samples == 0:
        # No balanced subset exists; return an unpackable empty result.
        return (), ()

    balanced = random.sample(positive, n_samples) + random.sample(negative, n_samples)
    random.shuffle(balanced)
    balanced_clips, balanced_labels = zip(*balanced)
    return balanced_clips, balanced_labels

def group_by_patient(clips, labels):
    """Bucket clips and labels by the patient id embedded in each clip path.

    The patient id is the text before the first underscore of the second
    '/'-separated component of the clip path.

    Returns:
        A ``defaultdict`` mapping patient id to
        ``{'clips': [...], 'labels': [...]}``, with entries in input order.
    """
    def _new_bucket():
        return {'clips': [], 'labels': []}

    by_patient = defaultdict(_new_bucket)
    for path, lbl in zip(clips, labels):
        second_component = path.split('/')[1]
        patient_id = second_component.split('_')[0]
        bucket = by_patient[patient_id]
        bucket['clips'].append(path)
        bucket['labels'].append(lbl)
    return by_patient

def create_cross_validation_files(master_file, output_dir, seed=None):
    """Write leave-one-patient-out train/val/test annotation files.

    One ``fold_<n>`` directory is created under ``output_dir`` per patient.
    In each fold that patient's clips form the test set; the clips of all
    other patients are pooled and split 80:20 into train and val with a
    stratified, clip-level split. Because that split is made on clips, train
    and val share patients; only the test patient is held out. Each of the
    three sets is then class-balanced with ``balance_data`` and written as
    ``train.txt``, ``val.txt`` and ``test.txt``.

    Args:
        master_file: Annotation file readable by ``read_master_file``.
        output_dir: Directory that receives the ``fold_<n>`` sub-directories.
        seed: Optional seed for the module-level ``random`` generator, which
            drives both the patient order and the undersampling in
            ``balance_data``. ``None`` (the default) leaves the generator
            untouched, so fold numbering and sampled clips vary between runs.
    """
    if seed is not None:
        random.seed(seed)

    clips, labels = read_master_file(master_file)
    patient_groups = group_by_patient(clips, labels)

    patients = list(patient_groups.keys())
    random.shuffle(patients)

    for fold, test_patient in enumerate(patients, start=1):
        fold_dir = os.path.join(output_dir, f"fold_{fold}")
        os.makedirs(fold_dir, exist_ok=True)

        # Separate test patient
        test_clips = patient_groups[test_patient]['clips']
        test_labels = patient_groups[test_patient]['labels']

        # Collect clips and labels for train+val set
        train_val_clips, train_val_labels = [], []
        for patient in patients:
            if patient != test_patient:
                train_val_clips.extend(patient_groups[patient]['clips'])
                train_val_labels.extend(patient_groups[patient]['labels'])

        # Split train+val into train and val (80:20)
        train_clips, val_clips, train_labels, val_labels = train_test_split(
            train_val_clips, train_val_labels, test_size=0.2, stratify=train_val_labels, random_state=42
        )

        # Balance each set and write it, in train -> val -> test order.
        splits = (
            ("train", train_clips, train_labels),
            ("val", val_clips, val_labels),
            ("test", test_clips, test_labels),
        )
        for split_name, split_clips, split_labels in splits:
            # Materialise the result first: when a class is missing the
            # balanced result can be empty, and unpacking it directly into
            # two names would raise ValueError.
            balanced = tuple(balance_data(split_clips, split_labels))
            balanced_clips, balanced_labels = balanced if balanced else ((), ())
            if not balanced_clips:
                print(
                    f"  Warning: fold {fold} {split_name} set is empty after "
                    f"balancing (a class is missing); writing an empty file."
                )
            write_annotation_file(
                os.path.join(fold_dir, f"{split_name}.txt"), balanced_clips, balanced_labels
            )

        print(f"Fold {fold}: Test patient {test_patient}")

In [11]:
# Build the fold_<n> folders under output_dir, holding out one patient per fold as the test set.
create_cross_validation_files(master_file, output_dir)

Fold 1: Test patient 06338772
Fold 2: Test patient 05454991
Fold 3: Test patient 00582992
Fold 4: Test patient 06452950
Fold 5: Test patient 05447543
Fold 6: Test patient 05497695
Fold 7: Test patient 05467817
Fold 8: Test patient 05418761
Fold 9: Test patient 06348578
Fold 10: Test patient 06381028
Fold 11: Test patient 00913367
Fold 12: Test patient 06394294
Fold 13: Test patient 05512494
Fold 14: Test patient 05235825
Fold 15: Test patient 05323733
Fold 16: Test patient 05514820
Fold 17: Test patient 05463487
Fold 18: Test patient 05513119
Fold 19: Test patient 05486196
Fold 20: Test patient 05109836
Fold 21: Test patient 05352576
Fold 22: Test patient 02268547
Fold 23: Test patient 05501184
Fold 24: Test patient 05489744
Fold 25: Test patient 02267738


#### Ensure the files were created properly

In [12]:
def analyze_cross_validation(base_dir, n_folds=None):
    """Print a per-fold summary of the train/val/test annotation files.

    For every fold this prints the total clip count, each set's size and
    share of the fold, the negative (0) / positive (1) label counts and
    ratios, the patients present in each set, and whether any test patient
    also appears in train or val. Train and val are not compared with each
    other.

    Args:
        base_dir: Directory containing ``fold_<n>`` sub-directories, each
            with ``train.txt``, ``val.txt`` and ``test.txt``.
        n_folds: If given, folds ``1..n_folds`` are analysed. If ``None``
            (default), every ``fold_<number>`` directory found in
            ``base_dir`` is analysed in numeric order.

    Raises:
        FileNotFoundError: If ``base_dir`` or an expected annotation file is
            missing.
        ValueError: If a non-blank line does not have exactly two fields or
            its label is not an integer.
    """
    prefix = "fold_"
    if n_folds is None:
        # Discover the folds that exist instead of assuming a fixed count.
        discovered = []
        for name in os.listdir(base_dir):
            suffix = name[len(prefix):]
            if (name.startswith(prefix) and suffix.isdecimal()
                    and os.path.isdir(os.path.join(base_dir, name))):
                discovered.append((int(suffix), name))
        folds = sorted(discovered)
    else:
        folds = [(number, f"{prefix}{number}") for number in range(1, n_folds + 1)]

    def percent(part, whole):
        # An empty set or fold reports 0% instead of dividing by zero.
        return (part / whole) * 100 if whole else 0.0

    set_names = ['train', 'val', 'test']
    for fold, fold_name in folds:
        fold_dir = os.path.join(base_dir, fold_name)
        print(f"\nAnalyzing Fold {fold}:")

        set_counts = {}
        set_patients = {}
        set_labels = {}
        total_videos = 0

        for set_name in set_names:
            file_path = os.path.join(fold_dir, f"{set_name}.txt")

            videos = []
            patients = set()
            labels = []

            with open(file_path, 'r') as f:
                for line in f:
                    entry = line.strip()
                    if not entry:
                        continue  # blank lines carry no annotation
                    video_path, label = entry.split()
                    videos.append(video_path)
                    labels.append(int(label))
                    patient = video_path.split('/')[1].split('_')[0]  # Extract patient number
                    patients.add(patient)

            set_counts[set_name] = len(videos)
            set_patients[set_name] = patients
            set_labels[set_name] = labels
            total_videos += len(videos)

        # Print counts, percentages, and label ratios
        print(f"Total videos in fold: {total_videos}")
        for set_name in set_names:
            count = set_counts[set_name]
            percentage = percent(count, total_videos)
            negative_count = set_labels[set_name].count(0)
            positive_count = set_labels[set_name].count(1)
            negative_ratio = percent(negative_count, count)
            positive_ratio = percent(positive_count, count)

            print(f"{set_name.capitalize()} set: {count} videos ({percentage:.2f}%)")
            print(f"  Negative (0) clips: {negative_count} ({negative_ratio:.2f}%)")
            print(f"  Positive (1) clips: {positive_count} ({positive_ratio:.2f}%)")

        # Print patients in each set
        for set_name in set_names:
            print(f"\nPatients in {set_name} set: {', '.join(sorted(set_patients[set_name]))}")

        # Check that the held-out test patients do not appear in train or val.
        leaked = set_patients['test'] & (set_patients['train'] | set_patients['val'])
        if leaked:
            print("\nWARNING: The following test patients also appear in train/val: "
                  f"{', '.join(sorted(leaked))}")
        else:
            print("\nVerification successful: No test patient appears in the train or val set.")

In [13]:
# Print per-fold set sizes, label ratios and patient lists for the fold files under output_dir.
analyze_cross_validation(output_dir)


Analyzing Fold 1:
Total videos in fold: 2132
Train set: 1672 videos (78.42%)
  Negative (0) clips: 836 (50.00%)
  Positive (1) clips: 836 (50.00%)
Val set: 418 videos (19.61%)
  Negative (0) clips: 209 (50.00%)
  Positive (1) clips: 209 (50.00%)
Test set: 42 videos (1.97%)
  Negative (0) clips: 21 (50.00%)
  Positive (1) clips: 21 (50.00%)

Patients in train set: 00582992, 00913367, 02267738, 02268547, 05109836, 05235825, 05323733, 05352576, 05418761, 05447543, 05454991, 05463487, 05467817, 05486196, 05489744, 05497695, 05501184, 05512494, 05513119, 05514820, 06348578, 06381028, 06394294, 06452950

Patients in val set: 00582992, 00913367, 02267738, 02268547, 05109836, 05235825, 05323733, 05352576, 05418761, 05447543, 05454991, 05463487, 05467817, 05486196, 05489744, 05497695, 05501184, 05512494, 05513119, 05514820, 06348578, 06381028, 06394294, 06452950

Patients in test set: 06338772

Analyzing Fold 2:
Total videos in fold: 2132
Train set: 1672 videos (78.42%)
  Negative (0) clips: 8

: 

### Write individual video annotation files for testing

In [19]:
from collections import defaultdict

# Function to read the annotations file and group entries
def group_annotations(file_path):
    """Group annotation lines by the identifier embedded in each clip path.

    The identifier is the third underscore-separated field of the first
    whitespace-separated token on the line. The split is applied to the whole
    path, directories included, so underscores in directory names shift
    which field is picked. Blank lines are skipped.

    Args:
        file_path: Annotation file with "<clip path> <label>" lines.

    Returns:
        A ``defaultdict(list)`` mapping identifier to the stripped lines
        that carry it, in file order.

    Raises:
        IndexError: If a non-blank line has no label field, or its path has
            fewer than three underscore-separated fields.
    """
    groups = defaultdict(list)
    with open(file_path, 'r') as file:
        for line in file:
            parts = line.split()
            if not parts:
                # A blank (often trailing) line has no fields to index.
                continue
            # The label is not used for grouping; indexing it only enforces
            # that the line has a second field.
            video_path, _label = parts[0], parts[1]
            identifier = video_path.split('_')[2]
            groups[identifier].append(line.strip())
    return groups

# Function to write grouped entries to separate files
def write_grouped_annotations(groups, output_dir):
    """Write each group of annotation lines to ``<output_dir>/<identifier>.txt``.

    ``output_dir`` is created (including parents) if it does not exist.
    Existing files with the same name are overwritten.

    Args:
        groups: Mapping of identifier to an iterable of annotation lines
            without trailing newlines.
        output_dir: Directory that receives one text file per identifier.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)
    for identifier, lines in groups.items():
        output_file_path = os.path.join(output_dir, f'{identifier}.txt')
        with open(output_file_path, 'w') as output_file:
            output_file.writelines(line + '\n' for line in lines)

# Main function
def write_test_annotations(input_file, output_dir):
    """Split ``input_file`` into one annotation file per identifier.

    Groups the lines with ``group_annotations``, writes them with
    ``write_grouped_annotations`` and prints a confirmation that names
    ``output_dir``.
    """
    write_grouped_annotations(group_annotations(input_file), output_dir)
    print(f"Annotations have been successfully grouped and written to '{output_dir}'.")


In [20]:
# NOTE(review): this rebinds the notebook-level `output_dir`. Any cell run
# after this one that references `output_dir` will see the
# video_test_annotations path assigned here.
input_file = os.path.join(os.getcwd(), "annotations", "aidan_allclips_annotations.txt")  # Path to the input annotations file
output_dir = os.path.join(os.getcwd(), "annotations", "video_test_annotations")  # Directory to store the grouped files
write_test_annotations(input_file, output_dir)

Annotations have been successfully grouped and written to 'c:\Users\u251245\CVEpilepsy_remote\annotations\video_test_annotations'.
