### Train, Val, Test Annotation Creation

`collect_data` 
- Input: master text file and a patient ID 
- Output: list of lines from the master file that contain the given patient ID

`get_patient_ids`
- Input: master text file
- Output: list of all unique patient IDs found in the file

`create_split_files`
- Input: master text file, an output directory path, a test patient ID, and an optional train ratio
- Output: train, validation, and test annotation files for the given held-out (test) patient ID
- Method: first gets a list of all patient IDs from the master file and removes the test patient ID from the list. It then randomly splits the remaining patient IDs into train and validation sets based on the provided train ratio. The function collects data for the test, train, and validation sets using the collect_data function, and writes the data to separate files in the output directory.

In [1]:
import os
import random

In [4]:
def collect_data(master_txt, patient_id):
    """Return the lines of the master file that belong to one patient.

    A line belongs to a patient when the ID parsed from it equals
    ``patient_id`` exactly. The ID is the segment after the first '/', up to
    that segment's first '_' -- the same rule get_patient_ids applies.
    Plain substring matching would also select other patients whose ID merely
    contains this one (e.g. 'P1' inside 'P10'), or lines where the ID text
    shows up in some other field, putting their clips into the wrong split.

    Args:
        master_txt: Path to the master annotation text file.
        patient_id: Patient ID to select.

    Returns:
        List of matching lines in file order, line endings preserved. Lines
        without a '/' carry no parsable patient ID and are never returned.
    """
    selected = []
    with open(master_txt, mode='r') as master:
        for line in master:
            segments = line.split('/')
            # Need at least one '/' for the ID segment to exist.
            if len(segments) > 1 and segments[1].split('_')[0] == patient_id:
                selected.append(line)
    return selected

def get_patient_ids(master_text):
    """Return every distinct patient ID found in the master file.

    The ID of a line is the segment after the first '/', cut at that
    segment's first '_'.

    Args:
        master_text: Path to the master annotation text file.

    Returns:
        Sorted list of unique patient IDs. Sorting makes the order stable
        between runs; the order of a set of strings is not.

    Raises:
        IndexError: If a non-blank line contains no '/'.
    """
    patient_ids = set()
    with open(master_text, mode='r') as master:
        for line in master:
            # Blank lines (e.g. stray newlines at the end of the file) hold
            # no annotation; skip them instead of failing on the split.
            if not line.strip():
                continue
            patient_ids.add(line.split('/')[1].split('_')[0])
    return sorted(patient_ids)

def create_split_files(master_txt, output_dir, test_patient_id, train_ratio=0.8, seed=None):
    """Write train/val/test annotation files with one patient held out for testing.

    All patients except ``test_patient_id`` are randomly divided into a train
    group of ``int(train_ratio * n)`` patients and a validation group holding
    the rest. The master-file lines of each group are written to
    ``train_<id>.txt``, ``val_<id>.txt`` and ``test_<id>.txt`` inside
    ``output_dir``, where ``<id>`` is the test patient ID.

    Args:
        master_txt: Path to the master annotation text file.
        output_dir: Directory for the three output files; created if missing.
        test_patient_id: Patient ID to hold out as the test set.
        train_ratio: Fraction of the remaining patients assigned to training.
        seed: Optional seed for a reproducible split. When None, the
            module-level ``random`` generator is used, as before.

    Raises:
        ValueError: If ``test_patient_id`` does not occur in the master file.
    """
    # Sort so the candidate order does not depend on set iteration order;
    # otherwise a fixed seed could still produce different splits per run.
    patient_ids = sorted(get_patient_ids(master_txt))
    if test_patient_id not in patient_ids:
        raise ValueError(
            f"test patient ID {test_patient_id!r} not found in {master_txt}"
        )
    patient_ids.remove(test_patient_id)

    # A private generator keeps a seeded split from disturbing global state.
    rng = random if seed is None else random.Random(seed)
    train_patients = rng.sample(patient_ids, k=int(train_ratio * len(patient_ids)))
    train_lookup = set(train_patients)
    val_patients = [pid for pid in patient_ids if pid not in train_lookup]

    # Collect the annotation lines for each split.
    split_data = {
        "test": collect_data(master_txt, test_patient_id),
        "train": [
            example
            for pid in train_patients
            for example in collect_data(master_txt, pid)
        ],
        "val": [
            example
            for pid in val_patients
            for example in collect_data(master_txt, pid)
        ],
    }

    os.makedirs(output_dir, exist_ok=True)

    # One file per split, each named after the held-out patient.
    for split_name, examples in split_data.items():
        out_path = os.path.join(output_dir, f"{split_name}_{test_patient_id}.txt")
        with open(out_path, mode='w') as split_file:
            split_file.writelines(examples)



In [5]:
# Leave-one-patient-out driver: every patient ID found in the master
# annotation file takes one turn as the held-out test patient, and
# create_split_files is called once per turn.
master_txt = 'annotations/aidan_allclips_annotations.txt'
output_dir = 'annotations/patients'

patient_ids = get_patient_ids(master_txt)

for test_patient_id in patient_ids:
    create_split_files(
        master_txt=master_txt,
        output_dir=output_dir,
        test_patient_id=test_patient_id,
    )