In [None]:
import os
import csv
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
def create_csv(root_dir, filename):
    """Index the multi-channel PNG images under ``root_dir`` into a CSV file.

    Every immediate subdirectory of ``root_dir`` is treated as one class.
    Inside a class directory, each file named ``<basename>_ch1.png`` starts a
    sample; the sibling files ``<basename>_ch2.png`` ... ``<basename>_ch7.png``
    are looked up next to it. A channel file that does not exist is recorded
    as an empty string.

    The CSV has the columns ``img_ch1`` ... ``img_ch7`` followed by ``class``
    and is written inside ``root_dir``.

    Args:
        root_dir: Directory holding one subdirectory per class. A trailing
            path separator is optional.
        filename: Name of the CSV file to create inside ``root_dir``.
    """
    first_channel_suffix = '_ch1.png'
    channel_ids = range(1, 8)

    rows = []  # one entry per sample: 7 channel paths + class name
    for class_name in sorted(os.listdir(root_dir)):

        class_dir = os.path.join(root_dir, class_name)
        if not os.path.isdir(class_dir):  # skip plain files (e.g. an earlier CSV)
            continue

        for file_name in sorted(os.listdir(class_dir)):

            # Only the first-channel file triggers a row; other channels are
            # picked up through the basename below.
            if not file_name.endswith(first_channel_suffix):
                continue

            # Strip only the trailing suffix. Splitting on the marker would
            # truncate names that contain '_ch1.png' earlier in the string.
            basename = file_name[:-len(first_channel_suffix)]

            channel_paths = []
            for i in channel_ids:
                path = os.path.join(class_dir, f"{basename}_ch{i}.png")
                channel_paths.append(path if os.path.exists(path) else '')

            rows.append(channel_paths + [class_name])

    # os.path.join keeps the CSV inside root_dir whether or not root_dir ends
    # with a separator (plain concatenation produced e.g. 'data/dirdata.csv').
    csv_path = os.path.join(root_dir, filename)
    with open(csv_path, mode='w', newline='') as file:
        writer = csv.writer(file)

        header = [f'img_ch{i}' for i in channel_ids] + ['class']
        writer.writerow(header)

        writer.writerows(rows)

    print(f"CSV file created at {csv_path}")

    

def subset_and_split_csv(filename, subset_size, train_size, random_state=42):
    """Draw a stratified subset of a CSV and split it into train/validation.

    The CSV must contain a ``class`` column, which is used for stratification
    in both steps. Three files are written next to ``filename`` (same base
    name and extension):

    * ``<base>_subset_<subset_size><ext>`` - the subset
    * ``<base>_subset_<subset_size>_train_<train_size><ext>`` - training part
    * ``<base>_subset_<subset_size>_val_<val_size><ext>`` - validation part

    Class distributions of the original data, the subset and both splits are
    printed.

    Args:
        filename: Path of the input CSV file.
        subset_size: Size of the subset, forwarded as ``train_size`` to
            ``train_test_split`` (int = number of rows, float = fraction).
        train_size: Size of the training part of the subset, forwarded as
            ``train_size`` to ``train_test_split``.
        random_state: Seed used for both splits. Defaults to 42.
    """
    df = pd.read_csv(filename)
    csv_base, csv_extension = os.path.splitext(filename)

    print("\nOriginal class distribution:")
    print(df['class'].value_counts())

    # 1. Subset
    # NOTE(review): train_test_split is expected to raise ValueError when an
    # integer subset_size is not smaller than the number of rows - confirm
    # whether callers need a fallback for small datasets.
    subset_df, _ = train_test_split(
        df, train_size=subset_size, stratify=df['class'], random_state=random_state
    )
    subset_prefix = f"{csv_base}_subset_{subset_size}"
    subset_filename = subset_prefix + csv_extension
    subset_df.to_csv(subset_filename, index=False)

    print("\nSubset class distribution:")
    print(subset_df['class'].value_counts())

    # 2. Split
    train_df, val_df = train_test_split(
        subset_df, train_size=train_size, stratify=subset_df['class'], random_state=random_state
    )

    # Label for the validation file. For a fractional train_size, round away
    # binary floating-point noise (1 - 0.8 is 0.19999999999999996, not 0.2).
    # For an absolute row count, the complement is the validation row count.
    if isinstance(train_size, float):
        val_size = round(1 - train_size, 10)
    else:
        val_size = len(val_df)

    train_filename = f"{subset_prefix}_train_{train_size}{csv_extension}"
    val_filename = f"{subset_prefix}_val_{val_size}{csv_extension}"
    train_df.to_csv(train_filename, index=False)
    val_df.to_csv(val_filename, index=False)

    print("\nTraining set class distribution:")
    print(train_df['class'].value_counts())

    print("\nValidation set class distribution:")
    print(val_df['class'].value_counts())

In [5]:
# Step up one directory level; presumably this makes the relative 'data/...'
# paths used later in the notebook resolve - verify against the repo layout.
# NOTE: not idempotent - every re-run of this cell moves up one more level.
os.chdir(os.pardir)

In [None]:
# Index every sample under data/deepvariant/ into a single CSV file.
create_csv(root_dir='data/deepvariant/', filename='data.csv')

# Draw a stratified subset from that CSV (subset_size=1000), then split the
# subset into training and validation parts (train_size=0.8).
subset_and_split_csv(
    filename='data/deepvariant/data.csv',
    subset_size=1000,
    train_size=0.8,
)

CSV file created at data/deepvariant/data.csv
