In [31]:
import os
import csv
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [14]:
# Switch to the project root; the cells below use relative 'data/...' paths,
# presumably resolved against this directory (confirm against the later os.chdir('..') cell).
# The default is the original machine-specific location; set the
# ROLLING_IN_THE_DEEP_ROOT environment variable to run from a checkout elsewhere.
os.chdir(os.environ.get('ROLLING_IN_THE_DEEP_ROOT', '/home/agurianova/projects/rolling-in-the-deep/'))

In [30]:
def create_csv(root_dir, filename):
    """Index multi-channel PNG images under ``root_dir`` into a CSV file.

    Every sub-directory of ``root_dir`` is treated as one class. Inside a class
    directory, each file ending in ``_ch1.png`` defines one sample, and the
    sibling files ``<basename>_ch2.png`` ... ``<basename>_ch7.png`` are looked
    up next to it. One row is written per sample with the columns
    ``img_ch1`` ... ``img_ch7`` and ``class``; a channel whose file does not
    exist is written as an empty string. Classes and files are processed in
    sorted order.

    Args:
        root_dir: Directory containing one sub-directory per class.
        filename: Name of the CSV file to create inside ``root_dir``.
    """
    suffix = '_ch1.png'
    channels = range(1, 8)

    rows = []  # accumulated CSV rows
    for class_name in sorted(os.listdir(root_dir)):
        class_dir = os.path.join(root_dir, class_name)
        if not os.path.isdir(class_dir):  # plain files at the top level are not classes
            continue

        for file_name in sorted(os.listdir(class_dir)):
            if not file_name.endswith(suffix):
                continue

            # Remove only the trailing suffix, so a name that happens to contain
            # '_ch1.png' earlier in the string is not cut short.
            basename = file_name[:-len(suffix)]

            channel_paths = []
            for i in channels:
                path = os.path.join(class_dir, f"{basename}_ch{i}.png")
                channel_paths.append(path if os.path.exists(path) else '')

            rows.append(channel_paths + [class_name])

    # os.path.join keeps the CSV inside root_dir whether or not root_dir ends
    # with a path separator (plain concatenation would not).
    output_path = os.path.join(root_dir, filename)
    with open(output_path, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow([f'img_ch{i}' for i in channels] + ['class'])
        writer.writerows(rows)

    

def subset_and_split_csv(filename, subset_size, train_size):
    """Save a class-stratified subset of a CSV index next to the original file.

    The CSV at ``filename`` is loaded, its ``class`` distribution is printed,
    and a subset stratified on ``class`` is written to
    ``<base>_subset_<subset_size><ext>``; the subset's distribution is printed
    as well.

    Args:
        filename: Path to a CSV file that has a ``class`` column.
        subset_size: Size of the subset, forwarded to ``train_test_split`` as
            its ``train_size``. An integer equal to the number of rows selects
            the whole table.
        train_size: Train share for the train/validation split. Currently
            unused: that step is disabled (see the commented-out code at the
            end of the function).
    """
    df = pd.read_csv(filename)
    csv_base, csv_extension = os.path.splitext(filename)

    print("\nOriginal class distribution:")
    print(df['class'].value_counts())

    # 1. Subset
    if isinstance(subset_size, int) and subset_size == len(df):
        # train_test_split rejects an integer train_size that is not smaller
        # than the number of samples, so "take every row" has to bypass it.
        subset_df = df
    else:
        subset_df, _ = train_test_split(df, train_size=subset_size, stratify=df['class'], random_state=42)
    subset_filename = csv_base + '_subset_' + str(subset_size) + csv_extension
    subset_df.to_csv(subset_filename, index=False)

    print("\nSubset class distribution:")
    print(subset_df['class'].value_counts())

    # 2. Split (disabled)
    #train_df, val_df = train_test_split(subset_df, train_size=train_size, stratify=subset_df['class'], random_state=42)

    #train_filename = csv_base + '_subset_' + str(subset_size) + '_train_' + str(round(train_size,1)) + csv_extension
    #val_filename = csv_base + '_subset_' + str(subset_size) + '_val_' + str(round(1-train_size,1)) + csv_extension
    #train_df.to_csv(train_filename, index=False)
    #val_df.to_csv(val_filename, index=False)

    #print("\nTraining set class distribution:")
    #print(train_df['class'].value_counts())

    #print("\nValidation set class distribution:")
    #print(val_df['class'].value_counts())

In [25]:
def create_csv_optimized(root_dir, filename):
    """Stream a CSV index of multi-channel PNG images found under ``root_dir``.

    Each sub-directory of ``root_dir`` is one class. Its file names are listed
    once into a set, and every ``*_ch1.png`` file yields one row holding the
    paths of ``<basename>_ch1.png`` ... ``<basename>_ch7.png`` followed by the
    class name. Channels missing from the directory listing are written as
    empty strings. Rows are written as they are produced rather than collected
    in memory first.

    Classes and files are visited in sorted name order so that repeated runs
    produce the same CSV (directory-listing and set iteration order are not
    guaranteed).

    Args:
        root_dir: Directory containing one sub-directory per class.
        filename: Name of the CSV file to create inside ``root_dir``.
    """
    suffix = '_ch1.png'
    channels = range(1, 8)
    output_path = os.path.join(root_dir, filename)

    with open(output_path, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow([f'img_ch{i}' for i in channels] + ['class'])

        # Only directories are kept, so the CSV being written into root_dir is ignored.
        with os.scandir(root_dir) as entries:
            class_dirs = sorted((entry for entry in entries if entry.is_dir()),
                                key=lambda entry: entry.name)

        for class_entry in class_dirs:
            class_name = class_entry.name
            class_dir = class_entry.path
            print(f"Class name: {class_name}")

            # One directory listing per class; membership tests against the set
            # replace a filesystem lookup for every channel of every sample.
            with os.scandir(class_dir) as entries:
                all_files = {entry.name for entry in entries if entry.is_file()}
            ch1_files = sorted(name for name in all_files if name.endswith(suffix))

            for ch1_file in tqdm(ch1_files, desc=class_name, leave=False):
                basename = ch1_file[:-len(suffix)]
                row = []
                for i in channels:
                    # Distinct local name: does not shadow the `filename` parameter.
                    channel_file = f"{basename}_ch{i}.png"
                    row.append(os.path.join(class_dir, channel_file) if channel_file in all_files else '')
                row.append(class_name)
                writer.writerow(row)


In [26]:
# Build the CSV index for the images200k directory; the file is written inside root_dir.
root_dir, filename = 'data/deepvariant/images200k', 'data.csv'
create_csv_optimized(root_dir=root_dir, filename=filename)

Class name: 2


                                                           

Class name: 0


                                           

Class name: 1


                                                             

In [29]:
# Drop samples that lack any channel image and rewrite the index in place.
# read_csv already parses empty CSV fields as NaN by default; the replace() also
# covers literal empty strings. It is chained rather than applied with
# inplace=True, so `df` keeps the table exactly as it was loaded.
data_csv_path = "data/deepvariant/images200k/data.csv"
df = pd.read_csv(data_csv_path)
print(df.shape[0])
df_clean = df.replace('', pd.NA).dropna()
print(df_clean.shape[0])
df_clean.to_csv(data_csv_path, index=False)

203036
203035


In [3]:
# Move the working directory up one level.
# NOTE(review): this cell carries execution count 3, lower than the cells above it,
# so it was presumably run in an earlier kernel state - confirm it is still wanted
# on a top-to-bottom run.
os.chdir(os.pardir)

In [15]:
# Index regeneration is left disabled; only the subset step runs on the existing CSV.
#create_csv(root_dir = 'data/deepvariant/', filename='data.csv')
subset_and_split_csv(
    filename='data/deepvariant/data.csv',
    subset_size=20000,
    train_size=0.8,
)


Original class distribution:
class
1    11664
2     8140
0      196
Name: count, dtype: int64

Subset class distribution:
class
1    11664
2     8140
0      196
Name: count, dtype: int64

Training set class distribution:
class
1    9331
2    6512
0     157
Name: count, dtype: int64

Validation set class distribution:
class
1    2333
2    1628
0      39
Name: count, dtype: int64


In [33]:
# Index regeneration is left disabled; only the subset step runs on the existing CSV.
#create_csv(root_dir = 'data/deepvariant/', filename='data.csv')
subset_and_split_csv(
    filename='data/deepvariant/images200k/data.csv',
    subset_size=40000,
    train_size=0.8,
)


Original class distribution:
class
1    121514
2     79810
0      1711
Name: count, dtype: int64

Subset class distribution:
class
1    23940
2    15723
0      337
Name: count, dtype: int64


# Check whether the files in the dataset are unique

In [6]:
# Load the image index whose file names are checked for duplicates below.
df = pd.read_csv(filepath_or_buffer="data/deepvariant/data.csv")

In [7]:
# Keep the part of each channel-1 file name between "image_" and "_ch1.png", e.g.
# "data/deepvariant/0/image_chr1:104427287_G->AAA_label0_ch1.png" -> "chr1:104427287_G->AAA_label0".
# The pattern has two capture groups; only the first one (column 0) is stored.
df['middle_part'] = (
    df['img_ch1']
    .str.extract(r'image_(.*?)(_ch1\.png)')
    .loc[:, 0]
)

In [9]:
# True when no extracted identifier occurs more than once.
df.loc[:, 'middle_part'].is_unique

True