## Pre-process data


In [5]:
import os
import glob
from PIL import Image
from pathlib import Path
import pandas as pd    
from joblib import Parallel, delayed
import numpy as np
import matplotlib.pyplot as plt

Before running the pre-processing, make sure the data directory (`DataPath` in the code below) has the following structure. The folder names must be exactly `CSV_Data/` and `Classified_Data/`, because the code builds its paths from those names.

<small><pre>
Zooplankton-Data/
├── CSV_Data/
│   ├── 20240430_Erie_WNS0904_2mm_Rep1_data.csv
│   ├── 20240430_Erie_WNS0904_2mm_Rep2_data.csv
│   ├── ...
├── Classified_Data/
│   ├── 20240430_Erie_WNS0904_2mm_Rep1/
│   │    ├── 20240430_Erie_WNS0904_2mm_Rep1_000001.tif
│   │    ├── 20240430_Erie_WNS0904_2mm_Rep1_000002.tif
│   │    ├── ...
│   ├── 20240430_Erie_WNS0904_2mm_Rep2/
│   │    ├── 20240430_Erie_WNS0904_2mm_Rep2_000001.tif
│   │    ├── 20240430_Erie_WNS0904_2mm_Rep2_000002.tif
│   │    ├── ...
│   ├── ...
</pre></small>


In [6]:
# Show the current working directory. Plain Python is used instead of the bare
# `pwd` command, which relies on IPython's automagic being enabled.
os.getcwd()

'/Users/ahmedtausif/Desktop/Zooplankton-Data'

In [7]:
# Set your data directory. To use a different location without editing the
# notebook, export ZOOPLANKTON_DATA_DIR before starting Jupyter, e.g.
#   export ZOOPLANKTON_DATA_DIR=/home/sofia/Candu_postdoc/Hierarchical_Model/Zooplankton-Data/
DataPath = os.environ.get(
    'ZOOPLANKTON_DATA_DIR',
    '/Users/ahmedtausif/Desktop/Zooplankton-Data/data/',
)
# Later cells build paths as `DataPath + '<folder>/'`, so guarantee that the
# value ends with a path separator whatever the user supplied.
DataPath = os.path.join(DataPath, '')

# Set directory to tif data (one sub-folder of mosaics per sample)
TifPath = os.path.join(DataPath, 'Classified_Data', '')
sub_directories = sorted(
    d for d in os.listdir(TifPath) if os.path.isdir(os.path.join(TifPath, d))
)

# all_tif_files is a list of lists: one list of mosaic .tif paths per sample
# folder, sorted so the processing order is the same on every run.
all_tif_files = []
for directory in sub_directories:
    complete_path = os.path.join(TifPath, directory)
    tif_files = sorted(glob.glob(os.path.join(complete_path, '*.tif')))
    all_tif_files.append(tif_files)

# Set directory to csv data
CsvPath = os.path.join(DataPath, 'CSV_Data', '')
# os.path.join avoids the doubled separator that '<CsvPath>/' + '/*.csv' produced.
csv_files = glob.glob(os.path.join(CsvPath, '*.csv'))

In [8]:
# Count the samples: every row of every CSV file is one sample
row_counts = [pd.read_csv(csv_path).shape[0] for csv_path in csv_files]
n_samples = sum(row_counts)

n_samples

57096

In [9]:
# Count the mosaics: add up the number of .tif paths found in each folder
n_mosaics = sum(map(len, all_tif_files))

n_mosaics

248

In [10]:
# Crop every annotated particle out of its mosaic and save it as
# Processed_Data/<class>/<mosaic name>_<particle id>.tif

n_images = 0

# Output root (does not change inside the loops)
base_path = DataPath + 'Processed_Data/'

# CSV columns (after name normalisation) needed to crop and label a particle
crop_columns = [
    'particle_id', 'image_x', 'image_y',
    'image_height', 'image_width', 'class', 'group'
]

# Annotation tables keyed by CSV path, so each CSV is read and normalised
# once instead of once per mosaic.
annotations_by_csv = {}

# Class folders already created, to avoid one makedirs call per particle
created_class_dirs = set()

for list_files in all_tif_files:

    for file in list_files:

        file_name = os.path.basename(file)  # Extract the filename
        cleaned_name = file_name.removesuffix('.tif')  # Remove .tif

        # Key used to look up the CSV, e.g. 'WNS0904_2mm_Rep1' for
        # '20240430_Erie_WNS0904_2mm_Rep1_000001.tif'. It is built from the
        # basename so it does not depend on the OS path separator.
        file_value = '_'.join(file_name.split('_')[2:5])

        # Match against the CSV file name only, not its parent directories.
        candidate_csvs = [
            s for s in csv_files if file_value in os.path.basename(s)
        ]
        if not candidate_csvs:
            raise FileNotFoundError(
                f"No CSV file matching '{file_value}' found for mosaic {file}"
            )

        # The key is only a substring ('..._Rep1' also matches '..._Rep10',
        # and the date is not part of it), so take the first candidate that
        # actually lists this mosaic in its image_file column.
        for curr_csv_file in candidate_csvs:
            if curr_csv_file not in annotations_by_csv:
                curr_csv = pd.read_csv(curr_csv_file)
                # Normalise headers: spaces and dots -> '_', lower case
                curr_csv.columns = [
                    col.replace(' ', '_').replace('.', '_').lower()
                    for col in curr_csv.columns
                ]
                annotations_by_csv[curr_csv_file] = curr_csv

            curr_csv_clear = annotations_by_csv[curr_csv_file]
            df_filtered = curr_csv_clear.loc[
                curr_csv_clear["image_file"] == file_name, crop_columns
            ]
            if not df_filtered.empty:
                break

        if df_filtered.empty:
            # No annotation rows for this mosaic: nothing to crop.
            continue

        # 'class' is a Python keyword, so itertuples() could not expose it
        # as an attribute; rename it first.
        df_filtered = df_filtered.rename(columns={'class': 'Class'})

        # The context manager closes the mosaic's file handle once all of
        # its crops have been saved.
        with Image.open(file) as image:

            for row in df_filtered.itertuples():

                # Crop box is (left, upper, right, lower)
                cropped_image = image.crop(
                    (
                        row.image_x, row.image_y,
                        row.image_x + row.image_width,
                        row.image_y + row.image_height
                    )
                )

                # Nauplius particles are split into one folder per group
                curr_class = row.Class
                if curr_class == 'Nauplius':
                    curr_class = curr_class + '_' + row.group

                class_path = base_path + str(curr_class)
                if class_path not in created_class_dirs:
                    os.makedirs(class_path, exist_ok=True)
                    created_class_dirs.add(class_path)

                # save .tif
                name_file = os.path.join(
                    class_path, f'{cleaned_name}_{row.particle_id}.tif'
                )
                cropped_image.save(name_file)

                n_images += 1


In [12]:
# Number of cropped images counted by the crop loop. It should equal
# n_samples (the total number of CSV rows) if every row was matched to a mosaic.
n_images

57096

After processing the data, you should get the following structure inside the data directory (`DataPath`):

<small><pre>
Processed_Data/
├── Bosminidae/
│   ├── 20240430_Erie_WNS0904_2mm_Rep1_000001_1.tif
│   ├── 20240430_Erie_WNS0904_2mm_Rep1_000001_2.tif
│   ├── ...
├── Brachionidae/
│   ├── 20240430_Erie_WNS0904_2mm_Rep1_000009_1.tif
│   ├── 20240430_Erie_WNS0904_2mm_Rep2_000001_1.tif
│   ├── ...
├── Bubbles/
│   ├── 20240430_Erie_WNS0904_2mm_Rep1_000001_1.tif
│   ├── 20240430_Erie_WNS0904_2mm_Rep1_000001_2.tif
│   ├── ...
│ ...
</pre></small>

where each `.tif` file is an individual sample, named `<mosaic name>_<particle id>.tif` and stored in the folder of its class. Samples of class `Nauplius` are stored in `Nauplius_<group>/` folders, one per group.