### 2D

In [15]:
import pandas as pd
import numpy as np
import os
import cv2
import glob
from sklearn.model_selection import train_test_split

# Load and preprocess the DataFrame
# Dataset root; change this one line to run the notebook on another machine.
DATA_DIR = "/Users/arahjou/Downloads/Medical_seg/Backup"
ROWS_PER_IMAGE = 3  # every image is paired with three consecutive CSV rows

df_train = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
df_train = df_train.sort_values(["id", "class"]).reset_index(drop=True)
df_train["patient"] = df_train.id.apply(lambda x: x.split("_")[0])
df_train["days"] = df_train.id.apply(lambda x: "_".join(x.split("_")[:2]))


def _scan_sort_key(path):
    """Return '<day directory>_<file name>' for a scan path.

    The key is built from the trailing path components. The previous key used
    fixed positions of the absolute path (``split("/")[3]`` and ``[5]``), which
    for this root are always "Downloads" and "Backup", so nothing was sorted.
    """
    day_dir = os.path.basename(os.path.dirname(os.path.dirname(path)))
    return day_dir + "_" + os.path.basename(path)


# Read and process all image files (glob returns them in arbitrary order,
# so the explicit sort is what lines the files up with the sorted CSV rows).
all_image_files = sorted(
    glob.glob(os.path.join(DATA_DIR, "train", "*", "*", "scans", "*.png")),
    key=_scan_sort_key,
)

# File stems end in ..._<slice>_<size_x>_<size_y>_<spacing_x>_<spacing_y>
stem_fields = [os.path.basename(x)[:-4].split("_") for x in all_image_files]
size_x = [int(fields[-4]) for fields in stem_fields]
size_y = [int(fields[-3]) for fields in stem_fields]
spacing_x = [float(fields[-2]) for fields in stem_fields]
spacing_y = [float(fields[-1]) for fields in stem_fields]
slice_numbers = [int(fields[-5]) for fields in stem_fields]

# Fail loudly instead of silently pairing masks with the wrong scans.
if len(df_train) != ROWS_PER_IMAGE * len(all_image_files):
    raise ValueError(
        f"Expected {ROWS_PER_IMAGE} CSV rows per image, got {len(df_train)} rows "
        f"for {len(all_image_files)} images"
    )
# Assumes ids look like '<day directory>_<file-name prefix>' -- TODO confirm
# against the dataset if this check ever fires.
row_keys = np.repeat([_scan_sort_key(x) for x in all_image_files], ROWS_PER_IMAGE)
if not all(key.startswith(row_id + "_") for row_id, key in zip(df_train["id"], row_keys)):
    raise ValueError("CSV rows and scan files are not aligned after sorting")

# Assign images to DataFrame
df_train["image_files"] = np.repeat(all_image_files, ROWS_PER_IMAGE)
df_train["spacing_x"] = np.repeat(spacing_x, ROWS_PER_IMAGE)
df_train["spacing_y"] = np.repeat(spacing_y, ROWS_PER_IMAGE)
df_train["size_x"] = np.repeat(size_x, ROWS_PER_IMAGE)
df_train["size_y"] = np.repeat(size_y, ROWS_PER_IMAGE)
df_train["slice"] = np.repeat(slice_numbers, ROWS_PER_IMAGE)

# Split data into training, validation, and test sets (by day, so all slices
# of one scan day end up in the same set)
train_val, test = train_test_split(df_train['days'].unique(), test_size=0.05, random_state=42)
train, val = train_test_split(train_val, test_size=0.20 / 0.95, random_state=42)  # Adjust proportion to account for earlier split

# Helper function to decode RLE masks
def rle_decode(mask_rle, shape):
    """Turn a run-length string into a binary uint8 mask of the given 2D shape.

    ``mask_rle`` is a whitespace-separated string of alternating
    (1-based start, run length) integers addressing the flattened mask;
    ``shape`` is ``(height, width)``. Pixels inside a run are set to 1.
    """
    tokens = np.array(mask_rle.split(), dtype=int)
    run_starts = tokens[0::2] - 1  # convert 1-based starts to 0-based offsets
    run_stops = run_starts + tokens[1::2]
    height, width = shape
    flat_mask = np.zeros((height * width,), dtype=np.uint8)
    for span in map(slice, run_starts, run_stops):
        flat_mask[span] = 1
    return flat_mask.reshape(shape)

# Create directories for each set
# Plain Python instead of `!mkdir -p ./x/{images,labels,splits}`: brace
# expansion depends on which shell runs the command (and `{...}` is also
# IPython's variable-interpolation syntax), so the folders are created
# explicitly here.
for set_dir in ("mmseg_train", "mmseg_val", "mmseg_test"):
    for sub_dir in ("images", "labels", "splits"):
        os.makedirs(f"./{set_dir}/{sub_dir}", exist_ok=True)

# Function to process groups of data for 2D images
def process_data(groups, set_name):
    """Export every slice of each (day, group) pair as an image/label PNG pair.

    Parameters
    ----------
    groups : iterable of (day, DataFrame)
        Typically ``df.groupby('days')``. Each frame must provide the columns
        ``image_files``, ``segmentation`` and ``class``.
    set_name : str
        Output folder (relative to the working directory); its ``images/`` and
        ``labels/`` sub-folders must already exist.

    Raises
    ------
    FileNotFoundError
        If OpenCV cannot read a scan.
    OSError
        If OpenCV cannot write an output file.
    """
    # Imported here because this cell has no tqdm import of its own; without it
    # the function only works when another cell has imported tqdm first.
    from tqdm.auto import tqdm

    for day, group in tqdm(groups):
        # sort=False keeps the slices in the order they appear in the frame.
        for file_name, segms in group.groupby("image_files", sort=False):
            img = cv2.imread(file_name, cv2.IMREAD_ANYDEPTH)
            if img is None:  # cv2.imread signals failure by returning None
                raise FileNotFoundError(f"Could not read image: {file_name}")
            mask_shape = img.shape[:2]

            masks = {}
            for segm, label in zip(segms["segmentation"], segms["class"]):
                if pd.isna(segm):
                    # No annotation for this class on this slice -> empty mask.
                    masks[label] = np.zeros(mask_shape, dtype=np.uint8)
                else:
                    masks[label] = rle_decode(segm, mask_shape)
            # One channel per class, in alphabetical class order.
            mask_stack = np.stack([masks[k] for k in sorted(masks)], -1)

            new_file_name = f"{day}_{os.path.basename(file_name)}"
            for sub_dir, array in (("images", img), ("labels", mask_stack)):
                out_path = f"./{set_name}/{sub_dir}/{new_file_name}"
                # cv2.imwrite returns False (no exception) when it cannot write.
                if not cv2.imwrite(out_path, array):
                    raise OSError(f"Could not write {out_path}")

# Process each set
# Export the rows of each split into its own output folder.
for split_days, out_dir in ((train, 'mmseg_train'), (val, 'mmseg_val'), (test, 'mmseg_test')):
    split_rows = df_train[df_train['days'].isin(split_days)]
    process_data(split_rows.groupby('days'), out_dir)


  0%|          | 0/205 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

### Removing images with no related run-length values

To skip images that have no run-length encoding (RLE) in the CSV, add a filtering step before the images are processed. The filter drops every row whose `segmentation` value is NaN (i.e. there is no RLE for that class on that slice), so an image is skipped only when all of its class rows are dropped.

The cell below is the 2D pipeline with this filter added, so that images without any segmentation data are not processed:

In [None]:
import pandas as pd
import numpy as np
import os
import cv2
import glob
from sklearn.model_selection import train_test_split

# Load and preprocess the DataFrame
# Dataset root; change this one line to run the notebook on another machine.
DATA_DIR = "/Users/arahjou/Downloads/Medical_seg/Backup"
ROWS_PER_IMAGE = 3  # every image is paired with three consecutive CSV rows

df_train = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
df_train = df_train.sort_values(["id", "class"]).reset_index(drop=True)
df_train["patient"] = df_train.id.apply(lambda x: x.split("_")[0])
df_train["days"] = df_train.id.apply(lambda x: "_".join(x.split("_")[:2]))


def _scan_sort_key(path):
    """Return '<day directory>_<file name>' for a scan path.

    The key is built from the trailing path components. The previous key used
    fixed positions of the absolute path (``split("/")[3]`` and ``[5]``), which
    for this root are always "Downloads" and "Backup", so nothing was sorted.
    """
    day_dir = os.path.basename(os.path.dirname(os.path.dirname(path)))
    return day_dir + "_" + os.path.basename(path)


# Read and process all image files (glob returns them in arbitrary order,
# so the explicit sort is what lines the files up with the sorted CSV rows).
all_image_files = sorted(
    glob.glob(os.path.join(DATA_DIR, "train", "*", "*", "scans", "*.png")),
    key=_scan_sort_key,
)

# File stems end in ..._<slice>_<size_x>_<size_y>_<spacing_x>_<spacing_y>
stem_fields = [os.path.basename(x)[:-4].split("_") for x in all_image_files]
size_x = [int(fields[-4]) for fields in stem_fields]
size_y = [int(fields[-3]) for fields in stem_fields]
spacing_x = [float(fields[-2]) for fields in stem_fields]
spacing_y = [float(fields[-1]) for fields in stem_fields]
slice_numbers = [int(fields[-5]) for fields in stem_fields]

# Fail loudly instead of silently pairing masks with the wrong scans.
if len(df_train) != ROWS_PER_IMAGE * len(all_image_files):
    raise ValueError(
        f"Expected {ROWS_PER_IMAGE} CSV rows per image, got {len(df_train)} rows "
        f"for {len(all_image_files)} images"
    )
# Assumes ids look like '<day directory>_<file-name prefix>' -- TODO confirm
# against the dataset if this check ever fires.
row_keys = np.repeat([_scan_sort_key(x) for x in all_image_files], ROWS_PER_IMAGE)
if not all(key.startswith(row_id + "_") for row_id, key in zip(df_train["id"], row_keys)):
    raise ValueError("CSV rows and scan files are not aligned after sorting")

# Assign images to DataFrame. This has to happen on the unfiltered table:
# np.repeat yields exactly ROWS_PER_IMAGE entries per file, so assigning after
# rows have been dropped fails with a length mismatch.
df_train["image_files"] = np.repeat(all_image_files, ROWS_PER_IMAGE)
df_train["spacing_x"] = np.repeat(spacing_x, ROWS_PER_IMAGE)
df_train["spacing_y"] = np.repeat(spacing_y, ROWS_PER_IMAGE)
df_train["size_x"] = np.repeat(size_x, ROWS_PER_IMAGE)
df_train["size_y"] = np.repeat(size_y, ROWS_PER_IMAGE)
df_train["slice"] = np.repeat(slice_numbers, ROWS_PER_IMAGE)

# Filter out entries without a run-length encoding
df_train = df_train.dropna(subset=['segmentation'])

# Split data into training, validation, and test sets (by day, so all slices
# of one scan day end up in the same set)
train_val, test = train_test_split(df_train['days'].unique(), test_size=0.05, random_state=42)
train, val = train_test_split(train_val, test_size=0.20 / 0.95, random_state=42)

# Helper function to decode RLE masks
def rle_decode(mask_rle, shape):
    """Decode a run-length string into a binary uint8 mask shaped ``shape``.

    The string holds whitespace-separated pairs of (1-based start, run length)
    that address the flattened mask; ``shape`` is ``(height, width)``.
    """
    numbers = np.array(mask_rle.split(), dtype=int)
    first = numbers[0::2] - 1  # 1-based -> 0-based
    last = first + numbers[1::2]
    n_rows, n_cols = shape
    canvas = np.zeros((n_rows * n_cols,), dtype=np.uint8)
    for run in map(slice, first, last):
        canvas[run] = 1
    return canvas.reshape(shape)

# Create directories for each set
# One images/ and one labels/ folder per split; existing folders are reused.
for set_dir in ("mmseg_train", "mmseg_val", "mmseg_test"):
    for sub_dir in ("images", "labels"):
        os.makedirs(f"./{set_dir}/{sub_dir}", exist_ok=True)

# Function to process groups of data for 2D images
def process_data(groups, set_name, classes=None):
    """Export every slice of each (day, group) pair as an image/label PNG pair.

    Rows without a segmentation may have been filtered out by the caller, so a
    slice can arrive with fewer rows than there are classes. The label image
    therefore always gets one channel per entry of ``classes`` (zero-filled
    when a class has no mask), which keeps channel count and channel meaning
    identical across files.

    Parameters
    ----------
    groups : iterable of (day, DataFrame)
        Typically ``df.groupby('days')``. Each frame must provide the columns
        ``image_files``, ``segmentation`` and ``class``.
    set_name : str
        Output folder (relative to the working directory); its ``images/`` and
        ``labels/`` sub-folders must already exist.
    classes : sequence of str, optional
        Channel order of the label image. Defaults to the sorted class names
        found in ``groups``; pass it explicitly to guarantee the same layout
        for every split.

    Raises
    ------
    FileNotFoundError
        If OpenCV cannot read a scan.
    OSError
        If OpenCV cannot write an output file.
    """
    groups = list(groups)  # iterated twice below; also accepts one-shot iterables
    if classes is None:
        classes = sorted({label for _, grp in groups for label in grp["class"].unique()})

    for day, group in groups:
        # sort=False keeps the slices in the order they appear in the frame.
        for file_name, segms in group.groupby("image_files", sort=False):
            img = cv2.imread(file_name, cv2.IMREAD_ANYDEPTH)
            if img is None:  # cv2.imread signals failure by returning None
                raise FileNotFoundError(f"Could not read image: {file_name}")
            mask_shape = img.shape[:2]

            rle_by_class = dict(zip(segms["class"], segms["segmentation"]))
            channels = []
            for label in classes:
                segm = rle_by_class.get(label)
                if segm is None or pd.isna(segm):
                    channels.append(np.zeros(mask_shape, dtype=np.uint8))
                else:
                    channels.append(rle_decode(segm, mask_shape))
            mask_stack = np.stack(channels, -1)

            new_file_name = f"{day}_{os.path.basename(file_name)}"
            for sub_dir, array in (("images", img), ("labels", mask_stack)):
                out_path = f"./{set_name}/{sub_dir}/{new_file_name}"
                # cv2.imwrite returns False (no exception) when it cannot write.
                if not cv2.imwrite(out_path, array):
                    raise OSError(f"Could not write {out_path}")

# Process each set
# Run the export once per split, each into its own folder.
split_targets = {'mmseg_train': train, 'mmseg_val': val, 'mmseg_test': test}
for out_dir, split_days in split_targets.items():
    in_split = df_train['days'].isin(split_days)
    process_data(df_train[in_split].groupby('days'), out_dir)


### 2.5D

In [13]:
import pandas as pd
import numpy as np
import os
import cv2
import glob
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm

# Load and preprocess the DataFrame
# Dataset root; change this one line to run the notebook on another machine.
DATA_DIR = "/Users/arahjou/Downloads/Medical_seg/Backup"
ROWS_PER_IMAGE = 3  # every image is paired with three consecutive CSV rows

df_train = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
df_train = df_train.sort_values(["id", "class"]).reset_index(drop=True)
df_train["patient"] = df_train.id.apply(lambda x: x.split("_")[0])
df_train["days"] = df_train.id.apply(lambda x: "_".join(x.split("_")[:2]))


def _scan_sort_key(path):
    """Return '<day directory>_<file name>' for a scan path.

    The key is built from the trailing path components. The previous key used
    fixed positions of the absolute path (``split("/")[3]`` and ``[5]``), which
    for this root are always "Downloads" and "Backup", so nothing was sorted.
    """
    day_dir = os.path.basename(os.path.dirname(os.path.dirname(path)))
    return day_dir + "_" + os.path.basename(path)


# Read and process all image files (glob returns them in arbitrary order,
# so the explicit sort is what lines the files up with the sorted CSV rows).
all_image_files = sorted(
    glob.glob(os.path.join(DATA_DIR, "train", "*", "*", "scans", "*.png")),
    key=_scan_sort_key,
)

# File stems end in ..._<slice>_<size_x>_<size_y>_<spacing_x>_<spacing_y>
stem_fields = [os.path.basename(x)[:-4].split("_") for x in all_image_files]
size_x = [int(fields[-4]) for fields in stem_fields]
size_y = [int(fields[-3]) for fields in stem_fields]
spacing_x = [float(fields[-2]) for fields in stem_fields]
spacing_y = [float(fields[-1]) for fields in stem_fields]
slice_numbers = [int(fields[-5]) for fields in stem_fields]

# Fail loudly instead of silently pairing masks with the wrong scans.
if len(df_train) != ROWS_PER_IMAGE * len(all_image_files):
    raise ValueError(
        f"Expected {ROWS_PER_IMAGE} CSV rows per image, got {len(df_train)} rows "
        f"for {len(all_image_files)} images"
    )
# Assumes ids look like '<day directory>_<file-name prefix>' -- TODO confirm
# against the dataset if this check ever fires.
row_keys = np.repeat([_scan_sort_key(x) for x in all_image_files], ROWS_PER_IMAGE)
if not all(key.startswith(row_id + "_") for row_id, key in zip(df_train["id"], row_keys)):
    raise ValueError("CSV rows and scan files are not aligned after sorting")

# Assign images to DataFrame
df_train["image_files"] = np.repeat(all_image_files, ROWS_PER_IMAGE)
df_train["spacing_x"] = np.repeat(spacing_x, ROWS_PER_IMAGE)
df_train["spacing_y"] = np.repeat(spacing_y, ROWS_PER_IMAGE)
df_train["size_x"] = np.repeat(size_x, ROWS_PER_IMAGE)
df_train["size_y"] = np.repeat(size_y, ROWS_PER_IMAGE)
df_train["slice"] = np.repeat(slice_numbers, ROWS_PER_IMAGE)

# Split data into training, validation, and test sets (by day, so all slices
# of one scan day end up in the same set)
train_val, test = train_test_split(df_train['days'].unique(), test_size=0.05, random_state=42)
train, val = train_test_split(train_val, test_size=0.20 / 0.95, random_state=42)  # Adjust proportion to account for earlier split

# Helper function to decode RLE masks
def rle_decode(mask_rle, shape):
    """Build a binary uint8 mask of shape ``(height, width)`` from an RLE string.

    ``mask_rle`` alternates 1-based start positions and run lengths
    (whitespace separated), both relative to the flattened mask.
    """
    values = np.array(mask_rle.split(), dtype=int)
    begin = values[0::2] - 1  # shift to 0-based offsets
    stop = begin + values[1::2]
    rows, cols = shape
    flat = np.zeros((rows * cols,), dtype=np.uint8)
    for piece in map(slice, begin, stop):
        flat[piece] = 1
    return flat.reshape(shape)

# Create directories for each set
# Plain Python instead of `!mkdir -p ./x/{images,labels,splits}`: brace
# expansion depends on which shell runs the command (and `{...}` is also
# IPython's variable-interpolation syntax), so the folders are created
# explicitly here.
for set_dir in ("mmseg_train", "mmseg_val", "mmseg_test"):
    for sub_dir in ("images", "labels", "splits"):
        os.makedirs(f"./{set_dir}/{sub_dir}", exist_ok=True)

def process_data(groups, set_name, target_size=(256, 256), label_channel=0):
    """Export neighbour-averaged ("2.5D") slices and their labels as PNG files.

    For every (day, group) pair the slices are read, resized to
    ``target_size`` and stacked. Output image ``i`` is the mean of slices
    ``i-2 .. i+2`` (clipped at the ends of the stack); its label comes from
    slice ``i`` alone.

    Parameters
    ----------
    groups : iterable of (day, DataFrame)
        Typically ``df.groupby('days')``. Each frame must provide the columns
        ``image_files``, ``segmentation`` and ``class``.
    set_name : str
        Output folder (relative to the working directory); its ``images/`` and
        ``labels/`` sub-folders must already exist.
    target_size : tuple of int
        Size handed to ``cv2.resize`` for both images and masks.
    label_channel : int or None
        Class channel (index into the alphabetically sorted class names) that
        is written as the label image. The default 0 reproduces the previous
        output, which keeps only the first class. Pass ``None`` to write all
        class channels.

    Raises
    ------
    FileNotFoundError
        If OpenCV cannot read a scan.
    OSError
        If OpenCV cannot write an output file.
    """
    for day, group in tqdm(groups):
        imgs = []
        msks = []
        # sort=False keeps the slices in the order they appear in the frame.
        for file_name, segms in group.groupby("image_files", sort=False):
            # NOTE(review): IMREAD_GRAYSCALE yields 8-bit data, while the 2D
            # pipeline reads with IMREAD_ANYDEPTH -- confirm the loss of bit
            # depth is intended.
            img = cv2.imread(file_name, cv2.IMREAD_GRAYSCALE)
            if img is None:  # cv2.imread signals failure by returning None
                raise FileNotFoundError(f"Could not read image: {file_name}")
            # RLE offsets index the scan's own pixel grid, so masks must be
            # decoded at the native shape and only then resized.
            native_shape = img.shape

            masks = {}
            for segm, label in zip(segms["segmentation"], segms["class"]):
                if pd.isna(segm):
                    mask = np.zeros(native_shape, dtype=np.uint8)
                else:
                    mask = rle_decode(segm, native_shape)
                # Nearest-neighbour keeps the mask strictly binary.
                masks[label] = cv2.resize(mask, target_size, interpolation=cv2.INTER_NEAREST)

            imgs.append(cv2.resize(img, target_size))  # Resize image to target size
            msks.append(np.stack([masks[k] for k in sorted(masks)], -1))

        imgs = np.stack(imgs, 0)
        msks = np.stack(msks, 0)
        for i in range(msks.shape[0]):
            # Slicing clips at the end of the stack; only the lower bound
            # needs an explicit guard against negative indices.
            img = imgs[max(0, i - 2):i + 3].mean(axis=0)  # Average to maintain one channel
            msk = msks[i] if label_channel is None else msks[i][:, :, label_channel]

            new_file_name = f"{day}_{i}.png"
            outputs = (("images", img.astype(np.uint8)), ("labels", msk.astype(np.uint8)))
            for sub_dir, array in outputs:
                out_path = f"./{set_name}/{sub_dir}/{new_file_name}"
                # cv2.imwrite returns False (no exception) when it cannot write.
                if not cv2.imwrite(out_path, array):
                    raise OSError(f"Could not write {out_path}")


# Process each set
# Export the rows of each split into its own output folder.
for split_days, out_dir in ((train, 'mmseg_train'), (val, 'mmseg_val'), (test, 'mmseg_test')):
    split_rows = df_train[df_train['days'].isin(split_days)]
    process_data(split_rows.groupby('days'), out_dir)

  0%|          | 0/205 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

### Removing images with no related run-length values
To exclude images that have no run-length encoding (RLE) from the 2.5D pipeline, drop the rows whose `segmentation` value is NaN before the images are processed. The script further below is the 2.5D pipeline with this filtering step added.

In [None]:
# Reference (ResNet50-UNet in TensorFlow, step-by-step guide):
# https://idiotdeveloper.com/step-by-step-guide-to-resnet50-unet-in-tensorflow/

In [None]:
import pandas as pd
import numpy as np
import os
import cv2
import glob
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm

# Load and preprocess the DataFrame
# Dataset root; change this one line to run the notebook on another machine.
DATA_DIR = "/Users/arahjou/Downloads/Medical_seg/Backup"
ROWS_PER_IMAGE = 3  # every image is paired with three consecutive CSV rows

df_train = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
df_train = df_train.sort_values(["id", "class"]).reset_index(drop=True)
df_train["patient"] = df_train.id.apply(lambda x: x.split("_")[0])
df_train["days"] = df_train.id.apply(lambda x: "_".join(x.split("_")[:2]))


def _scan_sort_key(path):
    """Return '<day directory>_<file name>' for a scan path.

    The key is built from the trailing path components. The previous key used
    fixed positions of the absolute path (``split("/")[3]`` and ``[5]``), which
    for this root are always "Downloads" and "Backup", so nothing was sorted.
    """
    day_dir = os.path.basename(os.path.dirname(os.path.dirname(path)))
    return day_dir + "_" + os.path.basename(path)


# Read and process all image files (glob returns them in arbitrary order,
# so the explicit sort is what lines the files up with the sorted CSV rows).
all_image_files = sorted(
    glob.glob(os.path.join(DATA_DIR, "train", "*", "*", "scans", "*.png")),
    key=_scan_sort_key,
)

# File stems end in ..._<slice>_<size_x>_<size_y>_<spacing_x>_<spacing_y>
stem_fields = [os.path.basename(x)[:-4].split("_") for x in all_image_files]
size_x = [int(fields[-4]) for fields in stem_fields]
size_y = [int(fields[-3]) for fields in stem_fields]
spacing_x = [float(fields[-2]) for fields in stem_fields]
spacing_y = [float(fields[-1]) for fields in stem_fields]
slice_numbers = [int(fields[-5]) for fields in stem_fields]

# Fail loudly instead of silently pairing masks with the wrong scans.
if len(df_train) != ROWS_PER_IMAGE * len(all_image_files):
    raise ValueError(
        f"Expected {ROWS_PER_IMAGE} CSV rows per image, got {len(df_train)} rows "
        f"for {len(all_image_files)} images"
    )
# Assumes ids look like '<day directory>_<file-name prefix>' -- TODO confirm
# against the dataset if this check ever fires.
row_keys = np.repeat([_scan_sort_key(x) for x in all_image_files], ROWS_PER_IMAGE)
if not all(key.startswith(row_id + "_") for row_id, key in zip(df_train["id"], row_keys)):
    raise ValueError("CSV rows and scan files are not aligned after sorting")

# Assign images to DataFrame. This has to happen on the unfiltered table:
# np.repeat yields exactly ROWS_PER_IMAGE entries per file, so assigning after
# rows have been dropped fails with a length mismatch.
df_train["image_files"] = np.repeat(all_image_files, ROWS_PER_IMAGE)
df_train["spacing_x"] = np.repeat(spacing_x, ROWS_PER_IMAGE)
df_train["spacing_y"] = np.repeat(spacing_y, ROWS_PER_IMAGE)
df_train["size_x"] = np.repeat(size_x, ROWS_PER_IMAGE)
df_train["size_y"] = np.repeat(size_y, ROWS_PER_IMAGE)
df_train["slice"] = np.repeat(slice_numbers, ROWS_PER_IMAGE)

# Filter out entries without a run-length encoding
df_train = df_train.dropna(subset=['segmentation'])

# Split data into training, validation, and test sets (by day, so all slices
# of one scan day end up in the same set)
train_val, test = train_test_split(df_train['days'].unique(), test_size=0.05, random_state=42)
train, val = train_test_split(train_val, test_size=0.20 / 0.95, random_state=42)

# Helper function to decode RLE masks
def rle_decode(mask_rle, shape):
    """Expand an RLE string into a binary uint8 mask with the given 2D shape.

    The string is a whitespace-separated sequence of (1-based start, length)
    pairs over the flattened mask; covered pixels become 1, the rest stay 0.
    """
    pairs = np.array(mask_rle.split(), dtype=int)
    lo_idx = pairs[0::2] - 1  # starts are 1-based in the encoding
    hi_idx = lo_idx + pairs[1::2]
    n_rows, n_cols = shape
    pixels = np.zeros((n_rows * n_cols,), dtype=np.uint8)
    for segment in map(slice, lo_idx, hi_idx):
        pixels[segment] = 1
    return pixels.reshape(shape)

# Create directories for each set
# One images/ and one labels/ folder per split; existing folders are reused.
for set_dir in ("mmseg_train", "mmseg_val", "mmseg_test"):
    for sub_dir in ("images", "labels"):
        os.makedirs(f"./{set_dir}/{sub_dir}", exist_ok=True)

def process_data(groups, set_name, target_size=(256, 256), classes=None, label_channel=0):
    """Export neighbour-averaged ("2.5D") slices and their labels as PNG files.

    For every (day, group) pair the slices are read, resized to
    ``target_size`` and stacked. Output image ``i`` is the mean of stack
    entries ``i-2 .. i+2`` (clipped at the ends); its label comes from entry
    ``i`` alone.

    Rows without a segmentation may have been filtered out by the caller, so a
    slice can arrive with fewer rows than there are classes. Masks are
    therefore laid out by a fixed class list (zero-filled when a class has no
    mask) so that every slice has the same channels.

    Parameters
    ----------
    groups : iterable of (day, DataFrame)
        Typically ``df.groupby('days')``. Each frame must provide the columns
        ``image_files``, ``segmentation`` and ``class``.
    set_name : str
        Output folder (relative to the working directory); its ``images/`` and
        ``labels/`` sub-folders must already exist.
    target_size : tuple of int
        Size handed to ``cv2.resize`` for both images and masks.
    classes : sequence of str, optional
        Channel order of the masks. Defaults to the sorted class names found
        in ``groups``; pass it explicitly to guarantee the same layout for
        every split.
    label_channel : int or None
        Index into ``classes`` of the channel written as the label image. The
        default 0 keeps only the first class, as before. Pass ``None`` to
        write all class channels.

    Raises
    ------
    FileNotFoundError
        If OpenCV cannot read a scan.
    OSError
        If OpenCV cannot write an output file.
    """
    groups = list(groups)  # iterated twice below; also accepts one-shot iterables
    if classes is None:
        classes = sorted({label for _, grp in groups for label in grp["class"].unique()})

    for day, group in tqdm(groups):
        imgs = []
        msks = []
        # NOTE(review): slices whose rows were all filtered out are absent
        # here, so the averaging window below may span non-adjacent slices --
        # confirm this is acceptable.
        # sort=False keeps the slices in the order they appear in the frame.
        for file_name, segms in group.groupby("image_files", sort=False):
            # NOTE(review): IMREAD_GRAYSCALE yields 8-bit data, while the 2D
            # pipeline reads with IMREAD_ANYDEPTH -- confirm the loss of bit
            # depth is intended.
            img = cv2.imread(file_name, cv2.IMREAD_GRAYSCALE)
            if img is None:  # cv2.imread signals failure by returning None
                raise FileNotFoundError(f"Could not read image: {file_name}")
            # RLE offsets index the scan's own pixel grid, so masks must be
            # decoded at the native shape and only then resized.
            native_shape = img.shape

            rle_by_class = dict(zip(segms["class"], segms["segmentation"]))
            channels = []
            for label in classes:
                segm = rle_by_class.get(label)
                if segm is None or pd.isna(segm):
                    mask = np.zeros(native_shape, dtype=np.uint8)
                else:
                    mask = rle_decode(segm, native_shape)
                # Nearest-neighbour keeps the mask strictly binary.
                channels.append(cv2.resize(mask, target_size, interpolation=cv2.INTER_NEAREST))

            imgs.append(cv2.resize(img, target_size))  # Resize image to target size
            msks.append(np.stack(channels, -1))

        imgs = np.stack(imgs, 0)
        msks = np.stack(msks, 0)
        for i in range(msks.shape[0]):
            # Slicing clips at the end of the stack; only the lower bound
            # needs an explicit guard against negative indices.
            img = imgs[max(0, i - 2):i + 3].mean(axis=0)  # Average to maintain one channel
            msk = msks[i] if label_channel is None else msks[i][:, :, label_channel]

            new_file_name = f"{day}_{i}.png"
            outputs = (("images", img.astype(np.uint8)), ("labels", msk.astype(np.uint8)))
            for sub_dir, array in outputs:
                out_path = f"./{set_name}/{sub_dir}/{new_file_name}"
                # cv2.imwrite returns False (no exception) when it cannot write.
                if not cv2.imwrite(out_path, array):
                    raise OSError(f"Could not write {out_path}")

# Process each set
# Run the export once per split, each into its own folder.
split_targets = {'mmseg_train': train, 'mmseg_val': val, 'mmseg_test': test}
for out_dir, split_days in split_targets.items():
    in_split = df_train['days'].isin(split_days)
    process_data(df_train[in_split].groupby('days'), out_dir)