<a href="https://colab.research.google.com/github/aletyska/crc_wsi_classification/blob/main/Trabalho_Grau_B_Alessandro_Tyska.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Colorectal Cancer WSI Classification with CNN Neural Network
* UNISINOS - Universidade do Vale do Rio dos Sinos
* PPGCA - Programa de Pós-Graduação em Computação Aplicada
* Student: Alessandro Tyska
* Supervisor: Dr. Felipe Zeiser
* Course: Deep Learning

In [1]:
# @title Installing Packages
!pip install kagglehub



In [2]:
# @title Imports and Variables

import os
import kagglehub
import shutil

KAGGLE_DATASET_NAME = "mahdiislam/colorectal-cancer-wsi"
KAGGLE_DATASET_PATH = kagglehub.dataset_download(KAGGLE_DATASET_NAME)
KAGGLE_DATASET_PATH = os.path.join(KAGGLE_DATASET_PATH, "EBHI-SEG")

Using Colab cache for faster access to the 'colorectal-cancer-wsi' dataset.


In [3]:
# @title Basic Functions
def print_dataset_folder_structure(root_path):
    for root, dirs, files in os.walk(root_path):
        level = root.replace(root_path, '').count(os.sep)
        indent = ' ' * 4 * (level)
        print(f'{indent}{os.path.basename(root)}/')
        subindent = ' ' * 4 * (level + 1)

def restructure_dataset(root_path):
    if not os.path.exists(root_path):
        print(f"Path not found: {root_path}")
        return

    # Iterate over the class folders (High-grade IN, Normal, etc.)
    for class_name in os.listdir(root_path):
        class_dir = os.path.join(root_path, class_name)

        # Ensure we are processing directories only
        if os.path.isdir(class_dir):
            print(f"Processing: {class_name}")

            # 1. Remove 'label' folder
            label_dir = os.path.join(class_dir, 'label')
            if os.path.exists(label_dir):
                shutil.rmtree(label_dir)
                print(f"  - Removed label folder")

            # 2. Move files from 'image' folder to the class folder
            image_dir = os.path.join(class_dir, 'image')
            if os.path.exists(image_dir):
                for filename in os.listdir(image_dir):
                    src_file = os.path.join(image_dir, filename)
                    dst_file = os.path.join(class_dir, filename)

                    # Move the file
                    shutil.move(src_file, dst_file)

                print(f"  - Moved images to {class_name}/")

                # 3. Remove the now empty 'image' folder
                os.rmdir(image_dir)
                print(f"  - Removed image folder")

    print("\nRestructuring complete.")

In [4]:
print_dataset_folder_structure(KAGGLE_DATASET_PATH)

EBHI-SEG/
    High-grade IN/
        label/
        image/
    Normal/
        label/
        image/
    Adenocarcinoma/
        label/
        image/
    Serrated adenoma/
        label/
        image/
    Polyp/
        label/
        image/
    Low-grade IN/
        label/
        image/


In [5]:
restructure_dataset(KAGGLE_DATASET_PATH)

Processing: High-grade IN


OSError: [Errno 30] Read-only file system: '/kaggle/input/colorectal-cancer-wsi/EBHI-SEG/High-grade IN/label'

In [None]:
print_dataset_folder_structure(KAGGLE_DATASET_PATH)

In [None]:
from torchvision.datasets import ImageFolder
from torchvision import transforms

transforms_compose = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

dataset = ImageFolder(KAGGLE_DATASET_PATH, transform=transforms_compose)

In [None]:
import torch

train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

In [None]:
import pandas as pd

# Prepare data for DataFrame
summary_data = []
for class_name in class_names:
    total_count = total_class_counts.get(class_name, 0)
    train_count = train_class_counts.get(class_name, 0)
    val_count = val_class_counts.get(class_name, 0)

    train_percentage = (train_count / total_count * 100) if total_count > 0 else 0
    val_percentage = (val_count / total_count * 100) if total_count > 0 else 0

    summary_data.append({
        'Class': class_name,
        'Total Samples': total_count,
        'Train Samples': train_count,
        'Train % of Total': f"{train_percentage:.2f}%",
        'Validation Samples': val_count,
        'Validation % of Total': f"{val_percentage:.2f}%"
    })

# Create DataFrame
df_summary = pd.DataFrame(summary_data)

print("Class Distribution Summary:")
print(df_summary.to_markdown(index=False))
