## Copy folder structure

In [None]:
import os

def copy_folder_structure(src_folder, dest_folder):
    """Mirror the directory tree of src_folder under dest_folder (folders only).

    Sub-directories named 'rep' are pruned from the walk, so neither they nor
    anything beneath them is recreated. Directories that already exist at the
    destination are left untouched; each newly created one is printed.
    """
    for current_dir, subdirs, _files in os.walk(src_folder):
        # Edit the list in place so os.walk does not descend into 'rep'.
        for name in list(subdirs):
            if name == 'rep':
                subdirs.remove(name)

        mirrored = os.path.join(dest_folder, os.path.relpath(current_dir, src_folder))
        if os.path.exists(mirrored):
            continue
        os.makedirs(mirrored)
        print(f"create folder: {mirrored}")


# Placeholder paths: point these at the real source tree and the output location.
src_folder = "/path/to/source_folder"
dest_folder = "/path/to/destination_folder"

# Recreate the source directory layout under dest_folder (folders only, no files).
copy_folder_structure(src_folder, dest_folder)
print("Folder structure copying completed!")


## Convert the file into an image and place it in the corresponding directory

In [None]:
import os
import numpy as np
import pydicom
from PIL import Image

def convert_dicom_to_png(src_file, dest_file):
    """Convert a DICOM file to an 8-bit PNG and save it.

    The pixel data is optionally windowed with the file's WindowCenter /
    WindowWidth (first value when several are stored), then min-max scaled to
    0-255 and written to dest_file.

    Any failure is reported on stdout instead of being raised, so one bad file
    does not abort a batch conversion.

    NOTE(review): windowing is applied directly to ds.pixel_array; no
    rescale slope/intercept is applied first -- confirm this is acceptable for
    the modalities being converted.

    Parameters:
        src_file (str): Path of the DICOM file to read.
        dest_file (str): Path of the PNG file to write.
    """
    try:
        # Read DICOM file
        ds = pydicom.dcmread(src_file)

        # Extract pixel data
        image_data = ds.pixel_array.astype(np.float32)

        # Window settings are optional; a present-but-empty tag counts as absent.
        window_center = getattr(ds, 'WindowCenter', None)
        window_width = getattr(ds, 'WindowWidth', None)

        # If they are multi-valued, take the first value
        if isinstance(window_center, pydicom.multival.MultiValue):
            window_center = window_center[0] if len(window_center) else None
        if isinstance(window_width, pydicom.multival.MultiValue):
            window_width = window_width[0] if len(window_width) else None

        if window_center is not None and window_width is not None:
            window_center = float(window_center)
            window_width = float(window_width)

            # A zero (or negative) width would divide by zero / invert the
            # image, so windowing is skipped and only min-max scaling applies.
            if window_width > 0:
                image_data = np.clip(
                    ((image_data - (window_center - 0.5)) / window_width + 0.5) * 255, 0, 255
                )

        # Convert pixel data to 8-bit image (0-255)
        lowest = np.min(image_data)
        value_range = np.max(image_data) - lowest
        if value_range > 0:
            image_8bit = ((image_data - lowest) / value_range * 255).astype(np.uint8)
        else:
            # Constant image: scaling would be 0/0, so emit an all-black image.
            image_8bit = np.zeros(image_data.shape, dtype=np.uint8)

        # Convert numpy array to PIL image
        img = Image.fromarray(image_8bit)

        # Save as PNG format
        img.save(dest_file)
        print(f"Converted and saved: {dest_file}")

    except Exception as e:
        print(f"Conversion failed {src_file}: {e}")

def convert_bmp_to_png(src_file, dest_file):
    """Convert a BMP file to PNG format and save it.

    Any failure is reported on stdout instead of being raised, so one bad file
    does not abort a batch conversion.

    Parameters:
        src_file (str): Path of the BMP file to read.
        dest_file (str): Path of the PNG file to write.
    """
    try:
        # The context manager closes the underlying file handle even when
        # saving fails, so a long batch does not accumulate open files.
        with Image.open(src_file) as img:
            # Save as PNG format
            img.save(dest_file)
        print(f"Converted and saved: {dest_file}")

    except Exception as e:
        print(f"Conversion failed {src_file}: {e}")

def copy_and_convert_images(src_folder, dest_folder):
    """Mirror src_folder under dest_folder, converting .dcm and .bmp files to PNG.

    Directories named 'rep' are pruned from the walk. Every visited directory
    is recreated at the destination if missing. Files whose name ends with
    '.dcm' or '.bmp' (case-insensitive) are converted and written next to
    their mirrored location with a '.png' extension; other files are ignored.
    """
    for current_dir, subdirs, filenames in os.walk(src_folder):
        # In-place assignment keeps os.walk from descending into 'rep'.
        subdirs[:] = [name for name in subdirs if name != 'rep']

        out_dir = os.path.join(dest_folder, os.path.relpath(current_dir, src_folder))
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
            print(f"Created directory: {out_dir}")

        for filename in filenames:
            lowered = filename.lower()
            if lowered.endswith('.dcm'):
                converter = convert_dicom_to_png
            elif lowered.endswith('.bmp'):
                converter = convert_bmp_to_png
            else:
                continue  # Not an image type handled here

            stem = os.path.splitext(filename)[0]
            converter(os.path.join(current_dir, filename), os.path.join(out_dir, stem + '.png'))

# Placeholder paths: replace with the actual source and destination folders.
src_folder = "/path/to/source_folder"
dest_folder = "/path/to/destination_folder"

# Walk the source tree, mirror its folders and write a PNG for each .dcm/.bmp file.
copy_and_convert_images(src_folder, dest_folder)
print("All DICOM and BMP files have been converted and saved as PNG!")


## How to choose which subfolder to keep when several sibling subfolders hold similar contents

In [None]:
import os
import shutil

def get_target_folder(folders):
    """
    Pick the folder to keep from a list of sibling folder names.

    Candidates are tried in priority order (dicom > bmp > BMP-mark) and
    compared case-insensitively. The name is returned with its original
    casing; None is returned when no candidate is present.
    """
    lowered = [name.lower() for name in folders]

    for wanted in ('dicom', 'bmp', 'bmp-mark'):
        if wanted in lowered:
            return folders[lowered.index(wanted)]

    return None

def delete_folder(folder_path):
    """
    Recursively remove folder_path and report the outcome on stdout.

    Failures (for example a missing path) are printed rather than raised.
    """
    try:
        shutil.rmtree(folder_path)
    except Exception as err:
        outcome = f"Error deleting folder {folder_path}: {err}"
    else:
        outcome = f"Deleted folder: {folder_path}"
    print(outcome)

def move_contents_to_parent(src_folder, parent_folder):
    """
    Move the contents of src_folder into parent_folder, then remove src_folder.

    An item whose name already exists in parent_folder is skipped and left
    where it is. src_folder is deleted only when every item was moved;
    otherwise it is kept so the skipped items are not destroyed along with it.

    Parameters:
        src_folder (str): Folder whose entries are moved out.
        parent_folder (str): Folder that receives the entries.
    """
    skipped_any = False
    for item in os.listdir(src_folder):
        src_path = os.path.join(src_folder, item)
        dest_path = os.path.join(parent_folder, item)
        if os.path.exists(dest_path):
            print(f"Skipping existing file or folder: {dest_path}")
            skipped_any = True
            continue
        shutil.move(src_path, dest_path)

    if skipped_any:
        # Deleting now would silently discard the items that were not moved.
        print(f"Keeping folder with unmoved items: {src_folder}")
        return

    # Delete the now-empty src_folder
    delete_folder(src_folder)

def delete_roi_folders(root_folder):
    """
    Remove every folder named 'ROI' (any casing) below root_folder.

    The tree is walked bottom-up and each match is handed to delete_folder.
    """
    for parent, subdirs, _files in os.walk(root_folder, topdown=False):
        roi_names = [name for name in subdirs if name.lower() == 'roi']
        for name in roi_names:
            delete_folder(os.path.join(parent, name))

def process_folder(root_folder):
    """
    Clean up root_folder in place, flattening the preferred image subfolder
    of each directory. This function deletes data.

    Steps, as implemented:
      1. Every folder named 'ROI' (case-insensitive) is removed through
         delete_roi_folders.
      2. The tree is walked bottom-up. For each directory the preferred child
         is chosen with get_target_folder (dicom > bmp > BMP-mark):
           - if no child matches, every child directory is deleted;
           - if a child matches, it is handed to move_contents_to_parent,
             which moves its contents up into the current directory.

    NOTE(review): when a preferred child exists, its lower-priority siblings
    are NOT deleted by this loop, even though the original summary spoke of
    deleting lower-priority folders. Conversely, when no child matches, all
    child directories are removed -- this includes the direct children of
    root_folder on the final iteration unless one of them is named
    dicom/bmp/BMP-mark. Confirm this is intended before running on real data.
    """
    delete_roi_folders(root_folder)  # First delete 'ROI' folders

    # Bottom-up: a directory is visited only after all of its subdirectories.
    for root, dirs, _ in os.walk(root_folder, topdown=False):
        target_folder_name = get_target_folder(dirs)

        # With no preferred child, every child directory of `root` is deleted.
        for d in dirs:
            if target_folder_name is None:
                folder_path = os.path.join(root, d)
                delete_folder(folder_path)
                continue  # Last statement of the loop body, so this has no effect

        # If a highest-priority folder exists, move its contents to the parent directory
        if target_folder_name:
            target_folder_path = os.path.join(root, target_folder_name)
            move_contents_to_parent(target_folder_path, root)

# Example invocation.
# WARNING: process_folder deletes and moves folders inside root_folder in place;
# run it on a copy of the data first.
root_folder = "/path/to/example_root_folder"  # Replace with your actual folder path
process_folder(root_folder)


At this point, you have built a data folder that mirrors the source directory layout and converted the DICOM files into PNG files placed in the corresponding directories. Note that the folder structure should be adapted to the specific task: this code only builds the most basic structure, and I have also made dedicated adjustments for my own task.

The next step is to select the most representative images, read the corresponding CSV file contents, and pair them together, then format them into a structure that can be used for VLLM training.

## feature

### monai

In [None]:
import torch
import numpy as np
import random
import os
from monai.transforms import (
    Compose, LoadImage, ScaleIntensity, EnsureChannelFirst, Resize, ToTensor
)
from monai.networks.nets import DenseNet121
import torch.nn as nn

# Set random seed
def set_seed(seed=42):
    """Seed Python's, NumPy's and PyTorch's random generators with `seed`.

    When CUDA is available, all CUDA devices are seeded as well.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

# Image preprocessing pipeline (module-level). extract_key_image_indices_monai
# applies it to each PNG file path before stacking the results into one batch.
transform = Compose([
    LoadImage(image_only=True),  # Load only the image data
    EnsureChannelFirst(),        # Ensure image is in (C, H, W) format
    ScaleIntensity(),            # Normalize intensity to [0, 1]
    Resize((224, 224)),          # Resize to model input size
    ToTensor()                   # Convert to PyTorch Tensor
])

def get_png_files(folder_path):
    """
    Collect the paths of all PNG files under folder_path, recursively.

    The extension check is case-insensitive; paths are returned in the order
    os.walk visits them.
    """
    return [
        os.path.join(current_dir, name)
        for current_dir, _subdirs, names in os.walk(folder_path)
        for name in names
        if name.lower().endswith(".png")
    ]

def extract_key_image_indices_monai(image_dir, top_n=10):
    """
    Pick the PNG slices whose DenseNet features lie closest to the mean feature.

    Every PNG found under image_dir (recursively) is run through the
    module-level `transform` pipeline and stacked into a single batch. The
    batch is passed through a newly constructed MONAI DenseNet121 with its
    last child module removed, and the slices are ranked by Euclidean distance
    to the mean feature vector, nearest first.

    NOTE(review): no checkpoint is loaded here, so the features come from the
    network's initial weights (made repeatable by set_seed) -- confirm that
    untrained features are intended.

    Parameters:
        image_dir (str): Image folder path (searched recursively).
        top_n (int): Maximum number of slices to return. Values larger than
            the number of images found are clamped to that number.

    Returns:
        list: Paths (not indices, despite the function name) of the selected
        slices, closest to the feature center first.

    Raises:
        ValueError: If no PNG image is found under image_dir.
    """
    set_seed()  # Set random seed

    # Load all PNG file paths
    image_paths = get_png_files(image_dir)
    if not image_paths:
        # torch.stack would otherwise fail with an opaque empty-list error.
        raise ValueError(f"No PNG images found in the folder {image_dir}.")

    # Process all slices and stack them into one batch tensor
    images = torch.stack([transform(img) for img in image_paths])
    print(f"Processed image tensor shape: {images.shape}")  # (N, 1, 224, 224)

    # Initialize MONAI's DenseNet121 and set it to evaluation mode
    densenet = DenseNet121(
        spatial_dims=2,     # 2D image support
        in_channels=1,      # Single-channel input (grayscale images)
        out_channels=512    # Size of the head's output (head is dropped below)
    )
    # NOTE(review): removing the last child presumably drops the classification
    # head, in which case out_channels=512 does not determine the feature
    # size -- confirm the actual output shape.
    feature_extractor = nn.Sequential(*list(densenet.children())[:-1])
    feature_extractor.eval()

    # Extract feature vectors and flatten everything after the batch dimension
    with torch.no_grad():
        features = feature_extractor(images)
        features = features.view(features.size(0), -1)  # Shape: (N, D)

    # Calculate feature center (average feature vector)
    feature_center = features.mean(dim=0)  # Shape: (D,)

    # Compute Euclidean distance of each slice's feature to the center
    distances = torch.norm(features - feature_center, dim=1)  # Shape: (N,)

    # torch.topk raises when k exceeds the number of elements, so clamp it.
    k = max(0, min(top_n, len(image_paths)))

    # Negating the distances turns "largest k" into "k closest to the center"
    top_indices = torch.topk(-distances, k).indices.tolist()

    # Return paths to the most representative slices
    return [image_paths[idx] for idx in top_indices]

# Example usage: select 8 representative slices from one image folder.
image_dir = '/path/to/your/mri/images'  # Replace with your actual image directory path
best_slices = extract_key_image_indices_monai(image_dir, top_n=8)


### ssim

In [None]:
import cv2
import os
from skimage.metrics import structural_similarity as ssim

def extract_key_images_by_ssim(folder_path, top_n=8):
    """
    Use SSIM between adjacent images to find where the series changes most.

    PNG files directly inside folder_path are sorted by file name and read as
    grayscale. SSIM is computed for each pair of consecutive readable images;
    the first image of each of the top_n pairs with the lowest SSIM (i.e. the
    greatest change) is returned. Files that cannot be read are skipped.

    Assumes all images share the same shape -- the SSIM call is given the
    arrays as loaded, without resizing.

    Parameters:
        folder_path (str): The folder path containing the PNG images.
        top_n (int): The number of images with the greatest change to return (default is 8).

    Returns:
        list: Paths of the selected images, greatest change first. Fewer than
        top_n paths are returned when there are fewer adjacent pairs.
    """
    # Case-insensitive extension check, matching the other helpers in this file
    image_files = sorted(f for f in os.listdir(folder_path) if f.lower().endswith('.png'))

    # Keep paths and pixel data in lock-step so indices stay valid when a file
    # cannot be decoded (cv2.imread returns None in that case).
    paths = []
    images = []
    for name in image_files:
        path = os.path.join(folder_path, name)
        image = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        if image is None:
            continue
        paths.append(path)
        images.append(image)

    # Compute SSIM between adjacent images; only the scalar score is needed
    ssim_scores = [
        (i, ssim(current, following))
        for i, (current, following) in enumerate(zip(images, images[1:]))
    ]

    # Lower SSIM means more difference, so ascending order puts big changes first
    ssim_scores.sort(key=lambda x: x[1])

    # Return the file paths of the top N most changed images
    return [paths[i] for i, _ in ssim_scores[:top_n]]

# Example usage: pick the 8 images that differ most from their successor.
folder_path = '/path/to/your/ssim/images'  # Replace with your actual image directory path
top_images = extract_key_images_by_ssim(folder_path, top_n=8)

# Bare expression: shows the list when it is the last line of a notebook cell
top_images


### PCA

In [None]:
import cv2
import numpy as np
import os
from sklearn.decomposition import PCA

def extract_key_image_indices_PCA(folder_path, n_components=8, target_size=(336, 336)):
    """
    Return, for each principal component, the image that scores highest on it.

    PNG files directly inside folder_path are sorted by file name, read as
    grayscale, resized to target_size and flattened. PCA is fitted on these
    vectors and, for each component, the image with the largest projection is
    selected. Files that cannot be read are skipped.

    Parameters:
        folder_path (str): The folder path containing PNG images.
        n_components (int): The number of principal components (default is 8).
            Presumably must not exceed the number of readable images, since
            it is passed to PCA unchanged -- verify against scikit-learn.
        target_size (tuple): The target image size (default is (336, 336)).

    Returns:
        list: Paths (not indices, despite the function name) of the
        highest-scoring image per component; the same image may appear more
        than once.

    Raises:
        ValueError: If no readable PNG image is found in folder_path.
    """
    # Case-insensitive extension check, matching the other helpers in this file
    image_files = sorted(f for f in os.listdir(folder_path) if f.lower().endswith('.png'))

    # Keep paths and pixel vectors in lock-step: when an unreadable file is
    # skipped, row i of the data matrix must still map to the right path.
    loaded_paths = []
    images = []
    for name in image_files:
        path = os.path.join(folder_path, name)
        image = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        if image is None:
            continue
        resized_image = cv2.resize(image, target_size)  # Resize the image
        images.append(resized_image.flatten())  # Flatten and append to list
        loaded_paths.append(path)

    # Check if we successfully loaded any images
    if not images:
        raise ValueError(f"No valid PNG images found in the folder {folder_path}.")

    # Convert list of flattened images into a numpy array (one row per image)
    data = np.array(images)

    # Perform PCA (Principal Component Analysis) for dimensionality reduction
    pca = PCA(n_components=n_components)
    pca.fit(data)

    # Transform the data into the principal components' space
    projected_data = pca.transform(data)

    # Row index of the maximum projection for each component (column)
    important_indices = np.argmax(projected_data, axis=0)

    # Return the file paths of the most important images for each component
    return [loaded_paths[i] for i in important_indices]

# Example usage: one image per principal component, 5 components.
folder_path = '/path/to/your/images'  # Replace with your actual folder path
top_images = extract_key_image_indices_PCA(folder_path, n_components=5)

# Print the file paths of the most important images
print(top_images)


## construct data

In [None]:
import os
import re

def find_folder_structures(root_dir):
    """Return the paths of the immediate sub-directories of root_dir, sorted by name.

    In the author's dataset each of these folders represents one patient (the
    folder names are numeric), and its subfolders are sequences such as DWI
    and ADC holding tens to hundreds of images, from which the most
    representative ones are selected later.

    Plain files in root_dir are ignored so they are not mistaken for patient
    folders. NOTE(review): the names are not checked for being numeric.

    Parameters:
        root_dir (str): Directory whose direct children are listed.

    Returns:
        list: Full paths of the sub-directories, in sorted name order.
    """
    single_folders = []

    # Sorting makes the result independent of the file system's listing order
    for folder in sorted(os.listdir(root_dir)):
        folder_path = os.path.join(root_dir, folder)
        if os.path.isdir(folder_path):
            single_folders.append(folder_path)
    return single_folders

# Specify your root directory
root_directory = "/path/to/your/data"  # Replace with your actual root directory

# Collect the entries found directly under the root directory
single = find_folder_structures(root_directory)

print("\nFolder paths:")
for path in single:
    print(path)

# Build one record per collected path; later cells add more keys to each record.
all_path = single
all_my_data = []
for item in all_path:
    message = {
            "hospital": "Cervical Cancer - xx Hospital",  # Example hospital name
            "patient": item
    }
    all_my_data.append(message)

# Optionally print the collected data
for data in all_my_data:
    print(data)

In [None]:
import os

def find_all_png_folders(root_dir):
    """Return every directory under root_dir (inclusive) that directly holds a PNG file.

    The extension check is case-insensitive; directories are listed in the
    order os.walk visits them.
    """
    return [
        dirpath
        for dirpath, _dirnames, filenames in os.walk(root_dir)
        if any(name.lower().endswith('.png') for name in filenames)
    ]


# For every record in all_my_data, find the folders under the patient path that
# contain PNG files and store them under the "all_images_path" key.
for item in all_my_data:
    root_directory = item["patient"]
    png_paths = find_all_png_folders(root_directory)  # Find all PNG folders for the current patient
    item["all_images_path"] = png_paths  # Store the paths of all PNG folders for the patient

# Optionally, print out the results to verify
for item in all_my_data:
    print(f"Patient: {item['patient']}")
    print(f"PNG Folders: {item['all_images_path']}")


In [None]:
"""
    {'hospital': 'xx',
    'patient': 'xx',
    'all_images_path': ['xx/T2S',
    'xx/T2A',
    'xx/ADC',
    'xx/T1CA',
    'xx/T1CS'],
    'best_image': {'T2S': ['xx/ST0/T2S/IM13.png',
    'xx/ST0/T2S/IM0.png',
    'xx/ST0/T2S/IM4.png',
    'xx/ST0/T2S/IM5.png',
    'xx/ST0/T2S/IM3.png',
    'xx/ST0/T2S/IM6.png',
    'xx/ST0/T2S/IM0.png',
    'xx/ST0/T2S/IM8.png'],
    'T2A': ['xx/ST0/T2A/IM3.png',
    'xx/ST0/T2A/IM14.png',
    'xx/ST0/T2A/IM3.png',
    'xx/ST0/T2A/IM2.png',
    'xx/ST0/T2A/IM12.png',
    'xx/ST0/T2A/IM0.png',
    'xx/ST0/T2A/IM4.png',
    'xx/ST0/T2A/IM8.png'],
    'ADC': ['xx/ST0/ADC/IM18.png',
    'xx/ST0/ADC/IM4.png',
    ...
    'xx/ST0/T1CS/IM0.png',
    'xx/ST0/T1CS/IM14.png',
    'xx/ST0/T1CS/IM4.png',
    'xx/ST0/T1CS/IM0.png',
    'xx/ST0/T1CS/IM3.png']}}
"""
