In [1]:
import os
import shutil
import pandas as pd
import numpy as np
from PIL import Image
import pydicom
from rich.progress import Progress

### **COPY ONLY cancer to NEW FOLDER**

* First I did this for the cancerous images; now I am repeating the same steps for the normal images.
* Finally, once this is done, maybe we can balance the dataset by removing bad images.

In [49]:
# Load the metadata table; later cells read its 'cancer' and 'image_path' columns.
df = pd.read_csv('RSNA_breast_cancer_data.csv')
# Preview the first rows.
df.head()

Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,implant,machine_id,difficult_negative_case,image_path
0,1,1014,629904228,L,MLO,76.0,0,0,0,0,49,False,IMG\629904228.dcm
1,1,1014,669597068,L,CC,76.0,0,0,0,0,49,False,IMG\669597068.dcm
2,1,1014,229558076,R,MLO,76.0,0,0,0,0,49,False,IMG\229558076.dcm
3,1,1014,1173679750,R,CC,76.0,0,0,0,0,49,False,IMG\1173679750.dcm
4,1,10208,638273415,L,MLO,56.0,0,0,0,0,49,True,IMG\638273415.dcm


In [54]:
def copy_images(df, src_folder, dest_folder, label):
    """Copy the image files whose ``cancer`` value equals ``label``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``cancer`` column and an ``image_path`` column.
    src_folder : str
        Folder that holds the source image files.
    dest_folder : str
        Folder to copy into; it is created if it does not exist.
    label : int
        Value of the ``cancer`` column to select.

    Only the file name of each ``image_path`` is used; it is looked up
    inside ``src_folder``. Rows whose file is not present are skipped, and
    the printed count covers only the files that were actually copied.
    """
    # Select the paths of the rows that carry the requested label
    selected_paths = df.loc[df['cancer'] == label, 'image_path']

    # shutil.copy does not create missing directories
    os.makedirs(dest_folder, exist_ok=True)

    copied = 0
    for image_path in selected_paths:
        # The table stores Windows-style paths ('IMG\\123.dcm'); normalise the
        # separator so the file name is extracted correctly on every OS.
        file_name = os.path.basename(str(image_path).replace('\\', '/'))
        src_path = os.path.join(src_folder, file_name)

        # Skip rows whose source file is missing instead of failing the batch
        if os.path.isfile(src_path):
            shutil.copy(src_path, os.path.join(dest_folder, file_name))
            copied += 1

    print(f"Copied {copied} images to '{dest_folder}'.")
    missing = len(selected_paths) - copied
    if missing:
        print(f"Skipped {missing} images that were not found in '{src_folder}'.")

In [None]:
src_folder = 'IMG'  # Folder holding the source DICOM files
dest_folder = 'CANCER_IMG'  # Folder that receives the copies of the label-1 images

# Copy every image whose 'cancer' column equals 1
copy_images(df, src_folder, dest_folder, label=1)

Copied 1156 cancerous images to 'CANCER_IMG'.
Completed copying cancerous images.


In [55]:
# Normal images: same copy step as above, this time for rows with cancer == 0
src_folder = 'IMG'  # Folder holding the source DICOM files
dest_folder = 'NORMAL_IMG'  # Folder that receives the copies of the normal images
label = 0  # Value of the 'cancer' column to select

copy_images(df, src_folder, dest_folder, label)

Copied 1218 images to 'NORMAL_IMG'.


### **Convert DCM to TIFF**

In [57]:
# Function to load a DICOM image as an 8-bit grayscale PIL image
def load_dicom_image(path):
    """Read a DICOM file and return its pixels as an 8-bit grayscale image.

    The pixel array is min-max scaled to 0-255, truncated to ``uint8`` and
    wrapped in a PIL image of mode ``'L'``.

    Parameters
    ----------
    path : str
        Location of the DICOM file, passed to ``pydicom.dcmread``.

    Returns
    -------
    PIL.Image.Image
        Grayscale image built from the scaled pixel array.

    A constant-valued image has no dynamic range; it is returned as all
    zeros instead of dividing by zero.
    """
    dicom_image = pydicom.dcmread(path)

    # Work on a float copy so the scaling below is not done in integer math
    img_array = dicom_image.pixel_array.astype(np.float32)

    img_array -= np.min(img_array)  # Shift the minimum value to 0
    peak = np.max(img_array)
    if peak > 0:
        # Only scale when there is a range; 0/0 would fill the array with NaN
        img_array /= peak  # Scale to a range of 0 to 1
    img_array *= 255.0  # Scale to a range of 0 to 255
    img_array = img_array.astype(np.uint8)  # Truncate to unsigned 8-bit

    # NOTE(review): the file's photometric interpretation is not consulted
    # here - confirm whether some inputs need inverting.
    return Image.fromarray(img_array, mode='L')  # 'L' mode is for grayscale images

In [58]:
# Function to process all DICOM files in the folder
def process_dicom_folder(input_folder, output_folder):
    """Convert every ``.dcm`` file in ``input_folder`` to a TIFF file.

    Parameters
    ----------
    input_folder : str
        Folder that is scanned (non-recursively) for files ending in
        ``.dcm`` (case-insensitive).
    output_folder : str
        Folder that receives ``<name>.tiff`` for each converted file; it is
        created if it does not exist.

    Each file is loaded with ``load_dicom_image`` and saved with the TIFF
    format. The printed count is the number of files converted by this
    call, not the number of files present in ``output_folder``.
    """
    os.makedirs(output_folder, exist_ok=True)

    converted = 0
    # Sorted so the processing order does not depend on the file system
    for filename in sorted(os.listdir(input_folder)):
        # Ignore anything that is not a DICOM file
        if not filename.lower().endswith('.dcm'):
            continue

        input_path = os.path.join(input_folder, filename)
        output_path = os.path.join(output_folder, f"{os.path.splitext(filename)[0]}.tiff")

        # Load and convert the DICOM file
        img = load_dicom_image(input_path)
        img.save(output_path, format='TIFF')
        converted += 1

    print(f"Converted {converted} images into TIFF to '{output_folder}'.")

In [60]:
input_folder = 'NORMAL_IMG'  # Folder that holds the DICOM files to convert
output_folder = 'NORMAL_IMG_TIFF'  # Folder where the converted TIFF images will be saved

# Convert every .dcm file found in input_folder
process_dicom_folder(input_folder, output_folder)

Converted 1218 images into TIFF to 'NORMAL_IMG_TIFF'.


### **The code below inverts the pixel intensities**

In [62]:
# Function to load a DICOM image as an inverted 8-bit grayscale PIL image
def load_dicom_image(path):
    """Read a DICOM file and return its pixels as an inverted 8-bit image.

    The pixel array is min-max scaled to 0-255, truncated to ``uint8`` and
    then always inverted (``255 - value``) before being wrapped in a PIL
    image of mode ``'L'``.

    Parameters
    ----------
    path : str
        Location of the DICOM file, passed to ``pydicom.dcmread``.

    Returns
    -------
    PIL.Image.Image
        Inverted grayscale image.

    A constant-valued image has no dynamic range; it is scaled to all zeros
    (so all 255 after inversion) instead of dividing by zero.
    """
    dicom_image = pydicom.dcmread(path)

    # Work on a float copy so the scaling below is not done in integer math
    img_array = dicom_image.pixel_array.astype(np.float32)

    img_array -= np.min(img_array)  # Shift the minimum value to 0
    peak = np.max(img_array)
    if peak > 0:
        # Only scale when there is a range; 0/0 would fill the array with NaN
        img_array /= peak  # Scale to a range of 0 to 1
    img_array *= 255.0  # Scale to a range of 0 to 255
    img_array = img_array.astype(np.uint8)
    img_array = 255 - img_array  # Invert the image

    # Create a grayscale image from the array
    return Image.fromarray(img_array, mode='L')  # 'L' mode is for grayscale images

# Try the inverting loader on a single file from the source folder
img = load_dicom_image('IMG/292544568.dcm')

# Save the result as a TIFF in the working directory for visual inspection
img.save('smpl2.tiff', format='TIFF')

In [63]:
# Function to load and invert the DICOM image
# (this redefinition shadows the earlier `load_dicom_image` definitions in this notebook)
def load_dicom_image(path):
    """Load a DICOM file as an inverted 8-bit grayscale PIL image.

    Steps: read the file with ``pydicom.dcmread``, min-max scale the pixel
    array to 0-255, truncate to ``uint8``, invert (``255 - value``) and wrap
    the result in a mode ``'L'`` image.

    Parameters
    ----------
    path : str
        Location of the DICOM file.

    Returns
    -------
    PIL.Image.Image
        Inverted grayscale image.

    An image whose pixels all share one value is scaled to zeros (255 after
    inversion) rather than being divided by a zero range.
    """
    pixels = pydicom.dcmread(path).pixel_array

    # Float copy: keeps the source array untouched and avoids integer division
    scaled = pixels.astype(np.float32)
    scaled -= np.min(scaled)  # Shift the minimum value to 0

    value_range = np.max(scaled)
    if value_range > 0:
        # Guard against 0/0, which would turn every pixel into NaN
        scaled /= value_range  # Scale to a range of 0 to 1
    scaled *= 255.0  # Scale to a range of 0 to 255

    inverted = 255 - scaled.astype(np.uint8)  # Truncate to 8 bits, then invert

    # 'L' mode is for grayscale images
    return Image.fromarray(inverted, mode='L')

In [64]:
# Function to convert the DICOM files that have no TIFF counterpart yet
def process_cancer_dicom_images(input_folder, tiff_folder, inverted_folder):
    """Save a TIFF for every DICOM file that is absent from ``tiff_folder``.

    Parameters
    ----------
    input_folder : str
        Folder scanned for files ending in ``.dcm`` (case-insensitive).
    tiff_folder : str
        Folder checked for an existing ``<name>.tiff``; files that already
        have one there are skipped.
    inverted_folder : str
        Folder that receives the new ``<name>.tiff`` files; created when it
        does not exist.

    Each remaining file is loaded through ``load_dicom_image`` and the
    returned image is saved in TIFF format.
    """
    # Make sure the destination folder is there before anything is written
    if not os.path.exists(inverted_folder):
        os.makedirs(inverted_folder)

    for entry in os.listdir(input_folder):
        # Only DICOM files are of interest
        if not entry.lower().endswith('.dcm'):
            continue

        stem, _extension = os.path.splitext(entry)
        tiff_name = stem + ".tiff"

        # A TIFF with this name already exists in tiff_folder: nothing to do
        if os.path.exists(os.path.join(tiff_folder, tiff_name)):
            continue

        # Load through the notebook's current loader and store the result
        converted = load_dicom_image(os.path.join(input_folder, entry))
        converted.save(os.path.join(inverted_folder, tiff_name), format='TIFF')

    print(f"Inverted and saved new images in '{inverted_folder}'.")

In [65]:
input_folder = 'NORMAL_IMG'  # Folder that holds the normal DICOM images
tiff_folder = 'NORMAL_IMG_TIFF'  # Folder where already processed TIFF images are located
inverted_folder = 'NORMAL_IMG_INVERTED'  # Folder where new inverted images will be saved

# Process and invert images that are not already in the TIFF folder
process_cancer_dicom_images(input_folder, tiff_folder, inverted_folder)

Inverted and saved new images in 'NORMAL_IMG_INVERTED'.
