In [45]:
import matplotlib.pyplot as plt
import cv2
import pandas as pd
import numpy as np
import os
import random
from PIL import Image

# Download Dataset

In [5]:
import kagglehub

# Kaggle handle of the breast-ultrasound dataset to fetch
DATASET_HANDLE = "aryashah2k/breast-ultrasound-images-dataset"

# Download the latest version and keep the local directory it was stored in
path = kagglehub.dataset_download(DATASET_HANDLE)
print("Path to dataset files:", path)

Resuming download from 189792256 bytes (14629214 bytes left)...
Resuming download from https://www.kaggle.com/api/v1/datasets/download/aryashah2k/breast-ultrasound-images-dataset?dataset_version_number=1 (189792256/204421470) bytes left.


100%|██████████| 195M/195M [00:19<00:00, 761kB/s]

Extracting files...





Path to dataset files: /home/azhar/.cache/kagglehub/datasets/aryashah2k/breast-ultrasound-images-dataset/versions/1


# Dataset

In [19]:
# Dataset path
# The kagglehub download above landed in ~/.cache/kagglehub/...; resolve "~"
# at runtime instead of hard-coding one specific user's home directory, so the
# notebook also runs on other machines/accounts.
path = os.path.join(
    os.path.expanduser('~'),
    '.cache', 'kagglehub', 'datasets',
    'aryashah2k', 'breast-ultrasound-images-dataset', 'versions', '1',
)

In [20]:
# Show the top-level entries of the downloaded dataset directory
os.listdir(path)

['Dataset_BUSI_with_GT']

In [21]:
# Folder that holds one sub-directory per class (trailing slash kept on purpose:
# later cells append names to it by plain string concatenation)
dataset_dir = f"{path}/Dataset_BUSI_with_GT/"

In [22]:
# Show the class sub-directories found inside the dataset folder
os.listdir(dataset_dir)

['benign', 'normal', 'malignant']

# Prepare Data

Preparing the dataset

Each sample is stored as three parallel entries:

1. Img => the image data (np.array)
2. Mask => the segmentation mask data (np.array)
3. Label => the target class

In [23]:
# Global containers: three parallel lists, one entry per sample
# (image array, merged mask array, class label)
data_dict = {field: [] for field in ('img', 'mask', 'label')}

# Constant: side length (in pixels) every image and mask is resized to
SIZE = 128

In [24]:
# Class directories: one sub-folder of dataset_dir per class
# (each keeps a trailing slash so file names can be appended directly)
benign_dir = f"{dataset_dir}benign/"
malignant_dir = f"{dataset_dir}malignant/"
normal_dir = f"{dataset_dir}normal/"

## Benign

In [25]:
# 1. Store image names
# Get all the file names in the benign folder (images and masks together)
benign_data_list = os.listdir(benign_dir)

# Cutting each file name after its first ")" maps an image and its mask file(s)
# to the same base name (e.g. "benign (1).png" and "benign (1)_mask.png" both
# become "benign (1)"); the set removes those duplicates.
# sorted() makes the sample order deterministic -- a bare set (and os.listdir)
# can yield a different order on every run.
datas = sorted({file_name.split(")")[0] + ')' for file_name in benign_data_list})

In [26]:
# Report how many distinct base names were collected for the benign class
print('Total Benign Data:', len(datas))

Total Benign Data: 437


In [27]:
# 2. Create Data
# For every benign sample: load the image, merge all of its mask files into a
# single mask, and append (img, mask, label) to data_dict.
# NOTE: this cell appends to data_dict, so re-running it without resetting
# data_dict duplicates the benign samples.
for sample_name in datas:
    # Image: convert to a single channel if needed, then resize to SIZE x SIZE
    img = plt.imread(benign_dir + sample_name + '.png')
    if img.ndim > 2:
        img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    img = cv2.resize(img, (SIZE, SIZE))

    # Mask: every file whose name starts with "<sample>_mask" belongs to this
    # sample (there may be more than one)
    name_mask_files = [file for file in benign_data_list if file.startswith(sample_name + '_mask')]
    # Merge Mask: start from an empty mask and keep the pixel-wise maximum
    mask = np.zeros(shape=(SIZE, SIZE))
    for mask_file in name_mask_files:
        mask_img = plt.imread(benign_dir + mask_file)
        if mask_img.ndim > 2:
            mask_img = cv2.cvtColor(mask_img, cv2.COLOR_RGB2GRAY)
        # Nearest-neighbour resizing only copies existing pixel values; the
        # default (bilinear) would blend foreground and background along the
        # mask boundary and produce in-between values.
        mask_img = cv2.resize(mask_img, (SIZE, SIZE), interpolation=cv2.INTER_NEAREST)
        mask = np.maximum(mask, mask_img)

    # Label
    label = 'benign'

    # Append Data
    data_dict['img'].append(img)
    data_dict['mask'].append(mask)
    data_dict['label'].append(label)

## Malignant

In [29]:
# 1. Store image names
# Get all the file names in the malignant folder (images and masks together)
malignant_data_list = os.listdir(malignant_dir)

# Cutting each file name after its first ")" maps an image and its mask file(s)
# to the same base name (e.g. "malignant (1).png" and "malignant (1)_mask.png"
# both become "malignant (1)"); the set removes those duplicates.
# sorted() makes the sample order deterministic -- a bare set (and os.listdir)
# can yield a different order on every run.
datas = sorted({file_name.split(")")[0] + ')' for file_name in malignant_data_list})

In [30]:
# Report how many distinct base names were collected for the malignant class
print('Total Malignant Data:', len(datas))

Total Malignant Data: 210


In [31]:
# 2. Create Data
# For every malignant sample: load the image, merge all of its mask files into
# a single mask, and append (img, mask, label) to data_dict.
# NOTE: this cell appends to data_dict, so re-running it without resetting
# data_dict duplicates the malignant samples.
for sample_name in datas:
    # Image: convert to a single channel if needed, then resize to SIZE x SIZE
    img = plt.imread(malignant_dir + sample_name + '.png')
    if img.ndim > 2:
        img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    img = cv2.resize(img, (SIZE, SIZE))

    # Mask: every file whose name starts with "<sample>_mask" belongs to this
    # sample (there may be more than one)
    name_mask_files = [file for file in malignant_data_list if file.startswith(sample_name + '_mask')]
    # Merge Mask: start from an empty mask and keep the pixel-wise maximum
    mask = np.zeros(shape=(SIZE, SIZE))
    for mask_file in name_mask_files:
        mask_img = plt.imread(malignant_dir + mask_file)
        if mask_img.ndim > 2:
            mask_img = cv2.cvtColor(mask_img, cv2.COLOR_RGB2GRAY)
        # Nearest-neighbour resizing only copies existing pixel values; the
        # default (bilinear) would blend foreground and background along the
        # mask boundary and produce in-between values.
        mask_img = cv2.resize(mask_img, (SIZE, SIZE), interpolation=cv2.INTER_NEAREST)
        mask = np.maximum(mask, mask_img)

    # Label
    label = 'malignant'

    # Append Data
    data_dict['img'].append(img)
    data_dict['mask'].append(mask)
    data_dict['label'].append(label)

## Normal

In [32]:
# 1. Store image names
# Get all the file names in the normal folder (images and masks together)
normal_data_list = os.listdir(normal_dir)

# Cutting each file name after its first ")" maps an image and its mask file(s)
# to the same base name (e.g. "normal (1).png" and "normal (1)_mask.png" both
# become "normal (1)"); the set removes those duplicates.
# sorted() makes the sample order deterministic -- a bare set (and os.listdir)
# can yield a different order on every run.
datas = sorted({file_name.split(")")[0] + ')' for file_name in normal_data_list})

In [33]:
# Report how many distinct base names were collected for the normal class
print('Total Normal Data:', len(datas))

Total Normal Data: 133


In [34]:
# 2. Create Data
# For every normal sample: load the image, merge all of its mask files into a
# single mask, and append (img, mask, label) to data_dict.
# NOTE: this cell appends to data_dict, so re-running it without resetting
# data_dict duplicates the normal samples.
for sample_name in datas:
    # Image: convert to a single channel if needed, then resize to SIZE x SIZE
    img = plt.imread(normal_dir + sample_name + '.png')
    if img.ndim > 2:
        img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    img = cv2.resize(img, (SIZE, SIZE))

    # Mask: every file whose name starts with "<sample>_mask" belongs to this
    # sample (there may be more than one)
    name_mask_files = [file for file in normal_data_list if file.startswith(sample_name + '_mask')]
    # Merge Mask: start from an empty mask and keep the pixel-wise maximum
    mask = np.zeros(shape=(SIZE, SIZE))
    for mask_file in name_mask_files:
        mask_img = plt.imread(normal_dir + mask_file)
        if mask_img.ndim > 2:
            mask_img = cv2.cvtColor(mask_img, cv2.COLOR_RGB2GRAY)
        # Nearest-neighbour resizing only copies existing pixel values; the
        # default (bilinear) would blend foreground and background along the
        # mask boundary and produce in-between values.
        mask_img = cv2.resize(mask_img, (SIZE, SIZE), interpolation=cv2.INTER_NEAREST)
        mask = np.maximum(mask, mask_img)

    # Label
    label = 'normal'

    # Append Data
    data_dict['img'].append(img)
    data_dict['mask'].append(mask)
    data_dict['label'].append(label)

In [35]:
# Sanity check: the three parallel lists in data_dict should have equal length
print('Total Image:', len(data_dict['img']))
print('Total Mask', len(data_dict['mask']))
print('Total Label', len(data_dict['label']))

Total Image: 780
Total Mask 780
Total Label 780


# Split Data

In [43]:
# One (initially empty) list of samples per class label
class_dict = {class_label: [] for class_label in ('benign', 'malignant', 'normal')}

In [44]:
# Group the samples by class: walk the three parallel lists together and file
# each (img, mask, label) tuple under its label
samples = zip(data_dict['img'], data_dict['mask'], data_dict['label'])
for img, mask, label in samples:
    sample = (img, mask, label)
    class_dict[label].append(sample)

In [46]:
# Shuffle the images in each class
# A dedicated, seeded generator makes the shuffle (and therefore the
# train/val/test split) repeatable across kernel restarts, provided the samples
# were loaded in the same order; it also leaves the global `random` state alone.
SPLIT_SEED = 42
split_rng = random.Random(SPLIT_SEED)
for class_name in class_dict:
    split_rng.shuffle(class_dict[class_name])

In [47]:
# Empty accumulators for the three splits, grouped by what they hold
x_train, x_val, x_test = [], [], []  # x -> img
y_train, y_val, y_test = [], [], []  # y -> mask
z_train, z_val, z_test = [], [], []  # z -> label

In [51]:
# train/val/test - 80/10/10
# The cut points are computed per class, so each class contributes its own
# 80/10/10 share to the three splits.
for class_name, class_samples in class_dict.items():
    total = len(class_samples)
    train_end = int(total * 0.8)
    val_end = int(total * 0.9)

    # Consecutive slices of the class list paired with their destination lists
    partitions = (
        (class_samples[:train_end], x_train, y_train, z_train),
        (class_samples[train_end:val_end], x_val, y_val, z_val),
        (class_samples[val_end:], x_test, y_test, z_test),
    )
    for part, images_out, masks_out, labels_out in partitions:
        for img, mask, label in part:
            images_out.append(img)
            masks_out.append(mask)
            labels_out.append(label)

In [52]:
# Turn every split into a NumPy array (x -> img, y -> mask, z -> label)
x_train, x_val, x_test = np.array(x_train), np.array(x_val), np.array(x_test)
y_train, y_val, y_test = np.array(y_train), np.array(y_val), np.array(y_test)
z_train, z_val, z_test = np.array(z_train), np.array(z_val), np.array(z_test)

In [53]:
# Create the output folder ("data" + SIZE, e.g. data128) if it is missing.
# exist_ok=True replaces the separate exists() check, so there is no gap
# between checking and creating.
os.makedirs("data" + str(SIZE), exist_ok=True)

In [54]:
# Save the data to disk: one .npy file per array, named after the variable
arrays_to_save = {
    "x_train": x_train,
    "x_val": x_val,
    "x_test": x_test,
    "y_train": y_train,
    "y_val": y_val,
    "y_test": y_test,
    "z_train": z_train,
    "z_val": z_val,
    "z_test": z_test,
}
for array_name, array in arrays_to_save.items():
    np.save(f"data{SIZE}/{array_name}.npy", array)