---
# Import Libraries

In [1]:
import csv
import os
import numpy as np
import cv2
import os
from collections import Counter
from sklearn.model_selection import train_test_split

---
# Split Data

In [2]:
def read_patients(csv_path='INbreast.csv'):
    """Return the distinct values of the second CSV column (patient IDs).

    The rows are sorted before scanning, so the IDs come back in the order
    of their first appearance in the sorted rows.

    Args:
        csv_path: Path of the comma-separated file to read.

    Returns:
        A list of unique patient IDs (strings).
    """
    patients = []
    seen = set()  # O(1) membership test instead of scanning the result list
    with open(csv_path, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='|')
        for row in sorted(reader):
            # Blank or truncated lines carry no patient column; skip them
            # instead of failing with IndexError.
            if len(row) < 2:
                continue
            patient = row[1]
            if patient not in seen:
                seen.add(patient)
                patients.append(patient)
    return patients

def read_label(patients, csv_path='INbreast.csv'):
    """Return one binary label per patient, in the order of *patients*.

    A patient is labelled 1 when at least one of their rows has a last
    column whose first character, read as an integer, is greater than 3;
    otherwise the label is 0 (this also applies to patients with no rows).

    Args:
        patients: Iterable of patient IDs (values of the second CSV column).
        csv_path: Path of the comma-separated file to read.

    Returns:
        A list of 0/1 integers aligned with *patients*.
    """
    patients = list(patients)
    if not patients:
        return []

    wanted = set(patients)
    positive = set()
    # Single pass over the file instead of reopening it once per patient.
    with open(csv_path, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='|')
        for row in reader:
            # Only rows of requested patients are parsed, so unrelated rows
            # with a non-numeric last column cannot raise here.
            if len(row) < 2 or row[1] not in wanted:
                continue
            if int(row[-1][0]) > 3:
                positive.add(row[1])
    return [1 if name in positive else 0 for name in patients]

# Collect the unique patient IDs in sorted order, then derive one binary
# label per patient.
patients = read_patients()
patients.sort()
labels = read_label(patients)

In [3]:
# Placeholders for the three patient-level splits.
x_train, y_train, x_val, y_val, x_test, y_test = ([] for _ in range(6))

# First cut: 70 % of the patients for training, 30 % held out.
# Second cut: the held-out part is halved into validation and test.
# Both cuts are stratified on the label and use a fixed seed.
x_train, x_holdout, y_train, y_holdout = train_test_split(
    patients, labels,
    train_size=0.7, test_size=0.3,
    stratify=labels, shuffle=True, random_state=1,
)
x_val, x_test, y_val, y_test = train_test_split(
    x_holdout, y_holdout,
    train_size=0.5, test_size=0.5,
    stratify=y_holdout, shuffle=True, random_state=1,
)

# Show the class balance of each split.
for split_labels in (y_train, y_val, y_test):
    print(Counter(split_labels))

Counter({0: 41, 1: 34})
Counter({0: 9, 1: 7})
Counter({0: 9, 1: 8})


In [4]:
def read_split(split, csv_path='INbreast.csv'):
    """Collect ``[first column, label]`` pairs for the rows of a split.

    A row is kept when its second column is contained in *split*. Its label
    is 0 when the first character of the last column, read as an integer,
    is below 4, and 1 otherwise.
    """
    with open(csv_path, newline='') as handle:
        records = csv.reader(handle, delimiter=',', quotechar='|')
        return [
            [record[0], 0 if int(record[-1][0]) < 4 else 1]
            for record in records
            if record[1] in split
        ]

# Expand every patient-level split into its per-image [name, label] rows.
train, val, test = (read_split(ids) for ids in (x_train, x_val, x_test))

def to_csv(filename, split):
    """Write every entry of *split* as one comma-separated row of *filename*."""
    with open(filename, 'w', newline='') as out:
        csv.writer(out, delimiter=',').writerows(split)
                
# open(..., 'w') does not create missing parent directories, so make sure
# the output folder exists before writing the three split files.
os.makedirs('split', exist_ok=True)

to_csv('split/train.csv', sorted(train))
to_csv('split/val.csv', sorted(val))
to_csv('split/test.csv', sorted(test))

---
# Apply Otsu Segmentation and CLAHE

In [8]:
def threshold(image):
    """Return the bounding box of the largest Otsu-thresholded region.

    The image is binarised to {0, 1} with Otsu's method, all contours are
    extracted, and the bounding rectangle ``(x, y, width, height)`` of the
    contour with the greatest area is returned.
    """
    _, binary = cv2.threshold(image, 0, 1, cv2.THRESH_OTSU)
    contours, _ = cv2.findContours(binary, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    areas = [cv2.contourArea(contour) for contour in contours]
    largest = contours[np.argmax(areas)]
    return cv2.boundingRect(largest)

# Crop every PNG under inbreast_png/ to the bounding box found by
# threshold(), apply CLAHE, and store the result under
# inbreast_preprocessed/. For images whose class is not '1', the matching
# ROI mask is cropped with the same box and stored under
# inbreast_roi_preprocessed/.
#
# NOTE: getclass() is defined in the "Apply Cropping" section below; that
# cell has to be run before this one.

# The CLAHE operator has fixed parameters, so build it once rather than
# once per image.
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))

for root, dirs, files in os.walk('inbreast_png/', topdown=False):
    for name in files:
        # Skip non-PNG files and notebook checkpoint copies.
        if not name.endswith('.png') or 'checkpoint' in name:
            continue

        pth = os.path.join(root, name)
        root2 = root.replace('inbreast_png', 'inbreast_preprocessed')
        pth2 = os.path.join(root2, name)

        img = cv2.imread(pth)
        if img is None:
            # cv2.imread signals failure by returning None; fail with the
            # offending path instead of an opaque cvtColor error.
            raise OSError('Could not read image: ' + pth)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        tx, ty, tw, th = threshold(img)
        img = img[ty:ty+th, tx:tx+tw]
        img = clahe.apply(img)
        img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
        # cv2.imwrite does not create directories; it just returns False.
        os.makedirs(root2, exist_ok=True)
        cv2.imwrite(pth2, img)

        if getclass(name) != '1':
            root_roi = root.replace('inbreast_png', 'inbreast_roi')
            pth_roi = os.path.join(root_roi, name)
            root2_roi = root.replace('inbreast_png', 'inbreast_roi_preprocessed')
            pth2_roi = os.path.join(root2_roi, name)
            roi = cv2.imread(pth_roi, cv2.IMREAD_GRAYSCALE)
            if roi is None:
                raise OSError('Could not read ROI mask: ' + pth_roi)
            # Same crop box as the image so mask and image stay aligned.
            roi = roi[ty:ty+th, tx:tx+tw]
            os.makedirs(root2_roi, exist_ok=True)
            cv2.imwrite(pth2_roi, roi)

---
# Apply Cropping

In [None]:
def getclass(name, csv_path='INbreast.csv'):
    """Return the last CSV column of the row describing image file *name*.

    A row matches when its first column plus ``'.png'`` equals *name*.

    The file is parsed with ``,`` as delimiter, like every other reader of
    ``INbreast.csv`` in this notebook. The previous ``;`` delimiter put the
    whole line into the first field, so no row could ever match.

    Args:
        name: Image file name including the ``.png`` extension.
        csv_path: Path of the comma-separated file to search.

    Returns:
        The last column of the first matching row as a string, or ``None``
        when no row matches.
    """
    with open(csv_path, newline = '') as csvfile:
        reader = csv.reader(csvfile, delimiter = ',', quotechar = '|')
        for row in reader:
            # Blank lines are parsed as empty lists; skip them.
            if row and row[0] + '.png' == name:
                return row[-1]
    return None

def read_csv(file):
    """Return the first column of the comma-separated *file* as a list."""
    with open(file, newline='') as handle:
        return [record[0] for record in csv.reader(handle, delimiter=',')]

# Reload the image names (first column) of the train and validation splits.
train, val = (read_csv(path) for path in ('split/train.csv', 'split/val.csv'))

In [None]:
def get_roi_from_mask(mask):
    """Return every contour of *mask* together with the largest one by area."""
    contours, _hierarchy = cv2.findContours(mask, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    areas = [cv2.contourArea(contour) for contour in contours]
    largest = contours[np.argmax(areas)]
    return contours, largest

def _window_origins(length, window, step):
    """Return the distinct window start offsets along one image axis."""
    # Never start before the image, even when the image is smaller than the
    # window (a negative offset would slice the wrong region).
    last = max(length - window, 0)
    origins = []
    for start in range(0, length, step):
        start = min(start, last)
        # With a step smaller than the window several positions clamp to
        # the same border offset; keep each offset once so identical crops
        # are not emitted repeatedly.
        if not origins or origins[-1] != start:
            origins.append(start)
    return origins


def crop_sliding_window(image, abnormality_mask = None, patch_size = (300, 300), step_size = 600, malignant = False):
    """Cut 600x600 windows out of *image* and resize them to *patch_size*.

    Windows are visited row by row with stride *step_size*; a window that
    would run past the image border is shifted back so that it ends at the
    border. A window is kept only when its mean intensity is above the mean
    of the whole image. When *malignant* is true, it must additionally pass
    ``check_overlap`` against *abnormality_mask*.

    Args:
        image: Image array; the first two axes are indexed as rows, columns.
        abnormality_mask: Mask handed to ``check_overlap``; only used when
            *malignant* is true.
        patch_size: ``(height, width)`` of the returned patches.
        step_size: Stride between window origins, in pixels.
        malignant: Whether to keep only windows accepted by
            ``check_overlap``.

    Returns:
        A list of resized patches (possibly empty).
    """
    crops = []
    (w_width, w_height) = (600, 600)
    # The global mean does not change between windows; compute it once.
    image_mean = image.mean()
    rows = _window_origins(image.shape[0], w_height, step_size)
    cols = _window_origins(image.shape[1], w_width, step_size)
    for y in rows:
        for x in cols:
            cropped = image[y:y + w_height, x:x + w_width]
            if cropped.mean() <= image_mean:
                continue
            if malignant and not check_overlap(x, y, abnormality_mask):
                continue
            # cv2.resize takes the target size as (width, height).
            crops.append(cv2.resize(cropped, (patch_size[1], patch_size[0]), interpolation=cv2.INTER_LANCZOS4))
    return crops

def crop_roi(image, abnormality_mask, patch_size = (300, 300)):
    """Return a square crop around the largest mask contour, resized to *patch_size*.

    The half side length of the square is the longer edge of the contour's
    bounding box, capped at half of the shorter image dimension and raised
    to at least 150. The centre is the middle of the bounding box, clamped
    so that it lies at least one half side from each image border.
    """
    _, largest = get_roi_from_mask(abnormality_mask)
    box_x, box_y, box_w, box_h = cv2.boundingRect(largest)
    img_h, img_w = image.shape[0], image.shape[1]

    half = min(max(box_w, box_h), img_h // 2, img_w // 2)
    half = max(half, 150)

    mid_x = min(max(half, box_x + box_w // 2), img_w - half)
    mid_y = min(max(half, box_y + box_h // 2), img_h - half)

    window = image[mid_y - half:mid_y + half, mid_x - half:mid_x + half]
    # cv2.resize takes the target size as (width, height).
    target = (patch_size[1], patch_size[0])
    return cv2.resize(window, target, interpolation=cv2.INTER_LANCZOS4)

def check_overlap(x, y, abnormality_mask, patch_size = 600):
    """Tell whether the largest mask contour lies strictly inside a window.

    The window is the square of side *patch_size* whose top-left corner is
    ``(x, y)``. The bounding box of the contour must not touch any window
    edge for the result to be True.
    """
    _, largest = get_roi_from_mask(abnormality_mask)
    left, top, width, height = cv2.boundingRect(largest)
    # Reject as soon as one side of the box reaches or crosses the window.
    if left <= x or left + width >= x + patch_size:
        return False
    if top <= y or top + height >= y + patch_size:
        return False
    return True

# Cut training and validation patches out of every preprocessed image.
#   class '1'        -> sliding-window crops, written as "_normal_<i>"
#   class '2' / '3'  -> one ROI-centred crop plus sliding-window crops,
#                       written as "_benign_center" / "_benign_<i>"
#   any other class  -> one ROI-centred crop plus the stride-100 windows
#                       accepted by check_overlap, written as
#                       "_malignant_center" / "_malignant_<i>"
# Images that belong to neither the train nor the val list are skipped.
#
# os.walk takes its start directory as `top`; it has no `folder_path`
# keyword, so the directory is passed positionally.
for root, dirs, files in os.walk('inbreast_preprocessed/', topdown = False):
    for name in files:
        # Skip non-PNG files and notebook checkpoint copies.
        if not name.endswith('.png') or 'checkpoint' in name:
            continue

        stem = name[:-4]
        if stem in train:
            root_split = root.replace('inbreast_preprocessed', 'crops_train')
        elif stem in val:
            root_split = root.replace('inbreast_preprocessed', 'crops_val')
        else:
            continue

        # getclass() re-reads the CSV on every call, so look the class up once.
        image_class = getclass(name)
        if image_class is None:
            # Without a CSV row there is no class to file the crops under.
            print('No class found for ' + name + ', skipping')
            continue

        pth = os.path.join(root, name)
        root_roi = root.replace('inbreast_preprocessed', 'inbreast_roi_preprocessed')
        pth_roi = os.path.join(root_roi, name)

        full_image = cv2.imread(pth)
        if full_image is None:
            # cv2.imread signals failure by returning None.
            raise OSError('Could not read image: ' + pth)

        # cv2.imwrite does not create directories; it just returns False.
        os.makedirs(root_split, exist_ok=True)

        if image_class == '1':
            label = 'normal'
            crops = crop_sliding_window(full_image)
        else:
            label = 'benign' if image_class in ('2', '3') else 'malignant'

            roi_image = cv2.imread(pth_roi, cv2.IMREAD_GRAYSCALE)
            if roi_image is None:
                raise OSError('Could not read ROI mask: ' + pth_roi)

            crop = crop_roi(full_image, roi_image)
            # os.path.join keeps the separator even when root_split is a
            # sub-directory without a trailing slash.
            cv2.imwrite(os.path.join(root_split, stem + '_' + label + '_center.png'), crop)

            if label == 'malignant':
                crops = crop_sliding_window(full_image, abnormality_mask = roi_image, step_size = 100, malignant = True)
            else:
                crops = crop_sliding_window(full_image)

        for i, patch in enumerate(crops):
            cv2.imwrite(os.path.join(root_split, stem + '_' + label + '_' + str(i) + '.png'), patch)