In [None]:
import os
import random
import shutil

import cv2
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import tiatoolbox
from sklearn.model_selection import GroupShuffleSplit
from tiatoolbox import data
from tiatoolbox.tools import stainnorm

import augmentations

In [None]:
# Render inline figures and saved figures at 300 dpi.
plt.rcParams.update({'figure.dpi': 300, 'savefig.dpi': 300})

In [None]:
# Location of the fold-assignment table; 'mag' is read with the pandas "string" dtype.
FOLDS_CSV = '/home/meredithc/Transfer-Learning-for-Cancer-Detection/BreakHis/Folds.csv'
fold_df = pd.read_csv(FOLDS_CSV, dtype={"mag": "string"})

In [None]:
# Inspect the column dtypes of the loaded table ('mag' was requested as "string" when reading the CSV).
fold_df.dtypes

In [None]:
# Split every relative file path once and pick the components of interest:
# component 3 -> class, component 5 -> sub-class, last component -> file name
# (stored as 'patient_id' here).
path_parts = fold_df['filename'].map(lambda path: path.split("/"))
fold_df['class'] = path_parts.map(lambda parts: parts[3])
fold_df['sub_class'] = path_parts.map(lambda parts: parts[5])
fold_df['patient_id'] = path_parts.map(lambda parts: parts[-1])


In [None]:
# Keep only the rows assigned to fold 1.
# reset_index returns a new, independent frame (so later column assignments do
# not write into a slice of the full table) and renumbers the rows 0..n-1,
# which the later cells rely on when they look rows up by small integer labels.
fold_df = fold_df[fold_df["fold"] == 1].reset_index(drop=True)

In [None]:
# Run ONCE to copy files into single directory with new image name

# destination_folder = "Input/"
# for row in range(len(fold_df)):
#     source_file = "BreakHis/BreaKHis_v1/" + fold_df.loc[row,"filename"]
#     destination_path = os.path.join(destination_folder,fold_df.loc[row,"mag"]+"_"+
#                                                        fold_df.loc[row,"class"]+"_"+
#                                                        fold_df.loc[row,"sub_class"]+"_"+
#                                                        fold_df.loc[row,"patient_id"])
#     shutil.copy(source_file, destination_path)

In [None]:
# Flat file name used inside Input/: "<mag>_<class>_<sub_class>_<file name>".
cols = ['mag', 'class', 'sub_class','patient_id']
fold_df['input_path'] = fold_df[cols].astype(str).agg('_'.join, axis=1)

In [None]:
# Show the first generated file name. Positional access is used so the lookup
# does not depend on a row labelled 0 having survived the fold filter.
fold_df['input_path'].iloc[0]

In [None]:
# The column filled from the last path component actually holds the image file
# name, so rename it and derive the patient ID from it.
fold_df = fold_df.rename(columns={"patient_id": "image_name"})

# The patient ID is the third "-"-separated token of the image file name.
# (An earlier variant kept only the numeric characters of that token.)
fold_df['patient_id'] = fold_df['image_name'].map(lambda name: name.split("-")[2])

In [None]:
# Binary target: 0 for 'benign', 1 for any other class value.
fold_df['encoded_class'] = (fold_df['class'] != 'benign').astype('int64')

In [None]:
# Class balance of the fold. Each row of fold_df is one image, so the bars
# count images (not patients) per class.
sns.set(font_scale=1.5)  # applied before the figure is created so every run looks the same
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(fold_df['class'], ax=ax)
ax.set_xlabel("Class")
ax.set_title("Number of Images: Benign vs. Malignant");

malignant = 1 (positive class), benign = 0 (negative class). The classes are heavily imbalanced. There are some balancing techniques we could employ (downsampling, upsampling, synthetic data augmentation) to enrich the model's predictive power on the benign class, but there would be a cost because we intend to maximize the recall metric. In the domain of cancer classification, we should greatly penalize false negatives, i.e. cases where the system did not detect cancer when there really was cancer present. Something to consider is balancing only the train set while leaving the validation set imbalanced, to try to maximize recall and match real-world scenarios.

In [None]:
# Per-class views of the fold, selected on the 'class' column.
is_benign = fold_df['class'].eq('benign')
is_malignant = fold_df['class'].eq('malignant')
benign_df = fold_df[is_benign]
malignant_df = fold_df[is_malignant]

In [None]:
# Preview up to 40 benign images in a 4 x 10 grid.
# - Rows are taken by position (.iloc), so the preview does not depend on the
#   index labels 0..39 all belonging to benign rows.
# - cv2.imread returns BGR arrays while matplotlib expects RGB, so the channel
#   order is converted before display.
# - suptitle labels the whole figure (plt.title would only label the last subplot).
fig, axes = plt.subplots(4, 10, figsize=(30, 10))
for ax, image_name in zip(axes.flat, benign_df['input_path'].iloc[:40]):
    img = cv2.imread("Input/" + image_name, cv2.IMREAD_COLOR)
    ax.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
fig.suptitle("Benign Samples");

In [None]:
# Patient-level split into train / test / validation: grouping on patient_id
# keeps every image of a given patient inside a single subset.
# Stage 1 holds out 20% for a temporary pool; stage 2 halves that pool,
# giving roughly 80 / 10 / 10 (train / test / validation).
splitter = GroupShuffleSplit(n_splits=2, test_size=0.20, random_state=7)
split = splitter.split(fold_df, groups=fold_df['patient_id'])
train_inds, test_inds = next(split)  # only the first generated split is used

train, temp_test = (
    fold_df.iloc[inds].reset_index(drop=True) for inds in (train_inds, test_inds)
)

# Stage 2: divide the held-out pool between test and validation.
splitter_2 = GroupShuffleSplit(n_splits=2, test_size=0.50, random_state=8)
split_2 = splitter_2.split(temp_test, groups=temp_test['patient_id'])
test_inds, validation_inds = next(split_2)

test, validation = (
    temp_test.iloc[inds].reset_index(drop=True) for inds in (test_inds, validation_inds)
)

### Setting up Stain normalization

In [None]:
# Reference image that the stain normalizer is fitted to.
target_image = data.stain_norm_target()

fig, ax = plt.subplots()
ax.imshow(target_image)
ax.set_axis_off()
ax.set_title('Target Image')
plt.show()

In [None]:
# Can use MacenkoNormalizer, ReinhardNormalizer, RuifrokNormalizer or VahadaneNormalizer
# Build the Reinhard variant and fit it to the target image so that later
# transform() calls map other images onto that reference.
normalizer = stainnorm.ReinhardNormalizer()
normalizer.fit(target_image)

In [None]:
# Plot one benign image before and after stain normalization, side by side.
# The row is selected by position, and the BGR array returned by cv2.imread is
# converted to RGB because both the normalizer (fitted on an RGB target) and
# matplotlib expect RGB channel order.
sample_bgr = cv2.imread("Input/" + benign_df['input_path'].iloc[1], cv2.IMREAD_COLOR)
img = cv2.cvtColor(sample_bgr, cv2.COLOR_BGR2RGB)
normalized_img = normalizer.transform(img)

fig, (ax_original, ax_normalized) = plt.subplots(1, 2, figsize=(12, 6))
ax_original.imshow(img)
ax_original.set_title("Original")
ax_normalized.imshow(normalized_img)
ax_normalized.set_title("Stain normalized");

In [None]:
def normalize_train(image_path: str, dest_path: str, normalizer):
    """Stain-normalize one image from ``Input/`` and write it under ``dest_path``.

    Args:
        image_path: File name of the image inside ``Input/``; the same name is
            used for the output file.
        dest_path: Destination folder prefix (concatenated with ``image_path``,
            so it is expected to end with a path separator).
        normalizer: Fitted object exposing ``transform(image)`` and returning
            the normalized image.

    Returns:
        None. Failures are reported by printing the image name and the error;
        they are not raised, so a batch run continues with the next image.
    """
    try:
        bgr = cv2.imread('Input/' + image_path)
        if bgr is None:
            # cv2.imread signals an unreadable/missing file with None, not an exception.
            raise FileNotFoundError('could not read Input/' + image_path)

        # OpenCV loads BGR; the normalizer is fitted on an RGB target image.
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        # Keep the transformed result: transform() returns the normalized image.
        normalized = normalizer.transform(rgb)

        out_path = dest_path + image_path
        out_dir = os.path.dirname(out_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        # Convert back to BGR for cv2.imwrite, which reports failure via its return value.
        if not cv2.imwrite(out_path, cv2.cvtColor(normalized, cv2.COLOR_RGB2BGR)):
            raise OSError('could not write ' + out_path)
    except Exception as err:  # best effort: report and move on to the next image
        print(f"{image_path}: {err!r}")

In [None]:
def copy_file(image_path: str, dest_path: str):
    """Copy one normalized image into the class sub-folder of a dataset folder.

    The class is derived from the file name: names containing ``'malignant'``
    are read from ``Normalized/Malignant/`` and written to
    ``<dest_path>Malignant/``; all others use the ``Benign/`` folders.

    Args:
        image_path: File name of the image (also used as the output name).
        dest_path: Dataset root folder, e.g. ``'Normalized-Train/'``.

    Returns:
        None. Failures are reported by printing the image name and the error;
        they are not raised, so a batch run continues with the next image.
    """
    try:
        class_dir = 'Malignant/' if 'malignant' in image_path else 'Benign/'
        source = os.path.join('Normalized', class_dir, image_path)
        target_dir = os.path.join(dest_path, class_dir)

        # The destination may not exist yet; create it instead of failing silently.
        os.makedirs(target_dir, exist_ok=True)
        # Byte-for-byte copy: no decode/re-encode round trip through OpenCV.
        shutil.copyfile(source, os.path.join(target_dir, image_path))
    except Exception as err:  # best effort: report and move on to the next image
        print(f"{image_path}: {err!r}")

### Augmenting data

In [None]:
# Row counts of the train, test and validation subsets, one per line.
for subset in (train, test, validation):
    print(len(subset))

### Create separate directories for each class for each dataset

In [None]:
def _by_class(frame, label):
    """Return the rows of ``frame`` whose 'class' column equals ``label``."""
    return frame[frame['class'] == label]


# Benign / malignant rows of each dataset split.
benign_train, malignant_train = _by_class(train, 'benign'), _by_class(train, 'malignant')
benign_test, malignant_test = _by_class(test, 'benign'), _by_class(test, 'malignant')
benign_val, malignant_val = _by_class(validation, 'benign'), _by_class(validation, 'malignant')

In [None]:
# Copy every image of each split into that split's dataset folder.
# A plain loop is used because copy_file is called for its side effect only;
# mapping it over the Series would just produce (and display) a Series of None.
for subset, destination in (
    (train, 'Normalized-Train/'),
    (test, 'Normalized-Test/'),
    (validation, 'Normalized-Validation/'),
):
    for image_name in subset['input_path']:
        copy_file(image_name, destination)

In [None]:
# Augment the training images of each class; the same class folder is passed
# for both path arguments of each call.
# NOTE(review): the meaning of the numeric arguments (0.5 / 0.4 and 3 / 1) is
# defined by augmentations.augment_images, which is not shown here —
# presumably a probability/fraction and a count; confirm against that module.
augmentations.augment_images(benign_train['input_path'], 0.5, 'Normalized-Train/Benign/', 'Normalized-Train/Benign/', 3)
augmentations.augment_images(malignant_train['input_path'], 0.4, 'Normalized-Train/Malignant/', 'Normalized-Train/Malignant/', 1)

### Other stuff

In [None]:
# Stain-normalize every malignant image into Normalized/Malignant/.
# A plain loop is used because normalize_train is called for its side effect
# only; mapping it over the Series would just produce a Series of None.
for image_name in malignant_df['input_path']:
    normalize_train(image_name, 'Normalized/Malignant/', normalizer)

In [None]:
# Preview a flip augmentation of the first training image.
# The function is reached through the imported `augmentations` module; the bare
# name is never defined in this notebook and fails on a fresh kernel.
# NOTE(review): assumes augmentations.flip_augmentation exists with this
# five-argument signature — confirm against the module.
img = augmentations.flip_augmentation('Input/', train['input_path'][0], '', 1.0, False)
plt.imshow(img)

In [None]:
# Preview a rotation augmentation of the first training image.
# The function is reached through the imported `augmentations` module; the bare
# name is never defined in this notebook and fails on a fresh kernel.
# NOTE(review): assumes augmentations.rotate_augmentation exists with this
# four-argument signature — confirm against the module.
img = augmentations.rotate_augmentation(train['input_path'][0], '', 1.0, False)
plt.imshow(img)

In [None]:
# Augment all training images, passing 'Normalized/' and 'Train/Augmented/' as
# the two path arguments.
# NOTE(review): the meaning of 0.87 and 2 is defined by
# augmentations.augment_images (not shown here) — confirm against that module.
augmentations.augment_images(train['input_path'], 0.87, 'Normalized/', 'Train/Augmented/', 2)