Preliminaries are pieces of code that obtain information about the dataset and build the image paths together with their corresponding endpoint and patient ID arrays.

- If they have already been run, it is not necessary to rerun them. Rerunning is safe: the saved files are simply overwritten.

# Libraries

In [None]:
# Mount Google Drive at /content/drive; every path used below lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import sys
import skimage
import matplotlib.pyplot as plt
import cv2 as cv
import numpy as np
import gc
from tqdm import tqdm
import pickle
import copy

#Local binary pattern
from skimage.feature import local_binary_pattern

#Paths
import glob

#Model creation
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Functions

In [None]:
#Global lookup tables shared by the cells below
#index -> magnification, as it appears in directory and file names
mag_dict = dict(enumerate(('40', '100', '200', '400')))
#index -> dataset split name
tt_dict = dict(enumerate(('train', 'test')))

## For folder information

In [None]:
def num_files(fold, magnification, root='/content/drive/MyDrive/Ars_machinae_autodiscentis/Inceptum'):
  """
  Count the files of one fold and magnification in the train and test sets.

  The directories are listed through their full path, so the working
  directory of the process is left untouched.

  :param fold: Fold number as it appears in the directory name (fold{fold})
  :param magnification: Magnification as it appears in the directory name ({magnification}X)
  :param root: Directory that contains the fold{fold} folders
  :return: list [len_train, len_test, ratio]; ratio is the fraction of the
           files that belongs to the training set (0.0 when both sets are empty)
  """
  def count_files(split):
    #Only regular files are counted; sub-directories are ignored
    directory = os.path.join(root, f'fold{fold}', split, f'{magnification}X')
    return len([name for name in os.listdir(directory)
                if os.path.isfile(os.path.join(directory, name))])

  len_train = count_files('train')
  len_test = count_files('test')

  total = len_train + len_test
  #Guard against a fold without any file instead of dividing by zero
  ratio = len_train / total if total else 0.0

  return [len_train, len_test, ratio]

def printFoldsInfo():
  """
  Print the train/test distribution of every magnification and fold.

  Folds are numbered 1-5, matching the fold{n} directory names used by
  give_paths/all_paths and by the cells that save the endpoint arrays.
  """
  for magnification in mag_dict.values():
    for fold in range(1, 6):
      #A single call per fold: every call lists two directories on Drive
      len_train, len_test, ratio = num_files(fold, magnification)
      print(f'For magnification {magnification} in folder {fold} we have the distribution:')
      print(f'Training size: {len_train}, test size: {len_test}, For train: {round(ratio*100)}%\n')

In [None]:
def train_test_from_pickle(category, fold, mag):
  """
  Load the pickled train and test arrays of one category for a fold and magnification.

  :param category: File-name prefix of the data to load (e.g. 'endpoints')
  :param fold: Zero-based fold index; the files are read from fold{fold+1}
  :param mag: Key of mag_dict that selects the magnification
  :return: tuple (train_array, test_array) with the unpickled objects
  """
  def load(pickle_path):
    #Unpickle a single file, closing it afterwards
    with open(pickle_path, 'rb') as handle:
      return pickle.load(handle)

  path_train = f'/content/drive/MyDrive/Ars_machinae_autodiscentis/Inceptum/fold{fold+1}/train/{category}_f{fold+1}_train_{mag_dict[mag]}x_fv.p'
  path_test = f'/content/drive/MyDrive/Ars_machinae_autodiscentis/Inceptum/fold{fold+1}/test/{category}_f{fold+1}_test_{mag_dict[mag]}x_fv.p'

  return load(path_train), load(path_test)

## For paths

In [None]:
def give_paths(folder):
  """
  Collect the .png image paths of one fold for all magnifications.

  :param folder: Fold number as used in the directory name (integer 1-5)
  :return: tuple (paths_train, paths_test); each one is a list of length 4
           holding one list of paths per magnification
           (0:40x, 1:100x, 2:200x, 3:400x)
  """
  def png_files(directory):
    #All .png files directly inside the given directory
    return glob.glob(directory + '/*.png')

  paths_train, paths_test = [], []
  for mag in ('40', '100', '200', '400'):
    paths_train.append(png_files(f'/content/drive/MyDrive/Ars_machinae_autodiscentis/Inceptum/fold{folder}/train/{mag}X'))
    paths_test.append(png_files(f'/content/drive/MyDrive/Ars_machinae_autodiscentis/Inceptum/fold{folder}/test/{mag}X'))

  return paths_train, paths_test

def all_paths():
  """
  Gather the image paths of the five folds.

  :return: list indexed as paths[folder][train-test][magnification];
           position 0-4 of the list holds the result for fold 1-5
  """
  return [give_paths(fold_number) for fold_number in range(1, 6)]

# 1. Information and paths

## Information about cross-validation folds

In this part, we obtain information about how the data is distributed across the 5 folds produced by the nested cross-validation split.

In [None]:
#Save in a txt file the information of the train-test distribution for our 5-fold cross-validation folders.
from contextlib import redirect_stdout

os.chdir('/content/drive/MyDrive/Ars_machinae_autodiscentis/Inceptum')
#The context managers restore sys.stdout and close the file even when
#printFoldsInfo() raises, so a failure cannot leave the output redirected.
with open('Information.txt', 'w') as f, redirect_stdout(f):
  printFoldsInfo()

## Image paths

Here the paths of the images contained in each folder are saved in a pickle file.

The array **paths** (a NumPy object array of lists) contains all image paths.
You can access the individual paths following this indexing:


*   paths[folder][train or test][magnification]



1. folder number (0-4)
2. 0:train, 1:test
3. 0:40x, 1: 100x, 2:200x, 3:400x

In [None]:
#We obtain all images paths
paths_lists = all_paths()

#Build the (folder, train-test, magnification) object array cell by cell.
#Letting np.array() infer the shape from the nested lists would add a fourth
#axis whenever every innermost list happens to have the same length.
paths_array = np.empty((len(paths_lists), 2, 4), dtype=object)
for fold_idx, fold_paths in enumerate(paths_lists):
  for tt_idx, split_paths in enumerate(fold_paths):
    for mag_idx, image_paths in enumerate(split_paths):
      paths_array[fold_idx, tt_idx, mag_idx] = image_paths #Each cell stores one list of paths

### Saving paths in file as array of lists

In [None]:
#Checking that the array of lists is well defined
print(paths_array.shape)
##Saving
os.chdir('/content/drive/MyDrive/Ars_machinae_autodiscentis/Inceptum')
#The context manager closes (and therefore flushes) the file once the dump is done
with open("paths.p", "wb") as handle:
  pickle.dump(paths_array, handle)

(5, 2, 4)


# 2. Binary classification

## Endpoint array (Binary classification)
An endpoint array is created for each folder, train/test split and magnification (5 × 2 × 4 = 40 in total)

In [None]:
#Read back the pickled array of image paths
paths_file = '/content/drive/MyDrive/Ars_machinae_autodiscentis/Inceptum/paths.p'
with open(paths_file, 'rb') as handle:
  paths = pickle.load(handle)

In [None]:
#Binary endpoint array for every folder, train/test split and magnification
for f in range(5):
  for tt in range(2):
    for mag in range(4):
      paths_folder = paths[f,tt,mag] #Image paths of the current combination
      #An image gets a 1 when its path contains '_M_', otherwise 0
      has_m_tag = ['_M_' in img_path for img_path in paths_folder]
      endpoint_vector = np.array(has_m_tag, dtype=np.uint8).reshape(-1, 1) #Column vector, one row per image

      #Saving document
      file_name = f'/content/drive/MyDrive/Ars_machinae_autodiscentis/Inceptum/fold{f+1}/{tt_dict[tt]}/endpoints_f{f+1}_{tt_dict[tt]}_{mag_dict[mag]}x_fv.p'
      with open(file_name, 'wb') as handle:
        pickle.dump(endpoint_vector, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
#Number of benign vs malignant images for every fold and magnification
def _benign_malignant(y):
  #The sum of the endpoint array is taken as the malignant count; the rest is benign
  malignant = y.sum()
  benign = (y.shape[0] - malignant).astype(np.uint32)
  return benign, malignant

for fold in range(5):
  for mag in range(4):

    y_train, y_test = train_test_from_pickle(category='endpoints', fold=fold, mag=mag)

    print(f'For fold {fold+1} and magnification {mag_dict[mag]}')
    benign_train, malignant_train = _benign_malignant(y_train)

    print(f'Train: (Benign: {benign_train}, Malignant: {malignant_train}) Benign %: {round(benign_train/y_train.shape[0]*100,2)}')

    benign_test, malignant_test = _benign_malignant(y_test)
    print(f'test: (Benign: {benign_test}, Malignant: {malignant_test}) Benign %: {round(benign_test/y_test.shape[0]*100,2)}\n')

For fold 1 and magnification 40
Train: (Benign: 370, Malignant: 880) Benign %: 29.6
test: (Benign: 255, Malignant: 490) Benign %: 34.23

For fold 1 and magnification 100
Train: (Benign: 383, Malignant: 938) Benign %: 28.99
test: (Benign: 261, Malignant: 499) Benign %: 34.34

For fold 1 and magnification 200
Train: (Benign: 368, Malignant: 901) Benign %: 29.0
test: (Benign: 255, Malignant: 489) Benign %: 34.27

For fold 1 and magnification 400
Train: (Benign: 351, Malignant: 814) Benign %: 30.13
test: (Benign: 237, Malignant: 418) Benign %: 36.18

For fold 2 and magnification 40
Train: (Benign: 433, Malignant: 930) Benign %: 31.77
test: (Benign: 192, Malignant: 440) Benign %: 30.38

For fold 2 and magnification 100
Train: (Benign: 449, Malignant: 1012) Benign %: 30.73
test: (Benign: 195, Malignant: 425) Benign %: 31.45

For fold 2 and magnification 200
Train: (Benign: 455, Malignant: 961) Benign %: 32.13
test: (Benign: 168, Malignant: 429) Benign %: 28.14

For fold 2 and magnification 4

## Patient ID array

In [None]:
#Read back the pickled array of image paths
paths_file = '/content/drive/MyDrive/Ars_machinae_autodiscentis/Inceptum/paths.p'
with open(paths_file, 'rb') as handle:
  paths = pickle.load(handle)

In [None]:
#ID array for every folder, train/test split and magnification
for f in range(5):
  for tt in range(2):
    for mag in range(4):
      paths_folder = paths[f,tt,mag]
      #The ID is the third '-'-separated field of the file name
      ids = [img_path.rsplit('/', 1)[1].split('-', 3)[2] for img_path in paths_folder]
      #Object dtype keeps the IDs as Python strings; one row per image
      ID_vector = np.array(ids, dtype='object').reshape(-1, 1)
      #Saving document
      file_name = f'/content/drive/MyDrive/Ars_machinae_autodiscentis/Inceptum/fold{f+1}/{tt_dict[tt]}/ID_f{f+1}_{tt_dict[tt]}_{mag_dict[mag]}x_fv.p'
      with open(file_name, 'wb') as handle:
        pickle.dump(ID_vector, handle, protocol=pickle.HIGHEST_PROTOCOL)

# 3. Multiclass Classification

## Endpoint array (Multiclass classification)

In [None]:
# Endpoints for multiclass classification
'''
Benign
B_A : adenosis (A)
B_F : fibroadenoma (F)
B_PT : phyllodes tumor (PT)
B_TA : tubular adenoma (TA)

Malignant
M_DC : ductal carcinoma (DC)
M_LC : lobular carcinoma (LC)
M_MC : mucinous carcinoma (MC)
M_PC : papillary carcinoma (PC)
'''
import pickle
# Load the saved image paths; the context manager closes the file once it has been read.
with open('/content/drive/MyDrive/Ars_machinae_autodiscentis/Inceptum/paths.p', 'rb') as handle:
  paths = pickle.load(handle)

In [None]:
def get_multi_class(paths):
  '''
  Returns the dataframe of encoded multi classes from 0 to 7.

  The class is read from the file name: the text between the first '_' and
  the first '-' that follows it (e.g. 'M_DC'). A path whose class is not one
  of the eight known classes is left out of the result; a warning reports how
  many were left out, because the output is then shorter than the input and
  no longer lines up with it.

  input: paths => the list of the paths in a folder.
  output: DataFrame with a single column holding one code per recognised
          path, in input order.
  '''
  import warnings

  # Class -> code. Note that B_TA is encoded as 2 and B_PT as 3.
  reps = {'B_A': 0, 'B_F': 1, 'B_TA': 2, 'B_PT': 3, 'M_DC': 4, 'M_LC': 5,'M_MC': 6, 'M_PC': 7}
  cat = []
  total = 0
  skipped = 0
  for path in paths:
    total += 1
    c = path.rsplit('/', 1)[1].split('_', 1)[1].split('-', 1)[0]
    if c in reps:
      cat.append(reps[c])
    else:
      skipped += 1
  if skipped:
    warnings.warn(f'get_multi_class: {skipped} of {total} paths have an unknown class and were skipped')
  df = pd.DataFrame(cat)
  return df

In [None]:
#Multiclass endpoint array for every folder, train/test split and magnification
for f in range(5):
  for tt in range(2):
    for mag in range(4):
      #Encode the classes of the current image paths, then convert to a uint8 array
      encoded_classes = get_multi_class(paths[f,tt,mag])
      endpoint_vector = encoded_classes.to_numpy(dtype=np.uint8)

      #Saving document
      file_name = f'/content/drive/MyDrive/Ars_machinae_autodiscentis/Inceptum/fold{f+1}/{tt_dict[tt]}/endpoints_multi_f{f+1}_{tt_dict[tt]}_{mag_dict[mag]}x_fv.p'
      with open(file_name, 'wb') as handle:
        pickle.dump(endpoint_vector, handle, protocol=pickle.HIGHEST_PROTOCOL)