# Libraries

In [None]:
# Mount Google Drive at /content/drive (Google Colab only); every data path used below lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import skimage
import matplotlib.pyplot as plt
import cv2 as cv
import numpy as np
import gc
from tqdm import tqdm
import pickle
import copy

#Model creation
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
import scipy
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.svm import LinearSVC
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.metrics import matthews_corrcoef, make_scorer

#Oversampling for class imbalance
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE, ADASYN, SMOTENC
from imblearn.pipeline import Pipeline as Pipelineim
from sklearn.base import BaseEstimator, TransformerMixin

# Functions

## Original functions

In [None]:
#Global lookup tables
#mag_dict: magnification index -> magnification label as it appears in the file names
mag_dict = dict(enumerate(('40', '100', '200', '400')))
#tt_dict: split index -> split name
tt_dict = dict(enumerate(('train', 'test')))

In [None]:
def read_files(extractor, fold, mag):
  '''
  Load the train and test feature tables of one fold and magnification from
  the tab-separated CSV files on the mounted drive.

  extractor: Feature extractor name used as file-name prefix - String.
  fold: Fold number exactly as written in the folder/file names - int.
  mag: Magnification exactly as written in the file names (e.g. 40) - int.

  The first file column is used as index. Of the remaining columns, the first
  one is returned as the patient-ID dataframe, the last one as the labels and
  everything in between as the features.

  Returns X_train, X_test, y_train, y_test, df_patient_train, df_patient_test
  '''
  def _split_columns(table):
    # (features, labels, one-column dataframe with the patient Id's)
    return table.iloc[:,1:-1], table.iloc[:,-1], table.iloc[:,0:1]

  train_table = pd.read_csv(f'/content/drive/MyDrive/Ars_machinae_autodiscentis/Inceptum/fold{fold}/train/{extractor}_f{fold}_train_{mag}x_fv.csv', delimiter='\t', index_col=0)
  X_train, y_train, df_patient_train = _split_columns(train_table)

  test_table = pd.read_csv(f'/content/drive/MyDrive/Ars_machinae_autodiscentis/Inceptum/fold{fold}/test/{extractor}_f{fold}_test_{mag}x_fv.csv', delimiter='\t', index_col=0)
  X_test, y_test, df_patient_test = _split_columns(test_table)

  return X_train, X_test, y_train, y_test, df_patient_train, df_patient_test

def read_files_csv(extractor, fold, mag):
  '''
  Index-based variant of the CSV loader: same files and same column split,
  but fold and magnification are given as zero-based indices.

  extractor: Feature extractor name used as file-name prefix - String.
  fold: Zero-based fold index (the files of fold+1 are read) - int.
  mag: Key of mag_dict (0..3), translated to the magnification label - int.

  The first file column is used as index. Of the remaining columns, the first
  one is returned as the patient-ID dataframe, the last one as the labels and
  everything in between as the features.

  Returns X_train, X_test, y_train, y_test, df_patient_train, df_patient_test
  '''
  def _split_columns(table):
    # (features, labels, one-column dataframe with the patient Id's)
    return table.iloc[:,1:-1], table.iloc[:,-1], table.iloc[:,0:1]

  train_table = pd.read_csv(f'/content/drive/MyDrive/Ars_machinae_autodiscentis/Inceptum/fold{fold+1}/train/{extractor}_f{fold+1}_train_{mag_dict[mag]}x_fv.csv', delimiter='\t', index_col=0)
  X_train, y_train, df_patient_train = _split_columns(train_table)

  test_table = pd.read_csv(f'/content/drive/MyDrive/Ars_machinae_autodiscentis/Inceptum/fold{fold+1}/test/{extractor}_f{fold+1}_test_{mag_dict[mag]}x_fv.csv', delimiter='\t', index_col=0)
  X_test, y_test, df_patient_test = _split_columns(test_table)

  return X_train, X_test, y_train, y_test, df_patient_train, df_patient_test

In [None]:
def knn_pipeline(X_train, y_train, X_test, y_test, df_patient_test):
  '''
  Fit a StandardScaler + classifier pipeline on the training data and
  evaluate it on the test data.
  NOTE: despite the function name, the classifier currently enabled is
  SVC(C=10, gamma=0.008); the KNN and random-forest variants are kept below
  as commented-out alternatives.

  Returns recognition rate, image-wise accuracy and weighted F1 score.
  '''
  #pipe = Pipeline([('scaler', StandardScaler()),('classifier',KNeighborsClassifier(n_neighbors=1))])
  steps = [('scaler', StandardScaler()), ('classifier', SVC(C=10,gamma=0.008))]
  #pipe = Pipeline([('scaler', StandardScaler()),('classifier',RandomForestClassifier(n_estimators=300, class_weight="balanced", max_depth=100, max_features= 'log2', criterion= 'gini') )])

  fitted = Pipeline(steps).fit(X_train, y_train)
  predictions = fitted.predict(X_test)

  # Image-level metrics
  image_accuracy = metrics.accuracy_score(y_test, predictions)
  weighted_f1 = f1_score(y_test, predictions, average='weighted')

  # Patient-level metric: mean of the per-patient scores
  rate = recognition_rate(patient_score(df_patient_test, predictions, y_test))

  return rate, image_accuracy, weighted_f1

def avg_rec_rate(mag,extractor, multi=False):
  '''
  Run knn_pipeline on folds 1 to 5 for one magnification and one feature
  extractor and average the three metrics it reports.

  mag: Magnification factor as used in the file names (40,100,200,400) - int.
  extractor: Feature extractor - String.
  multi: Load the multiclass labels (read_files_multi_class) instead of the
         labels stored in the csv (read_files) - Boolean. Default=False.

  Returns the fold averages of recognition rate, accuracy and weighted F1.
  '''
  folds = [1,2,3,4,5]
  use_multi = multi == True
  rec_total = 0
  acc_total = 0
  f1_total = 0

  for fold in folds:
    # The two readers return the same pieces in a different order
    if use_multi:
      X_train, y_train, _, X_test, y_test, df_patient_test = read_files_multi_class(extractor, fold, mag)
    else:
      X_train, X_test, y_train, y_test, _, df_patient_test = read_files(extractor, fold, mag)

    rec_rate, acc, f1 = knn_pipeline(X_train, y_train, X_test, y_test, df_patient_test)
    rec_total += rec_rate
    acc_total += acc
    f1_total += f1

  n_folds = len(folds)
  return rec_total/n_folds, acc_total/n_folds, f1_total/n_folds # Take the average

In [None]:
def patient_score(df_pat_test, y_p, y_t):
    '''
    Patient score = Nrec / Np for every patient of the test set, where
    Np  : number of images that belong to patient p.
    Nrec: number of those images whose prediction equals the true label.

    df_pat_test: patient Id of every test image (one-column dataframe or 1-D array-like).
    y_p: predicted label of every test image (array-like).
    y_t: true label of every test image (array-like).
    All three are matched by position, so they must list the images in the
    same order; whatever index they carry is ignored.

    Returns the list of patient scores, ordered by sorted patient Id.
    An empty test set gives an empty list.
    '''
    # Flatten to plain arrays first: combining pandas objects column by column
    # would align them on their index and fill NaN wherever the prediction's
    # 0..n-1 index does not match the index read from the csv.
    per_image = pd.DataFrame({
        'patients': np.asarray(df_pat_test).ravel(),
        'y_test': np.asarray(y_t).ravel(),
        'y_pred': np.asarray(y_p).ravel(),
    })
    per_image['comparison'] = np.where(per_image['y_test'] == per_image['y_pred'], 1, 0) # 1 where the image is correctly classified

    # Mean of the 0/1 comparison column per patient is exactly Nrec / Np
    p_scores = per_image.groupby('patients')['comparison'].mean()

    return list(p_scores)

def recognition_rate(p_score_arr):
  '''
  Recognition rate: the sum of the patient scores divided by the number of
  patients.

  p_score_arr: patient scores - Array.
  '''
  n_patients = len(p_score_arr)
  total = np.sum(np.array(p_score_arr))
  return total/n_patients

## Pickle-based functions
Functions using pickle and numpy arrays

In [None]:
def train_test_from_pickle(category, fold, mag):
  """
  Unpickles the train and the test file of one data category for a given
  fold and magnification.

  :param category: File-name prefix of the data to load (a feature extractor name for X, 'endpoints' for y, 'ID' for the patients' ID)
  :param fold: Zero-based fold index (the files of fold+1 are read)
  :param mag: Key of mag_dict selecting the magnification
  :return: the unpickled train object and the unpickled test object (described as np.arrays by the original author)
  """
  def _unpickle(path):
    #The files are produced by this project; pickle must not be pointed at untrusted files
    with open(path, 'rb') as handle:
      return pickle.load(handle)

  train_array = _unpickle(f'/content/drive/MyDrive/Ars_machinae_autodiscentis/Inceptum/fold{fold+1}/train/{category}_f{fold+1}_train_{mag_dict[mag]}x_fv.p')
  test_array = _unpickle(f'/content/drive/MyDrive/Ars_machinae_autodiscentis/Inceptum/fold{fold+1}/test/{category}_f{fold+1}_test_{mag_dict[mag]}x_fv.p')

  return train_array, test_array

def read_files_pickle(extractor, fold, mag): #Name given to mimic csv files function
  """
  Pickle counterpart of read_files_csv: loads features, labels and patients' ID
  of one fold and magnification and returns them in the same order.

  :param extractor: feature extractor name. e.g. extractor='GLCM'
  :param fold: zero-based fold index (fold number - 1)
  :param mag: magnification as key of mag_dict
  :return: X_train, X_test, y_train, y_test, ID_train, ID_test
  """
  #One (train, test) pair per category: features, labels ('endpoints') and patients' ID
  pairs = [train_test_from_pickle(category=name, fold=fold, mag=mag) for name in (extractor, 'endpoints', 'ID')]
  (X_train, X_test), (y_train, y_test), (ID_train, ID_test) = pairs

  return X_train, X_test, y_train, y_test, ID_train, ID_test

def remove_correlated(X_train, X_test, max_corr):
  """
  Removes highly correlated features. Input can be np.array or pandas df, but output will be pandas df.
  The correlations are computed on the training matrix only; the same columns are then removed from both matrices.

  :param X_train: training feature matrix
  :param X_test: testing feature matrix (must have the same column labels as X_train)
  :param max_corr: Maximum absolute correlation accepted between two features.
  :return: Training and test feature matrices without the correlated columns, as dataframes
  """
  train = pd.DataFrame(X_train)
  test = pd.DataFrame(X_test)

  abs_corr = train.corr().abs()
  #Keep only the entries strictly above the diagonal so every pair is looked at once
  #and, of two correlated features, the later column is the one that gets dropped
  above_diagonal = np.triu(np.ones(abs_corr.shape, dtype=bool), k=1)
  upper_tri = abs_corr.where(above_diagonal)

  too_correlated = (upper_tri > max_corr).any(axis=0).to_numpy()
  to_drop = list(upper_tri.columns[too_correlated])

  #Drop by column label in both matrices
  X_train_nocorr = train.drop(columns=to_drop)
  X_test_nocorr = test.drop(columns=to_drop)

  return X_train_nocorr, X_test_nocorr

def machine_learning_pipeline(X_train, X_test, y_train, y_test, ID_test, pipe, max_corr=1):
  """
  Main machine_learning pipeline. It receives the data (feature matrices, labels and patients' ID), removes
  correlated features, fits the given estimator and returns a dataframe comparing real and predicted labels
  together with the fitted model.

  :param X_train, X_test: feature matrices (np.array or dataframe)
  :param y_train, y_test: labels (np.array, Series or single-column dataframe)
  :param ID_test: patients' ID of the test images, in the same order as X_test / y_test
  :param pipe: estimator (pipeline or search object) exposing fit and predict
  :param max_corr: Maximum correlation accepted among features
  :return: dataframe with columns ID, y_test, y_pred and comparison (1 where the prediction is right), and the fitted model
  """
  print(f'Input features: {X_train.shape[1]}\n')
  X_train_nocorr, X_test_nocorr = remove_correlated(X_train, X_test, max_corr=max_corr) #Remove correlated features from X_train and X_test
  print(f'No-correlated features: {X_train_nocorr.shape[1]}\n')

  #np.asarray(...).ravel() accepts np.arrays, Series and single-column dataframes alike
  model = pipe.fit(X_train_nocorr, np.asarray(y_train).ravel()) #Model fit
  y_pred = np.asarray(model.predict(X_test_nocorr)).ravel() #Prediction
  y_true = np.asarray(y_test).ravel()

  df_comparison = pd.DataFrame(ID_test).copy() # Copy of the patient ID's of the test set
  df_comparison.columns = ['ID']
  #Plain arrays are assigned by position. Assigning pandas objects instead would align them on
  #their index and leave NaN wherever the index of the csv data is not 0..n-1.
  df_comparison['y_test'] = y_true
  df_comparison['y_pred'] = y_pred
  df_comparison['comparison'] = np.where(df_comparison['y_test'] == df_comparison['y_pred'], 1, 0) # 1 where y_pred==y_test

  return df_comparison, model

def patient_score_beta(df_comparison):
  """
  Computes the patient score (correctly classified images / total images) of every patient.
  A second list pairing each score with its patient ID is returned to analyse individual patients.

  :param df_comparison: Dataframe with at least the columns 'ID' and 'comparison' (1 where the prediction is right, 0 otherwise)
  :return: two lists ordered by sorted patient ID: the patient scores, and [patient score, ID] pairs
  """
  p_score_list = [] #List with the patient scores
  p_score_ID_list = [] #List with the patient score and IDs

  for patient_id, images in df_comparison.groupby('ID'): # One group per patient
    n_correct = images['comparison'].sum() # Nrec
    #Np is simply the group size; looking it up as value_counts()[0] is a label lookup that fails for integer IDs
    n_images = len(images)
    p_score = n_correct/n_images # Nrec/Np
    p_score_list.append(p_score)
    p_score_ID_list.append([p_score, patient_id])

  return p_score_list, p_score_ID_list

def performance_metrics(df_comparison):
  """
  Computes the performance metrics from the label comparison dataframe: patient scores,
  recognition rate (mean of the patient scores) and image-wise accuracy (mean of the comparison column).

  :param df_comparison: Dataframe with the columns 'ID' and 'comparison'
  :return: patient scores, [patient score, ID] pairs, recognition rate and image-wise accuracy
  """
  scores, scores_with_id = patient_score_beta(df_comparison) #Patient-level scores
  recognition = np.mean(scores) #Recognition rate
  image_accuracy = df_comparison['comparison'].mean() #Fraction of correctly classified images

  return scores, scores_with_id, recognition, image_accuracy

def get_problematic_patients(p_score_ID_list, min_score=0.5):
  """
  Given the patient scores with their IDs, returns the entries of the problematic patients (pscore<min_score).

  :param p_score_ID_list: list of [pscore, ID] pairs
  :param min_score: scores strictly below this threshold are considered problematic
  :return: list with the [pscore, ID] pairs of the problematic patients, in input order
  """
  #Compare against the min_score argument (not a fixed 0.5) so the threshold can be changed by the caller
  return [entry for entry in p_score_ID_list if entry[0] < min_score]

def model2performance_metrics(extractor='PFTAS', read_files_type=read_files_csv, pipe=SVC(), max_corr=1, fold=0, mag=0):
  """
  Loads one fold/magnification, fits the given estimator and reports its performance.

  :param extractor: name of extractor of features used
  :param read_files_type: loader function to use (read_files_csv or read_files_pickle)
  :param pipe: Estimator (pipeline or search object) of the classification method.
  :param max_corr: Maximum correlation allowed between features
  :param fold: zero-based fold index
  :param mag: magnification as key of mag_dict
  :return: patient scores, [patient score, ID] pairs, recognition rate, image-wise accuracy and fitted model. Additionally, the metrics and the problematic patients are printed.
  """
  #Load the data; the training patient IDs are not needed here
  loaded = read_files_type(extractor=extractor, fold=fold, mag=mag)
  X_train, X_test, y_train, y_test, _, ID_test = loaded

  #Fit, predict and compare predicted with real labels
  df_comparison, model = machine_learning_pipeline(X_train, X_test, y_train, y_test, ID_test, pipe=pipe, max_corr=max_corr)

  #Patient-level and image-level metrics
  p_score_list, p_score_ID_list, rec_rate, acc = performance_metrics(df_comparison)
  p_problem = get_problematic_patients(p_score_ID_list)

  #Report
  report = [
    f'-For fold {fold+1} and magnification {mag_dict[mag]}:\n',
    f'Recognition rate: {rec_rate}',
    f'Image-wise accuracy: {acc}',
    f'Problematic patients: {p_problem}\n', #Problematic patients' ID and score
  ]
  for line in report:
    print(line)

  return p_score_list, p_score_ID_list, rec_rate, acc, model

def result_all_folders(extractor, read_files_type, pipe, max_corr=1, mag=0):
  """
  All 5 folds are run for the same magnification and the mean recognition rate is printed.

  :param extractor: name of extractor of features used
  :param read_files_type: loader function to use (read_files_csv or read_files_pickle)
  :param pipe: Estimator (pipeline or search object) of the classification method.
  :param max_corr: Maximum correlation allowed between features
  :param mag: magnification as key of mag_dict
  :return: the model fitted on the last fold
  """
  all_rec_rates = np.zeros(5) #To save recognition rates
  for fold in range(5):
    p_score_list, p_score_ID_list, rec_rate, acc, model = model2performance_metrics(extractor, read_files_type, pipe=pipe, max_corr=max_corr, fold=fold, mag=mag) #Fold-wise learning method
    all_rec_rates[fold] = rec_rate #save recognition rate
    if hasattr(model,'best_estimator_'):
      best = model.best_estimator_
      print(f'Grid best estimator: {best}\n') #Print the best estimator hyperparameters of the grid search
      #Only pipelines built with a PCA step have a 'reductor'; the plain KNN/SVM/RF grids do not,
      #so look the step up defensively instead of indexing named_steps directly
      reductor = getattr(best, 'named_steps', {}).get('reductor')
      if reductor is not None:
        print(f'Best estimator number of Principal COmponents: {reductor.explained_variance_ratio_.shape[0]}\n')
      print('############\n')

  print(f'---- Mean recognition rate for magnification {mag_dict[mag]}x: {all_rec_rates.mean()}') #Mean recognition rate (final metric)
  return model

In [None]:
def classifier_and_grid(method, grid='normal',verbose=1, reductor_components=[1]):
  """
  Builds the (not yet fitted) hyperparameter search for the requested learning method.
  Every pipeline is scaler -> optional PCA ('reductor') -> classifier and is scored with accuracy.

  :param method: learning method name: 'KNN', 'KNN_PCA', 'SVM', 'SVM_PCA', 'RF' or 'RF_PCA'
  :param grid: type of search, 'normal' (exhaustive grid) or 'random'. Only used by the SVM methods
  :param verbose: verbosity passed to the search object
  :param reductor_components: candidate values for the PCA n_components (PCA methods only)
  :return: GridSearchCV / RandomizedSearchCV object, or None when method (or grid, for the SVM methods) is not recognised
  """
  n_splits = 5
  svm_c = [0.1, 1, 10, 100]
  svm_gamma = [100, 10, 1, 0.1, 0.01, 0.001, 0.0001]
  forest_sizes = [100, 200, 400, 600, 800]

  def _pipeline(classifier, with_pca=False):
    #scaler -> optional PCA -> classifier
    steps = [('scaler', StandardScaler())]
    if with_pca:
      steps.append(('reductor', PCA()))
    steps.append(('classifier', classifier))
    return Pipeline(steps)

  #KNN
  if method == 'KNN':
    space = {'classifier__n_neighbors': list(range(1,40))}
    return GridSearchCV(_pipeline(KNeighborsClassifier()), space, scoring='accuracy', verbose = verbose)

  #KNN with PCA
  if method == 'KNN_PCA':
    space = {'reductor__n_components': reductor_components,
             'classifier__n_neighbors': list(range(1,30))}
    return GridSearchCV(_pipeline(KNeighborsClassifier(), with_pca=True), space, scoring='accuracy', verbose = verbose)

  #SVM with PCA
  if method == 'SVM_PCA':
    if grid == 'normal':
      space = {'reductor__n_components': reductor_components,
               'classifier__C': svm_c,
               'classifier__gamma': svm_gamma,
               'classifier__kernel': ['rbf']}
      return GridSearchCV(_pipeline(SVC(), with_pca=True), space, scoring='accuracy', cv = StratifiedKFold(n_splits=n_splits), verbose = verbose)
    if grid == 'random':
      space = {'reductor__n_components': reductor_components,
               'classifier__C': scipy.stats.expon(scale=10),
               'classifier__gamma': scipy.stats.expon(scale=.001),
               'classifier__kernel': ['rbf'],
               'classifier__class_weight':['balanced']}
      return RandomizedSearchCV(_pipeline(SVC(), with_pca=True), space, n_iter=20, scoring='accuracy', cv = StratifiedKFold(n_splits=n_splits), verbose=verbose, return_train_score=False)
    return None

  #SVM no PCA
  if method == 'SVM':
    if grid == 'normal':
      space = {'classifier__C': svm_c,
               'classifier__gamma': svm_gamma,
               'classifier__kernel': ['rbf'],
               'classifier__class_weight':['balanced']}
      return GridSearchCV(_pipeline(SVC()), space, scoring='accuracy', verbose = verbose)
    if grid == 'random':
      space = {'classifier__C': scipy.stats.expon(scale=10),
               'classifier__gamma': scipy.stats.expon(scale=.001),
               'classifier__kernel': ['rbf'],
               'classifier__class_weight':['balanced']}
      return RandomizedSearchCV(_pipeline(SVC()), space, n_iter=100, scoring='accuracy', verbose=verbose, return_train_score=False)
    return None

  #Random forest
  if method == 'RF':
    space = {'classifier__n_estimators': forest_sizes}
    return GridSearchCV(_pipeline(RandomForestClassifier()), space, scoring='accuracy', verbose = verbose)

  #Random forest with PCA
  if method == 'RF_PCA':
    space = {'reductor__n_components': reductor_components,
             'classifier__n_estimators': forest_sizes}
    return GridSearchCV(_pipeline(RandomForestClassifier(), with_pca=True), space, scoring='accuracy', verbose = verbose)

  return None

## Stratified and grouped functions

In [None]:
def classifier_and_grid_strat(X, y, ID, method, grid='normal',verbose=1, reductor_components=[1], scoring = 'accuracy', sampler =SMOTE()):
  """
  Builds the (not yet fitted) hyperparameter search for the requested learning method, cross-validated with
  stratified, patient-grouped folds. Every pipeline is scaler -> optional PCA ('reductor') -> sampler -> classifier.

  :param X, y: training feature matrix and labels, only used to precompute the CV splits
  :param ID: patients' ID of the training samples, used as groups so a patient never appears on both sides of a CV split
  :param method: learning method name: 'KNN', 'KNN_PCA', 'SVM', 'SVM_PCA', 'RF' or 'RF_PCA'
  :param grid: type of search, 'normal' (exhaustive grid) or 'random'. Only used by the SVM methods
  :param verbose: verbosity passed to the search object
  :param reductor_components: candidate values for the PCA n_components (PCA methods only)
  :param scoring: 'MCC' for the Matthews correlation coefficient; any other value is handed to the search object as given
  :param sampler: resampling step placed right before the classifier (e.g. SMOTE() or IdentityTransformer())
  :return: GridSearchCV / RandomizedSearchCV object, or None when method (or grid, for the SVM methods) is not recognised
  """
  #CV split definition
  gkf = list(StratifiedGroupKFold(n_splits=5).split(X,y,groups=ID))
  #'MCC' is this notebook's shorthand and has to be turned into a scorer object.
  #Every other value is passed through untouched so the caller's choice is respected.
  if scoring == 'MCC':
    scoring = make_scorer(matthews_corrcoef)

  svm_c = [0.1, 1, 10, 100]
  svm_gamma = [100, 10, 1, 0.1, 0.01, 0.001, 0.0001]
  forest_sizes = [100, 200, 400, 600]

  def _pipeline(classifier, with_pca=False):
    #scaler -> optional PCA -> sampler -> classifier
    steps = [('scaler', StandardScaler())]
    if with_pca:
      steps.append(('reductor', PCA()))
    steps.append(('sampler', sampler))
    steps.append(('classifier', classifier))
    return Pipelineim(steps)

  #KNN
  if method == 'KNN':
    param_grid = {'classifier__n_neighbors': list(range(1,20))}
    return GridSearchCV(_pipeline(KNeighborsClassifier()), param_grid, scoring=scoring, cv = gkf, verbose = verbose)

  #KNN with PCA
  if method == 'KNN_PCA':
    param_grid = {'reductor__n_components': reductor_components,
                  'classifier__n_neighbors': list(range(1,20))}
    return GridSearchCV(_pipeline(KNeighborsClassifier(), with_pca=True), param_grid, scoring=scoring, cv = gkf, verbose = verbose)

  #SVM with PCA
  if method == 'SVM_PCA':
    if grid == 'normal':
      param_grid = {'reductor__n_components': reductor_components,
                    'classifier__C': svm_c,
                    'classifier__gamma': svm_gamma,
                    'classifier__kernel': ['rbf']}
      return GridSearchCV(_pipeline(SVC(), with_pca=True), param_grid, scoring=scoring, cv = gkf, verbose = verbose)
    if grid == 'random':
      parameters = {'reductor__n_components': reductor_components,
                    'classifier__C': scipy.stats.expon(scale=10),
                    'classifier__gamma': scipy.stats.expon(scale=.001),
                    'classifier__kernel': ['rbf'],
                    'classifier__class_weight':['balanced']}
      return RandomizedSearchCV(_pipeline(SVC(), with_pca=True), parameters, n_iter=15, scoring=scoring, cv = gkf, verbose=verbose, return_train_score=False)
    return None

  #SVM no PCA
  if method == 'SVM':
    if grid == 'normal':
      param_grid = {'classifier__C': svm_c,
                    'classifier__gamma': svm_gamma,
                    'classifier__kernel': ['rbf'],
                    'classifier__class_weight':['balanced']}
      return GridSearchCV(_pipeline(SVC()), param_grid, scoring=scoring, cv = gkf, verbose = verbose)
    if grid == 'random':
      parameters = {'classifier__C': scipy.stats.expon(scale=10),
                    'classifier__gamma': scipy.stats.expon(scale=.001),
                    'classifier__kernel': ['rbf'],
                    'classifier__class_weight':['balanced']}
      return RandomizedSearchCV(_pipeline(SVC()), parameters, n_iter=15, scoring=scoring, cv = gkf, verbose=verbose, return_train_score=False)
    return None

  #Random forest
  if method == 'RF':
    param_grid = {'classifier__n_estimators': forest_sizes}
    return GridSearchCV(_pipeline(RandomForestClassifier()), param_grid, scoring=scoring, cv = gkf, verbose = verbose)

  #Random forest with PCA
  if method == 'RF_PCA':
    param_grid = {'reductor__n_components': reductor_components,
                  'classifier__n_estimators': forest_sizes}
    return GridSearchCV(_pipeline(RandomForestClassifier(), with_pca=True), param_grid, scoring=scoring, cv = gkf, verbose = verbose)

  return None


#Define identity transformation
class IdentityTransformer(BaseEstimator, TransformerMixin):
    """Pass-through pipeline step: learns nothing and returns its input multiplied by 1."""

    def __init__(self):
        """No hyperparameters to store."""

    def fit(self, input_array, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, input_array, y=None):
        """Return the input multiplied by 1 (values unchanged)."""
        unchanged = input_array*1
        return unchanged

def model2performance_metrics_strat(extractor='dense', method = 'SVM', grid = 'normal', verbose = 1, reductor_components = [1], scoring = 'accuracy', sampler = SMOTE(), max_corr=1, fold=0, mag=0):
  """
  Loads one fold/magnification, builds the stratified-grouped hyperparameter search for the chosen method,
  fits it and reports its performance.

  :param extractor: name of extractor of features used ('dense' and 'GLCM' are read from pickle files, the rest from csv)
  :param method, grid, verbose, reductor_components, scoring, sampler: forwarded to classifier_and_grid_strat
  :param max_corr: Maximum correlation allowed between features
  :param fold: zero-based fold index
  :param mag: magnification as key of mag_dict
  :return: patient scores, [patient score, ID] pairs, recognition rate, image-wise accuracy and fitted model. Additionally, the metrics and the problematic patients are printed.
  """
  #Pick the loader that matches how the features of this extractor are stored
  if extractor in ['dense','GLCM']:
    read_files_type = read_files_pickle
  else:
    read_files_type = read_files_csv

  X_train, X_test, y_train, y_test, ID_train, ID_test = read_files_type(extractor=extractor, fold=fold, mag=mag)

  #The search needs the training patients' ID to build its grouped CV splits
  search = classifier_and_grid_strat(X_train, y_train, ID_train, method, grid = grid, verbose=verbose, reductor_components=reductor_components, scoring = scoring, sampler =sampler)

  #Fit, predict and compare predicted with real labels
  df_comparison, model = machine_learning_pipeline(X_train, X_test, y_train, y_test, ID_test, pipe=search, max_corr=max_corr)

  #Patient-level and image-level metrics
  p_score_list, p_score_ID_list, rec_rate, acc = performance_metrics(df_comparison)
  p_problem = get_problematic_patients(p_score_ID_list)

  #Report
  report = [
    f'-For fold {fold+1} and magnification {mag_dict[mag]}:\n',
    f'Recognition rate: {rec_rate}',
    f'Image-wise accuracy: {acc}',
    f'Problematic patients: {p_problem}\n', #Problematic patients' ID and score
  ]
  for line in report:
    print(line)

  return p_score_list, p_score_ID_list, rec_rate, acc, model

def result_all_folders_strat(extractor = 'dense', method = 'SVM', grid = 'normal', verbose = 1, reductor_components = [1], scoring = 'accuracy', sampler = IdentityTransformer(), max_corr=1, mag=0):
  """
  Runs model2performance_metrics_strat on all 5 folds of one magnification and summarises the recognition rate.

  :param extractor: name of extractor of features used
  :param method, grid, verbose, reductor_components, scoring, sampler: search settings, forwarded unchanged
  :param max_corr: Maximum correlation allowed between features
  :param mag: magnification as key of mag_dict
  :return: mean and standard deviation of the recognition rate over the folds
  """
  n_folds = 5
  all_rec_rates = np.zeros(n_folds) #To save recognition rates
  #Settings that stay the same for every fold
  settings = dict(extractor=extractor, method = method, grid = grid, verbose = verbose, reductor_components = reductor_components, scoring = scoring, sampler = sampler, max_corr=max_corr, mag=mag)

  for fold in range(n_folds):
    _, _, rec_rate, _, model = model2performance_metrics_strat(fold=fold, **settings) #Fold-wise learning method
    all_rec_rates[fold] = rec_rate #save recognition rate
    if hasattr(model,'best_estimator_'):
      print(f'Grid best estimator: {model.best_estimator_}\n') #Print the best estimator hyperparameters of the search
      #print(f'Best estimator number of Principal Components: {model.best_estimator_.named_steps["reductor"].explained_variance_ratio_.shape[0]}\n')
      print('############\n')

  rec_rate_mean, rec_rate_std = all_rec_rates.mean(), all_rec_rates.std()
  print(f'---- Mean recognition rate for magnification {mag_dict[mag]}x: {rec_rate_mean}') #Mean recognition rate (final metric)
  return rec_rate_mean, rec_rate_std

In [None]:
#Functions to print information about the models evolution based on neighbors number.
def KNN_neighbors_check(model):
  """
  After fitting a KNN search, plots the mean cross-validation score of every candidate.
  The x axis is the candidate's position in cv_results_, not the n_neighbors value itself.

  :param model: fitted search object exposing cv_results_ (e.g. the KNN grid search)
  :return: NONE, the graph is displayed
  """
  scores_mean = np.array(model.cv_results_['mean_test_score'])

  # Plot Grid search scores
  fig, ax = plt.subplots(1,1)
  ax.plot(scores_mean, '-o')
  ax.set_title("Grid Search Scores", fontsize=20, fontweight='bold')
  ax.set_xlabel('Num_neighbors', fontsize=16)
  ax.set_ylabel('CV Average Score', fontsize=16)
  ax.grid(True)
  #plt.show() displays the figure; plt.imshow() expects an image argument and fails when called without one
  plt.show()

# Model training

## Binary classification

In [None]:
#Settings shared by every run of the experiment
grid = 'random'
verbose = 3
reductor_components = [0.95] #For PCA only
scoring = 'MCC'
max_corr = 0.99

methods = ['KNN','KNN_PCA','SVM','SVM_PCA','RF','RF_PCA'] #All method used
n_samplers = 2 #With oversampling (SMOTE) and without (identity)

for extractor in ['fbp','GLCM','PFTAS','Gabor','dense']: #fbp is LBP typo #All features used
  #One result matrix per extractor: a row per (method, sampler) pair; column 0 is the run name and
  #columns 1-8 are mean and standard deviation of the recognition rate for the 4 magnifications.
  #It is allocated here, and the row counter reset, because a single 12-row matrix cannot hold
  #the rows of all the extractors.
  super_matrix = np.zeros((len(methods)*n_samplers,9),dtype='object')
  i = 0 #Matrix row counter
  for method in methods:
    for sampler in [SMOTE(), IdentityTransformer()]: #With oversample or without

      #Magnification level model comparison
      rec_rate_all = np.zeros((4,2)) #Row is magnification, column is mean and standard deviation
      for mag in [0,1,2,3]:
        #Predict for magnification
        rec_rate_all[mag,:] = result_all_folders_strat(extractor = extractor, method = method, grid = grid, verbose = verbose, reductor_components = reductor_components, scoring = scoring, sampler =sampler, max_corr=max_corr, mag = mag)
      super_matrix[i,1:] = rec_rate_all.ravel()
      super_matrix[i,0] = extractor+'_'+method+'_'+str(sampler)
      i = i+1

  #Save the results of this extractor before moving on to the next one
  with open(f'/content/drive/MyDrive/Ars_machinae_autodiscentis/Inceptum/Machine_Learning/Results/super_matrix_{extractor}.p','wb') as handle:
    pickle.dump(super_matrix, handle, pickle.HIGHEST_PROTOCOL)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[CV 5/5] END classifier__C=5.692183948934706, classifier__class_weight=balanced, classifier__gamma=0.0035371823813547362, classifier__kernel=rbf, reductor__n_components=0.95;, score=0.151 total time=   0.2s
[CV 1/5] END classifier__C=13.056462572313004, classifier__class_weight=balanced, classifier__gamma=0.00040249082218104566, classifier__kernel=rbf, reductor__n_components=0.95;, score=-0.030 total time=   0.2s
[CV 2/5] END classifier__C=13.056462572313004, classifier__class_weight=balanced, classifier__gamma=0.00040249082218104566, classifier__kernel=rbf, reductor__n_components=0.95;, score=0.215 total time=   0.2s
[CV 3/5] END classifier__C=13.056462572313004, classifier__class_weight=balanced, classifier__gamma=0.00040249082218104566, classifier__kernel=rbf, reductor__n_components=0.95;, score=0.188 total time=   0.2s
[CV 4/5] END classifier__C=13.056462572313004, classifier__class_weight=balanced, classifier__gamma=

### Combination of features (No improvements)

In [None]:
#Feature sets to concatenate, each paired with the loader that matches how it is stored
extractor_1, read_files_type_1 = 'PFTAS', read_files_csv
extractor_2, read_files_type_2 = 'dense', read_files_pickle
extractor_3, read_files_type_3 = 'fbp', read_files_csv

# fold=1
# mag=1
pipe = classifier_and_grid(method='SVM_PCA',grid='random',verbose=1, reductor_components=[0.9,0.95,0.99])

for mag in range(4):
  all_rec_rate = np.zeros(5)
  print(f'For magnification {mag_dict[mag]}\n')
  for fold in range(5):
    #Labels and patients' ID are taken from the second feature set only; the other two contribute features.
    #NOTE(review): this assumes the three loaders return the images in the same order - confirm.
    X_train_1, X_test_1, _, _, _, _ = read_files_type_1(extractor=extractor_1, fold=fold, mag=mag) #Split data
    X_train_2, X_test_2, y_train, y_test, _, ID_test = read_files_type_2(extractor=extractor_2, fold=fold, mag=mag) #Split data
    X_train_3, X_test_3, _, _, _, _ = read_files_type_3(extractor=extractor_3, fold=fold, mag=mag) #Split data

    #The csv loaders return dataframes; convert them to np before stacking with the pickle arrays
    X_train_1, X_test_1 = X_train_1.to_numpy(), X_test_1.to_numpy()
    X_train_3, X_test_3 = X_train_3.to_numpy(), X_test_3.to_numpy()

    #Column-wise concatenation of the three feature sets
    X_train = np.concatenate((X_train_1, X_train_2, X_train_3), axis=1)
    X_test = np.concatenate((X_test_1, X_test_2, X_test_3), axis=1)

    df_comparison, model = machine_learning_pipeline(X_train, X_test, y_train, y_test, ID_test, pipe=pipe) #Main pipeline. Obtain comparison of labels
    p_score_list, p_score_ID_list, rec_rate, acc = performance_metrics(df_comparison) #Get performance metrics
    p_problem = get_problematic_patients(p_score_ID_list)
    print(rec_rate)
    if hasattr(model,'best_estimator_'):
      print(f'Grid best estimator: {model.best_estimator_}\n') #Print the best estimator hyperparameters of the search
    print(f'Best estimator PCA explained variance: {model.best_estimator_.named_steps["reductor"].explained_variance_ratio_}\n')
    all_rec_rate[fold] = rec_rate
  print(f'Average:{all_rec_rate.mean()}')

For magnification 40

Fitting 5 folds for each of 20 candidates, totalling 100 fits
0.9087082371302525
Grid best estimator: Pipeline(steps=[('scaler', StandardScaler()),
                ('reductor', PCA(n_components=0.95)),
                ('classifier',
                 SVC(C=2.9918497031144344, class_weight='balanced',
                     gamma=0.0005357589497553396))])

Best estimator PCA explained variance: [0.13978326 0.11691509 0.06042964 0.04577836 0.04091022 0.03651289
 0.0332255  0.0283872  0.02273528 0.0217425  0.02024462 0.01872558
 0.01667126 0.01471094 0.0119776  0.01171285 0.01103857 0.01036154
 0.00935244 0.00890508 0.0084887  0.00776875 0.00738642 0.00687588
 0.00658573 0.00610672 0.00575833 0.00556373 0.00531212 0.00508322
 0.00466614 0.00454629 0.00438981 0.00412047 0.00409648 0.00386395
 0.00375999 0.00350717 0.00330305 0.0031587  0.00308052 0.00298095
 0.0029539  0.00283577 0.00278471 0.00260739 0.00258846 0.0024763
 0.00239707 0.00232705 0.00228683 0.0022325  0.00

## Multiclass classification

### Plain-classification (No hierarchy) (Vanilla version)

In [None]:
def get_multi_class(paths):
  '''
  Encodes the tumour sub-type found in each image path as an integer from 0 to 7.

  The sub-type is read from the file name (last '/'-separated component, or the
  whole string when it contains no '/'): it is the text between the first '_'
  and the first '-' that follows it.

  input: paths => iterable of path strings (e.g. the list of the paths in a folder).
  output: plain Python list of int codes, in the same order as the recognised paths.
          NOTE: paths whose sub-type is not one of the eight below are skipped
          silently, so the output can be shorter than the input.
  raises: IndexError when a file name contains no '_'.

  Benign
  B_A : adenosis (A)              -> 0
  B_F : fibroadenoma (F)          -> 1
  B_PT : phyllodes tumor (PT)     -> 2
  B_TA : tubular adenoma (TA)     -> 3

  Malignant
  M_DC : ductal carcinoma (DC)    -> 4
  M_LC : lobular carcinoma (LC)   -> 5
  M_MC : mucinous carcinoma (MC)  -> 6
  M_PC : papillary carcinoma (PC) -> 7
  '''
  reps = {'B_A': 0, 'B_F': 1, 'B_PT': 2, 'B_TA': 3, 'M_DC': 4, 'M_LC': 5, 'M_MC': 6, 'M_PC': 7}
  cat = []
  for path in paths:
    file_name = path.rsplit('/', 1)[-1]  # [-1] also accepts a bare file name without any '/'
    slide_class = file_name.split('_', 1)[1].split('-', 1)[0]
    if slide_class in reps:  # unknown sub-types are dropped, as before
      cat.append(reps[slide_class])
  return cat

In [None]:
# Table of image paths, indexed below as paths[fold - 1, train(0)/test(1), magnification index].
# The context manager closes the file handle once the pickle has been read.
# NOTE: pickle can execute arbitrary code on load; only open files you trust.
with open('/content/drive/MyDrive/Ars_machinae_autodiscentis/Inceptum/paths.p', 'rb') as handle:
  paths = pickle.load(handle)

def read_files_multi_class(extractor, fold, mag):
  '''
  Reads the train/test feature files of one fold and magnification, pairing them
  with multi-class labels (0 to 7) derived from the global `paths` table.

  extractor: Feature extractor - String (used to build the csv file name).
  fold: Fold number - int, 1-based (paths is indexed with fold-1).
  mag: Magnification factor - int, one of 40, 100, 200, 400.

  Returns X_train, y_train, df_patient_train, X_test, y_test, df_patient_test:
    X_*          => dataframe with every column but the first and the last one.
    y_*          => np.array of class codes from get_multi_class.
    df_patient_* => dataframe holding only the first column (patient Id's).

  Raises ValueError when mag is not a supported magnification factor.
  NOTE(review): the labels come from `paths`, not from the csv, so they are
  assumed to be in the same row order as the csv - confirm.
  '''
  mag_to_index = {40: 0, 100: 1, 200: 2, 400: 3}
  if mag not in mag_to_index:
    # Fail before touching Drive instead of hitting an unbound mag_convert later
    raise ValueError(f'Unsupported magnification factor {mag!r}; expected one of {sorted(mag_to_index)}')
  mag_convert = mag_to_index[mag]
  fold_convert = fold - 1

  data_train = pd.read_csv(f'/content/drive/MyDrive/Ars_machinae_autodiscentis/Inceptum/fold{fold}/train/{extractor}_f{fold}_train_{mag}x_fv.csv', delimiter='\t', index_col=0)
  X_train = data_train.iloc[:, 1:-1]
  y_train = np.array(get_multi_class(paths[fold_convert, 0, mag_convert]))  # 0 => train paths
  df_patient_train = data_train.iloc[:, 0:1]  # Just patient Id's

  data_test = pd.read_csv(f'/content/drive/MyDrive/Ars_machinae_autodiscentis/Inceptum/fold{fold}/test/{extractor}_f{fold}_test_{mag}x_fv.csv', delimiter='\t', index_col=0)
  X_test = data_test.iloc[:, 1:-1]
  y_test = np.array(get_multi_class(paths[fold_convert, 1, mag_convert]))  # 1 => test paths
  df_patient_test = data_test.iloc[:, 0:1]  # Just patient Id's

  return X_train, y_train, df_patient_train, X_test, y_test, df_patient_test

In [None]:
#Multi-class run of the PFTAS features: one avg_rec_rate call per magnification factor.
#Each variable keeps the whole value returned by avg_rec_rate for that magnification.
extractor = 'PFTAS'
average_40, average_100, average_200, average_400 = [
  avg_rec_rate(mag_factor, extractor, multi=True) for mag_factor in (40, 100, 200, 400)
]
print(f'Averages among magnifications: {average_40, average_100, average_200, average_400}')

Averages among magnifications: ((0.4032868869689814, 0.38608759517892366, 0.34093176379465273), (0.36763193750830586, 0.3435266060002509, 0.3120065046391135), (0.40449107772115517, 0.3854549518341167, 0.3532085700735391), (0.4109311159686153, 0.3854721614344069, 0.3449384000437748))


In [None]:
#Same sweep over the four magnification factors, using avg_rec_rate's default mode (no multi flag).
#Each variable keeps the whole value returned by avg_rec_rate for that magnification.
extractor = 'PFTAS'
average_40, average_100, average_200, average_400 = [
  avg_rec_rate(mag_factor, extractor) for mag_factor in (40, 100, 200, 400)
]
print(f'Averages among magnifications: {average_40, average_100, average_200, average_400}')

Averages among magnifications: ((0.7008711554707434, 0.6937476647809216, 0.6728499553907232), (0.7383497267914324, 0.7339436546085664, 0.7241907639689833), (0.7255634989840556, 0.7253045347738902, 0.7150454881468997), (0.7272674479213309, 0.7214692743251229, 0.7080717285962393))


In [None]:
#Multi-class run of the PFTAS features, unpacking the three values returned per magnification.
extractor = 'PFTAS'
(
  (rec_40, acc_40, f1_40),
  (rec_100, acc_100, f1_100),
  (rec_200, acc_200, f1_200),
  (rec_400, acc_400, f1_400),
) = [avg_rec_rate(mag_factor, extractor, multi=True) for mag_factor in (40, 100, 200, 400)]

#One line per metric, in the order rec_*, acc_*, f1_* (all three lines share the same label)
print(
  f'Averages among magnifications: {rec_40, rec_100, rec_200, rec_400}',
  f'Averages among magnifications: {acc_40, acc_100, acc_200, acc_400}',
  f'Averages among magnifications: {f1_40, f1_100, f1_200, f1_400}',
  sep='\n',
)

Averages among magnifications: (0.4032868869689814, 0.36763193750830586, 0.40449107772115517, 0.4109311159686153)
Averages among magnifications: (0.38608759517892366, 0.3435266060002509, 0.3854549518341167, 0.3854721614344069)
Averages among magnifications: (0.34093176379465273, 0.3120065046391135, 0.3532085700735391, 0.3449384000437748)


In [None]:
#Default-mode run (no multi flag) of the PFTAS features, unpacking the three values returned per magnification.
extractor = 'PFTAS'
(
  (rec_40, acc_40, f1_40),
  (rec_100, acc_100, f1_100),
  (rec_200, acc_200, f1_200),
  (rec_400, acc_400, f1_400),
) = [avg_rec_rate(mag_factor, extractor) for mag_factor in (40, 100, 200, 400)]

#One line per metric, in the order rec_*, acc_*, f1_* (all three lines share the same label)
print(
  f'Averages among magnifications: {rec_40, rec_100, rec_200, rec_400}',
  f'Averages among magnifications: {acc_40, acc_100, acc_200, acc_400}',
  f'Averages among magnifications: {f1_40, f1_100, f1_200, f1_400}',
  sep='\n',
)

Averages among magnifications: (0.7008711554707434, 0.7383497267914324, 0.7255634989840556, 0.7272674479213309)
Averages among magnifications: (0.6937476647809216, 0.7339436546085664, 0.7253045347738902, 0.7214692743251229)
Averages among magnifications: (0.6728499553907232, 0.7241907639689833, 0.7150454881468997, 0.7080717285962393)


# Best models extraction

From the results obtained above, we extract the best models.

## Special functions

In [None]:
def model_and_metrics(extractor='dense', method = 'SVM', grid = 'normal', verbose = 1, reductor_components = [1], scoring = 'accuracy', sampler = SMOTE(), max_corr=1, fold=0, mag=0):
  """
  Load one fold/magnification split, fit the requested classifier and report its metrics.

  :param extractor: name of the feature extractor; 'dense' and 'GLCM' are read with
                    read_files_pickle, any other value with read_files_csv
  :param method, grid, verbose, reductor_components, scoring, sampler: forwarded unchanged
                    to classifier_and_grid_strat, which builds the pipeline
  :param max_corr: Maximum correlation allowed between features
  :param fold: fold index, zero-based (reported as fold+1)
  :param mag: magnification index, key of the global mag_dict
  :return: p_score_list, p_score_ID_list, rec_rate, acc, model, auc.
           Metrics and problematic patients are also printed.
  """
  # Choose the reader that matches how this extractor's features are stored
  if extractor in ('dense', 'GLCM'):
    load_split = read_files_pickle
  else:
    load_split = read_files_csv
  X_train, X_test, y_train, y_test, ID_train, ID_test = load_split(extractor=extractor, fold=fold, mag=mag)

  # Build the pipeline, then fit and predict to get the real-vs-predicted table
  pipe = classifier_and_grid_strat(
    X_train, y_train, ID_train, method,
    grid=grid, verbose=verbose, reductor_components=reductor_components,
    scoring=scoring, sampler=sampler,
  )
  df_comparison, model, auc = machine_learning_pipeline_binary(
    X_train, X_test, y_train, y_test, ID_test, pipe=pipe, max_corr=max_corr,
  )

  # Derive the metrics and the problematic patients from the comparison table
  p_score_list, p_score_ID_list, rec_rate, acc = performance_metrics(df_comparison)
  p_problem = get_problematic_patients(p_score_ID_list)

  # Report
  print(f'-For fold {fold+1} and magnification {mag_dict[mag]}:\n')
  print(f'Recognition rate: {rec_rate}')
  print(f'Image-wise accuracy: {acc}')
  print(f'Problematic patients: {p_problem}\n')

  return p_score_list, p_score_ID_list, rec_rate, acc, model, auc

def machine_learning_pipeline_binary(X_train, X_test, y_train, y_test, ID_test, pipe, max_corr=1):
  """
  Binary-classification pipeline: drop correlated features, fit, predict and score.

  :param X_train, X_test: feature matrices (dataframes)
  :param y_train, y_test, ID_test: labels and test patients' ID (np.arrays)
  :param pipe: Pipeline defined using sklearn; it is fitted here
  :param max_corr: Maximum correlation accepted among features
  :return: (df_comparison, model, auc) where df_comparison has the columns
           ID, y_test, y_pred and comparison (1 when prediction equals real label, else 0),
           model is the fitted pipe and auc is the area under the ROC curve
           computed from the predicted labels
  """
  print(f'Input features: {X_train.shape[1]}\n')
  # Filter correlated features out of both splits before fitting
  train_features, test_features = remove_correlated(X_train, X_test, max_corr=max_corr)
  print(f'No-correlated features: {train_features.shape[1]}\n')

  model = pipe.fit(train_features, y_train.ravel())
  predictions = model.predict(test_features)

  # ROC is built from the hard predictions returned by predict()
  fpr, tpr, _ = metrics.roc_curve(y_test, predictions)
  auc = metrics.auc(fpr, tpr)

  # Single-column frames for the predicted and the real labels
  predicted_frame = pd.DataFrame(predictions)
  predicted_frame.columns = ['y_pred']
  real_frame = pd.DataFrame(y_test)
  real_frame.columns = ['y_test']

  # Start from a copy of the patient IDs and attach both label columns to it
  df_comparison = pd.DataFrame(ID_test).copy()
  df_comparison.columns = ['ID']
  df_comparison['y_test'] = real_frame['y_test']
  df_comparison['y_pred'] = predicted_frame['y_pred']

  # 1 where the prediction matches the real label, 0 otherwise
  is_match = df_comparison['y_test'] == df_comparison['y_pred']
  df_comparison['comparison'] = np.where(is_match, 1, 0)

  return df_comparison, model, auc

## Extract best models

In [None]:
#Table of best models: one row per magnification index, columns filled later as (model, AUC, accuracy)
best_pred_binary = np.zeros(shape=(4, 3), dtype=object)

In [None]:
#Fixed settings (shared by every run)
extractor = 'dense'
grid = 'random'
scoring = 'MCC'
reductor_components = [0.95,0.99]
max_corr = 0.99
verbose = 3

#Setting to change: fold index (zero-based, model_and_metrics reports it as fold+1)
fold = 0

#Settings that depend on the magnification being processed (mag is a key of mag_dict)
mag = 3
method = 'RF'
sampler = SMOTE()

#Build the model and collect its metrics
p_score_list, p_score_ID_list, rec_rate, acc, model, auc = model_and_metrics(
  extractor=extractor,
  method=method,
  grid=grid,
  verbose=verbose,
  reductor_components=reductor_components,
  scoring=scoring,
  sampler=sampler,
  max_corr=max_corr,
  fold=fold,
  mag=mag,
)
#Save model, AUC and accuracy in the row of this magnification
best_pred_binary[mag,:] = [model, auc, acc]

In [None]:
#Persist the table of best models (model, AUC, accuracy per magnification) as a pickle file on Drive
with open(
  '/content/drive/MyDrive/Ars_machinae_autodiscentis/Inceptum/Machine_Learning/Results/Best_models/Best_pred_binary.p',
  'wb',
) as handle:
  pickle.dump(best_pred_binary, handle, protocol=pickle.HIGHEST_PROTOCOL)