### **Frequency analysis for HC18 segmentation task**

**Description:** This notebook provides the code for:

1.   PCA, non-uniform Fourier transform & frequency error computation functions 
2.   Opening reference data and data formatting functions
3.   Storing and saving results functions
4.   Loop to execute the above functions for the various optimizers and seeds

**STEP 1 - Frequency analysis functions**

In [None]:
!pip install nfft

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from sklearn.decomposition import PCA
import nfft

def get_pca_coeffs(x):

  '''Fit a one-component PCA on x and project x onto that component.

  x: Nsamples x Nfeatures

  return: fitted PCA object, numpy array
  (PCA model holding the first PC of x, projection of x on that PC with
  singleton dimensions squeezed out)
  '''

  pca = PCA(n_components=1)
  # Only the fitted model is needed at this point: the projection is computed
  # on the next line, so the output of a fit_transform call would be thrown away.
  pca.fit(x)
  projection = np.squeeze(pca.transform(x))

  return pca, projection

def F_effect_nn(pca_out,y,x,x_sort,FGT):

    '''This function computes the relative frequency error between the network
    output and the ground truth, both seen as functions of x, where:

    pca_out: fitted PCA of the output masks (its transform() is used to project y)
    y; Nsamples x Npixels: Unet output masks flattened
    x; Nsamples: projection of the input images on the first PC of the input images
    x_sort: x values sorted in increasing order
    FGT: non-uniform Fourier coefficients of the ground truth function
         proj(yGT, pca_out) = f(x)

    return: numpy array
    (frequency error |FGT - FT| / |FGT|, one value per frequency index)
    '''

    # projection on pca_out
    y = np.squeeze(pca_out.transform(y))

    # Reorder the projected outputs so that they line up with x_sort
    y_sort = y[np.argsort(x)]

    # Non-uniform Fourier transform introduced in [1]. The number of frequency
    # indexes follows the ground-truth coefficients (20 in this notebook) so
    # that both spectra can be compared element-wise.
    # NOTE(review): the nfft package documents sample locations as lying in
    # [-1/2, 1/2); x_sort is not rescaled here - confirm this is intended.
    FT = nfft.nfft_adjoint(x_sort,y_sort,len(FGT))

    # Computes delta F. The FGT argument is used here rather than a notebook
    # global named F_GT, so the function no longer depends on outside state.
    F_err = np.abs(FGT - FT) / np.abs(FGT)

    return F_err

#[1]: @inproceedings{xu2019training,
#  title={Training behavior of deep neural network in frequency domain},
#  author={Xu, Zhi-Qin John and Zhang, Yaoyu and Xiao, Yanyang},
#  booktitle={International Conference on Neural Information Processing},
#  pages={264--274},
#  year={2019},
#  organization={Springer}
#}


def analize_training_nn(path, tr_output, pca_out, x, x_sort, F_GT, do_FPrinciple = True):

  '''
  This function loops over training data files to compute the frequency analysis.
    path: the folder where to find the DNN training output masks per epoch as .npy
          (files whose name contains 'train' and '.npy', with the epoch number
          written after the last '=' as in 'train_epoch=3.npy')
    tr_output: train ground truth output masks, compared element-wise with each
          loaded file to compute the accuracy
    pca_out: is the first PC of train output masks

    x; Nsamples: is the first projection of training input images on first PC of training input images
    x_sort: x values sorted in increasing order
    F_GT: non-uniform Fourier coefficients of ground truth function proj(yGT,pca_out) = x (training)
    do_FPrinciple: when False the frequency error is skipped and the third
          returned list stays empty

    return list, list, list
    (epochs, accuracies, freq errors)
  '''

  def epoch_of(name):
    # 'train_epoch=12.npy' -> 12
    return int(name.split('=')[-1].split('.')[0])

  # Sort numerically on the epoch (a plain string sort would put 10 before 2)
  files = [name for name in os.listdir(path) if (('train' in name) and ('.npy' in name))]
  files.sort(key=epoch_of)

  epoch_tr = []
  accuracy_tr = []
  F = []

  # Only the first 100 epochs are analysed
  for name in files[:100]:

        print(name)
        # os.path.join works whether or not `path` ends with a separator
        y = np.load(os.path.join(path, name))

        # We compute the segmentation accuracy
        res = np.mean(np.mean(y == tr_output,axis=1))
        accuracy_tr.append(res)

        if do_FPrinciple:
            # Compute the frequency error for the given dataset
            F_err = F_effect_nn(pca_out,y,x,x_sort,F_GT)[np.newaxis,:]
            F.append(F_err)

        epoch_tr.append(epoch_of(name)) # keep track on epochs order

  return epoch_tr, accuracy_tr, F


def analize_val(path, v_output):

  '''
    This function loops over validation data files to compute the val accuracy.
    path: the folder where to find the DNN val output masks per epoch as .npy
          (files whose name contains 'val' and '.npy', with the epoch number
          written after the last '=' as in 'val_epoch=3.npy')
    v_output: val ground truth output masks to compute the accuracy

    return: list, list
    (epochs, accuracies)
  '''

  def epoch_of(name):
    # 'val_epoch=12.npy' -> 12
    return int(name.split('=')[-1].split('.')[0])

  # Sort numerically on the epoch (a plain string sort would put 10 before 2)
  files = [name for name in os.listdir(path) if (('val' in name) and ('.npy' in name))]
  files.sort(key=epoch_of)

  accuracy = []
  epoch = []

  # Only the first 100 epochs are analysed
  for name in files[:100]:

      print(name)

      # os.path.join works whether or not `path` ends with a separator
      y = np.load(os.path.join(path, name))
      # Compute the accuracy
      res = np.mean(np.mean(y == v_output,axis=1))

      accuracy.append(res)
      epoch.append(epoch_of(name))

  return epoch, accuracy

**STEP 1b - Data loading and formatting functions**

In [None]:
import pandas as pd
import numpy as np
import PIL
import os


def extract_files(path):
  ''' Function to help load data. Performs basic pre-processing on images (normalization, masks to float)
  path: folder where the training images are stored. Every file whose name does
        not contain '_Annotation' is read as an image; its mask is read from the
        file obtained by replacing '.png' with '_Annotation.png' in the name.

  return: list, list (two lists containing the images and masks respectively,
  in the order returned by os.listdir)
  '''

  images = []
  masks = []

  def read_array(name):
    # Function-scope import: a bare `import PIL` does not by itself guarantee
    # that the PIL.Image submodule has been loaded.
    import PIL.Image
    # os.path.join works whether or not `path` ends with a separator
    return np.array(PIL.Image.open(os.path.join(path, name)))

  def format_mask(mask):
    # Binary mask as float: 1.0 where the annotation is non-zero
    return (mask>0).astype(float)

  def normalize(im):
    # Min-max rescaling of the image to [0, 1]
    im = im.astype(float)
    im = (im - np.min(im))/(np.max(im)-np.min(im))
    return im

  # NOTE(review): the listing order is deliberately left untouched (no sorting)
  # because the seeded train/val split indexes into this order - presumably the
  # saved network outputs were produced with the same order; confirm before changing.
  for file in os.listdir(path):
    if '_Annotation' in file:
      continue

    images.append(normalize(read_array(file)))
    masks.append(format_mask(read_array(file.replace('.png','_Annotation.png'))))

  return images, masks


def load_data(trpath = '/content/drive/MyDrive/OptML_Data/format_train/', 
              vpath = '/content/drive/MyDrive/OptML_Data/format_val/'):
  
  '''This function loads the validation and training data from their respective folders and fuses them to allow the 
  automatic change of train / val split depending on random shuffling seed. 

  trpath: folder holding the training images and their '_Annotation' masks
  vpath: folder holding the validation images and their '_Annotation' masks

  return: list, list (two lists with the fused images and masks respectively,
  training samples first, then validation samples)
  '''

  # Each folder is read exactly once (reading the images is the expensive part)
  train_input, train_output = extract_files(trpath)
  plus_input, plus_output = extract_files(vpath)

  # Concatenate along the sample axis, then go back to a list of per-sample arrays
  train_input_ = list(np.concatenate([train_input, plus_input]))
  train_output_ = list(np.concatenate([train_output, plus_output]))

  return train_input_, train_output_


def split_train_val(train_input_, SEED=0, train_output_=None, n_val=300):

  '''This function splits the fused dataset into n_val validation samples
  (300 by default) and the remaining training samples (699 for the data used
  in this notebook) after random shuffling with seed SEED.

  train_input_: list of input images
  SEED: seed given to np.random.seed before shuffling the sample indexes
  train_output_: list of masks aligned with train_input_. When left to None the
    notebook-level variable `train_output_` is used, which keeps the historical
    call `split_train_val(train_input_, SEED=SEED)` working.
  n_val: number of shuffled samples assigned to the validation set

  Return: numpy array, numpy array, numpy array, numpy array
  (train images, train masks, val images, val masks resp.)
  '''

  if train_output_ is None:
    # Backward compatibility: this function used to read the masks from a
    # global variable instead of receiving them as an argument.
    try:
      train_output_ = globals()['train_output_']
    except KeyError:
      raise NameError("name 'train_output_' is not defined") from None

  if len(train_output_) != len(train_input_):
    raise ValueError('train_input_ and train_output_ must have the same number of samples')

  # Seeded shuffle of the sample indexes (the global numpy RNG is re-seeded so
  # that the same SEED always reproduces the same split)
  indexes = np.arange(len(train_input_))
  np.random.seed(SEED)
  np.random.shuffle(indexes)

  v_indexes = indexes[:n_val]
  tr_indexes = indexes[n_val:]

  # Convert once, then pick the rows of each subset
  inputs = np.array(train_input_)
  outputs = np.array(train_output_)

  train_input = inputs[tr_indexes,:]
  train_output = outputs[tr_indexes,:]

  val_input = inputs[v_indexes,:]
  val_output = outputs[v_indexes,:]

  print(len(val_input), len(train_input))

  return train_input, train_output, val_input, val_output


def format_tr_v(train_input, train_output, val_input, val_output):

  '''Flatten every image / mask so that each dataset becomes a 2D array with
  one row per sample, as needed by the frequency analysis.

  train_input (samples x W x H): input train images
  train_output (samples x W x H): output train masks
  val_input (samples x W x H): input val images
  val_output (samples x W x H): output val masks

  return: numpy array, numpy array, numpy array, numpy array
  (formatted train input, train output, val input, val output)
  '''

  def as_rows(samples):
    # Column-major ('F') flattening of each sample, one row per sample
    rows = []
    for sample in samples:
      rows.append(sample.flatten('F')[np.newaxis,:])
    return np.concatenate(rows,axis=0)

  tr_input = as_rows(train_input)
  tr_output = as_rows(train_output)
  v_input = as_rows(val_input)
  v_output = as_rows(val_output)

  return tr_input, tr_output, v_input, v_output

**STEP 2 - Functions to store and save the results**

In [None]:
def store_values(tr_output, pca_out, x,  x_sort, F_GT,  v_output, paths):

    '''
    Run the training and validation analyses on every folder of `paths` and
    collect the results folder by folder.

    paths: the list of folders (each folder / path = one method like SGD, Adam...) where to find the
    DNN output masks per epoch as .npy

    tr_output: train ground truth output masks to compute the train accuracy
    pca_out: is the first PC of train output masks
    x; Nsamples: is the first projection of training input images on first PC of training input images
    x_sort: x values sorted in increasing order
    F_GT: non-uniform Fourier coefficients of ground truth function proj(yGT,pca_out) = x (training)
    v_output: val ground truth output masks to compute the val accuracy

    Return: 6 lists, each with one entry per folder
    (paths, freq errors, val accuracies, train accuracies, val epochs, train epochs)

    '''

    PATH, Ferr = [], []      # folder paths / frequency errors per epoch
    ACC_V, ACC_T = [], []    # validation / training accuracy for each method
    EPS_V, EPS_T = [], []    # val / train epochs order

    for folder in paths:

        print(folder)

        train_epochs, train_acc, freq_err = analize_training_nn(
            folder, tr_output, pca_out, x, x_sort, F_GT, do_FPrinciple = True)
        val_epochs, val_acc = analize_val(folder, v_output)

        # Append this folder's results to the matching accumulator
        collected = (
            (PATH, folder),
            (Ferr, freq_err),
            (EPS_V, val_epochs),
            (EPS_T, train_epochs),
            (ACC_V, val_acc),
            (ACC_T, train_acc),
        )
        for bucket, value in collected:
            bucket.append(value)

    return PATH, Ferr, ACC_V, ACC_T, EPS_V, EPS_T


def save_summaries(PATH, Ferr, ACC_V, ACC_T, EPS_V, EPS_T):

  '''Save the results stored in the input parameters as .npy.

  For every folder in PATH, the files FP.npy, epoch_tr.npy, accuracy_tr.npy,
  epoch_v.npy and accuracy_v.npy are written in the folder obtained by
  replacing 'Results/' with 'Results/Summary_per_seed/' in the folder path.
  The destination folder is created when it does not exist.

    PATH: folder paths 
    Ferr: frequency errors per epoch
    ACC_V: validation accuracy for each method
    ACC_T: training accuracy for each method
    EPS_V: keep track of val epochs order
    EPS_T: keep track of train epochs order

  '''

  for k, path in enumerate(PATH):

    # Gather everything belonging to this folder before touching the disk
    summaries = (
        ('FP.npy', Ferr[k]),
        ('epoch_tr.npy', EPS_T[k]),
        ('accuracy_tr.npy', ACC_T[k]),
        ('epoch_v.npy', EPS_V[k]),
        ('accuracy_v.npy', ACC_V[k]),
    )

    savepath = path.replace('Results/','Results/Summary_per_seed/')
    print(savepath)

    # exist_ok avoids the check-then-create race of a separate os.path.exists test
    os.makedirs(savepath, exist_ok=True)

    for filename, values in summaries:
      # os.path.join works whether or not `savepath` ends with a separator
      np.save(os.path.join(savepath, filename), values)

**STEP 3 - Run the analysis for all the methods and store the results**

In [None]:
# Results folder of the seed-0 run; the folders of the other seeds are derived
# from it inside the loop by rewriting the 'seed = 0' part of the name.
mainpath = './Results/Logistic_loss epochs=100 val seed = 0'

# One sub-folder per optimizer
paths = [mainpath + '/Unet_' + method + '_batch=5/'
         for method in ('SCRN(r=5)', 'SGD', 'AdaHessian', 'Adam')]


# Fused train + val data, loaded once and re-split for every seed
train_input_, train_output_ = load_data()
SEEDS = list(range(5))


for SEED in SEEDS:

  # Reference data: seeded split, then one flattened row per sample
  train_input, train_output, val_input, val_output = split_train_val(train_input_, SEED=SEED)
  tr_input, tr_output, v_input, v_output = format_tr_v(train_input, train_output, val_input, val_output)

  # Ground truth Fourier transform: ground-truth masks projected on their
  # first PC, seen as a function of the images projected on their first PC
  pca_in, x = get_pca_coeffs(tr_input)
  pca_out, y = get_pca_coeffs(tr_output)
  order = np.argsort(x)
  y_sort = y[order]
  x_sort = np.sort(x)
  F_GT = nfft.nfft_adjoint(x_sort, y_sort, 20)

  # Point the base paths at the folders of the current seed
  paths_ = []
  for p in paths:
    paths_.append(p.replace('seed = 0', 'seed = ' + str(SEED)))

  # Run the analysis for the given seed and methods, then write the summaries
  PATH, Ferr, ACC_V, ACC_T, EPS_V, EPS_T = store_values(tr_output, pca_out, x, x_sort, F_GT, v_output, paths_)
  save_summaries(PATH, Ferr, ACC_V, ACC_T, EPS_V, EPS_T)

300 699
/content/drive/MyDrive/OptML_Data/Results/Logistic_loss epochs=100 val seed = 0/Unet_SCRN(r=5)_batch=5/
train_epoch=0.npy
train_epoch=1.npy
train_epoch=2.npy


OSError: ignored