## Preprocess the data stored in the NPZ folder (either the 100-frame single-tile dataset or the multi-tile map, depending on the selected directory)

In [4]:
# imports

import os
import sys

# Add the parent of the current working directory to the module search path
# (presumably so that the local `functions_EDX` module can be imported - confirm).
# `sys` has to be imported before this point; appending to `sys.path` ahead of
# `import sys` raises NameError on a fresh kernel.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.append(parent_dir)

import numpy as np
from sklearn.decomposition import PCA
import time
import matplotlib.pyplot as plt
from scipy.optimize import nnls 
from scipy.stats import zscore
from datetime import datetime
import cv2
from sklearn.preprocessing import MinMaxScaler
from functions_EDX import *
from scipy import signal

### Choose the directory containing the NPZ files 

In [5]:
# load file
# Names of all entries in the NPZ directory whose name ends with 'npz',
# in sorted order.
file_names = sorted(
    entry
    for entry in os.listdir('/Volumes/Microscopy3/EDX_data/old_datasets/NPZ/')
    if entry.endswith('npz')
)


In [None]:
# number of principal components kept
pc_keep = 10

# Directory the NPZ tiles are read from.
# NOTE(review): `file_names` must have been listed from this same directory - confirm
# before running.
npz_dir = '/path/to/NPZdirectory'

# Choose the output directory to save the folder containing the preprocessed files. 
# It could be the home directory containing the EMD and NPZ files (see structure below), or a new one 
# e.g. if storage doesn't allow saving them in the same drive.

# HomePath Structure:
# /path/to/directory
#   |-- EMD    (contains the EMD file)
#   |-- NPZ    
#   |-- Spectrum2D_extendedGauss1_3_5   (the preprocessed data)
output_root = '/path/to/OutputDirectory'
output_dir = os.path.join(output_root, 'Spectrum2D_extendedGauss1_3_5')

# Processing parameters, identical for every tile.
first_channel = 96       # spectral channels below this index are discarded
n_bins = 250             # number of bins passed to the spectral rebinning helpers
subsample_size = 1024    # XY size passed to the spatial rebinning helpers
radii = [1, 3, 5]        # radii passed to spectrum_plus for the averaged features

print("Start time: ",datetime.now())

# Only an already-existing folder is tolerated; any other failure (missing parent,
# permissions, ...) is raised here instead of surfacing after a full tile has been
# processed and cannot be saved.
try:
    os.mkdir(output_dir)
except FileExistsError:
    print("Folder exists.")

for file_idx, file_name in enumerate(file_names):
    start = datetime.now()
    file_path = os.path.join(npz_dir, file_name)

    # Read the arrays inside a context manager so the archive's file handle is
    # released before the long-running processing starts.
    with np.load(file_path) as loaded_file:
        haadf = loaded_file['haadf']
        spectrum = loaded_file['spectrum'][:, :, first_channel:]
        xray_energies = loaded_file['xray_energies'][first_channel:]

    # Clean up then bin the spectrum and check if any empty channels remain
    spectrum = rebin_spectrum(spectrum, n_bins)
    # The rebinned energies are not used further in this cell.
    xray_energies = rebin_energies(xray_energies, n_bins)

    # Now bin in XY
    spectrum = rebin_spectrumXY(spectrum, subsample_size)
    haadf = rebin_XY(haadf, subsample_size)

    # Drop the channels that are zero at every pixel
    where_notempty = ~np.all(spectrum == 0, axis=(0, 1))
    spectrum = spectrum[:, :, where_notempty]
    spectral_depth = spectrum.shape[2]
    print("%04d channels remain" % spectral_depth)

    n_pixels = subsample_size * subsample_size
    cube_shape = (subsample_size, subsample_size, spectral_depth)

    # Pre-Filter
    spectrum = MeanFilterCube(spectrum.astype('float32'), sigma=2, size=3)
    spectrum_2D = np.reshape(spectrum, (n_pixels, spectral_depth))

    # Poisson scaling: divide by sqrt(outer product of the per-pixel mean and
    # the per-channel mean)
    g = np.mean(spectrum, axis=2).reshape(n_pixels, 1)
    h = np.mean(np.mean(spectrum, axis=0), axis=0).reshape(spectral_depth, -1)
    W = np.sqrt(g @ np.transpose(h))
    spectrum_2D = np.divide(spectrum_2D, W)

    # Keep only the first pc_keep principal components
    pca_partial = PCA(n_components=pc_keep)
    X_partial = pca_partial.fit_transform(spectrum_2D)

    # Inverse transform and inverse the weight
    spectrum_2D = np.multiply(pca_partial.inverse_transform(X_partial), W)

    # Feed the PCA reconstruction into the feature-extension step. Previously the
    # reconstruction was computed and then overwritten, so the saved features were
    # built from the still Poisson-scaled, un-denoised cube.
    spectrum = np.reshape(spectrum_2D, cube_shape)

    # Add averaged features
    spectrum_ext = spectrum_plus(spectrum, radii=radii, sigma=2)
    stacked = np.dstack((spectrum, spectrum_ext))
    del spectrum_ext
    # One row per pixel; the feature count is taken from the stacked cube rather
    # than assuming 250 channels survive the empty-channel removal.
    spectrum_2D = stacked.reshape((-1, stacked.shape[2]))
    print(spectrum_2D.shape)

    tile_name = os.path.splitext(file_name)[0]
    np.savez_compressed(os.path.join(output_dir, '%s_spectrum_2D.npz' % tile_name), spectrum_2D=spectrum_2D)
    end = datetime.now()
    print('Duration: {}'.format(end - start))
    print('Tile %02d finished\n' % file_idx)
    