# PFN model load data and normalize

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
import time as t
import scipy.constants as spc
import matplotlib.ticker as ticker
import h5py as h5

In [3]:
# Local project checkout; plots and models are stored relative to it.
path_prefix = 'D:/Work/EPE/ML4pi/'
plotpath = f'{path_prefix}plots/'
# Both model paths currently point at the project root itself.
modelpath_c = f'{path_prefix}'
modelpath = f'{path_prefix}'

# External storage drive and the folders kept on it.
ext_path = "H:/EPE_file_storage/"
ext_modelpath = f"{ext_path}Model/"
ext_plotpath = f"{ext_path}plots/"

# The STMC data is read from the local disk here; the alternative location
# on the external drive would be ext_path + "data_storage/STMC/".
ext_datapath = 'D:/Work/Datastorage/STMC/'

In [4]:
import sys
sys.path.append(path_prefix)
from util import resolution_util as ru
from util import plot_util as pu
from util import ml_util as mu
import uproot3 as ur

In [4]:
def tvt_num(_data, _tvt=(75, 10, 15)):
    ''' Split the length of _data into training, validation and testing counts.

    _data: any sized object; only len(_data) is used.
    _tvt:  three relative weights (train, validation, test). They are ratios
           and do not need to add to 100.

    returns: tuple (n_train, n_val, n_test) that always sums to len(_data).
             For non-negative weights no count is negative. '''
    _tot = len(_data)
    _train, _val, _test = _tvt
    _tvt_sum = _train + _val + _test

    _train_rtrn = round(_train*_tot/_tvt_sum)
    # Rounding train and validation independently can over-allocate (e.g. 3
    # samples with weights (1, 1, 0) round to 2 + 2), so validation is capped
    # at what is left, which keeps the test count from going negative.
    _val_rtrn = min(round(_val*_tot/_tvt_sum), _tot - _train_rtrn)
    _test_rtrn = _tot - _train_rtrn - _val_rtrn

    return _train_rtrn, _val_rtrn, _test_rtrn

def normalize_input1d(arr, mask):
    ''' Standardize the masked entries of a 1d array in place.

    The mean and standard deviation are computed over the entries where mask
    is True. Those entries have the mean subtracted and, when the standard
    deviation is not zero, are divided by it. Entries where mask is False
    keep their original value (e.g. zeros if mask = arr != 0).

    returns: arr itself, which has been overwritten with the result. '''
    n_entries = arr.shape[0]
    masked_mean = np.mean(arr, where=mask)
    masked_std = np.std(arr, where=mask)

    # Expand both statistics to one value per entry of arr.
    mean_vec = np.repeat(masked_mean, n_entries)
    std_vec = np.repeat(masked_std, n_entries)

    # Only divide where the entry is selected and the spread is non-zero.
    divide_here = np.logical_and(mask, std_vec != 0)

    centered = np.subtract(arr, mean_vec, out=arr, where=mask)
    return np.divide(centered, std_vec, out=centered, where=divide_here)

def normalize_input2d(arr, mask):
    ''' Standardize each row of a 2d array in place, using only masked entries.

    mask is True where the operation should be performed; False entries are
    excluded from the per-row mean and standard deviation and are left
    unchanged. Rows whose standard deviation is zero are only mean-subtracted.

    returns: arr itself (same shape), overwritten with the normalized values. '''
    # keepdims leaves the statistics as column vectors so they broadcast
    # across every column of the matching row.
    row_mean = np.mean(arr, axis=1, where=mask, keepdims=True)
    row_std = np.std(arr, axis=1, where=mask, keepdims=True)

    # Only divide where the entry is selected and its row spread is non-zero.
    divide_here = np.logical_and(mask, row_std != 0)

    centered = np.subtract(arr, row_mean, out=arr, where=mask)
    return np.divide(centered, row_std, out=centered, where=divide_here)

def eval_generator(data, batch_size):
    ''' Yield consecutive slices of data, each holding at most batch_size items.

    The last slice is shorter when len(data) is not a multiple of batch_size.
    Nothing is yielded for empty data. '''
    # Ceiling division: number of slices needed to cover all of data.
    n_batches = (len(data) + batch_size - 1)//batch_size
    start = 0
    for _ in range(n_batches):
        stop = start + batch_size
        yield data[start:stop]
        start = stop

In [5]:
def normalized_data(X,Y):
    ''' 
    Normalizes the input features in X and log-transforms the target in Y.

    X: 3d array indexed as X[event, cell, feature]. Following the comments in
       this function, feature 0 is the cell energy, 1 is eta, 2 is phi and
       3 is rPerp. Other features are left untouched.
    Y: 2d array; only column 0 is used as the target.

    Events whose target is not above .05 are dropped from BOTH X and Y so the
    returned arrays stay row-aligned.

    returns: (X, Ylog). X is normalized in place when no event is dropped;
             otherwise a filtered copy is normalized and returned. Ylog is
             the natural log of the kept targets.
    '''
    t0 = t.time()
    # take log of Y, keeping only targets above threshold (log needs Y > 0)
    target_zero_mask = Y[:,0] > .05
    Ylog = np.log(Y[:,0][target_zero_mask])

    # Drop the same events from X; otherwise X and Ylog end up with different
    # lengths and mismatched rows. Skipped when nothing is dropped to avoid
    # copying a potentially large array.
    if not np.all(target_zero_mask):
        X = X[target_zero_mask]
    
    ## Scale rPerp by the fixed factor 1/3630 (zero entries are skipped)
    rPerp_mask = X[:,:,3] != 0
    X[:,:,3][rPerp_mask] = X[:,:,3][rPerp_mask]/3630.

    # Diagnostic: prints True if any energy entry is NaN before normalizing
    isnan = np.isnan(X[:,:,0])
    anytruth = np.any(isnan)
    print(anytruth)

    ## Energy Values that are not zero!
    # log-transform, then standardize with the mean/std of the non-zero cells
    # NOTE(review): negative energies would turn into NaN in np.log - confirm
    # that the inputs are never negative.
    E_nonZeroMask = X[:,:,0] != 0
    X[:,:,0][E_nonZeroMask] = np.log(X[:,:,0][E_nonZeroMask])
    cellE_mean = np.mean(X[:,:,0][E_nonZeroMask])
    cellE_std = np.std(X[:,:,0][E_nonZeroMask])
    X[:,:,0][E_nonZeroMask] = (X[:,:,0][E_nonZeroMask] - cellE_mean)/cellE_std

    ## Eta and Phi
    # eta is scaled by the fixed factor 1/.7; phi is scaled by the standard
    # deviation of its non-zero entries (neither is mean-subtracted)
    eta_mask = X[:,:,1] != 0
    X[:,:,1][eta_mask] = X[:,:,1][eta_mask]/.7

    phi_mask = X[:,:,2] != 0
    cellPhi_std = np.std(X[:,:,2][phi_mask])
    X[:,:,2][phi_mask] = X[:,:,2][phi_mask]/cellPhi_std
    t1 = t.time()
    print('Time to Normalize: '+str(t1-t0)+' (s)')
    return X, Ylog

In [None]:
# Read every STMC .npz file, normalize it, and append it to a single HDF5 file
# NOTE(review): the range below yields Nfile+1 names (STMC_0 ... STMC_30) -
# confirm that 31 files is intended.
Nfile = 30
startingF = 0
fileNames = [ext_datapath + 'STMC_' + str(i) + '.npz'
             for i in range(startingF,startingF+Nfile+1)]

# Create the output file once and keep it open for the whole loop; both
# datasets start empty and are grown along axis 0 as files are appended.
with h5.File(ext_datapath + 'STMC_train.h5', 'w') as hf:
    x_dset = hf.create_dataset('X', (0,1086,5), maxshape=(None,1086,5), dtype='f')
    y_dset = hf.create_dataset('Y', (0,), maxshape=(None,), dtype='f')

    # read data from each file, normalize and append to the h5 datasets
    for filename in fileNames:
        # The arrays are read out inside the with-block so the archive is
        # closed again before the (slow) normalization runs.
        with np.load(filename) as data:
            X = data['x']
            Y = data['y']
        X, Y = normalized_data(X, Y)

        # Append using explicit offsets; this also works when a file
        # contributes zero rows (a negative-index slice [-0:] would not).
        # X and Y are appended independently, so the two datasets only stay
        # row-aligned if normalized_data returns equally many rows for both.
        x_start = x_dset.shape[0]
        x_dset.resize(x_start + X.shape[0], axis=0)
        x_dset[x_start:] = X[:,:,:5]
        y_start = y_dset.shape[0]
        y_dset.resize(y_start + Y.shape[0], axis=0)
        y_dset[y_start:] = Y

In [None]:
# Read every STMC .npz file, normalize it, and write all of them to a single
# numpy file.
# NOTE(review): the range below yields Nfile+1 names (STMC_0 ... STMC_30) -
# confirm that 31 files is intended.
Nfile = 30
fileNames = []
startingF = 0
for i in range(startingF,startingF+Nfile+1):
    fileNames.append(ext_datapath + 'STMC_' + str(i) + '.npz')

# Normalize file by file (statistics are computed per file) and collect the
# results. Saving inside the loop would overwrite STMC_train.npz each time and
# leave only the last file's data, so the save happens once at the end.
# NOTE: this holds every normalized file in memory at the same time.
X_parts = []
Y_parts = []
for filename in fileNames:
    with np.load(filename) as data:
        X = data['x']
        Y = data['y']
    X, Y = normalized_data(X, Y)
    X_parts.append(X)
    Y_parts.append(Y)

X = np.concatenate(X_parts, axis=0)
Y = np.concatenate(Y_parts, axis=0)
np.savez(ext_datapath+'STMC_train.npz', x=X, y=Y)


In [5]:
import tensorflow as tf

In [6]:
def read_npy_file(item):
    ''' Load a numpy file and return its content as a float32 array.

    item: path of the file, given as bytes, as str, or as an object with a
          .numpy() method that returns one of those (e.g. the eager tensor
          that tf.py_function passes to the wrapped function).

    returns: the loaded array cast to np.float32. '''
    # Unwrap tensor-like arguments first; plain bytes/str pass straight through.
    path = item.numpy() if hasattr(item, 'numpy') else item
    if isinstance(path, bytes):
        path = path.decode()
    data = np.load(path)
    # NOTE(review): for a .npz archive np.load returns an NpzFile, which has no
    # .astype - this function only works for single-array .npy files.
    return data.astype(np.float32)

In [7]:
# Paths of the STMC input files, numbered startingF .. startingF+Nfile
# (inclusive, so Nfile+1 entries).
startingF = 0
Nfile = 30
fileNames = [
    ext_datapath + 'STMC_' + str(index) + '.npz'
    for index in range(startingF, startingF + Nfile + 1)
]

In [8]:
# Start the input pipeline from the list of file names: each dataset element
# is one path string from fileNames (no file is opened at this point).
dataset = tf.data.Dataset.from_tensor_slices(fileNames)

In [10]:
# Replace each file name by the array loaded from it. tf.numpy_function hands
# the name to read_npy_file as a bytes object, which is what its
# item.decode() call needs; tf.py_function would pass an eager tensor, which
# has no .decode(). The map is lazy: files are only read when the dataset is
# iterated.
# NOTE(review): fileNames point at .npz archives, for which np.load returns an
# NpzFile instead of a single array - confirm read_npy_file handles that.
dataset = dataset.map(
        lambda item: tuple(tf.numpy_function(read_npy_file, [item], [tf.float32,])))

<TakeDataset shapes: (<unknown>,), types: (tf.float32,)>