## Code to extract training and testing data from hdf5 files and store them in the right form in .npy files

This script reads the data from the hdf5 files and processes it into arrays.
The only local dependency is util.py
- Oct 25, 2018

In [2]:
import sys
import os

import numpy as np
import glob
import pickle
import h5py

import time

In [3]:
# Import modules from other files

from util import add_pulse_to_inp_tensor, get_nonempty_pulses, total_doms, total_height, total_width, get_pulse_array, get_nonempty_events

### Functions to make the dataset

In [4]:

def make_dataset(filename, sig_or_bg):
    """Read one hdf5 file and build the input tensor, labels and weights for its events.

    Arguments:
    filename  = path of the hdf5 file to read
    sig_or_bg = "sig" labels every event 1; any other value (e.g. "bg") labels every event 0

    Returns:
    tens = array of shape (num_events, total_doms, total_height, total_width)
    lbls = array of shape (num_events,), all ones for signal or all zeros otherwise
    wgts = array with one entry per event, taken from hf['events'][key]['weight'][0]

    Raises:
    AssertionError if the pulse keys and the event keys of the file do not match.
    """

    # The context manager closes the file even if an assertion or a read fails;
    # without it one open handle leaks for every file processed.
    with h5py.File(filename,'r') as hf:
        pulse_array_keys = get_nonempty_pulses(hf)
        num_events = len(pulse_array_keys)

        ####### Added by Venkitesh, Oct 24, 2018.
        ## Extracting the weights
        event_array_keys=get_nonempty_events(hf)
        assert len(pulse_array_keys)==len(event_array_keys), "Pulse and event array keys have different sizes"
        # Computing the weights (first element of each event's 'weight' entry)
        wgts=np.array([hf['events'][event_key]['weight'][0] for event_key in event_array_keys])

        # Checking whether the event_array_keys and pulse_array_keys are in order and identical,
        # so that wgts[i] belongs to the same event as tens[i]
        assert np.array_equal(pulse_array_keys,event_array_keys), "Pulse array %s and Event array %s are not identical. Possibility of mismatch"%(pulse_array_keys,event_array_keys)
        #######

        tens = np.zeros((num_events, total_doms, total_height, total_width))

        # add_pulse_to_inp_tensor's return value is not used, so it presumably
        # fills row ex_num of tens in place (defined in util.py -- TODO confirm).
        for ex_num, pulse_array_key in enumerate(pulse_array_keys):
            pulse_array = get_pulse_array(hf, pulse_array_key)
            add_pulse_to_inp_tensor(tens, ex_num, pulse_array)

    lbls = np.ones((num_events,)) if sig_or_bg == "sig" else np.zeros((num_events,))

    return tens, lbls, wgts


def get_data(sig_filename_list, bg_filename_list):
    """Build one combined dataset from all signal files followed by all background files.

    Arguments:
    sig_filename_list = paths of the signal hdf5 files (read with make_dataset(fn, "sig"))
    bg_filename_list  = paths of the background hdf5 files (read with make_dataset(fn, "bg"))

    Returns:
    x, y, wt = the per-file tensors, labels and weights joined along the first axis,
               signal files first (in list order), then background files (in list order)

    Raises:
    ValueError if both lists are empty.
    """

    ### Changes made by Venkitesh, 10/24/2018
    labelled_files = [(fn, "sig") for fn in sig_filename_list]
    labelled_files += [(fn, "bg") for fn in bg_filename_list]
    if not labelled_files:
        raise ValueError("get_data needs at least one signal or background file")

    # Collect the per-file pieces and join them once at the end. Stacking inside
    # the loop copies the whole accumulated array again for every file.
    x_parts, y_parts, wt_parts = [], [], []
    for fn, sig_or_bg in labelled_files:
        x_part, y_part, wt_part = make_dataset(fn, sig_or_bg)
        x_parts.append(x_part)
        y_parts.append(y_part)
        wt_parts.append(wt_part)

    x = np.concatenate(x_parts, axis=0)
    y = np.concatenate(y_parts)
    wt = np.concatenate(wt_parts)

    return x,y,wt


In [5]:

def f_get_file_lists(data_folder,mode):
    ''' Function to get the list of signal files and background files (found under sigpath and bgpath) for reserved and training data.
        mode='quick' picks a smaller set of files for quick training. These files have the form '*00.hdf5'.

        Arguments:
        data_folder='reserved' selects the reserved data; any other value (e.g. 'regular') selects the regular data
        mode='quick' selects only the '*00.hdf5' files; any other value (e.g. 'normal') selects every '*.hdf5' file

        Returns:
        sig_list,bg_list = sorted lists of the matching signal and background file paths
    '''

    # NOTE(review): the reserved bgpath starts with '/global' while the other three
    # paths do not -- presumably both prefixes point at the same filesystem; confirm.
    if data_folder=='reserved':
        sigpath = "/project/projectdirs/dasrepo/icecube_data/reserved_data/filtered/nugen/11374/clsim-base-4.0.3.0.99_eff/"
        bgpath = "/global/project/projectdirs/dasrepo/icecube_data/reserved_data/filtered/corsika/11057/"
    else:
        sigpath = "/project/projectdirs/dasrepo/icecube_data/hdf5_out/filtered/nugen/11374/clsim-base-4.0.3.0.99_eff/"
        bgpath = "/project/projectdirs/dasrepo/icecube_data/hdf5_out/filtered/corsika/11057/"

    # For quick testing, use only the files ending in '00' ('*00.hdf5'). This gives a much smaller set of files.
    suffix='*00.hdf5' if mode=='quick' else '*.hdf5'

    # glob returns matches in arbitrary order; sort so that the order of the
    # extracted events is the same from one run to the next.
    sig_list=sorted(glob.glob(os.path.join(sigpath,suffix)))
    bg_list=sorted(glob.glob(os.path.join(bgpath,suffix)))

    return sig_list,bg_list


def f_extract_data(data_folder,save_location,mode='normal',dry_run=True):
    '''
    Function to perform :
    - Data read
    - Data format
    - Data save to file

    Arguments:
    data_folder='regular' or 'reserved'
    save_location= directory in which to save the data files (that are very large)
    mode='normal' or 'quick'
    dry_run=True (default) only prints the number of signal and background files found
            and returns without reading any of them.
            dry_run=False runs the full read / format / save pipeline.

    Returns:
    None. With dry_run=False, three files are written into save_location:
    processed_input_<data_folder>_x.npy, processed_input_<data_folder>_y.npy
    and processed_input_<data_folder>_wts.npy
    '''

    print("Type of data:\t",data_folder)

    ##########################################
    ### Read Data from files ###
    sig_list,bg_list=f_get_file_lists(data_folder,mode)
    print(len(sig_list),len(bg_list))

    # Counting the files is cheap; everything below reads every file through
    # get_data, which returns the whole dataset as single in-memory arrays.
    if dry_run:
        return

    inx,inpy,wts = get_data(sig_list, bg_list)
    print("Data shape after read:\tx:{0}\ty:{1}\twts:{2}".format(inx.shape,inpy.shape,wts.shape))

    ##########################################
    ### Format the x-data for keras 3D CNN ###
    # Insert a length-1 axis at position 1, then move it to the end while
    # putting the original axis 1 just before it:
    # (n, a, b, c) -> (n, 1, a, b, c) -> (n, b, c, a, 1)
    inx2=np.expand_dims(inx,axis=1)
    inx3=np.transpose(inx2,axes=[0,3,4,2,1])
    # transpose only returns a view; copy to get a freshly laid-out array
    inpx=inx3.copy()
    print("Data shape after format:\tx:{0}\ty:{1}\twts:{2}".format(inpx.shape,inpy.shape,wts.shape))

    ##########################################
    ### Save data to files ###
    prefix='processed_input_'+data_folder
    f1,f2,f3=prefix+'_x',prefix+'_y',prefix+'_wts'

    # os.path.join works whether or not save_location ends with a path separator;
    # np.save appends the '.npy' extension.
    for fname,data in zip([f1,f2,f3],[inpx,inpy,wts]):
        np.save(os.path.join(save_location,fname),data)
        


In [6]:
if __name__=='__main__':

    # Directory that receives the extracted data files.
    save_data_dir='/global/project/projectdirs/dasrepo/vpa/ice_cube/data_for_cnn/extracted_data_v/data/temp/'

    # Process the regular data first, then the reserved data, timing each run.
    # (Pass mode='quick' instead of 'normal' to use only the '*00.hdf5' files.)
    for folder in ('regular','reserved'):
        start=time.time()
        f_extract_data(data_folder=folder,save_location=save_data_dir,mode='normal')
        elapsed=time.time()-start

        print("Time taken in hours ",elapsed/3600.0)


Type of data:	 regular
11361 10384
Time taken in hours  0.00021515283319685193
Type of data:	 reserved
7593 57635
Time taken in hours  0.0005267043246163262


In [7]:
# ! jupyter nbconvert --to script extract_data.ipynb

In [1]:
ls -l 

total 128954
drwxrwx--- 2 vpa vpa      512 Oct 25 17:03 [0m[01;34m__pycache__[0m/
-rw-r--r-- 1 vpa vpa 50000000 Nov  8 10:33 array_a.txt
-rw-r--r-- 1 vpa vpa 50000000 Nov  8 10:33 array_b.txt
-rw-r--r-- 1 vpa vpa 16000128 Nov  8 10:33 bin_a.txt.npy
-rw-r--r-- 1 vpa vpa 16000128 Nov  8 10:33 bin_b.txt.npy
-rw-r--r-- 1 vpa vpa     9967 Nov 11 07:31 extract_data.ipynb
-rw-r--r-- 1 vpa vpa     5723 Nov 11 07:30 extract_data.py
-rw-r----- 1 vpa vpa     2231 Oct 25 11:34 load_data.pyc
drwxrwx--- 4 vpa vpa      512 Oct 25 11:34 [01;34mold_extract_files[0m/
-rw-r----- 1 vpa vpa     4158 Oct 25 16:14 util.py
