# do_processing.ipynb
## Author: Andrew Phillips
## Purpose: Selects the Name that Neutrino (NTN) phase 1 dataset from raw i3 simulation files, applies custom modules, and saves the result to csv


In [3]:
import numpy as np
import sys
import os, fnmatch
import pandas as pd
import glob
import json
from icecube import dataio, dataclasses, icetray, MuonGun
from I3Tray import *
from icecube.hdfwriter import I3HDFWriter
import h5py
from i3_tools import *

#### Read in file paths and corresponding subject set ids

In [2]:
# Load the table listing each input i3 file together with its subject set id.
phase1_csv = os.path.join(os.getcwd(), 'phase1_files.csv')
file_info = pd.read_csv(phase1_csv)

# Both lists come from the same rows, so position i in one matches position i in the other.
i3_files = file_info['filepath'].tolist()
subj_set_ids = file_info['subject_set_id'].tolist()

#### Select only desired events


In [6]:
# Build a dict keyed by subject set id. Each value is the list of
# (subject_id, event_id) pairs belonging to that subject set -- note the order:
# subject id first, event id second.
ntn_subjects = pd.read_csv('/home/aphillips/data/data_exports_3-30/name-that-neutrino-subjects.csv')
# Keep only the rows of workflow 23715 (presumably the phase 1 workflow -- TODO confirm).
ntn_subjects = ntn_subjects[ntn_subjects['workflow_id']==23715]

ssid_dict = {}
for subj_set_id in subj_set_ids:
    subj_set = ntn_subjects[ntn_subjects['subject_set_id'] == subj_set_id]
    subject_ids = list(subj_set['subject_id'])
    # The event id is stored under the 'event' key of each row's JSON metadata string.
    evt_ids = [json.loads(md)['event'] for md in subj_set['metadata']]
    ssid_dict[subj_set_id] = list(zip(subject_ids, evt_ids))

In [7]:
# For every subject set: open its source i3 file, copy the DAQ frames whose
# event id belongs to the subject set into ntn_events_<ssid>.i3, and tag each
# copied frame with the matching subject id.
for idx, ssid in enumerate(subj_set_ids): #loop over all the ssids

    print(f'Processing file {idx+1} of {len(subj_set_ids)}')

    # Map event_id -> subject_id so that every selected frame is tagged with the
    # subject that actually belongs to that event. (De-duplicating and sorting
    # the event ids separately from the subject ids would break that pairing.)
    # If an event id occurs more than once in a subject set, the first
    # subject id listed for it is kept.
    wanted = {}
    for subject_id, event_id in ssid_dict[ssid]:
        wanted.setdefault(event_id, subject_id)

    if not wanted: #nothing to select, so don't create an empty output file
        print(f'No events listed for subject set {ssid}; skipping')
        continue

    infile = dataio.I3File(i3_files[idx]) #open target i3
    try:
        outfile = dataio.I3File(os.path.join('/home/aphillips/data/output', f'ntn_events_{ssid}.i3'), 'w') #open empty i3 for output
        try:
            # Stop as soon as every wanted event has been found, or the file ends.
            while wanted and infile.more():
                frame = infile.pop_daq() #pop frame
                evt_id = frame["I3EventHeader"].event_id #get event id from the event header
                if evt_id in wanted: #check if event id is one of ours
                    # pop() removes the event so it is only written once
                    frame['subject_id'] = icetray.I3Int(wanted.pop(evt_id))
                    outfile.push(frame) #push the frame to our output file
        finally:
            outfile.close() #close the files even if something above failed
    finally:
        infile.close()

    if wanted: #make missing events visible instead of silently dropping them
        print(f'{len(wanted)} event(s) of subject set {ssid} not found in {i3_files[idx]}')

Processing file 17 of 19
112119
classifier_rehyd_DST_IC86.2020_NuE.022067.001999.i3.zst
Processing file 18 of 19
112118
classifier_rehyd_DST_IC86.2020_NuE.022067.000999.i3.zst
Processing file 19 of 19
112120
classifier_rehyd_DST_IC86.2020_NuE.022067.009999.i3.zst


#### Apply custom modules, save to csvs

In [5]:
# Run the custom modules over every selected-event i3 file and collect one
# dataframe per file.
output_dir = '/home/aphillips/data/output/'

# Only pick up the ntn_events_<ssid>.i3 files. The directory also holds other
# entries (the 'output' entry, and the master csv this notebook saves there),
# which must not be passed to apply_modules when the notebook is re-run.
# Sorting makes the processing order -- and so the row order of the master
# dataframe -- reproducible.
i3_names = sorted(
    fname for fname in os.listdir(output_dir)
    if fnmatch.fnmatch(fname, 'ntn_events_*.i3')
)

dataframes = []
for f in i3_names:
    # apply_modules returns a pair; only the hdf5 file name (second item) is used here.
    (processed_i3, hd5_name) = apply_modules(os.path.join(output_dir, f), output_dir)
    # NOTE(review): the hdf5 file is read from <cwd>/output, while apply_modules
    # was given output_dir -- confirm these point at the same location.
    dataframes.append(process_data(os.path.join(os.getcwd(), 'output', hd5_name)))

#### Concatenate all the dataframes into a master df and save it to csv

In [7]:
# Stack the per-file dataframes into one master dataframe.
DF = pd.concat(dataframes)

# Save the master dataframe to disk, leaving out the index column.
master_csv = '/home/aphillips/data/output/all_ntn_events_4-24-24.csv'
DF.to_csv(master_csv, index=False)