# do_processing.ipynb
## Author: Andrew Phillips
## Purpose: Selects NTN phase 1 dataset from raw i3 sim files, applies custom modules, and saves to csv


In [1]:
import numpy as np
import sys
import os, fnmatch
import pandas as pd
import glob
import json
from icecube import dataio, dataclasses, icetray, MuonGun
from I3Tray import *
from icecube.hdfwriter import I3HDFWriter
import h5py
from i3_tools import *

#### Read in the i3 file paths and their corresponding subject set ids

In [2]:
# Load the table listing the phase 1 files; it is read from the current working directory
phase1_csv = os.path.join(os.getcwd(), 'phase1_files.csv')
file_info = pd.read_csv(phase1_csv)

# Pull the two columns out as plain lists; entry k of each list comes from row k of the table
i3_files, subj_set_ids = (list(file_info[col]) for col in ('filepath', 'subject_set_id'))

#### Select only desired events


In [3]:
#make dict keyed by subject set id, where each value is the list of
#(subject_id, event_id) pairs in that subject set (one pair per unique event id)
ntn_subjects = pd.read_csv('/home/aphillips/data/data_exports_3-30/name-that-neutrino-subjects.csv')
ntn_subjects = ntn_subjects[ntn_subjects['workflow_id']==23715] #keep only rows for this workflow id
ssid_dict = {}
for subj_set_id in subj_set_ids:
    subj_set = ntn_subjects[ntn_subjects['subject_set_id'] == subj_set_id]
    seen = set() #event ids already recorded for this subject set (assumes ids are hashable scalars)
    pairs = []
    for subj_id, meta in zip(subj_set['subject_id'], subj_set['metadata']):
        #each metadata entry is a JSON string; its 'event' field is the event id
        eid = json.loads(meta)['event']
        if eid in seen:
            continue #keep only the first subject seen for a given event id
        seen.add(eid)
        pairs.append((subj_id, eid))

    ssid_dict[subj_set_id] = pairs

In [4]:
# Total number of pairs stored across every subject set in ssid_dict
n = sum(len(pairs) for pairs in ssid_dict.values())
print(n)

4271


In [5]:
# For every subject set: scan its source i3 file, copy the frames whose event id
# was requested into a new i3 file, and tag each copied frame with its subject id.
for idx, ssid in enumerate(subj_set_ids): #loop over all the ssids

    print(f'Processing file {idx+1} of {len(subj_set_ids)}')

    # event id -> subject id for the events still to be found in this file.
    # ssid_dict[ssid] holds (subject_id, event_id) tuples. Looking events up by id
    # (rather than comparing against the head of a sorted list) means a requested
    # event is found no matter where it sits in the file.
    wanted = {event_id: subject_id for (subject_id, event_id) in ssid_dict[ssid]}

    outfile = dataio.I3File(os.path.join('/home/aphillips/data/output', f'ntn_events_{ssid}.i3'), 'w') #open empty i3 for output
    try:
        infile = dataio.I3File(i3_files[idx]) #open target i3
        try:
            # stop as soon as every requested event has been written (or the file ends);
            # an empty request list simply produces an empty output file
            while wanted and infile.more():
                frame = infile.pop_daq() #pop frame
                if frame is None:
                    # NOTE(review): assumes pop_daq() yields None when no DAQ frame is left — confirm
                    break
                evt_id = frame["I3EventHeader"].event_id #get event id from the event header
                if evt_id in wanted: #check if event id is in our list
                    # pop so that each requested event is written at most once
                    frame['subject_id'] = icetray.I3Int(wanted.pop(evt_id))
                    outfile.push(frame) #push the frame to our output file
        finally:
            infile.close()
    finally:
        outfile.close() #always close the files, even on error/interrupt

    if wanted: #make missing events visible instead of silently dropping them
        print(f'  {len(wanted)} requested event(s) not found in {i3_files[idx]}')

Processing file 1 of 19


KeyboardInterrupt: 

#### Apply custom modules, save to csvs

In [5]:
# Build one dataframe per .hd5 file found in the output directory.
# The same directory is used for listing and for loading, so the cell does not
# depend on the notebook's current working directory.
hd5_dir = '/home/aphillips/data/output/'
dataframes = []
for f in os.listdir(hd5_dir):
    if f.split('.')[-1] != 'hd5': #only the .hd5 files are processed here
        continue
    # The .hd5 files are expected to exist already; the step that was previously
    # run here to produce them is kept for reference:
    #(outfile, hd5_name) = apply_modules(os.path.join('/home/aphillips/data/output/',f), '/home/aphillips/data/output/')
    print(f)
    dataframes.append(process_data(os.path.join(hd5_dir, f)))

ap_modules_ntn_events_112392.i3.hd5
ap_modules_ntn_events_112467.i3.hd5
ap_modules_ntn_events_112118.i3.hd5
ap_modules_ntn_events_112120.i3.hd5
ap_modules_ntn_events_112492.i3.hd5
ap_modules_ntn_events_112425.i3.hd5
ap_modules_ntn_events_112473.i3.hd5
ap_modules_ntn_events_112433.i3.hd5
ap_modules_ntn_events_112414.i3.hd5
ap_modules_ntn_events_112498.i3.hd5
ap_modules_ntn_events_112418.i3.hd5
ap_modules_ntn_events_112109.i3.hd5
ap_modules_ntn_events_112481.i3.hd5
ap_modules_ntn_events_112119.i3.hd5
ap_modules_ntn_events_112487.i3.hd5
ap_modules_ntn_events_112501.i3.hd5
ap_modules_ntn_events_112464.i3.hd5
ap_modules_ntn_events_112116.i3.hd5
ap_modules_ntn_events_112391.i3.hd5


#### Concatenate all the per-file dataframes into a master df and save it as a csv

In [8]:
# Stack the per-file dataframes into one table, report its row count, then write it out
master_csv_path = '/home/aphillips/data/output/all_ntn_events_4-24-24.csv'
DF = pd.concat(dataframes) #master dataframe
print(DF.shape[0])
DF.to_csv(master_csv_path, index=False) #save without the index column

SyntaxError: invalid syntax (<ipython-input-8-d3c14b816b12>, line 3)

testing

In [4]:
# Scratch check: run apply_modules on a local test file, passing the current directory as the second argument
outfile, hd5_name = apply_modules('./test.i3', '.')