# do_processing.ipynb
## Author: Andrew Phillips
## Purpose: Selects the Name that Neutrino (NTN) phase 1 dataset from raw i3 simulation files, applies custom modules, and saves the results to CSV


In [9]:
import numpy as np
import sys
import os, fnmatch
import pandas as pd
import glob
import json
from icecube import dataio, dataclasses, icetray, MuonGun
from I3Tray import *
from icecube.hdfwriter import I3HDFWriter
import h5py
from i3_tools import *

In [6]:
# Subject set ids we want to process (order is preserved and used as dict key order later).
SUBJ_SET_IDS = [
    112109, 112391, 112425, 112433, 112464,
    112392, 112414, 112418, 112498, 112473,
    112487, 112492, 112467, 112116, 112481,
    112501, 112118, 112119, 112120,
]

#### Read in file paths, and corresponding subject set ids

In [2]:
# Load the table that lists, row by row, an i3 file path and the subject set id it belongs to.
file_info = pd.read_csv(os.path.join(os.getcwd(), 'ntn_phase1_files.csv'))

# Pull both columns out as plain Python lists; position i in one list matches position i in the other.
i3_files, subj_set_ids = (
    file_info[column].tolist() for column in ('file path', 'subject_set_id')
)

#### Select only desired events


In [13]:
# Build ssid_dict: subject set id -> list of (subject_id, event_id) pairs in that subject set.
# NOTE: the tuple order is (subject_id, event_id); downstream code indexes tup[0] / tup[1] accordingly.
NTN_WORKFLOW_ID = 23715  # only rows with this workflow_id are kept

ntn_subjects = pd.read_csv('../../data/data_exports_3-30/name-that-neutrino-subjects.csv')
ntn_subjects = ntn_subjects[ntn_subjects['workflow_id'] == NTN_WORKFLOW_ID]

ssid_dict = {}
for subj_set_id in SUBJ_SET_IDS:
    subj_set = ntn_subjects[ntn_subjects['subject_set_id'] == subj_set_id]
    subject_ids = list(subj_set['subject_id'])
    # Each metadata cell is a JSON string; its 'event' field holds the event id.
    evt_ids = [json.loads(md)['event'] for md in subj_set['metadata']]
    ssid_dict[subj_set_id] = list(zip(subject_ids, evt_ids))
print(ssid_dict)

# Quick sanity check: the event ids of one subject set.
event_ids = [tup[1] for tup in ssid_dict[112109]]
print(event_ids)

{112109: [(86173828, 10516), (86173829, 8285), (86173832, 10871), (86173836, 19364), (86173842, 13667), (86173846, 26877), (86173851, 10164), (86173866, 4811), (86173870, 703), (86173873, 4962), (86173877, 30340), (86173881, 11340), (86173885, 18487), (86173889, 24338), (86173893, 32370), (86173897, 8059), (86173901, 8671), (86173906, 2640), (86173910, 9646), (86173914, 4499), (86173919, 879), (86173924, 19344), (86173927, 33446), (86173931, 18054), (86173936, 10553), (86173940, 9206), (86173944, 29513), (86174025, 19969), (86174029, 15097), (86174034, 5471), (86174042, 31797), (86174046, 17325), (86174050, 23692), (86174054, 26700), (86174059, 28724), (86174064, 26656), (86174069, 32183), (86174072, 16955), (86174076, 14173), (86174081, 11723), (86174086, 6179), (86174089, 2587), (86174093, 1530), (86174096, 11091), (86174100, 28045), (86174104, 624), (86174108, 23398), (86174112, 6412), (86174115, 25308), (86174119, 5968), (86174122, 26542), (86174125, 11584), (86174128, 4167), (8617

In [3]:
# For each (subject set id, i3 file) pair: copy every DAQ frame whose event id belongs to
# that subject set into output/ntn_events_<ssid>.i3, attaching the matching subject id to
# the frame under the key 'subject_id'.
for idx, (ssid, i3_path) in enumerate(zip(subj_set_ids, i3_files)):

    print(f'Processing file {idx+1} of {len(subj_set_ids)}')
    print(ssid)
    print(i3_path.split('/')[-1])

    # Map event id -> subject id. Entries in ssid_dict are (subject_id, event_id) pairs.
    # Keeping the pair together guarantees each frame is tagged with the subject id that
    # actually belongs to its event (sorting/de-duplicating the event ids separately from
    # the subject ids would misalign them). Duplicate event ids keep the first subject id.
    wanted = {}
    for subject_id, event_id in ssid_dict[ssid]:
        wanted.setdefault(event_id, subject_id)

    outfile = dataio.I3File(os.path.join('output', f'ntn_events_{ssid}.i3'), 'w') #open empty i3 for output
    try:
        infile = dataio.I3File(i3_path) #open target i3
        try:
            # Stop as soon as every wanted event has been found; the dict lookup does not
            # rely on the frames being stored in ascending event-id order.
            while wanted and infile.more():
                frame = infile.pop_daq() #pop frame
                evt_id = frame["I3EventHeader"].event_id #get event id from the event header
                if evt_id in wanted:
                    frame['subject_id'] = icetray.I3Int(wanted.pop(evt_id))
                    outfile.push(frame) #push the tagged frame to our output file
        finally:
            infile.close()
    finally:
        outfile.close() #always close the files, even if reading a frame fails

    if wanted:
        # Reached the end of the input file without seeing these events.
        print(f'Warning: {len(wanted)} event ids for subject set {ssid} were not found in the input file')

Processing file 1 of 19
112109
classifier_DST_IC86.2020_NuMu.021971.000000.i3.bz2
Processing file 2 of 19
112391
classifier_rehyd_DST_IC86.2020_NuMu.021971.000001.i3.zst
Processing file 3 of 19
112425
classifier_rehyd_DST_IC86.2020_NuMu.021971.000002.i3.zst
Processing file 4 of 19
112433
classifier_rehyd_DST_IC86.2020_NuMu.021971.000219.i3.zst
Processing file 5 of 19
112464
classifier_rehyd_DST_IC86.2020_NuMu.021971.000898.i3.zst
Processing file 6 of 19
112392
classifier_rehyd_DST_IC86.2020_NuE.022067.000000.i3.zst
Processing file 7 of 19
112414
classifier_rehyd_DST_IC86.2020_NuE.022067.000001.i3.zst
Processing file 8 of 19
112418
classifier_rehyd_DST_IC86.2020_NuE.022067.000003.i3.zst
Processing file 9 of 19
112498
classifier_rehyd_DST_IC86.2020_NuE.022067.000032.i3.zst
Processing file 10 of 19
112473
classifier_rehyd_DST_IC86.2020_NuE.022067.000052.i3.zst
Processing file 11 of 19
112487
classifier_rehyd_DST_IC86.2020_NuE.022067.000119.i3.zst
Processing file 12 of 19
112492
classifier

#### Apply custom modules, save to csvs

In [5]:
# Walk the contents of ./output (skipping the nested 'output' entry) and list each file.
# The module-application calls below are currently disabled, so output_csvs stays empty
# after this cell runs.
output_csvs = []
for f in filter(lambda entry: entry != 'output', os.listdir(os.path.join(os.getcwd(), 'output'))):
    print(f)
    #(outfile, hd5_name) = apply_modules(os.path.join(os.getcwd(), 'output',f), os.path.join(os.getcwd(), 'output'))
    #output_csvs.append(process_data(os.path.join(os.getcwd(), 'output', hd5_name), os.path.join(os.getcwd(), 'output', 'output'), f.split('.')[0].split('_')[-1]))

ntn_events_112498.i3
ntn_events_112120.i3
ntn_events_112433.i3
ntn_events_112118.i3
ntn_events_112119.i3
ntn_events_112425.i3
ntn_events_112391.i3
ntn_events_112418.i3
ntn_events_112392.i3
ntn_events_112109.i3
ntn_events_112116.i3
ntn_events_112501.i3
ntn_events_112481.i3
ntn_events_112414.i3
ntn_events_112464.i3
ntn_events_112473.i3
ntn_events_112492.i3
ntn_events_112467.i3
ntn_events_112487.i3


#### Concatenate all the csvs into a master df

In [3]:
# Combine every per-subject-set CSV listed in output_csvs into one master dataframe
# and write it to ntn_all_events.csv.
if not output_csvs:
    # Fail with an explicit message instead of an opaque IndexError from an empty list.
    raise ValueError('output_csvs is empty: no per-subject-set CSVs are available to concatenate')

# Row order matches the previous behaviour (last CSV first, then the rest in list order),
# but output_csvs itself is left unmodified so this cell can be re-run safely.
csv_paths = [output_csvs[-1], *output_csvs[:-1]]

# Read everything first and concatenate once, rather than growing the frame inside a loop.
DF = pd.concat([pd.read_csv(csv_path) for csv_path in csv_paths])

#DF.drop(['Unnamed: 0'])
DF.to_csv('ntn_all_events.csv', index=False) #save