In [1]:
import numpy as np # type: ignore
import sys
import os, fnmatch
import pandas as pd # type: ignore
import glob
import json
from icecube import dataio, dataclasses, icetray, MuonGun # type: ignore
from I3Tray import * # type: ignore
from icecube.hdfwriter import I3HDFWriter # type: ignore
import h5py # type: ignore
from APMCLabeler import APMCLabeler
from phase2_filters import *

## phase2_processing notebook
Used for processing phase 2 files. Keeps track of event counts and attempts to obtain a uniform distribution of event types.
If the program is interrupted, check phase2_log.txt to see the last processed file. Edit nue_files.csv and numu_files.csv as needed to specify the desired subruns to process. phase2_log.txt also shows the most recent event counts; event_counter below can be edited to match.

In [2]:
# Run configuration: output location, input directories and target sample size.
OUTDIR = '/scratch/aphillips/phase2_data/'  # change to desired

# The NuMu and NuE inputs share the same root and trailing sub-path; only the
# numbered directory in between differs.
SIM_ROOT = '/data/sim/IceCube/2020/filtered/test/newprocessing/neutrino-generator'
SIM_SUBPATH = '0000000-0000999/classifier/'
NUMU_DIR = f'{SIM_ROOT}/21971/{SIM_SUBPATH}'
NUE_DIR = f'{SIM_ROOT}/22067/{SIM_SUBPATH}'

# Number of events desired; the processing loops keep collecting a category
# while its count is <= N/5.
N = 8000

In [3]:
# Subrun numbers to process, read from the 'subrun' column of each flavour's CSV
# (NuMu list first, then NuE).
SUBRUN_CSV_DIR = '/home/aphillips/name-that-neutrino/phase2'
numu_subruns, nue_subruns = (
    pd.read_csv(f'{SUBRUN_CSV_DIR}/{flavour}_files.csv')['subrun']
    for flavour in ('numu', 'nue')
)

In [4]:
# Running event totals, one slot per event type. The list index is the value
# matched against the 'ntn_category' column when counts are updated.
event_counter = [
    1604,  # 0: n_skim
    798,   # 1: n_casc
    1700,  # 2: n_tg
    1028,  # 3: n_stop
    1759,  # 4: n_start
]

# Indices into event_counter of the event types still being collected
categories = [1, 3]

In [5]:
# Open the log in append mode so entries already in the file are preserved
log_file = open("phase2_log.txt", mode="a")

In [6]:
def log(text, file):
    """Echo a message to stdout and append it to a log file.

    Args:
        text: Message to record. If it does not already end with a newline,
            one is appended to the file entry so that consecutive entries
            never run together on a single line.
        file: Writable text-file-like object providing ``write`` and ``flush``.
    """
    print(text)
    # Callers are not consistent about terminating messages; normalise here.
    file.write(text if text.endswith('\n') else text + '\n')
    # Flush immediately so the log is current even if the run is interrupted.
    file.flush()

In [7]:
# Process NuMu subruns until no event type in `categories` still needs events.
# The header message ends with '\n' like every other log call, so it does not
# run into the first "Processing file ..." entry in the log file.
log('Processing NuMu files...\n', log_file)
for subrun in numu_subruns:

    # Stop once every tracked event type has exceeded its N/5 quota
    if not categories:
        break

    subrun = f'{subrun}'.rjust(6, '0')  # left-pad the subrun number with zeros to 6 characters
    filename = f'classifier_rehyd_DST_IC86.2020_NuMu.021971.{subrun}.i3.zst'
    filepath = os.path.join(NUMU_DIR, filename)
    # CSV written by make_csv for this subrun; reused by do_cuts and for counting
    events_csv_path = os.path.join(OUTDIR, 'numu_event_csvs', f'events_df_{subrun}.csv')

    log(f'Processing file {filename} ...\n', log_file)

    label_events(filepath, OUTDIR) #mc labels all events
    make_csv(os.path.join(OUTDIR,f'mc_labeled_{filename}.hd5'), os.path.join(OUTDIR, 'numu_event_csvs'), subrun, event_types = categories) #makes dataframe of desired events
    do_cuts(os.path.join(OUTDIR,f'mc_labeled_{filename}'), OUTDIR, events_csv_path) #cut on desired frames
    extract_daq(os.path.join(OUTDIR, f'cuts_mc_labeled_{filename}'), f'21971{subrun}', OUTDIR) #extract daq only and split into 2 mb sizes

    event_csv = pd.read_csv(events_csv_path) #open the csv we just created

    #loop over the categories
    for i in categories:
        event_counter[i] += len(event_csv[event_csv['ntn_category'] == i]) #count the number of events in each category

    # Keep only the event types whose running total is still within the quota
    categories = [i for i in categories if event_counter[i] <= N/5]

    log(f'[n_skim, n_cascade, n_throughgoing, n_stopping, n_starting] = {event_counter}\n', log_file)

    log(f'File {filename} processing complete\n', log_file)

Processing NuMu files...
Processing file classifier_rehyd_DST_IC86.2020_NuMu.021971.000019.i3.zst ...

Length: 1390
[n_skim, n_cascade, n_throughgoing, n_stopping, n_starting] = [1604, 450, 1700, 586, 1759]

File classifier_rehyd_DST_IC86.2020_NuMu.021971.000019.i3.zst processing complete

Processing file classifier_rehyd_DST_IC86.2020_NuMu.021971.000020.i3.zst ...

Length: 1407
[n_skim, n_cascade, n_throughgoing, n_stopping, n_starting] = [1604, 457, 1700, 622, 1759]

File classifier_rehyd_DST_IC86.2020_NuMu.021971.000020.i3.zst processing complete

Processing file classifier_rehyd_DST_IC86.2020_NuMu.021971.000021.i3.zst ...

Length: 1257
[n_skim, n_cascade, n_throughgoing, n_stopping, n_starting] = [1604, 465, 1700, 658, 1759]

File classifier_rehyd_DST_IC86.2020_NuMu.021971.000021.i3.zst processing complete

Processing file classifier_rehyd_DST_IC86.2020_NuMu.021971.000022.i3.zst ...

Length: 1325
[n_skim, n_cascade, n_throughgoing, n_stopping, n_starting] = [1604, 468, 1700, 695, 1

In [8]:
# NuE pass: label, select, cut and extract each subrun, then update the running
# per-category totals. Ends early once no event type remains in `categories`.
log('Processing NuE files...\n', log_file)
for subrun in nue_subruns:

    # Nothing left to collect
    if not categories:
        break

    # Zero-padded subrun string and the paths derived from it
    subrun = str(subrun).rjust(6, '0')
    filename = f'classifier_rehyd_DST_IC86.2020_NuE.022067.{subrun}.i3.zst'
    filepath = os.path.join(NUE_DIR, filename)
    nue_csv_dir = os.path.join(OUTDIR, 'nue_event_csvs')
    nue_csv_path = os.path.join(nue_csv_dir, f'events_df_{subrun}.csv')
    labeled_path = os.path.join(OUTDIR, f'mc_labeled_{filename}')

    log(f'Processing file {filename} ...\n', log_file)

    # mc labels all events
    label_events(filepath, OUTDIR)
    # makes dataframe of desired events
    make_csv(f'{labeled_path}.hd5', nue_csv_dir, subrun, event_types=categories)
    # cut on desired frames
    do_cuts(labeled_path, OUTDIR, nue_csv_path)
    # extract daq only and split into 2 mb sizes
    extract_daq(os.path.join(OUTDIR, f'cuts_mc_labeled_{filename}'), f'22067{subrun}', OUTDIR)

    # Re-open the CSV that was just created and tally each tracked category
    event_csv = pd.read_csv(nue_csv_path)
    labels = event_csv['ntn_category']
    for i in categories:
        event_counter[i] += int((labels == i).sum())

    # Drop event types whose total has gone past the N/5 quota
    categories = [i for i in categories if not event_counter[i] > N/5]

    log(f'[n_skim, n_cascade, n_throughgoing, n_stopping, n_starting] = {event_counter}\n', log_file)

    log(f'File {filename} processing complete\n', log_file)

Processing NuE files...

Processing file classifier_rehyd_DST_IC86.2020_NuE.022067.000021.i3.zst ...

Length: 328
[n_skim, n_cascade, n_throughgoing, n_stopping, n_starting] = [1604, 553, 1700, 1028, 1759]

File classifier_rehyd_DST_IC86.2020_NuE.022067.000021.i3.zst processing complete

Processing file classifier_rehyd_DST_IC86.2020_NuE.022067.000022.i3.zst ...

Length: 364
[n_skim, n_cascade, n_throughgoing, n_stopping, n_starting] = [1604, 567, 1700, 1028, 1759]

File classifier_rehyd_DST_IC86.2020_NuE.022067.000022.i3.zst processing complete

Processing file classifier_rehyd_DST_IC86.2020_NuE.022067.000023.i3.zst ...

Length: 407
[n_skim, n_cascade, n_throughgoing, n_stopping, n_starting] = [1604, 583, 1700, 1028, 1759]

File classifier_rehyd_DST_IC86.2020_NuE.022067.000023.i3.zst processing complete

Processing file classifier_rehyd_DST_IC86.2020_NuE.022067.000024.i3.zst ...

Length: 388
[n_skim, n_cascade, n_throughgoing, n_stopping, n_starting] = [1604, 612, 1700, 1028, 1759]

F

In [9]:
# Record completion, then close the log so its file handle is released.
# Re-run the cell that opens log_file before logging again in this session.
log('All files processed...exiting\n', log_file)
log_file.close()

All files processed...exiting

