# CVN Classifier

----

Aditya Marathe

## Imports

In [1]:
%matplotlib inline

import sys

import numpy as np

import pandas as pd

from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt

: 

**Local:**

In [None]:
sys.path.insert(1, './../')

import ana
import plotting
import labbook

**Version:**

In [None]:
print(f'Python {sys.version}\n')
print(f'NumPy {np.__version__}')

Python 3.10.11 (tags/v3.10.11:7d4cc5a, Apr  5 2023, 00:38:17) [MSC v.1929 64 bit (AMD64)]

NumPy 1.26.4
Sci-kit Learn 1.4.0


## Configuration

In [None]:
config = {
    'Cuts': [
        'Detector Quality',
        'Data Quality',
        'Cosmic Rej.',
        'Veto',
        'CVN PID Score'
        # 'Not Containment'
    ],
    'Transforms': [
        'tf_290124_positive_energies',
        'tf_290124_valid_pid',
        'tf_280224_encode_event_type',
        'tf_280224_class_balance'
    ],
    'TestingTransforms': [
        'tf_290124_positive_energies',
        'tf_290124_valid_pid',
        'tf_120224_first_prong',
        'tf_280224_encode_event_type'
    ],
    'XDataCols': [
        # Loose pre-selection IDs
        'rec.sel.cvnloosepreselptp.cosmicid',
        'rec.sel.cvnloosepreselptp.ncid',
        'rec.sel.cvnloosepreselptp.numuid',
        # REM and SCP IDs
        'rec.sel.remid.pid',
        'rec.sel.scann.scpid',
        # Muon energies
        'rec.energy.numu.lstmmuon'
    ],
    'YDataCols': [
        'ana.cat.event_type'
    ]
}

## Loading training dataset

In [None]:
ds = ana.Datasets()

Datasets | Found the following: MINI_DATA_DIR, DATA_V2_DIR, DATA_V3_DIR, REALLY_MINI_DIR, COPYMERGED_C8_DIR, COPYMERGED_C9_DIR, COPYMERGED_C10_DIR, COPYMERGED_C11_DIR, COPYMERGED_C13_DIR, COPYMERGED_C15_DIR, COPYMERGED_C16_DIR, COPYMERGED_C17_DIR, COPYMERGED_C19_DIR, COPYMERGED_C20_DIR, COPYMERGED_C21_DIR, COPYMERGED_C22_DIR


In [None]:
data = ana.NOvAData.init_from_copymerge_h5(
    h5dirs=[
        ds.COPYMERGED_C8_DIR,  # type: ignore
        ds.COPYMERGED_C9_DIR,  # type: ignore
        ds.COPYMERGED_C10_DIR,  # type: ignore
        ds.COPYMERGED_C11_DIR,  # type: ignore
        ds.COPYMERGED_C13_DIR,  # type: ignore
        ds.COPYMERGED_C15_DIR,  # type: ignore
        ds.COPYMERGED_C16_DIR  # type: ignore
    ]
)

NOvAData | Loading tables from copymerged HDF5 files...
NOvAData | Loaded table from copymerged HDF5 files (1 / 7).
NOvAData | Loaded table from copymerged HDF5 files (2 / 7).
NOvAData | Loaded table from copymerged HDF5 files (3 / 7).
NOvAData | Loaded table from copymerged HDF5 files (4 / 7).
NOvAData | Loaded table from copymerged HDF5 files (5 / 7).
NOvAData | Loaded table from copymerged HDF5 files (6 / 7).
NOvAData | Loaded table from copymerged HDF5 files (7 / 7).
NOvAData | Initialised NOvAData(features=54, events=1_094_528).


In [None]:
data.fill_ana_flags(inplace=True)
data.fill_ana_track_kinematics(inplace=True)
data.fill_categorical(inplace=True)

NOvAData | Filled MC truth flags.
NOvAData | Filled track kinematics.
NOvAData | Filled categorical data.


In [None]:
cuts = ana.Cuts.init_nova_cuts()

In [None]:
# Cuts
data.table = cuts.apply_cuts(config['Cuts'], data.table)
# data.table = cuts.apply_cut('Containment', data.table, passed=False)


Cuts     | Applied 'Detector Quality' cut (1_094_528 -> 919_711 events).
Cuts     | Applied 'Data Quality' cut (919_711 -> 919_688 events).
Cuts     | Applied 'Cosmic Rej.' cut (919_688 -> 351_421 events).
Cuts     | Applied 'Veto' cut (351_421 -> 313_034 events).
Cuts     | Applied 'CVN PID Score' cut (313_034 -> 196_427 events).
Cuts     | Applied 'Containment' cut (196_427 -> 105_055 events).


In [None]:
# Transforms
data.apply_transforms(config['Transforms'], inplace=True)

NOvAData | Applied a transform which cuts out the negative energies caused by issues with the reco. models.
NOvAData | Applied a transform which ensures that the PID score is between 0 and 1.
NOvAData | Applied a transform which encodes the event type as 1 for (A-)NuMu CC and 0 for background.
NOvAData | Applied a transform which balances the number of events for each class.


In [None]:
data.table.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,rec.energy.numu.E,rec.energy.numu.calccE,rec.energy.numu.hadcalE,rec.energy.numu.hadtrkE,rec.energy.numu.lstmmuon,rec.energy.numu.lstmnu,rec.energy.numu.regcvnhadE,rec.energy.numu.trkccE,rec.energy.numu.recomuonE,rec.energy.numu.hadclust.calE,...,ana.mc.flag.isANuMuCC,ana.mc.flag.isNuECC,ana.mc.flag.isANuECC,ana.trk.kalman.tracks.cosBeam,ana.trk.kalman.tracks.PtToPmu,ana.trk.kalman.tracks.Pmu,ana.trk.kalman.tracks.Pt,ana.trk.kalman.tracks.Qsquared,ana.trk.kalman.tracks.W,ana.cat.event_type
run,subrun,cycle,batch,evt,subevt,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
14746,41,0,0,173,1,0.822706,0.816658,0.00418,0.00991,0.787001,0.870839,0.074651,0.822706,0.783632,0.00744,...,False,False,False,[0.93944854],[0.34269005],0.779876,[0.2672558],[0.083492205],[0.9768258],0
14746,41,0,0,652,1,0.822706,0.816658,0.00418,0.00991,0.787001,0.870839,0.074651,0.822706,0.783632,0.00744,...,False,False,False,[0.93944854],[0.34269005],0.779876,[0.2672558],[0.083492205],[0.9768258],0
14750,20,0,0,699,1,1.001116,1.070769,0.190039,0.024564,0.554846,0.997018,0.612823,1.001116,0.560385,0.33827,...,False,False,False,[0.9324372],[0.3613321],0.544693,[0.19681512],[0.082464255],[1.2757915],0
14750,20,0,0,793,2,2.867611,3.288869,0.728458,0.010688,2.563976,3.151421,0.924363,2.867611,1.465072,1.296655,...,False,False,False,"[0.7390402, 0.28997177]","[0.6736613, 0.9570352]",2.561798,"[1.7257842, 2.451731]","[4.2161794, 11.467112]","[nan, nan]",0
14754,16,0,0,461,2,1.859418,1.658546,0.022085,0.007007,1.886155,2.02481,0.094978,1.859418,1.790292,0.039311,...,False,False,False,[0.9914725],[0.13031621],1.883193,[0.2454106],[0.065862425],[1.0366701],0


In [None]:
std_scaler = StandardScaler()

scale_vars = [
    'rec.energy.numu.lstmmuon'
]

In [None]:
tt_split = data.train_test_split(
    x_cols=config['XDataCols'],
    y_cols=config['YDataCols'],
    test_size=0.3
)

tt_split['XTrain'].loc[:, scale_vars] = std_scaler.fit_transform(
    tt_split['XTrain'][scale_vars]
)
tt_split['XTest'].loc[:, scale_vars] = std_scaler.transform(
    tt_split['XTest'][scale_vars]
)
tt_split['YTrain'] = tt_split['YTrain'].to_numpy().flatten()
tt_split['YTest'] = tt_split['YTest'].to_numpy().flatten()

## Loading testing dataset

In [None]:
test_data = ana.NOvAData.init_from_copymerge_h5(
    h5dirs=[
        ds.COPYMERGED_C19_DIR,  # type: ignore
        ds.COPYMERGED_C20_DIR,  # type: ignore
        ds.COPYMERGED_C21_DIR,  # type: ignore
        ds.COPYMERGED_C22_DIR  # type: ignore
    ]
)

NOvAData | Loading tables from copymerged HDF5 files...
NOvAData | Loaded table from copymerged HDF5 files (1 / 4).
NOvAData | Loaded table from copymerged HDF5 files (2 / 4).
NOvAData | Loaded table from copymerged HDF5 files (3 / 4).
NOvAData | Loaded table from copymerged HDF5 files (4 / 4).
NOvAData | Initialised NOvAData(features=54, events=671_350).


In [None]:
test_data.fill_ana_flags(inplace=True)
test_data.fill_categorical(inplace=True)
test_data.fill_ana_track_kinematics(inplace=True)

NOvAData | Filled MC truth flags.
NOvAData | Filled categorical data.
NOvAData | Filled track kinematics.


In [None]:
test_data.apply_transforms(config['TestingTransforms'], inplace=True)

NOvAData | Applied a transform which cuts out the negative energies caused by issues with the reco. models.
NOvAData | Applied a transform which ensures that the PID score is between 0 and 1.
NOvAData | Applied a transform which only keeps data for the first prong.
NOvAData | Applied a transform which encodes the event type as 1 for (A-)NuMu CC and 0 for background.


In [None]:
# Cuts
fail_cont_table = cuts.apply_cuts(config['Cuts'], test_data.table)
# fail_cont_table = cuts.apply_cut('Containment', fail_cont_table, passed=False)

Cuts     | Applied 'Detector Quality' cut (275_579 -> 264_421 events).
Cuts     | Applied 'Data Quality' cut (264_421 -> 264_421 events).
Cuts     | Applied 'Cosmic Rej.' cut (264_421 -> 149_447 events).
Cuts     | Applied 'Veto' cut (149_447 -> 141_050 events).
Cuts     | Applied 'CVN PID Score' cut (141_050 -> 95_669 events).
Cuts     | Applied 'Containment' cut (95_669 -> 47_822 events).


In [None]:
# Cuts
full_cut_table = cuts.apply_all_cuts(test_data.table)

Cuts     | Applied 'Detector Quality' cut (275_579 -> 264_421 events).
Cuts     | Applied 'Data Quality' cut (264_421 -> 264_421 events).
Cuts     | Applied 'CVN PID Score' cut (264_421 -> 101_766 events).
Cuts     | Applied 'Containment' cut (101_766 -> 48_577 events).
Cuts     | Applied 'Cosmic Rej.' cut (48_577 -> 47_955 events).
Cuts     | Applied 'Veto' cut (47_955 -> 47_847 events).


In [None]:
x_test_data = fail_cont_table[config['XDataCols']]
y_test_data = fail_cont_table[config['YDataCols']].to_numpy()

In [None]:
x_test_data.loc[:, scale_vars] = std_scaler.transform(x_test_data[scale_vars])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test_data.loc[:, scale_vars] = std_scaler.transform(x_test_data[scale_vars])


## Model