In [1]:
!pip install uproot awkward 
from uproot_io import Events, View
import numpy as np
import matplotlib as plt
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
import os

file_path = "/home/jovyan/CheatedRecoFile_5.root"

# Check if the file exists
if os.path.exists(file_path):
    print(f"File exists: {file_path}")
    print(f"File size: {os.path.getsize(file_path)} bytes")
else:
    print(f"File does not exist: {file_path}")

events_unseen = Events("CheatedRecoFile_5.root")


from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

File exists: /home/jovyan/CheatedRecoFile_5.root
File size: 583465641 bytes


In [2]:
''' Step Functions '''

# will merely rewrite the functions to use 3d reconstructions. Will evaluate different functions later.

def step_length_3d(events, event_idx):
    # find all info for the feature
    x = events.reco_hits_3d_x[event_idx]
    y = events.reco_hits_3d_y[event_idx]
    z = events.reco_hits_3d_z[event_idx]
    
    x_vtx = events.neutrino_vtx_3d_x[event_idx]
    y_vtx = events.neutrino_vtx_3d_y[event_idx]
    z_vtx = events.neutrino_vtx_3d_z[event_idx]

    if len(x) < 3:
        print(f'Warning: Event {event_idx} has only {len(x)} hits.')
        return None

    if events.is_true_track[event_idx]:
        print('Warning: Event is a true track')

    # finding step length
    x_step = min([abs(t - x_vtx) for t in x])
    y_step = min([abs(t - y_vtx) for t in y])
    z_step = min([abs(t - z_vtx) for t in z])
    step_length = np.sqrt(x_step**2 + y_step**2 + z_step**2)

    return step_length

def adc_step_prod_3d(events, event_idx):
    # find all info for the feature
    x = events.reco_hits_3d_x[event_idx]
    y = events.reco_hits_3d_y[event_idx]
    z = events.reco_hits_3d_z[event_idx]
    
    x_vtx = events.neutrino_vtx_3d_x[event_idx]
    y_vtx = events.neutrino_vtx_3d_y[event_idx]
    z_vtx = events.neutrino_vtx_3d_z[event_idx]
    
    adcs = np.array(events.reco_adcs_w[event_idx])
    adc_avg = np.mean(adcs)

    if len(x) < 3:
        print(f'Warning: Event {event_idx} has only {len(x)} hits.')
        return None

    if events.is_true_track[event_idx]:
        print('Warning: Event is a true track')

    # finding step length
    x_step = min([abs(t - x_vtx) for t in x])
    y_step = min([abs(t - y_vtx) for t in y])
    z_step = min([abs(t - z_vtx) for t in z])
    step_length = np.sqrt(x_step**2 + y_step**2 + z_step**2)

    return step_length * adc_avg

def adc_step_div_3d(events, event_idx):
    # find all info for the feature
    x = events.reco_hits_3d_x[event_idx]
    y = events.reco_hits_3d_y[event_idx]
    z = events.reco_hits_3d_z[event_idx]
    
    x_vtx = events.neutrino_vtx_3d_x[event_idx]
    y_vtx = events.neutrino_vtx_3d_y[event_idx]
    z_vtx = events.neutrino_vtx_3d_z[event_idx]
    
    adcs = np.array(events.reco_adcs_w[event_idx])
    adc_avg = np.mean(adcs)

    if len(x) < 3:
        print(f'Warning: Event {event_idx} has only {len(x)} hits.')
        return None

    if events.is_true_track[event_idx]:
        print('Warning: Event is a true track')

    # finding step length
    x_step = min([abs(t - x_vtx) for t in x])
    y_step = min([abs(t - y_vtx) for t in y])
    z_step = min([abs(t - z_vtx) for t in z])
    step_length = np.sqrt(x_step**2 + y_step**2 + z_step**2)

    return step_length / adc_avg

In [3]:
''' New categorisation function '''

def categorise_event(events, event_idx):
    pdg = events.mc_pdg[event_idx]
    if events.is_true_track[event_idx]:
        return 'track'
    elif pdg in [11, -11]:
        return 'e'  
    elif pdg == 22:
        return 'gamma'
    else: return 'error'

In [4]:
# Get feature data

training_events = Events("CheatedRecoFile_0.root") # Train events with same data as Likelihood to ensure fairness

''' Code for identifying candidate lepton in event. '''

def identify_candidate(events):
    identifiers = events.event_number
    data = np.arange(0, len(events.event_number))
    slices = []
    start_idx = 0

    # Split data into slices based on changes in identifiers
    for i in range(1, len(identifiers)):
        if identifiers[i] != identifiers[i - 1]:
            slices.append(data[start_idx:i])
            start_idx = i  # Update start index for the next slice

    slices.append(data[start_idx:])

    results = []

    for event_number, event_indices in enumerate(slices):  # Enumerate slices to get the event number

        w_hits_event = []
        indices = []  # To keep track of the corresponding `i` values
        
        for i in event_indices:
            w_hits_event.append(events.reco_hits_w[i])
            indices.append(i)  # Store the corresponding `i` values
        
        # Find the index of the maximum length in w_hits_event
        max_idx = max(range(len(w_hits_event)), key=lambda idx: len(w_hits_event[idx]))
        
        # Retrieve the corresponding `i` value
        candidate_idx = indices[max_idx]

        results.append((event_number, candidate_idx))
    
    return results

array_0 = identify_candidate(training_events)
array_5 = identify_candidate(events_unseen)
cheated_0_candidates = [idx[1] for idx in array_0]
cheated_5_candidates = [idx[1] for idx in array_5]

shower_candidates_0 = []
shower_candidates_5 = []

for i in cheated_0_candidates:
    if training_events.mc_pdg[i] in [-11, 11, 22]:
        shower_candidates_0.append(i)

for i in cheated_5_candidates:
    if events_unseen.mc_pdg[i] in [-11, 11, 22]:
        shower_candidates_5.append(i)

def prepare_training_data(events, candidate_array):
    """Prepares the feature matrix and labels for training."""
    features = []
    labels = []

    for event_idx in candidate_array:
        # Calculate feature scores
        step1 = step_length_3d(events, event_idx)
        step2 = adc_step_prod_3d(events, event_idx)
        step3 = adc_step_div_3d(events, event_idx)

        # Ensure valid scores
        if None in [step1, step2, step3]:
            continue

        # Append feature vector
        features.append([step1, step2, step3])

        # Append label
        indicator = categorise_event(events, event_idx)
        if indicator == 'e':
            labels.append(1)
        elif indicator == 'gamma':
            labels.append(0)

    return np.array(features), np.array(labels) # Return as arrays


In [5]:
x_train, y_train = prepare_training_data(training_events, shower_candidates_0) # Training data
x_test, y_test = prepare_training_data(events_unseen, shower_candidates_5) # Testing data - with same data as Likelihood



  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)




In [6]:
''' Using AdaBoostClassifier'''
bdt = AdaBoostClassifier(n_estimators=100, random_state=0, algorithm='SAMME')
bdt.fit(x_train, y_train) # Do a BDT model on Training

# Evaluate performance on the training set
train_accuracy = bdt.score(x_train, y_train)
print(f"Training Accuracy: {train_accuracy:.2%}")

# Evaluate performance on the test set
y_pred = bdt.predict(x_test) 
test_accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f"Test Accuracy: {test_accuracy:.2%}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classification_rep)

# Plot decision scores for the training set
train_scores = bdt.decision_function(x_train)
min_f = np.round(np.min(train_scores), 1)
max_f = np.round(np.max(train_scores), 1)
bins = np.linspace(min_f, max_f, 4 * int((max_f - min_f) / 0.1) + 1)

cls_0_scores = train_scores[np.where(y_train == 0)]
cls_1_scores = train_scores[np.where(y_train == 1)]

weights_0 = np.ones_like(cls_0_scores) / len(cls_0_scores)
weights_1 = np.ones_like(cls_1_scores) / len(cls_1_scores)

plt.hist(cls_0_scores, color='r', weights=weights_0, bins=bins, histtype='step', label='Class 0')
plt.hist(cls_1_scores, color='b', weights=weights_1, bins=bins, histtype='step', label='Class 1')
plt.legend()
plt.title("Decision Scores for Training Set")
plt.show()

# Visualise classification on the training set
titlesize = 30
labelsize = 20
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 10))

# Ground truth
ax = axes[0]
ax.tick_params(axis='x', labelsize=labelsize)
ax.tick_params(axis='y', labelsize=labelsize)
cls_0 = np.where(y_train == 0)
ax.scatter(x_train[cls_0, 0], x_train[cls_0, 1], c="r", label="Class 0")
cls_1 = np.where(y_train == 1)
ax.scatter(x_train[cls_1, 0], x_train[cls_1, 1], c="b", label="Class 1")
ax.set_xlabel("Feature 1", fontsize=labelsize)
ax.set_ylabel("Feature 2", fontsize=labelsize)
ax.set_title("Training Set Truth", fontsize=titlesize)
ax.legend(fontsize=labelsize)

# Predictions
ax = axes[1]
pred = bdt.predict(x_train)
ax.tick_params(axis='x', labelsize=labelsize)
ax.tick_params(axis='y', labelsize=labelsize)
cls_0 = np.where(pred == 0)
ax.scatter(x_train[cls_0, 0], x_train[cls_0, 1], c="r", label="Class 0")
cls_1 = np.where(pred == 1)
ax.scatter(x_train[cls_1, 0], x_train[cls_1, 1], c="b", label="Class 1")
ax.set_xlabel("Feature 1", fontsize=labelsize)
ax.set_ylabel("Feature 2", fontsize=labelsize)
ax.set_title("Training Set Classification", fontsize=titlesize)
ax.legend(fontsize=labelsize)

fig.tight_layout()
plt.show()

# Visualise prediction correctness
plt.figure(figsize=(12, 12))
correct = np.where(pred == y_train)
plt.scatter(x_train[correct, 0], x_train[correct, 1], c="k", label="Correct")
incorrect = np.where(pred != y_train)
plt.scatter(x_train[incorrect, 0], x_train[incorrect, 1], c="r", label="Incorrect")
plt.xlabel("Feature 1", fontsize=labelsize)
plt.ylabel("Feature 2", fontsize=labelsize)
plt.title("Training Set Prediction Correctness", fontsize=titlesize)
plt.legend(fontsize=labelsize)
plt.show()

Training Accuracy: 87.56%


ValueError: Found input variables with inconsistent numbers of samples: [4768, 4769]