In [1]:
!pip install uproot awkward 
!pip install xgboost
from uproot_io import Events, View
import numpy as np
import matplotlib as plt
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from scipy.spatial import ConvexHull
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb
import joblib

cheated_0 = Events('CheatedRecoFile_0.root') # training set
cheated_5 = Events("CheatedRecoFile_5.root") # testing



In [2]:
""" BDT Feature Functions  """ 

# track/shower
def correlation(events, event_idx):
    x_hits = events.reco_hits_x_w[event_idx]
    w_hits = events.reco_hits_w[event_idx] 

    # Check if there are valid hits
    if len(w_hits) == len(x_hits) and len(w_hits) > 15: # talk about advantages and disadvantages of results with a threshold 
        if np.std(x_hits) == 0 or np.std(w_hits) == 0:
            return None  # No valid correlation if there's no variation in data
        
        correlation = np.corrcoef(x_hits, w_hits)[0, 1]
        
        # Fit line using w_hits for x and calculate predicted y-values
        line_fit = np.polyfit(w_hits, x_hits, 1)
        line_y_pred = np.polyval(line_fit, w_hits)
        
        # Calculate line error between predicted and actual x_hits
        line_error = np.mean((x_hits - line_y_pred) ** 2)
        
        # Normalize scores
        correlation_score = abs(correlation) if not np.isnan(correlation) else 0
        error_score = max(0, 1 - line_error / 20) if line_error < 20 else 0
        
        # Weighted score
        line_score = (correlation_score * 0.7) + (error_score * 0.3)
        
        return (line_score * 100)  # Return the score and category

    else:
        return None

def noise(events, event_idx, eps=2, min_samples=5):
    # Extract hit positions (no PDG filtering, just use reco hits)
    x_hits = events.reco_hits_x_w[event_idx]
    w_hits = events.reco_hits_w[event_idx]

    # Check if there are valid hits
    if len(w_hits) == len(x_hits) and len(w_hits) > 15:
        # Combine the coordinates for clustering
        hits_coordinates = np.column_stack((w_hits, x_hits))

        # Apply DBSCAN clustering
        db = DBSCAN(eps=eps, min_samples=min_samples).fit(hits_coordinates)
        labels = db.labels_

        # Count noise points (labeled as -1)
        n_noise = np.sum(labels == -1)

        # Count clusters (unique labels excluding -1)
        unique_clusters = set(labels) - {-1}
        n_clusters = len(unique_clusters)

        return n_noise + n_clusters
    else:
        return None
        
def rms(events, event_idx):
    w_hits = events.reco_hits_w[event_idx]
    x_hits = events.reco_hits_x_w[event_idx]

    if len(w_hits) == len(x_hits) and len(w_hits) > 15:
        slope, intercept = np.polyfit(w_hits, x_hits, 1)
        
        actual = x_hits
        predicted = slope * w_hits + intercept
        
        meanSquaredError = ((predicted - actual) ** 2).mean()
        return np.sqrt(meanSquaredError)
    else: 
        return None 
        
def angle(events, event_idx):
    x_hits = events.reco_hits_x_w[event_idx]
    w_hits = events.reco_hits_w[event_idx]

    if len(w_hits) == len(x_hits) and len(w_hits) > 15:
        # Fit the best-fit line
        line_fit = np.polyfit(w_hits, x_hits, 1)
        line_slope = line_fit[0]
        line_intercept = line_fit[1]

        # Calculate residuals (distance from the line)
        line_y_pred = np.polyval(line_fit, w_hits)
        residuals = np.abs(x_hits - line_y_pred)

        # Find the index of the furthest point
        furthest_idx = np.argmax(residuals)
        furthest_point = np.array([x_hits[furthest_idx], w_hits[furthest_idx]])

        # Start of the line is at the minimum W-coordinate
        min_w = np.min(w_hits)
        start_point = np.array([line_slope * min_w + line_intercept, min_w])

        # End of the red line (best-fit line) at the maximum W-coordinate
        max_w = np.max(w_hits)
        end_of_red_line = np.array([line_slope * max_w + line_intercept, max_w])

        # Calculate the lengths of the three sides of the triangle
        red_line_length = np.linalg.norm(end_of_red_line - start_point)  # Distance between start and end of red line
        purple_line_length = np.linalg.norm(furthest_point - start_point)  # Distance between start and furthest point (purple line)
        third_line_length = np.linalg.norm(furthest_point - end_of_red_line)  # Distance between end of red line and furthest point (third line)

        # Using the cosine rule to calculate the angle between the red and purple lines
        cos_theta = (red_line_length**2 + purple_line_length**2 - third_line_length**2) / (2 * red_line_length * purple_line_length)
        angle_radians = np.arccos(np.clip(cos_theta, -1.0, 1.0))  # Clip value to avoid out-of-bound errors
        angle_degrees = np.degrees(angle_radians)  # Convert radians to degrees
        
        return angle_degrees
    else:
        return None
        
def line(events, event_idx):
    w_hits = np.array(events.reco_hits_w[event_idx])
    x_hits = np.array(events.reco_hits_x_w[event_idx])

    if len(w_hits) == len(x_hits) and len(w_hits) > 15:
    
        # Calculate differences between consecutive points
        dx = np.diff(w_hits)
        dy = np.diff(x_hits)
    
        # Compute segment lengths
        segment_lengths = np.sqrt(dx**2 + dy**2)
    
        # Total arc length (line integral)
        total_length = np.sum(segment_lengths)
    
        # Normalize by the number of points
        normalised_length = total_length / len(w_hits)

        return normalised_length
    else:
        return None 
        
def q4(events, event_idx):
    adcs = events.reco_adcs_w[event_idx]

    if len(adcs) > 15:

        q4_idx = len(adcs) // 4

        adcs_q4 = adcs[-q4_idx:]
    
        ratio = sum(adcs_q4) / sum(adcs)
    
    
        return ratio
    else:
       return None

# e/gamma
def step_length(events, event_idx):
    # Find all info for the feature
    w_hits = events.reco_hits_w[event_idx]
    x_hits = events.reco_hits_x_w[event_idx]
    w_vtx = events.neutrino_vtx_w[event_idx]
    x_vtx = events.neutrino_vtx_x[event_idx]

    # Skip events where there are fewer than 15 hits
    if len(w_hits) < 15:
        return None  # Return None to indicate that the feature should be skipped

    # Finding step length
    w_step = min([abs(w - w_vtx) for w in w_hits])
    x_step = min([abs(x - x_vtx) for x in x_hits])
    step_length = np.sqrt(w_step**2 + x_step**2)

    return step_length

def find_radial_density_increase(x, y, bins=50, center=None, start_radius=0, debug=False, safety_r=5): # NOT A FEATURE
    if center is None:
        center = (np.mean(x), np.mean(y))
    
    r = np.sqrt((x - center[0])**2 + (y - center[1])**2)
    
    r = r[r > start_radius]
    
    if len(r) == 0:
        return safety_r
    
    r_sorted = np.sort(r)
    
    bin_edges = np.linspace(start_radius, max(r_sorted), bins)
    counts, _ = np.histogram(r_sorted, bins=bin_edges)
    
    areas = np.pi * (bin_edges[1:]**2 - bin_edges[:-1]**2)
    densities = counts / areas
    
    diffs = np.diff(densities)
    increase_idx = np.argmax(diffs > 0)
    
    if increase_idx == 0 and diffs[0] <= 0:
        return safety_r
    
    return bin_edges[increase_idx]
def dE_dx(events, event_idx, smear=[0, 0], cone_angle=(5/6)*np.pi, debug=False):
    w_hits = events.reco_hits_w[event_idx]

    if len(w_hits) <= 15:
        if debug:
            print(f'Event {event_idx} does not meet 15 hit cutoff\n')
        return None

    def debug_print(message):
        if debug:
            print(f"    {message}")
    if debug:
        print(f"Debug for event {event_idx}")
    
    x_hits = events.reco_hits_x_w[event_idx]
    w_vtx = events.neutrino_vtx_w[event_idx]
    x_vtx = events.neutrino_vtx_x[event_idx]
    w_vtx_smeared = w_vtx + smear[0]
    x_vtx_smeared = x_vtx + smear[1]
    adcs = events.reco_adcs_w[event_idx]
    
    pdg = events.mc_pdg[event_idx]
    c_class = 'lightblue'
    event = 'ERROR: Not e/gamma'
    if pdg in [-11, 11]:
        c_class = '#EEEE00'
        event = 'Electron'
    elif pdg == 22:
        c_class = 'g'
        event = 'Photon'
    debug_print(f'Lepton: {event}')

    if np.sign(np.mean(w_hits) - w_vtx) == -1:
        w_hits = 2 * w_vtx - np.array(w_hits)
        debug_print(f'Event {event_idx} was left-facing, and has been inverted')

    theta_0 = np.arctan2(np.mean(x_hits) - x_vtx, np.mean(w_hits) - w_vtx)
    theta_u = theta_0 + (cone_angle/2)
    theta_l = theta_0 - (cone_angle/2)
    debug_print(f'Angle info:\n        Yaw = {np.degrees(theta_0):.2f}° from +w\n        Upper Angle = {np.degrees(theta_u):.2f}°\n        Lower Angle: {np.degrees(theta_l):.2f}°')
    
    # You may also want to play around with cone_angle, I don't expect it to have a super huge effect for values > pi/2

    angles = np.arctan2(x_hits - x_vtx_smeared, w_hits - w_vtx_smeared)
    distance = [np.sqrt((w - w_vtx)**2 + (x - x_vtx)**2) for w, x in zip(w_hits, x_hits)]

    r_start = 5
    if len(distance) >= 5:
        r_start = sorted(distance)[4]
        
    # r_start is arbitrary, just the distance from the event vertex that the radial density algorithm will begin searching, its good for events that start with a cluster and then a gap
    # worth tweaking if you really want to
    
    testing_distance = find_radial_density_increase(w_hits, x_hits, center = (w_vtx_smeared, x_vtx_smeared), start_radius = r_start)
    branch_distance = max(r_start + 5, testing_distance)
    
    mask = (angles >= theta_l) & (angles <= theta_u) & (distance < branch_distance)
    if mask.sum() == 0:
        debug_print('ERROR: Mask is empty')
        return None

    points = np.column_stack((w_hits[mask], x_hits[mask]))
    di = np.sqrt(((points[:, None, :] - points[None, :, :]) ** 2).sum(axis=2))
    dx = np.maximum(np.max(di), 0.48)

    val = np.sum(adcs[mask]) / dx
    debug_print(f'dE = {np.sum(adcs[mask])}')
    debug_print(f'dx = {dx}')
    debug_print(f'dE/dx = {val}\n')

    return val

# booster features
def hit_count(events, event_idx, min_hits=15):
    """ Hit_count is to help candidate_lepton it increased the accuracy by around 1.5% """
    # Get all hits for the given event
    all_w_hits = events.reco_hits_w[event_idx]  # Wire plane hits
    all_x_hits = events.reco_hits_x_w[event_idx]  # X-plane hits
    
    # Count total number of hits
    num_hits = len(all_w_hits)  # Assuming same length for both planes

    # Apply filtering: Skip events with fewer than `min_hits`
    if num_hits < min_hits:
        return None  # Not enough hits
    
    return num_hits

def adc_sum(events, event_idx):
    adcs = events.reco_adcs_w[event_idx]
    return np.sum(adcs)

def hull_density(events, event_idx):
    w_hits = events.reco_hits_w[event_idx] 

    if len(w_hits) < 15:
        return None
        
    x_hits = events.reco_hits_x_w[event_idx]
    points = np.column_stack((w_hits, x_hits))
    
    hull = ConvexHull(points)
    hull_area = hull.volume
    
    hull_density = len(points) / hull_area
    return hull_density

# candidate lepton feature
def primary(events, idx_array): # NOT A FEATURE
    # find nu_vtx and particle_vtx of all indices in idx_array
    nu_vtx_w = events.neutrino_vtx_w[idx_array]
    nu_vtx_x = events.neutrino_vtx_x[idx_array]
    idx_w_vtx = events.reco_particle_vtx_w[idx_array]
    idx_x_vtx = events.reco_particle_vtx_x[idx_array]

    w_distances = idx_w_vtx - nu_vtx_w
    x_distances = idx_x_vtx - nu_vtx_x

    abs_distances = np.sqrt(w_distances**2 + x_distances**2)

    # if distances are empty
    if len(abs_distances) != 0:
        primary = np.argmin(abs_distances)
        return idx_array[primary]
    else: return None
def charged_candidate_feature(events, event_idx):
    identifiers = events.event_number
    change_indices = np.where(np.diff(identifiers) != 0)[0] + 1
    slices = np.split(np.arange(len(identifiers)), change_indices)  # Groups indices by event
    
    # Find the subarray that contains event_idx
    event_indices = next((sub for sub in slices if event_idx in sub), None)

    # If event_idx is not found, return None
    if event_indices is None:
        return None

    candidate_idx = primary(events, event_indices)  # Identify primary candidate
    
    if events.mc_pdg[candidate_idx] in [-13, 13]:
        return 2  # Primary muon detected
    elif events.mc_pdg[candidate_idx] in [-11, 11]:
        return 1  # Primary electron detected
    else: return 0 # No useful primary candidate found

In [3]:
"""
Want to get an overall look at the feature for an event
-> this function gets, for one feature, the mean, median, standard deviation, max, min and sum of the features
-> returns this as a LIST of 6 values
"""

def find_feature_stats(feature_func, events, event_number):
    identifiers = events.event_number
    change_indices = np.where(np.diff(identifiers) != 0)[0] + 1
    event_indices = np.split(np.arange(len(identifiers)), change_indices)[event_number]

    event_feature_values = [feature_func(events, i) for i in event_indices]
    event_feature_values = [v for v in event_feature_values if v is not None]

    # If event_feature_values is empty after filtering, return [None] * 6
    if len(event_feature_values) == 0:
        return [None] * 6
        
    stats = [
        np.mean(event_feature_values),
        np.median(event_feature_values),
        np.std(event_feature_values),
        np.max(event_feature_values),
        np.min(event_feature_values),
        np.sum(event_feature_values),
    ]

    return stats

from tqdm import tqdm

def get_x_data(feature_func_array, events): # calculates feature data for input into a bdt.
    n = len(np.unique(events.event_number))
    x_data = []

    # Use tqdm to add a progress bar to the loop
    for event_number in tqdm(range(n), desc="Processing Events", unit="event"):
        x = []
        for func in feature_func_array:
            stats = find_feature_stats(func, events, event_number)
            x.extend(stats)

        x_data.append(x)  # Append to x_data

    return np.array(x_data)

In [4]:
"""
Now we need to label the data
The BDT takes in 12 (features) * 6 (stats) values, 
=> each event_number will have 72 features

Conventionally label events as so for y_train data:
    - 2 (CCnu_mu)
    - 1 (CCnu_e)
    - 0 (NCnu_x)
"""

def label_events(events):
    identifiers = events.event_number
    change_indices = np.where(np.diff(identifiers) != 0)[0] + 1
    first_idx = np.array([arr[0] for arr in np.split(np.arange(len(identifiers)), change_indices)])  # Fix here

    numu = events.is_numu[first_idx]
    nue = events.is_nue[first_idx]
    
    y_data = np.zeros_like(first_idx)
    y_data[numu] = 2
    y_data[nue] = 1
    
    return y_data

In [5]:
"""
The BDT:
    1. create the feature_func_array
    2. get x_train and y_train, x_test, y_test
    3. hyperparametrise the bdt
    4. save the bdt
"""
features = [ # Step 1
    correlation,
    noise,
    rms,
    angle,
    line,
    q4,
    step_length,
    dE_dx,
    hit_count,
    adc_sum,
    hull_density,
    charged_candidate_feature
]

In [6]:
"""
Loading the Training and Testing Data
"""

trained_data = np.load("training_data_72f.npz", allow_pickle=True)
testing_data = np.load("testing_data_72f.npz", allow_pickle=True)

x_trained = trained_data["x_train"]
y_trained = trained_data["y_train"]

x_testing = testing_data["x_test"]
y_testing = testing_data["y_test"]

EOFError: No data left in file

In [None]:
""" Step 3 """

param_grid = {
    'n_estimators': [50, 100, 200, 500],  # Number of trees
    'max_depth': [3, 5, 7, 10],  # Tree depth
    'learning_rate': [0.001, 0.01, 0.1, 0.3],  # Step size shrinkage
    'subsample': [0.6, 0.8, 1.0],  # Row sampling per tree
    'colsample_bytree': [0.6, 0.8, 1.0],  # Feature sampling per tree
    'gamma': [0, 0.1, 0.2, 0.5],  # Minimum loss reduction for split
    'reg_lambda': [0, 1, 10],  # L2 regularization
    'reg_alpha': [0, 1, 10]  # L1 regularization
}

In [None]:
""" Step 4 """

bdt = xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss")

# Use Randomized Search for hyperparameter tuning
random_search = RandomizedSearchCV(
    estimator=bdt,
    param_distributions=param_grid,
    n_iter=50,  # Number of random samples to try
    scoring='accuracy',  # Optimize for classification accuracy
    cv=5,  # 5-fold cross-validation
    verbose=2,
    n_jobs=-1,  # Use all available CPU cores
    random_state=42
)

# Train with hyperparameter tuning
random_search.fit(x_trained, y_trained)

# Get the best model and its parameters
best_xgb = random_search.best_estimator_
print("Best hyperparameters found: ", random_search.best_params_)