This notebook has the same content as the single-run k-means file; the Jupyter notebook format makes it easier to run and manage.

In [None]:
import numpy as np 
import matplotlib.pyplot as plt 
import torch
import csv
import pandas as pd  
from scipy import stats as st
from sklearn.preprocessing import MinMaxScaler
from collections import Counter 

# Run configuration: fill these in before running the cells below.
# events, density and noise are joined by labelmaker() into the default data file name
# (only used when filename is left empty).
events = None 
density = None 
noise = None 
# explicit data file name; when set, labelmaker() uses it instead of the events/density/noise name
filename = None 
# optional folder that labelmaker() prepends (followed by '/') to every generated file name
folder = None 
# number of clusters passed to kmeans_gpu
k = 100
# iteration cap passed to kmeans_gpu
max_iters = 500

In [None]:
def kmeans_gpu(data, k, max_iters=100):
    '''Cluster the rows of ``data`` into ``k`` groups with iterative k-means.

    The work is done on the module-level ``device`` (CUDA when available, otherwise CPU).
    The starting centroids are ``k`` rows of ``data`` picked at random, so results vary
    between runs unless the torch random seed is fixed.

    Parameters
    ----------
    data : torch.Tensor
        2-D floating point tensor with one point per row.
    k : int
        Number of clusters.
    max_iters : int
        Upper bound on the number of assign/update iterations; the loop stops earlier
        once the assignment stops changing.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The cluster index given to each row, and the ``k`` centroids.
    '''
    data = data.to(device)
    # indexing with a permutation copies the rows, so updating centroids never touches data
    centroids = data[torch.randperm(len(data))[:k]]

    ai_labels = None
    for i in range(max_iters):
        print(f"Iteration: {i+1}")
        dist = torch.cdist(data, centroids)
        new_labels = torch.argmin(dist, dim=1)

        # converged: an unchanged assignment would reproduce exactly the same centroids
        if ai_labels is not None and torch.equal(new_labels, ai_labels):
            break
        ai_labels = new_labels

        # update centroids
        for j in range(k):
            members = data[ai_labels == j]
            # the mean of an empty cluster is NaN, which would poison every later distance;
            # keep the previous centroid for that cluster instead
            if len(members) > 0:
                centroids[j] = members.mean(dim=0)

    if ai_labels is None:
        # max_iters < 1: still report the assignment to the initial centroids
        ai_labels = torch.argmin(torch.cdist(data, centroids), dim=1)

    # both tensors must be moved to host memory before converting to NumPy
    return ai_labels.cpu().numpy(), centroids.cpu().numpy()

# device used by kmeans_gpu: the GPU when torch can see one, otherwise the CPU
device = "cuda" if torch.cuda.is_available() else "cpu" 

In [None]:
def labelmaker(events, density, noise, filename = None, folder = None): 
    '''Build the five file names used for one run, following the shared naming convention.

    The base name is ``filename`` when it is given, otherwise
    ``<events>ev_<density>dense_n<noise>``. When ``folder`` is given, ``folder + '/'`` is
    put in front of every name.

    Returns (datafile, labelfile, sourcefile, ai_labelfile, centroidfile).
    '''
    # base name shared by all five files
    stem = str(filename) if filename else f"{events!s}ev_{density!s}dense_n{noise!s}"
    # an empty or missing folder means no prefix at all
    prefix = folder + '/' if folder else ''

    names = (
        stem,                        # datafile
        'labels_' + stem + '.csv',   # labelfile
        'sources_' + stem + '.csv',  # sourcefile
        stem + '_results.csv',       # ai_labelfile
        stem + '_centroids.csv',     # centroidfile
    )
    return tuple(prefix + name for name in names)

def readfiles(datafile, labelfile, sourcefile): 
    '''Load the inputs of a run that has not been through the algorithm yet.

    Reads three CSV files and returns (data, labels, sources) as NumPy arrays: the
    'x[px]', 'y[px]', 't[s]' columns of the data file, the 'labels' column of the label
    file, and the same three coordinate columns of the source file.
    '''
    xyt = ['x[px]', 'y[px]', 't[s]']

    def _column_array(path, cols):
        # read one CSV and keep only the requested column(s)
        return np.array(pd.read_csv(path)[cols])

    return (
        _column_array(datafile, xyt),
        _column_array(labelfile, 'labels'),
        _column_array(sourcefile, xyt),
    )

def readai(datafile, labelfile, sourcefile, ai_labelfile, centroidfile):
    '''Load the inputs and the stored results of a run that has been through the algorithm.

    Reads five CSV files and returns (data, labels, sources, ai_labels, centroids) as
    NumPy arrays. The data, source and centroid files contribute their 'x[px]', 'y[px]',
    't[s]' columns; the two label files contribute their 'labels' column.
    '''
    xyt = ['x[px]', 'y[px]', 't[s]']

    # (path, columns to keep), listed in the order the arrays are returned
    wanted = [
        (datafile, xyt),
        (labelfile, 'labels'),
        (sourcefile, xyt),
        (ai_labelfile, 'labels'),
        (centroidfile, xyt),
    ]
    data, labels, sources, ai_labels, centroids = [
        np.array(pd.read_csv(path)[cols]) for path, cols in wanted
    ]
    return data, labels, sources, ai_labels, centroids

Generate the data file names, read the data, and normalize it with MinMaxScaler below:

In [None]:
datafile, labelfile, sourcefile, ai_labelfile, centroidfile = labelmaker(events, density, noise, filename, folder)
# read out the files
data, labels, sources = readfiles(datafile, labelfile, sourcefile)
# establish feature range and transform data, can comment this out to turn off scaling, naming is the same
scaler = MinMaxScaler(feature_range=(0,1))
# fit the scaling on the data only ...
data = scaler.fit_transform(data)
# ... and reuse that same fitted mapping for the sources; fitting again here would stretch
# the sources to their own 0-1 range and put them in a different frame than the data
sources = scaler.transform(sources)
# turn into a tensor 
data_tensor = torch.Tensor(data)

Running k-means algorithm

In [None]:
# cluster the prepared tensor; kmeans_gpu returns the per-point cluster labels and the centroids as NumPy arrays
ai_labels, centroids = kmeans_gpu(data_tensor, k, max_iters)

Writing to the centroid and ai_label files 

In [None]:
# results file: a 'labels' header followed by one predicted cluster label per row
with open(ai_labelfile, mode = 'w', newline='') as wfile: 
    writer = csv.writer(wfile)
    writer.writerow(['labels'])
    writer.writerows([item] for item in ai_labels)

# centroid file: the coordinate header followed by one centroid per row
columns = ['x[px]', 'y[px]', 't[s]']
with open(centroidfile, mode = 'w', newline = '') as wfile: 
    writer = csv.writer(wfile)
    writer.writerows([columns, *centroids])

Breaks and sizing functions for the true label-focused loss function 

In [None]:
def breaks(array): 
    '''Return the indices at which the value of ``array`` changes, followed by ``len(array)``.

    Each returned index is the position of the first item of a new run of equal values,
    so consecutive entries mark where one run ends and the next begins; the final entry
    closes the last run. An empty ``array`` has no runs and gives an empty list.

    ``array`` must support ``len()`` and integer indexing (list, tuple, NumPy array).
    '''
    n = len(array)
    if n == 0:
        # nothing to split; previously this raised IndexError on array[0]
        return []

    # a break is any position whose item differs from the one right before it
    indices = [i for i in range(1, n) if array[i] != array[i - 1]]
    indices.append(n)

    return indices

def sizes(indices): 
    '''Turn a list of end indices into the number of items in each run: every entry minus the
    entry before it, with the first entry measured from 0'''
    # each run starts where the previous one ended; the first run starts at 0
    starts = [0, *indices[:-1]]
    return [stop - start for start, stop in zip(starts, indices)]

True Label-Focused Loss, taken from the losses file 

In [None]:
# additional function needed for the loss function 
def modded_mode(array): 
    '''Return the dominant labels of ``array``: a modified mode that keeps every label making up more than one third of the items (so there can be zero, one or two of them). An empty input gives an empty list.'''
    total = len(array)
    if total == 0: 
        return []

    # a label has to occur strictly more often than this to count as dominant
    cutoff = total/3 
    return [label for label, seen in Counter(array).items() if seen > cutoff]

# THE loss func. 

def loss(true_labels, network_labels):
    '''Score the network's labels against the true labels, one true event at a time.

    ``true_labels`` is cut into runs of equal consecutive values (one run per event) and the
    matching slice of ``network_labels`` is examined for each run. The summary is printed
    and returned as a tuple:

        event_splits      -- number of events whose slice has more than one dominant network label
        all_misidentified -- per-event fraction of photons not carrying a dominant network label
        avg_misidentified -- average of all_misidentified (0.0 when there are no events)
        mode_modes        -- {network label: count} for labels that are dominant in more than one event
        combo_frac        -- share of the dominant network labels that appear in mode_modes
        avg_ev_in_combo   -- average count in mode_modes (0.0 when no events were combined)
    '''
    all_modes = []
    event_splits = 0 
    all_misidentified = [] # fraction of misidentified photons, one entry per event

    break_indices = breaks(true_labels)
    gaps = sizes(break_indices) # sizes of each event 

    beginning = 0 
    for end, gap in zip(break_indices, gaps):
        chunk = network_labels[beginning:end]
        # the dominant network labels of this event
        chunk_modes = modded_mode(chunk)
        # master list of dominant labels; shows how often each network label dominates an event
        all_modes.extend(chunk_modes)
        if len(chunk_modes) > 1: 
            event_splits += 1 # more than one dominant label means the event was split

        chunk_modes_set = set(chunk_modes) # set for fast membership tests
        # photons that carry none of the dominant labels
        misidentified = sum(1 for item in chunk if item not in chunk_modes_set)
        all_misidentified.append(misidentified/gap)
        beginning = end # the next event starts where this one ended

    avg_misidentified = np.average(all_misidentified) if all_misidentified else 0.0
    counter = Counter(all_modes)
    # a network label that dominates more than one event means those events were combined
    mode_modes = {item: count for item, count in counter.items() if count > 1}
    # guard both ratios: a clustering with nothing combined (or no dominant labels at all)
    # used to end in a ZeroDivisionError here
    combo_frac = len(mode_modes)/len(counter) if counter else 0.0
    avg_ev_in_combo = sum(mode_modes.values())/len(mode_modes) if mode_modes else 0.0

    print(f"The fraction of misidentified photons in each event is: {all_misidentified}")
    print(f"the average fraction of misidentified photons is: {avg_misidentified}") 
    print(f"The total number of event splits is {event_splits}")
    print(f"The full list of events that were combined is: {mode_modes}, with a fraction of {combo_frac} events being combined. The average number of events involved in a combination is: {avg_ev_in_combo}")
    # the mode modes here refer to the labels given to the events by the ai, not the true labels (for reference) 
    return event_splits, all_misidentified, avg_misidentified, mode_modes, combo_frac, avg_ev_in_combo

AI Label-Focused Loss function

In [None]:
# these results should match up exactly with the previous loss function within rounding errors 

def ai_based_loss(true_labels, network_labels):
    '''This function does the same thing as the previous loss function, but does it in such a way 
    that the ai labels are ordered and the clustering is based off of these. it should give the same 
    or similar results to the original loss function

    The (network label, true label) pairs are sorted by network label, the sorted network
    labels are cut into runs of equal values (one run per ai cluster), and the true labels
    inside each run are examined.

    NOTE(review): no return or print statement is visible in this part of the file, so the
    quantities computed below are not handed back from here - confirm whether the function
    continues elsewhere or is unfinished.
    '''
    
    dom_true_labels = []
    total_combos = 0 
    ev_per_combo = 0 # divide by number of combos to get average
    fractions_misIDs = []
    total_splits = 0 
    ev_per_split = 0 # divide by number of splits to get the average 

    sorted_pairs = sorted(zip(network_labels, true_labels))
    #reordered labels are below
    reo_network_labels, reo_true_labels = zip(*sorted_pairs)

    break_indices = breaks(reo_network_labels)
    gaps = sizes(break_indices)

    beginning = 0 
    for i in range(len(gaps)):
        end = break_indices[i] # determine where the chunk ends 
        chunk = reo_true_labels[beginning:end]
        # dominant true labels in the chunk (at most 2, since each needs more than a third of it)
        chunk_modes = modded_mode(chunk)
        dom_true_labels.extend(chunk_modes) # adding to mega list of dominant labels 
        e_in_combo = len(chunk_modes)
        if e_in_combo > 1: 
            # one ai cluster dominated by several true events: those events were combined
            total_combos += 1
            ev_per_combo += e_in_combo
        
        misIDs = sum(1 for item in chunk if item not in chunk_modes)
        fractions_misIDs.append(misIDs/gaps[i])
        beginning = break_indices[i] # update the beginning of the following chunk
    
    # totals and averages 
    counter = Counter(dom_true_labels)
    all_counts = dict(counter) # amount of times each label shows up in the list 
    # a true label that dominates more than one ai cluster was split across those clusters
    repeat_labels = {item: count for item, count in counter.items() if count >1}
    total_splits = len(repeat_labels.values())
    # every ratio below is guarded: with no splits, no combos or no dominant labels the
    # plain divisions raised ZeroDivisionError
    ev_per_split = sum(repeat_labels.values())/total_splits if total_splits else 0.0 # average number of events in a split 
    frac_splits = total_splits/len(all_counts.values()) if all_counts else 0.0 # fraction of events involved in a split 

    ev_per_combo = ev_per_combo/total_combos if total_combos else 0.0 # averaging out 
    avg_misIDs = np.average(fractions_misIDs)

    



