# Samples creation

## Initialization

In [92]:
import json
import os
import sys 
import random
import math
import numpy as np
import pandas as pd
import inspect
import os.path
import matplotlib.pyplot as plt
import heapq
import shutil
from math import atan2,pi

# The repository root is taken to be the parent of the current working
# directory; it is put on sys.path so the project packages can be imported.
project_root = os.path.abspath(os.pardir)
if project_root not in sys.path:
    sys.path.append(project_root)

print(project_root)

c:\Users\aurel\Documents\GitHub\Code_Thesis_GitHub\Code_Thesis_GitHub


In [93]:
# Same location as the parent of the working directory; appended to sys.path
# only when it is not already listed there.
module_path = os.path.abspath("..")
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)
    
from event_model import event_model as em
from validator import validator_lite as vl
import data_analysis.event_generator as eg
from visual.color_map import Colormap

In [94]:
def load_event(file_name, plot_event=False):
    """Load one event from a JSON file.

    Args:
        file_name: Path of the JSON file describing the event.
        plot_event: When true, the tracks and modules of the event are drawn
            with ``eg.plot_tracks_and_modules`` under the title
            "Loaded Event".

    Returns:
        A ``(json_data_event, modules)`` tuple: the parsed JSON content and
        the ``modules`` attribute of the ``em.event`` built from it.
    """
    # The context manager closes the file even if parsing fails.
    with open(file_name) as f:
        json_data_event = json.load(f)

    ev = em.event(json_data_event, read_tracks=True)

    modules = ev.modules
    tracks = ev.real_tracks

    if plot_event:
        eg.plot_tracks_and_modules(tracks, modules, title="Loaded Event")

    return json_data_event, modules

In [4]:
#event_file_name = "/datasets/minibias/velo_event_"
#file_name = project_root + event_file_name

#src_folder = project_root + "/datasets/minibias/"
#dst_folder = project_root + "/data_analysis/samples_datasets"

# 1. Samples definition

In [135]:
def sampling_method1(data, num_bins):
    """Print a quantile-based binning of ``data``.

    The sorted values are cut at the interior quantiles (for example the
    20th, 40th, 60th and 80th percentiles when ``num_bins`` is 5), which
    yields exactly ``num_bins`` bins. For every bin the smallest value, the
    largest value, the number of values and the values themselves are
    printed.

    Args:
        data: Non-empty sequence of numbers. It is not modified.
        num_bins: Number of bins to create; must be at least 1.

    Returns:
        None. The bins are only printed.

    Raises:
        ValueError: If ``num_bins`` is smaller than 1.
    """
    if num_bins < 1:
        raise ValueError("num_bins must be a positive integer")

    # sort once and reuse the result for both the search and the split
    ordered = np.array(sorted(data))

    # calculate the quantiles to use as bin cutoffs
    quantiles = np.percentile(ordered, np.linspace(0, 100, num_bins + 1))

    # Only the interior quantiles are cut points: cutting at the 100th
    # percentile as well would push the maximum value(s) into an extra bin.
    cut_points = np.searchsorted(ordered, quantiles[1:-1])
    bins = np.split(ordered, cut_points)

    # access each bin as a separate dataset
    for i, members in enumerate(bins):
        if len(members) == 0:
            # Heavily repeated values can make two cut points coincide.
            print(f"Bin {i}: empty len:0 \n")
            continue
        print(f"Bin {i}: {members[0], members[-1]} len:{len(members)} \n {members} \n")

In [132]:
def sampling_method2(data, num_bins):
    """Print an equal-frequency binning of ``data``.

    The sorted values are divided into exactly ``num_bins`` consecutive
    bins of ``len(data) // num_bins`` values each; the values left over by
    the integer division are added to the last bin. For every bin the
    smallest value, the largest value, the number of values and the values
    themselves are printed.

    Args:
        data: Sequence of numbers. A sorted copy is used, so the caller's
            sequence is left untouched.
        num_bins: Number of bins to create; must be at least 1.

    Returns:
        None. The bins are only printed.

    Raises:
        ValueError: If ``num_bins`` is smaller than 1.
    """
    if num_bins < 1:
        raise ValueError("num_bins must be a positive integer")

    # work on a sorted copy instead of reordering the caller's list
    ordered = sorted(data)

    # number of values per bin for a roughly equal frequency in each bin
    bin_size = len(ordered) // num_bins

    # the first num_bins - 1 bins are full-size slices ...
    bins = [
        ordered[k * bin_size:(k + 1) * bin_size]
        for k in range(num_bins - 1)
    ]
    # ... and the last bin takes everything that is left, remainder included
    bins.append(ordered[(num_bins - 1) * bin_size:])

    # access each bin as a separate dataset
    for i, members in enumerate(bins):
        if not members:
            # Happens when there are fewer values than requested bins.
            print(f"Bin {i}: empty len:0 \n")
            continue
        print(f"Bin {i}: {members[0], members[-1]} len:{len(members)} \n {members} \n")

## Minibias

In [130]:
all_events = list(range(995))
nr_events = len(all_events)

event_file_name = "/datasets/minibias/velo_event_"
file_name = project_root + event_file_name

# One entry per event is appended to each of these lists.
list_max_neurons = []
list_hits_total = []
list_neurons_total = []

for event_id in all_events[:nr_events]:
    json_data_event, modules = load_event(f"{file_name}{event_id}.json", plot_event=False)

    # number of hits on every module, and their sum over the whole event
    hit_counts = [len(module.hits()) for module in modules]
    list_hits_total.append(sum(hit_counts))

    # product of the hit counts of each pair of consecutive modules
    # (called "neurons" in this notebook)
    neuron_count = [
        current * following
        for current, following in zip(hit_counts, hit_counts[1:])
    ]
    list_neurons_total.append(sum(neuron_count))

    # largest product found in this event
    list_max_neurons.append(max(neuron_count))


In [136]:
# Print the bins sampling_method1 builds from list_max_neurons (minibias section), requesting 5 bins.
sampling_method1(list_max_neurons, 5)

Bin 0: (4, 713) len:199 
 [  4   8  10  20  25  30  30  36  36  49  49  55  56  56  66  66  70  72
  84  91  91  91  96  99  99 104 108 112 117 119 120 120 120 121 121 128
 130 130 132 132 132 132 143 143 147 150 154 160 170 180 182 182 182 192
 196 200 210 225 238 240 240 242 243 247 252 252 252 255 264 266 272 272
 280 280 280 285 288 289 304 304 312 320 322 323 323 330 336 336 338 342
 345 345 357 360 360 360 363 368 378 380 391 396 400 414 418 420 420 425
 435 437 440 441 441 456 459 460 462 462 462 480 483 483 486 500 500 504
 504 504 504 504 506 506 506 513 513 518 522 522 525 525 525 528 528 528
 528 540 540 550 561 567 572 572 572 572 575 576 589 594 594 598 598 598
 600 600 608 616 616 616 616 620 624 624 624 627 630 630 640 640 640 644
 648 650 651 651 660 672 672 675 676 690 690 690 693 696 700 702 702 704
 713] 

Bin 1: (714, 1620) len:199 
 [ 714  714  726  728  728  728  744  744  744  744  750  756  759  760
  775  780  783  792  792  792  792  798  806  810  816  828  8

In [134]:
# Print the bins sampling_method2 builds from list_max_neurons (minibias section), requesting 5 bins.
sampling_method2(list_max_neurons, 5)

Bin 0: (4, 713) len:199 
 [4, 8, 10, 20, 25, 30, 30, 36, 36, 49, 49, 55, 56, 56, 66, 66, 70, 72, 84, 91, 91, 91, 96, 99, 99, 104, 108, 112, 117, 119, 120, 120, 120, 121, 121, 128, 130, 130, 132, 132, 132, 132, 143, 143, 147, 150, 154, 160, 170, 180, 182, 182, 182, 192, 196, 200, 210, 225, 238, 240, 240, 242, 243, 247, 252, 252, 252, 255, 264, 266, 272, 272, 280, 280, 280, 285, 288, 289, 304, 304, 312, 320, 322, 323, 323, 330, 336, 336, 338, 342, 345, 345, 357, 360, 360, 360, 363, 368, 378, 380, 391, 396, 400, 414, 418, 420, 420, 425, 435, 437, 440, 441, 441, 456, 459, 460, 462, 462, 462, 480, 483, 483, 486, 500, 500, 504, 504, 504, 504, 504, 506, 506, 506, 513, 513, 518, 522, 522, 525, 525, 525, 528, 528, 528, 528, 540, 540, 550, 561, 567, 572, 572, 572, 572, 575, 576, 589, 594, 594, 598, 598, 598, 600, 600, 608, 616, 616, 616, 616, 620, 624, 624, 624, 627, 630, 630, 640, 640, 640, 644, 648, 650, 651, 651, 660, 672, 672, 675, 676, 690, 690, 690, 693, 696, 700, 702, 702, 704, 713] 

Bin

## Bsphiphi

In [138]:
all_events = list(range(1000))
nr_events = len(all_events)

event_file_name = "/datasets/bsphiphi/velo_event_"
file_name = project_root + event_file_name

# One entry per event is appended to each of these lists.
list_max_neurons = []
list_hits_total = []
list_neurons_total = []

for event_id in all_events[:nr_events]:
    json_data_event, modules = load_event(f"{file_name}{event_id}.json", plot_event=False)

    # number of hits on every module, and their sum over the whole event
    hit_counts = [len(module.hits()) for module in modules]
    list_hits_total.append(sum(hit_counts))

    # product of the hit counts of each pair of consecutive modules
    # (called "neurons" in this notebook)
    neuron_count = [
        current * following
        for current, following in zip(hit_counts, hit_counts[1:])
    ]
    list_neurons_total.append(sum(neuron_count))

    # largest product found in this event
    list_max_neurons.append(max(neuron_count))

In [139]:
# Print the bins sampling_method1 builds from list_max_neurons (bsphiphi section), requesting 5 bins.
sampling_method1(list_max_neurons, 5)

Bin 0: (90, 1419) len:200 
 [  90  105  110  117  154  154  156  156  160  209  221  272  322  324
  330  336  336  368  378  384  396  408  408  414  420  420  432  432
  432  459  460  475  476  480  486  500  504  506  506  506  510  525
  528  529  540  544  552  570  572  572  575  580  621  621  621  638
  644  646  648  660  667  667  667  690  693  702  702  720  720  728
  729  736  750  750  754  756  756  759  768  775  780  780  783  783
  783  784  784  805  805  806  812  812  816  840  840  841  851  864
  864  868  870  875  884  884  896  896  902  925  928  930  936  950
  952  957  957  957  957  961  975  980  980  984  986  986  986  990
  990 1015 1015 1023 1023 1025 1050 1050 1050 1056 1056 1056 1064 1073
 1073 1075 1080 1085 1085 1085 1085 1088 1088 1107 1110 1110 1120 1120
 1122 1122 1131 1140 1140 1147 1148 1148 1155 1173 1188 1188 1190 1190
 1209 1224 1225 1232 1240 1247 1260 1260 1260 1271 1276 1280 1280 1292
 1292 1312 1312 1326 1330 1330 1330 1332 1334 136

In [140]:
# Print the bins sampling_method2 builds from list_max_neurons (bsphiphi section), requesting 5 bins.
sampling_method2(list_max_neurons, 5)

Bin 0: (90, 1419) len:200 
 [90, 105, 110, 117, 154, 154, 156, 156, 160, 209, 221, 272, 322, 324, 330, 336, 336, 368, 378, 384, 396, 408, 408, 414, 420, 420, 432, 432, 432, 459, 460, 475, 476, 480, 486, 500, 504, 506, 506, 506, 510, 525, 528, 529, 540, 544, 552, 570, 572, 572, 575, 580, 621, 621, 621, 638, 644, 646, 648, 660, 667, 667, 667, 690, 693, 702, 702, 720, 720, 728, 729, 736, 750, 750, 754, 756, 756, 759, 768, 775, 780, 780, 783, 783, 783, 784, 784, 805, 805, 806, 812, 812, 816, 840, 840, 841, 851, 864, 864, 868, 870, 875, 884, 884, 896, 896, 902, 925, 928, 930, 936, 950, 952, 957, 957, 957, 957, 961, 975, 980, 980, 984, 986, 986, 986, 990, 990, 1015, 1015, 1023, 1023, 1025, 1050, 1050, 1050, 1056, 1056, 1056, 1064, 1073, 1073, 1075, 1080, 1085, 1085, 1085, 1085, 1088, 1088, 1107, 1110, 1110, 1120, 1120, 1122, 1122, 1131, 1140, 1140, 1147, 1148, 1148, 1155, 1173, 1188, 1188, 1190, 1190, 1209, 1224, 1225, 1232, 1240, 1247, 1260, 1260, 1260, 1271, 1276, 1280, 1280, 1292, 1292, 1

## 2. Creation of the subsets

Sampling method 2 was selected for this research because it allows each subset to contain at least 10 events.

In [141]:
def meets_characteristic(json_data_event, modules, min, max):
    """Tell whether an event's largest neuron count lies in ``(min, max]``.

    The neuron count of two consecutive modules is the product of their hit
    counts; the largest of these products is compared with the bounds. An
    event with fewer than two modules has no such pair and is treated as
    having a largest count of 0.

    Args:
        json_data_event: Parsed JSON of the event. It is not used by the
            check and is only accepted to keep the call signature.
        modules: Sequence of objects exposing a ``hits()`` method.
        min: Exclusive lower bound.
        max: Inclusive upper bound.

    Returns:
        True when ``min < largest neuron count <= max``, False otherwise.
    """
    hit_counts = [len(module.hits()) for module in modules]
    neuron_count = [
        current * following
        for current, following in zip(hit_counts, hit_counts[1:])
    ]

    # The builtin max() is shadowed by the ``max`` parameter, hence heapq.
    largest = heapq.nlargest(1, neuron_count)
    max_neurons = largest[0] if largest else 0

    return min < max_neurons <= max

In [142]:
def create_characteristic_dataset(dataset, nr_events, min, max):
    """Copy the events of a dataset that fall in a neuron-count range.

    Events ``velo_event_0.json`` to ``velo_event_<nr_events - 1>.json`` are
    read from ``<project_root>/datasets/<dataset>/``. Every event accepted by
    ``meets_characteristic`` for the bounds ``min`` and ``max`` is copied to
    ``<project_root>/datasets/samples/<dataset>/Samples_<min>_to_<max>_neurons``.

    Args:
        dataset: Name of the dataset folder, e.g. "minibias".
        nr_events: Number of event files to inspect, starting at index 0.
        min: Lower bound forwarded to ``meets_characteristic``.
        max: Upper bound forwarded to ``meets_characteristic``.

    Returns:
        None.
    """
    # Build the paths with os.path.join so they are valid on every platform;
    # hard-coded backslashes only work on Windows.
    destination_folder = os.path.join(
        project_root,
        "datasets",
        "samples",
        dataset,
        f"Samples_{min}_to_{max}_neurons",
    )
    # Create missing parent folders too, and accept an existing folder so
    # that the cell can be run again.
    os.makedirs(destination_folder, exist_ok=True)

    source_folder = os.path.join(project_root, "datasets", dataset)

    for i in range(nr_events):
        src_file = os.path.join(source_folder, f"velo_event_{i}.json")
        json_data_event, modules = load_event(src_file, plot_event=False)

        if meets_characteristic(json_data_event, modules, min, max):
            dst_file = os.path.join(destination_folder, os.path.basename(src_file))
            shutil.copy(src_file, dst_file)
    

In [97]:
dataset = "minibias"
# Consecutive edges define the five neuron-count ranges, one subset each
# (presumably taken from the binning output above - confirm).
neuron_edges = [0, 713, 1620, 2784, 4492, 22436]
for lower_edge, upper_edge in zip(neuron_edges, neuron_edges[1:]):
    create_characteristic_dataset(dataset, 995, lower_edge, upper_edge)


In [143]:
dataset = "bsphiphi"
# Consecutive edges define the five neuron-count ranges, one subset each
# (presumably taken from the binning output above - confirm).
neuron_edges = [0, 1419, 2500, 3900, 6084, 40145]
for lower_edge, upper_edge in zip(neuron_edges, neuron_edges[1:]):
    create_characteristic_dataset(dataset, 1000, lower_edge, upper_edge)