## _PandaML_
- Perform detailed _Exploratory Data Analysis_ (EDA) on STT Data.

In [1]:
import sys, os, glob, yaml

In [2]:
import math
import random
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
import trackml.dataset
import seaborn as sns

In [4]:
# Make the local 'src' directory importable. Guarded so that re-running this
# cell does not keep appending duplicate entries to sys.path.
if 'src' not in sys.path:
    sys.path.append('src')

### _Dataset_

In [5]:
# Dataset directory: it is listed for '*-hits.csv' files and handed to the
# event reader further down.
input_dir = './data_sets/ctd2022p/data_10K'

In [6]:
# Find All Input Data Files (hits.csv, cells.csv, particles.csv, truth.csv)
all_files = os.listdir(input_dir)

# Extract File Prefixes (use e.g. xxx-hits.csv): keep only the names that end
# in the hits suffix and cut exactly that trailing suffix off. Slicing is used
# rather than str.replace(), which would also delete any earlier occurrence of
# the suffix text inside a file name.
suffix = '-hits.csv'
file_prefixes = sorted(os.path.join(input_dir, f[:-len(suffix)])
                       for f in all_files if f.endswith(suffix))

In [7]:
# Report how many event prefixes were collected.
n_events = len(file_prefixes)
print("Number of Events Loaded: ", n_events)

Number of Events Loaded:  10000


In [8]:
# Read the first event of the sorted prefix list with the trackml loader and
# unpack the four tables it returns.
event_prefix = file_prefixes[0]
first_event_tables = trackml.dataset.load_event(event_prefix)
hits, tubes, particles, truth = first_event_tables

In [9]:
# hits.head()

In [10]:
# tubes.head()

In [11]:
# particles.head()

In [12]:
# truth.head()

### _Read Event_

In [13]:
from src import SttCSVReader, Draw_Reader_Event

In [14]:
# Build the event reader on the dataset directory.
# NOTE(review): the meaning of the two positional True flags is not visible
# here — confirm against the SttCSVReader signature.
reader = SttCSVReader(input_dir, True, True)

In [15]:
# Fetch one event through the reader (argument 0 is presumably the event
# index — confirm against SttCSVReader.__call__).
data = reader(0)

In [16]:
# Preview the first rows of the `event` table attached to the fetched event.
data.event.head()

Unnamed: 0,hit_id,x,y,z,volume_id,layer,module_id,event_id,isochrone,skewed,...,start_time,primary,pt,peta,r,phi,eta,r3,absZ,tpt
0,1,-7.87217,-14.645,35.0,9,0,46,0,0.117062,0,...,0.203417,1,0.106593,0.199899,16.626699,-2.064018,1.489651,38.748512,35.0,0.088239
1,2,-7.87217,-15.655,35.0,9,1,153,0,0.418876,0,...,0.203417,1,0.106593,0.199899,17.522844,-2.036724,1.442469,39.141411,35.0,0.088131
2,3,-8.74686,-16.16,35.0,9,2,265,0,0.304087,0,...,0.203417,1,0.106593,0.199899,18.375341,-2.066909,1.400198,39.530407,35.0,0.087973
3,4,-8.74686,-17.17,35.0,9,3,384,0,0.090621,0,...,0.203417,1,0.106593,0.199899,19.269573,-2.041957,1.358346,39.95393,35.0,0.087953
4,5,-8.74686,-18.18,35.0,9,4,509,0,0.155938,0,...,0.203417,1,0.106593,0.199899,20.174736,-2.01923,1.318353,40.398266,35.0,0.08787


### Histograms

In [18]:
import ROOT

Welcome to JupyROOT 6.26/08


In [43]:
# A single 600 x 600 canvas; it is shared by every histogram drawn afterwards.
c1 = ROOT.TCanvas("c1", "Histograms", 600, 600)



In [45]:
# h1 and h2 are filled with a per-event particle count, i.e. with integers.
# Use one unit-width bin centred on each integer 1..10: the former 100 bins
# over [1, 10) were 0.09 wide, so only about every 11th bin could ever be
# populated and a count of exactly 10 ended up in the overflow bin.
h1 = ROOT.TH1F("h1", "Number of Particles with pt > 50 MeV", 10, 0.5, 10.5)
h2 = ROOT.TH1F("h2", "Number of Particles with pt > 50 MeV and nhits > 5", 10, 0.5, 10.5)
for hist in (h1, h2):
    hist.GetXaxis().SetTitle("particles per event")
    hist.GetYaxis().SetTitle("events")

# 2D hit map: 1000 x 1000 bins over [-40, 40] in both coordinates.
h3 = ROOT.TH2F("h3", "Hit Distribution", 1000, -40, 40, 1000, -40, 40)
h3.GetXaxis().SetTitle("x")
h3.GetYaxis().SetTitle("y")



In [20]:
# Start from empty histograms so that re-running this cell does not add the
# same events a second time.
h1.Reset()
h2.Reset()

for i in range(len(file_prefixes)):

    if i != 0 and i%1000 == 0:
        print("Processed Events:", i)

    event = reader(i)

    # Left-join the truth rows onto the particle table. A particle id may then
    # appear on several rows, hence the np.unique() when counting below.
    truth_particles = event.particles.merge(event.truth, on='particle_id', how='left')

    # particles with pt > 50 MeV
    pt_selected = truth_particles[truth_particles.pt > 0.05]
    h1.Fill(np.unique(pt_selected.particle_id).size)

    # of those, particles with nhits > 5 (matches the title of h2)
    hits_selected = pt_selected[pt_selected.nhits > 5]
    h2.Fill(np.unique(hits_selected.particle_id).size)

Processed Events: 1000
Processed Events: 2000
Processed Events: 3000
Processed Events: 4000
Processed Events: 5000
Processed Events: 6000
Processed Events: 7000
Processed Events: 8000
Processed Events: 9000


In [21]:
%jsroot on

In [22]:
# Draw h1, show the shared canvas and export it as a PDF.
# NOTE(review): the file is named "c1.pdf", whereas the h2 and h3 cells name
# their output after the histogram (h2.pdf, h3.pdf) — confirm this is intended.
h1.Draw()
c1.Draw()
c1.SaveAs("c1.pdf")

In [41]:
# Draw h2, show the shared canvas and export it as a PDF.
h2.Draw()
c1.Draw()
c1.SaveAs("h2.pdf")

In [None]:
# Start from an empty histogram so that re-running this cell does not count
# every hit twice.
h3.Reset()

for i in range(len(file_prefixes)):

    if i != 0 and i%1000 == 0:
        print("Processed Events:", i)

    event = reader(i)

    # FillN reads plain arrays of doubles, so hand it contiguous float64
    # buffers of the hit coordinates.
    xvals = np.ascontiguousarray(event.event.x.values, dtype=np.float64)
    yvals = np.ascontiguousarray(event.event.y.values, dtype=np.float64)
    if xvals.size == 0:
        continue  # nothing to fill for an event without hits

    # One bulk fill per event instead of one Python-level Fill() call per hit;
    # the weights are given explicitly, one unit weight per hit.
    h3.FillN(xvals.size, xvals, yvals, np.ones(xvals.size))

In [None]:
# Draw the 2D hit map as a colour plot, show the shared canvas and export it
# as a PDF.
h3.Draw("COLZ")
c1.Draw()
c1.SaveAs("h3.pdf");