## _PandaML_
- Perform detailed _Exploratory Data Analysis (EDA)_ on STT Data.
- Use `Dask` instead of `pandas` to load all CSVs at once.

In [1]:
import sys, os, glob, yaml

In [2]:
import math
import random
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
import trackml.dataset
import seaborn as sns

In [4]:
# Put ./src (resolved against the current working directory) on the module
# search path so modules inside it can be imported by bare name.
# NOTE(review): the imports below use the `src.` package prefix, which relies
# on the working directory itself being importable — confirm this line is needed.
sys.path.append('src')

In [5]:
from src.utils_data import dask_events

### _Dataset_

In [6]:
# Directory containing the per-event CSV files (the "*-hits.csv" files listed
# below); the path is relative to the notebook's working directory.
input_dir = './data_sets/ctd2022p/data_10K/'

In [7]:
# Build one path prefix per event. An event is identified by its
# "<prefix>-hits.csv" file; the prefix (directory + event name) is the form
# passed to trackml.dataset.load_event in the commented-out cell below.
hits_suffix = '-hits.csv'
all_files = os.listdir(input_dir)

# Strip only the trailing suffix (str.replace would also remove any earlier
# occurrence inside the file name) and sort for a reproducible event order.
file_prefixes = sorted(
    os.path.join(input_dir, f[:-len(hits_suffix)])
    for f in all_files
    if f.endswith(hits_suffix)
)

In [8]:
# load an event
# hits, tubes, particles, truth = trackml.dataset.load_event(file_prefixes[0])

In [9]:
# tubes.head()

In [10]:
# particles.head()

In [11]:
# truth.head()

### _Read Event_

In [12]:
from src import SttCSVReader, Draw_Reader_Event

In [13]:
# Event reader over input_dir; calling it with an integer index returns one event.
# NOTE(review): the two positional booleans are opaque at this call site —
# presumably the noise / skewed switches seen in the commented Compose_Event
# call further down; confirm against SttCSVReader's signature.
reader = SttCSVReader(input_dir, True, True)

In [14]:
# Load the event at index 0 as a sample; its per-hit table is exposed as data.event.
data = reader(0)

In [15]:
# Show which columns the event table provides (x/y/z, tx/ty/tz, pt, nhits,
# particle_id, ... are used by the histogram loop below).
data.event.columns

Index(['hit_id', 'x', 'y', 'z', 'volume_id', 'layer', 'module_id', 'event_id',
       'isochrone', 'skewed', 'sector_id', 'tx', 'ty', 'tz', 'tpx', 'tpy',
       'tpz', 'weight', 'particle_id', 'vx', 'vy', 'vz', 'px', 'py', 'pz', 'q',
       'nhits', 'pdgcode', 'start_time', 'primary', 'pt', 'peta', 'r', 'phi',
       'eta', 'r3', 'absZ', 'tpt'],
      dtype='object')

In [16]:
# data.event.head()

In [17]:
# event = Compose_Event(file_prefixes[1], noise=False, skewed=True)

### _Histograms_

In [18]:
import ROOT

Welcome to JupyROOT 6.26/08


In [19]:
# 600x500 canvas shared by all 1D histograms drawn below (the 2D hit map
# later gets its own canvas, c2).
c1 = ROOT.TCanvas("c1", "Histograms", 600,500)  # reused for every 1D plot

In [20]:
# Hit Distributions
# h1 / h2: filled once per event in the loop below with the number of unique
# particle ids passing the cut named in the title; 100 bins on [1, 10).
# NOTE(review): the filled values are integer counts, so with 0.09-wide bins
# most bins can never be populated, and counts of 0 or >= 10 end up in
# under/overflow — confirm this binning is intended.
h1 = ROOT.TH1F("h1", "Number of Particles with pt > 50 MeV", 100, 1, 10)
h2 = ROOT.TH1F("h2", "Number of Particles with pt > 50 MeV and nhits > 5", 100, 1, 10)
# h3: hit x vs. y, filled once per hit; 1000 x 1000 bins on [-40, 40) per axis.
h3 = ROOT.TH2F("h3", "Hit Distribution", 1000, -40, 40, 1000, -40, 40)

In [21]:
# Position Resolutions
# Filled once per hit in the loop below with (coord - truth_coord) / truth_coord
# for x, y and z respectively; 100 bins on [-10, 10), anything outside goes
# to under/overflow.
hrx = ROOT.TH1F("hrx", "X Position Resolution", 100, -10, 10)
hry = ROOT.TH1F("hry", "Y Position Resolution", 100, -10, 10)
hrz = ROOT.TH1F("hrz", "Z Position Resolution", 100, -10, 10)

In [None]:
# Fill every histogram in a single pass over the events.
# file_prefixes is used only for the event count; the events themselves are
# fetched by index through `reader`.
for i in range(len(file_prefixes)):

    # Progress message every 1000 events (not for the very first one).
    if i != 0 and i % 1000 == 0:
        print("Processed Events:", i)

    data = reader(i)
    event = data.event

    # Multiplicity of distinct particle ids among rows with pt > 50 MeV.
    truth_particles = event[event.pt > 0.05]
    h1.Fill(np.unique(truth_particles.particle_id).size)

    # Same selection, additionally requiring nhits > 5 (matches h2's title).
    truth_particles = truth_particles[truth_particles.nhits > 5]
    h2.Fill(np.unique(truth_particles.particle_id).size)

    xvals = event.x.values
    yvals = event.y.values

    # Relative residual of each coordinate w.r.t. its truth value.
    # NOTE(review): dividing by the truth coordinate makes the value undefined
    # (inf/NaN) when tx/ty/tz is 0 — confirm a relative rather than absolute
    # residual is what the "resolution" plots should show.
    r_x = ((event.x - event.tx) / event.tx).values
    r_y = ((event.y - event.ty) / event.ty).values
    r_z = ((event.z - event.tz) / event.tz).values

    # One entry per hit; all five arrays have one element per row of `event`.
    for hit_x, hit_y, res_x, res_y, res_z in zip(xvals, yvals, r_x, r_y, r_z):
        h3.Fill(hit_x, hit_y)
        hrx.Fill(res_x)
        hry.Fill(res_y)
        hrz.Fill(res_z)

In [None]:
%jsroot on

### _Histograms_

In [None]:
# Draw() paints onto the *current* pad, so select c1 explicitly; otherwise the
# histogram ends up on another canvas whenever one was created or selected
# in between (e.g. re-running this cell after c2 exists).
c1.cd()
h1.Draw()
c1.Draw()
c1.SaveAs("h1.pdf")

In [None]:
# Draw() paints onto the *current* pad, so select c1 explicitly; otherwise the
# histogram ends up on another canvas whenever one was created or selected
# in between (e.g. re-running this cell after c2 exists).
c1.cd()
h2.Draw()
c1.Draw()
c1.SaveAs("h2.pdf")

In [None]:
# Separate square canvas for the 2D hit map; a freshly created canvas becomes
# the current pad, so h3 is drawn onto it.
c2 = ROOT.TCanvas("c2", "Histograms", 600, 600)

# "COLZ": colour-coded bin contents with a palette axis.
h3.Draw("COLZ")
c2.Draw()
c2.SaveAs("h3.pdf")

### _Position Resolutions_

In [None]:
# Creating c2 above made it the current pad, and Draw() always targets the
# current pad. Switch back to c1 first; otherwise hrx is painted on c2 and
# hrx.pdf would contain whatever c1 showed last.
c1.cd()
hrx.Draw()
c1.Draw()
c1.SaveAs("hrx.pdf")

In [None]:
# Creating c2 above made it the current pad, and Draw() always targets the
# current pad. Select c1 first; otherwise hry is painted on c2 and hry.pdf
# would contain whatever c1 showed last.
c1.cd()
hry.Draw()
c1.Draw()
c1.SaveAs("hry.pdf")

In [None]:
# Creating c2 above made it the current pad, and Draw() always targets the
# current pad. Select c1 first; otherwise hrz is painted on c2 and hrz.pdf
# would contain whatever c1 showed last.
c1.cd()
hrz.Draw()
c1.Draw()
c1.SaveAs("hrz.pdf")