## _Building Graphs: True Edges_

- _layerwise edges_
- _modulewise edges_
- _hitwise edges (new for curly tracks)_

In [None]:
import glob, os, sys, yaml

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
import pprint
import seaborn as sns
import trackml.dataset

In [None]:
import torch
from torch_geometric.data import Data
import itertools

In [None]:
# append parent dir
sys.path.append('..')

In [None]:
# local imports
from src.drawing import detector_layout
from src.utils_math import polar_to_cartesian
from src import Compose_Event, Draw_Compose_Event

### _(+) - Input Data_

In [None]:
# input data
input_dir = '../train_all'

In [None]:
# List every entry of the input directory (hits.csv, cells.csv, particles.csv, truth.csv)
all_files = os.listdir(input_dir)

# Build one prefix per event: keep the "xxx-hits.csv" entries and strip the suffix
suffix = '-hits.csv'
file_prefixes = []
for fname in all_files:
    if fname.endswith(suffix):
        file_prefixes.append(os.path.join(input_dir, fname.replace(suffix, '')))
file_prefixes.sort()

print("Number of Files: ", len(file_prefixes))

In [None]:
event_id = 95191
event_prefix = file_prefixes[event_id]

In [None]:
# load an event
hits, tubes, particles, truth = trackml.dataset.load_event(event_prefix)

In [None]:
# hits.head()
# tubes.head()
# particles.head()
# truth.head()

### _(+) - Build Event_

- functions from _event_utils.py_

In [None]:
# compose event is exactly the same as select_hits()
event = Compose_Event(event_prefix, noise=False, skewed=False)
Draw_Compose_Event(event,figsize=(10,10));

## _1. Layerwise True Edges_

**True Graph** is the ground truth for the GNN. It is built by creating edges between _`hits`_ that belong to the same particle but lie in adjacent layers. For this purpose one has the _`true_edges, hits = get_layerwise_edges(hits)`_ function in _`event_utils.py`_.

In [None]:
from LightningModules.Processing.utils.event_utils import select_hits
from LightningModules.Processing.utils.event_utils import get_layerwise_edges

In [None]:
kwargs = {"selection": False}

In [None]:
# select hits
hits = select_hits(event_file=event_prefix, noise=False, skewed=False, **kwargs)

In [None]:
# layerwise true edges & new hits dataframe
true_edges, hits = get_layerwise_edges(hits)

In [None]:
# true_edges

In [None]:
# split as sender and recivers
senders, receivers = true_edges

In [None]:
# gives True
# senders == true_edges[0]

In [None]:
# gives True
# receivers  == true_edges[1]

### _Plotting Layerwise True Edges_

- I have hit pairs in two arrays
- Extract each pair (w/ `hit_id`) to plot
- How to plot hit pairs for one track?

In [None]:
# lets get unique pids with freq (~ hits).
sel_pids, sel_pids_fr = np.unique(hits.particle_id, return_counts=True)
print(sel_pids)

In [None]:
# check size of true_edges
size = true_edges.shape[1]
print(size)

In [None]:
# Draw the layerwise truth graph on top of the detector layout
fig, ax = detector_layout(figsize=(10,10))

# one scatter series (and legend entry) per particle
for pid in sel_pids:
    track = hits[hits.particle_id == pid]
    ax.scatter(track.x.values, track.y.values, label='particle_id: %d' %pid)

# one faint segment per true edge; each column of true_edges holds the
# row positions (in `hits`) of the two hits being connected
for start, stop in zip(true_edges[0], true_edges[1]):
    pt1, pt2 = hits.iloc[start], hits.iloc[stop]
    ax.plot([pt1.x, pt2.x], [pt1.y, pt2.y], color='k', alpha=0.3, lw=1.5)

# legend, layout and export
ax.legend(fontsize=12, loc='best')
fig.tight_layout()
fig.savefig("layerwise_true_edges.pdf")

### _Dissect `get_layerwise_edges(hits)` Function_

In [None]:
# select hits
hits = select_hits(event_file=event_prefix, noise=False, skewed=False, **kwargs)

In [None]:
# Sort by increasing distance from production
hits = hits.assign(
        R=np.sqrt(
            (hits.x - hits.vx) ** 2 + (hits.y - hits.vy) ** 2 + (hits.z - hits.vz) ** 2
        )
    )

In [None]:
# re-indexing of hits dataframe, we get two extra columns: R and index
hits = hits.sort_values("R").reset_index(drop=True).reset_index(drop=False)

In [None]:
# assign particle_id=0 as NaN
hits.loc[hits["particle_id"] == 0, "particle_id"] = np.nan

In [None]:
hits.head()

In [None]:
# hit_list based on particle_id and layer_id
hit_list = (
        hits.groupby(["particle_id", "layer_id"], sort=False)["index"]
        .agg(lambda x: list(x))
        .groupby(level=0)
        .agg(lambda x: list(x))
    )

In [None]:
hit_list

In [None]:
# get first row of hit list i.e. first particle
# row = hit_list.values[0]

In [None]:
# get elements of array from 0 to n-1 i.e. skipping the last element
# row[0:-1]

In [None]:
# get elements of array from 1 to n i.e. skipping the first elemnet
# row[1:]

In [None]:
# Build layerwise true edges: for every particle, pair each hit of one layer
# with each hit of the next layer in that particle's layer sequence.
# `layers` is a list of per-layer hit lists; zipping layers[:-1] with
# layers[1:] walks over consecutive layer pairs, and itertools.product
# expands one pair of layers into all (inner hit, outer hit) combinations.
true_edges = [
    pair
    for layers in hit_list.values
    for inner, outer in zip(layers[:-1], layers[1:])
    for pair in itertools.product(inner, outer)
]

In [None]:
true_edges = np.array(true_edges).T

In [None]:
true_edges.shape

* Now we have _true_edges_ and the corresponding _hits_ (re-ordered, _i.e._ sorted by the $R$ parameter).

In [None]:
# split as sender and recivers
senders, receivers = true_edges

In [None]:
senders.shape, receivers.shape

In [None]:
true_edges[0].size, true_edges[1].size

## _2. Modulewise True Edges_

In [None]:
from LightningModules.Processing.utils.event_utils import select_hits
from LightningModules.Processing.utils.event_utils import get_modulewise_edges

In [None]:
# select hits
hits = select_hits(event_file=event_prefix, noise=False, skewed=False, **kwargs)

In [None]:
# modulewise true edges
true_edges = get_modulewise_edges(hits)

In [None]:
# split as sender and recivers
senders, receivers = true_edges

In [None]:
# gives True
# senders == true_edges[0]

In [None]:
# gives True
# receivers  == true_edges[1]

### _Plotting Modulewise True Edges_

- I have hit pairs in two arrays
- Extract each pair (w/ `hit_id`) to plot
- How to plot hit pairs for one track?

In [None]:
from src.drawing import detector_layout
from src.utils_math import polar_to_cartesian

In [None]:
# lets get unique pids with freq (~ hits).
sel_pids, sel_pids_fr = np.unique(hits.particle_id, return_counts=True)
print(sel_pids)

In [None]:
# check size of true_edges
size = true_edges.shape[1]
print(size)

In [None]:
# Draw the modulewise truth graph on top of the detector layout
fig, ax = detector_layout(figsize=(11,11))

# particle tracks: one scatter series (and legend entry) per particle
for pid in sel_pids:
    idx = hits.particle_id == pid
    ax.scatter(hits[idx].x.values, hits[idx].y.values, label='particle_id: %d' %pid)

# true edges: per the dissection of get_modulewise_edges() in this notebook,
# the edge list is mapped back to the original index *labels* of `hits`,
# so the endpoints are looked up by label (.loc) rather than by row
# position (.iloc). The two agree only when `hits` has a default 0..n-1 index.
for source_node, target_node in zip(true_edges[0], true_edges[1]):
    pt1 = hits.loc[source_node]
    pt2 = hits.loc[target_node]
    ax.plot([pt1.x, pt2.x], [pt1.y, pt2.y], color='k', alpha=0.3, lw=1.5)


# axis params
ax.legend(fontsize=12, loc='best')
fig.tight_layout()
fig.savefig("modulewise_true_edges.pdf")

### _Dissect `get_modulewise_edges(hits)` Function_

In [None]:
# select hits
hits = select_hits(event_file=event_prefix, noise=False, skewed=False, **kwargs)

In [None]:
signal = hits[
        ((~hits.particle_id.isna()) & (hits.particle_id != 0)) & (~hits.vx.isna())
    ]

In [None]:
signal.head()

In [None]:
signal = signal.drop_duplicates(
        subset=["particle_id", "volume_id", "layer_id", "module_id"]
    )

In [None]:
signal.head()

In [None]:
# Sort by increasing distance from production
signal = signal.assign(
    R=np.sqrt(
        (signal.x - signal.vx) ** 2
        + (signal.y - signal.vy) ** 2
        + (signal.z - signal.vz) ** 2
    )
)

In [None]:
signal.head()

In [None]:
signal = signal.sort_values("R").reset_index(drop=False)

In [None]:
signal.head()

In [None]:
# Handle re-indexing
signal = signal.rename(columns={"index": "unsorted_index"}).reset_index(drop=False)
signal.loc[signal["particle_id"] == 0, "particle_id"] = np.nan

In [None]:
signal.head()

In [None]:
# Group by particle ID
signal_list = signal.groupby(["particle_id"], sort=False)["index"].agg(
    lambda x: list(x)
)

In [None]:
signal_list

In [None]:
true_edges = []
for row in signal_list.values:
    for i, j in zip(row[:-1], row[1:]):
        true_edges.append([i, j])

In [None]:
true_edges = np.array(true_edges).T

In [None]:
true_edges = signal.unsorted_index.values[true_edges]

In [None]:
true_edges.shape

In [None]:
# split as sender and recivers
senders, receivers = true_edges

In [None]:
senders.shape, receivers.shape

## _3. New Layerwise/Modulewise Truth Graph_

Ground truth constructed from layerwise or modulewise heuristics works best for high $p_t$ tracks that most likely don't re-enter the detector. However, both of these methods fail when a low $p_t$ track either re-enters the detector or simply curls inside the detector. One needs a new heuristic for such tracks. Instead of sorting hits according to their distance from the production vertex, $R = \sqrt{(x - v_x)^2 + (y - v_y)^2 + (z - v_z)^2}$, one needs something else that gives the order of hits along the particle trajectory. For example,

- [FAIL] use the relative distance between hits to build a truth graph; this fails in the same way as $R$
- [FAIL] use the timing information of a hit; the difference in times of two STT hits is too small to separate them within the precision we have
- sorting parameter, if there is a parameter that can be used to order the hits along a track
    - In current data, the order of occurrence of `hit_id` follows the particle trajectory
- [Success] the order of occurrence of `hit_id` is a way to follow a curly track

### _3.1. How to build edges based on some Euclidean distance_

In [None]:
# layerwise true edges & new hits dataframe
true_edges, hits = get_layerwise_edges(hits)

In [None]:
# modulewise true edges
true_edges = get_modulewise_edges(hits)

In [None]:
# Euclidean Distance of Two Hits to get a better true graph
def distance (hits, i, j):
    """Transverse (x-y) Euclidean distance between two rows of `hits`.

    `hits` is a dataframe with at least `x` and `y` columns. `i` and `j`
    are row *positions* (they are passed to `hits.iloc`), not `hit_id`
    values. The z coordinate is not used.
    """
    first, second = hits.iloc[i], hits.iloc[j]
    dx = second.x - first.x
    dy = second.y - first.y

    return np.sqrt(dx**2 + dy**2)

In [None]:
# get one edge
e = 0

In [None]:
edge = senders[e], receivers[e]

In [None]:
edge

In [None]:
# find first node
hits.loc[hits['hit_id'] == edge[0]]

In [None]:
# find second node
hits.loc[hits['hit_id'] == edge[1]]

In [None]:
distance(hits, edge[0], edge[1])

In [None]:
# get one edge
e = 1

In [None]:
edge = senders[e], receivers[e]

In [None]:
# Flag every edge of the *current* true_edges whose two hits are closer than
# the cut in the x-y plane (True = keep, False = drop).
# The endpoints are read from true_edges itself: the `senders`/`receivers`
# variables still hold the split of an earlier edge list at this point and
# are only re-split from true_edges after the mask has been built.
max_edge_length = 10  # distance cut, in the units of hits.x / hits.y
mask = []
for source_node, target_node in zip(true_edges[0], true_edges[1]):
    d = distance(hits, source_node, target_node)
    # same rule as before: drop only when d >= cut
    mask.append(not (d >= max_edge_length))

In [None]:
mask = np.array(mask)

In [None]:
np.where(mask == False)[0].shape

In [None]:
np.where(mask == True)[0].shape

In [None]:
senders, receivers = true_edges

In [None]:
senders = senders[mask]

In [None]:
receivers = receivers[mask]

In [None]:
senders.shape

In [None]:
receivers.shape

In [None]:
328+137

- **Distance Method** doesn't work, as one might remove an edge from the inner layers to the outer layers, _i.e._ an edge connecting hits before and after the gap left by the **skewed** layers.

### _3.2. Order of Occurrence of Hits_

This method works only when we don't rename layers after excluding **skewed** layers. This method is a **success** and works similarly to _get_modulewise_edges()_.

In [None]:
# select hits
hits = select_hits(event_file=event_prefix, noise=False, skewed=False, **kwargs)

In [None]:
# hits.head()

- first attempt

In [None]:
def get_hitwise_edges(hits, column='hit_id'):
    """Build true edges by chaining each particle's hits in order of occurrence.

    The function closely resembles get_modulewise_edges(): hits are grouped
    by ``particle_id`` and, inside each group, every hit is linked to the
    next one in the row order of ``hits``. This relies on the assumption that
    the order of occurrence of hits follows the particle trajectory.

    Parameters
    ----------
    hits : pandas.DataFrame
        Event hits; must contain a ``particle_id`` column. Rows whose
        ``particle_id`` is NaN are left out by the groupby.
    column : str, optional
        Currently unused: edges are always expressed as index labels of
        ``hits`` (not as values of this column). Kept so that existing
        calls keep working.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(2, n_edges)``: row 0 holds the source nodes and
        row 1 the target nodes, as index labels of ``hits``. Groups are
        emitted in increasing ``particle_id`` order. An event without any
        groupable hit yields an empty ``(2, 0)`` array.
    """
    # One (n_hits - 1, 2) block of consecutive-hit pairs per particle.
    edge_blocks = []
    for _, group in hits.groupby("particle_id"):
        # Use the index of the hits (not 'hit_id') to create the edges.
        hit_indices = group.index.values
        edge_blocks.append(np.column_stack((hit_indices[:-1], hit_indices[1:])))

    # np.vstack refuses an empty list, so handle the "no particles" case here.
    if not edge_blocks:
        return np.empty((2, 0), dtype=np.int64)

    # Stack all particles and transpose to the (2, n_edges) layout.
    return np.vstack(edge_blocks).T

In [None]:
true_edges = get_hitwise_edges(hits)

In [None]:
# true_edges

In [None]:
# Draw the hitwise truth graph on top of the detector layout
fig, ax = detector_layout(figsize=(11,11))

# particle tracks: take the particle ids from the `hits` frame being plotted,
# so this cell does not rely on a `sel_pids` left over from an earlier section
for pid in np.unique(hits.particle_id):
    idx = hits.particle_id == pid
    ax.scatter(hits[idx].x.values, hits[idx].y.values, label='particle_id: %d' %pid)

# Works for true edges built from the 'index' of the hits: each column of
# true_edges is a (source, target) pair of index labels, hence the .loc lookup
for source_node, target_node in true_edges.T:
    source_pos = hits.loc[source_node, ['x', 'y']].values
    target_pos = hits.loc[target_node, ['x', 'y']].values
    ax.plot([source_pos[0], target_pos[0]], [source_pos[1], target_pos[1]], 'k-', linewidth=0.5)


# axis params
ax.legend(fontsize=12, loc='best')
fig.tight_layout()
fig.savefig("hitwise_true_edges.pdf")

- _follow the logic of `get_modulewise_edges()`_

In [None]:
# this works perfectly
def get_modulewise_ordered_edges(hits):
    """Get the modulewise (layerless) true edge list from the order of hits.

    Each particle's hits are chained in the order in which they occur in
    ``hits`` (no sorting by distance), so consecutive hits of the same
    particle become one edge. Here ``hits`` represents a complete event.

    Parameters
    ----------
    hits : pandas.DataFrame
        Must contain the columns ``particle_id``, ``vx``, ``volume_id``,
        ``layer_id`` and ``module_id``. The frame must not already contain
        a column named ``index`` (one is created internally).

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(2, n_edges)``: row 0 holds the source
        nodes and row 1 the target nodes, expressed as index labels of the
        input ``hits``. If no particle has at least two usable hits the
        result is an empty ``(2, 0)`` array.
    """
    # Keep signal only: a valid, non-zero particle_id and a known vertex.
    is_signal = hits.particle_id.notna() & (hits.particle_id != 0) & hits.vx.notna()
    signal = hits[is_signal]

    # Keep the first hit of a particle in any given module.
    signal = signal.drop_duplicates(
        subset=["particle_id", "volume_id", "layer_id", "module_id"]
    )

    # Keep the order of occurrence. The original index labels are stored in
    # 'unsorted_index'; the fresh 0..n-1 positions end up in 'index'.
    signal = (
        signal.reset_index()
        .rename(columns={"index": "unsorted_index"})
        .reset_index(drop=False)
    )

    # particle_id == 0 was already removed by the filter above, so no
    # further handling of particle_id 0 is needed here.

    # Per particle (in order of first appearance): list of row positions.
    signal_list = signal.groupby("particle_id", sort=False)["index"].agg(
        lambda x: list(x)
    )

    # Link every hit to the next hit of the same particle.
    true_edges = [
        [i, j]
        for row in signal_list.values
        for i, j in zip(row[:-1], row[1:])
    ]

    # Force an integer (n_edges, 2) array even when the list is empty;
    # a bare np.array([]) is a float array and cannot be used as an index.
    true_edges = np.array(true_edges, dtype=np.int64).reshape(-1, 2).T

    # Translate row positions back to the index labels of the input frame.
    return signal.unsorted_index.values[true_edges]

In [None]:
true_edges = get_modulewise_ordered_edges(hits)

In [None]:
# Simplified Plotting (Use it in future)
fig, ax = detector_layout(figsize=(11,11))

# Group the hits DataFrame by particle_id
hits_grouped = hits.groupby('particle_id')

# Plot the hit positions for each particle
for particle_id, group in hits_grouped:
    ax.scatter(group['x'], group['y'], label=f'particle_id={particle_id}')

# Plot the true edges for each particle
for i, (source_node, target_node) in enumerate(true_edges.T):
    source_pos = hits.loc[source_node, ['x', 'y']].values
    target_pos = hits.loc[target_node, ['x', 'y']].values
    ax.plot([source_pos[0], target_pos[0]], [source_pos[1], target_pos[1]], 'k-', linewidth=0.5)

    
# axis params
ax.legend(fontsize=12, loc='best')
fig.tight_layout()
fig.savefig("modulewise_ordered_edges.pdf")

- _1st attempt to follow the logic of `get_layerwise_edges()`_