In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

from trackml.dataset import load_event, load_dataset
from trackml.score import score_event

import warnings
warnings.filterwarnings('ignore')

Identify tracks Using Hough Transform

Consider a track pattern recognition method using the Hough Transform in polar system. In this system a circular track can be parametrized as follow:

r=2r0Cos(ϕ−θ)
 
where:

r  and  ϕ  : are coordinates of a hit in the polar system.

r0  and  θ  : are coordinates of a center of a circular track in the polar system.

A linear track corresponds to the  r0=∞ .

Transformation of cartesian coordinates of a hit to polar coordinates defined as:

ϕ=arctan(yx)
 
r=x2+y2‾‾‾‾‾‾‾√
 
The Hough Transform converts a hit in  (r,ϕ)  space to a curve in  (1/r0,θ)  space of the track parameters as follow:

1/r0=2Cos(ϕ−θ)r
 
A linear track in this space represents as  (0,θ)  point.

however, there are 3 dimensions: x, y, z. Thus, the track pattern recognition will be performed in cylindrical coordinate systems:  ϕ , r, z. For the simplicity (but you can create your own parameter) we suppose that for 3D tracks:

γ=z/r=const
 
which is true for high-PT tracks.

This section demonstrates the track pattern recognition method using Hough Transfrom described above and histogramming technique. In this technique each 'hot' bin represents one recognized track as it is shown in the figure:

hough.png

To assign only one track lable to a hit, only bins with the highest number of hits are selected. But there is one additional requirement for the bins: these bins must not share hits. Please, look the method script for details.

In [3]:
def cartesian_to_cylindrical(x, y, z):
    
    r = np.sqrt(x**2 + y**2)
    phi = np.arctan2(y, x)
    z = z
    
    return r, phi, z


def create_hough_matrix(hits):
    
    hough_matrix = hits[['hit_id', 'x', 'y', 'z']].copy()
    hough_matrix.columns = ['HitID', 'X', 'Y', 'Z']
    
    r, phi, z = cartesian_to_cylindrical(hough_matrix['X'].values, 
                                         hough_matrix['Y'].values, 
                                         hough_matrix['Z'].values)

    hough_matrix['R'] = r
    hough_matrix['Phi'] = phi
    
    return hough_matrix


def add_theta(hough_matrix, theta):
    hough_matrix['Theta'] = theta
    return hough_matrix

def add_r0_inv(hough_matrix):
    hough_matrix['R0Inv'] = (2. * np.cos(hough_matrix['Phi'] - hough_matrix['Theta']) / hough_matrix['R']).values
    return hough_matrix

def add_gamma(hough_matrix):
    hough_matrix['Gamma'] = hough_matrix['Z']/hough_matrix['R']
    return hough_matrix


def digitize_column(hough_matrix, col, N, min_val=None, max_val=None):
    
    x = hough_matrix[col].values
    if min_val is not None and max_val is not None:
        bins = np.linspace(min_val, max_val, N)
    else:
        bins = np.linspace(x.min(), x.max(), N)
    bin_ids = np.digitize(x, bins)
    hough_matrix[col+'Digi'] = bin_ids
    
    return hough_matrix


def combine_digi(hough_matrix, columns):
    
    hough_matrix['ComboDigi'] = np.zeros(len(hough_matrix))
    
    for i_col, acol in enumerate(columns):
        digi = hough_matrix[acol]
        hough_matrix['ComboDigi'] += digi * 10**(i_col * 5)
    
    return hough_matrix


def count_combo_digi(hough_matrix):
    
    unique, indeces, counts = np.unique(hough_matrix['ComboDigi'].values, 
                                     return_counts=True, return_inverse=True)
    hough_matrix['ComboDigiCounts'] = counts[indeces]
    
    return hough_matrix

def out_of_border_counters_to_zero(hough_matrix, col, N):
    hough_matrix['ComboDigiCounts'] *= (hough_matrix[col].values != 0) * (hough_matrix[col].values != N)
    return hough_matrix

def one_slice(hough_matrix, theta, N_bins_r0inv, N_bins_gamma, min_hits):
        
    tracks = []
    
    hough_matrix = add_theta(hough_matrix, theta)
    hough_matrix = add_r0_inv(hough_matrix)
    hough_matrix = add_gamma(hough_matrix)

    hough_matrix = digitize_column(hough_matrix, 'R0Inv', N_bins_r0inv, -0.019, 0.019) # Tune it.
    hough_matrix = digitize_column(hough_matrix, 'Gamma', N_bins_gamma, -50, 50) # Tune it.

    hough_matrix = combine_digi(hough_matrix, ['R0InvDigi', 'GammaDigi'])
    hough_matrix = count_combo_digi(hough_matrix)

    hough_matrix = out_of_border_counters_to_zero(hough_matrix, 'R0InvDigi', N_bins_r0inv)
    hough_matrix = out_of_border_counters_to_zero(hough_matrix, 'GammaDigi', N_bins_gamma)
    
    counts = hough_matrix.ComboDigiCounts.values
    bins = hough_matrix.ComboDigi.values
    hit_ids = np.arange(len(hough_matrix))
    for abin in np.unique(bins[counts >= min_hits]):
        atrack = hit_ids[(bins == abin)]
        tracks.append(atrack)
        
    return tracks, hough_matrix



In [4]:
class Clusterer(object):
    
    def __init__(self, N_bins_r0inv, N_bins_gamma, N_theta, min_hits):
        
        self.N_bins_r0inv = N_bins_r0inv 
        self.N_bins_gamma = N_bins_gamma
        self.N_theta = N_theta
        self.min_hits = min_hits
    
    def predict(self, hits):
        
        tracks = []

        hough_matrix = create_hough_matrix(hits)
        for theta in np.linspace(-np.pi, np.pi, self.N_theta):
            slice_tracks, hough_matrix = one_slice(hough_matrix, theta, self.N_bins_r0inv, self.N_bins_gamma, self.min_hits)
            tracks += list(slice_tracks)

        labels = np.zeros(len(hits))
        used = np.zeros(len(hits))
        track_id = 0
        for atrack in tracks:
            u_track = atrack[used[atrack] == 0]
            if len(u_track) >= self.min_hits:
                labels[u_track] = track_id
                used[u_track] = 1
                track_id += 1
            
        return labels

In [5]:
def create_one_event_submission(event_id, hits, labels):
    sub_data = np.column_stack(([event_id]*len(hits), hits.hit_id.values, labels))
    submission = pd.DataFrame(data=sub_data, columns=["event_id", "hit_id", "track_id"]).astype(int)
    return submission

In [6]:
path_to_train = "../data/train"
event_prefix = "event000001000"
hits, cells, particles, truth = load_event(os.path.join(path_to_train, event_prefix))

In [62]:
%%time
# Warning: it takes about 100s per one event.
# hough_matrix = digitize_column(hough_matrix, 'R0Inv', N_bins_r0inv, -0.025, 0.025) # Tune it.
for h in [8,9, 10]:
    model = Clusterer(N_bins_r0inv=200, N_bins_gamma=500, N_theta=500, min_hits=h)
    labels = model.predict(hits)
    submission = create_one_event_submission(0, hits, labels)
    score = score_event(truth, submission)
    print(score)

0.13783569647757776
0.13777901601678022
0.1324603066060887
CPU times: user 3min 53s, sys: 2.69 s, total: 3min 55s
Wall time: 3min 48s


In [6]:
%%time
# Warning: it takes about 100s per one event.
# hough_matrix = digitize_column(hough_matrix, 'R0Inv', N_bins_r0inv, -0.015, 0.015) # Tune it.
for h in [9]:
    model = Clusterer(N_bins_r0inv=200, N_bins_gamma=500, N_theta=500, min_hits=h)
    labels = model.predict(hits)
    submission = create_one_event_submission(0, hits, labels)
    score = score_event(truth, submission)
    print(score)

0.13138324946915658
CPU times: user 1min 15s, sys: 1.02 s, total: 1min 16s
Wall time: 1min 14s


In [11]:
%%time
# Warning: it takes about 100s per one event.
# hough_matrix = digitize_column(hough_matrix, 'R0Inv', N_bins_r0inv, -0.016, 0.016) # Tune it.
for h in [9]:
    model = Clusterer(N_bins_r0inv=200, N_bins_gamma=500, N_theta=500, min_hits=h)
    labels = model.predict(hits)
    submission = create_one_event_submission(0, hits, labels)
    score = score_event(truth, submission)
    print(score)

0.1348778779230148
CPU times: user 1min 16s, sys: 977 ms, total: 1min 17s
Wall time: 1min 14s


In [16]:
%%time
# Warning: it takes about 100s per one event.
# hough_matrix = digitize_column(hough_matrix, 'R0Inv', N_bins_r0inv, -0.018, 0.018) # Tune it.
for h in [9]:
    model = Clusterer(N_bins_r0inv=200, N_bins_gamma=500, N_theta=500, min_hits=h)
    labels = model.predict(hits)
    submission = create_one_event_submission(0, hits, labels)
    score = score_event(truth, submission)
    print(score)

0.1359058514533225
CPU times: user 1min 18s, sys: 910 ms, total: 1min 19s
Wall time: 1min 16s


In [21]:
%%time
# Warning: it takes about 100s per one event.
# hough_matrix = digitize_column(hough_matrix, 'R0Inv', N_bins_r0inv, -0.021, 0.021) # Tune it.
for h in [9]:
    model = Clusterer(N_bins_r0inv=200, N_bins_gamma=500, N_theta=500, min_hits=h)
    labels = model.predict(hits)
    submission = create_one_event_submission(0, hits, labels)
    score = score_event(truth, submission)
    print(score)

0.13791190851834
CPU times: user 1min 17s, sys: 843 ms, total: 1min 18s
Wall time: 1min 16s


In [32]:
%%time
# Warning: it takes about 100s per one event.
# hough_matrix = digitize_column(hough_matrix, 'R0Inv', N_bins_r0inv, -0.019, 0.019) # Tune it.
for h in [9]:
    model = Clusterer(N_bins_r0inv=200, N_bins_gamma=500, N_theta=500, min_hits=h)
    labels = model.predict(hits)
    submission = create_one_event_submission(0, hits, labels)
    score = score_event(truth, submission)
    print(score)

0.14085714317203724
CPU times: user 1min 17s, sys: 760 ms, total: 1min 18s
Wall time: 1min 16s


In [37]:
%%time
# Warning: it takes about 100s per one event.
# hough_matrix = digitize_column(hough_matrix, 'R0Inv', N_bins_r0inv, -0.0195, 0.0195) # Tune it.
for h in [9]:
    model = Clusterer(N_bins_r0inv=200, N_bins_gamma=500, N_theta=500, min_hits=h)
    labels = model.predict(hits)
    submission = create_one_event_submission(0, hits, labels)
    score = score_event(truth, submission)
    print(score)

0.140170559379583
CPU times: user 1min 16s, sys: 1 s, total: 1min 17s
Wall time: 1min 15s


In [42]:
%%time
# Warning: it takes about 100s per one event.
# hough_matrix = digitize_column(hough_matrix, 'R0Inv', N_bins_r0inv, -0.0191, 0.0191) # Tune it.
for h in [9]:
    model = Clusterer(N_bins_r0inv=200, N_bins_gamma=500, N_theta=500, min_hits=h)
    labels = model.predict(hits)
    submission = create_one_event_submission(0, hits, labels)
    score = score_event(truth, submission)
    print(score)

0.13905085748532384
CPU times: user 1min 17s, sys: 730 ms, total: 1min 18s
Wall time: 1min 16s


In [47]:
%%time
# Warning: it takes about 100s per one event.
# hough_matrix = digitize_column(hough_matrix, 'R0Inv', N_bins_r0inv, -0.019, 0.03) # Tune it.
for h in [9]:
    model = Clusterer(N_bins_r0inv=200, N_bins_gamma=500, N_theta=500, min_hits=h)
    labels = model.predict(hits)
    submission = create_one_event_submission(0, hits, labels)
    score = score_event(truth, submission)
    print(score)

0.13424264991861157
CPU times: user 1min 17s, sys: 857 ms, total: 1min 18s
Wall time: 1min 16s


In [52]:
%%time
# Warning: it takes about 100s per one event.
# hough_matrix = digitize_column(hough_matrix, 'R0Inv', N_bins_r0inv, -0.019, 0.03) # Tune it.
for h in [9]:
    model = Clusterer(N_bins_r0inv=200, N_bins_gamma=500, N_theta=500, min_hits=h)
    labels = model.predict(hits)
    submission = create_one_event_submission(0, hits, labels)
    score = score_event(truth, submission)
    print(score)

0.139683930249772
CPU times: user 1min 17s, sys: 729 ms, total: 1min 18s
Wall time: 1min 15s


In [57]:
%%time
# Warning: it takes about 100s per one event.
# hough_matrix = digitize_column(hough_matrix, 'R0Inv', N_bins_r0inv, -0.019, 0.019) # Tune it.
# hough_matrix = digitize_column(hough_matrix, 'Gamma', N_bins_gamma, -55, 55) # Tune it.
for h in [9]:
    model = Clusterer(N_bins_r0inv=200, N_bins_gamma=500, N_theta=500, min_hits=h)
    labels = model.predict(hits)
    submission = create_one_event_submission(0, hits, labels)
    score = score_event(truth, submission)
    print(score)

0.1376100310921274
CPU times: user 1min 16s, sys: 783 ms, total: 1min 17s
Wall time: 1min 15s


In [64]:
%%time
# Warning: it takes about 100s per one event.
# hough_matrix = digitize_column(hough_matrix, 'R0Inv', N_bins_r0inv, -0.019, 0.019) # Tune it.
# hough_matrix = digitize_column(hough_matrix, 'Gamma', N_bins_gamma, -45, 55) # Tune it.
for h in [9]:
    model = Clusterer(N_bins_r0inv=200, N_bins_gamma=500, N_theta=500, min_hits=h)
    labels = model.predict(hits)
    submission = create_one_event_submission(0, hits, labels)
    score = score_event(truth, submission)
    print(score)

0.13777901601678022
CPU times: user 1min 17s, sys: 910 ms, total: 1min 18s
Wall time: 1min 15s


In [69]:
%%time
# Warning: it takes about 100s per one event.
# hough_matrix = digitize_column(hough_matrix, 'R0Inv', N_bins_r0inv, -0.019, 0.019) # Tune it.
# hough_matrix = digitize_column(hough_matrix, 'Gamma', N_bins_gamma, -50, 51) # Tune it.
for h in [9]:
    model = Clusterer(N_bins_r0inv=200, N_bins_gamma=500, N_theta=500, min_hits=h)
    labels = model.predict(hits)
    submission = create_one_event_submission(0, hits, labels)
    score = score_event(truth, submission)
    print(score)

0.13409354053983796
CPU times: user 1min 17s, sys: 802 ms, total: 1min 17s
Wall time: 1min 15s


In [74]:
%%time
# Warning: it takes about 100s per one event.
# hough_matrix = digitize_column(hough_matrix, 'R0Inv', N_bins_r0inv, -0.019, 0.019) # Tune it.
# hough_matrix = digitize_column(hough_matrix, 'Gamma', N_bins_gamma, -50, 50) # Tune it.
for h in [9]:
    model = Clusterer(N_bins_r0inv=200, N_bins_gamma=500, N_theta=500, min_hits=h)
    labels = model.predict(hits)
    submission = create_one_event_submission(0, hits, labels)
    score = score_event(truth, submission)
    print(score)

0.14085714317203724
CPU times: user 1min 17s, sys: 808 ms, total: 1min 18s
Wall time: 1min 15s


In [7]:
%%time
# Warning: it takes about 100s per one event.
# hough_matrix = digitize_column(hough_matrix, 'R0Inv', N_bins_r0inv, -0.019, 0.019) # Tune it.
# hough_matrix = digitize_column(hough_matrix, 'Gamma', N_bins_gamma, -50, 50) # Tune it.
for h in [9]:
    model = Clusterer(N_bins_r0inv=200, N_bins_gamma=500, N_theta=600, min_hits=h)
    labels = model.predict(hits)
    submission = create_one_event_submission(0, hits, labels)
    score = score_event(truth, submission)
    print(score)

0.1413972293722589
CPU times: user 2min 19s, sys: 406 ms, total: 2min 20s
Wall time: 2min 19s


In [80]:
path_to_train = "../data/train_100_events"


train_sample_scores = []
for event_id, hits, cells, particles, truth in tqdm(load_dataset(path_to_train)):

    truth1 = pd.merge(truth, particles, how='left', on='particle_id')
    hits1 = pd.merge(hits, truth1, how='left', on='hit_id')

    hits2 = hits1.dropna()
    truth2 = truth1.dropna()

    hits3 = hits2[hits2.nhits > 3]
    truth3 = truth2[truth2.nhits > 3]

    # Track pattern recognition
    model = Clusterer(N_bins_r0inv=200, N_bins_gamma=500, N_theta=500, min_hits=9)
    labels = model.predict(hits3)

    # Prepare submission for an event
    one_submission = create_one_event_submission(event_id, hits3, labels)
    score = score_event(truth3, one_submission)

    train_sample_scores.append(score)

#     print("Score for event %d: %.3f" % (event_id, score))

#     print('Event ID: {}, Score: {}'.format(event_id, score))


100it [1:18:10, 46.90s/it]


In [81]:
min_hits=9
print('min_hits: {0:.4f}, Mean score: {1:.8f}'.format(min_hits, (np.mean(train_sample_scores))))

min_hits: 9.0000, Mean score: 0.14964652
