In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os

from trackml.dataset import load_event, load_dataset
from trackml.score import score_event

In [2]:
path_to_train = "../TrackML_100_events_dataset/"

In [3]:
event_prefix = "event000001000"

In [4]:
hits, cells, particles, truth = load_event(os.path.join(path_to_train, event_prefix))

In [5]:
hits.head()

Unnamed: 0,hit_id,x,y,z,volume_id,layer_id,module_id
0,1,-64.409897,-7.1637,-1502.5,7,2,1
1,2,-55.336102,0.635342,-1502.5,7,2,1
2,3,-83.830498,-1.14301,-1502.5,7,2,1
3,4,-96.1091,-8.24103,-1502.5,7,2,1
4,5,-62.673599,-9.3712,-1502.5,7,2,1


In [21]:
def cartesian_to_cylindrical(x, y, z):
    
    r = np.sqrt(x**2 + y**2)
    phi = np.arctan2(y, x)
    z = z
    
    return r, phi, z


def create_hough_matrix(hits):
    
    hough_matrix = hits[['hit_id', 'x', 'y', 'z']].copy()
    hough_matrix.columns = ['HitID', 'X', 'Y', 'Z']
    
    r, phi, z = cartesian_to_cylindrical(hough_matrix['X'].values, 
                                         hough_matrix['Y'].values, 
                                         hough_matrix['Z'].values)

    hough_matrix['R'] = r
    hough_matrix['Phi'] = phi
    
    return hough_matrix


def add_theta(hough_matrix, theta):
    hough_matrix['Theta'] = theta
    return hough_matrix

def add_r0_inv(hough_matrix):
    hough_matrix['R0Inv'] = (2. * np.cos(hough_matrix['Phi'] - hough_matrix['Theta']) / hough_matrix['R']).values
    return hough_matrix

def add_gamma(hough_matrix):
    hough_matrix['Gamma'] = hough_matrix['Z']/hough_matrix['R']
    return hough_matrix


def digitize_column(hough_matrix, col, N, min_val=None, max_val=None):
    
    x = hough_matrix[col].values
    if min_val is not None and max_val is not None:
        bins = np.linspace(min_val, max_val, N)
    else:
        bins = np.linspace(x.min(), x.max(), N)
    bin_ids = np.digitize(x, bins)
    hough_matrix[col+'Digi'] = bin_ids
    
    return hough_matrix


def combine_digi(hough_matrix, columns):
    
    hough_matrix['ComboDigi'] = np.zeros(len(hough_matrix))
    
    for i_col, acol in enumerate(columns):
        digi = hough_matrix[acol]
        hough_matrix['ComboDigi'] += digi * 10**(i_col * 5)
    
    return hough_matrix


def count_combo_digi(hough_matrix):
    
    unique, indeces, counts = np.unique(hough_matrix['ComboDigi'].values, 
                                     return_counts=True, return_inverse=True)
    hough_matrix['ComboDigiCounts'] = counts[indeces]
    
    return hough_matrix

def out_of_border_counters_to_zero(hough_matrix, col, N):
    hough_matrix['ComboDigiCounts'] *= (hough_matrix[col].values != 0) * (hough_matrix[col].values != N)
    return hough_matrix

def one_slice(hough_matrix, theta, N_bins_r0inv, N_bins_gamma, min_hits):
        
    tracks = []
    
    hough_matrix = add_theta(hough_matrix, theta)
    hough_matrix = add_r0_inv(hough_matrix)
    hough_matrix = add_gamma(hough_matrix)

    hough_matrix = digitize_column(hough_matrix, 'R0Inv', N_bins_r0inv, -0.02, 0.02) # Tune it.
    hough_matrix = digitize_column(hough_matrix, 'Gamma', N_bins_gamma, -50, 50) # Tune it.
    
    hough_matrix = combine_digi(hough_matrix, ['R0InvDigi', 'GammaDigi'])
    hough_matrix = count_combo_digi(hough_matrix)

    hough_matrix = out_of_border_counters_to_zero(hough_matrix, 'R0InvDigi', N_bins_r0inv)
    hough_matrix = out_of_border_counters_to_zero(hough_matrix, 'GammaDigi', N_bins_gamma)
    
    counts = hough_matrix.ComboDigiCounts.values
    bins = hough_matrix.ComboDigi.values
    hit_ids = np.arange(len(hough_matrix))
    for abin in np.unique(bins[counts >= min_hits]):
        atrack = hit_ids[(bins == abin)]
        tracks.append(atrack)
        
    return tracks, hough_matrix




class Clusterer(object):
    
    def __init__(self, N_bins_r0inv, N_bins_gamma, N_theta, min_hits):
        
        self.N_bins_r0inv = N_bins_r0inv 
        self.N_bins_gamma = N_bins_gamma
        self.N_theta = N_theta
        self.min_hits = min_hits
    
    def predict(self, hits):
        
        tracks = []

        hough_matrix = create_hough_matrix(hits)
        print(hough_matrix)
        for theta in np.linspace(-np.pi, np.pi, self.N_theta):
            slice_tracks, hough_matrix = one_slice(hough_matrix, theta, self.N_bins_r0inv, self.N_bins_gamma, self.min_hits)
            tracks += list(slice_tracks)
    
        labels = np.zeros(len(hits))
        used = np.zeros(len(hits))
        track_id = 0
        for atrack in tracks:
            u_track = atrack[used[atrack] == 0]
            if len(u_track) >= self.min_hits:
                labels[u_track] = track_id
                used[u_track] = 1
                track_id += 1
            
        return labels

In [22]:
%%time
# Warning: it takes about 100s per one event.

model = Clusterer(N_bins_r0inv=200, N_bins_gamma=500, N_theta=500, min_hits=9)
labels = model.predict(hits)
print(labels)

         HitID           X          Y       Z           R       Phi
0            1  -69.691002  -9.823170 -1502.5   70.379898 -3.001562
1            2  -78.112198 -13.946600 -1502.5   79.347488 -2.964909
2            3  -48.089699  -8.273580 -1502.5   48.796223 -2.971216
3            4  -58.011600  -9.702010 -1502.5   58.817299 -2.975884
4            5  -95.891701  -0.249342 -1502.5   95.892029 -3.138992
...        ...         ...        ...     ...         ...       ...
138563  138564 -880.783997  16.842100  2944.5  880.945007  3.122473
138564  138565 -826.125000  55.710499  2944.5  828.001343  3.074259
138565  138566 -868.492004  45.379200  2944.5  869.676758  3.089390
138566  138567 -896.205017  94.980598  2952.5  901.223999  3.036006
138567  138568 -890.744995  16.122200  2952.5  890.890869  3.123495

[138568 rows x 6 columns]


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



CPU times: user 2min 28s, sys: 2.75 s, total: 2min 31s
Wall time: 2min 29s


In [8]:
def create_one_event_submission(event_id, hits, labels):
    sub_data = np.column_stack(([event_id]*len(hits), hits.hit_id.values, labels))
    submission = pd.DataFrame(data=sub_data, columns=["event_id", "hit_id", "track_id"]).astype(int)
    return submission

In [9]:
submission = create_one_event_submission(0, hits, labels)
score = score_event(truth, submission)

In [10]:
print("Your score: ", score)

Your score:  0.1403401478160794


In [11]:
 load_dataset(path_to_train, skip=1000, nevents=5)

<generator object load_dataset at 0x119e4d650>

In [12]:
dataset_submissions = []
dataset_scores = []

for event_id, hits, cells, particles, truth in load_dataset(path_to_train, skip=0, nevents=5):
        
    # Track pattern recognition
    model = Clusterer(N_bins_r0inv=200, N_bins_gamma=500, N_theta=500, min_hits=9)
    labels = model.predict(hits)
        
    # Prepare submission for an event
    one_submission = create_one_event_submission(event_id, hits, labels)
    dataset_submissions.append(one_submission)
    
    # Score for the event
    score = score_event(truth, one_submission)
    dataset_scores.append(score)
    
    print("Score for event %d: %.3f" % (event_id, score))
    
print('Mean score: %.3f' % (np.mean(dataset_scores)))

Score for event 1000: 0.140
Score for event 1001: 0.151
Score for event 1002: 0.135
Score for event 1003: 0.147
Score for event 1004: 0.135
Mean score: 0.142


In [13]:
path_to_test = "../input/test"
test_dataset_submissions = []

create_submission = False # True for submission 

if create_submission:
    for event_id, hits, cells in load_dataset(path_to_test, parts=['hits', 'cells']):

        # Track pattern recognition
        model = Clusterer(N_bins_r0inv=200, N_bins_gamma=500, N_theta=500, min_hits=9)
        labels = model.predict(hits)

        # Prepare submission for an event
        one_submission = create_one_event_submission(event_id, hits, labels)
        test_dataset_submissions.append(one_submission)
        
        print('Event ID: ', event_id)

    # Create submission file
    submission = pd.concat(test_dataset_submissions, axis=0)
    submission.to_csv('submission.csv.gz', index=False, compression='gzip')

array([-10.        ,  -9.95991984,  -9.91983968,  -9.87975952,
        -9.83967936,  -9.7995992 ,  -9.75951904,  -9.71943888,
        -9.67935872,  -9.63927856,  -9.5991984 ,  -9.55911824,
        -9.51903808,  -9.47895792,  -9.43887776,  -9.3987976 ,
        -9.35871743,  -9.31863727,  -9.27855711,  -9.23847695,
        -9.19839679,  -9.15831663,  -9.11823647,  -9.07815631,
        -9.03807615,  -8.99799599,  -8.95791583,  -8.91783567,
        -8.87775551,  -8.83767535,  -8.79759519,  -8.75751503,
        -8.71743487,  -8.67735471,  -8.63727455,  -8.59719439,
        -8.55711423,  -8.51703407,  -8.47695391,  -8.43687375,
        -8.39679359,  -8.35671343,  -8.31663327,  -8.27655311,
        -8.23647295,  -8.19639279,  -8.15631263,  -8.11623246,
        -8.0761523 ,  -8.03607214,  -7.99599198,  -7.95591182,
        -7.91583166,  -7.8757515 ,  -7.83567134,  -7.79559118,
        -7.75551102,  -7.71543086,  -7.6753507 ,  -7.63527054,
        -7.59519038,  -7.55511022,  -7.51503006,  -7.47