In [None]:
# https://www.kaggle.com/mrbeer/dbscan-benchmark-improvement-0-2099/code

In [12]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from datetime import datetime

from trackml.dataset import load_event, load_dataset
from trackml.score import score_event

In [2]:
def create_one_event_submission(event_id, hits, labels):
    """Build the submission table for a single event.

    Parameters
    ----------
    event_id : scalar
        Value written into every row of the ``event_id`` column.
    hits : pandas.DataFrame
        Hits of the event; only its ``hit_id`` column is read.
    labels : array-like
        One track label per hit, in the same order as ``hits``.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns ``event_id``, ``hit_id`` and ``track_id``,
        all cast to ``int``.
    """
    n_hits = len(hits)
    hit_ids = hits.hit_id.values
    # Stack the three columns side by side: one row per hit.
    rows = np.column_stack(([event_id] * n_hits, hit_ids, labels))
    table = pd.DataFrame(data=rows, columns=["event_id", "hit_id", "track_id"])
    return table.astype(int)

In [58]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

"""
updated - added self.rz_scale
"""
class Clusterer(object):
    """Cluster detector hits into track candidates with DBSCAN.

    Every hit is mapped to three features: ``x / r3`` and ``y / r3`` with
    ``r3 = sqrt(x^2 + y^2 + z^2)``, and ``z / r2`` with
    ``r2 = sqrt(x^2 + y^2)``.  The features are standardised with
    ``StandardScaler``; the third one is then multiplied by ``rz_scale``
    before DBSCAN is run on the result.

    Parameters
    ----------
    eps : float
        Value forwarded to ``DBSCAN(eps=...)``.
    """

    def __init__(self, eps):
        self.eps = eps
        # Weight applied to the standardised z/r2 feature; every call to
        # predict() overwrites it with that call's `rz_scale` argument.
        self.rz_scale = 1

    def _preprocess(self, hits):
        """Return the scaled feature matrix for ``hits``.

        ``hits`` must expose ``x``, ``y`` and ``z`` columns.  The frame is
        only read: the features are assembled in a separate array, so no
        helper columns are added to the caller's DataFrame.

        Returns
        -------
        numpy.ndarray
            Array with one row per hit and three columns
            (``x / r3``, ``y / r3``, ``z / r2``), standardised, with the last
            column multiplied by ``self.rz_scale``.
        """
        x = hits.x.values
        y = hits.y.values
        z = hits.z.values

        # Distance from the origin in 3D and in the transverse (x, y) plane.
        # NOTE: a hit with x == y == 0 has r2 == 0 and yields a non-finite
        # z / r2 feature.
        r3 = np.sqrt(x**2 + y**2 + z**2)
        r2 = np.sqrt(x**2 + y**2)

        features = np.column_stack((x / r3, y / r3, z / r2))

        X = StandardScaler().fit_transform(features)
        X[:, 2] = X[:, 2] * self.rz_scale

        return X

    def predict(self, hits, rz_scale=1):
        """Cluster ``hits`` and return the DBSCAN label of every hit.

        Parameters
        ----------
        hits : pandas.DataFrame
            Frame with ``x``, ``y`` and ``z`` columns.
        rz_scale : float, optional
            Multiplier for the standardised ``z / r2`` feature.  It is stored
            on the instance (``self.rz_scale``) before preprocessing.

        Returns
        -------
        Labels returned by ``DBSCAN.fit_predict`` for the feature matrix.
        """
        self.rz_scale = rz_scale
        X = self._preprocess(hits)

        cl = DBSCAN(eps=self.eps, min_samples=1, algorithm='kd_tree')
        labels = cl.fit_predict(X)

        return labels


In [59]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

"""
updated - added self.rz_scale
"""
class Clusterer1(object):
    """DBSCAN-based hit clusterer.

    Uses the same feature construction and DBSCAN settings as the
    ``Clusterer`` class defined in the preceding cell: ``x / r3`` and
    ``y / r3`` with ``r3 = sqrt(x^2 + y^2 + z^2)``, plus ``z / r2`` with
    ``r2 = sqrt(x^2 + y^2)``, standardised with ``StandardScaler``; the third
    feature is multiplied by ``rz_scale`` before clustering.

    Parameters
    ----------
    eps : float
        Value forwarded to ``DBSCAN(eps=...)``.
    """

    def __init__(self, eps):
        self.eps = eps
        # Weight applied to the standardised z/r2 feature; every call to
        # predict() overwrites it with that call's `rz_scale` argument.
        self.rz_scale = 1

    def _preprocess(self, hits):
        """Return the scaled feature matrix for ``hits``.

        ``hits`` must expose ``x``, ``y`` and ``z`` columns.  The frame is
        only read: the features are assembled in a separate array, so no
        helper columns are added to the caller's DataFrame.

        Returns
        -------
        numpy.ndarray
            Array with one row per hit and three columns
            (``x / r3``, ``y / r3``, ``z / r2``), standardised, with the last
            column multiplied by ``self.rz_scale``.
        """
        x = hits.x.values
        y = hits.y.values
        z = hits.z.values

        # Distance from the origin in 3D and in the transverse (x, y) plane.
        # NOTE: a hit with x == y == 0 has r2 == 0 and yields a non-finite
        # z / r2 feature.
        r3 = np.sqrt(x**2 + y**2 + z**2)
        r2 = np.sqrt(x**2 + y**2)

        features = np.column_stack((x / r3, y / r3, z / r2))

        X = StandardScaler().fit_transform(features)
        X[:, 2] = X[:, 2] * self.rz_scale

        return X

    def predict(self, hits, rz_scale=1):
        """Cluster ``hits`` and return the DBSCAN label of every hit.

        Parameters
        ----------
        hits : pandas.DataFrame
            Frame with ``x``, ``y`` and ``z`` columns.
        rz_scale : float, optional
            Multiplier for the standardised ``z / r2`` feature.  It is stored
            on the instance (``self.rz_scale``) before preprocessing.

        Returns
        -------
        Labels returned by ``DBSCAN.fit_predict`` for the feature matrix.
        """
        self.rz_scale = rz_scale
        X = self._preprocess(hits)

        cl = DBSCAN(eps=self.eps, min_samples=1, algorithm='kd_tree')
        labels = cl.fit_predict(X)

        return labels


In [60]:
# Change this according to your directory preferred setting
path_to_train = "../data/train"
# This event is in Train_1
event_prefix = "event000001000"

In [61]:
# Load the hits, cells, particles and truth tables of the event selected by
# `path_to_train` and `event_prefix`.
hits, cells, particles, truth = load_event(os.path.join(path_to_train, event_prefix))

In [62]:
# Preview the first rows of the hits table.
hits.head()

Unnamed: 0,hit_id,x,y,z,volume_id,layer_id,module_id
0,1,-64.409897,-7.1637,-1502.5,7,2,1
1,2,-55.336102,0.635342,-1502.5,7,2,1
2,3,-83.830498,-1.14301,-1502.5,7,2,1
3,4,-96.1091,-8.24103,-1502.5,7,2,1
4,5,-62.673599,-9.3712,-1502.5,7,2,1


In [63]:
# Preview the first rows of the cells table.
cells.head()

Unnamed: 0,hit_id,ch0,ch1,value
0,1,209,617,0.013832
1,1,210,617,0.079887
2,1,209,618,0.211723
3,2,68,446,0.334087
4,3,58,954,0.034005


In [85]:
"""
updated - added to predict: rz_scale=1.5
"""
# Cluster the hits of the single loaded event; the z/r feature is weighted by 1.5.
model = Clusterer(eps=0.00815)
labels = model.predict(hits, rz_scale=1.5)

In [86]:
# Turn the labels into a submission table (event_id is passed as the literal 0)
# and score it against the truth table of the same event.
submission = create_one_event_submission(0, hits, labels)
score = score_event(truth, submission)

In [87]:
# Show the score of the single event computed in the previous cell.
print(score)

0.2062911201046471


In [89]:
# Score the same clustering settings on every event found under `path_to_train`
# and report the mean score.
# NOTE: this reassigns `path_to_train` (previously "../data/train"), and the loop
# variables (hits, cells, particles, truth) overwrite the single-event tables
# loaded earlier in the notebook.
path_to_train = "../data/train_100_events"
train_sample_scores = []

for event_id, hits, cells, particles, truth in load_dataset(path_to_train):

    # Track pattern recognition
    model = Clusterer(eps=0.00815)
    labels = model.predict(hits, rz_scale=1.5)

    # Prepare the submission for this event and score it against its truth table
    one_submission = create_one_event_submission(event_id, hits, labels)
    score = score_event(truth, one_submission)

    train_sample_scores.append(score)

    # Per-event reporting is disabled; uncomment to print each event's score:
    # print('Event ID: {}, Score: {}'.format(event_id, score))

print('Mean score: %.6f' % (np.mean(train_sample_scores)))

Mean score: 0.207460


In [90]:
# Cluster every test event and write the concatenated result as a gzipped CSV
# submission file.
path_to_test = "../data/test"
test_dataset_submissions = []

# Set to False to skip building the test-set submission.
create_submission = True

if create_submission:
    # Create the output directory up front: without it, to_csv() below would
    # fail only after every test event has already been clustered.
    os.makedirs('../submissions', exist_ok=True)

    # Only the hits and cells tables are requested for the test events.
    for event_id, hits, cells in load_dataset(path_to_test, parts=['hits', 'cells']):

        # Track pattern recognition
        model = Clusterer(eps=0.00815)
        labels = model.predict(hits, rz_scale=1.5)

        # Prepare submission for an event
        one_submission = create_one_event_submission(event_id, hits, labels)
        test_dataset_submissions.append(one_submission)

        print('Event ID: ', event_id)

    # Create submission file.
    # NOTE: the variable name `submussion` (sic) is kept unchanged so that any
    # cell reading it from the kernel namespace keeps working.
    submussion = pd.concat(test_dataset_submissions, axis=0)
    # Timestamp in the file name keeps successive runs from overwriting each other.
    IDENTIFIER = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    submussion.to_csv('../submissions/submission-DBSCAN-sample-{}.csv.gz'.format(IDENTIFIER), index=False, compression='gzip')

Event ID:  0
Event ID:  1
Event ID:  2
Event ID:  3
Event ID:  4
Event ID:  5
Event ID:  6
Event ID:  7
Event ID:  8
Event ID:  9
Event ID:  10
Event ID:  11
Event ID:  12
Event ID:  13
Event ID:  14
Event ID:  15
Event ID:  16
Event ID:  17
Event ID:  18
Event ID:  19
Event ID:  20
Event ID:  21
Event ID:  22
Event ID:  23
Event ID:  24
Event ID:  25
Event ID:  26
Event ID:  27
Event ID:  28
Event ID:  29
Event ID:  30
Event ID:  31
Event ID:  32
Event ID:  33
Event ID:  34
Event ID:  35
Event ID:  36
Event ID:  37
Event ID:  38
Event ID:  39
Event ID:  40
Event ID:  41
Event ID:  42
Event ID:  43
Event ID:  44
Event ID:  45
Event ID:  46
Event ID:  47
Event ID:  48
Event ID:  49
Event ID:  50
Event ID:  51
Event ID:  52
Event ID:  53
Event ID:  54
Event ID:  55
Event ID:  56
Event ID:  57
Event ID:  58
Event ID:  59
Event ID:  60
Event ID:  61
Event ID:  62
Event ID:  63
Event ID:  64
Event ID:  65
Event ID:  66
Event ID:  67
Event ID:  68
Event ID:  69
Event ID:  70
Event ID:  71
Ev

In [16]:
# Reload the hits, cells, particles and truth tables of the event named by `event_prefix`.
# NOTE(review): `path_to_train` is reassigned to "../data/train_100_events" in an
# earlier cell of this notebook, so on a top-to-bottom run this loads from that
# directory; the execution counts here are also out of order - confirm the intended path.
hits, cells, particles, truth = load_event(os.path.join(path_to_train, event_prefix))

In [17]:
# Preview the first rows of the reloaded hits table.
hits.head()

Unnamed: 0,hit_id,x,y,z,volume_id,layer_id,module_id
0,1,-64.409897,-7.1637,-1502.5,7,2,1
1,2,-55.336102,0.635342,-1502.5,7,2,1
2,3,-83.830498,-1.14301,-1502.5,7,2,1
3,4,-96.1091,-8.24103,-1502.5,7,2,1
4,5,-62.673599,-9.3712,-1502.5,7,2,1


In [18]:
# Preview the first rows of the particles table.
particles.head()

Unnamed: 0,particle_id,vx,vy,vz,px,py,pz,q,nhits
0,4503668346847232,-0.009288,0.009861,-0.077879,-0.055269,0.323272,-0.203492,-1,8
1,4503737066323968,-0.009288,0.009861,-0.077879,-0.948125,0.470892,2.01006,1,11
2,4503805785800704,-0.009288,0.009861,-0.077879,-0.886484,0.105749,0.683881,-1,0
3,4503874505277440,-0.009288,0.009861,-0.077879,0.257539,-0.676718,0.991616,1,12
4,4503943224754176,-0.009288,0.009861,-0.077879,16.4394,-15.5489,-39.824902,1,3


In [19]:
# Preview the first rows of the truth table.
truth.head()

Unnamed: 0,hit_id,particle_id,tx,ty,tz,tpx,tpy,tpz,weight
0,1,0,-64.411598,-7.16412,-1502.5,250710.0,-149908.0,-956385.0,0.0
1,2,22525763437723648,-55.338501,0.630805,-1502.5,-0.570605,0.02839,-15.4922,1e-05
2,3,0,-83.828003,-1.14558,-1502.5,626295.0,-169767.0,-760877.0,0.0
3,4,297237712845406208,-96.122902,-8.23036,-1502.5,-0.225235,-0.050968,-3.70232,8e-06
4,5,418835796137607168,-62.659401,-9.37504,-1502.5,-0.281806,-0.023487,-6.57318,9e-06
