In [1]:
#!pip uninstall trackml

In [2]:
#!pip install --user git+https://github.com/LAL/trackml-library

In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import trackml

#from trackml.dataset import load_event
#from trackml.dataset import load_dataset
#from trackml.score import score_event

In [4]:
import sys
# Record the interpreter that produced the outputs shown in this notebook.
# A single call with a newline separator emits the same four lines.
print("Python version", sys.version, "Version info.", sys.version_info, sep="\n")
#print ("trackml.__version__:", trackml.__version__)

Python version
3.6.4 |Anaconda, Inc.| (default, Jan 16 2018, 18:10:19) 
[GCC 7.2.0]
Version info.
sys.version_info(major=3, minor=6, micro=4, releaselevel='final', serial=0)


In [5]:
from trackml.dataset import load_event

In [6]:
from trackml.dataset import load_dataset
from trackml.score import score_event

In [7]:
# Root directory of the TrackML data. It can be overridden with the
# TRACKML_DATA_DIR environment variable; the default is the location used
# for the recorded run. Joining with "" guarantees a trailing separator, so
# code that concatenates a sub-directory name onto dataDir keeps working.
dataDir = os.path.join(os.environ.get("TRACKML_DATA_DIR", "/home/ubuntu/kaggleData/"), "")
# Directory holding the training events used below.
path_to_train = os.path.join(dataDir, "train_1")

In [8]:
# File-name prefix of the single event used for the walkthrough below.
# This event is in Train_1
event_prefix = "event000001008"

In [9]:
# Display the full path prefix that is handed to load_event in the next cell.
os.path.join(path_to_train, event_prefix)

'/home/ubuntu/kaggleData/train_1/event000001008'

In [10]:
# Load the four tables of the chosen event: hits, cells, particles and truth.
hits, cells, particles, truth = load_event(os.path.join(path_to_train, event_prefix))

In [11]:
# Preview the first rows of the hits table (hit_id, x/y/z position and the
# volume/layer/module identifiers).
hits.head()

Unnamed: 0,hit_id,x,y,z,volume_id,layer_id,module_id
0,1,-82.837997,-11.2402,-1502.5,7,2,1
1,2,-62.0214,-9.77127,-1502.5,7,2,1
2,3,-69.657799,-6.66082,-1502.5,7,2,1
3,4,-65.937302,-11.6342,-1502.5,7,2,1
4,5,-78.020599,-9.37532,-1502.5,7,2,1


In [12]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

class Clusterer(object):
    """Group the hits of one event into track candidates with DBSCAN.

    Every hit is mapped to three features derived from its position
    (see ``_preprocess``), the features are standardized, and DBSCAN
    clusters hits that lie close together in that feature space.

    Parameters
    ----------
    eps : float
        Value passed to DBSCAN as ``eps``; it is measured in the
        standardized feature space, not in detector coordinates.
    """

    def __init__(self, eps):
        self.eps = eps

    def _preprocess(self, hits):
        """Return the standardized feature matrix for ``hits``.

        The features are ``x / r3``, ``y / r3`` and ``z / rt`` where
        ``r3 = sqrt(x^2 + y^2 + z^2)`` and ``rt = sqrt(x^2 + y^2)``.

        The input frame is only read. Earlier versions stored the
        features as extra ``x2``/``y2``/``z2`` columns on ``hits``, which
        silently modified the caller's DataFrame.

        Parameters
        ----------
        hits : pandas.DataFrame
            Must provide ``x``, ``y`` and ``z`` columns.

        Returns
        -------
        numpy.ndarray
            Array of shape ``(len(hits), 3)`` as returned by
            ``StandardScaler.fit_transform``.
        """
        x = hits.x.values
        y = hits.y.values
        z = hits.z.values

        # Distance from the origin in 3D and in the transverse (x, y) plane.
        r3 = np.sqrt(x**2 + y**2 + z**2)
        rt = np.sqrt(x**2 + y**2)

        # Assemble the features in a fresh array instead of writing them
        # back onto the caller's frame.
        features = np.column_stack((x / r3, y / r3, z / rt))

        return StandardScaler().fit_transform(features)

    def predict(self, hits):
        """Assign a cluster label (track candidate id) to every hit.

        Parameters
        ----------
        hits : pandas.DataFrame
            Hits of a single event; see ``_preprocess`` for the columns used.

        Returns
        -------
        numpy.ndarray
            One integer label per row of ``hits``, in row order.
        """
        X = self._preprocess(hits)

        # min_samples=1 lets a single hit form its own cluster.
        cl = DBSCAN(eps=self.eps, min_samples=1, algorithm='kd_tree')
        labels = cl.fit_predict(X)

        return labels

In [13]:
# Cluster the hits of the loaded event. eps is the DBSCAN radius that the
# Clusterer applies to its standardized features.
model = Clusterer(eps=0.008)
labels = model.predict(hits)

In [14]:
# Sanity check: the result is a 1-D array of labels (its shape is printed).
print ("type(model):", type(model))
print ("type(labels):", type(labels), labels.shape)

type(model): <class '__main__.Clusterer'>
type(labels): <class 'numpy.ndarray'> (111867,)


In [15]:
# Show the (abbreviated) array of cluster labels.
print(labels)

[    0     1     2 ... 60791 60789 60792]


In [16]:
def create_one_event_submission(event_id, hits, labels):
    """Build the submission table for a single event.

    Parameters
    ----------
    event_id : int
        Identifier written into every row of the ``event_id`` column.
    hits : pandas.DataFrame
        Hits of the event; only its ``hit_id`` column and length are used.
    labels : array-like
        One track label per hit, in the same order as ``hits``.

    Returns
    -------
    pandas.DataFrame
        Integer columns ``event_id``, ``hit_id`` and ``track_id`` with one
        row per hit.
    """
    n_hits = len(hits)
    hit_ids = hits.hit_id.values
    # Stack the three columns side by side; event_id is repeated per hit.
    table = np.column_stack(([event_id] * n_hits, hit_ids, labels))
    frame = pd.DataFrame(data=table, columns=["event_id", "hit_id", "track_id"])
    return frame.astype(int)

In [17]:
# Score the single-event prediction against its truth table.
# NOTE(review): event_id is passed as 0 rather than the real event number;
# presumably score_event does not use that column. TODO confirm.
submission = create_one_event_submission(0, hits, labels)
score = score_event(truth, submission)

In [18]:
# Report the score of the single-event prediction.
print("Your score: ", score)

Your score:  0.20589788785782545


In [19]:
# load_dataset returns a generator (see the cell output); nothing is read
# here because the generator is never iterated.
load_dataset(path_to_train, skip=1000, nevents=5)


<generator object load_dataset at 0x7f8347b33258>

In [20]:
# Run the clusterer over the first few training events and score each one.
dataset_submissions = []
dataset_scores = []

# The Clusterer only stores eps, so one instance can serve every event
# instead of being rebuilt on each iteration.
model = Clusterer(eps=0.008)

for event_id, hits, cells, particles, truth in load_dataset(path_to_train, skip=0, nevents=5):

    # Track pattern recognition
    labels = model.predict(hits)

    # Prepare submission for an event
    one_submission = create_one_event_submission(event_id, hits, labels)
    dataset_submissions.append(one_submission)

    # Score for the event
    score = score_event(truth, one_submission)
    dataset_scores.append(score)

    print("Score for event %d: %.3f" % (event_id, score))

# np.mean of an empty list yields nan plus a RuntimeWarning; report the
# empty case explicitly instead.
if dataset_scores:
    print('Mean score: %.3f' % (np.mean(dataset_scores)))
else:
    print("No events were loaded from", path_to_train)

Score for event 1000: 0.199
Score for event 1001: 0.204
Score for event 1002: 0.183
Score for event 1003: 0.213
Score for event 1004: 0.195
Mean score: 0.199


In [21]:
# Build the submission file from every event in the test directory.
# os.path.join works whether or not dataDir ends with a separator.
path_to_test = os.path.join(dataDir, "test")
test_dataset_submissions = []
submit_file = 'submission.csv.gz'

create_submission = True # True for submission

if create_submission:
    # The Clusterer only stores eps, so one instance is reused for all events.
    model = Clusterer(eps=0.008)

    # Test events have no truth/particles; only hits and cells are loaded.
    for event_id, hits, cells in load_dataset(path_to_test, parts=['hits', 'cells']):

        # Track pattern recognition
        labels = model.predict(hits)

        # Prepare submission for an event
        one_submission = create_one_event_submission(event_id, hits, labels)
        test_dataset_submissions.append(one_submission)

        print('Event ID: ', event_id)

    # pd.concat raises an opaque "No objects to concatenate" on an empty
    # list; fail with a message that names the directory instead.
    if not test_dataset_submissions:
        raise ValueError("no events found in %s; nothing to submit" % path_to_test)

    # Create submission file
    print("creating submission file")
    submission = pd.concat(test_dataset_submissions, axis=0)
    submission.to_csv(submit_file, index=False, compression='gzip')
    print ("submit_file:",submit_file)

Event ID:  0
Event ID:  1
Event ID:  2
Event ID:  3
Event ID:  4
Event ID:  5
Event ID:  6
Event ID:  7
Event ID:  8
Event ID:  9
Event ID:  10
Event ID:  11
Event ID:  12
Event ID:  13
Event ID:  14
Event ID:  15
Event ID:  16
Event ID:  17
Event ID:  18
Event ID:  19
Event ID:  20
Event ID:  21
Event ID:  22
Event ID:  23
Event ID:  24
Event ID:  25
Event ID:  26
Event ID:  27
Event ID:  28
Event ID:  29
Event ID:  30
Event ID:  31
Event ID:  32
Event ID:  33
Event ID:  34
Event ID:  35
Event ID:  36
Event ID:  37
Event ID:  38
Event ID:  39
Event ID:  40
Event ID:  41
Event ID:  42
Event ID:  43
Event ID:  44
Event ID:  45
Event ID:  46
Event ID:  47
Event ID:  48
Event ID:  49
Event ID:  50
Event ID:  51
Event ID:  52
Event ID:  53
Event ID:  54
Event ID:  55
Event ID:  56
Event ID:  57
Event ID:  58
Event ID:  59
Event ID:  60
Event ID:  61
Event ID:  62
Event ID:  63
Event ID:  64
Event ID:  65
Event ID:  66
Event ID:  67
Event ID:  68
Event ID:  69
Event ID:  70
Event ID:  71
Ev

In [22]:
# Show the working directory; submit_file is a relative path, so the
# submission archive is written here.
os.getcwd()

'/home/ubuntu/gitrepo/kaggle_trackML_cern'

In [24]:
# NB: unzip the file before submitting.
# Note: the kaggle submit tool would not work.
#kaggle competitions submit -c trackml-particle-identification -f submission.csv -m "1st submit"
#