based on https://www.kaggle.com/mikhailhushchyn/knn-approach

In [22]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os

from trackml.dataset import load_event, load_dataset
from trackml.score import score_event

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

import time  

In [3]:
import sys
print("Python version")
print (sys.version)
print("Version info.")
print (sys.version_info)
#print ("trackml.__version__:", trackml.__version__)#weird error, worked in previous version

Python version
3.6.4 |Anaconda, Inc.| (default, Jan 16 2018, 18:10:19) 
[GCC 7.2.0]
Version info.
sys.version_info(major=3, minor=6, micro=4, releaselevel='final', serial=0)


In [35]:
dataDir = "/home/ubuntu/kaggleData/"
path_to_train = dataDir+"train_1"
path_to_test = dataDir+"test"

In [11]:
# This event is in Train_1
#event_prefix = "event000001008"
event_prefix = "event000001000"

In [6]:
hits, cells, particles, truth = load_event(os.path.join(path_to_train, event_prefix))

In [8]:
print ("type(hits):", type(hits), hits.shape)
print ("type(cells):", type(cells), cells.shape)
print ("type(particles):", type(particles), particles.shape)
print ("type(truth):", type(truth), truth.shape)

type(hits): <class 'pandas.core.frame.DataFrame'> (111867, 7)
type(cells): <class 'pandas.core.frame.DataFrame'> (612411, 4)
type(particles): <class 'pandas.core.frame.DataFrame'> (11128, 9)
type(truth): <class 'pandas.core.frame.DataFrame'> (111867, 9)


In [9]:
hits.head()

Unnamed: 0,hit_id,x,y,z,volume_id,layer_id,module_id
0,1,-82.837997,-11.2402,-1502.5,7,2,1
1,2,-62.0214,-9.77127,-1502.5,7,2,1
2,3,-69.657799,-6.66082,-1502.5,7,2,1
3,4,-65.937302,-11.6342,-1502.5,7,2,1
4,5,-78.020599,-9.37532,-1502.5,7,2,1


In [12]:
def get_training_sample(path_to_data, event_names):

    events = []
    track_id = 0

    for name in event_names:

        # Read an event
        hits, cells, particles, truth = load_event(os.path.join(path_to_data, name))

        # Generate new vector of particle id
        particle_ids = truth.particle_id.values
        particle2track = {}
        for pid in np.unique(particle_ids):
            particle2track[pid] = track_id
            track_id += 1
        hits['particle_id'] = [particle2track[pid] for pid in particle_ids]

        # Collect hits
        events.append(hits)

    # Put all hits into one sample with unique tracj ids
    data = pd.concat(events, axis=0)

    return data

In [14]:
#nb: can extend this to use more data.
start_event_id = 1000
n_train_samples = 5
train_event_names = ["event0000{:05d}".format(i) for i in range(start_event_id, start_event_id+n_train_samples)]
train_data = get_training_sample(path_to_train, train_event_names)

In [18]:
print ("type(train_event_names):", type(train_event_names), len(train_event_names))
print ("type(train_data):", type(train_data), train_data.shape)

type(train_event_names): <class 'list'> 5
type(train_data): <class 'pandas.core.frame.DataFrame'> (583142, 8)


In [19]:
range(start_event_id, start_event_id+n_train_samples)

range(1000, 1005)

In [21]:
class Clusterer(object):
    
    def __init__(self):
        self.classifier = None
        
    
    def _preprocess(self, hits):
        
        x = hits.x.values
        y = hits.y.values
        z = hits.z.values

        r = np.sqrt(x**2 + y**2 + z**2)
        hits['x2'] = x/r
        hits['y2'] = y/r
        hits['z2'] = z/r

        ss = StandardScaler()
        X = ss.fit_transform(hits[['x2', 'y2', 'z2']].values)
        
        return X
    
    def fit(self, hits):
        
        X = self._preprocess(hits)
        y = hits.particle_id.values
        
        self.classifier = KNeighborsClassifier(n_neighbors=1, n_jobs=-1)
        self.classifier.fit(X, y)
    
    
    def predict(self, hits):
        
        X = self._preprocess(hits)
        labels = self.classifier.predict(X)
        
        return labels

In [24]:
model = Clusterer()
t1 = time.clock()
model.fit(train_data)
t2 = time.clock()
print ('%0.2f seconds elapsed' % (t2-t1))

0.51 seconds elapsed


In [25]:
path_to_event = os.path.join(path_to_train, "event0000{:05d}".format(start_event_id + n_train_samples + 1))
hits, cells, particles, truth = load_event(path_to_event)

In [26]:
print ("type(hits):", type(hits), hits.shape)
print ("type(cells):", type(cells), cells.shape)
print ("type(particles):", type(particles), particles.shape)
print ("type(truth):", type(truth), truth.shape)

type(hits): <class 'pandas.core.frame.DataFrame'> (118534, 7)
type(cells): <class 'pandas.core.frame.DataFrame'> (651227, 4)
type(particles): <class 'pandas.core.frame.DataFrame'> (11773, 9)
type(truth): <class 'pandas.core.frame.DataFrame'> (118534, 9)


In [27]:
%%time
# Warning: it takes about 30s per one event
labels = model.predict(hits)

CPU times: user 20.2 s, sys: 4 ms, total: 20.2 s
Wall time: 19.6 s


In [31]:
print ("type(labels):", type(labels), labels.shape)
print (labels[0:5])

type(labels): <class 'numpy.ndarray'> (118534,)
[    0 37993 33157 22339 15135]


In [32]:
def create_one_event_submission(event_id, hits, labels):
    sub_data = np.column_stack(([event_id]*len(hits), hits.hit_id.values, labels))
    submission = pd.DataFrame(data=sub_data, columns=["event_id", "hit_id", "track_id"]).astype(int)
    return submission

In [33]:
submission = create_one_event_submission(0, hits, labels)
score = score_event(truth, submission)

In [34]:
print("Your score: ", score)

Your score:  0.09900948072230796


In [40]:
#load_dataset(path_to_train, skip=1000, nevents=5)
#<generator object load_dataset at 0x7f18cfb68bf8>

In [37]:
dataset_submissions = []
dataset_scores = []
skip_start = 1000
num_events = 5

for event_id, hits, cells, particles, truth in load_dataset(path_to_train, skip=skip_start, nevents=num_events):
        
    t1 = time.clock()
    # Track pattern recognition
    labels = model.predict(hits)
        
    # Prepare submission for an event
    one_submission = create_one_event_submission(event_id, hits, labels)
    dataset_submissions.append(one_submission)
    
    # Score for the event
    score = score_event(truth, one_submission)
    dataset_scores.append(score)
    
    t2 = time.clock()
    print("Score for event %d: %.3f, %0.2f seconds elapsed" % (event_id, score, t2-t1))
    
print('Mean score: %.3f' % (np.mean(dataset_scores)))

Score for event 2000: 0.107
Score for event 2001: 0.114
Score for event 2002: 0.105
Score for event 2003: 0.105
Score for event 2004: 0.102
Mean score: 0.107


In [39]:
print ("dataset_scores:", type(dataset_scores), len(dataset_scores))

dataset_scores: <class 'list'> 5


In [43]:
test_dataset_submissions = []
csv_save_interval = 100

create_submission = True # True for submission 
t0 = time.clock()


if create_submission:
    print ("creating submission.")
    for event_id, hits, cells in load_dataset(path_to_test, parts=['hits', 'cells']):

        t1 = time.clock()
        # Track pattern recognition
        labels = model.predict(hits)

        # Prepare submission for an event
        one_submission = create_one_event_submission(event_id, hits, labels)
        test_dataset_submissions.append(one_submission)
        
        t2 = time.clock()
        print('Event ID: %d, %0.2f seconds elapsed, %d', % (event_id, t2-t1, test_dataset_submissions.shape))
        if event_id%csv_save_interval ==0 and event_id>0:
            fileout = 'submission_'+str(event_id)+ '.csv.gz'
            submission = pd.concat(test_dataset_submissions, axis=0)
            submission.to_csv(fileout, index=False, compression='gzip')            

    # Create submission file
    submission = pd.concat(test_dataset_submissions, axis=0)
    submission.to_csv('submission.csv.gz', index=False, compression='gzip')
print("total time elapsed:", time.clock()-t0)

Event ID: %d, %0.2f seconds elapsed (0, 21.364789000000002)
Event ID: %d, %0.2f seconds elapsed (1, 22.442240000000027)
Event ID: %d, %0.2f seconds elapsed (2, 18.705468999999994)
Event ID: %d, %0.2f seconds elapsed (3, 19.519570999999985)
Event ID: %d, %0.2f seconds elapsed (4, 21.538152999999966)
Event ID: %d, %0.2f seconds elapsed (5, 18.770629000000042)
Event ID: %d, %0.2f seconds elapsed (6, 18.739829999999984)
Event ID: %d, %0.2f seconds elapsed (7, 21.643175999999983)
Event ID: %d, %0.2f seconds elapsed (8, 20.690717000000006)
Event ID: %d, %0.2f seconds elapsed (9, 20.35071800000003)
Event ID: %d, %0.2f seconds elapsed (10, 21.04607699999997)
Event ID: %d, %0.2f seconds elapsed (11, 18.737503000000004)
Event ID: %d, %0.2f seconds elapsed (12, 17.97850699999998)
Event ID: %d, %0.2f seconds elapsed (13, 20.74895399999997)
Event ID: %d, %0.2f seconds elapsed (14, 17.78191899999996)
Event ID: %d, %0.2f seconds elapsed (15, 14.785299000000009)
Event ID: %d, %0.2f seconds elapsed (16