In [1]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, GRU, Bidirectional, TimeDistributed, BatchNormalization, Embedding

from numpy import array
from keras.models import load_model
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint
import os

import numpy as np
import pandas as pd
from tqdm import tqdm

from trackml.dataset import load_event, load_dataset
from trackml.score import score_event

from scipy import sparse as ssp

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import lightgbm as lgbm

import warnings
warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [2]:
def append_to_csv(batch, csv_file):
    props = dict(encoding='utf-8', index=False)
    if not os.path.exists(csv_file):
        batch.to_csv(csv_file, **props)
    else:
        batch.to_csv(csv_file, mode='a', header=False, **props)

def delete_file_if_exists(filename):
    if os.path.exists(filename):
        os.remove(filename)

In [3]:
def create_one_event_submission(event_id, hits, labels):
    sub_data = np.column_stack(([event_id]*len(hits), hits.hit_id.values, labels))
    submission = pd.DataFrame(data=sub_data, columns=["event_id", "hit_id", "track_id"]).astype(int)
    return submission

In [4]:
def get_coordinates(hits):
    x = hits['x'].values
    y = hits['y'].values
    z = hits['z'].values
    rho = np.sqrt(x**2 + y**2 + z**2)
    r = np.sqrt(x**2 + y**2)
    hits['r'] = r # cylindrical coordinate
    hits['rho'] = rho # spherical  coordinate
    hits['theta'] = np.arctan(y/x)
    hits['phi'] = np.arctan(r/z)
    return hits

In [5]:
# self-defined eval metric
# f(preds: array, train_data: Dataset) -> name: string, value: array, is_higher_better: bool
# binary error
def my_error(preds, train_data):
    labels = train_data.get_label()
#     print(preds.shape)
#     print(len(labels))
#     (932331380,)
# 104945
# 8884
# (104945, 32)
# (106779, 32)
    p = preds.reshape(len(hits_valid), -1)
    preds = [np.argmax(p[i]) for i in range(len(hits_valid))]
#     print(len(preds))
#     print(hits_valid.shape)
#     print(hits_train.shape)
#     print(labels)
    
    one_submission = create_one_event_submission(0, hits_valid, preds)
    score = score_event(hits_valid, one_submission)
    return 'error', (1.0-score), False


In [6]:
# hits = pd.read_csv('../cache/train_100_hits.csv')
hits = pd.read_csv('../cache/hits_2_events_ge9.csv')
print(hits.shape)
hits = hits[hits.particle_id != 0]
print(hits.shape)
hits = get_coordinates(hits)

(211724, 26)
(211724, 26)


In [7]:
hits.shape

(211724, 30)

In [8]:
hits.head().T

Unnamed: 0,0,1,2,3,4
Unnamed: 0,366640,366641,366643,366644,366645
hit_id,1,2,5,6,7
x,-69.691,-78.1122,-95.8917,-70.1793,-91.4769
y,-9.82317,-13.9466,-0.249342,-7.20341,-14.0455
z,-1502.5,-1502.5,-1502.5,-1502.5,-1502.5
volume_id,7,7,7,7,7
layer_id,2,2,2,2,2
module_id,1,1,1,1,1
particle_id,648527486031757312,324272917066022912,968275981468958720,869199675884830720,824162511380021248
tx,-69.6983,-78.1035,-95.9085,-70.1977,-91.4803


In [9]:
hits.volume_id.value_counts()

8     53224
13    39289
9     36208
7     35226
14    14543
12    14331
17     9281
18     4900
16     4722
Name: volume_id, dtype: int64

In [10]:
hits.layer_id.nunique()

7

In [11]:
hits.module_id.nunique()

2957

In [12]:
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, OneHotEncoder

In [13]:
le = LabelEncoder()
hits['v_id'] = le.fit_transform(hits['volume_id'].values)

In [14]:
hits['v_id'].value_counts()

1    53224
4    39289
2    36208
0    35226
5    14543
3    14331
7     9281
8     4900
6     4722
Name: v_id, dtype: int64

In [15]:
lb = LabelBinarizer()
hits['v_id'] = lb.fit_transform(hits['volume_id'].values)

In [16]:
hits['v_id'].value_counts()

0    176498
1     35226
Name: v_id, dtype: int64

In [17]:
cat_features = ['volume_id', 'layer_id', 'module_id']

In [18]:
ohe = OneHotEncoder()
X_cat = ohe.fit_transform(hits[cat_features])

In [19]:
X_cat.shape

(211724, 2973)

In [20]:
num_features = ['x', 'y', 'z', 'r', 'rho', 'theta', 'phi']
# num_features = ['rho', 'theta', 'phi']
# num_features = ['x', 'y', 'z']

In [21]:
hits['new_pid'] = hits.particle_id.astype('str') + '_' + hits.event_id.astype('str') 
# hits = hits[hits.nhits >= 9]


In [22]:
new_pid_count = list(range(hits.new_pid.nunique()))
new_pid_list = list(set(hits.new_pid.values))
new_pid_dict = dict(zip(new_pid_list, new_pid_count))

In [23]:
# new_pid_count = list(range(100))
# new_pid_list = list(set(hits.new_pid.values))
# new_pid_list = np.random.choice(new_pid_list,100)
# new_pid_dict = dict(zip(new_pid_list, new_pid_count))
# hits = hits[hits.new_pid.isin(new_pid_list)]

In [24]:
hits.head()

Unnamed: 0.1,Unnamed: 0,hit_id,x,y,z,volume_id,layer_id,module_id,particle_id,tx,...,pz,q,nhits,event_id,new_pid,r,rho,theta,phi,v_id
0,366640,1,-69.691,-9.82317,-1502.5,7,2,1,648527486031757312,-69.6983,...,-5.50525,1.0,13.0,1004,648527486031757312_1004,70.379899,1504.14746,0.140031,-0.046808,1
1,366641,2,-78.1122,-13.9466,-1502.5,7,2,1,324272917066022912,-78.1035,...,-14.8676,1.0,16.0,1004,324272917066022912_1004,79.347485,1504.593724,0.176684,-0.052761,1
2,366643,5,-95.8917,-0.249342,-1502.5,7,2,1,968275981468958720,-95.9085,...,-8.09741,-1.0,15.0,1004,968275981468958720_1004,95.892024,1505.556884,0.0026,-0.063735,1
3,366644,6,-70.1793,-7.20341,-1502.5,7,2,1,869199675884830720,-70.1977,...,-5.313,-1.0,13.0,1004,869199675884830720_1004,70.548021,1504.155335,0.102285,-0.046919,1
4,366645,7,-91.4769,-14.0455,-1502.5,7,2,1,824162511380021248,-91.4803,...,-19.9529,1.0,14.0,1004,824162511380021248_1004,92.548902,1505.347651,0.152352,-0.061519,1


In [25]:
hits['nid'] = hits['new_pid'].map(lambda x: new_pid_dict[x])

In [26]:
# hits = hits[hits.new_pid.isin(new_pid_list)]
# hits['nid'] = hits['new_pid'].map(lambda x: new_pid_dict[x])
hits.sort_values(['nid', 'z'], inplace=True)

event_list = list(set(hits.event_id.values))
valid_events = np.random.choice(event_list,1)
print(valid_events)



[1004]


In [27]:
hits_train = hits[~hits.event_id.isin(valid_events)]
hits_valid = hits[hits.event_id.isin(valid_events)]

In [28]:
hits.head(20)

Unnamed: 0.1,Unnamed: 0,hit_id,x,y,z,volume_id,layer_id,module_id,particle_id,tx,...,q,nhits,event_id,new_pid,r,rho,theta,phi,v_id,nid
131513,7815875,27847,28.9824,-13.4191,114.446,8,2,135,126104912734978048,28.9821,...,-1.0,13.0,1087,126104912734978048_1087,31.938249,118.818932,-0.433619,0.272145,0,0
131577,7815948,27929,31.1725,-14.4083,123.38,8,2,136,126104912734978048,31.1725,...,-1.0,13.0,1087,126104912734978048_1087,34.341285,128.070091,-0.432963,0.271466,0,0
139240,7824678,37378,65.4014,-29.4613,262.303,8,4,334,126104912734978048,65.4011,...,-1.0,13.0,1087,126104912734978048_1087,71.730825,271.934137,-0.423244,0.266939,0,0
145997,7832498,46114,105.91,-46.2741,425.062,8,6,699,126104912734978048,105.912,...,-1.0,13.0,1087,126104912734978048_1087,115.577768,440.49509,-0.411923,0.265489,0,0
152658,7840083,54803,149.383,-63.0896,598.0,9,2,48,126104912734978048,149.374,...,-1.0,13.0,1087,126104912734978048_1087,162.159114,619.596303,-0.399611,0.264801,0,0
181651,7871582,89002,241.197,-94.9876,958.4,13,2,778,126104912734978048,241.196,...,-1.0,13.0,1087,126104912734978048_1087,259.226999,992.838958,-0.375165,0.264158,0,0
195807,7888890,111096,307.791,-115.082,1215.5,14,2,74,126104912734978048,307.756,...,-1.0,13.0,1087,126104912734978048_1087,328.601836,1259.134392,-0.357803,0.264031,0,0
197093,7890320,112733,382.165,-134.993,1501.5,14,4,78,126104912734978048,382.591,...,-1.0,13.0,1087,126104912734978048_1087,405.306301,1555.241283,-0.339551,0.263651,0,0
197073,7890298,112708,383.207,-135.263,1504.5,14,4,75,126104912734978048,383.38,...,-1.0,13.0,1087,126104912734978048_1087,406.378745,1558.417125,-0.339324,0.263813,0,0
198295,7891614,114229,461.595,-154.046,1801.5,14,6,78,126104912734978048,461.858,...,-1.0,13.0,1087,126104912734978048_1087,486.62112,1866.066013,-0.322103,0.263824,0,0


In [29]:
hits_train.shape

(104945, 32)

In [30]:
hits_valid.shape

(106779, 32)

In [31]:
ohe = OneHotEncoder()
ohe.fit(hits[cat_features].values)

OneHotEncoder(categorical_features='all', dtype=<class 'numpy.float64'>,
       handle_unknown='error', n_values='auto', sparse=True)

In [32]:
X_cat_train = ohe.transform(hits_train[cat_features].values)
X_cat_valid = ohe.transform(hits_valid[cat_features].values)

In [33]:
X_num_train = hits_train[num_features].values
X_num_valid = hits_valid[num_features].values



In [34]:

y_train = hits_train['nid'].values
y_valid = hits_valid['nid'].values

In [35]:
y_train

array([    0,     0,     0, ..., 17642, 17642, 17642])

In [36]:
train_list = [X_num_train,X_cat_train,]
valid_list = [X_num_valid,X_cat_valid,]

# train_list = [X_num_train,]
# valid_list = [X_num_valid,]

In [37]:
X_train = ssp.hstack(train_list).tocsr()
X_valid = ssp.hstack(valid_list).tocsr()

# X_train = X_num_train
# X_valid = X_num_valid

In [38]:
num_class = hits_train['nid'].nunique()
print(num_class)

8759


In [39]:
train_nid_list = list(set(hits_train['nid'].values))
train_class_list = list(range(hits_train['nid'].nunique()))
train_y_map = dict(zip(train_nid_list, train_class_list))
y_train = [train_y_map[i] for i in hits_train['nid'].values]

valid_nid_list = list(set(hits_valid['nid'].values))
valid_class_list = list(range(hits_valid['nid'].nunique()))
valid_y_map = dict(zip(valid_nid_list, valid_class_list))
y_valid = [valid_y_map[i] for i in hits_valid['nid'].values]

In [40]:
len(train_nid_list)

8759

In [41]:
learning_rate = 0.01
num_leaves = 15
min_data_in_leaf = 2000
feature_fraction = 0.6
num_boost_round = 10000
params = {"objective": "multiclass",
          "boosting_type": "gbdt",
          'metric': 'multi_logloss',
          'num_class':len(train_nid_list),
          "learning_rate": learning_rate,
          "num_leaves": num_leaves,
          "max_bin": 256,
          "feature_fraction": feature_fraction,
          "verbosity": 0,
          "drop_rate": 0.1,
          "is_unbalance": False,
          "max_drop": 50,
          "min_child_samples": 10,
          "min_child_weight": 150,
          "min_split_gain": 0,
          "subsample": 0.9
          }

# params = {
#     'boosting_type': 'gbdt',
#     'objective': 'binary',
#     'metric': 'binary_logloss',
#     'num_leaves': 31,
#     'learning_rate': 0.05,
#     'feature_fraction': 0.9,
#     'bagging_fraction': 0.8,
#     'bagging_freq': 5,
#     'verbose': 0
# }

params = {'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class':len(train_nid_list),
    'num_threads':12,
    'metric': 'multi_logloss',
    'learning_rate': 0.002296,
    'max_depth': 7,
    'num_leaves': 17,
    'feature_fraction': 0.4,
    'bagging_fraction': 0.6,
    'bagging_freq': 17}

In [42]:
X_train.shape

(104945, 2980)

In [43]:
X_valid.shape

(106779, 2980)

In [44]:
106779 * 104945

11205922155

In [45]:
104945 * 2980

312736100

In [46]:
106779 * 2980

318201420

In [47]:
932331380/104945

8884.0

In [48]:
y_valid

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 15,
 15,
 15,
 15,
 15,
 15,
 15,
 15,
 15,
 15,
 15,
 15,
 15,
 15,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 17,
 17,
 17,
 17,
 17,
 17,
 17,
 17,
 17,
 17,
 17,
 17,
 17,
 18,
 18,

In [49]:
dtrain = lgbm.Dataset(X_train, y_train)
dvalid = lgbm.Dataset(X_valid, y_valid, reference=dtrain)
bst = lgbm.train(params, dtrain, num_boost_round, valid_sets=dvalid, feval=my_error, verbose_eval=100,
                early_stopping_rounds=100)
print(bst.best_iteration)
# cv_pred += bst.predict(X_test, num_iteration=bst.best_iteration)

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 9.46666	valid_0's error: 1
Early stopping, best iteration is:
[1]	valid_0's multi_logloss: 9.43847	valid_0's error: 1
1
