In [76]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, GRU, Bidirectional, TimeDistributed, BatchNormalization, Embedding

from numpy import array
from keras.models import load_model
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint
import os

import numpy as np
import pandas as pd
from tqdm import tqdm

from trackml.dataset import load_event, load_dataset
from trackml.score import score_event

from scipy import sparse as ssp

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import lightgbm as lgbm

import warnings
warnings.filterwarnings('ignore')

In [2]:
def append_to_csv(batch, csv_file):
    props = dict(encoding='utf-8', index=False)
    if not os.path.exists(csv_file):
        batch.to_csv(csv_file, **props)
    else:
        batch.to_csv(csv_file, mode='a', header=False, **props)

def delete_file_if_exists(filename):
    if os.path.exists(filename):
        os.remove(filename)

In [3]:
def create_one_event_submission(event_id, hits, labels):
    sub_data = np.column_stack(([event_id]*len(hits), hits.hit_id.values, labels))
    submission = pd.DataFrame(data=sub_data, columns=["event_id", "hit_id", "track_id"]).astype(int)
    return submission

In [4]:
def get_coordinates(hits):
    x = hits['x'].values
    y = hits['y'].values
    z = hits['z'].values
    rho = np.sqrt(x**2 + y**2 + z**2)
    r = np.sqrt(x**2 + y**2)
    hits['r'] = r # cylindrical coordinate
    hits['rho'] = rho # spherical  coordinate
    hits['theta'] = np.arctan(y/x)
    hits['phi'] = np.arctan(r/z)
    return hits

In [16]:
hits = pd.read_csv('../cache/train_100_hits.csv')
print(hits.shape)
hits = hits[hits.particle_id != 0]
print(hits.shape)
hits = get_coordinates(hits)

(9009603, 24)
(9009603, 24)


In [17]:
hits.shape

(9009603, 28)

In [19]:
hits.head().T

Unnamed: 0,0,1,2,3,4
hit_id,2.0,4.0,5.0,6.0,7.0
x,-55.3361,-96.1091,-62.6736,-57.0687,-73.8723
y,0.635342,-8.24103,-9.3712,-8.17777,-2.5789
z,-1502.5,-1502.5,-1502.5,-1502.5,-1502.5
volume_id,7.0,7.0,7.0,7.0,7.0
layer_id,2.0,2.0,2.0,2.0,2.0
module_id,1.0,1.0,1.0,1.0,1.0
particle_id,2.252576e+16,2.972377e+17,4.188358e+17,1.080877e+17,9.682862e+17
tx,-55.3385,-96.1229,-62.6594,-57.0856,-73.8608
ty,0.630805,-8.23036,-9.37504,-8.18971,-2.57586


In [20]:
hits.volume_id.value_counts()

8     2321389
13    1815247
7     1443970
9     1438880
12     597421
14     594635
17     401105
18     201739
16     195217
Name: volume_id, dtype: int64

In [21]:
hits.layer_id.nunique()

7

In [22]:
hits.module_id.nunique()

3192

In [32]:
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, OneHotEncoder

In [26]:
le = LabelEncoder()
hits['v_id'] = le.fit_transform(hits['volume_id'].values)

In [27]:
hits['v_id'].value_counts()

1    2321389
4    1815247
0    1443970
2    1438880
3     597421
5     594635
7     401105
8     201739
6     195217
Name: v_id, dtype: int64

In [29]:
lb = LabelBinarizer()
hits['v_id'] = lb.fit_transform(hits['volume_id'].values)

In [30]:
hits['v_id'].value_counts()

0    7565633
1    1443970
Name: v_id, dtype: int64

In [46]:
cat_features = ['volume_id', 'layer_id', 'module_id']

In [47]:
ohe = OneHotEncoder()
X_cat = ohe.fit_transform(hits[cat_features])

In [49]:
X_cat.shape

(9009603, 3208)

In [119]:
# num_features = ['x', 'y', 'z', 'r', 'rho', 'theta', 'phi']
# num_features = ['rho', 'theta', 'phi']
num_features = ['x', 'y', 'z']

In [51]:
hits['new_pid'] = hits.particle_id.astype('str') + '_' + hits.event_id.astype('str') 
# hits = hits[hits.nhits >= 9]


In [52]:
new_pid_count = list(range(hits.new_pid.nunique()))
new_pid_list = list(set(hits.new_pid.values))
new_pid_dict = dict(zip(new_pid_list, new_pid_count))

In [53]:
# new_pid_count = list(range(100))
# new_pid_list = list(set(hits.new_pid.values))
# new_pid_list = np.random.choice(new_pid_list,100)
# new_pid_dict = dict(zip(new_pid_list, new_pid_count))
# hits = hits[hits.new_pid.isin(new_pid_list)]

In [54]:
hits.head()

Unnamed: 0,hit_id,x,y,z,volume_id,layer_id,module_id,particle_id,tx,ty,...,pz,q,nhits,event_id,r,rho,theta,phi,v_id,new_pid
0,2,-55.3361,0.635342,-1502.5,7,2,1,22525763437723648,-55.3385,0.630805,...,-15.496,1.0,10.0,1000,55.339747,1503.518785,-0.011481,-0.036815,1,22525763437723648_1000
1,4,-96.1091,-8.24103,-1502.5,7,2,1,297237712845406208,-96.1229,-8.23036,...,-3.70766,-1.0,11.0,1000,96.461773,1505.59328,0.085537,-0.064113,1,297237712845406208_1000
2,5,-62.6736,-9.3712,-1502.5,7,2,1,418835796137607168,-62.6594,-9.37504,...,-6.58619,1.0,10.0,1000,63.370336,1503.835779,0.148424,-0.042152,1,418835796137607168_1000
3,6,-57.0687,-8.17777,-1502.5,7,2,1,108087696726949888,-57.0856,-8.18971,...,-10.4714,1.0,11.0,1000,57.651647,1503.605654,0.142328,-0.038352,1,108087696726949888_1000
4,7,-73.8723,-2.5789,-1502.5,7,2,1,968286151951515648,-73.8608,-2.57586,...,-9.13374,-1.0,13.0,1000,73.917301,1504.317127,0.034896,-0.049157,1,968286151951515648_1000


In [55]:
hits['nid'] = hits['new_pid'].map(lambda x: new_pid_dict[x])

In [56]:
# hits = hits[hits.new_pid.isin(new_pid_list)]
# hits['nid'] = hits['new_pid'].map(lambda x: new_pid_dict[x])
hits.sort_values(['nid', 'z'], inplace=True)

event_list = list(set(hits.event_id.values))
valid_events = np.random.choice(event_list,25)
print(valid_events)



[1023 1088 1018 1099 1042 1069 1076 1022 1078 1030 1041 1079 1014 1064 1045
 1044 1093 1085 1047 1071 1051 1005 1068 1069 1086]


In [136]:
hits_train = hits[~hits.event_id.isin(valid_events)]
hits_valid = hits[hits.event_id.isin(valid_events)]

In [137]:
hits.head(20)

Unnamed: 0,hit_id,x,y,z,volume_id,layer_id,module_id,particle_id,tx,ty,...,q,nhits,event_id,r,rho,theta,phi,v_id,new_pid,nid
5428535,65911,-248.954,87.8693,4.2,13,2,438,409837118114762753,-248.951,87.8743,...,-1.0,5.0,1059,264.005886,264.039293,-0.339304,1.554889,0,409837118114762753_1059,0
5433604,71942,-337.747,132.098,21.0,13,4,613,409837118114762753,-337.733,132.121,...,-1.0,5.0,1059,362.660885,363.268382,-0.372824,1.512956,0,409837118114762753_1059,0
5437940,77544,-471.877,172.562,45.0,13,6,854,409837118114762753,-471.865,172.583,...,-1.0,5.0,1059,502.439595,504.450738,-0.350586,1.481472,0,409837118114762753_1059,0
5441814,83344,-630.383,186.751,72.4,13,8,1220,409837118114762753,-630.388,186.741,...,-1.0,5.0,1059,657.463811,661.438147,-0.288013,1.461118,0,409837118114762753_1059,0
5452471,98136,-799.741,163.758,97.6,17,2,1437,409837118114762753,-799.744,163.747,...,-1.0,5.0,1059,816.334704,822.148472,-0.201972,1.451802,0,409837118114762753_1059,0
2438497,106522,47.0772,878.803,-1214.5,16,12,73,878203301726781440,47.2854,881.967,...,1.0,13.0,1025,880.063052,1499.8404,1.517278,-0.627066,0,878203301726781440_1025,1
2426454,90863,-94.2158,656.833,-873.8,13,8,283,878203301726781440,-94.1766,656.832,...,1.0,13.0,1025,663.55573,1097.193076,-1.428329,-0.649484,0,878203301726781440_1025,1
2422092,85014,-135.561,485.499,-650.4,13,6,374,878203301726781440,-135.575,485.497,...,1.0,13.0,1025,504.069503,822.864645,-1.298511,-0.659319,0,878203301726781440_1025,1
2422094,85018,-136.179,479.627,-642.9,13,6,375,878203301726781440,-136.186,479.626,...,1.0,13.0,1025,498.584776,813.576788,-1.29415,-0.659638,0,878203301726781440_1025,1
2417048,78921,-134.777,334.864,-458.2,13,4,382,878203301726781440,-134.782,334.863,...,1.0,13.0,1025,360.969165,583.306076,-1.188151,-0.667256,0,878203301726781440_1025,1


In [141]:
hits_q1 = hits[hits.q == 1]
hits_q0 = hits[hits.q == -1]

In [142]:
hits_q1.volume_id.value_counts()

8     1180053
13    1051676
7      729033
9      727838
12     307228
14     305324
17     246908
18     106279
16     102517
Name: volume_id, dtype: int64

In [143]:
hits_q0.volume_id.value_counts()

8     1141336
13     763571
7      714937
9      711042
12     290193
14     289311
17     154197
18      95460
16      92700
Name: volume_id, dtype: int64

In [144]:
hits_q1.layer_id.value_counts()

2     1164800
4     1041349
6      874081
8      778217
10     355973
12     347896
14     194540
Name: layer_id, dtype: int64

In [145]:
hits_q0.layer_id.value_counts()

2     1022563
4      907861
6      771495
8      686743
10     343069
12     331273
14     189743
Name: layer_id, dtype: int64

In [122]:
hits_train.head()

Unnamed: 0,hit_id,x,y,z,volume_id,layer_id,module_id,particle_id,tx,ty,...,q,nhits,event_id,r,rho,theta,phi,v_id,new_pid,nid
5428535,65911,-248.954,87.8693,4.2,13,2,438,409837118114762753,-248.951,87.8743,...,-1.0,5.0,1059,264.005886,264.039293,-0.339304,1.554889,0,409837118114762753_1059,0
5433604,71942,-337.747,132.098,21.0,13,4,613,409837118114762753,-337.733,132.121,...,-1.0,5.0,1059,362.660885,363.268382,-0.372824,1.512956,0,409837118114762753_1059,0
5437940,77544,-471.877,172.562,45.0,13,6,854,409837118114762753,-471.865,172.583,...,-1.0,5.0,1059,502.439595,504.450738,-0.350586,1.481472,0,409837118114762753_1059,0
5441814,83344,-630.383,186.751,72.4,13,8,1220,409837118114762753,-630.388,186.741,...,-1.0,5.0,1059,657.463811,661.438147,-0.288013,1.461118,0,409837118114762753_1059,0
5452471,98136,-799.741,163.758,97.6,17,2,1437,409837118114762753,-799.744,163.747,...,-1.0,5.0,1059,816.334704,822.148472,-0.201972,1.451802,0,409837118114762753_1059,0


In [123]:
hits_train.shape

(6842601, 31)

In [124]:
hits_valid.shape

(2167002, 31)

In [125]:
ohe = OneHotEncoder()
ohe.fit(hits[cat_features].values)

OneHotEncoder(categorical_features='all', dtype=<class 'numpy.float64'>,
       handle_unknown='error', n_values='auto', sparse=True)

In [126]:
X_cat_train = ohe.transform(hits_train[cat_features].values)
X_cat_valid = ohe.transform(hits_valid[cat_features].values)

In [127]:
X_num_train = hits_train[num_features].values
X_num_valid = hits_valid[num_features].values



In [128]:
hits_train.q[hits_train.q == -1] = 0
hits_valid.q[hits_valid.q == -1] = 0
y_train = hits_train['q'].values
y_valid = hits_valid['q'].values

In [129]:
hits_train.q.value_counts()

1.0    3612487
0.0    3230114
Name: q, dtype: int64

In [130]:
train_list = [X_num_train,X_cat_train,]
valid_list = [X_num_valid,X_cat_valid,]

train_list = [X_num_train,]
valid_list = [X_num_valid,]

In [131]:
# X_train = ssp.hstack(train_list).tocsr()
# X_valid = ssp.hstack(valid_list).tocsr()

X_train = X_num_train
X_valid = X_num_valid

In [132]:
learning_rate = 0.1
num_leaves = 15
min_data_in_leaf = 2000
feature_fraction = 0.6
num_boost_round = 10000
# params = {"objective": "binary",
#           "boosting_type": "gbdt",
#           'metric': 'binary_logloss',
#           "learning_rate": learning_rate,
#           "num_leaves": num_leaves,
#           "max_bin": 256,
#           "feature_fraction": feature_fraction,
#           "verbosity": 0,
#           "drop_rate": 0.1,
#           "is_unbalance": False,
#           "max_drop": 50,
#           "min_child_samples": 10,
#           "min_child_weight": 150,
#           "min_split_gain": 0,
#           "subsample": 0.9
#           }

params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

In [133]:
X_train.shape

(6842601, 3)

In [134]:
X_valid.shape

(2167002, 3)

In [135]:
dtrain = lgbm.Dataset(X_train, y_train)
dvalid = lgbm.Dataset(X_valid, y_valid, reference=dtrain)
bst = lgbm.train(params, dtrain, num_boost_round, valid_sets=dvalid, verbose_eval=100,
                early_stopping_rounds=100)
print(bst.best_iteration)
# cv_pred += bst.predict(X_test, num_iteration=bst.best_iteration)

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.689021
[200]	valid_0's binary_logloss: 0.689014
Early stopping, best iteration is:
[179]	valid_0's binary_logloss: 0.689013
179
