In [2]:
import pandas as pd
import glob
import numpy as np
from sklearn.neighbors import BallTree, KDTree, DistanceMetric
import lightgbm as lgb
import matplotlib.pyplot as plt
import matplotlib.lines as lines
import matplotlib as mpl
from mpl_toolkits.mplot3d import Axes3D
import subprocess
%matplotlib inline

In [34]:
# Read both test shards, stack them, and give the result a fresh 0..n-1 index;
# that index is what later identifies each point when predictions are grouped.
test_parts = [pd.read_hdf(path) for path in ('test_close0.h5', 'test_close10.h5')]
test = pd.concat(test_parts).reset_index(drop=True)
test.head()

Unnamed: 0,TX,TY,X,Y,Z,data_ind
0,0.08724,0.119438,86536.21875,62988.3125,56892.0,0
1,-0.380208,0.198382,93346.765625,58062.9375,14223.0,0
2,-0.348549,-0.099981,66129.578125,23038.673828,46548.0,0
3,0.585342,-0.126879,68825.523438,55186.625,45255.0,0
4,0.038579,-0.13151,36366.941406,47564.878906,9051.0,0


## Grouping

Let's generate features with a BallTree: each point is paired with its k nearest neighbours in the next slice along Z.

In [5]:
from tqdm import tqdm

# Columns that add_neighbours copies from the matched neighbour (as `<col>_pair`)
# and differences against it (as `d<col>`).
columns = ['TX', 'TY', 'X', 'Y', 'Z']
# Columns fed to the BallTree distance: everything above except Z, which is
# constant inside each of the two slices being matched.
for_metric = [name for name in columns if name != 'Z']

def _z_slice(group, z_value, z_step):
    """Return a copy of the rows of `group` at `z_value` with TX/TY multiplied by `z_step`."""
    layer = group[group.Z == z_value].copy()
    layer[['TX', 'TY']] *= z_step
    return layer


def add_neighbours(df, k, same_pair, metric='minkowski', z_step=1293):
    """Pair every row with its nearest neighbours in the next Z slice.

    For each `data_ind` group the rows are split by their exact Z value. Every
    row of a slice is matched, via a BallTree over the `for_metric` columns, to
    its (up to) `k` nearest rows in the following slice; one output row is
    produced per (row, neighbour) pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `data_ind` and the columns listed in `columns`; also
        `event_id` when `same_pair` is true.
    k : int
        Number of neighbours per row (capped at the size of the next slice).
    same_pair : bool
        If true, add a boolean `same_pair` column telling whether the row and
        its neighbour share the same `event_id`.
    metric : str
        Metric name passed to `BallTree`.
    z_step : number
        Factor applied to TX and TY before the neighbour search. The default,
        1293, is a typical distance between neighbouring slices along OZ.

    Returns
    -------
    pandas.DataFrame
        The input columns (TX/TY scaled by `z_step`), `<col>_pair` for the
        matched neighbour and `d<col>` = own value minus neighbour value for
        every column in `columns`. Rows keep the index they had in `df`, so a
        row appears once per neighbour under the same index. The last slice of
        every group has no next slice; its rows are included once with NaN in
        the `_pair` / `d*` columns. Columns are sorted by name before the `d*`
        columns are appended.

    Raises
    ------
    ValueError
        From `pd.concat` when `df` has no rows.
    """
    result = []

    for data_ind in tqdm(df.data_ind.unique()):
        group = df[df.data_ind == data_ind]
        # Build each scaled slice once; consecutive slices form the (z, z_next) pairs.
        layers = [_z_slice(group, z_value, z_step)
                  for z_value in sorted(group.Z.unique())]

        for z, z_next in zip(layers, layers[1:]):
            b_tree = BallTree(z_next[for_metric], metric=metric)
            _, idx = b_tree.query(z[for_metric], k=min(k, len(z_next)))

            # Column i of idx holds the i-th nearest neighbour of every row in z.
            for i in range(idx.shape[1]):
                data = z_next.iloc[idx[:, i]]
                temp = z.copy()
                for col in columns:
                    temp[col + '_pair'] = data[col].values
                if same_pair:
                    temp['same_pair'] = data.event_id.values == z.event_id.values
                result.append(temp)

        # The last slice has nothing to pair with. Taking it from `layers`
        # (instead of a variable leaked from the loop above) also covers
        # groups that contain a single slice.
        result.append(layers[-1])

    # sort=True pins the name-sorted column order explicitly instead of relying
    # on the implicit sorting that pandas warns is going away.
    result = pd.concat(result, sort=True)
    for col in columns:
        pair_col = col + '_pair'
        if pair_col not in result:
            # No pairs were produced at all (every group had a single slice).
            result[pair_col] = float('nan')
        result['d' + col] = result[col].values - result[pair_col].values
    return result

def make_train(df, k, random_state=None):
    """Build a class-balanced training frame of neighbour pairs.

    Runs `add_neighbours(df, k=k, same_pair=True)`, treats rows whose
    `event_id` is -999 as noise and all other rows as signal, and keeps every
    signal row plus an equally sized random subsample of the noise rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data accepted by `add_neighbours`, including `event_id`.
    k : int
        Number of neighbours per row.
    random_state : int, numpy RandomState or None
        Seed forwarded to `DataFrame.sample`; None (the default) keeps the
        subsample unseeded.

    Returns
    -------
    pandas.DataFrame
        Signal rows followed by the sampled noise rows, with a fresh index.
    """
    t = add_neighbours(df, k=k, same_pair=True)
    noise = t.event_id == -999
    signal, not_signal = t[~noise], t[noise]
    # Sampling without replacement cannot return more rows than exist, so cap
    # the request instead of raising when noise is scarcer than signal.
    n_noise = min(len(signal), len(not_signal))
    noise_part = not_signal.sample(n_noise, random_state=random_state)
    return pd.concat([signal, noise_part]).reset_index(drop=True)

In [6]:
# glob returns matches in filesystem-dependent order; sort before slicing so the
# same five files are used on every run and every machine.
train_files = sorted(glob.glob('hdf5/open*.h5'))[:5]
# Build one balanced frame per file, then stack them (pd.concat raises
# ValueError if no file matched the pattern).
train_parts = [make_train(pd.read_hdf(path), k=3) for path in train_files]
train = pd.concat(train_parts)

100%|██████████| 10/10 [00:33<00:00,  3.30s/it]
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


100%|██████████| 10/10 [00:31<00:00,  3.19s/it]
100%|██████████| 10/10 [00:32<00:00,  3.25s/it]
100%|██████████| 10/10 [00:38<00:00,  3.87s/it]
100%|██████████| 10/10 [00:39<00:00,  3.94s/it]


In [7]:
train.head()

Unnamed: 0,TX,TX_pair,TY,TY_pair,X,X_pair,Y,Y_pair,Z,Z_pair,data_ind,event_id,same_pair,signal,dTX,dTY,dX,dY,dZ
0,64.295792,85.199005,120.471031,128.52536,49545.425781,49540.316406,58027.636719,58018.058594,5172.0,5427.205078,271,153997.0,False,1.0,-20.903214,-8.054329,5.109375,9.578125,-255.205078
1,85.199005,70.662689,128.52536,122.620712,49540.316406,49611.359375,58018.058594,58146.925781,5427.205078,6465.0,271,189715.0,False,1.0,14.536316,5.904648,-71.042969,-128.867188,-1037.794922
2,85.199005,-117.009766,128.52536,153.375397,49540.316406,49843.59375,58018.058594,58355.070312,5427.205078,6465.0,271,189715.0,False,1.0,202.208771,-24.850037,-303.277344,-337.011719,-1037.794922
3,85.199005,507.651611,128.52536,42.33036,49540.316406,50004.304688,58018.058594,58237.914062,5427.205078,6465.0,271,189715.0,False,1.0,-422.452606,86.195,-463.988281,-219.855469,-1037.794922
4,-1019.46344,-967.443237,-316.895508,-311.478729,60857.125,60936.945312,34515.269531,34540.082031,6465.0,6702.123047,271,183074.0,True,1.0,-52.020203,-5.416779,-79.820312,-24.8125,-237.123047


## Training

In [8]:
# Target: the `signal` column.
y_train = train['signal']
# Features: drop the target, the identifiers, and `same_pair`, which is derived
# from event_id and is not produced for the test set.
X_train = train.drop(columns=['event_id', 'signal', 'data_ind', 'same_pair'])

In [9]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV

In [10]:
# Single-candidate grid: the search below only cross-validates this one setting.
param_grid = dict(n_estimators=[20], max_depth=[15])

#class XGBClassifier_tmp(XGBClassifier):
#    def predict(self, X):
#        return XGBClassifier.predict_proba(self, X)[:, 1]

In [12]:
# 3-fold stratified split, shuffled with a fixed seed so the folds are repeatable.
cv_folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
# Base booster; n_estimators and max_depth are supplied by param_grid.
base_model = XGBClassifier(learning_rate=0.05, subsample=0.8, colsample_bytree=0.8)
# Grid search scored by ROC AUC, run in a single process with verbose progress.
clf = GridSearchCV(
    estimator=base_model,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=cv_folds,
    n_jobs=1,
    verbose=7,
)

In [13]:
clf.fit(X_train, y_train)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] n_estimators=20, max_depth=15 ...................................
[CV]  n_estimators=20, max_depth=15, score=0.9292994407509952, total=11.3min
[CV] n_estimators=20, max_depth=15 ...................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 11.5min remaining:    0.0s


[CV]  n_estimators=20, max_depth=15, score=0.9281347717044828, total=11.4min
[CV] n_estimators=20, max_depth=15 ...................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 23.2min remaining:    0.0s


[CV]  n_estimators=20, max_depth=15, score=0.9278846054317196, total=11.4min


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 34.9min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 34.9min finished


GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=0, shuffle=True),
       error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.8),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [20], 'max_depth': [15]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=7)

In [14]:
clf.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=15, min_child_weight=1, missing=None, n_estimators=20,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.8)

In [15]:
xgb_clf = clf.best_estimator_

In [16]:
xgb_clf

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=15, min_child_weight=1, missing=None, n_estimators=20,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.8)

# Fit on the full training set and predict on the test set

In [35]:
# Build the same neighbour-pair features for the test set (no event_id there,
# hence same_pair=False).
prepared_test = add_neighbours(test, k=3, same_pair=False)
# Select exactly the training features, in the training column order, rather
# than relying on both frames happening to come out in the same order. A feature
# missing from the test frame raises KeyError here instead of mis-scoring later.
X_test = prepared_test[list(X_train.columns)]

100%|██████████| 11/11 [00:35<00:00,  3.20s/it]
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [36]:
# NOTE(review): xgb_clf is clf.best_estimator_, and the GridSearchCV repr printed
# after clf.fit shows refit=True, so this estimator was presumably already refit
# on all of X_train; this call repeats that fit — confirm it is needed.
xgb_clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=15, min_child_weight=1, missing=None, n_estimators=20,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.8)

### Final prediction = mean predicted probability over all rows (neighbour pairs) of each point

In [37]:
# One probability per row of X_test. Column 1 of predict_proba is kept —
# presumably the probability of signal == 1 (assumes classes are ordered
# [0, 1]; TODO confirm against xgb_clf.classes_).
probs = xgb_clf.predict_proba(X_test)[:,1]

In [38]:
# prepared_test keeps each point's original index, repeated once per neighbour
# pair, so grouping on it collects all scores that belong to one point.
df = pd.DataFrame({'id': prepared_test.index, 'signal': probs}).groupby('id')
# Mean score per point, kept as a one-column frame indexed by id.
agg = df[['signal']].mean()

In [39]:
agg.shape

(7698899, 1)

In [40]:
agg.head()

Unnamed: 0_level_0,signal
id,Unnamed: 1_level_1
0,0.207938
1,0.184213
2,0.261881
3,0.330666
4,0.196086


In [41]:
# Write the per-point scores; index=True keeps the 'id' index as the first CSV column.
agg.to_csv('submit.csv', index=True)