In [1]:
import pandas as pd
import glob
import numpy as np
from sklearn.neighbors import BallTree, KDTree, DistanceMetric
import lightgbm as lgb
import matplotlib.pyplot as plt
import matplotlib.lines as lines
import matplotlib as mpl
from mpl_toolkits.mplot3d import Axes3D
import subprocess
from tqdm import tqdm
%matplotlib inline

In [2]:
# Read every test chunk and stack them into a single frame.
test_files = ['test_close0.h5', 'test_close10.h5']
test_chunks = [pd.read_hdf(path) for path in test_files]
# NOTE(review): the concatenation keeps each file's own index labels, and the
# final averaging step groups predictions by those labels. Confirm the labels
# are unique across both files, otherwise distinct points would be merged.
test = pd.concat(test_chunks)
test.head()

Unnamed: 0,TX,TY,X,Y,Z,data_ind
0,0.08724,0.119438,86536.21875,62988.3125,56892.0,0
1,-0.380208,0.198382,93346.765625,58062.9375,14223.0,0
2,-0.348549,-0.099981,66129.578125,23038.673828,46548.0,0
3,0.585342,-0.126879,68825.523438,55186.625,45255.0,0
4,0.038579,-0.13151,36366.941406,47564.878906,9051.0,0


# Группировка

In [3]:
def neighbours(data, k, angle_scale=1293):
    """Pair every point with its nearest points in the next Z layer.

    For each ``data_ind`` the distinct ``Z`` values are visited in ascending
    order. Every point of a layer is matched with up to ``k`` nearest points
    of the following layer, using the Euclidean distance over
    ``(TX, TY, X, Y)`` after ``TX``/``TY`` have been rescaled.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``TX``, ``TY``, ``X``, ``Y``, ``Z`` and
        ``data_ind``. The frame is not modified; a copy is used internally.
    k : int
        Maximum number of neighbours per point. It is capped by the size of
        the next layer.
    angle_scale : float, optional
        Factor applied to ``TX`` and ``TY`` before matching. The returned
        ``TX``/``TY`` values are the scaled ones.
        NOTE(review): the default 1293 matches the Z gap between the first
        two layers visible in the test sample; presumably it is the layer
        spacing -- confirm.

    Returns
    -------
    pandas.DataFrame
        One row per (point, neighbour) pair, carrying the neighbour's values
        in ``<col>_pair`` and the differences ``d<col> = <col> - <col>_pair``.
        Points of the last layer of every ``data_ind`` have no next layer and
        are appended once with NaN pair/difference columns. Rows keep the
        index labels of ``data``, so a label repeats once per neighbour.
    """
    columns = ['TX', 'TY', 'X', 'Y', 'Z']
    match_cols = ['TX', 'TY', 'X', 'Y']
    pair_cols = [col + '_pair' for col in columns]

    df = data.copy()
    df[['TX', 'TY']] *= angle_scale

    result = []
    for data_ind in tqdm(df.data_ind.unique()):
        ind = df[df.data_ind == data_ind]
        values = np.unique(ind.Z)

        for z_val, z_next_val in zip(values[:-1], values[1:]):
            z = ind[ind.Z == z_val]
            z_next = ind[ind.Z == z_next_val]

            b_tree = BallTree(z_next[match_cols], metric='minkowski')
            _, idx = b_tree.query(z[match_cols], k=min(k, len(z_next)))

            # Column ``rank`` of ``idx`` holds, for every point of the current
            # layer, the position of its ``rank``-th nearest next-layer point.
            for rank in range(idx.shape[1]):
                next_val_z = z_next.iloc[idx[:, rank]]
                curr_val_z = z.copy()
                for col in columns:
                    curr_val_z[col + '_pair'] = next_val_z[col].values
                result.append(curr_val_z)

        # The last layer has nothing to pair with. Select it explicitly rather
        # than reusing the loop variable, which is undefined (or left over
        # from the previous data_ind) when there is only a single layer.
        if len(values):
            result.append(ind[ind.Z == values[-1]].copy())

    # pd.concat refuses an empty list; fall back to an empty frame instead.
    result = pd.concat(result) if result else df.iloc[0:0].copy()

    # If no pair was produced at all, the pair columns do not exist yet.
    for col in pair_cols:
        if col not in result.columns:
            result[col] = np.nan

    for col in columns:
        result['d' + col] = result[col].values - result[col + '_pair'].values
    return result


def train_prep(df, k, random_state=None):
    """Build a class-balanced training frame of neighbour pairs.

    Runs ``neighbours(df, k=k)``, splits the rows into noise
    (``event_id == -999``) and everything else, and keeps all non-noise rows
    plus an equally sized random sample of noise rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Input accepted by ``neighbours``; the result must contain ``event_id``.
    k : int
        Number of neighbours passed on to ``neighbours``.
    random_state : int or numpy.random.RandomState, optional
        Seed for the noise subsampling. The default ``None`` keeps the
        sampling unseeded, as before.

    Returns
    -------
    pandas.DataFrame
        Non-noise rows followed by the sampled noise rows, with a fresh
        0..n-1 index.
    """
    neigh = neighbours(df, k=k)
    noise = neigh.event_id == -999
    not_signal = neigh[noise]
    signal = neigh[~noise]
    # Sampling without replacement cannot draw more rows than exist, so cap
    # the sample size when noise is scarcer than signal.
    n_noise = min(len(signal), len(not_signal))
    noise_part = not_signal.sample(n_noise, random_state=random_state)
    return pd.concat([signal, noise_part]).reset_index(drop=True)

In [4]:
# glob.glob returns matches in arbitrary order, so sort before slicing to make
# the choice of the five training files reproducible.
train_files = sorted(glob.glob('hdf5/open*.h5'))[:5]
if not train_files:
    raise FileNotFoundError("no training files match 'hdf5/open*.h5'")

train_parts = []
for file in train_files:
    train_parts.append(train_prep(pd.read_hdf(file), k=3))
# Each part is indexed from 0; ignore_index avoids duplicate labels in `train`.
train = pd.concat(train_parts, ignore_index=True)

train.head()

100%|██████████| 10/10 [00:34<00:00,  3.45s/it]
100%|██████████| 10/10 [00:33<00:00,  3.34s/it]
100%|██████████| 10/10 [00:33<00:00,  3.37s/it]
100%|██████████| 10/10 [00:33<00:00,  3.39s/it]
100%|██████████| 10/10 [00:35<00:00,  3.58s/it]


Unnamed: 0,TX,TX_pair,TY,TY_pair,X,X_pair,Y,Y_pair,Z,Z_pair,data_ind,event_id,signal,dTX,dTY,dX,dY,dZ
0,64.295792,85.199005,120.471031,128.52536,49545.425781,49540.316406,58027.636719,58018.058594,5172.0,5427.205078,271,153997.0,1.0,-20.903214,-8.054329,5.109375,9.578125,-255.205078
1,85.199005,70.662689,128.52536,122.620712,49540.316406,49611.359375,58018.058594,58146.925781,5427.205078,6465.0,271,189715.0,1.0,14.536316,5.904648,-71.042969,-128.867188,-1037.794922
2,85.199005,-117.009766,128.52536,153.375397,49540.316406,49843.59375,58018.058594,58355.070312,5427.205078,6465.0,271,189715.0,1.0,202.208771,-24.850037,-303.277344,-337.011719,-1037.794922
3,85.199005,507.651611,128.52536,42.33036,49540.316406,50004.304688,58018.058594,58237.914062,5427.205078,6465.0,271,189715.0,1.0,-422.452606,86.195,-463.988281,-219.855469,-1037.794922
4,-1019.46344,-967.443237,-316.895508,-311.478729,60857.125,60936.945312,34515.269531,34540.082031,6465.0,6702.123047,271,183074.0,1.0,-52.020203,-5.416779,-79.820312,-24.8125,-237.123047


# Обучение

In [5]:
# Columns that are labels or bookkeeping rather than model inputs.
non_feature_cols = ['event_id', 'signal', 'data_ind']
X_train = train.drop(non_feature_cols, axis=1)
y_train = train['signal']

In [6]:
# Wrap the training matrix and labels for LightGBM.
lgb_train = lgb.Dataset(X_train, label=y_train)

# Booster configuration shared by cross-validation and the final fit.
params = dict(
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    num_leaves=34,
    learning_rate=0.05,
    max_depth=18,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=4,
)

# 3-fold cross-validation over 20 boosting rounds; the cell displays the
# per-round AUC mean and standard deviation.
lgb.cv(params, lgb_train, num_boost_round=20, nfold=3)

{'auc-mean': [0.84347011426734175,
  0.85814211897803894,
  0.86163576679686493,
  0.86717653683487927,
  0.87223090070583564,
  0.87316757006164814,
  0.87661728173135944,
  0.87781046985647226,
  0.87888478227505251,
  0.87989160990421,
  0.88020800235446517,
  0.88131253295365386,
  0.88272769644690463,
  0.88447497194395763,
  0.88582093916928484,
  0.88665501384557122,
  0.88851007223715373,
  0.88911800109855255,
  0.88968813349531495,
  0.89049794740905697],
 'auc-stdv': [0.00036750118839408923,
  0.00056014680796931838,
  0.00066312726796723522,
  0.00051376486052064475,
  0.00043770101019425606,
  0.00038339406021743933,
  0.00046761078506460288,
  0.0004909328427065273,
  6.1500715232622088e-05,
  0.00015339219528023688,
  2.6884734141372337e-05,
  0.00083924853547832229,
  0.00059221592557973394,
  0.00046536310209787063,
  0.00055586825707147126,
  4.829989062295398e-05,
  9.4845401142689152e-05,
  0.0001243798345725507,
  0.00031095065102485831,
  0.00033663909669184394]}

In [7]:
# Fit the final booster on the full training set for 100 rounds.
bst = lgb.train(params, lgb_train, num_boost_round=100)

# Предсказание

In [8]:
# Build the same neighbour-pair features for the test set as for training.
test_neighbours = neighbours(data=test, k=3)

100%|██████████| 11/11 [00:35<00:00,  3.20s/it]


In [9]:
# Model input: positional index, with the bookkeeping column removed.
X_test = test_neighbours.reset_index(drop=True).drop('data_ind', axis=1)
X_test.head()

Unnamed: 0,TX,TX_pair,TY,TY_pair,X,X_pair,Y,Y_pair,Z,Z_pair,dTX,dTY,dX,dY,dZ
0,-193.084152,-350.331818,623.975891,437.50589,37949.0,38081.851562,24967.570312,24522.3125,0.0,1293.0,157.247665,186.470001,-132.851562,445.257812,-1293.0
1,-225.433212,-380.564331,-367.239899,458.562836,39335.953125,39409.578125,49094.96875,49534.917969,0.0,1293.0,155.131119,-825.802734,-73.625,-439.949219,-1293.0
2,-288.543915,-166.982605,744.821838,638.900269,51992.125,51851.539062,74045.695312,73873.390625,0.0,1293.0,-121.56131,105.92157,140.585938,172.304688,-1293.0
3,-131.043716,-486.173767,-389.968414,107.317078,29508.035156,29299.15625,67410.984375,67238.296875,0.0,1293.0,355.130051,-497.285492,208.878906,172.6875,-1293.0
4,-233.201782,135.409042,573.095337,775.31897,67685.78125,67272.015625,75164.359375,75664.359375,0.0,1293.0,-368.610825,-202.223633,413.765625,-500.0,-1293.0


# Усреднение для каждой точки в качестве финального предсказания

In [11]:
# Score every (point, neighbour) row.
pred = bst.predict(X_test)

# A point appears once per neighbour, always under its original index label;
# group the row scores by that label.
row_scores = pd.DataFrame({'id': test_neighbours.index, 'prob': pred})
raw = row_scores.groupby('id')

# The final prediction of a point is the mean score of its rows.
agg = raw['prob'].mean()
result = agg.to_frame(name='signal')
result.head()

Unnamed: 0_level_0,signal
id,Unnamed: 1_level_1
0,0.111293
1,0.07053
2,0.233427
3,0.098327
4,0.029916
