In [3]:
import numpy as np
import xgboost as xgb
import scipy.sparse as sp
import gc

In [4]:
# from http://stackoverflow.com/a/8980156/861423

def save_sparse_csr(filename, array):
    """Persist a CSR matrix to ``filename`` as an uncompressed ``.npz`` archive.

    The archive holds four entries -- ``data``, ``indices``, ``indptr`` and
    ``shape`` -- taken from the attributes of the same name on ``array``.
    """
    csr_parts = {
        'data': array.data,
        'indices': array.indices,
        'indptr': array.indptr,
        'shape': array.shape,
    }
    np.savez(filename, **csr_parts)

def load_sparse_csr(filename):
    """Rebuild a CSR matrix from an ``.npz`` archive.

    The archive must contain the entries ``data``, ``indices``, ``indptr``
    and ``shape`` (the layout written by ``np.savez`` in ``save_sparse_csr``).

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix assembled from the stored components.
    """
    # np.load on an .npz returns a lazy NpzFile that keeps the underlying
    # file open; the context manager closes it once the arrays are read.
    with np.load(filename) as loader:
        return sp.csr_matrix(
            (loader['data'], loader['indices'], loader['indptr']),
            shape=loader['shape'],
        )

In [5]:
# Training split: a .npy array (indexed by row below) and its CSR label matrix.
X_train, y_train = np.load('X_train_0.npy'), load_sparse_csr('y_0.npz')

# Validation split, stored in the same two formats.
X_val, y_val = np.load('X_val.npy'), load_sparse_csr('y_val.npz')

In [6]:
from tqdm import tqdm_notebook as tqdm

In [7]:
def explode(X, y):
    """Expand (X, y) into one (row of X, column index) pair per stored entry of ``y``.

    ``y`` is a CSR matrix; row ``i`` of ``X`` is repeated once for every entry
    stored in row ``i`` of ``y``, and the matching column indices are returned
    alongside. Rows of ``y`` with no stored entries contribute nothing.

    Returns
    -------
    (X_rows, col_indices)
        ``X`` indexed by the repeated row ids, and ``y.indices`` itself.
    """
    row_starts, row_ends = y.indptr[:-1], y.indptr[1:]
    entries_per_row = row_ends - row_starts
    row_ids = np.arange(len(entries_per_row)).repeat(entries_per_row)
    repeated_rows = X[row_ids]
    return repeated_rows, y.indices

In [8]:
# Keep the first 1227855 rows and the first 2000 label columns, then expand
# to one (row, label index) pair per stored label entry.
n_rows, n_labels = 1227855, 2000
X_multi, y_multi = explode(X_train[:n_rows], y_train[:n_rows, :n_labels])

In [9]:
# Wrap the exploded rows and their label indices for xgb.train.
dtrain = xgb.DMatrix(data=X_multi, label=y_multi)

In [10]:
# dtrain has been built, so drop the notebook's references to the exploded
# arrays and run a collection pass (its return value is the cell output).
del X_multi
del y_multi
gc.collect()

0

In [11]:
# Parameter dict handed to xgb.train in the training cell.
# Meanings noted below follow the public XGBoost parameter documentation;
# verify them against the installed XGBoost version.
xgb_pars = {
    'eta': 0.3,  # learning rate (step-size shrinkage)
    'gamma': 0,
    'max_depth': 2,  # very shallow trees
    'min_child_weight': 1,
    'max_delta_step': 0,
    'subsample': 0.6,  # fraction of rows sampled per tree
    'colsample_bytree': 0.6,  # fraction of columns sampled per tree
    'colsample_bylevel': 1,
    'lambda': 1,  # L2 regularization weight
    'alpha': 0,  # L1 regularization weight
    'tree_method': 'approx',
    'objective': 'multi:softprob',  # multiclass; predictions are per-class probabilities
    'num_class': 2000,  # same value as the `:2000` label-column slice used to build the training labels
    'nthread': 20,
    'seed': 42,
    'silent': 1  # NOTE(review): newer XGBoost releases use 'verbosity' instead -- confirm
}

In [15]:
from time import time

In [None]:
# Time a single boosting round; the elapsed seconds are the cell's displayed value.
t0 = time()
model = xgb.train(params=xgb_pars, dtrain=dtrain, num_boost_round=1)
time() - t0

In [27]:
def prepare_pred_row(prow, top_k=20):
    """Format the highest-scoring classes of one prediction row as text.

    Parameters
    ----------
    prow : array-like
        One-dimensional scores for a single sample; position ``i`` is the
        score of class ``i``.
    top_k : int, optional
        Maximum number of (class, score) pairs to emit. Defaults to 20, the
        cutoff this function previously had hard-coded.

    Returns
    -------
    str
        Space-separated ``"<class> <score>"`` pairs in descending score
        order, scores printed with three decimals. Empty string when
        ``prow`` is empty or ``top_k`` is 0.

    Raises
    ------
    ValueError
        If ``top_k`` is negative (a negative slice bound would silently
        drop entries from the low-scoring end instead of limiting the count).
    """
    if top_k < 0:
        raise ValueError('top_k must be non-negative, got %r' % (top_k,))

    # Accept plain sequences as well as ndarrays.
    prow = np.asarray(prow)

    # Ascending sort of the negated scores ranks classes from highest to lowest.
    classes = (-prow).argsort()[:top_k]
    scores = prow[classes]
    return ' '.join('%d %0.3f' % (c, s) for c, s in zip(classes, scores))

In [31]:
# Write the submission CSV: a header line, then one
# "<video id>,<label/confidence pairs>" line per test sample.
# NOTE(review): `test_ids` and `y_pred` are not defined in any visible cell --
# confirm they are created before this cell runs on a fresh kernel.
with open('subm_xgb.csv', 'w') as f:
    f.write('VideoId,LabelConfidencePairs\n')

    # Named `video_id` rather than `id`: a top-level loop variable stays in the
    # notebook namespace and would shadow the builtin id() for later cells.
    for video_id, prow in tqdm(zip(test_ids, y_pred), total=len(test_ids)):
        lab_conf = prepare_pred_row(prow)
        f.write('%s,%s\n' % (video_id, lab_conf))

In [32]:
!gzip subm_xgb.csv