In [3]:
import os

os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [4]:
import cudf
import glob
import gc

import pandas as pd

from tqdm import tqdm

In [5]:
def calc_recall(sub, ty='clicks', file='./data/xgb_train_y.parquet'):
    sub = sub.sort_values(['session', 'score'], ascending=[True, False])
    sub['dummy'] = 1
    sub['rank'] = sub.groupby(['session']).dummy.cumsum()
    sub.drop(['dummy'], axis=1, inplace=True)
    sub = sub[sub['rank']<21]
    sub = sub[['session', 'cand']]
    sub.columns = ['session', 'aid']
    test_labels = cudf.read_parquet(file)
    test_labels = test_labels[test_labels['type']==ty]
    test_labels = test_labels.merge(
        test_labels.groupby(['session', 'type']).count().reset_index().rename(columns={'aid': 'no_gt'}),
        how='left',
        on=['session', 'type']
    )
    sub['target'] = 1
    sub.columns = ['session', 'aid', 'target']
    test_labels = test_labels.merge(
        sub[['session', 'aid', 'target']].drop_duplicates(['session', 'aid']),
        how='left',
        on=['session', 'aid']

    )
    test_labels['target'] = test_labels['target'].fillna(0)
    test_labels = test_labels[test_labels['session'].isin(sub['session'])]
    test_labels = test_labels.groupby(['session', 'aid']).agg({'no_gt': 'min', 'target': 'sum'}).reset_index()
    recall = test_labels.target.sum()/test_labels.groupby(['session']).no_gt.min().clip(0,20).sum()
    print(recall)
    return(recall, test_labels.target.sum(), test_labels.groupby(['session']).no_gt.min().clip(0,20).sum())

In [6]:
!ls ./data/split/chunks_c_2/

ls: cannot access './data/split/chunks_c_2/': No such file or directory


In [7]:
import xgboost as xgb

xgb_parms = { 
    'max_depth':8, 
    'learning_rate':0.1, 
    'subsample':0.8,
    'colsample_bytree':0.3, 
    'eval_metric':'logloss',
    'objective':'binary:logistic',
    'tree_method':'gpu_hist' 
}

bl_sub = False
bl_pos = True
outputfolder = '24_XGB_Rerun_RedruceCandidate_DifferentWeights_Folds_ChrisCo_SameDay_v9'
no_seeds = 3
no_trees = 1000

In [8]:
ty = 'orders'
labels = cudf.read_parquet('./data/xgb_train_y.parquet')
labels.columns = ['session', 'cand', 'type']
labels['target'] = 1

In [9]:
files = sorted(glob.glob('./data/split/chunks/' + ty + '/chunk*.parquet'))
files

['./data/split/chunks/orders/chunk_0.parquet',
 './data/split/chunks/orders/chunk_1.parquet']

In [10]:
[y for x in [
    sorted(glob.glob('./data_folds/fold_' + str(igfold2) + '/split/chunks/' + ty + '/chunk*.parquet'))
    for igfold2 in range(0,5)
] for y in x ]

['./data_folds/fold_0/split/chunks/orders/chunk_0.parquet',
 './data_folds/fold_0/split/chunks/orders/chunk_1.parquet',
 './data_folds/fold_1/split/chunks/orders/chunk_0.parquet',
 './data_folds/fold_1/split/chunks/orders/chunk_1.parquet',
 './data_folds/fold_2/split/chunks/orders/chunk_0.parquet',
 './data_folds/fold_2/split/chunks/orders/chunk_1.parquet',
 './data_folds/fold_3/split/chunks/orders/chunk_0.parquet',
 './data_folds/fold_3/split/chunks/orders/chunk_1.parquet',
 './data_folds/fold_4/split/chunks/orders/chunk_0.parquet',
 './data_folds/fold_4/split/chunks/orders/chunk_1.parquet']

In [11]:
#session_filter = labels[(labels['type']=='orders')]['session'].drop_duplicates().to_pandas().tolist()

In [12]:
ingore_cols = []

In [13]:
%%time

model_bests = []
hist_recall = []
total_no_hit = 0
total_no_gt = 0
for igfold in range(5):
    files = sorted(glob.glob('./data_folds/fold_' + str(igfold) + '/split/chunks_c_2/' + ty + '/chunk*.parquet'))
    print(files)
    for ifile, file in enumerate(files):
        if bl_sub:
            if ty == 'clicks':
                test_files = []
                train_files = [files[x] for x in [(ifile+i+1)%len(files) for i in range(len(files))]]
            elif ty == 'carts':
                test_files = []
                train_files = [files[x] for x in [(ifile+i+1)%len(files) for i in range(len(files))]]
            elif ty == 'orders':
                test_files = []
                train_files = [files[x] for x in [(ifile+i+1)%len(files) for i in range(len(files))]]
            else:
                assert 0!=0
            if ty == 'clicks':
                df_train = pd.concat([
                    pd.read_parquet(x) for x in train_files
                ])
            elif ty == 'carts':
                df_train = pd.concat([
                    pd.read_parquet(x) for x in train_files
                ])
            elif ty == 'orders':
                df_train = pd.concat([
                    pd.read_parquet(x) for x in train_files
                ])
        else:
            if ty == 'clicks':
                test_files = files[ifile]
                train_files = [files[x] for x in [(ifile+i+1)%len(files) for i in range(len(files))]][0:1]
            elif ty == 'carts':
                test_files = files[ifile]
                train_files = [files[x] for x in [(ifile+i+1)%len(files) for i in range(len(files))]][0:1]
            elif ty == 'orders':
                test_files = files[ifile]
                train_files = [files[x] for x in [(ifile+i+1)%len(files) for i in range(len(files))]][0:1]
            else:
                assert 0!=0
            
            if ty == 'clicks':
                train_files = [
                    y for x in [
                        sorted(glob.glob('./data_folds/fold_' + str(igfold2) + '/split/chunks_c_2/' + ty + '/chunk*.parquet'))
                        for igfold2 in range(0,5)
                    ] for y in x if y not in test_files][0:2]
            elif ty == 'carts':
                train_files = [
                    y for x in [
                        sorted(glob.glob('./data_folds/fold_' + str(igfold2) + '/split/chunks_c_2/' + ty + '/chunk*.parquet'))
                        for igfold2 in range(0,5)
                    ] for y in x if y not in test_files][0:5]
            else:
                train_files = [
                    y for x in [
                        sorted(glob.glob('./data_folds/fold_' + str(igfold2) + '/split/chunks_c_2/' + ty + '/chunk*.parquet'))
                        for igfold2 in range(0,5)
                    ] for y in x if y not in test_files]
            
            if ty == 'clicks':
                df_test = pd.read_parquet(test_files)
                df_train = pd.concat([
                    pd.read_parquet(x) for x in train_files
                ])
            elif ty == 'carts':
                df_test = pd.read_parquet(test_files)
                df_train = pd.concat([
                    pd.read_parquet(x) for x in train_files
                ])
            elif ty == 'orders':
                df_test = pd.read_parquet(test_files)
                df_train = pd.concat([
                    pd.read_parquet(x) for x in train_files
                ])

        print(train_files)
        print(test_files)

        if bl_pos and ty!='clicks':
            df_train = df_train[df_train['session'].isin(
                df_train[df_train['target']==1]['session'].drop_duplicates().values
            )]
        
        print(df_train.shape)

        train_cols = [x for x in df_train.columns if x not in [
            'session', 'cand', 'target', 'target_clicks', 'target_carts', 'target_orders'
        ] + ingore_cols]

        print('Recall Train: ' + str(df_train[df_train['target']==1].shape[0]/labels[
        (labels['session'].isin(df_train['session'].drop_duplicates()))&(labels['type']==ty)
    ].shape[0]))
        if not bl_sub:
            print('Recall Test: ' + str(df_test[df_test['target']==1].shape[0]/labels[
            (labels['session'].isin(df_test['session'].drop_duplicates()))&(labels['type']==ty)
        ].shape[0]))

        dtrain = xgb.DMatrix(data=df_train[train_cols].values,label=df_train['target'].values)
        if not bl_sub:
            dtest =  xgb.DMatrix(data=df_test[train_cols].values, label=df_test['target'].values)
            df_test['score'] = 0.0

        del df_train
        gc.collect()

        models = []
        for iseed in range(no_seeds):
            print('iseed: ' + str(iseed))
            xgb_parms['seed'] = iseed
            if bl_sub:
                model = xgb.train(
                    xgb_parms, 
                    dtrain=dtrain,
                    evals=[(dtrain,'train')],
                    num_boost_round=no_trees,
                    verbose_eval=10
                )
            else:
                model = xgb.train(
                    xgb_parms, 
                    dtrain=dtrain,
                    evals=[(dtest,'test')],
                    num_boost_round=no_trees,
                    verbose_eval=10,
                    early_stopping_rounds=50
                )
                df_test['score'] += model.predict(dtest)
                model_bests.append(model.best_iteration)
            models.append(model)
        del dtrain
        
        sub = cudf.from_pandas(df_test[['session', 'cand', 'score']])
        recall, no_hit, no_gt = calc_recall(sub, ty=ty)
        hist_recall.append(recall)
        total_no_hit += no_hit
        total_no_gt += no_gt
        print(total_no_hit/total_no_gt)

        del sub
        gc.collect()

['./data_folds/fold_0/split/chunks_c_2/orders/chunk_0.parquet', './data_folds/fold_0/split/chunks_c_2/orders/chunk_1.parquet']
['./data_folds/fold_0/split/chunks_c_2/orders/chunk_1.parquet', './data_folds/fold_1/split/chunks_c_2/orders/chunk_0.parquet', './data_folds/fold_1/split/chunks_c_2/orders/chunk_1.parquet', './data_folds/fold_2/split/chunks_c_2/orders/chunk_0.parquet', './data_folds/fold_2/split/chunks_c_2/orders/chunk_1.parquet', './data_folds/fold_3/split/chunks_c_2/orders/chunk_0.parquet', './data_folds/fold_3/split/chunks_c_2/orders/chunk_1.parquet', './data_folds/fold_4/split/chunks_c_2/orders/chunk_0.parquet', './data_folds/fold_4/split/chunks_c_2/orders/chunk_1.parquet']
./data_folds/fold_0/split/chunks_c_2/orders/chunk_0.parquet
(17045999, 171)
Recall Train: 0.7876550911485409
Recall Test: 0.7337211181127807
iseed: 0
[0]	test-logloss:0.60039
[10]	test-logloss:0.18959
[20]	test-logloss:0.07821
[30]	test-logloss:0.04159
[40]	test-logloss:0.02919
[50]	test-logloss:0.02503


['./data_folds/fold_0/split/chunks_c_2/orders/chunk_0.parquet', './data_folds/fold_0/split/chunks_c_2/orders/chunk_1.parquet', './data_folds/fold_1/split/chunks_c_2/orders/chunk_1.parquet', './data_folds/fold_2/split/chunks_c_2/orders/chunk_0.parquet', './data_folds/fold_2/split/chunks_c_2/orders/chunk_1.parquet', './data_folds/fold_3/split/chunks_c_2/orders/chunk_0.parquet', './data_folds/fold_3/split/chunks_c_2/orders/chunk_1.parquet', './data_folds/fold_4/split/chunks_c_2/orders/chunk_0.parquet', './data_folds/fold_4/split/chunks_c_2/orders/chunk_1.parquet']
./data_folds/fold_1/split/chunks_c_2/orders/chunk_0.parquet
(17048625, 171)
Recall Train: 0.7882866566993942
Recall Test: 0.7280929903991266
iseed: 0
[0]	test-logloss:0.60038
[10]	test-logloss:0.18950
[20]	test-logloss:0.07809
[30]	test-logloss:0.04142
[40]	test-logloss:0.02900
[50]	test-logloss:0.02483
[60]	test-logloss:0.02345
[70]	test-logloss:0.02298
[80]	test-logloss:0.02279
[90]	test-logloss:0.02270
[100]	test-logloss:0.02

[300]	test-logloss:0.02259
[310]	test-logloss:0.02259
[320]	test-logloss:0.02259
[330]	test-logloss:0.02259
[340]	test-logloss:0.02260
[350]	test-logloss:0.02260
[355]	test-logloss:0.02260
0.6778743644415286
0.6746516487683656
['./data_folds/fold_2/split/chunks_c_2/orders/chunk_0.parquet', './data_folds/fold_2/split/chunks_c_2/orders/chunk_1.parquet']
['./data_folds/fold_0/split/chunks_c_2/orders/chunk_0.parquet', './data_folds/fold_0/split/chunks_c_2/orders/chunk_1.parquet', './data_folds/fold_1/split/chunks_c_2/orders/chunk_0.parquet', './data_folds/fold_1/split/chunks_c_2/orders/chunk_1.parquet', './data_folds/fold_2/split/chunks_c_2/orders/chunk_1.parquet', './data_folds/fold_3/split/chunks_c_2/orders/chunk_0.parquet', './data_folds/fold_3/split/chunks_c_2/orders/chunk_1.parquet', './data_folds/fold_4/split/chunks_c_2/orders/chunk_0.parquet', './data_folds/fold_4/split/chunks_c_2/orders/chunk_1.parquet']
./data_folds/fold_2/split/chunks_c_2/orders/chunk_0.parquet
(17037836, 171)
Re

(17056920, 171)
Recall Train: 0.7878238162364523
Recall Test: 0.7327931301652892
iseed: 0
[0]	test-logloss:0.60040
[10]	test-logloss:0.18957
[20]	test-logloss:0.07817
[30]	test-logloss:0.04154
[40]	test-logloss:0.02914
[50]	test-logloss:0.02499
[60]	test-logloss:0.02363
[70]	test-logloss:0.02318
[80]	test-logloss:0.02301
[90]	test-logloss:0.02292
[100]	test-logloss:0.02287
[110]	test-logloss:0.02284
[120]	test-logloss:0.02282
[130]	test-logloss:0.02280
[140]	test-logloss:0.02278
[150]	test-logloss:0.02277
[160]	test-logloss:0.02276
[170]	test-logloss:0.02276
[180]	test-logloss:0.02274
[190]	test-logloss:0.02273
[200]	test-logloss:0.02273
[210]	test-logloss:0.02272
[220]	test-logloss:0.02272
[230]	test-logloss:0.02272
[240]	test-logloss:0.02272
[250]	test-logloss:0.02272
[260]	test-logloss:0.02271
[270]	test-logloss:0.02271
[280]	test-logloss:0.02270
[290]	test-logloss:0.02270
[300]	test-logloss:0.02270
[310]	test-logloss:0.02270
[320]	test-logloss:0.02270
[330]	test-logloss:0.02270
[34

(17015015, 171)
Recall Train: 0.7881116287694412
Recall Test: 0.7332996749865892
iseed: 0
[0]	test-logloss:0.60038
[10]	test-logloss:0.18955
[20]	test-logloss:0.07816
[30]	test-logloss:0.04155
[40]	test-logloss:0.02916
[50]	test-logloss:0.02502
[60]	test-logloss:0.02367
[70]	test-logloss:0.02322
[80]	test-logloss:0.02304
[90]	test-logloss:0.02294
[100]	test-logloss:0.02289
[110]	test-logloss:0.02285
[120]	test-logloss:0.02283
[130]	test-logloss:0.02281
[140]	test-logloss:0.02279
[150]	test-logloss:0.02278
[160]	test-logloss:0.02277
[170]	test-logloss:0.02276
[180]	test-logloss:0.02275
[190]	test-logloss:0.02274
[200]	test-logloss:0.02273
[210]	test-logloss:0.02272
[220]	test-logloss:0.02272
[230]	test-logloss:0.02272
[240]	test-logloss:0.02271
[250]	test-logloss:0.02270
[260]	test-logloss:0.02270
[270]	test-logloss:0.02269
[280]	test-logloss:0.02269
[290]	test-logloss:0.02269
[300]	test-logloss:0.02270
[310]	test-logloss:0.02269
[320]	test-logloss:0.02269
[325]	test-logloss:0.02269
ise

In [15]:
import numpy as np

np.mean(model_bests), np.std(model_bests), np.min(model_bests), np.max(model_bests)

(345.23333333333335, 45.05528702481972, 275, 464)