In [2]:
import os

# Expose only GPU 0 to this process. This is set here, before the next cell
# imports the GPU libraries (cudf, and later xgboost).
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [3]:
import cudf
import glob
import gc

import pandas as pd

from tqdm import tqdm

In [4]:
def calc_recall(sub, ty='clicks', file='./data/xgb_train_y.parquet'):
    """Compute recall@20 of scored candidates against the ground-truth labels.

    Parameters
    ----------
    sub : DataFrame
        Scored candidates; the columns `session`, `cand` and `score` are read.
    ty : str
        Event type; only label rows whose `type` equals this value are used.
    file : str
        Parquet file holding the labels; the columns `session`, `aid` and
        `type` are read.

    Returns
    -------
    tuple
        `(recall, hits, possible)` where `hits` is the number of label rows
        matched by a top-20 candidate and `possible` is the per-session label
        count, capped at 20, summed over the sessions present in `sub`.
        The recall value is also printed.
    """
    # Rank the candidates of every session by descending score.
    ranked = sub.sort_values(['session', 'score'], ascending=[True, False])
    ranked['dummy'] = 1
    ranked['rank'] = ranked.groupby(['session']).dummy.cumsum()
    ranked = ranked.drop(columns=['dummy'])

    # Keep the 20 best candidates per session and flag them as predicted.
    top = ranked[ranked['rank'] < 21][['session', 'cand']]
    top.columns = ['session', 'aid']
    top['target'] = 1

    # Ground truth for the requested event type, with the label count per session.
    truth = cudf.read_parquet(file)
    truth = truth[truth['type'] == ty]
    gt_counts = (
        truth.groupby(['session', 'type'])
        .count()
        .reset_index()
        .rename(columns={'aid': 'no_gt'})
    )
    truth = truth.merge(gt_counts, how='left', on=['session', 'type'])

    # Left-join the predictions onto the labels; unmatched labels count as misses.
    predicted = top[['session', 'aid', 'target']].drop_duplicates(['session', 'aid'])
    truth = truth.merge(predicted, how='left', on=['session', 'aid'])
    truth['target'] = truth['target'].fillna(0)

    # Sessions without any candidate in `sub` are left out of the evaluation.
    truth = truth[truth['session'].isin(top['session'])]
    per_pair = (
        truth.groupby(['session', 'aid'])
        .agg({'no_gt': 'min', 'target': 'sum'})
        .reset_index()
    )

    hits = per_pair.target.sum()
    possible = per_pair.groupby(['session']).no_gt.min().clip(0, 20).sum()
    recall = hits / possible
    print(recall)
    return (recall, hits, possible)

In [5]:
# Sanity check of the candidate-chunk directory. The recorded output shows that
# this path does not exist; the chunks read later in this notebook live under
# ./data_folds/fold_*/split/chunks_c/.
!ls ./data/split/chunks_c/

ls: cannot access './data/split/chunks_c/': No such file or directory


In [6]:
import xgboost as xgb

# Booster parameters passed to every xgb.train call in this notebook.
xgb_parms = dict(
    max_depth=8,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.3,
    eval_metric='logloss',
    objective='binary:logistic',
    tree_method='gpu_hist',
)

# bl_sub: train without a held-out test set and without early stopping.
# bl_pos: keep only training sessions that contain at least one positive target.
bl_sub, bl_pos = True, True
outputfolder = '24_XGB_Rerun_RedruceCandidate_DifferentWeights_Folds_ChrisCo_SameDay_v9'

# Number of seeds trained per chunk group, and boosting rounds per model.
no_seeds, no_trees = 10, 308

In [7]:
# Event type this notebook works on; it is used in file paths and label filters below.
ty = 'carts'
# Ground-truth labels. The three columns are renamed positionally to
# (session, cand, type).
labels = cudf.read_parquet('./data/xgb_train_y.parquet')
labels.columns = ['session', 'cand', 'type']
# Constant column set to 1 on every label row.
# NOTE(review): not read by any of the cells visible here.
labels['target'] = 1

In [8]:
# Chunk files of the current event type in the un-folded ./data layout, sorted by name.
chunk_pattern = './data/split/chunks/' + ty + '/chunk*.parquet'
files = glob.glob(chunk_pattern)
files.sort()
files

[]

In [9]:
# Preview the chunk files of the current event type across the five folds,
# fold by fold and sorted by name within each fold.
[
    chunk_path
    for fold_idx in range(0, 5)
    for chunk_path in sorted(
        glob.glob('./data_folds/fold_' + str(fold_idx) + '/split/chunks/' + ty + '/chunk*.parquet')
    )
]

['./data_folds/fold_0/split/chunks/carts/chunk_0.parquet',
 './data_folds/fold_0/split/chunks/carts/chunk_1.parquet',
 './data_folds/fold_1/split/chunks/carts/chunk_0.parquet',
 './data_folds/fold_1/split/chunks/carts/chunk_1.parquet',
 './data_folds/fold_2/split/chunks/carts/chunk_0.parquet',
 './data_folds/fold_2/split/chunks/carts/chunk_1.parquet',
 './data_folds/fold_3/split/chunks/carts/chunk_0.parquet',
 './data_folds/fold_3/split/chunks/carts/chunk_1.parquet',
 './data_folds/fold_4/split/chunks/carts/chunk_0.parquet',
 './data_folds/fold_4/split/chunks/carts/chunk_1.parquet']

In [10]:
# Unique sessions that have at least one 'orders' label, as a plain Python list.
# The training cell removes these sessions from the training frame when bl_sub is set.
session_filter = labels[(labels['type']=='orders')]['session'].drop_duplicates().to_pandas().tolist()

In [11]:
# Extra feature columns to exclude from training (currently none).
# NOTE(review): the name is a typo of "ignore_cols", but the training cell refers
# to it under this spelling, so it is left unchanged here.
ingore_cols = []

In [12]:
%%time

# Trained boosters, appended in (chunk group, seed) order; read by the prediction cells.
models = []
# NOTE(review): the three accumulators below are not updated anywhere in this cell.
hist_recall = []
total_no_hit = 0
total_no_gt = 0

if ty not in ('clicks', 'carts', 'orders'):
    raise ValueError('Unsupported event type: ' + repr(ty))

# Candidate chunks of all five folds in a deterministic (fold, file name) order.
# The file list does not change while training, so it is collected once.
all_chunk_files = sorted(
    chunk_path
    for igfold2 in range(0, 5)
    for chunk_path in glob.glob(
        './data_folds/fold_' + str(igfold2) + '/split/chunks_c/' + ty + '/chunk*.parquet'
    )
)

# Consecutive groups of `step` chunk files; one ensemble of seeds is trained per group.
step = 5
train_files_list = [
    all_chunk_files[i:i + step] for i in range(0, len(all_chunk_files), step)
]

for igfold in range(1):
    # NOTE(review): placeholder entries. Their count fixes how many chunk groups are
    # trained when bl_sub is set (an IndexError follows if fewer groups exist, and
    # extra groups are ignored). When bl_sub is False each entry is used as the
    # test-file path, so real paths are required in that mode.
    files = ['x', 'y']
    print(files)
    for ifile, file in enumerate(files):
        if bl_sub:
            # Submission mode: no held-out data, train on one group of chunk files.
            test_files = []
            train_files = train_files_list[ifile]
            print(train_files)

            df_train = pd.concat([pd.read_parquet(x) for x in train_files])
            # Drop the sessions listed in session_filter from the training data.
            df_train = df_train[~(df_train['session'].isin(session_filter))]
        else:
            # Validation mode: hold out one file and train on all other chunks.
            test_files = files[ifile]
            train_files = [y for y in all_chunk_files if y != test_files]

            # 'carts' frames are loaded with cudf, the other types with pandas,
            # as in the original per-type branches.
            frame_lib = cudf if ty == 'carts' else pd
            df_test = frame_lib.read_parquet(test_files)
            df_train = frame_lib.concat([frame_lib.read_parquet(x) for x in train_files])

        print(train_files)
        print(test_files)

        if bl_pos:
            # Keep only sessions that contain at least one positive candidate.
            df_train = df_train[df_train['session'].isin(
                df_train[df_train['target']==1]['session'].drop_duplicates().values
            )]

        # Feature columns: everything except identifiers, targets and ignored columns.
        # `train_cols` is also read by the prediction cells further down.
        train_cols = [x for x in df_train.columns if x not in [
            'session', 'cand', 'target', 'target_clicks', 'target_carts', 'target_orders'
        ] + ingore_cols]

        # Share of the ground-truth labels (for the sessions present) that appear
        # among the candidates, i.e. the upper bound the ranker can reach.
        print('Recall Train: ' + str(df_train[df_train['target']==1].shape[0]/labels[
            (labels['session'].isin(df_train['session'].drop_duplicates()))&(labels['type']==ty)
        ].shape[0]))
        if not bl_sub:
            print('Recall Test: ' + str(df_test[df_test['target']==1].shape[0]/labels[
                (labels['session'].isin(df_test['session'].drop_duplicates()))&(labels['type']==ty)
            ].shape[0]))

        # NOTE(review): the prediction cells call .fillna(-999) on their inputs while
        # the training frame is used as read - confirm the chunks contain no NaN,
        # otherwise missing values are treated differently at train and predict time.
        dtrain = xgb.DMatrix(data=df_train[train_cols].values,label=df_train['target'].values)
        if not bl_sub:
            dtest =  xgb.DMatrix(data=df_test[train_cols].values, label=df_test['target'].values)
            df_test['score'] = 0.0

        # The DMatrix holds its own copy of the data; release the frame early.
        del df_train
        gc.collect()

        for iseed in range(no_seeds):
            print('iseed: ' + str(iseed))
            xgb_parms['seed'] = iseed
            if bl_sub:
                # Fixed number of rounds; the training loss is only logged.
                model = xgb.train(
                    xgb_parms,
                    dtrain=dtrain,
                    evals=[(dtrain,'train')],
                    num_boost_round=no_trees,
                    verbose_eval=10
                )
            else:
                # Early stopping on the held-out file; test scores are summed over seeds.
                model = xgb.train(
                    xgb_parms,
                    dtrain=dtrain,
                    evals=[(dtest,'test')],
                    num_boost_round=no_trees,
                    verbose_eval=10,
                    early_stopping_rounds=50
                )
                df_test['score'] += model.predict(dtest)
            models.append(model)
        del dtrain

        gc.collect()

['x', 'y']
['./data_folds/fold_0/split/chunks_c/carts/chunk_0.parquet', './data_folds/fold_0/split/chunks_c/carts/chunk_1.parquet', './data_folds/fold_1/split/chunks_c/carts/chunk_0.parquet', './data_folds/fold_1/split/chunks_c/carts/chunk_1.parquet', './data_folds/fold_2/split/chunks_c/carts/chunk_0.parquet']
['./data_folds/fold_0/split/chunks_c/carts/chunk_0.parquet', './data_folds/fold_0/split/chunks_c/carts/chunk_1.parquet', './data_folds/fold_1/split/chunks_c/carts/chunk_0.parquet', './data_folds/fold_1/split/chunks_c/carts/chunk_1.parquet', './data_folds/fold_2/split/chunks_c/carts/chunk_0.parquet']
[]
Recall Train: 0.7304464949219718
iseed: 0
[0]	train-logloss:0.60094
[10]	train-logloss:0.19275
[20]	train-logloss:0.08325
[30]	train-logloss:0.04801
[40]	train-logloss:0.03652
[50]	train-logloss:0.03286
[60]	train-logloss:0.03163
[70]	train-logloss:0.03111
[80]	train-logloss:0.03083
[90]	train-logloss:0.03062
[100]	train-logloss:0.03045
[110]	train-logloss:0.03030
[120]	train-loglo

[150]	train-logloss:0.02979
[160]	train-logloss:0.02967
[170]	train-logloss:0.02956
[180]	train-logloss:0.02946
[190]	train-logloss:0.02935
[200]	train-logloss:0.02925
[210]	train-logloss:0.02915
[220]	train-logloss:0.02904
[230]	train-logloss:0.02894
[240]	train-logloss:0.02885
[250]	train-logloss:0.02875
[260]	train-logloss:0.02866
[270]	train-logloss:0.02856
[280]	train-logloss:0.02848
[290]	train-logloss:0.02839
[300]	train-logloss:0.02831
[307]	train-logloss:0.02825
iseed: 9
[0]	train-logloss:0.60081
[10]	train-logloss:0.19274
[20]	train-logloss:0.08324
[30]	train-logloss:0.04807
[40]	train-logloss:0.03652
[50]	train-logloss:0.03285
[60]	train-logloss:0.03163
[70]	train-logloss:0.03113
[80]	train-logloss:0.03084
[90]	train-logloss:0.03063
[100]	train-logloss:0.03045
[110]	train-logloss:0.03031
[120]	train-logloss:0.03015
[130]	train-logloss:0.03003
[140]	train-logloss:0.02991
[150]	train-logloss:0.02980
[160]	train-logloss:0.02971
[170]	train-logloss:0.02960
[180]	train-logloss:0.

[300]	train-logloss:0.02847
[307]	train-logloss:0.02840
iseed: 7
[0]	train-logloss:0.60078
[10]	train-logloss:0.19278
[20]	train-logloss:0.08336
[30]	train-logloss:0.04820
[40]	train-logloss:0.03670
[50]	train-logloss:0.03303
[60]	train-logloss:0.03175
[70]	train-logloss:0.03127
[80]	train-logloss:0.03099
[90]	train-logloss:0.03078
[100]	train-logloss:0.03062
[110]	train-logloss:0.03048
[120]	train-logloss:0.03034
[130]	train-logloss:0.03021
[140]	train-logloss:0.03009
[150]	train-logloss:0.02997
[160]	train-logloss:0.02987
[170]	train-logloss:0.02978
[180]	train-logloss:0.02968
[190]	train-logloss:0.02956
[200]	train-logloss:0.02945
[210]	train-logloss:0.02935
[220]	train-logloss:0.02926
[230]	train-logloss:0.02917
[240]	train-logloss:0.02909
[250]	train-logloss:0.02898
[260]	train-logloss:0.02889
[270]	train-logloss:0.02881
[280]	train-logloss:0.02872
[290]	train-logloss:0.02863
[300]	train-logloss:0.02852
[307]	train-logloss:0.02846
iseed: 8
[0]	train-logloss:0.60081
[10]	train-logl

In [13]:
# Re-purpose no_seeds as the total number of trained boosters so that the
# prediction cells below average over every model in `models`.
# NOTE(review): this overwrites the configured seeds-per-chunk value; re-running
# the training cell afterwards without re-running the config cell would train
# len(models) seeds per chunk group.
no_seeds = len(models)
# 0 => the prediction cells read the original chunks_c / sub_c files and write
# the scored copies to chunks_c_2 / sub_c_2.
ifile = 0

In [14]:
no_seeds

20

In [15]:
# Create the per-fold output directories for the scored submission files and the
# scored 'orders' chunks. os.makedirs is used instead of shelling out to
# `mkdir -p`: it is portable and raises if a directory cannot be created,
# whereas the exit status of os.system was silently ignored.
for igfold2 in range(5):
    fold_split_dir = os.path.join('./data_folds', 'fold_' + str(igfold2), 'split')
    os.makedirs(os.path.join(fold_split_dir, 'sub_c_2'), exist_ok=True)
    os.makedirs(os.path.join(fold_split_dir, 'chunks_c_2', 'orders'), exist_ok=True)

Predict on the orders candidate chunks and add the carts-model score (`xgb_ca_score`) as a new column

In [16]:
# Score every 'orders' candidate chunk with the full ensemble and store the
# averaged prediction in the column `xgb_ca_score`.
# The average is taken over len(models) directly, so the result does not depend
# on `no_seeds` having been re-synchronised in an earlier cell.
if not models:
    raise RuntimeError('no trained models available - run the training cell first')
n_models = len(models)

# ifile == 0: read the original chunks and write scored copies to chunks_c_2.
# otherwise:  read the already scored copies and update them in place.
src_dir = 'chunks_c' if ifile == 0 else 'chunks_c_2'

for igfold2 in range(5):
    sssfiles = glob.glob(
        './data_folds/fold_' + str(igfold2) + '/split/' + src_dir + '/' + 'orders' + '/chunk*.parquet'
    )
    for isubfile, sub_file in enumerate(sssfiles):
        print('isubfile: ' + str(isubfile))
        df_sub = cudf.read_parquet(sub_file).fillna(-999)
        if 'xgb_ca_score' not in df_sub.columns:
            df_sub['xgb_ca_score'] = 0.0

        # `train_cols` is the feature list left behind by the training cell.
        dsub = xgb.DMatrix(data=df_sub[train_cols].values)
        for model in models:
            df_sub['xgb_ca_score'] = df_sub['xgb_ca_score'] + model.predict(dsub)/n_models

        out_file = sub_file.replace('/chunks_c/', '/chunks_c_2/') if ifile == 0 else sub_file
        df_sub.to_parquet(out_file)

        # Release the frame and its DMatrix before loading the next chunk.
        del df_sub, dsub
        gc.collect()
gc.collect()

isubfile: 0




isubfile: 1
isubfile: 0
isubfile: 1
isubfile: 0
isubfile: 1
isubfile: 0
isubfile: 1
isubfile: 0
isubfile: 1


0

In [17]:
# Score every submission candidate file with the full ensemble and store the
# averaged prediction in the column `xgb_ca_score` (submission mode only).
# The average is taken over len(models) directly, so the result does not depend
# on `no_seeds` having been re-synchronised in an earlier cell.
if bl_sub:
    if not models:
        raise RuntimeError('no trained models available - run the training cell first')
    n_models = len(models)

    # ifile == 0: read the original files and write scored copies to sub_c_2.
    # otherwise:  read the already scored copies and update them in place.
    src_dir = 'sub_c' if ifile == 0 else 'sub_c_2'

    for igfold2 in [0,1,2,3,4]:
        sssfiles = glob.glob('./data_folds/fold_' + str(igfold2) + '/split/' + src_dir + '/*.parquet')
        for isubfile, sub_file in enumerate(sssfiles):
            print('isubfile: ' + str(isubfile))
            df_sub = cudf.read_parquet(sub_file).fillna(-999)
            if 'xgb_ca_score' not in df_sub.columns:
                df_sub['xgb_ca_score'] = 0.0

            # `train_cols` is the feature list left behind by the training cell.
            dsub = xgb.DMatrix(data=df_sub[train_cols].values)
            for model in models:
                df_sub['xgb_ca_score'] = df_sub['xgb_ca_score'] + model.predict(dsub)/n_models

            out_file = sub_file.replace('/sub_c/', '/sub_c_2/') if ifile == 0 else sub_file
            df_sub.to_parquet(out_file)

            # Release the frame and its DMatrix before loading the next file.
            del df_sub, dsub
            gc.collect()
    gc.collect()

isubfile: 0
isubfile: 1
isubfile: 2
isubfile: 3
isubfile: 4
isubfile: 5
isubfile: 6
isubfile: 7
isubfile: 8
isubfile: 9
isubfile: 10
isubfile: 11
isubfile: 12
isubfile: 13
isubfile: 14
isubfile: 15
isubfile: 16
isubfile: 17
isubfile: 18
isubfile: 19
isubfile: 20
isubfile: 21
isubfile: 22
isubfile: 23
isubfile: 24
isubfile: 25
isubfile: 26
isubfile: 27
isubfile: 28
isubfile: 29
isubfile: 30
isubfile: 31
isubfile: 32
isubfile: 33
isubfile: 34
isubfile: 35
isubfile: 36
isubfile: 37
isubfile: 38
isubfile: 39
isubfile: 40
isubfile: 41
isubfile: 42
isubfile: 43
isubfile: 44
isubfile: 45
isubfile: 46
isubfile: 47
isubfile: 48
isubfile: 49
isubfile: 50
isubfile: 51
isubfile: 52
isubfile: 53
isubfile: 54
isubfile: 55
isubfile: 56
isubfile: 57
isubfile: 58
isubfile: 59
isubfile: 60
isubfile: 61
isubfile: 62
isubfile: 63
isubfile: 64
isubfile: 65
isubfile: 66
isubfile: 67
isubfile: 68
isubfile: 69
isubfile: 70
isubfile: 71
isubfile: 72
isubfile: 73
isubfile: 74
isubfile: 75
isubfile: 76
isubfile: