In [2]:
import os

# Expose only GPU 0 to CUDA. This is set here so it is already in place when the
# GPU libraries (cudf, xgboost) are imported in the cells below.
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [3]:
import cudf
import glob
import gc

import pandas as pd

from tqdm import tqdm

In [4]:
def calc_recall(sub, ty='clicks', file='./data/xgb_train_y.parquet'):
    """Compute recall@20 of scored candidates against the ground-truth labels.

    Parameters
    ----------
    sub : DataFrame
        Scored candidates; the columns 'session', 'cand' and 'score' are read.
        The frame passed in is not modified (sorting produces a new frame).
    ty : str
        Label type to evaluate; only label rows whose 'type' equals this value are used.
    file : str
        Parquet file holding the labels; the columns 'session', 'aid' and 'type' are read.

    Returns
    -------
    tuple
        ``(recall, hits, gt_total)`` where ``hits`` is the number of distinct
        (session, aid) label pairs found among the top-20 candidates of their
        session and ``gt_total`` is the per-session label count capped at 20,
        summed over all evaluated sessions. ``recall`` is also printed.
    """
    # Order each session's candidates from best to worst score.
    ranked = sub.sort_values(['session', 'score'], ascending=[True, False])
    # A running sum over a column of ones yields the 1-based rank inside a session.
    ranked['dummy'] = 1
    ranked['rank'] = ranked.groupby(['session']).dummy.cumsum()
    ranked = ranked.drop(['dummy'], axis=1)

    # Keep the 20 best candidates per session and name the item column like the labels.
    top = ranked[ranked['rank'] < 21]
    top = top[['session', 'cand']]
    top.columns = ['session', 'aid']
    top['target'] = 1

    # Labels of the requested type, each row annotated with its session's label count.
    gt = cudf.read_parquet(file)
    gt = gt[gt['type'] == ty]
    gt_counts = (
        gt.groupby(['session', 'type'])
        .count()
        .reset_index()
        .rename(columns={'aid': 'no_gt'})
    )
    gt = gt.merge(gt_counts, how='left', on=['session', 'type'])

    # Flag the labels that appear among the predictions; unmatched labels get target 0.
    predicted = top[['session', 'aid', 'target']].drop_duplicates(['session', 'aid'])
    gt = gt.merge(predicted, how='left', on=['session', 'aid'])
    gt['target'] = gt['target'].fillna(0)

    # Only sessions that have at least one prediction are evaluated.
    gt = gt[gt['session'].isin(top['session'])]
    gt = gt.groupby(['session', 'aid']).agg({'no_gt': 'min', 'target': 'sum'}).reset_index()

    hits = gt.target.sum()
    # A session cannot contribute more than 20 to the denominator.
    gt_total = gt.groupby(['session']).no_gt.min().clip(0, 20).sum()
    recall = hits / gt_total
    print(recall)
    return (recall, hits, gt_total)

In [5]:
!ls ./data/split/chunks

carts  clicks  orders


In [6]:
import xgboost as xgb

# Booster parameters handed to xgb.train in the training cell.
xgb_parms = dict(
    max_depth=8,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.3,
    eval_metric='logloss',
    objective='binary:logistic',
    tree_method='gpu_hist',
)

# bl_sub: train on the fold chunks without a held-out test set (no early stopping).
# bl_pos: keep only training sessions that contain at least one positive target.
bl_sub, bl_pos = True, True
outputfolder = '24_XGB_Rerun_RedruceCandidate_DifferentWeights_Folds_ChrisCo_SameDay_v9'
# Number of boosting rounds passed as num_boost_round.
no_trees = 394

In [7]:
# Event type this notebook trains on; it also selects which chunk folders are read below.
ty = 'clicks'
# Ground-truth labels. The columns are renamed positionally to session / cand / type
# (calc_recall reads the same file and refers to the item column as 'aid').
labels = cudf.read_parquet('./data/xgb_train_y.parquet')
labels.columns = ['session', 'cand', 'type']
# Flag every label row with target = 1.
labels['target'] = 1

In [8]:
# Sorted chunk files of the current event type (displayed as the cell output).
files = sorted(glob.glob(f'./data/split/chunks/{ty}/chunk*.parquet'))
files

['./data/split/chunks/clicks/chunk_0.parquet',
 './data/split/chunks/clicks/chunk_1.parquet']

In [9]:
# Chunk files of the current event type across folds 0-4, sorted within each fold
# and listed fold by fold (displayed as the cell output).
[
    chunk_path
    for igfold2 in range(0, 5)
    for chunk_path in sorted(
        glob.glob(f'./data_folds/fold_{igfold2}/split/chunks/{ty}/chunk*.parquet')
    )
]

['./data_folds/fold_0/split/chunks/clicks/chunk_0.parquet',
 './data_folds/fold_0/split/chunks/clicks/chunk_1.parquet',
 './data_folds/fold_1/split/chunks/clicks/chunk_0.parquet',
 './data_folds/fold_1/split/chunks/clicks/chunk_1.parquet',
 './data_folds/fold_2/split/chunks/clicks/chunk_0.parquet',
 './data_folds/fold_2/split/chunks/clicks/chunk_1.parquet',
 './data_folds/fold_3/split/chunks/clicks/chunk_0.parquet',
 './data_folds/fold_3/split/chunks/clicks/chunk_1.parquet',
 './data_folds/fold_4/split/chunks/clicks/chunk_0.parquet',
 './data_folds/fold_4/split/chunks/clicks/chunk_1.parquet']

In [10]:
# Sessions that have at least one 'carts' or 'orders' label, as a plain Python list;
# the training cell removes these sessions from df_train.
is_cart_or_order = (labels['type'] == 'carts') | (labels['type'] == 'orders')
session_filter = (
    labels[is_cart_or_order]['session']
    .drop_duplicates()
    .to_pandas()
    .tolist()
)

In [11]:
# Extra columns to exclude from the feature list built in the training cell (none here).
# The spelling "ingore" is kept because the training cell refers to this exact name.
ingore_cols = []

In [12]:
%%time

# Train the XGBoost click models. For every entry of `files`, `no_seeds` boosters
# (one per seed) are trained and appended to `models`.
# NOTE(review): `no_seeds` is not assigned in any cell visible above this one; the only
# visible assignment is `no_seeds = len(models)` in a later cell. On a fresh kernel the
# seed loop below would raise NameError, and on a re-run it would pick up the total model
# count. Confirm the intended seeds-per-file value and define it in the config cell.
models = []
# hist_recall / total_no_hit / total_no_gt are initialised here but not updated in this cell.
hist_recall = []
total_no_hit = 0
total_no_gt = 0
# Single pass; `igfold` itself is not used in the body.
for igfold in range(1):
    # Placeholder names that overwrite the `files` list globbed earlier. In the bl_sub
    # path only the number of entries (5 iterations) and the position `ifile` matter.
    files = ['x', 'y', 'z', 'zz', 'zzz']
    print(files)
    for ifile, file in enumerate(files):
        if bl_sub:
            # The three branches are identical; the rotated `train_files` computed here
            # is replaced by the glob result just below, and no test files are used.
            if ty == 'clicks':
                test_files = []
                train_files = [files[x] for x in [(ifile+i+1)%len(files) for i in range(len(files))]]
            elif ty == 'carts':
                test_files = []
                train_files = [files[x] for x in [(ifile+i+1)%len(files) for i in range(len(files))]]
            elif ty == 'orders':
                test_files = []
                train_files = [files[x] for x in [(ifile+i+1)%len(files) for i in range(len(files))]]
            else:
                assert 0!=0
            
            # All chunk files of this event type across folds 0-4, in sorted order.
            train_files = sorted([
                y for x in [
                    sorted(glob.glob('./data_folds/fold_' + str(igfold2) + '/split/chunks/' + ty + '/chunk*.parquet'))
                    for igfold2 in range(0,5)
                ] for y in x
            ])
            
            # Cut the sorted list into consecutive groups of `step` files and train on the
            # ifile-th group. With the two-chunks-per-fold listing shown in the earlier
            # cell output, group i corresponds to the chunks of fold i.
            step=2
            train_files_list = []
            for i in range(0, len(train_files), step):
                x = i
                train_files_list.append(train_files[x:x+step])
            train_files = train_files_list[ifile]
            print(train_files)
            
            # Same pandas load for every event type.
            if ty == 'clicks':
                df_train = pd.concat([
                    pd.read_parquet(x) for x in train_files
                ])
            elif ty == 'carts':
                df_train = pd.concat([
                    pd.read_parquet(x) for x in train_files
                ])
            elif ty == 'orders':
                df_train = pd.concat([
                    pd.read_parquet(x) for x in train_files
                ])
            # Drop the sessions listed in session_filter (built in an earlier cell).
            df_train = df_train[~(df_train['session'].isin(session_filter))]
        else:
            # NOTE(review): in this branch `test_files` is a single placeholder string taken
            # from `files`, so `y not in test_files` below is a substring test on that string
            # and read_parquet(test_files) is handed the placeholder. This path looks stale
            # while bl_sub is True - verify before enabling it.
            if ty == 'clicks':
                test_files = files[ifile]
                train_files = [files[x] for x in [(ifile+i+1)%len(files) for i in range(len(files))]][0:1]
            elif ty == 'carts':
                test_files = files[ifile]
                train_files = [files[x] for x in [(ifile+i+1)%len(files) for i in range(len(files))]][0:1]
            elif ty == 'orders':
                test_files = files[ifile]
                train_files = [files[x] for x in [(ifile+i+1)%len(files) for i in range(len(files))]][0:1]
            else:
                assert 0!=0
            
            train_files = [
                y for x in [
                    sorted(glob.glob('./data_folds/fold_' + str(igfold2) + '/split/chunks/' + ty + '/chunk*.parquet'))
                    for igfold2 in range(0,5)
                ] for y in x if y not in test_files]
            
            # 'carts' is loaded with cudf here, the other two types with pandas.
            if ty == 'clicks':
                df_test = pd.read_parquet(test_files)
                df_train = pd.concat([
                    pd.read_parquet(x) for x in train_files
                ])
            elif ty == 'carts':
                df_test = cudf.read_parquet(test_files)
                df_train = cudf.concat([
                    cudf.read_parquet(x) for x in train_files
                ])
            elif ty == 'orders':
                df_test = pd.read_parquet(test_files)
                df_train = pd.concat([
                    pd.read_parquet(x) for x in train_files
                ])

        print(train_files)
        print(test_files)

        # Keep only sessions that contain at least one positive (target == 1) row.
        if bl_pos:
            df_train = df_train[df_train['session'].isin(
                df_train[df_train['target']==1]['session'].drop_duplicates().values
            )]

        # Feature columns: everything except the id/target columns and ingore_cols.
        # `train_cols` is reused by the prediction cells further down.
        train_cols = [x for x in df_train.columns if x not in [
            'session', 'cand', 'target', 'target_clicks', 'target_carts', 'target_orders'
        ] + ingore_cols]

        # Positive candidate rows divided by the number of label rows of type `ty`
        # for the sessions present in the frame.
        print('Recall Train: ' + str(df_train[df_train['target']==1].shape[0]/labels[
        (labels['session'].isin(df_train['session'].drop_duplicates()))&(labels['type']==ty)
    ].shape[0]))
        if not bl_sub:
            print('Recall Test: ' + str(df_test[df_test['target']==1].shape[0]/labels[
            (labels['session'].isin(df_test['session'].drop_duplicates()))&(labels['type']==ty)
        ].shape[0]))

        dtrain = xgb.DMatrix(data=df_train[train_cols].values,label=df_train['target'].values)
        if not bl_sub:
            dtest =  xgb.DMatrix(data=df_test[train_cols].values, label=df_test['target'].values)
            # Accumulates the sum (not the mean) of the per-seed test predictions.
            df_test['score'] = 0.0

        # The DMatrix holds the training data from here on; release the frame.
        del df_train
        gc.collect()

        for iseed in range(no_seeds):
            print('iseed: ' + str(iseed))
            # Mutates the shared parameter dict; it keeps the last seed after the loop.
            xgb_parms['seed'] = iseed
            if bl_sub:
                # Fixed number of rounds; the training loss is only monitored.
                model = xgb.train(
                    xgb_parms, 
                    dtrain=dtrain,
                    evals=[(dtrain,'train')],
                    num_boost_round=no_trees,
                    verbose_eval=10
                )
            else:
                # Early stopping on the held-out test set.
                model = xgb.train(
                    xgb_parms, 
                    dtrain=dtrain,
                    evals=[(dtest,'test')],
                    num_boost_round=no_trees,
                    verbose_eval=10,
                    early_stopping_rounds=50
                )
                df_test['score'] += model.predict(dtest)
            models.append(model)
        del dtrain
        gc.collect()

['x', 'y', 'z', 'zz', 'zzz']
['./data_folds/fold_0/split/chunks/clicks/chunk_0.parquet', './data_folds/fold_0/split/chunks/clicks/chunk_1.parquet']
['./data_folds/fold_0/split/chunks/clicks/chunk_0.parquet', './data_folds/fold_0/split/chunks/clicks/chunk_1.parquet']
[]
Recall Train: 1.0
iseed: 0
[0]	train-logloss:0.60190
[10]	train-logloss:0.19911
[20]	train-logloss:0.09192
[30]	train-logloss:0.05802
[40]	train-logloss:0.04712
[50]	train-logloss:0.04363
[60]	train-logloss:0.04245
[70]	train-logloss:0.04196
[80]	train-logloss:0.04170
[90]	train-logloss:0.04152
[100]	train-logloss:0.04138
[110]	train-logloss:0.04126
[120]	train-logloss:0.04114
[130]	train-logloss:0.04104
[140]	train-logloss:0.04094
[150]	train-logloss:0.04085
[160]	train-logloss:0.04077
[170]	train-logloss:0.04069
[180]	train-logloss:0.04060
[190]	train-logloss:0.04051
[200]	train-logloss:0.04043
[210]	train-logloss:0.04036
[220]	train-logloss:0.04029
[230]	train-logloss:0.04023
[240]	train-logloss:0.04017
[250]	train-lo

[190]	train-logloss:0.04059
[200]	train-logloss:0.04051
[210]	train-logloss:0.04045
[220]	train-logloss:0.04038
[230]	train-logloss:0.04032
[240]	train-logloss:0.04026
[250]	train-logloss:0.04019
[260]	train-logloss:0.04013
[270]	train-logloss:0.04006
[280]	train-logloss:0.03999
[290]	train-logloss:0.03992
[300]	train-logloss:0.03988
[310]	train-logloss:0.03981
[320]	train-logloss:0.03975
[330]	train-logloss:0.03968
[340]	train-logloss:0.03963
[350]	train-logloss:0.03957
[360]	train-logloss:0.03950
[370]	train-logloss:0.03943
[380]	train-logloss:0.03937
[390]	train-logloss:0.03931
[393]	train-logloss:0.03930
iseed: 1
[0]	train-logloss:0.60189
[10]	train-logloss:0.19913
[20]	train-logloss:0.09206
[30]	train-logloss:0.05815
[40]	train-logloss:0.04727
[50]	train-logloss:0.04376
[60]	train-logloss:0.04258
[70]	train-logloss:0.04210
[80]	train-logloss:0.04180
[90]	train-logloss:0.04162
[100]	train-logloss:0.04145
[110]	train-logloss:0.04133
[120]	train-logloss:0.04121
[130]	train-logloss:0.

[70]	train-logloss:0.04213
[80]	train-logloss:0.04182
[90]	train-logloss:0.04165
[100]	train-logloss:0.04149
[110]	train-logloss:0.04136
[120]	train-logloss:0.04125
[130]	train-logloss:0.04116
[140]	train-logloss:0.04106
[150]	train-logloss:0.04098
[160]	train-logloss:0.04090
[170]	train-logloss:0.04082
[180]	train-logloss:0.04075
[190]	train-logloss:0.04066
[200]	train-logloss:0.04059
[210]	train-logloss:0.04052
[220]	train-logloss:0.04044
[230]	train-logloss:0.04038
[240]	train-logloss:0.04030
[250]	train-logloss:0.04025
[260]	train-logloss:0.04018
[270]	train-logloss:0.04011
[280]	train-logloss:0.04005
[290]	train-logloss:0.03999
[300]	train-logloss:0.03993
[310]	train-logloss:0.03987
[320]	train-logloss:0.03981
[330]	train-logloss:0.03976
[340]	train-logloss:0.03971
[350]	train-logloss:0.03964
[360]	train-logloss:0.03959
[370]	train-logloss:0.03953
[380]	train-logloss:0.03948
[390]	train-logloss:0.03941
[393]	train-logloss:0.03939
iseed: 2
[0]	train-logloss:0.60189
[10]	train-loglo

In [13]:
# From here on `no_seeds` is the total number of trained models: the prediction cells
# below loop over range(no_seeds) and divide each prediction by it to average all models.
no_seeds = len(models)
# Overwrites the training loop variable. ifile == 0 makes the prediction cells read the
# original chunks/ and sub/ files and write the scored copies to chunks_c/ and sub_c/.
ifile = 0

In [15]:
# Create, for each of the five folds, the folders that receive the click-scored outputs:
# split/sub_c, split/chunks_c/carts and split/chunks_c/orders.
# os.makedirs(..., exist_ok=True) is used instead of shelling out to `mkdir -p`: it needs
# no shell, works on every platform, and raises if a directory cannot be created instead
# of silently ignoring the command's exit status.
for igfold2 in range(5):
    split_dir = os.path.join('.', 'data_folds', 'fold_' + str(igfold2), 'split')
    for sub_dir in (
        'sub_c',
        os.path.join('chunks_c', 'carts'),
        os.path.join('chunks_c', 'orders'),
    ):
        os.makedirs(os.path.join(split_dir, sub_dir), exist_ok=True)

Predict Carts Dataset and add click score

In [17]:
# Score every carts chunk of every fold with the trained click models and store the
# model-averaged prediction in an 'xgb_c_score' column.
# NOTE(review): features are filled with -999 here, while the visible training path
# builds its DMatrix without filling NaNs - confirm this difference is intended.
for igfold2 in range(5):
    # ifile == 0: read the original chunks; otherwise re-read the already scored copies.
    chunk_dir = 'chunks' if ifile == 0 else 'chunks_c'
    sssfiles = glob.glob(f'./data_folds/fold_{igfold2}/split/{chunk_dir}/carts/chunk*.parquet')
    for isubfile, sub_file in enumerate(sssfiles):
        print(f'isubfile: {isubfile}')
        df_sub = cudf.read_parquet(sub_file).fillna(-999)
        if 'xgb_c_score' not in df_sub.columns:
            df_sub['xgb_c_score'] = 0.0

        dsub = xgb.DMatrix(data=df_sub[train_cols].values)
        # Every model adds 1/no_seeds of its prediction to the running score.
        for iseed in range(no_seeds):
            df_sub['xgb_c_score'] = df_sub['xgb_c_score'] + models[iseed].predict(dsub)/no_seeds

        # First pass writes to the parallel chunks_c/ folder; later passes overwrite in place.
        out_path = sub_file.replace('/chunks/', '/chunks_c/') if ifile == 0 else sub_file
        df_sub.to_parquet(out_path)
        del df_sub
        gc.collect()

isubfile: 0




isubfile: 1
isubfile: 0
isubfile: 1
isubfile: 0
isubfile: 1
isubfile: 0
isubfile: 1
isubfile: 0
isubfile: 1


0

Predict Orders Dataset and add click score

In [28]:
# Score every orders chunk of every fold with the trained click models and store the
# model-averaged prediction in an 'xgb_c_score' column.
# NOTE(review): features are filled with -999 here, while the visible training path
# builds its DMatrix without filling NaNs - confirm this difference is intended.
for igfold2 in range(5):
    # ifile == 0: read the original chunks; otherwise re-read the already scored copies.
    if ifile==0:
        sssfiles = glob.glob('./data_folds/fold_' + str(igfold2) + '/split/chunks/' + 'orders' + '/chunk*.parquet')
    else:
        sssfiles = glob.glob('./data_folds/fold_' + str(igfold2) + '/split/chunks_c/' + 'orders' + '/chunk*.parquet')
    # Iterate the list selected above. The loop previously re-globbed chunks/ and ignored
    # `sssfiles`, so with ifile != 0 it would have read the unscored files and overwritten
    # the originals in place, unlike the carts and submission cells.
    for isubfile, sub_file in enumerate(sssfiles):
        print('isubfile: ' + str(isubfile))
        df_sub = cudf.read_parquet(sub_file).fillna(-999)
        if 'xgb_c_score' not in df_sub.columns:
            df_sub['xgb_c_score'] = 0.0

        dsub = xgb.DMatrix(data=df_sub[train_cols].values)
        # Every model adds 1/no_seeds of its prediction to the running score.
        for iseed in range(no_seeds):
            df_sub['xgb_c_score'] = df_sub['xgb_c_score'] + models[iseed].predict(dsub)/no_seeds
        # First pass writes to the parallel chunks_c/ folder; later passes overwrite in place.
        if ifile==0:
            df_sub.to_parquet(
                sub_file.replace('/chunks/', '/chunks_c/')
            )
        else:
            df_sub.to_parquet(
                sub_file
            )
        del df_sub
        gc.collect()

isubfile: 0




isubfile: 1
isubfile: 0
isubfile: 1
isubfile: 0
isubfile: 1
isubfile: 0
isubfile: 1
isubfile: 0
isubfile: 1


Predict Submission Dataset and add click score

In [30]:
# Score every submission file of every fold with the trained click models and store the
# model-averaged prediction in an 'xgb_c_score' column.
# NOTE(review): features are filled with -999 here, while the visible training path
# builds its DMatrix without filling NaNs - confirm this difference is intended.
for igfold2 in range(5):
    # ifile == 0: read the original sub/ files; otherwise re-read the already scored copies.
    sub_dir = 'sub' if ifile == 0 else 'sub_c'
    sssfiles = glob.glob(f'./data_folds/fold_{igfold2}/split/{sub_dir}/*.parquet')
    for isubfile, sub_file in enumerate(sssfiles):
        print(f'isubfile: {isubfile}')
        df_sub = cudf.read_parquet(sub_file).fillna(-999)
        if 'xgb_c_score' not in df_sub.columns:
            df_sub['xgb_c_score'] = 0.0

        dsub = xgb.DMatrix(data=df_sub[train_cols].values)
        # Every model adds 1/no_seeds of its prediction to the running score.
        for iseed in range(no_seeds):
            df_sub['xgb_c_score'] = df_sub['xgb_c_score'] + models[iseed].predict(dsub)/no_seeds

        # First pass writes to the parallel sub_c/ folder; later passes overwrite in place.
        out_path = sub_file.replace('/sub/', '/sub_c/') if ifile == 0 else sub_file
        df_sub.to_parquet(out_path)
        del df_sub
        gc.collect()
gc.collect()

isubfile: 0
isubfile: 1
isubfile: 2
isubfile: 3
isubfile: 4
isubfile: 5
isubfile: 6
isubfile: 7
isubfile: 8
isubfile: 9
isubfile: 10
isubfile: 11
isubfile: 12
isubfile: 13
isubfile: 14
isubfile: 15
isubfile: 16
isubfile: 17
isubfile: 18
isubfile: 19
isubfile: 20
isubfile: 21
isubfile: 22
isubfile: 23
isubfile: 24
isubfile: 25
isubfile: 26
isubfile: 27
isubfile: 28
isubfile: 29
isubfile: 30
isubfile: 31
isubfile: 32
isubfile: 33
isubfile: 34
isubfile: 35
isubfile: 36
isubfile: 37
isubfile: 38
isubfile: 39
isubfile: 40
isubfile: 41
isubfile: 42
isubfile: 43
isubfile: 44
isubfile: 45
isubfile: 46
isubfile: 47
isubfile: 48
isubfile: 49
isubfile: 50
isubfile: 51
isubfile: 52
isubfile: 53
isubfile: 54
isubfile: 55
isubfile: 56
isubfile: 57
isubfile: 58
isubfile: 59
isubfile: 60
isubfile: 61
isubfile: 62
isubfile: 63
isubfile: 64
isubfile: 65
isubfile: 66
isubfile: 67
isubfile: 68
isubfile: 69
isubfile: 70
isubfile: 71
isubfile: 72
isubfile: 73
isubfile: 74
isubfile: 75
isubfile: 76
isubfile: