<a href="https://colab.research.google.com/github/afdebbas/DataScience/blob/master/1_Target_Encoding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

from scipy.stats import rankdata
import lightgbm as lgb

# Loading train + test and taking a look

In [3]:
# https://www.kaggle.com/c/amazon-employee-access-challenge

train = pd.read_csv( '/content/amazon-employee-access-challenge/train.csv' )
test  = pd.read_csv( '/content/amazon-employee-access-challenge/test.csv' )
print(train.shape, test.shape)
train.head()

(32769, 10) (58921, 10)


Unnamed: 0,ACTION,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_TITLE,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE
0,1,39353,85475,117961,118300,123472,117905,117906,290919,117908
1,1,17183,1540,117961,118343,123125,118536,118536,308574,118539
2,1,36724,14457,118219,118220,117884,117879,267952,19721,117880
3,1,36135,5396,117961,118343,119993,118321,240983,290919,118322
4,1,42680,5905,117929,117930,119569,119323,123932,19793,119325


In [0]:
!mkdir /content/amazon-employee-access-challenge

In [4]:
train['ACTION'].value_counts()

1    30872
0     1897
Name: ACTION, dtype: int64

In [5]:
train.nunique()

ACTION                 2
RESOURCE            7518
MGR_ID              4243
ROLE_ROLLUP_1        128
ROLE_ROLLUP_2        177
ROLE_DEPTNAME        449
ROLE_TITLE           343
ROLE_FAMILY_DESC    2358
ROLE_FAMILY           67
ROLE_CODE            343
dtype: int64

# Label encode the categorical features

In [6]:
from sklearn.preprocessing import LabelEncoder

features = []
for col in train.columns[1:]:
    rd = LabelEncoder()
    rd.fit_transform( train[col].append( test[col] ) )
    train[col] = rd.transform( train[col] )
    test [col] = rd.transform( test [col] )
    features.append(col)

train[features].head(10)

Unnamed: 0,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_TITLE,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE
0,3050,4440,21,65,319,4,7,65,4
1,644,162,21,69,310,34,62,67,38
2,2706,1679,50,58,14,0,2590,3,0
3,2615,931,21,69,184,22,2357,65,23
4,3616,1010,15,13,160,70,380,4,77
5,4172,1685,19,18,23,36,68,3,40
6,1329,2051,21,69,320,60,2813,8,66
7,756,615,21,22,119,224,2605,21,234
8,1842,67,21,74,229,240,2823,1,251
9,5481,3797,33,38,13,0,2836,3,0


# Let's fit a benchmark model using raw features

In [7]:
features = train.columns[1:10]
target_name = 'ACTION'

params = {'objective': 'binary',
         'learning_rate':0.01,
         'feature_fraction' : 0.70,
         'bagging_fraction' : 0.80,
         'bagging_freq' : 1,
         'min_child_weight': 0,
         'num_leaves'  : 127, #7, 15, 31, 63, 127, 255, 511, 1023
         'n_estimators': 9999,
         'seed': 1
         }

kf = KFold(n_splits=5, shuffle=True, random_state=1)

y_pred_train = np.zeros( train.shape[0] )
y_pred_test  = np.zeros( test.shape[0] )
for fold, (train_index, valid_index) in enumerate(kf.split( train )):
    
    train_fold = train[features].iloc[train_index].copy()
    valid_fold = train[features].iloc[valid_index].copy()
    test_fold  = test [features].copy()

    model = lgb.LGBMRegressor( **params )
    model.fit( 
        train_fold, train[target_name].iloc[train_index],
        eval_set = (valid_fold, train[target_name].iloc[valid_index] ),
        eval_metric = 'auc', early_stopping_rounds = 50, verbose=100
    )
    
    y_pred_train[valid_index] = model.predict( valid_fold )
    y_pred_test              += model.predict( test_fold ) 
y_pred_test /= 5.          

print( 'CV AUC:', roc_auc_score( train[target_name], y_pred_train ) )

sub = test[['id']]
sub['ACTION'] = y_pred_test
sub.to_csv( 'sub-amazon-lightgbm-benchmark.csv', index=False )
sub.head(), sub.shape

Training until validation scores don't improve for 50 rounds.
[100]	valid_0's binary_logloss: 0.169148	valid_0's auc: 0.864952
[200]	valid_0's binary_logloss: 0.156258	valid_0's auc: 0.874547
[300]	valid_0's binary_logloss: 0.149428	valid_0's auc: 0.880937
[400]	valid_0's binary_logloss: 0.145428	valid_0's auc: 0.88346
[500]	valid_0's binary_logloss: 0.142666	valid_0's auc: 0.885588
[600]	valid_0's binary_logloss: 0.141047	valid_0's auc: 0.88593
[700]	valid_0's binary_logloss: 0.139942	valid_0's auc: 0.886517
[800]	valid_0's binary_logloss: 0.139278	valid_0's auc: 0.886855
[900]	valid_0's binary_logloss: 0.138916	valid_0's auc: 0.886915
Early stopping, best iteration is:
[884]	valid_0's binary_logloss: 0.138805	valid_0's auc: 0.88723
Training until validation scores don't improve for 50 rounds.
[100]	valid_0's binary_logloss: 0.17935	valid_0's auc: 0.832584
[200]	valid_0's binary_logloss: 0.166891	valid_0's auc: 0.841988
[300]	valid_0's binary_logloss: 0.160929	valid_0's auc: 0.84852
[

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(   id    ACTION
 0   1  0.950332
 1   2  0.988110
 2   3  0.989183
 3   4  0.982371
 4   5  0.993227, (58921, 2))

In [0]:
LB: 0.87788

# Now some feature engineering

# Let's add frequency/count feature

In [8]:
features = []
for col in train.columns[1:]:
    freq_encoding = train.append(test).groupby([col]).size()
    freq_encoding = freq_encoding.reset_index().rename(columns={0:'count'})
    
    train['count_'+col] = train.merge(freq_encoding, on=col, how='left')['count']
    test ['count_'+col] = test.merge( freq_encoding, on=col, how='left')['count']
    features.append('count_'+col  )

train[features].head(10)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Unnamed: 0,count_RESOURCE,count_MGR_ID,count_ROLE_ROLLUP_1,count_ROLE_ROLLUP_2,count_ROLE_DEPTNAME,count_ROLE_TITLE,count_ROLE_FAMILY_DESC,count_ROLE_FAMILY,count_ROLE_CODE
0,7,145,59065,12155,180,9569,17996,28861,9569
1,93,34,59065,10920,406,213,29,3506,213
2,8,7,518,518,1645,3838,92,7768,3838
3,2,153,59065,10920,494,12082,3244,28861,12082
4,28,18,815,396,143,187,41,945,187
5,17,17,254,252,568,2970,233,7768,2970
6,1257,45,59065,10920,61,424,416,1325,424
7,58,12,59065,1119,820,73,12,2289,73
8,5,71,59065,3400,94,421,469,839,421
9,58,3,770,770,3396,3838,86,7768,3838


# Fit benchmark again + count features

In [9]:
features = train.columns[1:]

y_pred_train = np.zeros( train.shape[0] )
y_pred_test  = np.zeros( test.shape[0] )

for fold, (train_index, valid_index) in enumerate(kf.split( train )):
    
    train_stack = train[features].iloc[train_index].copy()
    valid_stack = train[features].iloc[valid_index].copy()
    test_stack  = test [features].copy()

    model = lgb.LGBMRegressor( **params )
    model.fit( 
        train_stack, train[target_name].iloc[train_index],
        eval_set = (valid_stack, train[target_name].iloc[valid_index] ),
        eval_metric = 'auc', early_stopping_rounds = 50, verbose=100
    )
    
    y_pred_train[valid_index] = model.predict( valid_stack )
    y_pred_test              += model.predict( test_stack ) 
y_pred_test /= 5.          

print( 'CV AUC:', roc_auc_score( train[target_name], y_pred_train ) )

sub = test[['id']]
sub['ACTION'] = y_pred_test
sub.to_csv( 'sub-amazon-lightgbm-benchmark-count.csv', index=False )
sub.head(), sub.shape

Training until validation scores don't improve for 50 rounds.
[100]	valid_0's binary_logloss: 0.164212	valid_0's auc: 0.876572
[200]	valid_0's binary_logloss: 0.150162	valid_0's auc: 0.884345
[300]	valid_0's binary_logloss: 0.143148	valid_0's auc: 0.890967
[400]	valid_0's binary_logloss: 0.139127	valid_0's auc: 0.892109
[500]	valid_0's binary_logloss: 0.136522	valid_0's auc: 0.894234
Early stopping, best iteration is:
[513]	valid_0's binary_logloss: 0.136312	valid_0's auc: 0.894368
Training until validation scores don't improve for 50 rounds.
[100]	valid_0's binary_logloss: 0.174546	valid_0's auc: 0.847942
[200]	valid_0's binary_logloss: 0.160329	valid_0's auc: 0.858344
[300]	valid_0's binary_logloss: 0.154167	valid_0's auc: 0.864283
[400]	valid_0's binary_logloss: 0.150957	valid_0's auc: 0.868217
[500]	valid_0's binary_logloss: 0.149573	valid_0's auc: 0.868597
Early stopping, best iteration is:
[452]	valid_0's binary_logloss: 0.150002	valid_0's auc: 0.868797
Training until validation 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(   id    ACTION
 0   1  0.849429
 1   2  0.981961
 2   3  0.990208
 3   4  0.987769
 4   5  0.991851, (58921, 2))

In [0]:
LB: 0.88830

# Target Encoding

# Basically Target Enconding uses Average Target value per category level

In [0]:
train.columns

Index(['ACTION', 'RESOURCE', 'MGR_ID', 'ROLE_ROLLUP_1', 'ROLE_ROLLUP_2',
       'ROLE_DEPTNAME', 'ROLE_TITLE', 'ROLE_FAMILY_DESC', 'ROLE_FAMILY',
       'ROLE_CODE', 'count_RESOURCE', 'count_MGR_ID', 'count_ROLE_ROLLUP_1',
       'count_ROLE_ROLLUP_2', 'count_ROLE_DEPTNAME', 'count_ROLE_TITLE',
       'count_ROLE_FAMILY_DESC', 'count_ROLE_FAMILY', 'count_ROLE_CODE'],
      dtype='object')

In [11]:
col = 'MGR_ID'
train.groupby(col)[target_name].agg(['mean','count']).reset_index(drop=False)

Unnamed: 0,MGR_ID,mean,count
0,0,0.793103,29
1,1,1.000000,17
2,2,1.000000,8
3,3,1.000000,4
4,4,0.967742,31
5,5,0.909091,11
6,6,1.000000,4
7,7,0.666667,3
8,8,0.875000,8
9,9,1.000000,5


In [12]:
for col in train.columns[1:10]:
    
    dt = train.groupby(col)[target_name].agg(['mean']).reset_index(drop=False)
    
    train['TE_'+col] = train.merge( dt, on=col, how='left' )['mean']
    
    score = roc_auc_score( train[target_name], train['TE_'+col] )
    print( str(score)[:6], col )

0.9182 RESOURCE
0.9460 MGR_ID
0.6178 ROLE_ROLLUP_1
0.6802 ROLE_ROLLUP_2
0.7769 ROLE_DEPTNAME
0.7148 ROLE_TITLE
0.8286 ROLE_FAMILY_DESC
0.6387 ROLE_FAMILY
0.7148 ROLE_CODE


# As you can see above it's possible to calculate AUC for each individual feature but, is it going to work in a model?

# Target encodings must be calculated out-of-fold

In [13]:
def target_encode_simple( df_train, df_valid, col, target ):
    global_mean = df_train[target].mean()

    dt  = df_train.groupby(col)[target].agg(['mean']).reset_index(drop=False)
    
    tmp = df_valid.merge( dt, on=col, how='left' )['mean'].values
    
    tmp[ np.isnan(tmp) ] = global_mean
    return tmp


print('Leak   OOF')
for col in train.columns[1:10]:
    dt = train.groupby(col)[target_name].agg(['mean']).reset_index(drop=False)
    train['TE_'+col] = train.merge( dt, on=col, how='left' )['mean']
    leak_score = roc_auc_score( train[target_name], train['TE_'+col] )
    
    te_array = np.zeros( train.shape[0] )
    for fold, (train_fold, valid_fold) in enumerate(kf.split( train )):
        te_array[valid_fold] = target_encode_simple( train.iloc[train_fold], train.iloc[valid_fold], col, target_name  )
    oof_score = roc_auc_score( train[target_name], te_array )
    
    print( str(leak_score)[:6], str(oof_score)[:6], col )

Leak   OOF
0.9182 0.6118 RESOURCE
0.9460 0.7902 MGR_ID
0.6178 0.5890 ROLE_ROLLUP_1
0.6802 0.6503 ROLE_ROLLUP_2
0.7769 0.7239 ROLE_DEPTNAME
0.7148 0.6789 ROLE_TITLE
0.8286 0.7354 ROLE_FAMILY_DESC
0.6387 0.6245 ROLE_FAMILY
0.7148 0.6789 ROLE_CODE


# Using simple target average is good, but imagine some category levels happens just a few times in the dataset.

# In this case count must be take into account

# Bayesian Average is a good solution in this case

In [0]:
def target_encode_bayesian( df_train, df_valid, col, target, global_count=1 ):
    global_mean = df_train[target].mean()
    
    dt = df_train.groupby(col)[target].agg(['mean','count']).reset_index(drop=False)
    
    dt['bayesian'] =  ((dt['mean']*dt['count']) + (global_mean*global_count)) / (dt['count']+global_count)
    
    tmp = df_valid.merge( dt, on=col, how='left' )['bayesian'].values
    
    tmp[ np.isnan(tmp) ] = global_mean

    return tmp

print( 'Simple Bayesian' )
for col in train.columns[1:10]:
    #Calc Simple average
    te_array = np.zeros( train.shape[0] )
    for fold, (train_fold, valid_fold) in enumerate(kf.split( train )):
        te_array[valid_fold] = target_encode_simple( train.iloc[train_fold],
                                                      train.iloc[valid_fold],
                                                      col,
                                                      target_name
                                                     )

    #Calc Bayesian average
    te_array_bayesian = np.zeros( train.shape[0] )
    for fold, (train_fold, valid_fold) in enumerate(kf.split( train )):
        te_array_bayesian[valid_fold] = target_encode_bayesian( train.iloc[train_fold],
                                                      train.iloc[valid_fold],
                                                      col,
                                                      target_name,
                                                      global_count=2
                                                     )
        
    print( str(roc_auc_score( train[target_name], te_array ))[:6],
          str(roc_auc_score( train[target_name], te_array_bayesian ))[:6],
          col )


Simple Bayesian
0.6118 0.6389 RESOURCE
0.7902 0.8150 MGR_ID
0.5890 0.5890 ROLE_ROLLUP_1
0.6503 0.6508 ROLE_ROLLUP_2
0.7239 0.7257 ROLE_DEPTNAME
0.6789 0.6795 ROLE_TITLE
0.7354 0.7404 ROLE_FAMILY_DESC
0.6245 0.6246 ROLE_FAMILY
0.6789 0.6795 ROLE_CODE


# Exploring N-way interactions using Target Encoding

In [0]:
train.groupby(['MGR_ID', 'ROLE_DEPTNAME', 'ROLE_FAMILY_DESC'])[target_name].agg(['mean','count']).reset_index(drop=False).head()

Unnamed: 0,MGR_ID,ROLE_DEPTNAME,ROLE_FAMILY_DESC,mean,count
0,0,288,7,0.647059,17
1,0,288,2357,1.0,11
2,0,288,2676,1.0,1
3,1,217,7,1.0,4
4,1,217,315,1.0,5


In [0]:
feat3way = ['MGR_ID', 'ROLE_DEPTNAME', 'ROLE_FAMILY_DESC']

te_array = np.zeros( train.shape[0] )
for fold, (train_fold, valid_fold) in enumerate(kf.split( train )):    
    te_array[valid_fold] = target_encode_bayesian( train.iloc[train_fold],
                                                  train.iloc[valid_fold],
                                                  feat3way,
                                                  target_name,
                                                  global_count = 2
                                                 )

score = roc_auc_score( train[target_name], te_array )
print( 'AUC:', str(score)[:6], feat3way )

AUC: 0.8409 ['MGR_ID', 'ROLE_DEPTNAME', 'ROLE_FAMILY_DESC']


# Build a solution using a simple Target Encoding technique 

In [0]:
print( feat3way )
te_array      = np.zeros( train.shape[0] )
for fold, (train_fold, valid_fold) in enumerate(kf.split( train )):
    te_array[valid_fold] = target_encode_simple( train.iloc[train_fold],
                                                train.iloc[valid_fold],
                                                feat3way,
                                                target_name  )
score = roc_auc_score( train[target_name], te_array )
print( str(score)[:6], col )

te_array_test = target_encode_simple( train,
                                     test,
                                     feat3way,
                                     target_name
                                    )

sub = test[['id']]
sub['ACTION'] = te_array_test
sub.to_csv( 'sub-amazon-1-target-encode-simple.csv', index=False )
sub.head(), sub.shape

['MGR_ID', 'ROLE_DEPTNAME', 'ROLE_FAMILY_DESC']
0.8206 ROLE_CODE


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(   id    ACTION
 0   1  0.666667
 1   2  1.000000
 2   3  1.000000
 3   4  1.000000
 4   5  1.000000, (58921, 2))

# Same as before but this time using Bayesian Average

In [0]:
print( feat3way )

te_array      = np.zeros( train.shape[0] )
for fold, (train_fold, valid_fold) in enumerate(kf.split( train )):
    te_array[valid_fold] = target_encode_bayesian( train.iloc[train_fold],
                                                  train.iloc[valid_fold],
                                                  feat3way,
                                                  target_name,
                                                  global_count = 2 )
score = roc_auc_score( train[target_name], te_array )
print( str(score)[:6], col )

te_array_test = target_encode_bayesian( train,
                                       test,
                                       feat3way,
                                       target_name,
                                       global_count = 2
                                      )

sub = test[['id']]
sub['ACTION'] = te_array_test
sub.to_csv( 'sub-amazon-target-encode-bayesian.csv', index=False )
sub.head(), sub.shape

['MGR_ID', 'ROLE_DEPTNAME', 'ROLE_FAMILY_DESC']
0.8409 ROLE_CODE


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(   id    ACTION
 0   1  0.776844
 1   2  0.990352
 2   3  0.991094
 3   4  0.994966
 4   5  0.990352, (58921, 2))

# Challenge:

## Given the 9 features, find the best 4-way combination of features in terms of AUC

In [0]:
features = train.columns[1:10]
print( 'Initial Features:', features )

RESULTS = []
for f1 in range(9):
    for f2 in range(f1+1, 9):
        for f3 in range(f2+1, 9):
            for f4 in range(f3+1, 9):
                feat1 = features[f1]
                feat2 = features[f2]
                feat3 = features[f3]
                feat4 = features[f4]

                te_array      = np.zeros( train.shape[0] )
                for fold, (train_fold, valid_fold) in enumerate(kf.split( train )):
                    te_array[valid_fold] = target_encode_bayesian( train.iloc[train_fold],
                                                                  train.iloc[valid_fold],
                                                                  [feat1,feat2,feat3,feat4],
                                                                  target_name,
                                                                  global_count = 2  )
                score = roc_auc_score( train[target_name], te_array )
                print( str(score)[:6], feat1, feat2, feat3, feat4 )

                RESULTS.append( [score, feat1, feat2, feat3, feat4 ]   )

Initial Features: Index(['RESOURCE', 'MGR_ID', 'ROLE_ROLLUP_1', 'ROLE_ROLLUP_2', 'ROLE_DEPTNAME',
       'ROLE_TITLE', 'ROLE_FAMILY_DESC', 'ROLE_FAMILY', 'ROLE_CODE'],
      dtype='object')
0.5876 RESOURCE MGR_ID ROLE_ROLLUP_1 ROLE_ROLLUP_2
0.5732 RESOURCE MGR_ID ROLE_ROLLUP_1 ROLE_DEPTNAME
0.5138 RESOURCE MGR_ID ROLE_ROLLUP_1 ROLE_TITLE
0.5182 RESOURCE MGR_ID ROLE_ROLLUP_1 ROLE_FAMILY_DESC
0.5681 RESOURCE MGR_ID ROLE_ROLLUP_1 ROLE_FAMILY
0.5138 RESOURCE MGR_ID ROLE_ROLLUP_1 ROLE_CODE
0.5727 RESOURCE MGR_ID ROLE_ROLLUP_2 ROLE_DEPTNAME
0.5140 RESOURCE MGR_ID ROLE_ROLLUP_2 ROLE_TITLE
0.5184 RESOURCE MGR_ID ROLE_ROLLUP_2 ROLE_FAMILY_DESC
0.5671 RESOURCE MGR_ID ROLE_ROLLUP_2 ROLE_FAMILY
0.5140 RESOURCE MGR_ID ROLE_ROLLUP_2 ROLE_CODE
0.5068 RESOURCE MGR_ID ROLE_DEPTNAME ROLE_TITLE
0.5108 RESOURCE MGR_ID ROLE_DEPTNAME ROLE_FAMILY_DESC
0.5553 RESOURCE MGR_ID ROLE_DEPTNAME ROLE_FAMILY
0.5068 RESOURCE MGR_ID ROLE_DEPTNAME ROLE_CODE
0.4917 RESOURCE MGR_ID ROLE_TITLE ROLE_FAMILY_DESC
0.5148 RESOU

In [0]:
way4 = pd.DataFrame( RESULTS )
way4.sort_values( 0, ascending=False )

Unnamed: 0,0,1,2,3,4
72,0.843111,MGR_ID,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_FAMILY_DESC
78,0.841910,MGR_ID,ROLE_ROLLUP_2,ROLE_FAMILY_DESC,ROLE_FAMILY
58,0.841700,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_FAMILY_DESC
84,0.841143,MGR_ID,ROLE_DEPTNAME,ROLE_FAMILY_DESC,ROLE_FAMILY
62,0.840985,MGR_ID,ROLE_ROLLUP_1,ROLE_DEPTNAME,ROLE_FAMILY_DESC
75,0.840118,MGR_ID,ROLE_ROLLUP_2,ROLE_TITLE,ROLE_FAMILY_DESC
79,0.840118,MGR_ID,ROLE_ROLLUP_2,ROLE_FAMILY_DESC,ROLE_CODE
85,0.838996,MGR_ID,ROLE_DEPTNAME,ROLE_FAMILY_DESC,ROLE_CODE
81,0.838996,MGR_ID,ROLE_DEPTNAME,ROLE_TITLE,ROLE_FAMILY_DESC
68,0.838522,MGR_ID,ROLE_ROLLUP_1,ROLE_FAMILY_DESC,ROLE_FAMILY


# Build a solution taking the average of all 4-way possible interactions

In [0]:
train_stack = pd.DataFrame()
test_stack  = pd.DataFrame()

for f1 in range(9):
    for f2 in range(f1+1, 9):
        for f3 in range(f2+1, 9):
            for f4 in range(f3+1, 9):
                feat1 = features[f1]
                feat2 = features[f2]
                feat3 = features[f3]
                feat4 = features[f4]
                name4way = feat1+'_'+feat2+'_'+feat3+'_'+feat4

                te_array = np.zeros( train.shape[0] )
                for fold, (train_fold, valid_fold) in enumerate(kf.split( train )):
                    te_array[valid_fold] = target_encode_bayesian( train.iloc[train_fold],
                                                                            train.iloc[valid_fold],
                                                                            [feat1,feat2,feat3,feat4],
                                                                            target_name,
                                                                            global_count = 2  ) 

                score = roc_auc_score( train[target_name], te_array )
                train_stack[name4way] = te_array
                print( str(score)[:6], feat1,feat2,feat3,feat4 )

                test_stack [name4way] = target_encode_bayesian( train,
                                                                test,
                                                                [feat1,feat2,feat3,feat4],
                                                                target_name,
                                                                2  ) 

0.5876 RESOURCE MGR_ID ROLE_ROLLUP_1 ROLE_ROLLUP_2
0.5732 RESOURCE MGR_ID ROLE_ROLLUP_1 ROLE_DEPTNAME
0.5138 RESOURCE MGR_ID ROLE_ROLLUP_1 ROLE_TITLE
0.5182 RESOURCE MGR_ID ROLE_ROLLUP_1 ROLE_FAMILY_DESC
0.5681 RESOURCE MGR_ID ROLE_ROLLUP_1 ROLE_FAMILY
0.5138 RESOURCE MGR_ID ROLE_ROLLUP_1 ROLE_CODE
0.5727 RESOURCE MGR_ID ROLE_ROLLUP_2 ROLE_DEPTNAME
0.5140 RESOURCE MGR_ID ROLE_ROLLUP_2 ROLE_TITLE
0.5184 RESOURCE MGR_ID ROLE_ROLLUP_2 ROLE_FAMILY_DESC
0.5671 RESOURCE MGR_ID ROLE_ROLLUP_2 ROLE_FAMILY
0.5140 RESOURCE MGR_ID ROLE_ROLLUP_2 ROLE_CODE
0.5068 RESOURCE MGR_ID ROLE_DEPTNAME ROLE_TITLE
0.5108 RESOURCE MGR_ID ROLE_DEPTNAME ROLE_FAMILY_DESC
0.5553 RESOURCE MGR_ID ROLE_DEPTNAME ROLE_FAMILY
0.5068 RESOURCE MGR_ID ROLE_DEPTNAME ROLE_CODE
0.4917 RESOURCE MGR_ID ROLE_TITLE ROLE_FAMILY_DESC
0.5148 RESOURCE MGR_ID ROLE_TITLE ROLE_FAMILY
0.5148 RESOURCE MGR_ID ROLE_TITLE ROLE_CODE
0.5183 RESOURCE MGR_ID ROLE_FAMILY_DESC ROLE_FAMILY
0.4917 RESOURCE MGR_ID ROLE_FAMILY_DESC ROLE_CODE
0.5148 RES

In [0]:
train_stack.head()

Unnamed: 0,RESOURCE_MGR_ID_ROLE_ROLLUP_1_ROLE_ROLLUP_2,RESOURCE_MGR_ID_ROLE_ROLLUP_1_ROLE_DEPTNAME,RESOURCE_MGR_ID_ROLE_ROLLUP_1_ROLE_TITLE,RESOURCE_MGR_ID_ROLE_ROLLUP_1_ROLE_FAMILY_DESC,RESOURCE_MGR_ID_ROLE_ROLLUP_1_ROLE_FAMILY,RESOURCE_MGR_ID_ROLE_ROLLUP_1_ROLE_CODE,RESOURCE_MGR_ID_ROLE_ROLLUP_2_ROLE_DEPTNAME,RESOURCE_MGR_ID_ROLE_ROLLUP_2_ROLE_TITLE,RESOURCE_MGR_ID_ROLE_ROLLUP_2_ROLE_FAMILY_DESC,RESOURCE_MGR_ID_ROLE_ROLLUP_2_ROLE_FAMILY,...,ROLE_ROLLUP_2_ROLE_DEPTNAME_ROLE_FAMILY_ROLE_CODE,ROLE_ROLLUP_2_ROLE_TITLE_ROLE_FAMILY_DESC_ROLE_FAMILY,ROLE_ROLLUP_2_ROLE_TITLE_ROLE_FAMILY_DESC_ROLE_CODE,ROLE_ROLLUP_2_ROLE_TITLE_ROLE_FAMILY_ROLE_CODE,ROLE_ROLLUP_2_ROLE_FAMILY_DESC_ROLE_FAMILY_ROLE_CODE,ROLE_DEPTNAME_ROLE_TITLE_ROLE_FAMILY_DESC_ROLE_FAMILY,ROLE_DEPTNAME_ROLE_TITLE_ROLE_FAMILY_DESC_ROLE_CODE,ROLE_DEPTNAME_ROLE_TITLE_ROLE_FAMILY_ROLE_CODE,ROLE_DEPTNAME_ROLE_FAMILY_DESC_ROLE_FAMILY_ROLE_CODE,ROLE_TITLE_ROLE_FAMILY_DESC_ROLE_FAMILY_ROLE_CODE
0,0.971181,0.942361,0.961574,0.971181,0.971181,0.961574,0.942361,0.961574,0.971181,0.971181,...,0.992315,0.982777,0.982777,0.986249,0.982777,0.993596,0.993596,0.994236,0.993596,0.968906
1,0.942743,0.942743,0.942743,0.942743,0.942743,0.942743,0.942743,0.942743,0.942743,0.942743,...,0.980914,0.980914,0.980914,0.993264,0.980914,0.980914,0.980914,0.980914,0.980914,0.990457
2,0.941102,0.941102,0.941102,0.941102,0.941102,0.941102,0.941102,0.941102,0.941102,0.941102,...,0.976441,0.960735,0.960735,0.925767,0.960735,0.960735,0.960735,0.874182,0.960735,0.960735
3,0.942361,0.942361,0.942361,0.942361,0.942361,0.942361,0.942361,0.942361,0.942361,0.942361,...,0.985325,0.979854,0.979854,0.947968,0.979854,0.997255,0.997255,0.986563,0.997255,0.953329
4,0.961574,0.942361,0.961574,0.961574,0.961574,0.961574,0.942361,0.961574,0.961574,0.961574,...,0.942361,0.980787,0.980787,0.980787,0.980787,0.961574,0.961574,0.961574,0.961574,0.898611


In [0]:
roc_auc_score( train[target_name], train_stack.mean( axis=1 ) )

0.8713272603610425

In [0]:
sub = test[['id']]
sub['ACTION'] = test_stack.mean( axis=1 )
sub.to_csv( 'sub-amazon-target-encode-bayesian-stack.csv', index=False )
sub.head(), sub.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


(   id    ACTION
 0   1  0.821059
 1   2  0.973523
 2   3  0.969538
 3   4  0.955638
 4   5  0.982195, (58921, 2))

In [0]:
sub['ACTION'].describe()

count    58921.000000
mean         0.942722
std          0.055697
min          0.470747
25%          0.945753
50%          0.961452
75%          0.968287
max          0.985672
Name: ACTION, dtype: float64

In [0]:
features = train.columns[1:10]
features_count = train.columns[10:19]

params = {'objective': 'binary',
         'learning_rate':0.01,
         'feature_fraction' : 0.70,
         'bagging_fraction' : 0.80,
         'bagging_freq' : 1,
         'min_child_weight': 0,
         'num_leaves'  : 63, #7, 15, 31, 63, 127, 255, 511, 1023
         'n_estimators': 9999,
         'seed': 1
         }

y_pred_train = np.zeros( train.shape[0] )
y_pred_train_rank = np.zeros( train.shape[0] )
y_pred_test  = np.zeros( test.shape[0] )
FOLD_AUC = []
for fold, (train_index, valid_index) in enumerate(kf.split( train )):
    train_fold = train.iloc[train_index].copy()
    valid_fold = train.iloc[valid_index].copy()
    
    train_stack = train[features_count].iloc[train_index].copy()
    valid_stack = train[features_count].iloc[valid_index].copy()
    test_stack  = test [features_count].copy()
    for f1 in range(9):
        for f2 in range(f1+1, 9):
            feat1 = features[f1]
            feat2 = features[f2]
            name4way = feat1+'_'+feat2
            #Calc on train apply to valid fold
            valid_stack[name4way] = target_encode_bayesian( train_fold,
                                                            valid_fold,
                                                            [feat1, feat2],
                                                            target_name,
                                                            global_count = 2 )

            #Calc on train apply to test
            test_stack [name4way] = target_encode_bayesian( train_fold,
                                                            test,
                                                            [feat1, feat2],
                                                            target_name,
                                                            global_count = 2 )

            #Calc inner train folds
            te_train_inner = np.zeros( train_fold.shape[0] )
            kf_inner = KFold(n_splits=6, shuffle=True, random_state=2019)
            for fold, (train_index_inner, valid_index_inner) in enumerate(kf_inner.split( train_fold )):
                te_train_inner[valid_index_inner] = target_encode_bayesian( train_fold.iloc[train_index_inner].copy(), 
                                                                            train_fold.iloc[valid_index_inner].copy(),
                                                                            [feat1, feat2],
                                                                            target_name,
                                                                            global_count = 2 )
            train_stack[name4way] = te_train_inner

    model = lgb.LGBMRegressor( **params )
    model.fit( 
        train_stack, train_fold[target_name],
        eval_set = (valid_stack, valid_fold[target_name] ),
        eval_metric = 'auc', early_stopping_rounds = 30, verbose=False
    )
    
    y_pred_train[valid_index] = model.predict( valid_stack ) 
    y_pred_train_rank[valid_index] = rankdata( y_pred_train[valid_index] ) / len(valid_index)
    y_pred_test              += model.predict( test_stack )
    
    score = roc_auc_score( train[target_name].iloc[valid_index], y_pred_train[valid_index] )
    print( 'Fold AUC:', score )
    FOLD_AUC.append( score )
y_pred_test /= 5.          

print( '-----------------' )
print( 'CV AUC:', roc_auc_score( train[target_name], y_pred_train_rank ) )
print( 'FOLD Mean AUC:', np.mean(FOLD_AUC) , np.std(FOLD_AUC) )

sub = test[['id']]
sub['ACTION'] = y_pred_test
sub.to_csv( 'sub-amazon-target-encode-2way-lightgbm.csv', index=False )
sub.head(), sub.shape

Fold AUC: 0.9161580901919776
Fold AUC: 0.8868636301015438
Fold AUC: 0.8812103444339939
Fold AUC: 0.8904943393007699
Fold AUC: 0.8812558110704767
-----------------
CV AUC: 0.8908050268402954
FOLD Mean AUC: 0.8911964430197525 0.012968333551250988


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(   id    ACTION
 0   1  0.895944
 1   2  0.964975
 2   3  0.963578
 3   4  0.962077
 4   5  0.965942, (58921, 2))