In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import KFold,StratifiedKFold
import warnings
import time
import sys
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
warnings.simplefilter(action='ignore', category=FutureWarning)

from sklearn.cluster import MiniBatchKMeans, KMeans

from numpy import nansum
from numpy import nanmean

from scipy import stats
import os

pd.options.display.max_rows = 999
pd.options.display.max_columns  = 999

<a id="1"></a> <br>
## 1. Loading the data

First, we load `new_merchant_transactions.csv` and `historical_transactions.csv`. These two files contain the same variables; the only difference between the two tables is where their transactions lie relative to a reference date. Boolean features are also converted to numeric values:

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to narrower dtypes.

    Integer columns are cast to the first of int8/int16/int32/int64 whose
    range strictly contains the column's min and max; float columns are
    cast to float16 or float32 on the same rule and to float64 otherwise.
    Columns whose dtype is not in ``numerics`` are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; it is modified in place.
    verbose : bool, default True
        When true, print the final memory usage and the percent reduction.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)
    bytes_per_mb = 1024**2

    start_mem = df.memory_usage().sum() / bytes_per_mb
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min, c_max = df[col].min(), df[col].max()
        if str(col_type).startswith('int'):
            # No candidate fits -> the column keeps its current dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Neither float16 nor float32 fits (this includes NaN bounds).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / bytes_per_mb
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [None]:
def aggregate_transactions(history,agg_func):
    """Aggregate ``history`` per ``card_id`` and flatten the column names.

    Parameters
    ----------
    history : pandas.DataFrame
        Transaction rows; must contain a ``card_id`` column.
    agg_func : dict
        Passed straight to ``groupby(...).agg``. Each resulting column label
        is joined with ``'_'``, so the labels are expected to be tuples of
        strings (e.g. ``{'col': ['mean', 'sum']}`` yields ``col_mean`` and
        ``col_sum``).

    Returns
    -------
    pandas.DataFrame
        One row per ``card_id``, with ``card_id`` as a regular column.
    """
    per_card = history.groupby(['card_id']).agg(agg_func)

    flat_names = []
    for label in per_card.columns.values:
        flat_names.append('_'.join(label).strip())
    per_card.columns = flat_names

    return per_card.reset_index()

In [None]:
from tqdm import tqdm
import gc

def getenccolname(colname,cols_agg):
    """Build the name of the target-encoding column for ``colname``.

    ``cols_agg`` is probed with ``in`` for each known aggregation, in the
    fixed order below; the first one found decides the prefix
    (``target<agg>enc_``). If none is found the plain ``targetenc_`` prefix
    is used.

    Parameters
    ----------
    colname : str
        Name of the column being encoded.
    cols_agg : str or container of str
        Aggregation name(s). Because ``in`` is used, a plain string is
        matched by substring.

    Returns
    -------
    str
        The prefixed column name.
    """
    # Order matters: it reproduces the precedence of the checks.
    precedence = ('var', 'std', 'sum', 'min', 'max', 'median', 'count', 'iqmean')
    for agg_name in precedence:
        if agg_name in cols_agg:
            return "target" + agg_name + "enc_" + colname
    return "targetenc_" + colname

def _zscore(x):
    if len(x) > 3:
        v = x.values
        m = (v.sum() - v) / (v.size - 1)
        print('v shape',v.shape)
        print('m shape',m.shape)
        print('v values',m.shape)
        print(v[0:5])
        print()
        print()
        print()
        print('m values',m.shape)
        print(m[0:5])
        vm = v - m[:, None]
        np.fill_diagonal(vm, 0)
        s = ((vm ** 2).sum(1) / (v.size - 2)) ** .5
        return (v - m) / s
    else:
        return np.zeros_like(x)

def iqrdata(df, targetcolname=None):
    """Keep the rows whose target lies inside the interquartile range.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows.
    targetcolname : str, optional
        Column to filter on. When omitted, the notebook-level variable
        ``targetcolname`` is used (the function previously depended on that
        global implicitly).

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` with ``Q1 <= target <= Q3`` (bounds inclusive).

    Raises
    ------
    NameError
        If ``targetcolname`` is not given and no such global exists.
    """
    if targetcolname is None:
        try:
            targetcolname = globals()['targetcolname']
        except KeyError:
            raise NameError("name 'targetcolname' is not defined") from None

    target = df[targetcolname]
    lower = target.quantile(0.25)
    upper = target.quantile(0.75)
    return df[(target >= lower) & (target <= upper)]

In [None]:
# # filename ='temp_concat.csv'
# # os.remove(filename)
# # os.listdir('.')

# tempdf = pd.DataFrame()
# tempdf['test'] = pd.Series([-3.5,12,4.2,18,np.nan,25,40,np.nan,1,5,32,15,12,45,98,152,3,6,51,12,8,92,77,102,31,42,np.nan,52,65,35,7,95.4,132])

# tempdf = tempdf[(tempdf['test'] >=10) | (tempdf['test'].isnull())]
# tempdf['indcol'] = tempdf.index
# print(tempdf)
# print(tempdf.shape)
# tempdf1 = tempdf[0:6]
# # print(tempdf1)
# # tempdf1['indcol'] = tempdf1.index
# tempdf1.to_csv('xyz.csv')
# tempdf2 = tempdf[6:10]
# # tempdf2['indcol'] = tempdf2.index
# tempdf2.to_csv('xyz.csv',mode='a')
# tempdf3 = tempdf[10:]
# # tempdf3['indcol'] = tempdf3.index
# tempdf3.to_csv('xyz.csv',mode='a')


# temp = pd.read_csv('xyz.csv',index_col =0,squeeze=True)
# print()
# print( temp[temp.isnull()].shape)
# # temp = pd.read_csv('xyz.csv',squeeze=True)
# print('temp')
# print(temp)
# print()

# temp_byhand= concat_byhand([tempdf1[['test','indcol']],tempdf2[['test','indcol']],tempdf3[['test','indcol']]])
# print('temp_byhand')
# print(temp_byhand)

# # temp_new = pd.DataFrame()
# temp_byhand.rename(columns={0: 'enccol', 1: 'indcol'}, inplace=True)
# temp_byhand['indcol'] = temp_byhand['indcol'].astype('int64')
# temp_byhand.set_index('indcol',inplace=True)
# print(temp_byhand)

# # train[enccol].to_frame().to_csv(filename)
# # val[enccol].to_frame().to_csv(filename, mode='a',header=False)
# # test[enccol].to_frame().to_csv(filename, mode='a',header=False)

# # filename ='temp_concat.csv'
# # enccol = 'targetenc_merchant_id'
# # tempdf = pd.DataFrame()
# # tempdf[enccol] = pd.read_csv(filename,index_col =0,squeeze=True,dtype={enccol: np.float64},nrows=15203900)
# # print(tempdf[enccol].head())

In [None]:
def concat_byhand(dfs):
    """Stack frames row-wise by round-tripping their raw values through a file.

    Each frame's ``.values`` are appended to a scratch binary file, which is
    then read back as a single 2-D array.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Frames to stack. All must have the same number of columns and the
        same (non-object) ``.values`` dtype, because the bytes are read back
        with a single dtype.

    Returns
    -------
    pandas.DataFrame
        All rows in input order, with columns labelled ``0 .. n-1``. Original
        column names and indexes are not kept. An empty ``dfs`` gives an
        empty frame.

    Raises
    ------
    ValueError
        If a frame has object dtype, or its dtype or column count differs
        from the first frame's.
    """
    import tempfile  # only this helper needs it

    mtot = 0
    n = None
    typ = None
    # A unique scratch file in the working directory avoids clashing with
    # another run using the same fixed name.
    fd, path = tempfile.mkstemp(suffix='.bin', dir='.')
    try:
        with os.fdopen(fd, 'wb') as f:
            for df in dfs:
                values = df.values
                if values.dtype.hasobject:
                    raise ValueError('concat_byhand cannot serialise object-dtype frames')
                if typ is None:
                    typ, n = values.dtype, values.shape[1]
                elif values.dtype != typ or values.shape[1] != n:
                    raise ValueError(
                        'concat_byhand needs frames with identical dtype and column count: '
                        'expected ({0}, {1}), got ({2}, {3})'.format(typ, n, values.dtype, values.shape[1]))
                mtot += values.shape[0]
                f.write(values.tobytes())
        if typ is None:
            return pd.DataFrame()
        # np.fromfile gives a writable array, so the result can be modified later.
        data = np.fromfile(path, dtype=typ).reshape(mtot, n)
    finally:
        # Remove the scratch file even when validation or I/O fails.
        os.remove(path)
    return pd.DataFrame(data=data, columns=list(range(n)))

In [None]:
def targetencode_train_main(train,val,test,catcolnames,targetcolname,
                             smoothing, min_samples_leaf,noise_level,cutoff,cols_agg=['mean']):
    """Add a smoothed mean target encoding for each categorical column.

    For every column in ``catcolnames`` the per-category target mean is
    computed on ``train``, blended with the overall ``train`` target mean
    through a sigmoid of the category count, perturbed with Gaussian noise,
    and mapped onto ``train``, ``val`` and ``test``.

    Parameters
    ----------
    train, val, test : pandas.DataFrame
        Frames to encode; the new column is written into each of them in
        place. An existing encoding column is removed from ``test`` first.
    catcolnames : sequence of str
        Categorical columns to encode.
    targetcolname : str
        Target column in ``train``.
    smoothing, min_samples_leaf, noise_level : sequence
        Per-column settings, indexed by the position of the column in
        ``catcolnames``.
    cutoff, cols_agg
        Accepted but not used by this function.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, val, test)`` — the same objects that were passed in.

    Notes
    -----
    Categories of ``val``/``test`` absent from ``train`` map to NaN.
    """
    for i,curcol in enumerate(catcolnames):
        print()
        print('curcol: ',curcol)
        print()

        enccol = getenccolname(curcol,'mean')
        # remove encoding column from test if it exists
        if enccol in test:
            del test[enccol]

        # Per-category mean and count of the target on the training rows.
        averages = train[[curcol,targetcolname]].groupby(curcol)[targetcolname].agg(["mean", "count"])

        # Sigmoid weight: close to 1 for frequent categories, close to 0 for rare ones.
        smoothing_v = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf[i]) / smoothing[i]))
        averages[enccol] = train[targetcolname].mean() * (1 - smoothing_v) + averages["mean"] * smoothing_v

        averages.drop(["mean", "count"], axis=1, inplace=True)

        # A private generator seeded with 42 draws the same noise as reseeding
        # the global RNG would, without resetting the process-wide random state.
        noise = np.random.RandomState(42).randn(len(averages[enccol])) * noise_level[i]
        averages[enccol] = averages[enccol] + noise

        del smoothing_v,noise;gc.collect()

        start = time.time()
        train[enccol] = train[curcol].map(averages[enccol])
        val[enccol] = val[curcol].map(averages[enccol])
        test[enccol] = test[curcol].map(averages[enccol])
        end = time.time()
        print('update exec time:',end- start)

        print(train[enccol].shape)
        print(val[enccol].shape)
        print(test[enccol].shape)

        del averages;gc.collect()

    return train,val,test

In [None]:
# curcol ='merchant_id'
# targetcolname= 'target'
# averages = merged_trans[[curcol,targetcolname]].groupby(curcol)[targetcolname].agg("count")
# averages = averages.reset_index()

In [None]:
# print(averages.loc[averages['target'] ==1,curcol].nunique())
# print(averages.loc[averages['target'] <2,curcol].nunique())
# print(averages.loc[averages['target'] <=5,curcol].nunique())
# print(averages.loc[averages['target'] <=15,curcol].nunique())
# print(averages.loc[averages['target'] <=25,curcol].nunique())
# print(averages.loc[averages['target'] <=50,curcol].nunique())
# print(averages.loc[averages['target'] <=100,curcol].nunique())
# print(averages.loc[averages['target'] >100,curcol].nunique())
# print(averages.loc[averages['target'] >1000,curcol].nunique())
# print(averages.loc[averages['target'] >2000,curcol].nunique())
# print(averages.loc[averages['target'] >10000,curcol].nunique())
# print(averages.loc[averages['target'] >50000,curcol].nunique())
# print(averages.loc[averages['target'] >100000,curcol].nunique())
# # print(averages.loc[averages['target'] >500000,curcol].nunique())

In [None]:
from functools import partial

def performsmoothing(averages,targetcolname,train,agg,countSeries,smoothing,min_samples_leaf,noise_level,global_agg_val=None):
    """Shrink per-group statistics towards a prior and add Gaussian noise.

    The result is ``prior * (1 - w) + averages[agg] * w + noise`` where
    ``w = sigmoid((countSeries - min_samples_leaf) / smoothing)``.

    Parameters
    ----------
    averages : pandas.DataFrame
        Per-group statistics; must contain the column ``agg``. A column
        named ``'newcol'`` holding the result is added to it in place.
    targetcolname : str
        Target column of ``train``, used only to derive the prior.
    train : pandas.DataFrame
        Rows from which the prior is computed when needed.
    agg : str
        Statistic to smooth. For ``'mean'`` the prior is ``global_agg_val``,
        or the NaN-ignoring mean of ``train[targetcolname]`` when that is
        ``None``. For ``'std'`` the prior is always
        ``train[targetcolname].std()``. For any other value
        ``global_agg_val`` is used as given.
    countSeries : pandas.Series
        Group sizes, aligned with ``averages``.
    smoothing, min_samples_leaf : number
        Slope and midpoint of the sigmoid weight.
    noise_level : number
        Multiplier applied to the standard-normal noise.
    global_agg_val : scalar or pandas.Series, optional
        Prior to shrink towards (see ``agg``).

    Returns
    -------
    pandas.Series
        The smoothed, noised values (``averages['newcol']``).
    """
    smoothing_v = 1 / (1 + np.exp(-((countSeries - min_samples_leaf) / smoothing)) )

    print('smoothing_v describe:',smoothing_v.describe())
    if agg=='mean':
        if global_agg_val is None:
            global_agg_val = np.nanmean(train[targetcolname].values)
    elif agg=='std':
        global_agg_val= train[targetcolname].std()

    newcol ='newcol'
    if agg=='std':
        print('std before smoothing:',averages[agg].head(25))

    print('averages[agg].shape:',averages[agg].shape)
    print('smoothing_v.shape:',smoothing_v.shape)

    averages[newcol] = global_agg_val * (1 - smoothing_v) + averages[agg] * smoothing_v

    # A private generator seeded with 42 draws the same noise as reseeding the
    # global RNG would, without resetting the process-wide random state.
    noise = np.random.RandomState(42).randn(len(averages[newcol])) * noise_level
    averages[newcol] = averages[newcol] + noise

    if agg=='std':
        print('std after smoothing:',averages[newcol].head(25))

    del smoothing_v,noise;gc.collect()

    return averages[newcol]
    

In [None]:
def leaveoneoutmean(data,enccol_sum,enccol_count,targetcol):
    """Mean of the group target with the current row's own target removed.

    Computes ``(data[enccol_sum] - data[targetcol]) / (data[enccol_count] - 1)``
    element-wise.
    """
    sum_without_row = data[enccol_sum] - data[targetcol]
    count_without_row = data[enccol_count] - 1
    return sum_without_row / count_without_row
def targetencode_merchant(card_ids_tr,card_ids_val,trans,catcolnames,targetcolname,
                         smoothing,min_samples_leaf,noise_level,min_group_count=3):
    """Write a smoothed mean target encoding into ``trans`` for each column.

    Rows of ``trans`` are split by ``card_id`` into train (``card_ids_tr``),
    validation (``card_ids_val``) and test (everything else). For every
    column in ``catcolnames`` the per-category target mean is computed on
    the train rows, smoothed by ``performsmoothing`` towards the category's
    mean ``merchant_category_id_mean``, and mapped onto all three splits.

    Parameters
    ----------
    card_ids_tr, card_ids_val : list-like
        Card ids of the training and validation folds.
    trans : pandas.DataFrame
        Transactions; must contain ``card_id``, ``merchant_category_id_mean``,
        ``targetcolname`` and every column of ``catcolnames``. It is modified
        in place.
    catcolnames : sequence of str
        Categorical columns to encode.
    targetcolname : str
        Target column.
    smoothing, min_samples_leaf, noise_level
        Forwarded unchanged to ``performsmoothing``.
    min_group_count : int, default 3
        Categories with fewer train rows than this are excluded from the
        statistics.

    Returns
    -------
    pandas.DataFrame
        ``trans`` with one added column per encoded column, named by
        ``getenccolname(col, 'mean')``.

    Notes
    -----
    Validation and test rows whose category has no encoding fall back to
    their own ``merchant_category_id_mean``; train rows in that situation
    keep NaN.
    """
    fallback_col = 'merchant_category_id_mean'

    tr_mask = trans['card_id'].isin(card_ids_tr)
    val_mask = trans['card_id'].isin(card_ids_val)
    test_mask = (~tr_mask) & (~val_mask)

    for curcol in catcolnames:
        print()
        print('curcol: ',curcol)
        print()

        train = trans.loc[tr_mask,[curcol,targetcolname,'card_id',fallback_col]]
        # Diagnostics use curcol: train only carries the column being encoded.
        print('merchant nunique before:',train[curcol].nunique())
        # Kept as a separate Series so no column is assigned on a filtered slice.
        group_count = train.groupby([curcol])['card_id'].transform('count')
        mask = (group_count>=min_group_count)
        print('merchant nunique of less than cutoff:',train.loc[~mask,curcol].nunique())
        train= train[mask]
        print('merchant nunique after cutoff:',train[curcol].nunique())
        print('valid merchant nunique :',trans.loc[val_mask,curcol].nunique())
        print('test merchant nunique :',trans.loc[test_mask,curcol].nunique())

        enccol_count = getenccolname(curcol,'count')
        enccol_mean = getenccolname(curcol,'mean')

        # The filter keeps or drops whole categories, so the per-category
        # count after filtering equals the group_count computed above.
        grouped = train.groupby(curcol)
        averages = pd.DataFrame({'mean': grouped[targetcolname].mean(),
                                 enccol_count: grouped['card_id'].count(),
                                 fallback_col: grouped[fallback_col].mean()})
        start = time.time()

        # An explicit prior is supplied, so performsmoothing never reads its
        # `train` argument here; pass the small frame, not a copy of trans.
        averages[enccol_mean] = performsmoothing(averages,targetcolname,train,'mean',averages[enccol_count],
                                          smoothing,min_samples_leaf,noise_level,global_agg_val=averages[fallback_col])
        averages.drop(['mean'],axis=1,inplace=True)

        trans.loc[tr_mask,enccol_mean] = trans.loc[tr_mask,curcol].map(averages[enccol_mean])

        trans.loc[val_mask,enccol_mean] = trans.loc[val_mask,curcol].map(averages[enccol_mean])
        # Fill NA in validation
        val_missing = val_mask & trans[enccol_mean].isnull()
        trans.loc[val_missing,enccol_mean] = trans.loc[val_missing,fallback_col]

        trans.loc[test_mask,enccol_mean] = trans.loc[test_mask,curcol].map(averages[enccol_mean])
        # Fill NA in test
        test_missing = test_mask & trans[enccol_mean].isnull()
        trans.loc[test_missing,enccol_mean] = trans.loc[test_missing,fallback_col]

        del train;gc.collect()

        end = time.time()
        print('trans update exec time:',end- start)

        del averages;gc.collect()

    return trans

In [None]:
def targetencode_card(trans,targetcolnames,
                        smoothing, min_samples_leaf,noise_level):
    """Aggregate transaction-level encodings into smoothed per-card means.

    Only transactions with ``targetcountenc_merchant_id >= 2`` are used. For
    every column in ``targetcolnames`` the per-card mean is smoothed by
    ``performsmoothing``, weighted by a per-card count equal to the card's
    maximum ``targetcountenc_merchant_id`` plus its number of non-null
    ``merchant_id`` values.

    Parameters
    ----------
    trans : pandas.DataFrame
        Transactions; must contain ``card_id``, ``merchant_id``,
        ``targetcountenc_merchant_id`` and every column of
        ``targetcolnames``.
    targetcolnames : sequence of str
        Encoding columns to aggregate (not the raw categorical columns).
    smoothing, min_samples_leaf, noise_level
        Forwarded unchanged to ``performsmoothing``.

    Returns
    -------
    pandas.DataFrame
        One row per card: ``card_id`` plus one ``<col>_mean`` column for
        every entry of ``targetcolnames``.
    """
    trans_cutoff = trans[trans['targetcountenc_merchant_id'] >=2]

    print('Target enc card:')

    # The per-card weight does not depend on the column being encoded, so it
    # is computed once and kept as a Series rather than as extra columns.
    by_card = trans_cutoff.groupby('card_id')
    card_net_count = by_card['targetcountenc_merchant_id'].max() + by_card['merchant_id'].count()

    encoded = {}
    # curcol belongs to target col name and not catcolnames
    for curcol in targetcolnames:
        print()
        print('curcol: ',curcol)
        print()

        averages = pd.DataFrame({'mean': by_card[curcol].mean(),
                                 'count': card_net_count})
        print('averages columns:',averages.columns)
        print('averages head:',averages.head())

        # Collected per column so that every requested encoding is returned.
        encoded[curcol+'_mean'] = performsmoothing(averages,curcol,trans_cutoff,'mean',averages['count'],
                                          smoothing,min_samples_leaf,noise_level)

        del averages;gc.collect()

    agg_history = pd.DataFrame(encoded, index=card_net_count.index)
    print(agg_history.columns)
    # Name the index explicitly so reset_index always yields a 'card_id' column.
    agg_history.index.name = 'card_id'
    agg_history.reset_index(inplace=True)

    return agg_history

In [None]:
def targetenc_merge(train,val,test,trans_agg,dispname):
    """Left-join ``trans_agg`` onto ``train``, ``val`` and ``test`` by ``card_id``.

    Parameters
    ----------
    train, val, test : pandas.DataFrame
        Frames that each contain a ``card_id`` column.
    trans_agg : pandas.DataFrame
        Per-card features to attach.
    dispname : str
        Label used in the completion message.

    Returns
    -------
    tuple of pandas.DataFrame
        The three merged frames, in ``(train, val, test)`` order.
    """
    merged = tuple(
        pd.merge(frame, trans_agg, on='card_id', how='left')
        for frame in (train, val, test)
    )
    # Drop the local reference and collect, as the caller may have released its own.
    del trans_agg
    gc.collect()
    print('{0} encoding merge complete'.format(dispname))
    return merged
    

In [None]:
def gensumtargetenc(train,val,test):
    """Add row-wise sum, mean and std of the mean target-encoding columns.

    The columns summarised are those of ``train`` whose name contains both
    ``'targetenc_'`` and ``'_mean'``; the same list is applied to ``val``
    and ``test``. Three columns — ``sum_targetenc``, ``mean_targetenc`` and
    ``std_targetenc`` — are written into each frame in place.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, val, test)`` — the same objects that were passed in.
    """
    targetenccols = [name for name in train.columns
                     if 'targetenc_' in name and '_mean' in name]
    for frame in (train, val, test):
        encodings = frame[targetenccols]
        frame['sum_targetenc'] = encodings.sum(axis=1)
        frame['mean_targetenc'] = encodings.mean(axis=1)
        frame['std_targetenc'] = encodings.std(axis=1)
    return train,val,test

def droptargetenccols(train, val,test):
    """Drop every target-encoding column from the three frames, in place.

    A column counts as an encoding when its name contains ``'targetenc'``
    or ``'targetstdenc'``. The list found in ``train`` is also applied to
    ``val``; ``test`` is scanned separately.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, val, test)`` — the same objects that were passed in.
    """
    def _is_encoding(name):
        return 'targetenc' in name or 'targetstdenc' in name

    train_val_cols = [name for name in train.columns if _is_encoding(name)]
    for frame in (train, val):
        frame.drop(train_val_cols, axis=1, inplace=True)

    test_cols = [name for name in test.columns if _is_encoding(name)]
    test.drop(test_cols, axis=1, inplace=True)

    return train, val,test

In [None]:
def targetencode_modelfold_mergedtrans(train, val,test,df_trans_data,df_trans_col_names,catcolnames,targetcolname,
                           smoothing, min_samples_leaf,noise_level, cutoff):
    """Target-encode each transaction table for one fold and merge the result.

    For every table in ``df_trans_data`` this:

    1. encodes it with ``targetencode_merchant`` using the card ids of
       ``train`` and ``val``;
    2. averages each ``targetenc_<col>`` column per ``card_id`` with
       ``aggregate_transactions``;
    3. prefixes the aggregated columns with the table's name and left-joins
       them onto ``train``, ``val`` and ``test`` (their original indexes are
       put back after the merge);
    4. drops the encoding columns from the transaction table again.

    Parameters
    ----------
    train, val, test : pandas.DataFrame
        Card-level frames containing ``card_id``.
    df_trans_data : sequence of pandas.DataFrame
        Transaction tables; each is modified in place during the call.
    df_trans_col_names : sequence of str
        Prefix for each table, matched by position.
    catcolnames : sequence of str
        Categorical columns to encode.
    targetcolname : str
        Target column in the transaction tables.
    smoothing, min_samples_leaf, noise_level
        Forwarded unchanged to ``targetencode_merchant``.
    cutoff
        Accepted but not used by this function.

    Returns
    -------
    tuple of pandas.DataFrame
        The merged ``(train, val, test)``.
    """
    # Card ids of the current fold decide which transactions count as train / validation.
    card_ids_tr = list(train['card_id'].unique())
    card_ids_val = list(val['card_id'].unique())

    for pos, trans in enumerate(df_trans_data):
        prefix = df_trans_col_names[pos]
        print()
        print('************** TRANS {0} **********************'.format(prefix))

        trans = targetencode_merchant(card_ids_tr,card_ids_val,trans,catcolnames,targetcolname,
                                      smoothing, min_samples_leaf,noise_level)

        agg_func = {'targetenc_'+curcol: ['mean'] for curcol in catcolnames}
        df_agg = aggregate_transactions(trans,agg_func)
        df_agg.columns = [c if c == 'card_id' else prefix + '_' + c for c in df_agg.columns]

        print('aggregate_transactions complete')

        # pd.merge returns a fresh index, so remember the originals and restore them.
        saved_indexes = (train.index, val.index, test.index)
        train,val,test = targetenc_merge(train,val,test,df_agg,prefix)
        train.index, val.index, test.index = saved_indexes

        # drop target enc in trans to clear memory
        enc_in_trans = [col for col in trans.columns if ('targetenc' in col) or ('targetstdenc' in col)]
        trans.drop(enc_in_trans,axis=1,inplace=True)

    print('Target Encoding completed')

    return train,val,test

In [None]:
# def targetencode_modelfold_mergedtrans(train, val,test,df_trans_data,df_trans_col_names,catcolnames,targetcolname,
#                            smoothing, min_samples_leaf,noise_level, cutoff):
#     #Retrieve current train card ids and filter out hist and new only for these card ids
#     card_ids_tr= list(train['card_id'].unique())
#     card_ids_val= list(val['card_id'].unique())

#     #Perform target encoding for each transaction data
#     for i,df in enumerate(df_trans_data):
#         print()
#         print('************** TRANS {0} **********************'.format(df_trans_col_names[i]))
        
#         df= targetencode_withoutfold(card_ids_tr,card_ids_val,df,catcolnames,targetcolname,
#                                      smoothing[i], 2000,noise_level[i], cutoff[i],50)
#         print('cutoff 100 complete')
#         df= targetencode_withoutfold(card_ids_tr,card_ids_val,df,catcolnames,targetcolname,
#                                      smoothing[i], 1000,noise_level[i], 100,1000)
# #         print('cutoff 1000 complete')
# #         df= targetencode_withoutfold(card_ids_tr,card_ids_val,df,catcolnames,targetcolname,
# #                                      smoothing[i], 1000,noise_level[i],1000,5000)        
# #         print('cutoff 5000 complete')
# #         df= targetencode_withoutfold(card_ids_tr,card_ids_val,df,catcolnames,targetcolname,
# #                                      smoothing[i], 5000,noise_level[i],5000,10000) 
# #         print('cutoff 10000 complete')
# #         df= targetencode_withoutfold(card_ids_tr,card_ids_val,df,catcolnames,targetcolname,
# #                                      smoothing[i], 10000,noise_level[i],10000,100000)         
# #         print('cutoff 100000 complete')
#         #Aggregate encoding data
# #         agg_func = {
# #         'targetenc_merchant_id' : ['mean'],
# #         }
#         agg_func={}
#         for curcol in catcolnames:
#             agg_func['targetenc_'+curcol] =['mean']
#             agg_func['targetstdenc_'+curcol] =['min','max']
# #             agg_func['targetenc_'+curcol] =['mean','std','min','max','count']

#         df_agg = aggregate_transactions(df,agg_func)
#         df_agg.columns = [df_trans_col_names[i] + '_' + c if c != 'card_id' else c for c in df_agg.columns]
    
#         print('aggregate_transactions complete')

#         train_index = train.index
#         val_index = val.index
#         test_index = test.index

#         train,val,test = targetenc_merge(train,val,test,df_agg,df_trans_col_names[i])    
        
#         #restore indices
#         train.index = train_index
#         val.index = val_index
#         test.index = test_index
    
#         #drop target enc in trans to clear memory
#         targetenccols = [col for col in df.columns if ('targetenc' in col) or ('targetstdenc' in col)]
#         df.drop(targetenccols,axis=1,inplace=True)

#     print('Target Encoding completed')
#     #generate sum target encoding columns
# #     if len(df_trans_data) >1:
# #     print('computing sum target encoding...')
# #     train,val,test=gensumtargetenc(train,val,test)

#     return train,val,test

In [None]:
# def targetencode_withoutfold(card_ids_tr,card_ids_val,trans,catcolnames,targetcolname,
#                              smoothing, min_samples_leaf,noise_level,cutoff_low,cutoff_high,cols_agg=['mean']):
#     #Target Encoding
    
# #     trans['indcol'] = trans.index
   
#     tr_mask = trans['card_id'].isin(card_ids_tr)
#     val_mask = trans['card_id'].isin(card_ids_val)
#     test_mask = (~tr_mask) & (~val_mask)

#     for curcol in catcolnames:
#         print()
#         print('curcol: ',curcol)
#         print()
# #         train = trans[tr_mask]
# #         val = trans[val_mask]
# #         test = trans[test_mask]
#         train = trans.loc[tr_mask,[curcol,targetcolname,'card_id']]
#         val = trans.loc[val_mask,[curcol,targetcolname,'card_id']]
#         test = trans.loc[test_mask,[curcol,targetcolname,'card_id']]

#         train['group_count'] = train.groupby([curcol])['card_id'].transform('count')
#         mask = (train['group_count']>=cutoff_low) &  (train['group_count']<cutoff_high)
# #         mask = (train['group_count']>=cutoff)
#         train= train[mask]
        
#         enccol = getenccolname(curcol,'mean')
#         enccol_std = getenccolname(curcol,'std')
        
#         averages = train[[curcol,targetcolname]].groupby(curcol)[targetcolname].agg(["std","mean", "count"])
# #         print('enccol_std name;',enccol_std)
# #         averages[enccol_std]= performsmoothing(averages,targetcolname,train,'std',enccol_std,
# #                                               smoothing,2000,noise_level)
        
#         averages[enccol]= performsmoothing(averages,targetcolname,train,'mean',enccol,
#                                           smoothing,min_samples_leaf,noise_level)

        
       
#         averages.drop(["std","mean", "count"], axis=1, inplace=True)

# #         #Smoothing the target encoding values
# #         #Add standard deviation also through smoothing
# #         averages = train[[curcol,targetcolname]].groupby(curcol)[targetcolname].agg(["std","mean", "count"])
# # #         print('averages before smoothing:',averages.head(15))
        
# #         smoothing_v = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
# #         averages[enccol] = train[targetcolname].mean() * (1 - smoothing_v) + averages["mean"] * smoothing_v

# #         averages.drop(["std","mean", "count"], axis=1, inplace=True)

# #         np.random.seed(42)
# #         noise = np.random.randn(len(averages[enccol])) * noise_level
# #         averages[enccol] = averages[enccol] + noise
        
# #         print('averages after smoothing:',averages.head(15))
# #         del smoothing_v,noise;gc.collect()

#         start = time.time()
    
#         trans.loc[(tr_mask),enccol] = trans.loc[(tr_mask),curcol].map(averages[enccol])
#         trans.loc[(tr_mask),enccol_std] = trans.loc[(tr_mask),curcol].map(averages[enccol_std])
#         del train;gc.collect()
#         trans.loc[(val_mask),enccol] = trans.loc[(val_mask),curcol].map(averages[enccol])
#         trans.loc[(val_mask),enccol_std] = trans.loc[(val_mask),curcol].map(averages[enccol_std])
#         del val;gc.collect()
#         trans.loc[(test_mask),enccol]  = trans.loc[(test_mask),curcol].map(averages[enccol])
#         trans.loc[(test_mask),enccol_std] = trans.loc[(test_mask),curcol].map(averages[enccol_std])
#         del test;gc.collect()
        
#         end = time.time()
#         print('trans update exec time:',end- start)
#         print('train cur merchant null but enccol nonnull  =', trans.loc[(tr_mask) & (trans[curcol].isnull()) & (~trans[enccol].isnull())].shape )
        
# #         print('train total  =',trans.loc[(tr_mask),curcol].nunique() )
# #         print('train na  =', trans.loc[(tr_mask) & (trans[enccol].isnull()),curcol].nunique() )
# #         print('test total  =', trans.loc[(val_mask),curcol].nunique())
# #         print('test na  =', trans.loc[(val_mask) & (trans[enccol].isnull()),curcol].nunique() )
# #         print('valid total  =', trans.loc[(test_mask),curcol].nunique() )
# #         print('valid na  =', trans.loc[(test_mask) & (trans[enccol].isnull()),curcol].nunique() )


#         print(trans[enccol].shape)
# #         print(trans.loc[trans[curcol].isin(list(averages.index[0:15])),[curcol,enccol]].head(30))
#         del averages;gc.collect()

# #         print('concat end')
# #             print('Fill NA for test...')
# #             global_mean= train[enccol].mean()
# #             test[enccol].fillna(global_mean,inplace=True)
            
# #     print('Target Encoding per Trans completed')
#     return trans

In [None]:
#Target Encoding on categorical features

# def targetencode_modelfold(train, val,test,hist,new,catcolnames,targetcolname,
#                            smoothing, min_samples_leaf,noise_level):
#     #Retrieve current train card ids and filter out hist and new only for these card ids
#     card_ids_tr= list(train['card_id'].unique())
#     card_ids_val= list(val['card_id'].unique())
    
#     #Perform target encoding for each transaction data
#     print()
#     print('************** HIST TRANS **********************')
#     hist= targetencode_withoutfold(card_ids_tr,card_ids_val,hist,catcolnames,targetcolname,smoothing[0], min_samples_leaf[0],noise_level[0])
#     print()
#     print('************** NEW TRANS **********************')
#     new = targetencode_withoutfold(card_ids_tr,card_ids_val,new,catcolnames,targetcolname,smoothing[1], min_samples_leaf[1],noise_level[1])
    
#     #Aggregate encoding data
#     agg_func = {
# #     'targetenc_merchant_id' : ['mean'],
#     'targetenc_merchant_id' : ['mean','std',np.ptp],
#     }

#     hist_agg = aggregate_transactions(hist,agg_func)
#     hist_agg.columns = ['hist_' + c if c != 'card_id' else c for c in hist_agg.columns]
# #     print('hist aggregate_transactions complete')

#     new_agg = aggregate_transactions(new,agg_func)
#     new_agg.columns = ['new_' + c if c != 'card_id' else c for c in hist_agg.columns]
    
#     print('aggregate_transactions complete')

#     #remove target encoding fields if present
#     targetenccols = [col for col in train.columns if 'targetenc_' in col]
#     train.drop(targetenccols,axis=1,inplace=True)
#     val.drop(targetenccols,axis=1,inplace=True)
#     targetenccols_test = [col for col in test.columns if 'targetenc_' in col]
#     test.drop(targetenccols_test,axis=1,inplace=True)

        
#     train_index = train.index
#     val_index = val.index
#     test_index = test.index
    
#     train,val,test = targetenc_merge(train,val,test,hist_agg,'history')    
#     train,val,test = targetenc_merge(train,val,test,new_agg,'new')    

#     #restore indices
#     train.index = train_index
#     val.index = val_index
#     test.index = test_index

#     print('Target Encoding completed')
    
#     #drop target enc in trans to clear memory
#     hist.drop(targetenccols,axis=1,inplace=True)
#     new.drop(targetenccols,axis=1,inplace=True)
    
#     return train,val,test

Read Data

In [None]:
# Path = '../input/elo-ref-2-data-conversion/'
# historical_transactions = pd.read_hdf(Path+'historical_transactions.hdf')
# new_transactions = pd.read_hdf(Path+'new_transactions.hdf')
# print('transactions read complete')

In [None]:
# Load both transaction tables from the converted-data input directory.
# index_col=0 makes the first CSV column the row index of each frame.
Path = '../input/elo-ref-2-data-conversion/'
historical_transactions = pd.read_csv(
    os.path.join(Path, 'historical_transactions.csv'),
    index_col=0,
)
new_transactions = pd.read_csv(
    os.path.join(Path, 'new_transactions.csv'),
    index_col=0,
)
print('transactions read complete')

In [None]:
# Shrink both transaction frames with reduce_mem_usage (defined near the top of
# the notebook), which downcasts numeric columns to a narrower dtype when the
# column's min/max fit inside that dtype's range.
# NOTE(review): the helper also appears to consider float16 for float columns —
# confirm the reduced precision is acceptable for the amount-like columns.
historical_transactions = reduce_mem_usage(historical_transactions)
new_transactions = reduce_mem_usage(new_transactions)

In [None]:
# train_raw = pd.read_csv('../input/elo-merchant-category-recommendation/train.csv')

In [None]:
# Path = '../input/elo-preproc-3/'
# train = pd.read_hdf(Path+'train_preproc.hdf')
# print('train read complete')
# test = pd.read_hdf(Path+'test_preproc.hdf')
# print('test read complete')

In [None]:
# Load the preprocessed card-level train and test tables.
# Note: `Path` is rebound here; it previously pointed at the transactions directory.
Path = '../input/elo-preproc-3/'
train = pd.read_csv(os.path.join(Path, 'train_preproc.csv'))
print('train read complete')
test = pd.read_csv(os.path.join(Path, 'test_preproc.csv'))
print('test read complete')

In [None]:
# target = train_raw['target']
# train['target'] = target

**Test with Label**

In [None]:
# #train and test sample
# train_copy = train.copy()
# test_copy = test.copy()

In [None]:
# train = train_copy.sample(frac=0.01)
# print(train.shape)

In [None]:
# #split train to train and test for target enc test check
# n_splits=5
# folds_test = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=4590)
# for fold_, (trn_idx, val_idx) in enumerate(folds_test.split(train,train['outliers'].values)):
#     print("fold {}".format(fold_))
#     test = train.iloc[val_idx]
#     test_target=train['target'].iloc[val_idx]
#     train = train.iloc[trn_idx]
#     target=train['target']
    
#     break

# print(train.shape)
# print(test.shape)

In [None]:
# # Filter history and new trans to contain only the new train and test ids
# filter_cardids = list(set(train['card_id'].unique()).union(test['card_id'].unique()))
# historical_transactions = historical_transactions[historical_transactions['card_id'].isin(filter_cardids)]
# new_transactions = new_transactions[new_transactions['card_id'].isin(filter_cardids)]
# print(historical_transactions.shape)
# print(new_transactions.shape)

**K-Fold Setting**

In [None]:
# Switch for the (currently commented-out) training loop: when True, that loop
# also scores its test predictions against `test_target`. Leave False unless
# `test` was carved out of the labelled training data.
hastestlabels = False

In [None]:
# 5-fold stratified, shuffled split with a fixed seed. The fold loop further
# down calls folds.split(train, train['outliers'].values), i.e. it stratifies
# on the binary outlier flag.
n_splits=5
folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=4590)

In [None]:
# Keep a reference to the label column; it is assigned back onto `train`
# (train['target'] = target) further down, before the outlier flag is rebuilt.
target = train['target']

In [None]:
# historical_group =historical_transactions[trans_cols].groupby(['merchant_id','card_id']).agg('first').reset_index()
# print(historical_group.head())
# historical_transactions['merch_card_count'] =historical_group.groupby(['merchant_id'])['card_id'].transform('count')

In [None]:
#merge target to history data 
# also add merch_card_purchase_count
# historical_transactions =historical_transactions.groupby(['merchant_id','card_id']).size().reset_index(name='merch_card_purchase_count')
# print('history grouping complete')

# new_transactions =new_transactions.groupby(['merchant_id','card_id']).size().reset_index(name='merch_card_purchase_count')
# print('new grouping complete')

# trans_cols =['merchant_id','card_id','merchant_category_id','subsector_id']

# historical_transactions =historical_transactions[trans_cols].groupby(['merchant_id','card_id']).agg('first').reset_index()
# # historical_transactions['merch_card_count'] =historical_transactions.groupby(['merchant_id'])['card_id'].transform('count')
# print('history grouping complete')

# new_transactions =new_transactions[trans_cols].groupby(['merchant_id','card_id']).agg('first').reset_index()
# # new_transactions['merch_card_count'] =new_transactions.groupby(['merchant_id'])['card_id'].transform('count')
# print('new grouping complete')


# Attach each card's training target to every one of its transactions.
# The left join keeps all transactions; cards that are absent from `train`
# end up with target = NaN.
historical_transactions = pd.merge(historical_transactions,train[['target','card_id']],on='card_id',how='left')
print('history merge complete')

new_transactions = pd.merge(new_transactions,train[['target','card_id']],on='card_id',how='left')
print('new merge complete')

# Row-level mean target per category level, broadcast back to every transaction
# of that level (NaN targets are ignored by the mean).
# NOTE(review): the means are taken over every transaction that has a target,
# with no out-of-fold split, so each row's own target contributes to its own
# encoding — confirm these columns are not used directly as model features.
for i,df in enumerate([historical_transactions,new_transactions]):
    for keycol in ['merchant_category_id','subsector_id']:
        # Select the target Series explicitly so a Series (not a one-column
        # DataFrame) is assigned to the new column.
        df[keycol+'_mean'] = df.groupby(keycol)['target'].transform('mean')
    print('grouping target mean completed:',i)

# Flag transactions of cards whose target is below -30. NaN targets compare
# False, so transactions without a target keep outliers = 0.
for name,df in [('history',historical_transactions),('new',new_transactions)]:
    df['outliers'] = 0
    df.loc[df['target'] < -30, 'outliers'] = 1
    # A bare value_counts() in the middle of a cell is never displayed by
    # Jupyter (only the last expression is), so print it explicitly.
    print(name,'outliers:')
    print(df['outliers'].value_counts())

print('target process complete')

In [None]:

# tr_mask = ~historical_transactions['target'].isnull()
# train_temp = historical_transactions.loc[tr_mask,['merchant_id','target','card_id','merch_card_count']]
# mask = (train_temp['merch_card_count']>=2) 
# train_temp= train_temp[mask]

In [None]:
# averages = train_temp[['merchant_id','card_id']].groupby('merchant_id')['card_id'].agg(["count"])

In [None]:
# print(averages.head())
# print(averages[averages['count']==1].head())

In [None]:
# Name of the label column; passed to targetencode_modelfold_mergedtrans in the
# fold loop further down.
targetcolname ='target'

In [None]:
# #Computer merchant level outliers
# import scipy.stats as sss

# for i,df in enumerate([historical_transactions,new_transactions]):
#     df['merch_zscore']=df.groupby(['merchant_id'])[targetcolname].transform(sss.zscore)
#     df.loc[df['merch_zscore'].abs()>3,'target'] = np.nan
#     print('transaction complete:i',i)
    
# print('merchant outlier complete')
# #only 12 outliers detected and need to explore

**Old Target Encoding**

In [None]:
# print(historical_transactions[historical_transactions['merch_card_count']>=10].shape)
# new_transactions[new_transactions['merch_card_count']>=10].shape

In [None]:
# #No of test records whose merchant id in train is having only one card

# mask = historical_transactions['target'].isnull()

# hist_merch_ids = historical_transactions.loc[(~mask) & (historical_transactions['merch_card_count']<10),'merchant_id'].unique()
# train_w_test_single =  historical_transactions.loc[(mask) & (historical_transactions['merchant_id'].isin(hist_merch_ids)),'merchant_id'].unique()
# print(len(train_w_test_single))

# mask = new_transactions['target'].isnull()
# new_merch_ids = new_transactions.loc[(~mask) & (new_transactions['merch_card_count']<10),'merchant_id'].unique()
# train_w_test_single =  new_transactions.loc[(mask) & (new_transactions['merchant_id'].isin(new_merch_ids)),'merchant_id'].unique()
# print(len(train_w_test_single))

# # 51% of hist test merchants and 65% of new test merchants will be NA for merchant card count < 10 in train

In [None]:
# mask = historical_transactions['target'].isnull()
# print(historical_transactions.loc[(mask),'merchant_id'].nunique())
# mask = new_transactions['target'].isnull()
# print(new_transactions.loc[(mask),'merchant_id'].nunique())

In [None]:
# #for train trans- compute target encoding average for each merchant record excluding the current card target value
# #for test trans- compute target encoding average including all merchant records
# for df in [historical_transactions,new_transactions]:
# # for df in [historical_transactions]:
#     mask = df['target'].isnull()
#     # for train 
#     df.loc[~mask,'targetenc_mean_merchant_id']= df.loc[~mask,'targetsumenc_merchant_id'] - df.loc[~mask,'target'] / (df.loc[~mask,'targetcountenc_merchant_id']- 1)
#     # for test 
#     df.loc[mask,'targetenc_mean_merchant_id']= df.loc[mask,'targetsumenc_merchant_id'] / df.loc[mask,'targetcountenc_merchant_id']
# #         df['targetenc_mean']= df['targetenc_sum'] - df['target'] * df['merch_card_purchase_count'] / (df['targetenc_count']- df['merch_card_purchase_count'])

In [None]:
# # no  of null merchants
# for i,df in enumerate([historical_transactions,new_transactions]):
#     mask = df['target'].isnull()
#     print('i:',i)
#     print('train total merchant count:',df.loc[(~mask),'merchant_id'].nunique())
#     print('train null merchant count:',df.loc[(~mask) & (df['targetsumenc_merchant_id'].isnull()),'merchant_id'].nunique())
#     print('test total merchant count:',df.loc[(mask),'merchant_id'].nunique())
#     print('test null merchant count:',df.loc[(mask) & (df['targetsumenc_merchant_id'].isnull()),'merchant_id'].nunique())

**Save Data**

In [None]:
# train.to_hdf('train_preproc.hdf',key='data')
# test.to_hdf('test_preproc.hdf',key='data')

In [None]:
# Write the saved label column back onto train, then rebuild the binary outlier
# flag: 1 where target < -30, 0 everywhere else (including NaN targets).
train['target'] = target
train['outliers'] = (train['target'] < -30).astype('int64')
train['outliers'].value_counts()

In [None]:
# Hand-picked card-level feature names used as the base model inputs
# (`features = sel_feats` below). The four commented-out `new_*` entries are
# deliberately left out of the list.
sel_feats = ['feature_1',
 'feature_2',
 'feature_3',
 'hist_authorized_flag_mean',
 'hist_authorized_flag_sum',
 'hist_card_id_size',
 'hist_category_1_mean',
 'hist_category_1_sum',
 'hist_category_3_mean_mean',
 'hist_first_buy',
 'hist_installments_min',
 'hist_installments_sum',
 'hist_merchant_group_id_nunique',
 'hist_merchant_id_nunique',
 'hist_month_diff_mean',
 'hist_month_lag_max',
 'hist_month_lag_mean',
 'hist_month_nunique',
 'hist_most_recent_sales_range_std',
 'hist_purchase_amount_max',
 'hist_purchase_amount_mean',
 'hist_purchase_amount_min',
 'hist_purchase_amount_sum',
 'hist_purchase_amount_var',
 'hist_purchase_date_average',
 'hist_purchase_date_count_mean',
 'hist_purchase_date_diff',
 'hist_purchase_date_max',
 'hist_purchase_date_min',
 'hist_purchase_date_uptonow',
 'hist_purchase_duration_max_max',
 'hist_purchase_duration_max_mean',
 'hist_repeat_purchase_amount_sum_max',
 'hist_repeat_purchase_amount_sum_mean',
 'hist_repeat_purchase_amount_sum_min',
 'hist_subsector_id_nunique',
 'hist_sum_purchases_lag_max',
 'hist_sum_purchases_lag_min',
 'hist_sum_purchases_lag_std',
 'hist_sum_sales_lag_max',
 'hist_sum_sales_lag_mean',
 'hist_sum_sales_lag_min',
 'hist_sum_sales_lag_sum',
 'hist_sum_sales_p_purchases_lag_std',
 'hist_sum_sales_p_purchases_lag_sum',
 'hist_weekofyear_nunique',
 'hist_year_nunique',
 'new_hist_card_id_size',
 'new_hist_category_1_mean',
 'new_hist_category_1_sum',
 'new_hist_category_2_mean_mean',
 'new_hist_category_3_mean_mean',
 'new_hist_hour_nunique',
 'new_hist_installments_max',
 'new_hist_installments_mean',
 'new_hist_installments_min',
 'new_hist_installments_var',
 'new_hist_merchant_category_id_nunique',
 'new_hist_merchant_id_nunique',
 'new_hist_month_diff_mean',
 'new_hist_month_lag_max',
 'new_hist_month_lag_mean',
 'new_hist_month_lag_var',
 'new_hist_purchase_amount_max',
 'new_hist_purchase_amount_mean',
 'new_hist_purchase_amount_min',
 'new_hist_purchase_amount_var',
 'new_hist_purchase_date_average',
 'new_hist_purchase_date_diff',
 'new_hist_purchase_date_max',
 'new_hist_purchase_date_min',
 'new_hist_purchase_date_uptonow',
 'new_hist_subsector_id_nunique',
 'new_most_recent_purchases_range_std',
#  'new_sum_purchases_lag_std',
#  'new_sum_purchases_lag_sum',
#  'new_sum_sales_lag_sum',
#  'new_sum_sales_p_purchases_lag_sum',
 'weekofyear']

KMeans on all features before target encoding (i.e. on every feature except the target-encoding features)

In [None]:
# #KMeans na replaced col creation
# for df in [train,test]:
#     print('*****************')
#     for col in sel_feats:
#         if df.isnull().any()[col]:
#             print(col)
#             newcol =  col+'_na_replaced'
#             for df_l2 in [train,test]:
#                 if newcol not in df_l2.columns:
#                     df_l2[newcol] = df_l2[col]
#             df[newcol].fillna(df[col].mean(),inplace=True)

In [None]:
# # Kmeans feats : add null filled columns  and remove original of these columns
# orig_cols  = [col.replace('_na_replaced','') for col in list(train.columns)  if '_na_replaced'  in col]
# na_replaced_cols  = [col for col in list(train.columns) if '_na_replaced'  in col]
# # print(orig_cols)
# sel_feats_kmeans = sel_feats.copy() 
# for val in orig_cols:
#     sel_feats_kmeans.remove(val)

# sel_feats_kmeans += na_replaced_cols
# print(len(sel_feats_kmeans))
# print(sel_feats_kmeans)

In [None]:
# kmeans = KMeans(n_clusters=15,random_state=2018)
# print('kmeans fit start..')
# # kmeans = kmeans.fit(pd.concat([train[sel_feats_kmeans],test[sel_feats_kmeans]]))
# kmeans = kmeans.fit(train[sel_feats_kmeans])
# print('kmeans fit end')
# # 
# train['kmeans_cluster']=kmeans.predict(train[sel_feats_kmeans])
# test['kmeans_cluster']=kmeans.predict(test[sel_feats_kmeans])

In [None]:
# #save kmeans cluster data
# train[['card_id','kmeans_cluster']].to_hdf('train_kmeanscluster.hdf',key='data')
# test[['card_id','kmeans_cluster']].to_hdf('test_kmeanscluster.hdf',key='data')

In [None]:
# Identifier / label columns that must never be used as model inputs.
excluded_cols = ['card_id', 'first_active_month','target','outliers']

# Base model inputs: the hand-picked list itself (same list object, not a copy).
# Alternatives tried earlier: every train column not in excluded_cols, or
# sel_feats + ['kmeans_cluster'].
features = sel_feats

# Show both lists, one per line, for a quick visual check.
print(excluded_cols, features, sep='\n')

We then set the hyperparameters of the LGBM model:

We now train the model. Here, we use a standard KFold split of the dataset in order to validate the results and to stop the training. Interestingly, during the writing of this kernel, the model was enriched by adding new features, which improved the CV score. The variations observed on the CV were found to be quite similar to the variations on the LB: it seems that the current competition won't give us headaches when defining the correct validation scheme:

In [None]:
# print(train['hist_targetenc_merchant_id_mean'].isna().sum())
# print(test['hist_targetenc_merchant_id_mean'].isna().sum())

In [None]:
# LightGBM hyperparameters for the regression model (RMSE metric).
#
# 'min_child_samples' has been removed: in LightGBM it is an alias of
# 'min_data_in_leaf', so passing both (20 vs 30) was contradictory. LightGBM
# keeps the primary name and ignores the alias when both are given, so the
# effective setting (30) is unchanged; only the ambiguity and the warning go away.
param = {'num_leaves': 31,
         'min_data_in_leaf': 30,
         'objective':'regression',
         'max_depth': -1,          # -1 = no depth limit
         'learning_rate': 0.01,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,        # re-sample rows every iteration
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "nthread": 4,
         "random_state": 4590}

In [None]:
# catcolnames =['merchant_id','state_id','city_id','merchant_category_id','subsector_id','category_1','category_2','category_3']
# Categorical columns to target-encode; passed to
# targetencode_modelfold_mergedtrans in the fold loop. Only merchant_id is enabled.
catcolnames =['merchant_id']

In [None]:
# Scalar settings handed to targetencode_modelfold_mergedtrans in the fold loop.
min_samples_leaf, smoothing = 2000, 10
noise_level, cutoff = 0.1, 2

In [None]:
# min_samples_leaf=[2000,2000,2000,2000,2000]
# smoothing=[10]
# noise_level=[0.1]
# cutoff =[2]

In [None]:
# Per-column settings for target encoding of the card-level ("main") columns,
# written one row per column and transposed into four parallel lists:
#   (column name, min_samples_leaf, smoothing, noise_level)
# Author's note kept from the original cell (presumably the number of distinct
# levels per column — verify):
# dayofweek - 7, weekofyear - 21, month -12,  elapsed_time - 75, feature_1 - 5, feature_2 - 3, feature_3 - 2
catcolnames_main, min_samples_leaf_main, smoothing_main, noise_level_main = (
    list(column) for column in zip(
        ('dayofweek', 400, 10, 0.1),
        ('weekofyear', 1000, 10, 0.1),
        ('month', 650, 10, 0.1),
        ('elapsed_time', 4000, 10, 0.1),
        ('feature_1', 250, 10, 0.1),
        ('feature_2', 150, 10, 0.1),
        ('feature_3', 100, 10, 0.1),
    )
)

In [None]:
# hist_auth = historical_transactions[historical_transactions['authorized_flag']==1]
# hist_unauth = historical_transactions[historical_transactions['authorized_flag']==0]
# new_auth = new_transactions[new_transactions['authorized_flag']==1]
# new_unauth = new_transactions[new_transactions['authorized_flag']==0]

# merged_dfs=[hist_auth,hist_unauth,new_auth]
# merged_colnames =['hist_auth','hist_unauth','new']

# del historical_transactions,new_transactions;gc.collect()

In [None]:
# #Sample temp code
# temp_df=[]
# for i,df in enumerate(merged_dfs):
#     cur_df  = df.sample(frac=0.01)
#     temp_df += [cur_df]
#     print(cur_df.shape)

# merged_dfs= temp_df

In [None]:
# historical_transactions = pd.concat([hist_auth,hist_unauth])
# new_transactions = pd.concat([new_auth,new_unauth])

# del hist_auth,hist_unauth,new_auth,new_unauth;gc.collect()

In [None]:
# gc is not among the imports in the notebook's first cell; import it here so
# this cell does not fail with a NameError on a fresh kernel.
import gc

# Tag every row with its source table before stacking, so history (1) and
# new (0) transactions can still be told apart in the merged frame.
historical_transactions['ishist']=1
new_transactions['ishist']=0

# NOTE(review): concat keeps each frame's own row labels, so index values may
# repeat across the two halves — confirm downstream code does not rely on a
# unique index.
merged_trans = pd.concat([historical_transactions,new_transactions])

# Release the two source frames; merged_trans now holds all rows. This makes
# the cell non-rerunnable without reloading the transactions.
del historical_transactions,new_transactions;gc.collect()

# The fold loop expects a list of frames plus a parallel list of names.
merged_dfs = [merged_trans]
merged_colnames =['trans_merged']

In [None]:
# merged_trans_copy = merged_trans.copy()

In [None]:
# merged_trans = merged_trans_copy
# # merged_trans = merged_trans_copy.sample(frac=0.01)
# merged_dfs = [merged_trans]
# merged_trans.shape

In [None]:
# historical_transactions = merged_trans[merged_trans['ishist']==1]
# new_transactions = merged_trans[merged_trans['ishist']==0]
# del merged_trans;gc.collect()
# merged_dfs=[historical_transactions,new_transactions]
# merged_colnames =['history','new']

In [None]:
# Maximum number of boosting rounds; only referenced by the commented-out
# lgb.train calls further down.
num_round = 10000

In [None]:
# Number of CV folds; re-declares the same value as the earlier cell that first
# defines n_splits.
n_splits = 5

In [None]:
# Re-create the stratified splitter with the same arguments (shuffle, fixed
# seed 4590) as the earlier definition of `folds`.
folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=4590)

In [None]:
# Per-fold target encoding: for every CV fold, compute the target-encoding
# features for the fold's train / validation part and for test, then persist
# them (HDF and CSV) so a later model run can load them by fold number.
start = time.time()

# Accumulators kept from the full training loop; this encoding-only loop
# creates them but never fills them.
oof = np.zeros(len(train))
predictions = np.zeros(len(test))
feature_importance_df = pd.DataFrame()
valid_scores =[]
test_scores =[]
num_iterations =[]
fold_importance_df = pd.DataFrame()

# Folds are stratified on the binary outlier flag.
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train,train['outliers'].values)):
    print('******************************************************')
    print("FOLD  ---  {}".format(fold_))
    print('******************************************************')

    tr, val = train.iloc[trn_idx], train.iloc[val_idx]

    # Remove encoding columns left over from a previous fold. `test` is rebound
    # on every iteration, so it carries the previous fold's columns into this call.
    tr,val,test = droptargetenccols(tr,val,test)

    # Target encoding on the transaction-level categorical columns (merchant id).
    tr,val,test = targetencode_modelfold_mergedtrans(
        tr, val,test,merged_dfs,merged_colnames,catcolnames,targetcolname,
        smoothing,min_samples_leaf,noise_level,cutoff)

    enc_cols = [col for col in tr.columns if 'targetenc' in col]
    print('enc cols:',enc_cols)

    # Each output holds card_id plus the encoding columns; the fold number is
    # appended to the file stem. All HDF files are written before any CSV file.
    fold_outputs = [('train_targetenc_feats', tr),
                    ('val_targetenc_feats', val),
                    ('test_targetenc_feats', test)]

    print('save encoding feats...')
    for stem, frame in fold_outputs:
        frame[['card_id']+enc_cols].to_hdf(stem+str(fold_)+'.hdf',key='data')

    print('save encoding feats in csv...')
    for stem, frame in fold_outputs:
        frame[['card_id']+enc_cols].to_csv(stem+str(fold_)+'.csv')


end = time.time()
print('Target Enc Execution Time:',end-start)

In [None]:
# test_mask = merged_dfs[0]['target'].isnull()
# tr_merch_ids = merged_dfs[0].loc[~test_mask,'merchant_id'].unique()
# test_merch_ids = merged_dfs[0].loc[test_mask,'merchant_id'].unique()
# test_m_tr = set(test_merch_ids).difference(set(tr_merch_ids))
# print(len(test_m_tr))

In [None]:
# n_splits=2206
# fold_to_start = 52
# fold_to_stop = 54

In [None]:
# folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=4590)

In [None]:
# start = time.time()
# # folds = KFold(n_splits=n_splits, shuffle=True, random_state=4590)
# oof = np.zeros(len(train))
# predictions = np.zeros(len(test))
# feature_importance_df = pd.DataFrame()
# valid_scores =[]
# test_scores =[]
# num_iterations =[]
# fold_importance_df = pd.DataFrame()
# # fold_importance_df["feature"] = features
# # fold_importance_df["importance"] = 0

# for fold_, (trn_idx, val_idx) in enumerate(folds.split(train,train['outliers'].values)):
#     if (fold_to_stop is not None):
#         if (fold_ >=fold_to_stop):
#             break

#     if (fold_to_start is not None):
#         if (fold_ < fold_to_start):
#             continue
#     print('******************************************************')
#     print("FOLD  ---  {}".format(fold_))
#     print('******************************************************')
   
#     tr = train.iloc[trn_idx]
#     val = train.iloc[val_idx]
    
#     print('tr shape:',tr.shape)
#     print('val shape:',val.shape)
    
#     #drop any existing target enc cols
#     tr,val,test = droptargetenccols(tr,val,test)
    
# #     #target encoding on train main columns
# #     tr,val,test =  targetencode_train_main(tr,val,test,catcolnames_main,targetcolname,
# #                                          smoothing_main,min_samples_leaf_main,noise_level_main,cutoff)
# #     tr,val,test = gensumtargetenc(tr,val,test)
   
#     #target encoding on transaction merchant id
#     tr,val,test = targetencode_modelfold_mergedtrans(tr, val,test,merged_dfs,merged_colnames,catcolnames,targetcolname,
#                                          smoothing,min_samples_leaf,noise_level,cutoff)

#     enc_cols = [col for col in tr.columns if 'targetenc' in col]
# #     enc_cols = [col for col in tr.columns if ('targetenc' in col) or ('targetstdenc' in col)]
    
# #     enc_cols=[]
#     print('enc cols:',enc_cols)
#     print('save encoding feats...')
#     #save target encoding features in separate file
#     tr[['card_id']+enc_cols].to_hdf('train_targetenc_feats'+str(fold_)+'.hdf',key='data')
#     val[['card_id']+enc_cols].to_hdf('val_targetenc_feats'+str(fold_)+'.hdf',key='data')
#     test[['card_id']+enc_cols].to_hdf('test_targetenc_feats'+str(fold_)+'.hdf',key='data')
    
# #     print('enc cols for lgb train:',enc_cols)

# #     print('main train hist na merchants:',tr[tr['hist_targetenc_merchant_id_mean'].isnull()].shape)
# #     print('main train new na merchants:',tr[tr['new_hist_targetenc_merchant_id_mean'].isnull()].shape)
# #     print('main val hist na merchants:',val[val['hist_targetenc_merchant_id_mean'].isnull()].shape)
# #     print('main val new na merchants:',val[val['new_hist_targetenc_merchant_id_mean'].isnull()].shape)
# #     print('main test hist na merchants:',test[test['hist_targetenc_merchant_id_mean'].isnull()].shape)
# #     print('main test new na merchants:',test[test['new_hist_targetenc_merchant_id_mean'].isnull()].shape)    

#     cur_features = features + enc_cols
    
#     trn_data = lgb.Dataset(tr[cur_features], label=target.iloc[trn_idx])#, categorical_feature=categorical_feats)
#     val_data = lgb.Dataset(val[cur_features], label=target.iloc[val_idx])#, categorical_feature=categorical_feats)

#     clf = lgb.train(param, trn_data, num_round, valid_sets = [val_data], verbose_eval=100, early_stopping_rounds = 100)
#     oof[val_idx] = clf.predict(val[cur_features], num_iteration=clf.best_iteration)
    
#     fold_importance_df["feature"] = cur_features
#     if fold_==0:
#         fold_importance_df["importance"] =0
#     fold_importance_df["importance"] += clf.feature_importance() / n_splits
#     valid_scores+=[clf.best_score['valid_0']['rmse']]
#     num_iterations+=[clf.best_iteration]
#     cur_preds= clf.predict(test[cur_features], num_iteration=clf.best_iteration) 
#     predictions += cur_preds / folds.n_splits
    
#     if hastestlabels:
#         test_score=mean_squared_error(cur_preds, test_target)**0.5
#         print("test cur score: {:<8.5f}".format(test_score))        
#         test_scores += [test_score]
# print('num_iterations:',num_iterations)
# print('valid scores:',valid_scores)
# print("CV score: {:<8.5f}".format(mean_squared_error(oof, target)**0.5))
# if hastestlabels:
#     print("Test CV score: {:<8.5f}".format(mean_squared_error(predictions, test_target)**0.5))        
#     print('Test scores:',test_scores)
# end = time.time()
# print('Model Execution Time:',end-start)

In [None]:
# corr = train[['hist_targetenc_mean_merchant_id_mean','new_targetenc_mean_merchant_id_mean','target']].corr()
# corr

In [None]:

# n_splits=5
# folds = KFold(n_splits=n_splits, shuffle=True, random_state=15)
# oof = np.zeros(len(train))
# predictions = np.zeros(len(test))
# start = time.time()
# feature_importance_df = pd.DataFrame()
# valid_scores =[]
# fold_importance_df = pd.DataFrame()
# fold_importance_df["feature"] = features
# fold_importance_df["importance"] = 0
# for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
#     print("fold n°{}".format(fold_))
#     trn_data = lgb.Dataset(train.iloc[trn_idx][features], label=target.iloc[trn_idx], categorical_feature=categorical_feats)
#     val_data = lgb.Dataset(train.iloc[val_idx][features], label=target.iloc[val_idx], categorical_feature=categorical_feats)

#     num_round = 10000
#     clf = lgb.train(param, trn_data, num_round, valid_sets = [val_data], verbose_eval=100, early_stopping_rounds = 200)
#     oof[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    
#     fold_importance_df["importance"] += clf.feature_importance() / n_splits
# #     fold_importance_df["fold"] = fold_ + 1
#     feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
#     print(clf.best_score)
#     valid_scores+=[clf.best_score['valid_0']['rmse']]
#     predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits

# print('valid scores:',valid_scores)
# print("CV score: {:<8.5f}".format(mean_squared_error(oof, target)**0.5))

<a id="4"></a> <br>
## 4. Feature importance
Finally, we can have a look at the features that were used by the model:

In [None]:
# print(len(np.unique(oof)))
# print(train.shape)

In [None]:
# np.savetxt('LGB_targetenc_card_smoothing.npy',oof)

In [None]:
# cols = (fold_importance_df[["feature", "importance"]]
#         .groupby("feature")
#         .mean()
#         .sort_values(by="importance", ascending=False)[:1000].index)

# best_features = fold_importance_df.loc[fold_importance_df.feature.isin(cols)]

# plt.figure(figsize=(14,25))
# sns.barplot(x="importance",
#             y="feature",
#             data=best_features.sort_values(by="importance",
#                                            ascending=False))
# plt.title('LightGBM Features (avg over folds)')
# plt.tight_layout()
# plt.savefig('lgbm_importances.png')

In [None]:
# best_features=best_features.sort_values(by="importance", ascending=False)
# best_features.to_csv('best_features.csv')
# print(best_features[:100])

<a id="5"></a> <br>
## 5. Submission
Now, we just need to prepare the submission file:

In [None]:
# sub_df = pd.DataFrame({"card_id":test["card_id"].values})
# sub_df["target"] = predictions
# sub_df.to_csv("submit_targetenc_card_smoothing.csv", index=False)