In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import gc
import os
import sys
import time
from sklearn.metrics import roc_auc_score

warnings.filterwarnings('ignore')

print(os.getcwd())
code_path = '/root/ieee/pipeline/'
sys.path.append(code_path)


path = '/root/ieee/'

train_transaction = pd.read_csv(path + 'train_transaction.csv')
test_transaction = pd.read_csv(path + 'test_transaction.csv')

train_identity = pd.read_csv(path + 'train_identity.csv')
test_identity = pd.read_csv(path + 'test_identity.csv')
print("loading is over.")
train_transaction.sort_values('TransactionDT', inplace = True)
test_transaction.sort_values('TransactionDT', inplace = True)
train_transaction['nulls1'] = train_transaction.isna().sum(axis=1)
test_transaction['nulls1'] = test_transaction.isna().sum(axis=1)

card_feature = [col for col in train_transaction.columns if "card" in col] #category
addr_feature = [col for col in train_transaction.columns if "addr" in col] #category
dist_feature = [col for col in train_transaction.columns if "dist" in col] #numeric
mail_feature = [col for col in train_transaction.columns if "email" in col] #category

c_feature = [col for col in train_transaction.columns if "C" in col] #numeric
#C1-C14: counting, 
#    such as how many addresses are found to be associated with the payment card, etc. 
#The actual meaning is masked.
d_feature = [col for col in train_transaction.columns if "D" in col] #numeric 
#D1-D15: timedelta, such as days between previous transaction, etc.
d_feature.remove('TransactionID')
d_feature.remove('TransactionDT')
d_feature.remove('ProductCD')
c_feature.remove('ProductCD')
m_feature = [col for col in train_transaction.columns if "M" in col] #category
v_feature = [col for col in train_transaction.columns if "V" in col] #numeric

train_transaction = train_transaction.loc[train_transaction.card6 != 'debit or credit'].reset_index(drop = True)

C1_threshold = test_transaction.C1.mean() + test_transaction.C1.std() * 2.5
train_transaction = train_transaction[train_transaction.C1<C1_threshold].reset_index(drop = True)

# col = 'D1'
train_transaction['Transaction_day'] = np.floor((train_transaction['TransactionDT'] / (3600 * 24) - 1))
test_transaction['Transaction_day'] = np.floor((test_transaction['TransactionDT'] / (3600 * 24) - 1))
c_feat = [col for col in c_feature if col not in ['C13', 'C14']]

train_transaction['Transaction_day'] = np.floor((train_transaction['TransactionDT'] / (3600 * 24) - 1))
test_transaction['Transaction_day'] = np.floor((test_transaction['TransactionDT'] / (3600 * 24) - 1))
train_transaction['linear'] = train_transaction['Transaction_day']
test_transaction['linear'] = test_transaction['Transaction_day']
train_transaction['D4_new'] = train_transaction['D4']/test_transaction['linear']
test_transaction['D4_new'] = test_transaction['D4']/test_transaction['linear']


# prediction = pd.read_csv(path + '/sub/KFold_2019_0903_1608_0.94231_post_black:1626_white:13719_grey:378.csv')
for col in d_feature:
    train_transaction[col + '_new'] = train_transaction[col] - train_transaction['Transaction_day']
    test_transaction[col + '_new'] = test_transaction[col] - test_transaction['Transaction_day']

d_new_feature = ["{}_new".format(col) for col in d_feature if col not in ['D1']]
second_list = [col + card_feature + ['D1_new'] for col in [['C1', 'C2'], ['C1', 'C10'], ['C1', 'C11'],
                                                           ['C1', 'C2', 'C9', 'C10', 'C11', 'C12']] ]
count_list = [[col] + card_feature + ['D1_new'] for col in c_feat]
basis_list = [card_feature + addr_feature + c_feat + ['D1_new'],
              card_feature + addr_feature + ['D1_new']]

/root
loading is over.


In [None]:

import time
from contextlib import contextmanager
@contextmanager
def timer(title):
    t0 = time.time()
    yield
    print("{} - done in {:.0f}s".format(title, time.time() - t0))

from joblib import Parallel, delayed
def data_card_merged_online(train, test, col_id):
    used = ['TransactionID', 'Transaction_day'] + col_id
    tr = train[used + ['isFraud']]
    ts = test[used]
    data = pd.concat([tr, ts]).reset_index(drop = True)
    data.isFraud.fillna(-1, inplace = True)
    data = data.groupby(col_id).agg({k:lambda x: list(x)
    for k in ['TransactionID'] + ['isFraud']})
    data.columns = ["{}_list".format(col) for col in data.columns]
    data.reset_index(inplace = True)
    return data


def fraud_past(x):
    # ['isFraud_list']
    zero = x.count(0)
    one = x.count(1)
    minus = x.count(-1)
    if (zero > 0) & (minus > 0) & (one > 0):
        return 'link_grey'
    elif (zero * minus) > 0:
        return 'link_white'
    elif (one * minus) > 0:
        return 'link_black'
    elif len(x) == 1:
        return 'single'
    elif (zero > 0) & (one > 0):
        return 'grey'
    elif zero > 0:
        return 'white'
    elif one > 0:
        return 'black'
    else:
        return 'outlier'

def fraud_list(x):
    id_list = []
    for i in x:
        id_list += i
    return list(id_list)

def local_black_merge(train, test, key_id):
    data_card = data_card_merged_online(train, test, key_id)
    data_card['type'] = data_card['isFraud_list'].apply(fraud_past)
    data_card['record'] = data_card['isFraud_list'].apply(lambda x: x.count(0) + x.count(1))

    fraud = pd.DataFrame(
        fraud_list(data_card.loc[data_card['type'] == 'link_black','TransactionID_list']), 
        columns = ['TransactionID'])
    fraud['fraud_guess'] = 1
    temp = pd.DataFrame([])
    fraud_num = pd.merge(fraud, test[['TransactionID','isFraud']], 
                                     how = 'inner', on =['TransactionID'])
    fraud_num['category'] = "_".join(key_id)
    temp.loc[0, 'num'] = fraud_num.isFraud.sum()
    temp.loc[0, 'ratio'] = fraud_num.isFraud.mean()
    temp.loc[0, 'name'] = "_".join(key_id)
    temp.loc[0, 'card_cat'] = 'black'
    del data_card;gc.collect()
    del fraud;gc.collect()
    return fraud_num, temp

def local_special_black_merge(train, test, key_id):
    data_card = data_card_merged_online(train, test, key_id)
    data_card['type'] = data_card['isFraud_list'].apply(fraud_past)
    data_card['record'] = data_card['isFraud_list'].apply(lambda x: x.count(0) + x.count(1))

    fraud = pd.DataFrame(
        fraud_list(data_card.loc[(data_card['type'] == 'link_black')
                                 & (data_card['record'] > 2)
                                 ,'TransactionID_list']), 
        columns = ['TransactionID'])
    fraud['special_fraud_guess'] = 1
    temp = pd.DataFrame([])
    fraud_num = pd.merge(fraud, test[['TransactionID','isFraud']], 
                                     how = 'inner', on =['TransactionID'])
    fraud_num['category'] = "_".join(key_id)
    temp.loc[0, 'num'] = fraud_num.isFraud.sum()
    temp.loc[0, 'ratio'] = fraud_num.isFraud.mean()
    temp.loc[0, 'name'] = "_".join(key_id)
    temp.loc[0, 'card_cat'] = 'special_black'
    del data_card;gc.collect()
    del fraud;gc.collect()
    return fraud_num, temp

def local_grey_merge(train, test, key_id):
    data_card = data_card_merged_online(train, test, key_id)
    data_card['type'] = data_card['isFraud_list'].apply(fraud_past)
    data_card['record'] = data_card['isFraud_list'].apply(lambda x: x.count(0) + x.count(1))
    data_card['ratio'] = data_card['isFraud_list'].apply(lambda x: x.count(1)/(x.count(0) + x.count(1)) 
                                                         if (x.count(0) + x.count(1)) > 0 else len(x))
    grey = pd.DataFrame(
        fraud_list(data_card.loc[(data_card['type'] == 'link_grey') 
                                 & (data_card['record'] > 1)
                                 & (data_card['ratio'] >= 0.9),
                                 'TransactionID_list']), columns = ['TransactionID'])
    grey['grey_guess'] = 1
    temp = pd.DataFrame([])
    fraud_num = pd.merge(grey, test[['TransactionID','isFraud']], 
                                     how = 'inner', on =['TransactionID'])
    fraud_num['category'] = "_".join(key_id)
    temp.loc[0, 'num'] = fraud_num.isFraud.sum()
    temp.loc[0, 'ratio'] = fraud_num.isFraud.mean()
    temp.loc[0, 'name'] = "_".join(key_id)
    temp.loc[0, 'card_cat'] = 'grey'
    del data_card;gc.collect()
    del grey;gc.collect()
    return fraud_num, temp

def local_white_merge(train, test, key_id):
    data_card = data_card_merged_online(train, test, key_id)
    data_card['type'] = data_card['isFraud_list'].apply(fraud_past)
    data_card['record'] = data_card['isFraud_list'].apply(lambda x: x.count(0) + x.count(1))
    
    white = pd.DataFrame(
        fraud_list(data_card.loc[(data_card['type'] == 'link_white') & (data_card['record'] > 1),
                                 'TransactionID_list']), columns = ['TransactionID'])
    white['white_guess'] = 1
    temp = pd.DataFrame([])
    fraud_num = pd.merge(white, test[['TransactionID','isFraud']], 
                                     how = 'inner', on =['TransactionID'])
    fraud_num['category'] = "_".join(key_id)
    temp.loc[0, 'num'] = fraud_num.isFraud.sum()
    temp.loc[0, 'ratio'] = fraud_num.isFraud.mean()
    temp.loc[0, 'ratio'] = fraud_num.isFraud.count()
    temp.loc[0, 'name'] = "_".join(key_id)
    temp.loc[0, 'card_cat'] = 'white'
    del data_card;gc.collect()
    del white;gc.collect()
    return fraud_num, temp


def train_threshold_rule_research(tr, full_list, threshold, func):
    feat_rule = c_feat + ['D1_new', 'TransactionID', 'Transaction_day'] + \
                card_feature + addr_feature + d_new_feature + ['isFraud']
    sample_size = train_transaction.shape[0] * (1 - threshold)
    local_tr = train_transaction.loc[:int(sample_size), feat_rule]
    local_ts = train_transaction.loc[int(sample_size):, feat_rule]
    print("simulation begin.")
    res = Parallel(n_jobs=8, backend = 'multiprocessing') \
            (delayed(func)(local_tr, local_ts, col) for col in full_list)
    del local_tr, local_ts;gc.collect()
    fraud_data = pd.concat([res[row][0] for row in range(len(res))], axis = 0)
    fraud_data.drop_duplicates(['TransactionID'], inplace = True)
    rule_list = pd.concat([res[row][1] for row in range(len(res))], axis = 0)
    del res;gc.collect()
    return fraud_data, rule_list

from model import kfold_lightgbm
from feature import woe_encoder, timer, mail_func, addr_func, \
numeric_func, identity_func, \
product_func, match_func,card_func, \
all_category_encoding, Transaction_amt_encoding, \
C_feature, pca_missing, date_feature, D_feature


def time_diff(tr, ts):
    H_move = 12
    card_feature = [col for col in tr.columns if "card" in col] #category
    addr_feature = [col for col in tr.columns if "addr" in col] #category
    feat = ['TransactionID',"P_emaildomain", 'TransactionDT', 'TransactionAmt'] +\
            d_feature + card_feature + addr_feature + c_feature
    tr_ = tr[feat]
    ts_ = ts[feat]
    data_train = pd.DataFrame([])
    data_test = pd.DataFrame([])
    tr_['cid'] = \
    tr[c_feat + card_feature].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
    ts_['cid'] = \
    ts[c_feat + card_feature].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)

    tr_['uid'] = tr['card1'].astype(str) + "_" + tr['card2'].astype(str) +\
                        tr['card3'].astype(str) + "_" + tr['card4'].astype(str) +\
                        tr['card5'].astype(str) + "_" + tr['card6'].astype(str) +\
                        tr['addr1'].astype(str) + "_" + tr['addr2'].astype(str) +\
                        tr['P_emaildomain'].astype(str)
    tr_['uid'] = tr['card1'].astype(str) + "_" + tr['card2'].astype(str) +\
                        tr['card3'].astype(str) + "_" + tr['card4'].astype(str) +\
                        tr['card5'].astype(str) + "_" + tr['card6'].astype(str) +\
                        tr['addr1'].astype(str) + "_" + tr['addr2'].astype(str) +\
                        tr['P_emaildomain'].astype(str)
    ts_['uid'] = ts['card1'].astype(str) + "_" + ts['card2'].astype(str) +\
                        ts['card3'].astype(str) + "_" + ts['card4'].astype(str) +\
                        ts['card5'].astype(str) + "_" + ts['card6'].astype(str) +\
                        ts['addr1'].astype(str) + "_" + ts['addr2'].astype(str) +\
                        ts['P_emaildomain'].astype(str)
    tr_["day"] = (tr["TransactionDT"] + 3600 * H_move) // (24 * 60 * 60)
    ts_["day"] = (ts["TransactionDT"] + 3600 * H_move) // (24 * 60 * 60)
    tr_['Hour'] = np.floor(tr['TransactionDT'] / 3600) % 24
    ts_['Hour'] = np.floor(ts['TransactionDT'] / 3600) % 24
    tr_['D1_delta'] = tr_['D1'] - tr_['day']
    ts_['D1_delta'] = ts_['D1'] - ts_['day']
    tr_['D2_delta'] = tr_['D2'] - tr_['day']
    ts_['D2_delta'] = ts_['D2'] - ts_['day']
    tr_['D10_delta'] = tr_['D10'] - tr_['day']
    ts_['D10_delta'] = ts_['D10'] - ts_['day']
    tr_['D15_delta'] = tr_['D15'] - tr_['day']
    ts_['D15_delta'] = ts_['D15'] - ts_['day']

    tr_['D1_delta_uid'] = tr_['D1_delta'].astype(str) + "_" + tr_['uid']
    tr_['D2_delta_uid'] = tr_['D2_delta'].astype(str) + "_" + tr_['uid']
    ts_['D1_delta_uid'] = ts_['D1_delta'].astype(str) + "_" + ts_['uid']
    ts_['D2_delta_uid'] = ts_['D2_delta'].astype(str) + "_" + ts_['uid']

    tr_['D10_delta_uid'] = tr_['D10_delta'].astype(str) + "_" + tr_['uid']
    tr_['D15_delta_uid'] = tr_['D15_delta'].astype(str) + "_" + tr_['uid']
    ts_['D10_delta_uid'] = ts_['D10_delta'].astype(str) + "_" + ts_['uid']
    ts_['D15_delta_uid'] = ts_['D15_delta'].astype(str) + "_" + ts_['uid']
    
    tr_['c4_d1_id'] = \
    tr_[['C4', 'D1_delta'] + card_feature].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
    ts_['c4_d1_id'] = \
    ts_[['C4', 'D1_delta'] + card_feature].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)

    data_train['trans_curday_Amt_cnt'] = tr_.groupby(['uid', 'day'])['TransactionAmt'].transform('count')
    data_test['trans_curday_Amt_cnt'] = ts_.groupby(['uid', 'day'])['TransactionAmt'].transform('count')
    # TransactionAmt相关
    # 当天的交易次数
    data_train['trans_curday_Amt_cnt'] = tr_.groupby(['uid', 'day'])['TransactionAmt'].transform('count')
    data_test['trans_curday_Amt_cnt'] = ts_.groupby(['uid', 'day'])['TransactionAmt'].transform('count')
    # 当天的交易总额
    data_train['trans_curday_Amt_sum'] = tr_.groupby(['uid', 'day'])['TransactionAmt'].transform('sum')
    data_test['trans_curday_Amt_sum'] = ts_.groupby(['uid', 'day'])['TransactionAmt'].transform('sum')
    # 当天的交易最大金额
    data_train['trans_curday_Amt_max'] = tr_.groupby(['uid', 'day'])['TransactionAmt'].transform('max')
    data_test['trans_curday_Amt_max'] = ts_.groupby(['uid', 'day'])['TransactionAmt'].transform('max')
    # 当天的交易最小金额
    data_train['trans_curday_Amt_min'] = tr_.groupby(['uid', 'day'])['TransactionAmt'].transform('min')
    data_test['trans_curday_Amt_min'] = ts_.groupby(['uid', 'day'])['TransactionAmt'].transform('min')
    # 当天的交易平均金额
    data_train['trans_curday_Amt_mean'] = tr_.groupby(['uid', 'day'])['TransactionAmt'].transform('mean')
    data_test['trans_curday_Amt_mean'] = ts_.groupby(['uid', 'day'])['TransactionAmt'].transform('mean')
    # 当天同样金额的交易次数
    data_train['trans_curday_samAmt_cnt'] = tr_.groupby(['uid', 'day', 'TransactionAmt'])['TransactionAmt'].transform('count')
    data_test['trans_curday_samAmt_cnt'] = ts_.groupby(['uid', 'day', 'TransactionAmt'])['TransactionAmt'].transform('count')

    data_train['trans_curday_hour_max'] = tr_.groupby(['uid', 'day'])['Hour'].transform('max')
    data_test['trans_curday_hour_max'] = ts_.groupby(['uid', 'day'])['Hour'].transform('max')
    # 当天的交易最小Hour
    data_train['trans_curday_hour_min'] = tr_.groupby(['uid', 'day'])['Hour'].transform('min')
    data_test['trans_curday_hour_min'] = ts_.groupby(['uid', 'day'])['Hour'].transform('min')
    # 当天的交易平均Hour
    data_train['trans_curday_hour_mean'] = tr_.groupby(['uid', 'day'])['Hour'].transform('mean')
    data_test['trans_curday_hour_mean'] = ts_.groupby(['uid', 'day'])['Hour'].transform('mean')

    data = pd.concat([data_train, data_test]).reset_index(drop = True)
    # 距离上一笔以及下一笔交易的时间差特征(seconds)
    key_list = ['uid', 'D1_delta_uid', 'D2_delta_uid', 'c4_d1_id', 'cid']
    values = ['TransactionDT', 'TransactionAmt', 'C13']
    df = pd.concat([tr_, ts_]).reset_index(drop = True)
    for key in key_list:
        for value in values:
            stat_temp = df[[key] + [value]].copy()
            for i in [-2, 2, -1, 1]:
                shift_value = stat_temp.groupby(key)[value].shift(i)
                cname = '_'.join([key, value]) + '_diff_time{}'.format(i)
                data[cname] = stat_temp[value] - shift_value
    
    key_list = ['uid', 'D1_delta_uid', 'D2_delta_uid', 'c4_d1_id', 'cid']
    stat_temp = df[key_list + d_feature+ ['TransactionAmt', 'C13']].copy()
    for key in key_list:
        for col in d_feature + ['TransactionAmt', 'C13']:
            for i in [-1, 1]:
                shift_value = stat_temp.groupby(key)[col].shift(i)
                cname = '_'.join([key, col]) +'_shift_time{}'.format(i)
                data[cname] = shift_value
    return data

def FE_OP_c1_c2(df, op = 'count', df_test = None):
    op = op.lower()
    c1, c2 = df.columns.tolist()
    c1c2 = 'FE_{}_({})_({})'.format(op.upper(), c1, c2)
    
    if df_test is not None:
        n_train = len(df)
        df = pd.concat([df, df_test], axis = 0).reset_index(drop = True)

    if op == 'count':
        s = df.groupby([c1, c2])[c1].transform(op)
    elif op in ['nunique', 'median', 'mean', 'min', 'max', 'std', 'sum', 'cumcount']:
        s = df.groupby(c1)[c2].transform(op)
    elif op == 'add':
        s = df[c1] + df[c2]
    elif op == 'diff':
        s = df[c1] - df[c2]
    elif op == 'mul':
        s = df[c1] * df[c2]
    elif op == 'div':
        s = df[c1] / df[c2]
    
    if df_test is not None:
        return c1c2, s[:n_train].reset_index(drop = True), s[n_train:].reset_index(drop = True)
    return c1c2, s

def decompose_name(col):
    c1, c2 = col.split(')_(')
    op, c1 = c1.split('_(')
    op = op[3:].lower()
    c2 = c2[:-1]
    return op, c1, c2

def name_to_fe(train, test, cols_add):
    for col in cols_add:
        print(col)
        op, c1, c2 = decompose_name(col)
        c1c2, train[col], test[col] =  FE_OP_c1_c2(train[[c1,c2]], op, test[[c1,c2]])
    return train, test

cols_add = [
'FE_MIN_(TransactionDT)_(C1)',
 'FE_MIN_(card2)_(C1)',
 'FE_MIN_(card1)_(C1)',
 'FE_MIN_(addr1)_(C1)',
 'FE_MIN_(card1)_(C13)',
 'FE_MIN_(card1)_(C14)',
 'FE_MIN_(card5)_(C1)',
 'FE_MAX_(TransactionDT)_(C1)',
 'FE_MAX_(card2)_(C1)',
 'FE_MAX_(card1)_(C1)',
 'FE_MAX_(addr1)_(C1)',
 'FE_MAX_(TransactionDT)_(C13)',
 'FE_MAX_(P_emaildomain)_(C1)',
 'FE_MAX_(card2)_(C13)',
 'FE_MAX_(card1)_(C13)',
 'FE_MAX_(TransactionDT)_(C14)',
 'FE_MAX_(TransactionDT)_(C12)',
 'FE_MAX_(addr1)_(C13)',
 'FE_MAX_(card2)_(C14)',
 'FE_MAX_(card1)_(C14)',
 'FE_MAX_(TransactionDT)_(TransactionAmt)',
 'FE_MAX_(card5)_(C1)']


def amt_accumulation_summary(tr, ts):
    v_list = ['V47', 'V244', 'V246', 'V257']
    key = card_feature + addr_feature + c_feature + d_feature + v_list
    df = pd.concat([tr[key + ['Transaction_day', 'TransactionDT','TransactionAmt','P_emaildomain']], 
                    ts[key + ['Transaction_day', 'TransactionDT','TransactionAmt','P_emaildomain']]]).reset_index(drop = True)
    H_move = 0
    df["Transaction_day"] = (df["TransactionDT"] + 3600 * H_move) // (24 * 60 * 60)
    col = 'D1'
    df[col + '_new'] = df[col] - df['Transaction_day']
    col = 'D2'
    df[col + '_new'] = df[col] - df['Transaction_day']
    data = pd.DataFrame([])
#     df['uid'] = df[card_feature + addr_feature].apply(
#                     lambda row: '_'.join(row.values.astype(str)), axis=1)

    df['uid'] = df[card_feature + addr_feature + ["P_emaildomain"]].apply(
                    lambda row: '_'.join(row.values.astype(str)), axis=1)
    df['d1_uid'] = df['uid'].astype(str) + "_" + df['D1_new'].astype(str)
    df['d2_uid'] = df['uid'].astype(str) + "_" + df['D2_new'].astype(str)
    
    df['c4_d1_id'] = \
    df[['C4', 'D1_new'] + card_feature].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
    df['c14_d1_uid'] = \
    df[['C14', 'D1_new']+ card_feature].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
    used = []
    for key in ['uid', 'd1_uid', 'c4_d1_id']:
        for col in ['TransactionAmt'] + ['D1']:
            df['{}_diff_1_{}'.format(col, key)] = \
            df[col] - df.groupby(key)[col].shift(1)
            df['{}_diff_-1_{}'.format(col, key)] = \
            df[col] - df.groupby(key)[col].shift(-1)
            used.append(('{}_diff_1_{}'.format(col, key), key))
            used.append(('{}_diff_-1_{}'.format(col, key), key))
    for func in ['mean', 'std']: #19.09.17 
        for col in used:
            data["_".join([col[0], func])] = df.groupby(col[1])[col[0]].transform(func)

            
    new_used = []
    for key in ['uid', 'd1_uid', 'c4_d1_id']:
#         ['C1', 'C13', 'C14'] + ['D1_new']
        for col in ['C1', 'C14']:
            df['{}_encoding_{}'.format(col, key)] = df[col]
            new_used.append(('{}_encoding_{}'.format(col, key), key))

    for func in ['mean', 'std']: #19.09.17 
        for col in new_used:
            data["_".join([col[0], func])] = df.groupby(col[1])[col[0]].transform(func)
    return data




def local_training_baseline_estimation(tr, threshold):
    features = c_feature + d_feature + dist_feature + v_feature + ['TransactionAmt','nulls1']
    sample_size = train_transaction.shape[0] * threshold
    local_tr = train_transaction.loc[:int(sample_size)].reset_index(drop = True)
    local_ts = train_transaction.loc[int(sample_size):].reset_index(drop = True)
    data = pd.DataFrame([])
    tr_shape = local_tr.shape[0]
    print("feature eng begin.")
    temp = mail_func(local_tr, local_ts)
    data[temp.columns] = temp
    temp = addr_func(local_tr, local_ts)
    data[temp.columns] = temp
    temp = numeric_func(local_tr, local_ts)
    data[temp.columns] = temp
    temp = identity_func(local_tr, local_ts,train_identity, test_identity)
    data[temp.columns] = temp
    temp = product_func(local_tr, local_ts)
    data[temp.columns] = temp
    temp = match_func(local_tr, local_ts)
    data[temp.columns] = temp
    temp = card_func(local_tr, local_ts)
    data[temp.columns] = temp
    temp = all_category_encoding(local_tr, local_ts)
    data[temp.columns] = temp
    temp = Transaction_amt_encoding(local_tr, local_ts)
    data[temp.columns] = temp
    temp = C_feature(local_tr, local_ts)
    data[temp.columns] = temp
    temp = date_feature(local_tr, local_ts)
    data[temp.columns] = temp
    temp = time_diff(local_tr, local_ts)
    data[temp.columns] = temp
    temp = amt_accumulation_summary(local_tr, local_ts)
    data[temp.columns] = temp
    basic_feature = list(data.columns)
    local_tr[data.columns] = data[:tr_shape].reset_index(drop = True)
    local_ts[data.columns] = data[tr_shape:].reset_index(drop = True)
    del data,temp;gc.collect()
    print("over.")
    used = [col for col in features + basic_feature if col not in d_feature]
    oof_train, oof_test, score_list = kfold_lightgbm(local_tr, local_ts, used)
    result = pd.DataFrame([])
    result['TransactionID'] = local_ts['TransactionID']
    result['pred'] = oof_test.mean(axis = 1)
    result['isFraud'] = local_ts['isFraud']
    del oof_train, oof_test, score_list;gc.collect()
    del local_tr, local_ts;gc.collect()
    return result



from sklearn.metrics import roc_auc_score

def rule_evaluation(local_result, rule_list, fraud_data, fraud_exist = pd.DataFrame([]), white = False):
    if white:
        local_sim = pd.merge(local_result, 
                     fraud_data[['TransactionID', 'white_guess']], 
                     on = ['TransactionID'], how = 'left')
        local_sim['pred'] = np.where(local_sim['white_guess'] == 1, 0, local_sim['pred'])
        initial_score = roc_auc_score(local_result['isFraud'],local_result['pred'])
        score_max = roc_auc_score(local_sim['isFraud'], local_sim['pred'])
        boost = score_max - initial_score
        print("initial score:", initial_score)
        print("Max boost:", boost)
        print("white score:", score_max)
        result = pd.DataFrame([])
        result.loc[0, 'initial_score'] = initial_score
        result.loc[0, 'boosted_score'] = score_max
        result.loc[0, 'boost_score'] = boost
        result.loc[[0], 'name'] = pd.Series([rule_list])
        return fraud_data, result
    if fraud_exist.shape[0] > 0:
        fraud_all = fraud_exist
        fraud_sim = local_result.copy()
        fraud_sim = pd.merge(fraud_sim, fraud_all, on = ['TransactionID'], how = 'left')
        fraud_sim['pred'] = np.where(fraud_sim['fraud_guess'] == 1, 1, fraud_sim['pred'])
        initial_score = roc_auc_score(fraud_sim['isFraud'], fraud_sim['pred'])
    else:
        fraud_all = pd.DataFrame([])
        initial_score = roc_auc_score(local_result['isFraud'],local_result['pred'])
    fraud_sim = local_result.copy()
    print("initial score:", initial_score)
    score_max = initial_score
    rule_used = []
    for i in rule_list['name']:
        fraud_temp = fraud_data.loc[fraud_data['category'] == i, ['TransactionID']]
        fraud_all = pd.concat([fraud_all, fraud_temp]).reset_index(drop = True)
        fraud_all.drop_duplicates(['TransactionID'], inplace = True)
        fraud_all['fraud_guess'] = 1
        if 'fraud_guess' in fraud_sim.columns:
            fraud_sim.drop(['fraud_guess'], axis = 1, inplace = True)
        fraud_sim = pd.merge(fraud_sim, fraud_all, on = ['TransactionID'], how = 'left')
        fraud_sim['pred'] = np.where(fraud_sim['fraud_guess'] == 1, 1, fraud_sim['pred'])
        score = roc_auc_score(fraud_sim['isFraud'], fraud_sim['pred'])
        if score > score_max:
            score_max = score
            rule_used.append(i)
    print("refit list.")
    fraud_all = fraud_exist
    for i in rule_used:
        fraud_temp = fraud_data.loc[fraud_data['category'] == i, ['TransactionID']]
        fraud_all = pd.concat([fraud_all, fraud_temp]).reset_index(drop = True)
        fraud_all.drop_duplicates(['TransactionID'], inplace = True)
        fraud_all['fraud_guess'] = 1
    print("highest:", score_max)
    print("last score:", score)
    boost = score_max - initial_score
    print("Max boost:", boost)
    result = pd.DataFrame([])
    result.loc[0, 'initial_score'] = initial_score
    result.loc[0, 'boosted_score'] = score_max
    result.loc[0, 'boost_score'] = boost
    result.loc[[0], 'name'] = pd.Series([rule_used])
    return fraud_all, result
    

In [13]:
c_feat = [col for col in c_feature if col not in ['C13', 'C14']]


addr_list = [addr_feature + ['D1_new', 'D10_new'],
            addr_feature + c_feat + ['D1_new', 'D10_new'],
             addr_feature + ['C{}'.format(i) for i in range(1, 12, 2)] + ['D1_new', 'D10_new'],
             addr_feature + ['C{}'.format(i) for i in range(2, 13, 2)] + ['D1_new', 'D10_new']
            ]


count_list = [[col] + card_feature + ['D1_new'] for col in ['C4', 'C5']]
basis_list = [card_feature + addr_feature + c_feat + ['D1_new'],
              card_feature + addr_feature + ['D1_new'],
             
             card_feature + ['C{}'.format(i) for i in range(1, 12, 2)] + ['D1_new'],
             card_feature + ['C{}'.format(i) for i in range(2, 13, 2)] + ['D1_new']
             ]
second_list = [col + card_feature + ['D1_new'] for col in [['C1', 'C10'],
                                                           ['C1', 'C2', 'C9', 'C10', 'C11', 'C12']] ]

day_list =  [[col] + card_feature + ['D1_new'] for col in ['D10_new', 'D15_new']] +\
            [[col] + card_feature + addr_feature for random_statecol in ['D2_new', 'D4_new']] +\
            [[col] + card_feature + c_feat for col in ['D4_new', 'D10_new']]

black_list = count_list + basis_list + second_list + day_list + addr_list

count_list = [[col] + card_feature + ['D1_new'] for col in ['C12', 'C14']]
day_list =  [[col] + card_feature + ['D1_new'] for col in ['D10_new', 'D15_new']] +\
            [[col] + card_feature + addr_feature for col in ['D2_new', 'D4_new']] +\
            [[col] + card_feature + c_feat for col in ['D4_new', 'D10_new']]
        
special_list = count_list + day_list + basis_list + addr_list

count_list = [[col] + card_feature + ['D1_new'] for col in c_feat]
basis_list = [card_feature + addr_feature + c_feat + ['D1_new'],
              card_feature + addr_feature + ['D1_new'],
             card_feature + ['C{}'.format(i) for i in range(1, 12, 2)] + ['D1_new'],
             card_feature + ['C{}'.format(i) for i in range(2, 13, 2)]+ ['D1_new']
              
             ]

day_list =  [[col] + card_feature + ['D1_new'] for col in ['D2_new', 'D11_new']] +\
             [[col] + card_feature + addr_feature for col in ['D2_new', 'D11_new']]
#plus D11_new out of research.
grey_list = basis_list + day_list + addr_list + count_list

white_list = [card_feature + addr_feature + c_feat + ['D1_new'], card_feature + c_feat + ['D1_new']
             ]

from sklearn.metrics import roc_auc_score

def local_rule_based_chosen(tr, threshold):
    local_result = local_training_baseline_estimation(tr, threshold)
    fraud_data1, rule_list1 = train_threshold_rule_research(tr, black_list, threshold, local_black_merge)
    fraud_data2, rule_list2 = train_threshold_rule_research(tr, special_list, threshold, local_special_black_merge)
    fraud_data3, rule_list3 = train_threshold_rule_research(tr, grey_list, threshold, local_grey_merge)
    fraud_data4, rule_list4 = train_threshold_rule_research(tr, white_list, threshold, local_white_merge)
    rule_list1.sort_values(['ratio', 'num'], ascending = False, inplace = True)
    rule_list2.sort_values(['ratio', 'num'], ascending = False, inplace = True)
    rule_list3.sort_values(['ratio', 'num'], ascending = False, inplace = True)
    rule_list4.sort_values(['ratio', 'num'], ascending = False, inplace = True)
    fraud_id, result1 = rule_evaluation(local_result, rule_list1, fraud_data1)
    fraud_id, result2 = rule_evaluation(local_result, rule_list2, fraud_data2, fraud_id)
    fraud_id, result3 = rule_evaluation(local_result, rule_list3, fraud_data3, fraud_id)
    fraud_id_none, result4 = rule_evaluation(local_result, rule_list4, fraud_data4, pd.DataFrame([]), True)
    result1.loc[0, 'card_cat'] = 'black'
    result2.loc[0, 'card_cat'] = 'special'
    result3.loc[0, 'card_cat'] = 'grey'
    result4.loc[0, 'card_cat'] = 'white'
    result = pd.concat([result1, result2, result3, result4]).reset_index(drop = True)
    del local_result, fraud_data1, fraud_data2, fraud_data3, fraud_data4;gc.collect()
    del rule_list1, rule_list2, rule_list3, rule_list4;gc.collect()
    return result

In [16]:
def threshold_rule_selection(tr):
    ans = pd.DataFrame([])
    for i in [0.2, 0.5, 0.8]:
        print("##################################threshold:", i)
        res = local_rule_based_chosen(tr, i)
        ans = pd.concat([ans, res])
    return ans

final_result = threshold_rule_selection(train_transaction)

##################################threshold: 0.2
feature eng begin.
over.
train.shape = (117418, 811), test.shape = (469670, 811)
############################################################ fold = 1 / 5
####### cur time = 2019/09/22 08:51:41
Training until validation scores don't improve for 100 rounds.
[500]	valid_0's auc: 0.908984
[1000]	valid_0's auc: 0.911426
Early stopping, best iteration is:
[981]	valid_0's auc: 0.911504
period: [    0     1     2 ... 23481 23482 23483] , the score is 0.9115044025203857
############################################################ fold = 2 / 5
####### cur time = 2019/09/22 08:53:17
Training until validation scores don't improve for 100 rounds.
[500]	valid_0's auc: 0.927853
Early stopping, best iteration is:
[822]	valid_0's auc: 0.928189
period: [23484 23485 23486 ... 46965 46966 46967] , the score is 0.9281886467465198
############################################################ fold = 3 / 5
####### cur time = 2019/09/22 08:54:40
Training until v

KeyError: "['C14'] not in index"

In [15]:
pd.options.display.max_colwidth = 1000
final_result

Unnamed: 0,initial_score,boosted_score,boost_score,name,card_cat
0,0.900783,0.909399,0.008616242,"[card1_card2_card3_card4_card5_card6_addr1_addr2_C1_C2_C3_C4_C5_C6_C7_C8_C9_C10_C11_C12_C14_D1_new, card1_card2_card3_card4_card5_card6_addr1_addr2_D1_new, D10_new_card1_card2_card3_card4_card5_card6_D1_new, D10_new_card1_card2_card3_card4_card5_card6_C1_C2_C3_C4_C5_C6_C7_C8_C9_C10_C11_C12_C14, D15_new_card1_card2_card3_card4_card5_card6_D1_new, D4_new_card1_card2_card3_card4_card5_card6_C1_C2_C3_C4_C5_C6_C7_C8_C9_C10_C11_C12_C14, card1_card2_card3_card4_card5_card6_C2_C4_C6_C8_C10_C12_D1_new, C1_C10_card1_card2_card3_card4_card5_card6_D1_new, card1_card2_card3_card4_card5_card6_C1_C3_C5_C7_C9_C11_D1_new, D2_new_card1_card2_card3_card4_card5_card6_addr1_addr2, C4_card1_card2_card3_card4_card5_card6_D1_new, C5_card1_card2_card3_card4_card5_card6_D1_new, D4_new_card1_card2_card3_card4_card5_card6_addr1_addr2]",black
1,0.909419,0.90942,1.473543e-07,[C14_card1_card2_card3_card4_card5_card6_D1_new],special
2,0.90942,0.90942,0.0,[],grey
3,0.900783,0.902468,0.001684784,num ratio \ 0 0.0 12722.0 0 0.0 12318.0 name \ 0 card1_card2_card3_card4_card5_card6_C1_C2_C3_C4_C5_C6_C7_C8_C9_C10_C11_C12_C14_D1_new 0 card1_card2_card3_card4_card5_card6_addr1_addr2_C1_C2_C3_C4_C5_C6_C7_C8_C9_C10_C11_C12_C14_D1_new card_cat 0 white 0 white,white
0,0.926634,0.933242,0.006607443,"[card1_card2_card3_card4_card5_card6_addr1_addr2_C1_C2_C3_C4_C5_C6_C7_C8_C9_C10_C11_C12_C14_D1_new, card1_card2_card3_card4_card5_card6_addr1_addr2_D1_new, D10_new_card1_card2_card3_card4_card5_card6_D1_new, D10_new_card1_card2_card3_card4_card5_card6_C1_C2_C3_C4_C5_C6_C7_C8_C9_C10_C11_C12_C14, D15_new_card1_card2_card3_card4_card5_card6_D1_new, D2_new_card1_card2_card3_card4_card5_card6_addr1_addr2, addr1_addr2_C1_C2_C3_C4_C5_C6_C7_C8_C9_C10_C11_C12_C14_D1_new, D4_new_card1_card2_card3_card4_card5_card6_C1_C2_C3_C4_C5_C6_C7_C8_C9_C10_C11_C12_C14, C4_card1_card2_card3_card4_card5_card6_D1_new, C5_card1_card2_card3_card4_card5_card6_D1_new, D4_new_card1_card2_card3_card4_card5_card6_addr1_addr2]",black
1,0.933506,0.933559,5.270585e-05,"[C12_card1_card2_card3_card4_card5_card6_D1_new, D2_new_card1_card2_card3_card4_card5_card6_addr1_addr2, D15_new_card1_card2_card3_card4_card5_card6_D1_new]",special
2,0.933559,0.933611,5.18924e-05,[D11_new_card1_card2_card3_card4_card5_card6_D1_new],grey
3,0.926634,0.927144,0.0005098825,num ratio \ 0 7.0 15826.0 0 6.0 15287.0 name \ 0 card1_card2_card3_card4_card5_card6_C1_C2_C3_C4_C5_C6_C7_C8_C9_C10_C11_C12_C14_D1_new 0 card1_card2_card3_card4_card5_card6_addr1_addr2_C1_C2_C3_C4_C5_C6_C7_C8_C9_C10_C11_C12_C14_D1_new card_cat 0 white 0 white,white
0,0.941366,0.941558,0.0001924965,"[D10_new_card1_card2_card3_card4_card5_card6_D1_new, C5_card1_card2_card3_card4_card5_card6_D1_new, D4_new_card1_card2_card3_card4_card5_card6_C1_C2_C3_C4_C5_C6_C7_C8_C9_C10_C11_C12_C14, card1_card2_card3_card4_card5_card6_addr1_addr2_D1_new]",black
1,0.941643,0.941643,0.0,[],special


In [None]:

day_list =  [[col] + card_feature + ['D1_new'] for col in ['D11_new']]
#plus D11_new out of research.
grey_list = day_list

In [None]:
count_list = [[col] + card_feature + ['D1_new'] for col in ['C4', 'C5']]
basis_list = [card_feature + addr_feature + c_feat + ['D1_new'],
              card_feature + addr_feature + ['D1_new'],
             
             card_feature + ['C{}'.format(i) for i in range(1, 12, 2)] + ['D1_new'],
             card_feature + ['C{}'.format(i) for i in range(2, 13, 2)] + ['D1_new']
             ]
second_list = [col + card_feature + ['D1_new'] for col in [['C1', 'C10'],
                                                           ['C1', 'C2', 'C9', 'C10', 'C11', 'C12']] ]

day_list =  [[col] + card_feature + ['D1_new'] for col in ['D10_new', 'D15_new']] +\
            [[col] + card_feature + addr_feature for col in ['D2_new', 'D4_new']] +\
            [[col] + card_feature + c_feat for col in ['D4_new', 'D10_new']]

black_list = count_list + basis_list + second_list + day_list + addr_list

count_list = [[col] + card_feature + ['D1_new'] for col in ['C12', 'C14']]
day_list =  [[col] + card_feature + ['D1_new'] for col in ['D10_new', 'D15_new']] +\
            [[col] + card_feature + addr_feature for col in ['D2_new', 'D4_new']] +\
            [[col] + card_feature + c_feat for col in ['D4_new', 'D10_new']]
        
special_list = count_list + day_list + basis_list + addr_list

count_list = [[col] + card_feature + ['D1_new'] for col in c_feat]
basis_list = [card_feature + addr_feature + c_feat + ['D1_new'],
              card_feature + addr_feature + ['D1_new'],
             card_feature + ['C{}'.format(i) for i in range(1, 12, 2)] + ['D1_new'],
             card_feature + ['C{}'.format(i) for i in range(2, 13, 2)]+ ['D1_new']
              
             ]

day_list =  [[col] + card_feature + ['D1_new'] for col in ['D2_new', 'D11_new']] +\
             [[col] + card_feature + addr_feature for col in ['D2_new', 'D11_new']]
#plus D11_new out of research.
grey_list = basis_list + day_list + addr_list

white_list = [card_feature + addr_feature + c_feat + ['D1_new'], card_feature + c_feat + ['D1_new']
             ]