In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
def tpr_weight_funtion(y_true,y_predict):
    d = pd.DataFrame()
    d['prob'] = list(y_predict)
    d['y'] = list(y_true)
    d = d.sort_values(['prob'], ascending=[0])
    y = d.y
    PosAll = pd.Series(y).value_counts()[1]
    NegAll = pd.Series(y).value_counts()[0]
    pCumsum = d['y'].cumsum()
    nCumsum = np.arange(len(y)) - pCumsum + 1
    pCumsumPer = pCumsum / PosAll
    nCumsumPer = nCumsum / NegAll
    TR1 = pCumsumPer[abs(nCumsumPer-0.001).idxmin()]
    TR2 = pCumsumPer[abs(nCumsumPer-0.005).idxmin()]
    TR3 = pCumsumPer[abs(nCumsumPer-0.01).idxmin()]
    return 'TC_AUC',0.4 * TR1 + 0.3 * TR2 + 0.3 * TR3,True

In [4]:
import os
data_path = '../data/'
origin_path = data_path + 'origin_data/'

op_train_new_fn = 'operation_train_new.csv'
tran_train_new_fn = 'transaction_train_new.csv'
tag_train_new_fn = 'tag_train_new.csv'
op_train_sorted_fn = 'op_train_sorted.csv'
tran_train_sorted_fn = 'tran_train_sorted.csv'
tag_train_sorted_fn = 'tag_train_sorted.csv'

op_origin_fn = 'op_origin.csv'
tran_origin_fn = 'tran_origin.csv'


op_train_new_file = data_path + op_train_new_fn
tran_train_new_file = data_path + tran_train_new_fn
tag_train_new_file = data_path + tag_train_new_fn
op_train_sorted_file = data_path + op_train_sorted_fn
tran_train_sorted_file = data_path + tran_train_sorted_fn
tag_train_sorted_file = data_path + tag_train_sorted_fn

op_origin_file = origin_path + op_origin_fn
tran_origin_file = origin_path + tran_origin_fn

In [6]:
def load_data():
    print('[info]:start read from op_train...')
    op_train = pd.read_csv(op_origin_file).drop('Unnamed: 0', axis=1)
    print('[info]:start read from tran_train...')
    tran_train = pd.read_csv(tran_origin_file).drop('Unnamed: 0', axis=1)
    print('[info]:start read from tag_train...')
    tag_train = pd.read_csv(tag_train_sorted_file).drop('Unnamed: 0', axis=1)
    return op_train, tran_train, tag_train

In [7]:
op_train, tran_train, tag_train = load_data()

[info]:start read from op_train...
[info]:start read from tran_train...
[info]:start read from tag_train...


In [8]:
op_mac1_dict = op_train['mac1'].value_counts().to_dict()
op_ipsub_dict = op_train['ipsub'].value_counts().to_dict()
tran_mac1_dict = tran_train['mac1'].value_counts().to_dict()
tran_ip1sub_dict = tran_train['ip1_sub'].value_counts().to_dict()

In [15]:
def get_tag(uid):
    return tag_train[tag_train['UID'] == uid]

def get_op(uid):
    return op_train[op_train['UID'] == uid]

def get_tran(uid):
    return tran_train[tran_train['UID'] == uid]

def get_value_counts(uid, train_data):
    assert type(train_data) is pd.DataFrame
    for c in list(train_data.columns):
        print('[%r]'%c)
        print(train_data[train_data['UID']==uid][c].value_counts())
        print('====')
        
def oneday_cnt(tmp, gb_str, ft_str):
    gb = tmp.groupby(gb_str)
    top_oneday = []
    for gb_key in gb.indices.keys():
        cnt = 0
        sub_gb = gb.get_group(gb_key)
        value_counts = sub_gb[ft_str].value_counts()
        if not value_counts.empty:
            top_value = value_counts.sort_values(ascending=False).values[0]
            top_mode = value_counts.sort_values(ascending=False).index[0]
            top_oneday.append([gb_key, top_value, top_mode])
    if not len(top_oneday):
        top_oneday = [np.nan, np.nan, np.nan]
    return top_oneday

def topcnts_oneday(tmp, sub_str):
    top_oneday = oneday_cnt(tmp, 'day', sub_str)
    top_idx, top_cnt = 0, top_oneday[0][1]
    for item in top_oneday:
        if top_cnt < item[1]:
            top_cnt = item[1]
            top_idx += 1
    top_day = top_oneday[top_idx][0]
    top_value = top_oneday[top_idx][1]
    top_mode = top_oneday[top_idx][2]
    return top_day, top_value, top_mode
        
def op_times_per2min(tmp):
    day_gb = tmp.groupby('day')
    min2 = 120
    over2min_rec = []
    for gb_key in day_gb.indices.keys():
        over2min_cnt = 0
        sgb = day_gb.get_group(gb_key)
        timestamp = list(sgb['timestamp'].get_values())
        index, idx_max = 0, len(timestamp) - 1
        now_t = timestamp[index]
        for t in timestamp:
            if timestamp[index] - now_t > min2:
                now_t = timestamp[index]
                over2min_cnt += 1
            index += 1
        over2min_rec.append(over2min_cnt)
    return max(over2min_rec)

def suc_rate(tmp):
    for item in tmp['success'].value_counts().items():
        if item[0] == 1:
            suc_cnt = item[1]
            break
    suc_all = tmp['success'].count()
    return suc_cnt/suc_all

def fbfill_series(series):
    series = series.copy()
#     series.replace(0, np.nan, inplace=True)
    new_s = series.fillna(method='bfill')
    new_s = series.fillna(method='ffill')
    return new_s

def ft_change_cnt(tmp, ft_str):   
    assert ft_str in tmp.keys()
    
    # 排除某一个ft下全部是NaN的情况
    if len(tmp[ft_str][tmp[ft_str].notna()]):
        series = fbfill_series(tmp[ft_str])
        last_dc = series.get_values()[0]
        dc_cnt = 0
        for dc in series.get_values():
            if last_dc != dc:
                last_dc = dc
                dc_cnt += 1
        return dc_cnt
    else:
        return np.nan

def get_change_frq(tmp, ft_str):
    frq = ft_change_cnt(tmp, ft_str)/tmp['day'].count()
    if frq is not np.nan:
        return float('%.2f' % frq)
    else:
        return np.nan
       
def ip_change_oneday_top(tmp, ip_ft_str):   
    day_gb = tmp.groupby('day')
    ip_rec = []
    for gb_key in day_gb.indices.keys():
        sgb = day_gb.get_group(gb_key)
        series = fbfill_series(sgb[ip_ft_str])
        last_ip = series.get_values()[0]
        ip_cnt = 0
        if np.isnan(last_ip):
            ip_rec.append(np.nan)
            continue
        for ip in series.get_values():
            if last_ip != ip:
                last_ip = ip
                ip_cnt += 1
        ip_rec.append(ip_cnt)
    return max(ip_rec)

def top_type(tmp, ft_str):
    value_counts = tmp[ft_str].value_counts()
    if not value_counts.empty:
        return value_counts.sort_values(ascending=False).index[0]
    else:
        return np.nan

def top_value(tmp, ft_str):
    value_counts = tmp[ft_str].value_counts()
    if not value_counts.empty:
        return value_counts.sort_values(ascending=False).values[0]
    else:
        return np.nan

def get_frq(tmp, ft_str):
    return float('%.2f' % (top_value(tmp, ft_str)/tmp['day'].count()))

def top_op_mac1_in_diffUID_cnt(tmp):
    tp = top_type(tmp, 'mac1')
    if tp and tp in op_mac1_dict.keys():
        return op_mac1_dict[tp]
    else:
        return np.nan

def top_type_in_diffUID_cnt(tmp, ft_str, type_dict):
    tp = top_type(tmp, ft_str)
    if tp and tp in type_dict.keys():
        return type_dict[tp]
    else:
        return np.nan

In [27]:
op_gb = op_train.groupby('UID')
drop_fts = ['mode','success','time','device1','device2','device_code1','device_code2','device_code3','mac1','ip1','ip2','ip1_sub','ip2_sub','timestamp']
op_train_nf = op_gb.count().drop(drop_fts, axis='columns')
op_train_nf.rename(columns={'day':'day_cnts'}, inplace=True)

In [11]:
op_10001 = get_op(10001)

In [20]:
# op
op_feature = {}
op_feature['UID'] = op_10001['UID'].values[0]
op_feature['day_cnts'] = op_10001['day'].count()
op_feature['op_top_appear_day'] = top_type(op_10001, 'day')
op_feature['op_top_appear_day_cnt'] = top_value(op_10001, 'day')
op_feature['op_times_per2min'] = op_times_per2min(op_10001)
op_feature['mode_top_day_oneday'] = topcnts_oneday(op_10001, 'mode')[0] # 一天中某一操作类型次数最多的那一天
op_feature['mode_top_cnt_oneday'] = topcnts_oneday(op_10001, 'mode')[1] # 一天中某一操作类型次数最多的次数
op_feature['mode_top_type_oneday'] = topcnts_oneday(op_10001, 'mode')[2] # 一天中某一操作类型次数最多的类型
op_feature['mode_cnt'] = top_value(op_10001, 'mode')
op_feature['mode_rank1'] = top_type(op_10001, 'mode')
op_feature['suc_rate'] = '%.2f' % (suc_rate(op_10001))
op_feature['device_code_frq'] = get_change_frq(op_10001, 'device_code')
op_feature['ip_change_frq'] = get_change_frq(op_10001, 'ip')
op_feature['ip_change_oneday_top'] = ip_change_oneday_top(op_10001, 'ip')
op_feature['ip_change_cnt'] = ft_change_cnt(op_10001, 'ip')
op_feature['top_mac1_in_diffUID_cnt'] = top_type_in_diffUID_cnt(op_10001, 'mac1', op_mac1_dict)
op_feature['top_ipsub_in_diffUID_cnt'] = top_type_in_diffUID_cnt(op_10001, 'ipsub', op_ipsub_dict)

In [21]:
op_feature

{'UID': 10001,
 'day_cnts': 65,
 'op_top_appear_day': 3,
 'op_top_appear_day_cnt': 14,
 'op_times_per2min': 4,
 'mode_top_day_oneday': 3,
 'mode_top_cnt_oneday': 10,
 'mode_top_type_oneday': 66.0,
 'mode_cnt': 34,
 'mode_rank1': 66.0,
 'suc_rate': '0.75',
 'device_code_frq': 0.28,
 'ip_change_frq': 0.15,
 'ip_change_oneday_top': 2,
 'ip_change_cnt': 10,
 'top_mac1_in_diffUID_cnt': 5,
 'top_ipsub_in_diffUID_cnt': 81}

In [24]:
features = []
features.append(op_feature)
features = pd.DataFrame(features)

In [31]:
tran_train.columns

Index(['UID', 'channel', 'day', 'time', 'trans_amt', 'amt_src1', 'merchant',
       'code1', 'trans_type1', 'acc_id1', 'device_code1', 'device_code2',
       'device_code3', 'device1', 'device2', 'mac1', 'ip1', 'bal', 'amt_src2',
       'trans_type2', 'market_code', 'ip1_sub', 'timestamp', 'device_code'],
      dtype='object')

In [28]:
features

Unnamed: 0,UID,day_cnts,device_code_frq,ip_change_cnt,ip_change_frq,ip_change_oneday_top,mode_cnt,mode_rank1,mode_top_cnt_oneday,mode_top_day_oneday,mode_top_type_oneday,op_times_per2min,op_top_appear_day,op_top_appear_day_cnt,suc_rate,top_ipsub_in_diffUID_cnt,top_mac1_in_diffUID_cnt
0,10001,65,0.28,10,0.15,2,34,66.0,10,3,66.0,4,3,14,0.75,81,5


In [29]:
op_train_nf = op_train_nf.join(features)
op_train_nf

ValueError: columns overlap but no suffix specified: Index(['day_cnts'], dtype='object')

In [None]:
tran_10001 = get_tran(10001)

In [22]:
tran_10001 = get_tran(17520)
tran_feature = {}
tran_feature['UID'] = tran_10001['UID'].values[0]
tran_feature['channel_top'] = top_type(tran_10001, 'channel')
tran_feature['channel_top_frq'] = top_value(tran_10001, 'channel')
tran_feature['tran_day_cnts'] = tran_10001['day'].count()
tran_feature['tran_day_appear_top'] = top_type(tran_10001, 'day')
tran_feature['tran_amt_frq'] = get_frq(tran_10001, 'trans_amt')
tran_feature['tran_amt_top'] = top_type(tran_10001, 'trans_amt')
tran_feature['tran_topcnts_oneday'] = topcnts_oneday(tran_10001, 'trans_amt')[1]
tran_feature['tran_times_per2min'] = op_times_per2min(tran_10001)
tran_feature['amt_src1_frq'] = get_change_frq(tran_10001, 'amt_src1')
tran_feature['amt_src1_type_top'] = top_type(tran_10001, 'amt_src1')
tran_feature['amt_src1_type_cnt'] = topcnts_oneday(tran_10001, 'amt_src1')[1]
tran_feature['amt_src2_frq'] = get_change_frq(tran_10001, 'amt_src2')
tran_feature['amt_src2_type_top'] = top_type(tran_10001, 'amt_src2')
tran_feature['amt_src2_type_cnt'] = topcnts_oneday(tran_10001, 'amt_src2')[1]
tran_feature['merchant_frq'] = get_change_frq(tran_10001, 'merchant')
tran_feature['merchant_type_top'] = top_type(tran_10001, 'merchant')
tran_feature['merchant_type_cnt'] = len(tran_10001['merchant'].value_counts()) # 商户标识类型总数
tran_feature['code1_type_top'] = top_type(tran_10001, 'code1')
tran_feature['code1_type_cnt'] = len(tran_10001['code1'].value_counts()) # 出现最多的商户子门店
tran_feature['trans_type1_top_cnt'] = top_value(tran_10001, 'trans_type1')
tran_feature['trans_type1_top_frq'] = get_frq(tran_10001, 'trans_type1')
tran_feature['trans_type1_top'] = top_type(tran_10001, 'trans_type1')
tran_feature['trans_type2_top_cnt'] = top_value(tran_10001, 'trans_type2')
tran_feature['trans_type2_top_frq'] = get_frq(tran_10001, 'trans_type2')
tran_feature['trans_type2_top'] = top_type(tran_10001, 'trans_type2')
tran_feature['acc_id1_top_cnt'] = top_value(tran_10001, 'acc_id1')
tran_feature['acc_id1_top_frq'] = get_frq(tran_10001, 'acc_id1')
tran_feature['acc_id1_top'] = top_type(tran_10001, 'acc_id1')
tran_feature['device_code_frq'] = get_change_frq(tran_10001, 'device_code')
tran_feature['dev_name_frq'] = get_change_frq(tran_10001, 'device1')
tran_feature['dev_type_frq'] = get_change_frq(tran_10001, 'device2')
tran_feature['ip_change_oneday_top'] = ip_change_oneday_top(tran_10001, 'ip1')
tran_feature['ip_change_frq'] = get_change_frq(tran_10001, 'ip1') # ip变化次数
tran_feature['ip_change_times'] = ft_change_cnt(tran_10001, 'ip1') # ip变化次数
tran_feature['top_mac1_in_diffUID_cnt'] = top_type_in_diffUID_cnt(tran_10001, 'mac1', tran_mac1_dict)
tran_feature['top_ip1sub_in_diffUID_cnt'] = top_type_in_diffUID_cnt(tran_10001, 'ip1_sub', tran_ip1sub_dict)

In [23]:
tran_feature

{'UID': 17520,
 'channel_top': 140,
 'channel_top_frq': 3017,
 'tran_day_cnts': 4030,
 'tran_day_appear_top': 25,
 'tran_amt_frq': 0.2,
 'tran_amt_top': 102,
 'tran_topcnts_oneday': 1,
 'tran_times_per2min': 145,
 'amt_src1_frq': 0.0,
 'amt_src1_type_top': 7.0,
 'amt_src1_type_cnt': 2,
 'amt_src2_frq': 0.0,
 'amt_src2_type_top': 67.0,
 'amt_src2_type_cnt': 10,
 'merchant_frq': 0.01,
 'merchant_type_top': 19325.0,
 'merchant_type_cnt': 25,
 'code1_type_top': 1798.0,
 'code1_type_cnt': 602,
 'trans_type1_top_cnt': 3015,
 'trans_type1_top_frq': 0.75,
 'trans_type1_top': 8.0,
 'trans_type2_top_cnt': 3913,
 'trans_type2_top_frq': 0.97,
 'trans_type2_top': 3.0,
 'acc_id1_top_cnt': 1005,
 'acc_id1_top_frq': 0.25,
 'acc_id1_top': 18003.0,
 'device_code_frq': 0.0,
 'dev_name_frq': 0.0,
 'dev_type_frq': 0.0,
 'ip_change_oneday_top': 2,
 'ip_change_frq': 0.0,
 'ip_change_times': 10,
 'top_mac1_in_diffUID_cnt': 54807,
 'top_ip1sub_in_diffUID_cnt': 99}

In [None]:
def get_feature(op,trans,label):
    for feature in op.columns[2:]:
        label = label.merge(op.groupby(['UID'])[feature].count().reset_index(),on='UID',how='left')
        label =label.merge(op.groupby(['UID'])[feature].nunique().reset_index(),on='UID',how='left')
    
    for feature in trans.columns[2:]:
        if trans_train[feature].dtype == 'object':
            label =label.merge(trans.groupby(['UID'])[feature].count().reset_index(),on='UID',how='left')
            label =label.merge(trans.groupby(['UID'])[feature].nunique().reset_index(),on='UID',how='left')
        else:
            print(feature)
            label =label.merge(trans.groupby(['UID'])[feature].count().reset_index(),on='UID',how='left')
            label =label.merge(trans.groupby(['UID'])[feature].nunique().reset_index(),on='UID',how='left')
            label =label.merge(trans.groupby(['UID'])[feature].max().reset_index(),on='UID',how='left')
            label =label.merge(trans.groupby(['UID'])[feature].min().reset_index(),on='UID',how='left')
            label =label.merge(trans.groupby(['UID'])[feature].sum().reset_index(),on='UID',how='left')
            label =label.merge(trans.groupby(['UID'])[feature].mean().reset_index(),on='UID',how='left')
            label =label.merge(trans.groupby(['UID'])[feature].std().reset_index(),on='UID',how='left')
    return label

In [None]:
train = get_feature(op_train,trans_train,y)
test = get_feature(op_test,trans_test,sub)

In [None]:
list(train.columns)

In [None]:
train.head()

In [None]:
train = train.fillna(-1)
test = test.fillna(-1)

In [None]:
train = train.drop(['UID','Tag'],axis = 1).fillna(-1)
label = y['Tag']

In [None]:
test_id = test['UID']
test = test.drop(['UID','Tag'],axis = 1).fillna(-1)