<img style="float: left;" src="http://third.datacastle.cn/pkbigdata/master.other.img/8ef429f9-2032-47b8-bcc4-818fa9e41a25.png" width="50%">

In [None]:
import numpy as np
import pandas as pd
import time,os
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Directory that holds the input CSVs and the cached, sorted CSVs.
data_path = '../data/'
# File names (relative to data_path) of the sorted op / tran / tag tables.
op_train_sorted_file, tran_train_sorted_file, tag_train_sorted_file = (
    'op_train_sorted.csv',
    'tran_train_sorted.csv',
    'tag_train_sorted.csv',
)

In [None]:
# Load the three training tables.
# The cached, sorted CSVs are used only when all three of them exist;
# otherwise the raw "*_new.csv" files are read. `is_preprocessed` records
# which branch was taken so that later cells can skip the sorting step.
sorted_csv_paths = [
    data_path + op_train_sorted_file,
    data_path + tran_train_sorted_file,
    data_path + tag_train_sorted_file,
]
is_preprocessed = all(os.path.exists(csv_path) for csv_path in sorted_csv_paths)

if not is_preprocessed:
    print('[info]:start read from new train data...')
    print('[info]:start read from op_train...')
    op_train = pd.read_csv(data_path + 'operation_train_new.csv')
    print('[info]:start read from tran_train...')
    tran_train = pd.read_csv(data_path + 'transaction_train_new.csv')
    print('[info]:start read from tag_train...')
    tag_train = pd.read_csv(data_path + 'tag_train_new.csv')
else:
    print('[info]:start read from sorted data...')
    # The cached files may carry the saved index as an 'Unnamed: 0' column;
    # errors='ignore' keeps the load working when that column is absent.
    print('[info]:start read from op_train...')
    op_train = pd.read_csv(sorted_csv_paths[0]).drop(columns='Unnamed: 0', errors='ignore')
    print('[info]:start read from tran_train...')
    tran_train = pd.read_csv(sorted_csv_paths[1]).drop(columns='Unnamed: 0', errors='ignore')
    print('[info]:start read from tag_train...')
    tag_train = pd.read_csv(sorted_csv_paths[2]).drop(columns='Unnamed: 0', errors='ignore')

### 1. 数据预处理

In [None]:
def find_different_uid(op_uids, tran_uids):
    """Return the UIDs that appear on only one side.

    Neither input is modified. Membership is checked against sets, so the
    UIDs must be hashable.

    Parameters
    ----------
    op_uids : iterable
        UIDs found in the op data.
    tran_uids : iterable
        UIDs found in the tran data.

    Returns
    -------
    tuple of (list, list)
        ``(op_only, tran_only)``: the entries of ``op_uids`` that are absent
        from ``tran_uids`` and the entries of ``tran_uids`` that are absent
        from ``op_uids``, each in its original order.
    """
    # Materialise first so one-shot iterables can be scanned twice.
    op_uids = list(op_uids)
    tran_uids = list(tran_uids)
    op_lookup = set(op_uids)
    tran_lookup = set(tran_uids)

    op_diff_uids = [uid for uid in op_uids if uid not in tran_lookup]
    tran_diff_uids = [uid for uid in tran_uids if uid not in op_lookup]
    return op_diff_uids, tran_diff_uids

def find_same_uid(src1_uids, src2_uids):
    """Return the entries of ``src1_uids`` that also occur in ``src2_uids``.

    The order (and any repetition) of ``src1_uids`` is preserved. The values
    of ``src2_uids`` are collected into a set once, so lookups are constant
    time and always compare against the values themselves (the UIDs must be
    hashable).
    """
    src2_lookup = set(src2_uids)
    return [uid for uid in src1_uids if uid in src2_lookup]

In [None]:
if not is_preprocessed:
    # Build a full "YYYY-MM-DD HH:MM:SS" string from `day` + `time` and derive
    # an epoch `timestamp` from it, for both the op and the tran table.
    for frame in (op_train, tran_train):
        if 'timestamp' in frame.columns:
            # Already converted (the cell was re-run): prepending the date a
            # second time would make strptime fail on the doubled string.
            continue
        frame['time'] = frame['day'].apply(lambda x: "2018-08-%02d" % x) + ' ' + frame['time']
        # time.mktime interprets the parsed struct as local time, so the
        # absolute values depend on the machine's time zone.
        frame['timestamp'] = frame['time'].apply(lambda x:time.mktime(time.strptime(x,'%Y-%m-%d %H:%M:%S')))

    # Sort by UID first, then chronologically within each UID.
    op_train = op_train.sort_values(by=['UID', 'timestamp'],ascending=True).reset_index(drop=True)
    tran_train = tran_train.sort_values(by=['UID', 'timestamp'],ascending=True).reset_index(drop=True)
    tag_train = tag_train.sort_values(by=['UID'], ascending=True).reset_index(drop=True)

    # Cache the sorted tables. The index is written as well; the loading cell
    # drops the resulting 'Unnamed: 0' column when it reads these files back.
    op_train.to_csv(data_path + op_train_sorted_file)
    tran_train.to_csv(data_path + tran_train_sorted_file)
    tag_train.to_csv(data_path + tag_train_sorted_file)

In [None]:
# Group both tables by user.
op_train_gb = op_train.groupby('UID', as_index=False)
tran_train_gb = tran_train.groupby('UID', as_index=False)

# UIDs present in op and in tran (the group keys of each groupby).
op_train_uids = list(op_train_gb.groups.keys())
tran_train_uids = list(tran_train_gb.groups.keys())

# Tag rows restricted to the UIDs that occur in op and in tran respectively.
op_uid_mask = tag_train['UID'].isin(op_train['UID'])
tran_uid_mask = tag_train['UID'].isin(tran_train['UID'])
op_tag = tag_train[op_uid_mask]
tran_tag = tag_train[tran_uid_mask]

#### 1.1 处理缺失值

In [None]:
# Every column of the op table, kept as a plain list.
op_columns = op_train.columns.tolist()
# op features that go through LabelEncoder.
op_le_obj_fts = [
    'mode', 'os', 'version',
    'device1', 'device2',
    'device_code1', 'device_code2', 'device_code3',
    'mac1', 'mac2',
    'ip1', 'ip2',
    'wifi', 'geo_code',
    'ip1_sub', 'ip2_sub',
]

In [None]:
def get_nan_counts(gb_count, ft_columns):
    """List the features for which at least one row of ``gb_count`` is 0.

    Parameters
    ----------
    gb_count : pandas.DataFrame
        Table of counts (the callers in this notebook pass
        ``groupby('UID').count()``), one column per feature.
    ft_columns : iterable of str
        Columns of ``gb_count`` to inspect.

    Returns
    -------
    list of (str, int)
        ``(feature, n_zero_rows)`` for every feature with at least one zero
        entry, in the order of ``ft_columns``.
    """
    hasnans_features_cnts = []
    for ft in ft_columns:
        zero_rows = (gb_count[ft] == 0).sum()
        if zero_rows:
            # Label with the column name itself: the name of the Series
            # returned by value_counts() differs between pandas versions.
            hasnans_features_cnts.append((ft, zero_rows))
    return hasnans_features_cnts
        
def find_invalid_feature(gb_count, ft_columns, threshold=0.5):
    """Find features whose count is 0 in too large a share of rows.

    Parameters
    ----------
    gb_count : pandas.DataFrame
        Table of counts (the callers in this notebook pass
        ``groupby('UID').count()``), one column per feature.
    ft_columns : iterable of str
        Columns of ``gb_count`` to inspect.
    threshold : float, default 0.5
        A feature is reported when the fraction of rows with a zero count is
        strictly greater than this value.

    Returns
    -------
    list of str
        Names of the reported features, in the order of ``ft_columns``. Each
        reported feature is also printed together with its zero fraction.
    """
    invalid_features = []
    n_rows = gb_count.shape[0]
    for ft in ft_columns:
        # Number of rows whose count for this feature is 0.
        zero_rows = (gb_count[ft] == 0).sum()
        if not zero_rows:
            continue
        zero_ratio = zero_rows / n_rows
        if zero_ratio > threshold:
            # Use the column name itself: the name of the Series returned by
            # value_counts() differs between pandas versions.
            print(ft, zero_ratio)
            invalid_features.append(ft)
    return invalid_features

def remove_list_item(src_l, rm_l):
    """Remove from ``src_l``, in place, one occurrence of each item of ``rm_l``.

    Both arguments must be plain ``list`` objects. Items of ``rm_l`` that are
    not in ``src_l`` are skipped. The same (mutated) ``src_l`` is returned.
    """
    assert type(src_l) is list
    assert type(rm_l) is list

    for unwanted in rm_l:
        try:
            src_l.remove(unwanted)
        except ValueError:
            # Not present in src_l: nothing to remove.
            pass
    return src_l

In [None]:
# `success` is float-typed, so its missing values are handled separately with
# a -1 sentinel. The column is assigned back instead of using a chained
# fillna(inplace=True), which may act on a temporary copy of the column.
print('[info]: start fill nans...')
op_train['success'] = op_train['success'].fillna(-1)

# Fill the remaining gaps inside each user's rows: forward first, then
# backward. The filled columns are written back into op_train rather than
# replacing it, because GroupBy.ffill()/bfill() do not return the grouping
# column on current pandas versions and 'UID' is needed by every later groupby.
op_value_cols = [col for col in op_train.columns if col != 'UID']
op_train[op_value_cols] = op_train.groupby('UID')[op_value_cols].ffill()[op_value_cols]
op_train[op_value_cols] = op_train.groupby('UID')[op_value_cols].bfill()[op_value_cols]
op_train_gb = op_train.groupby('UID', as_index=False)

# Count per user after filling, and drop the features that are still empty
# for more than half of the users.
print('[info]: start remove invalid features...')
invalid_features = find_invalid_feature(op_train_gb.count(), op_columns)
op_columns = remove_list_item(op_columns, invalid_features)
op_le_obj_fts = remove_list_item(op_le_obj_fts, invalid_features)
op_train = op_train.drop(columns=invalid_features)
op_train_gb = op_train.groupby('UID', as_index=False)

# Fill whatever is still missing with the string sentinel '-1'.
print('[info]: start handle left nans...')
op_hasnans_features_cnts = get_nan_counts(op_train_gb.count(), op_columns)
for ft_name, _ in op_hasnans_features_cnts:
    if op_train[ft_name].hasnans:
        op_train[ft_name] = op_train[ft_name].fillna('-1')
op_train_gb = op_train.groupby('UID', as_index=False)
print('[info]: handle nans finished.')

# Label-encode the op features. One encoder is fitted per feature and kept in
# `op_label_encoders` so the mapping can be reused or inverted later.
print('[info]: start label encoding...')
le = LabelEncoder()
op_label_encoders = {}
for feature in op_le_obj_fts:
    le = LabelEncoder()
    try:
        print('[info]: %r label encoding...' %feature)
        op_train[feature] = le.fit_transform(op_train[feature])
        op_label_encoders[feature] = le
    except TypeError as e:
        # Best effort, as before: the error is reported and the column is
        # left unencoded.
        print(e)
op_train_gb = op_train.groupby('UID', as_index=False)
print('[info]: label encoding finished.')

In [None]:
# Print the 'mac1' value distribution of every user tagged as fake (Tag == 1).
# The groups are built here from op_train / op_tag so that this cell does not
# depend on variables that are only defined further down the notebook.
fake_uid_mask = op_train['UID'].isin(op_tag['UID'][op_tag['Tag'] == 1])
for fk, fake_rows in op_train[fake_uid_mask].groupby('UID'):
    print(fake_rows['mac1'].value_counts())

In [None]:
# Separate the op rows of fake users (Tag == 1) from those of normal users
# (Tag == 0) and group each part by UID.
fake_uids = op_tag.loc[op_tag['Tag'] == 1, 'UID']
normal_uids = op_tag.loc[op_tag['Tag'] == 0, 'UID']
op_train_fake_gp = op_train[op_train['UID'].isin(fake_uids)].groupby('UID', as_index=False)
op_train_normal_gp = op_train[op_train['UID'].isin(normal_uids)].groupby('UID', as_index=False)

In [None]:
# Group keys (UIDs) of the fake-user and normal-user groupbys.
fake_keys = list(op_train_fake_gp.indices.keys())
normal_keys = list(op_train_normal_gp.indices.keys())

In [None]:
# Inspect a single user from the fake groups (UID 10000): how often each
# 'mode' value occurs in that user's rows. get_group raises KeyError when
# UID 10000 is not among the fake groups.
df_10000 = op_train_fake_gp.get_group(10000)
df_10000['mode'].value_counts()

In [None]:
# Frequency of every 'mode' value over the whole op table, NaN included.
mode_counts = op_train['mode'].value_counts(dropna=False)
mode_counts

In [None]:
# Group keys (UIDs) of the fake-user groupby.
# NOTE(review): this recomputes the same `fake_keys` list that an earlier cell
# already builds; one of the two cells is probably redundant.
fake_keys = list(op_train_fake_gp.indices.keys())

#### 1.2 tran 处理

In [None]:
# Original note: "tran features that need LabelEncoder".
# NOTE(review): the list looks like the full tran column set (it includes
# 'UID', 'day', 'trans_amt', 'timestamp'), not only categorical features.
tran_columns = [
    'UID', 'channel', 'day', 'time', 'trans_amt', 'amt_src1', 'merchant',
    'code1', 'code2', 'trans_type1', 'acc_id1',
    'device_code1', 'device_code2', 'device_code3',
    'device1', 'device2', 'mac1', 'ip1',
    'bal', 'amt_src2', 'acc_id2', 'acc_id3', 'geo_code', 'trans_type2',
    'market_code', 'market_type', 'ip1_sub', 'timestamp',
]
# Both feature lists start out empty.
tran_operator_features = list()
tran_user_features = list()