<img style="float: left;" src="http://third.datacastle.cn/pkbigdata/master.other.img/8ef429f9-2032-47b8-bcc4-818fa9e41a25.png" width="50%">

In [None]:
import numpy as np
import pandas as pd
import time,os
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Directory that holds the input CSVs and the cached, sorted copies.
data_path = '../data/'

# File names (relative to data_path) of the sorted caches.
op_train_sorted_file, tran_train_sorted_file, tag_train_sorted_file = (
    'op_train_sorted.csv',
    'tran_train_sorted.csv',
    'tag_train_sorted.csv',
)

In [None]:
# Load the three training tables, preferring the cached sorted copies.
# The cache is used only when ALL three sorted files exist; otherwise the
# original CSVs are read. `is_preprocessed` records which branch was taken so
# that the preprocessing cell can be skipped when the cache was loaded.
# Every path is built from `data_path` and the `*_sorted_file` constants so the
# existence check and the actual reads can never point at different files.
sorted_cache_paths = [
    data_path + file_name
    for file_name in (op_train_sorted_file, tran_train_sorted_file, tag_train_sorted_file)
]
is_preprocessed = all(os.path.exists(path) for path in sorted_cache_paths)

if is_preprocessed:
    print('from :\nop_train_sorted.csv \ntran_train_sorted.csv \ntag_train_sorted.csv')
    op_train = pd.read_csv(sorted_cache_paths[0])
    tran_train = pd.read_csv(sorted_cache_paths[1])
    tag_train = pd.read_csv(sorted_cache_paths[2])
else:
    print('from :\noperation_train_new.csv \ntransaction_train_new.csv \ntag_train_new.csv')
    op_train = pd.read_csv(data_path + 'operation_train_new.csv')
    tran_train = pd.read_csv(data_path + 'transaction_train_new.csv')
    tag_train = pd.read_csv(data_path + 'tag_train_new.csv')

### 1.数据探索

In [None]:
def find_different_uid(op_uids, tran_uids):
    """Return the UIDs that appear on only one side.

    Unlike a remove-while-scanning approach, the inputs are never modified and
    a UID that occurs in both collections is excluded from both results even
    when it is duplicated on one side.

    Args:
        op_uids: iterable of hashable UIDs (first collection).
        tran_uids: iterable of hashable UIDs (second collection).

    Returns:
        A tuple ``(op_diff_uids, tran_diff_uids)`` of two new lists: the UIDs
        found only in ``op_uids`` and the UIDs found only in ``tran_uids``,
        each in its original order.
    """
    # Materialise once so one-shot iterables can be scanned and looked up.
    op_uids = list(op_uids)
    tran_uids = list(tran_uids)

    # Set membership keeps the whole comparison linear instead of quadratic.
    op_lookup = set(op_uids)
    tran_lookup = set(tran_uids)

    op_diff_uids = [uid for uid in op_uids if uid not in tran_lookup]
    tran_diff_uids = [uid for uid in tran_uids if uid not in op_lookup]
    return op_diff_uids, tran_diff_uids

def find_same_uid(src1_uids, src2_uids):
    """Return the UIDs of ``src1_uids`` that also occur in ``src2_uids``.

    Args:
        src1_uids: iterable of hashable UIDs; its order (and any duplicates)
            is preserved in the result.
        src2_uids: iterable of hashable UIDs to test membership against.

    Returns:
        A new list with every element of ``src1_uids`` present in ``src2_uids``.
    """
    # Build the lookup once: O(1) membership tests, and it also works when
    # src2_uids is a one-shot iterator.
    src2_lookup = set(src2_uids)
    same_uids = [uid for uid in src1_uids if uid in src2_lookup]
    return same_uids

In [None]:
if not is_preprocessed:
    # Build a full datetime string from the day number and the time-of-day
    # string; the year and month are fixed to 2018-08 by the format string.
    # Then derive a numeric timestamp from it. time.mktime interprets the
    # parsed time as local time, so the absolute values depend on the
    # machine's timezone.
    op_train['time'] = op_train['day'].apply(lambda x: "2018-08-%02d" % x) + ' ' + op_train['time']
    op_train['timestamp'] = op_train['time'].apply(lambda x:time.mktime(time.strptime(x,'%Y-%m-%d %H:%M:%S')))

    tran_train['time'] = tran_train['day'].apply(lambda x: "2018-08-%02d" % x) + ' ' + tran_train['time']
    tran_train['timestamp'] = tran_train['time'].apply(lambda x:time.mktime(time.strptime(x,'%Y-%m-%d %H:%M:%S')))

    # Sort by UID first, then chronologically within each UID.
    op_train = op_train.sort_values(by=['UID', 'timestamp'],ascending=True).reset_index(drop=True)
    tran_train = tran_train.sort_values(by=['UID', 'timestamp'],ascending=True).reset_index(drop=True)
    tag_train = tag_train.sort_values(by=['UID'], ascending=True).reset_index(drop=True)

    # Cache the sorted tables. index=False keeps the RangeIndex out of the
    # file; otherwise reloading the cache adds an 'Unnamed: 0' column that the
    # freshly preprocessed frames do not have.
    op_train.to_csv(data_path + op_train_sorted_file, index=False)
    tran_train.to_csv(data_path + tran_train_sorted_file, index=False)
    tag_train.to_csv(data_path + tag_train_sorted_file, index=False)

In [None]:
# Group both event tables by user.
op_train_gb = op_train.groupby('UID', as_index=False)
tran_train_gb = tran_train.groupby('UID', as_index=False)

# UIDs present in each table (the keys of the group mapping).
op_train_uids = list(op_train_gb.groups.keys())
tran_train_uids = list(tran_train_gb.groups.keys())

# Tag rows restricted to the users that appear in each table.
in_op_table = tag_train['UID'].isin(op_train['UID'])
in_tran_table = tag_train['UID'].isin(tran_train['UID'])
op_tag = tag_train[in_op_table]
tran_tag = tag_train[in_tran_table]

#### 1.1对op进行分析

In [None]:
# First impression of the op table: show its first 15 rows.
op_train.head(n=15)

In [None]:
# Column groups of the op table, including the features that need LabelEncoder.
op_operator_features = ['mode', 'os', 'version']

# Device / network / location columns shared by all of the lists below.
_op_device_network_features = [
    'device1', 'device2',
    'device_code1', 'device_code2', 'device_code3',
    'mac1', 'mac2',
    'ip1', 'ip2',
    'wifi', 'geo_code',
    'ip1_sub', 'ip2_sub',
]

op_user_features = _op_device_network_features + ['timestamp']

# Columns whose missing values get filled.
op_nan_features = ['mode', 'success', 'os', 'version'] + _op_device_network_features

# Columns that get label-encoded.
op_le_features = op_operator_features + _op_device_network_features

# Full column list of the op table.
op_columns = (
    ['UID', 'day', 'mode', 'success', 'time', 'os', 'version']
    + _op_device_network_features
    + ['timestamp']
)

In [None]:
# Fill missing values: 'success' gets the numeric sentinel -1, every other
# column gets the string sentinel '-1'.
# The filled column is assigned back explicitly. A chained
# `op_train[ft].fillna(..., inplace=True)` acts on an intermediate Series and
# is not guaranteed to update op_train (it has no effect under pandas
# Copy-on-Write).
for ft in op_nan_features:
    print('[info]: %r handle nan...' %ft)
    fill_value = -1 if ft == 'success' else '-1'
    op_train[ft] = op_train[ft].fillna(fill_value)

print('[info]: handle nan finished.')

# Label-encode the categorical op features. fit_transform re-fits the encoder
# on every column, so a single LabelEncoder instance can be reused.
le = LabelEncoder()
for feature in op_le_features:
    try:
        print('[info]: %r label encoding...' %feature)
        column = op_train[feature]
        # Filling a numeric column with the string '-1' leaves an object column
        # that mixes numbers and strings, which LabelEncoder rejects with a
        # TypeError. Casting object columns to str makes them uniform;
        # purely numeric columns are encoded unchanged.
        if column.dtype == object:
            column = column.astype(str)
        op_train[feature] = le.fit_transform(column)
    except TypeError as e:
        # Best effort: report the problem and continue with the next feature.
        print(e)
print('[info]: label encoding finished.')

In [None]:
# Separate the op rows of fake users (Tag == 1) from those of normal users
# (Tag == 0), then group each subset by UID.
fake_uids = op_tag.loc[op_tag['Tag'] == 1, 'UID']
normal_uids = op_tag.loc[op_tag['Tag'] == 0, 'UID']

op_train_fake = op_train[op_train['UID'].isin(fake_uids)]
op_train_normal = op_train[op_train['UID'].isin(normal_uids)]

op_train_fake_gp = op_train_fake.groupby('UID', as_index=False)
op_train_normal_gp = op_train_normal.groupby('UID', as_index=False)

In [None]:
# Summary statistics (pandas `describe`) for each UID group in op_train_fake_gp.
op_train_fake_des = op_train_fake_gp.describe()

In [None]:
# Inspect a single user from the fake group: how often each 'mode' value
# occurs in the op rows of UID 10000.
# NOTE(review): get_group raises KeyError if UID 10000 is not in
# op_train_fake_gp; confirm the UID against the data in use.
df_10000 = op_train_fake_gp.get_group(10000)
df_10000['mode'].value_counts()

In [None]:
# Frequency of each 'mode' value over the whole op table; dropna=False also
# counts NaN as its own bucket.
# NOTE(review): an earlier cell fills NaN in 'mode' and label-encodes the
# column, so after that cell has run no NaN bucket is expected here.
op_train['mode'].value_counts(dropna=False)

In [None]:
# Print every UID of the fake group together with the row positions that
# belong to it.
fake_group_rows = op_train_fake_gp.indices
for key in fake_group_rows:
    item = fake_group_rows[key]
    print(key, item)

#### 1.2 tran处理

In [None]:
# Column groups of the tran table, including the features that need LabelEncoder.
tran_columns = [
    'UID', 'channel', 'day', 'time',
    'trans_amt', 'amt_src1', 'merchant',
    'code1', 'code2', 'trans_type1', 'acc_id1',
    'device_code1', 'device_code2', 'device_code3',
    'device1', 'device2', 'mac1', 'ip1',
    'bal', 'amt_src2', 'acc_id2', 'acc_id3',
    'geo_code', 'trans_type2',
    'market_code', 'market_type',
    'ip1_sub', 'timestamp',
]
# Both feature groups start out empty.
tran_operator_features = list()
tran_user_features = list()