In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
import catboost as cb
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold,TimeSeriesSplit,KFold,GroupKFold
from sklearn.metrics import roc_auc_score
import sqlite3
import xgboost as xgb
import datetime
from sklearn.linear_model import LogisticRegression
from scipy.stats import pearsonr
import gc
from sklearn.model_selection import TimeSeriesSplit
import hashlib

In [2]:
import builtins
import os
import shlex

# Always bind to the real builtin. With `__print__ = print`, re-running this
# cell would bind __print__ to the override below and make it call itself.
__print__ = builtins.print
def print(string):
    """Print ``string`` in the notebook and echo it through the shell as well.

    Parameters
    ----------
    string : object
        Value to show; it is converted with ``str()`` for the shell echo.
    """
    __print__(string)
    # Quote the message so quotes, `$`, backticks etc. are echoed literally
    # instead of being interpreted (or executed) by the shell.
    os.system(f'echo {shlex.quote(str(string))}')

# 读取数据并进行预处理

In [3]:
# Parse DeviceInfo, id_30 and id_31 into coarser features and add a flag
# marking that the row has identity data.

def id_split(df):
    """Add device/OS/browser features to an identity frame, in place.

    New columns: ``device_name`` and ``device_version`` (parts of DeviceInfo
    before/after the first '/'), ``OS_id_30`` and ``browser_id_31`` (first
    word of id_30 / id_31) and ``had_id`` (constant 1).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the string columns 'DeviceInfo', 'id_30' and 'id_31'.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    device_parts = df['DeviceInfo'].str.split('/', expand=True)
    df['device_name'] = device_parts[0]
    df['device_version'] = device_parts[1]

    df['OS_id_30'] = df['id_30'].str.split(' ', expand=True)[0]
    df['browser_id_31'] = df['id_31'].str.split(' ', expand=True)[0]

    # (substring, brand) rules; they are applied in this order and a later
    # rule sees the result of the earlier ones.
    brand_rules = (
        ('SM', 'Samsung'),
        ('SAMSUNG', 'Samsung'),
        ('GT-', 'Samsung'),
        ('Moto G', 'Motorola'),
        ('Moto', 'Motorola'),
        ('moto', 'Motorola'),
        ('LG-', 'LG'),
        ('rv:', 'RV'),
        ('HUAWEI', 'Huawei'),
        ('ALE-', 'Huawei'),
        ('-L', 'Huawei'),
        ('Blade', 'ZTE'),
        ('BLADE', 'ZTE'),
        ('Linux', 'Linux'),
        ('XT', 'Sony'),
        ('HTC', 'HTC'),
        ('ASUS', 'Asus'),
    )
    for marker, brand in brand_rules:
        df.loc[df['device_name'].str.contains(marker, na=False), 'device_name'] = brand

    # Categories that are too sparse (fewer than 100 rows) are merged into "Others"
    name_counts = df.device_name.value_counts()
    rare_names = name_counts[name_counts < 100].index
    df.loc[df.device_name.isin(rare_names), 'device_name'] = "Others"
    df['had_id'] = 1
    gc.collect()

    return df

In [4]:
# Load the four competition CSVs: transaction and identity tables for train and test.
train_transaction, train_identity, test_transaction, test_identity = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/ieee-fraud-detection/train_transaction.csv',
        '../input/ieee-fraud-detection/train_identity.csv',
        '../input/ieee-fraud-detection/test_transaction.csv',
        '../input/ieee-fraud-detection/test_identity.csv',
    )
)

In [5]:
# Report (rows, columns) of each raw table.
for raw_table in (train_transaction, train_identity, test_transaction, test_identity):
    print(raw_table.shape)
del raw_table  # do not keep an extra reference to the last table

(590540, 394)
(144233, 41)
(506691, 393)
(141907, 41)


In [6]:
# Show the identity column names of the train file (they use the 'id_NN' form).
train_identity.columns

Index(['TransactionID', 'id_01', 'id_02', 'id_03', 'id_04', 'id_05', 'id_06',
       'id_07', 'id_08', 'id_09', 'id_10', 'id_11', 'id_12', 'id_13', 'id_14',
       'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22',
       'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29', 'id_30',
       'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38',
       'DeviceType', 'DeviceInfo'],
      dtype='object')

In [7]:
# Show the identity column names of the test file (they use 'id-NN', with a hyphen).
test_identity.columns.values

array(['TransactionID', 'id-01', 'id-02', 'id-03', 'id-04', 'id-05',
       'id-06', 'id-07', 'id-08', 'id-09', 'id-10', 'id-11', 'id-12',
       'id-13', 'id-14', 'id-15', 'id-16', 'id-17', 'id-18', 'id-19',
       'id-20', 'id-21', 'id-22', 'id-23', 'id-24', 'id-25', 'id-26',
       'id-27', 'id-28', 'id-29', 'id-30', 'id-31', 'id-32', 'id-33',
       'id-34', 'id-35', 'id-36', 'id-37', 'id-38', 'DeviceType',
       'DeviceInfo'], dtype=object)

In [8]:
# The test file names its identity columns 'id-01' … 'id-38' while the train
# file uses 'id_01' … 'id_38'. Rename by replacing the hyphen rather than by
# assigning a hand-built list positionally, so a column can never be
# mislabelled if the order of the columns in the file differs.
test_identity.columns = test_identity.columns.str.replace('-', '_', regex=False)

In [9]:
# Add the parsed device/OS/browser features to both identity tables (train first, then test).
train_identity, test_identity = map(id_split, (train_identity, test_identity))

In [10]:
# Shapes of the identity tables after id_split.
for identity_table in (train_identity, test_identity):
    print(identity_table.shape)
del identity_table  # do not keep an extra reference to the last table

(144233, 46)
(141907, 46)


In [11]:
# 5 fields were added; show the value distribution of the four parsed ones
for parsed_column in ('device_name', 'device_version', 'OS_id_30', 'browser_id_31'):
    print(train_identity[parsed_column].value_counts())

Windows            47722
iOS Device         19782
MacOS              12573
Samsung            12092
Trident             7440
Others              4978
RV                  4385
Motorola            2935
Huawei              2377
LG                  2331
Sony                 575
ZTE                  518
HTC                  406
hi6210sft Build      190
F3213 Build          125
Linux                121
F5121 Build          116
Name: device_name, dtype: int64
7.0              7440
NRD90M           5908
MMB29K           1874
MRA58K           1446
MMB29M           1342
                 ... 
HUAWEILYO-L21       1
NJH47F              1
34.2.A.2.47         1
Q1010               1
V41020c             1
Name: device_version, Length: 293, dtype: int64
Windows    36739
iOS        19782
Mac        13580
Android     6303
Linux       1136
other         15
func          10
Name: OS_id_30, dtype: int64
chrome               76059
mobile               28379
ie                    9733
safari                89

In [12]:
# Left-join identity onto transactions on TransactionID, keeping every transaction row.
train = train_transaction.merge(train_identity, how='left', on='TransactionID')
test = test_transaction.merge(test_identity, how='left', on='TransactionID')

In [13]:
# Fingerprint of each merged frame: the sum of the per-row hashes.
# The commented number under each call is the value recorded from a run, kept for comparison.
# The built-in sum() is used here; the recorded totals are larger than a 64-bit integer can hold.
print(sum(pd.util.hash_pandas_object(train)))
#5447467732477208009266664
print(sum(pd.util.hash_pandas_object(test)))
#4667966862746899078655461

5447467732477208009266664
4667966862746899078655461


In [14]:
# The raw tables are no longer needed once `train` and `test` exist.
del train_transaction, test_transaction, train_identity, test_identity

In [15]:
# train = train[:5000]
# test = test[:5000]

# 特征工程

In [16]:
# Number of train rows; later cells use it to split train+test concatenations back apart.
train_len = len(train)

In [17]:
# Add calendar indices derived from TransactionDT:
#    DT_D  day index,
#    DT_W  week index,
#    DT_M  month index
START_DATE = '2017-11-30'
startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')
for df in [train,test]:
    # TransactionDT is an offset in seconds from START_DATE. Convert the whole
    # column at once instead of building one datetime per row with apply().
    df['TransactionDT'] = pd.Timestamp(startdate) + pd.to_timedelta(df['TransactionDT'], unit='s')
    stamp = df['TransactionDT'].dt
    # Series.dt.weekofyear was deprecated in pandas 1.1 and removed in 2.0;
    # isocalendar().week returns the same ISO week number. Fall back to the
    # old accessor on pandas versions that do not have isocalendar().
    if hasattr(stamp, 'isocalendar'):
        iso_week = stamp.isocalendar().week.astype(np.int64)
    else:
        iso_week = stamp.weekofyear
    df['DT_D'] = ((stamp.year-2017)*365 + stamp.dayofyear).astype(np.int16)
    # NOTE(review): ISO weeks do not line up with calendar years (e.g. 2018-12-31
    # is ISO week 1), so year*52 + week can map days around New Year onto the
    # same index as a different week of that year — confirm this is acceptable.
    df['DT_W'] = (stamp.year-2017)*52 + iso_week
    df['DT_M'] = (stamp.year-2017)*12 + stamp.month
del stamp, iso_week

In [18]:
# Number of train rows per ProductCD category.
train.ProductCD.value_counts()

W    439670
C     68519
R     37699
H     33024
S     11628
Name: ProductCD, dtype: int64

    因为意识到ProductCD各个种类在欺诈和时间序列上的表现差异很大，所以将它们的count_encoding拆解为5个指标

In [19]:
#### R
# Size of the row's (ProductCD, DT_D) group, i.e. the number of same-product
# transactions on the same day, computed separately inside train and test.
train['ProductCD_R_Day'] = train.groupby(['ProductCD','DT_D'])['isFraud'].transform('count')
test['ProductCD_R_Day'] = test.groupby(['ProductCD','DT_D'])['TransactionAmt'].transform('count')
# Rows of any other product get the sentinel value.
for df in (train, test):
    df.loc[df.ProductCD != 'R', 'ProductCD_R_Day'] = -999

In [20]:
#### H
# Size of the row's (ProductCD, DT_D) group, i.e. the number of same-product
# transactions on the same day, computed separately inside train and test.
train['ProductCD_H_Day'] = train.groupby(['ProductCD','DT_D'])['isFraud'].transform('count')
test['ProductCD_H_Day'] = test.groupby(['ProductCD','DT_D'])['TransactionAmt'].transform('count')
# Rows of any other product get the sentinel value.
for df in (train, test):
    df.loc[df.ProductCD != 'H', 'ProductCD_H_Day'] = -999

In [21]:
#### C
# Size of the row's (ProductCD, DT_D) group, i.e. the number of same-product
# transactions on the same day, computed separately inside train and test.
train['ProductCD_C_Day'] = train.groupby(['ProductCD','DT_D'])['isFraud'].transform('count')
test['ProductCD_C_Day'] = test.groupby(['ProductCD','DT_D'])['TransactionAmt'].transform('count')
# Rows of any other product get the sentinel value.
# NOTE(review): this cell uses 999999 while the R/H/W/S cells use -999 — confirm it is intentional.
for df in (train, test):
    df.loc[df.ProductCD != 'C', 'ProductCD_C_Day'] = 999999

In [22]:
#### W
# Size of the row's (ProductCD, DT_D) group, i.e. the number of same-product
# transactions on the same day, computed separately inside train and test.
train['ProductCD_W_Day'] = train.groupby(['ProductCD','DT_D'])['isFraud'].transform('count')
test['ProductCD_W_Day'] = test.groupby(['ProductCD','DT_D'])['TransactionAmt'].transform('count')
# Rows of any other product get the sentinel value.
for df in (train, test):
    df.loc[df.ProductCD != 'W', 'ProductCD_W_Day'] = -999

In [23]:
#### S
# Size of the row's (ProductCD, DT_D) group, i.e. the number of same-product
# transactions on the same day, computed separately inside train and test.
train['ProductCD_S_Day'] = train.groupby(['ProductCD','DT_D'])['isFraud'].transform('count')
test['ProductCD_S_Day'] = test.groupby(['ProductCD','DT_D'])['TransactionAmt'].transform('count')
# Rows of any other product get the sentinel value.
for df in (train, test):
    df.loc[df.ProductCD != 'S', 'ProductCD_S_Day'] = -999

In [24]:
# 5 features were added; they are the last five columns
print(list(train.columns[-5:]))

['ProductCD_R_Day', 'ProductCD_H_Day', 'ProductCD_C_Day', 'ProductCD_W_Day', 'ProductCD_S_Day']


In [25]:
# Use the card fields (plus address fields) and open_card as an id that identifies a unique user
uid_fields = ['card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'open_card']
for df in (train, test):
    df['open_card'] = df.DT_D - df.D1
    df['first_tran'] = df.DT_D - df.D2

    # Space-separated concatenation of the string form of every uid field.
    uid = df[uid_fields[0]].astype(str)
    for field in uid_fields[1:]:
        uid = uid + ' ' + df[field].astype(str)
    df['uid1'] = uid
del uid

In [26]:
# Number of distinct uid1 values in train, then in test.
for df in (train, test):
    print(df['uid1'].nunique())

222518
198011


In [27]:
# Identify a device by id_30, id_31, id_32, id_33, DeviceType and DeviceInfo
def device_hash(x):
    """Return a 15-hex-character fingerprint of the device fields of one row.

    The string forms of the six fields are concatenated (no separator),
    hashed with SHA-256, and the first 15 hex digits are returned.

    Parameters
    ----------
    x : mapping
        Anything indexable by the six field names, e.g. a DataFrame row.
    """
    fields = ('id_30', 'id_31', 'id_32', 'id_33', 'DeviceType', 'DeviceInfo')
    fingerprint = ''.join(str(x[name]) for name in fields)
    return hashlib.sha256(fingerprint.encode('utf-8')).hexdigest()[:15]

In [28]:
# Compute the device fingerprint row by row for both frames.
for df in (train, test):
    df['device_hash'] = df.apply(device_hash, axis=1)

In [29]:
# Number of distinct devices seen for the same uid (train and test combined)
concat_df = pd.concat([df[['uid1', 'device_hash']] for df in (train, test)])
tmp = concat_df.groupby('uid1')['device_hash'].nunique().to_frame('nunique')

In [30]:
# Attach the per-uid device count held in `tmp` to every row.
for df in (train, test):
    df['uid_device_nunique'] = df['uid1'].map(tmp['nunique'])

In [31]:
# Number of distinct uids seen on the same device (train and test combined)
tmp = concat_df.groupby('device_hash')['uid1'].nunique().to_frame('nunique')
for df in (train, test):
    df['device_uid_nunique'] = df['device_hash'].map(tmp['nunique'])

In [32]:
# The combined uid/device frame is not used after this point.
del concat_df

In [33]:
# Number of decimal places (0-3) needed to write the amount after rounding to 3 decimals
def change(hoge):
    """Return how many decimal places ``hoge`` uses once rounded to 3 decimals.

    Trailing zeros do not count: 1.5 -> 1, 12.34 -> 2, 0.251 -> 3, 100 -> 0.

    Parameters
    ----------
    hoge : float
        The amount. It must be a finite number (``int()`` fails on NaN).

    Returns
    -------
    int
        A value between 0 and 3.
    """
    # Work in whole thousandths so the digits can be examined as an integer.
    thousandths = int(np.round(np.round(hoge, 3) * 1000))
    num = 3
    # Strip at most three trailing zeros. The bound on `num` makes the loop
    # terminate for 0 (which is divisible by 10 forever), and integer division
    # keeps the value an exact int instead of drifting into floats.
    while num > 0 and thousandths % 10 == 0:
        num -= 1
        thousandths //= 10
    return num
  
# Count the decimal places of every transaction amount in both frames.
for df in (train, test):
    df['decimal_digit'] = df['TransactionAmt'].map(change)

gc.collect()

0

In [34]:
# Rows without identity data are filled with 0
for df in (train, test):
    df['had_id'] = df['had_id'].fillna(0)

In [35]:
### scale
### The D-series values tend to grow over time (later data is larger than
### earlier data), so each value is divided by the maximum of its period,
### which keeps the relative relationship.
scale_groups = (
    ('DT_W', ['D1', 'D2', 'D4', 'D6', 'D10', 'D11', 'D12', 'D14', 'D15']),  # scaled by the weekly maximum
    ('DT_M', ['D3', 'D5', 'D7', 'D8', 'D13']),                              # scaled by the monthly maximum
)
for period, d_columns in scale_groups:
    for t in d_columns:
        for df in (train, test):
            df[t+'_revised'] = df[t] / df.groupby(period)[t].transform('max')

In [36]:
# Test rows of week index 78 get their D14_revised multiplied by 530/900; missing values stay missing.
# NOTE(review): 900 and 530 are unexplained constants — confirm where they come from.
week_78 = test.DT_W == 78
test.loc[week_78, 'D14_revised'] = test.loc[week_78, 'D14_revised'] / 900 * 530

In [37]:
### Finer time features: day of the week and hour of the day
for df in (train, test):
    df['dow'] = df['TransactionDT'].dt.dayofweek
    df['hour'] = df['TransactionDT'].dt.hour
    # 1 when purchaser and recipient e-mail domains are equal, else 0
    # (two missing values compare as not equal).
    df['email_domain_comp'] = (df['P_emaildomain'].values == df['R_emaildomain'].values).astype(int)
    df.drop(['D9'], axis=1, inplace=True)

In [38]:
# Categorical variables; these columns are passed through LabelEncoder
cat_columns = ['uid1','id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29',
            'id_30', 'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo', 'ProductCD', 'card4', 'card6', 'M4','P_emaildomain',
            'R_emaildomain', 'card1', 'card2', 'card3',  'card5', 'addr1', 'addr2', 'M1', 'M2', 'M3', 'M5', 'M6', 'M7', 'M8', 'M9','hour','dow','device_name', 'device_version', 'OS_id_30',  'browser_id_31']
# Columns that get count encoding
count_columns = ['uid1','id_13','id_14','id_17','id_18','id_19','id_20','id_21',
                 'id_22','id_24','id_25','id_26','id_30','id_31','id_33',
                 'DeviceInfo','card6','P_emaildomain','R_emaildomain','card1',
                 'card2','card3','card5','addr1','addr2','hour','device_version','OS_id_30','browser_id_31']

In [39]:
# Progress marker: the column lists are defined, label encoding comes next.
print('check1')

check1


In [40]:
# Label-encode each categorical column with one encoder fitted on the string
# form of the train and test values together, so both frames share the codes.
for f in cat_columns:
    train_text, test_text = (list(frame[f].astype(str)) for frame in (train, test))
    lbl = LabelEncoder()
    lbl.fit(train_text + test_text)
    train[f] = lbl.transform(train_text)
    test[f] = lbl.transform(test_text)
del train_text, test_text

In [41]:
# Replace every remaining missing value with the sentinel -999.
for df in (train, test):
    df.fillna(-999, inplace=True)

In [42]:
# Count encoding: how often each value occurs in train and test combined
for column in count_columns:
    combined_counts = pd.concat([train[column], test[column]], ignore_index=True).value_counts(dropna=False)
    for df in (train, test):
        df[column + '_count_full'] = df[column].map(combined_counts)

In [43]:
# Progress marker: count encoding is done.
print('check1.5')

check1.5


In [44]:
# Number of transactions in each calendar hour and each calendar day (train and test combined)
train_test_all = pd.concat([train[['TransactionDT','TransactionAmt']],test[['TransactionDT','TransactionAmt']]],ignore_index=True,sort=False)
stamps = train_test_all.TransactionDT
period_keys = {
    'day_count': stamps.dt.date,
    # the first 13 characters of the timestamp text are 'YYYY-MM-DD HH'
    'hour_count': stamps.map(lambda x:str(x)[:13]),
}
for period, key in period_keys.items():
    train_test_all[period] = train_test_all.groupby(key)['TransactionAmt'].transform('count')
# The first train_len rows of the combined frame are train, the rest are test.
for period in period_keys:
    train[period] = train_test_all[period].iloc[:train_len].tolist()
    test[period] = train_test_all[period].iloc[train_len:].tolist()
del stamps, period_keys, key

In [45]:
# #计算自然小时，自然天中的交易总金额
# train_test_all = pd.concat([train[['TransactionDT','TransactionAmt']],test[['TransactionDT','TransactionAmt']]],ignore_index=True,sort=False)
# train_test_all['day_sum'] = train_test_all.groupby(train_test_all.TransactionDT.dt.date)['TransactionAmt'].transform('sum')
# train_test_all['hour_sum'] = train_test_all.groupby(train_test_all.TransactionDT.map(lambda x:str(x)[:13]))['TransactionAmt'].transform('sum')
# train['day_sum'] = train_test_all[:590540].day_sum.tolist()
# test['day_sum'] = train_test_all[590540:].day_sum.tolist()
# train['hour_sum'] = train_test_all[:590540].hour_sum.tolist()
# test['hour_sum'] = train_test_all[590540:].hour_sum.tolist()

In [46]:
# #计算自然小时，自然天中的交易平均金额
# train_test_all = pd.concat([train[['TransactionDT','TransactionAmt']],test[['TransactionDT','TransactionAmt']]],ignore_index=True,sort=False)
# train_test_all['day_mean'] = train_test_all.groupby(train_test_all.TransactionDT.dt.date)['TransactionAmt'].transform('mean')
# train_test_all['hour_mean'] = train_test_all.groupby(train_test_all.TransactionDT.map(lambda x:str(x)[:13]))['TransactionAmt'].transform('mean')
# train['day_mean'] = train_test_all[:590540].day_mean.tolist()
# test['day_mean'] = train_test_all[590540:].day_mean.tolist()
# train['hour_mean'] = train_test_all[:590540].hour_mean.tolist()
# test['hour_mean'] = train_test_all[590540:].hour_mean.tolist()

In [47]:
### Determine a product id from the price and the product category
for df in (train, test):
    df['ProductID'] = df['TransactionAmt'].astype(str) + '_' + df['ProductCD'].astype(str)
le = LabelEncoder()
le.fit(train['ProductID'].tolist() + test['ProductID'].tolist())
for df in (train, test):
    df['ProductID'] = le.transform(df['ProductID'].tolist())

# Count encoding of the new id over train and test combined.
product_counts = pd.concat([train['ProductID'], test['ProductID']], ignore_index=True).value_counts(dropna=False)
for df in (train, test):
    df['ProductID_count_full'] = df['ProductID'].map(product_counts)

In [48]:
# Progress marker: ProductID and its count encoding are done.
print('check2')

check2


In [49]:
###  In the same way, build more crossed categorical features
temp = ['DeviceInfo__P_emaildomain',
        'card1__card5',
        'card2__id_20',
        'card5__P_emaildomain',
        'addr1__card1',
        'addr1__addr2',
        'card1__card2',
        'card2__addr1',
        'card1__P_emaildomain',
        'card2__P_emaildomain',
        'addr1__P_emaildomain',
        'DeviceInfo__id_31',
        'DeviceInfo__id_20',
        'DeviceType__id_31',
        'DeviceType__id_20',
        'DeviceType__P_emaildomain',
        'card1__M4',
        'card2__M4',
        'addr1__M4',
        'P_emaildomain__M4',
        'uid1__ProductID',
        'uid1__DeviceInfo']

# First pass: '<f1>_<f2>' string for every pair, label-encoded with an encoder
# fitted on train and test together.
for feature in temp:
    f1, f2 = feature.split('__')
    for df in (train, test):
        df[feature] = df[f1].astype(str) + '_' + df[f2].astype(str)

    le = LabelEncoder()
    le.fit(train[feature].tolist() + test[feature].tolist())
    for df in (train, test):
        df[feature] = le.transform(df[feature].tolist())

# Second pass (kept separate so all count columns come after all crossed
# columns): count encoding of every crossed feature.
for feature in temp:
    full_counts = pd.concat([train[feature], test[feature]], ignore_index=True).value_counts(dropna=False)
    for df in (train, test):
        df[feature + '_count_full'] = df[feature].map(full_counts)

In [50]:
# Progress marker: the crossed categorical features are done.
print('check3')

check3


In [51]:
###  Cross categorical with continuous variables: mean and std of each
###  continuous feature within each category (train and test combined)
con_fea = ['V258','C1','C14','C13','TransactionAmt','D15_revised','D2_revised','id_02','dist1','V294','C11']
cat_fea = ['card1','card2','addr1','card4','R_emaildomain','P_emaildomain','ProductID','uid1']
train_test = pd.concat([train[con_fea+cat_fea],test[con_fea+cat_fea]],ignore_index=True,sort=False)

for cont in con_fea:
    # -999 is the fill value written by the earlier fillna; turn it back into
    # NaN so it is ignored by mean/std. This depends only on `cont`, so it is
    # computed once here instead of four times per category.
    values = train_test[cont].where(train_test[cont] != -999)
    for cat in cat_fea:
        # One groupby per (cont, cat) pair serves both statistics and both frames.
        grouped = values.groupby(train_test[cat])
        group_mean = grouped.transform('mean')
        group_std = grouped.transform('std')
        # The first train_len rows of the combined frame are train, the rest are test.
        for df, rows in ((train, slice(None, train_len)), (test, slice(train_len, None))):
            df[cont+'_'+cat+'_mean'] = group_mean.iloc[rows].tolist()
            df[cont+'_'+cat+'_std'] = group_std.iloc[rows].tolist()
del values, grouped, group_mean, group_std

In [52]:
# Fill the missing values produced by the group statistics with the sentinel -999.
for df in (train, test):
    df.fillna(-999, inplace=True)

In [53]:
# Progress marker: the group statistics are done.
print('check4')

check4


In [54]:
# Release the leftover loop variable and the two combined helper frames.
del df, train_test, train_test_all
gc.collect()

0

In [55]:
# Remove the same set of columns (date indices, raw D columns and a few others) from both frames.
obsolete_columns = ['DeviceInfo','device_version','DT_D','DT_W','DT_M','D15',
                    'D2','D1','D4','D6','D10','D11','D12','D3','D5','D7','D8','D13','D14','TransactionAmt_ProductID_mean']
train.drop(columns=obsolete_columns, inplace=True)
test.drop(columns=obsolete_columns, inplace=True)

In [56]:
### Drop the features ranked after the first 350, ordered by feature importance
### (the list is hard-coded; the commented lines show how it was read from a file)
# orders = pd.read_csv('importance.csv')
# drop = orders.loc[350:,'Unnamed: 0'].tolist()
drop = ['V256', 'V223', 'V19', 'V244', 'V324', 'V37', 'V200', 'card3', 'C1_P_emaildomain_mean', 'V131', 'V35', 'id_17_count_full', 'id_17', 'V30', 'V163', 'V81', 'V332', 'V164', 'D7_revised', 'id_02_R_emaildomain_std', 'V336', 'M9', 'V301', 'V251', 'M8', 'V275', 'V5', 'V272', 'V331', 'V215', 'V291', 'V129', 'C11_P_emaildomain_mean', 'id_34', 'V160', 'V139', 'V124', 'V159', 'V274', 'V59', 'V126', 'V52', 'V79', 'V271', 'V24', 'V137', 'V286', 'TransactionAmt_R_emaildomain_std', 'V335', 'V115', 'V198', 'V234', 'V298', 'V43', 'V258_card4_mean', 'V264', 'OS_id_30_count_full', 'V267', 'V169', 'V217', 'C3', 'V23', 'V287', 'id_18_count_full', 'V96', 'V208', 'card4', 'id_32', 'V232', 'V188', 'V4', 'V7', 'id_38', 'id_02_R_emaildomain_mean', 'V293', 'V219', 'V259', 'V276', 'C1_R_emaildomain_mean', 'V279', 'V102', 'V253', 'C13_R_emaildomain_std', 'V245', 'V73', 'browser_id_31', 'V74', 'V322', 'V209', 'V203', 'V273', 'V221', 'V40', 'V242', 'V289', 'D15_revised_R_emaildomain_std', 'C14_R_emaildomain_std', 'V150', 'V316', 'V239', 'V265', 'V278', 'V166', 'V172', 'V132', 'V93', 'V58', 'C1_R_emaildomain_std', 'V29', 'V300', 'V134', 'V254', 'V145', 'V141', 'C11_P_emaildomain_std', 'V292', 'V210', 'V231', 'V280', 'V158', 'V123', 'V135', 'V220', 'V39', 'V26', 'V238', 'id_11', 'V319', 'V125', 'id_37', 'V206', 'C1_card4_mean', 'V94', 'V304', 'V57', 'V270', 'V33', 'V170', 'V202', 'V218', 'V108', 'V303', 'V213', 'V222', 'V64', 'V263', 'V326', 'V10', 'V147', 'V101', 'V142', 'V97', 'V214', 'V105', 'V60', 'V171', 'V329', 'ProductCD', 'V216', 'V34', 'V25', 'V6', 'TransactionAmt_card4_mean', 'V212', 'V250', 'V3', 'V63', 'V194', 'id_36', 'V178', 'V42', 'V85', 'V193', 'V290', 'id_23', 'V258_card4_std', 'V15', 'V288', 'id_15', 'V182', 'V2', 'V192', 'V260', 'V235', 'id_26_count_full', 'V138', 'id_24', 'id_10', 'C1_card4_std', 'V11', 'id_08', 'id_25_count_full', 'id_07', 'V167', 'V51', 'V229', 'V248', 'V197', 'V230', 'V144', 'V233', 'V157', 'dist1_card4_std', 'V284', 'V140', 'addr2_count_full', 
'V154', 'V22', 'V204', 'M1', 'V71', 'V211', 'V255', 'V72', 'TransactionAmt_card4_std', 'V1', 'V80', 'V184', 'V299', 'C11_R_emaildomain_mean', 'V173', 'V177', 'id_04', 'D15_revised_card4_std', 'V180', 'V228', 'V151', 'V186', 'OS_id_30', 'V109', 'DeviceType', 'V18', 'V17', 'id_26', 'V247', 'V9', 'V191', 'V148', 'V65', 'V196', 'id_21', 'V297', 'V46', 'V338', 'addr2', 'V95', 'V92', 'dist1_card4_mean', 'V334', 'V100', 'id_25', 'V179', 'V104', 'V116', 'V16', 'V183', 'id_21_count_full', 'V302', 'V199', 'V227', 'C11_R_emaildomain_std', 'V176', 'V249', 'V237', 'V327', 'id_16', 'V155', 'V8', 'V252', 'V175', 'V339', 'V330', 'V181', 'V190', 'C14_card4_mean', 'V14', 'V337', 'C14_card4_std', 'id_35', 'id_02_card4_mean', 'V110', 'id_12', 'V226', 'V168', 'V21', 'V153', 'V195', 'id_02_card4_std', 'V236', 'V174', 'id_28', 'V84', 'V32', 'V106', 'V41', 'V111', 'V112', 'V114', 'V146', 'V328', 'V50', 'id_29', 'C13_card4_mean', 'V103', 'V98', 'V121', 'id_24_count_full', 'D2_revised_card4_mean', 'had_id', 'V113', 'D2_revised_card4_std', 'V240', 'TransactionAmt_ProductID_std', 'V185', 'id_22_count_full', 'id_22', 'V31', 'C13_card4_std', 'V68', 'V88', 'V294_card4_std', 'V294_card4_mean', 'V122', 'dist1_R_emaildomain_mean', 'V118', 'V269', 'V107', 'V305', 'V117', 'V119', 'V120', 'C11_card4_mean', 'C11_card4_std', 'dist1_R_emaildomain_std', 'V89', 'V241', 'id_27', 'V325', 'V28', 'D15_revised_card4_mean', 'V27']

In [57]:
# Split the drop list into the first 200 names and the rest; each part is dropped in its own cell.
drop1, drop2 = drop[:200], drop[200:]

In [58]:
# Drop the first part of the low-importance features from both frames.
train.drop(columns=drop1, inplace=True)
test.drop(columns=drop1, inplace=True)

In [59]:
# Drop the remaining low-importance features from both frames.
train.drop(columns=drop2, inplace=True)
test.drop(columns=drop2, inplace=True)

In [60]:
# Separate the target from the features; the id and timestamp columns are not features.
non_feature_columns = ['TransactionID', 'TransactionDT']
y_train = train['isFraud'].copy()
X_train = train.drop(columns=['isFraud'] + non_feature_columns)
X_test = test.drop(columns=non_feature_columns)

In [61]:
# X_train, X_test and y_train replace the full frames from here on.
del train,test

In [62]:
# Shapes of the train and test feature matrices.
for feature_matrix in (X_train, X_test):
    print(feature_matrix.shape)
del feature_matrix  # do not keep an extra reference to X_test

(590540, 359)
(506691, 359)


In [63]:
# Candidate categorical feature names: the label-encoded columns, ProductID and
# the crossed features. The next cell removes the names that are in `drop`.
cat = ['uid1','id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29',
            'id_30', 'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType', 'ProductCD', 'card4', 'card6', 'M4','P_emaildomain',
            'R_emaildomain', 'card1', 'card2', 'card3',  'card5', 'addr1', 'addr2', 'M1', 'M2', 'M3', 'M5', 'M6', 'M7', 'M8', 'M9','hour','dow','device_name', 'OS_id_30',  'browser_id_31','ProductID',
'DeviceInfo__P_emaildomain', 
        'card1__card5', 
        'card2__id_20',
        'card5__P_emaildomain', 
        'addr1__card1',
        'addr1__addr2',
        'card1__card2',
        'card2__addr1',
        'card1__P_emaildomain',
        'card2__P_emaildomain',
        'addr1__P_emaildomain',
        'DeviceInfo__id_31',
        'DeviceInfo__id_20',
        'DeviceType__id_31',
        'DeviceType__id_20',
        'DeviceType__P_emaildomain',
        'card1__M4',
        'card2__M4',
        'addr1__M4',
        'P_emaildomain__M4',
       'uid1__ProductID',
       'uid1__DeviceInfo']

In [64]:
# Keep only the categorical names whose columns were not dropped; the list is updated in place.
dropped_names = set(drop)
cat[:] = [name for name in cat if name not in dropped_names]

In [65]:
# remove this call for catboost
# for column in cat:
#     train_set = set(X_train[column])
#     test_set = set(X_test[column])
#     tt = train_set.intersection(test_set)
#     print('----------------------------------------')
#     print(column)
#     print(f'train:{len(tt)/len(train_set)}')
#     print(f'test:{len(tt)/len(test_set)}')
#     X_train[column] = X_train[column].map(lambda x: -999 if x not in tt else x)
#     X_test[column] = X_test[column].map(lambda x: -999 if x not in tt else x)

In [66]:
# Remove the device_hash column from both feature matrices.
X_train.drop(columns=['device_hash'], inplace=True)
X_test.drop(columns=['device_hash'], inplace=True)

In [67]:
# Shape of the train feature matrix after dropping device_hash.
print(X_train.shape)

(590540, 358)


In [68]:
# Save the feature matrices and the target (as a one-column frame) to the working directory.
pd.to_pickle(X_train, './X_train2.pkl')
pd.to_pickle(X_test, './X_test2.pkl')
pd.to_pickle(y_train.to_frame(), './y_train2.pkl')

# 模型训练

In [69]:
# kf=KFold(n_splits = 5)
# resu1 = 0
# impor1 = 0
# y_pred = 0
# stack_train = np.zeros([X_train.shape[0],])
# for train_index, test_index in kf.split(X_train, y_train):
#     X_train2= X_train.iloc[train_index,:]
#     y_train2= y_train.iloc[train_index]
#     X_test2= X_train.iloc[test_index,:]
#     y_test2= y_train.iloc[test_index]
#     clf = lgb.LGBMClassifier(n_estimators=10000, random_state=1995,subsample=0.7,
#                              colsample_bytree=0.7,learning_rate=0.005,importance_type = 'gain',
#                      max_depth = -1, num_leaves = 256,min_child_samples=20,min_split_gain = 0.001,
#                        bagging_freq=1,reg_alpha = 0,reg_lambda = 0,n_jobs = -1,metric='None')
#     clf.fit(X_train2,y_train2,eval_set = [(X_train2,y_train2),(X_test2,y_test2)], eval_metric = 'auc',early_stopping_rounds=500,verbose=100)
#     temp_predict = clf.predict_proba(X_test2)[:,1]
#     stack_train[test_index] = temp_predict
#     y_pred += clf.predict_proba(X_test)[:,1]/5
#     roc = roc_auc_score(y_test2, temp_predict)
#     print(roc)
#     resu1 += roc/5
#     impor1 += clf.feature_importances_/5
#     gc.collect()
# print('End:',resu1)

In [70]:
# resu = pd.read_csv('../input/ieee-fraud-detection/sample_submission.csv')
# resu['isFraud'] = y_pred
# resu.to_csv('lgb.csv',index=False)
# a= pd.DataFrame()
# a['train'] = stack_train
# a.to_csv('lgb_train.csv',index=False)