In [1]:
#https://www.kaggle.com/c/ieee-fraud-detection/discussion/103439#latest-600713

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import scipy as sp
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings(action='ignore')
# Standard plotly imports
#import plotly.plotly as py
import plotly.graph_objs as go
import plotly.tools as tls
from plotly.offline import iplot, init_notebook_mode
#import cufflinks
#import cufflinks as cf
import plotly.figure_factory as ff
import datetime
# Using plotly + cufflinks in offline mode
init_notebook_mode(connected=True)
#cufflinks.go_offline(connected=True)

# Preprocessing, modelling and evaluating
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, KFold
# from xgboost import XGBClassifier
# import xgboost as xgb

## Hyperopt modules
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, STATUS_RUNNING
from functools import partial

import os
import gc
print(os.listdir("../input/ieee-fraud-detection"))

['test_identity.csv', 'sample_submission.csv', 'train_identity.csv', 'train_transaction.csv', 'test_transaction.csv']


In [3]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [4]:
%%time
import multiprocessing
warnings.simplefilter('ignore')
files = ['../input/ieee-fraud-detection/test_identity.csv', 
         '../input/ieee-fraud-detection/test_transaction.csv',
         '../input/ieee-fraud-detection/train_identity.csv',
         '../input/ieee-fraud-detection/train_transaction.csv',
         '../input/ieee-fraud-detection/sample_submission.csv']

def load_data(file):
    return reduce_mem_usage(pd.read_csv(file, index_col='TransactionID'))

with multiprocessing.Pool() as pool:
    test_identity, test_transaction, train_identity, train_transaction, sample_submission = pool.map(load_data, files)
gc.collect()

Memory usage of dataframe is 44.39 MB
Memory usage of dataframe is 45.12 MB
Memory usage after optimization is: 10.40 MB
Decreased by 76.6%
Memory usage after optimization is: 10.57 MB
Decreased by 76.6%
Memory usage of dataframe is 7.73 MB
Memory usage after optimization is: 4.83 MB
Decreased by 37.5%
Memory usage of dataframe is 1519.24 MB
Memory usage of dataframe is 1775.15 MB
Memory usage after optimization is: 427.17 MB
Decreased by 71.9%
Memory usage after optimization is: 489.41 MB
Decreased by 72.4%
CPU times: user 956 ms, sys: 2.21 s, total: 3.17 s
Wall time: 4min 9s


6

In [5]:


# train_transaction = reduce_mem_usage(pd.read_csv('../input/ieee-fraud-detection/train_transaction.csv', index_col='TransactionID'))
# test_transaction = reduce_mem_usage(pd.read_csv('../input/ieee-fraud-detection/test_transaction.csv', index_col='TransactionID'))

# train_identity = reduce_mem_usage(pd.read_csv('../input/ieee-fraud-detection/train_identity.csv', index_col='TransactionID'))
# test_identity = reduce_mem_usage(pd.read_csv('../input/ieee-fraud-detection/test_identity.csv', index_col='TransactionID'))

# sample_submission = pd.read_csv('../input/ieee-fraud-detection/sample_submission.csv', index_col='TransactionID')
# gc.collect()

In [6]:
train = train_transaction.merge(train_identity, how='left', left_index=True, right_index=True)
test = test_transaction.merge(test_identity, how='left', left_index=True, right_index=True)

In [7]:
def corret_card_id(x): 
    x=x.replace('.0','')
    x=x.replace('-999','nan')
    return x

def define_indexes(df):
    
    # create date column
    START_DATE = '2017-12-01'
    startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')
    df['TransactionDT'] = df['TransactionDT'].apply(lambda x: (startdate + datetime.timedelta(seconds = x)))
    
    df['year'] = df['TransactionDT'].dt.year
    df['month'] = df['TransactionDT'].dt.month
    df['dow'] = df['TransactionDT'].dt.dayofweek
    df['hour'] = df['TransactionDT'].dt.hour
    df['day'] = df['TransactionDT'].dt.day
   
    # create card ID 
    cards_cols= ['card1', 'card2', 'card3', 'card5']
    for card in cards_cols: 
        if '1' in card: 
            df['card_id']= df[card].map(str)
        else : 
            df['card_id']+= ' '+df[card].map(str)
    
    # small correction of the Card_ID
    df['card_id']=df['card_id'].apply(corret_card_id)

    return df

In [8]:
train = define_indexes(train)
test = define_indexes(test)

In [9]:
train['TransactionAmt_to_mean_card1'] = train['TransactionAmt'] / train.groupby(['card1'])['TransactionAmt'].transform('mean')
train['TransactionAmt_to_mean_card4'] = train['TransactionAmt'] / train.groupby(['card4'])['TransactionAmt'].transform('mean')
train['TransactionAmt_to_std_card1'] = train['TransactionAmt'] / train.groupby(['card1'])['TransactionAmt'].transform('std')
train['TransactionAmt_to_std_card4'] = train['TransactionAmt'] / train.groupby(['card4'])['TransactionAmt'].transform('std')

test['TransactionAmt_to_mean_card1'] = test['TransactionAmt'] / test.groupby(['card1'])['TransactionAmt'].transform('mean')
test['TransactionAmt_to_mean_card4'] = test['TransactionAmt'] / test.groupby(['card4'])['TransactionAmt'].transform('mean')
test['TransactionAmt_to_std_card1'] = test['TransactionAmt'] / test.groupby(['card1'])['TransactionAmt'].transform('std')
test['TransactionAmt_to_std_card4'] = test['TransactionAmt'] / test.groupby(['card4'])['TransactionAmt'].transform('std')

train['id_02_to_mean_card1'] = train['id_02'] / train.groupby(['card1'])['id_02'].transform('mean')
train['id_02_to_mean_card4'] = train['id_02'] / train.groupby(['card4'])['id_02'].transform('mean')
train['id_02_to_std_card1'] = train['id_02'] / train.groupby(['card1'])['id_02'].transform('std')
train['id_02_to_std_card4'] = train['id_02'] / train.groupby(['card4'])['id_02'].transform('std')

test['id_02_to_mean_card1'] = test['id_02'] / test.groupby(['card1'])['id_02'].transform('mean')
test['id_02_to_mean_card4'] = test['id_02'] / test.groupby(['card4'])['id_02'].transform('mean')
test['id_02_to_std_card1'] = test['id_02'] / test.groupby(['card1'])['id_02'].transform('std')
test['id_02_to_std_card4'] = test['id_02'] / test.groupby(['card4'])['id_02'].transform('std')

train['D15_to_mean_card1'] = train['D15'] / train.groupby(['card1'])['D15'].transform('mean')
train['D15_to_mean_card4'] = train['D15'] / train.groupby(['card4'])['D15'].transform('mean')
train['D15_to_std_card1'] = train['D15'] / train.groupby(['card1'])['D15'].transform('std')
train['D15_to_std_card4'] = train['D15'] / train.groupby(['card4'])['D15'].transform('std')

test['D15_to_mean_card1'] = test['D15'] / test.groupby(['card1'])['D15'].transform('mean')
test['D15_to_mean_card4'] = test['D15'] / test.groupby(['card4'])['D15'].transform('mean')
test['D15_to_std_card1'] = test['D15'] / test.groupby(['card1'])['D15'].transform('std')
test['D15_to_std_card4'] = test['D15'] / test.groupby(['card4'])['D15'].transform('std')

train['D15_to_mean_addr1'] = train['D15'] / train.groupby(['addr1'])['D15'].transform('mean')
train['D15_to_mean_card4'] = train['D15'] / train.groupby(['card4'])['D15'].transform('mean')
train['D15_to_std_addr1'] = train['D15'] / train.groupby(['addr1'])['D15'].transform('std')
train['D15_to_std_card4'] = train['D15'] / train.groupby(['card4'])['D15'].transform('std')

test['D15_to_mean_addr1'] = test['D15'] / test.groupby(['addr1'])['D15'].transform('mean')
test['D15_to_mean_card4'] = test['D15'] / test.groupby(['card4'])['D15'].transform('mean')
test['D15_to_std_addr1'] = test['D15'] / test.groupby(['addr1'])['D15'].transform('std')
test['D15_to_std_card4'] = test['D15'] / test.groupby(['card4'])['D15'].transform('std')

In [10]:
# # 신규로 만든 cardID에 대해 Amountamt를을 평균으로 나눠보자  : 이 건이 평균대비 많은지 적은지 - V13
train['TransactionAmt_to_mean_card_id'] = train['TransactionAmt'] / train.groupby(['card_id'])['TransactionAmt'].transform('mean')
train['TransactionAmt_to_std_card_id'] = train['TransactionAmt'] / train.groupby(['card_id'])['TransactionAmt'].transform('std')

test['TransactionAmt_to_mean_card_id'] = test['TransactionAmt'] / test.groupby(['card_id'])['TransactionAmt'].transform('mean')
test['TransactionAmt_to_std_card_id'] = test['TransactionAmt'] / test.groupby(['card_id'])['TransactionAmt'].transform('std')

train['id_02_to_mean_card_id'] = train['id_02'] / train.groupby(['card_id'])['id_02'].transform('mean')
train['id_02_to_std_card_id'] = train['id_02'] / train.groupby(['card_id'])['id_02'].transform('std')

test['id_02_to_mean_card_id'] = test['id_02'] / test.groupby(['card_id'])['id_02'].transform('mean')
test['id_02_to_std_card_id'] = test['id_02'] / test.groupby(['card_id'])['id_02'].transform('std')

train['D15_to_mean_card_id'] = train['D15'] / train.groupby(['card_id'])['D15'].transform('mean')
train['D15_to_std_card_id'] = train['D15'] / train.groupby(['card_id'])['D15'].transform('std')

test['D15_to_mean_card_id'] = test['D15'] / test.groupby(['card_id'])['D15'].transform('mean')
test['D15_to_std_card_id'] = test['D15'] / test.groupby(['card_id'])['D15'].transform('std')


In [11]:
# cardid 기준으로 최근 5일간 거래량 통계
train['count_last'] = train.groupby('card_id')['TransactionAmt'].transform(lambda x: x.rolling(5, 1).count())
train['mean_last'] = train.groupby('card_id')['TransactionAmt'].transform(lambda x: x.rolling(5, 1).mean())
train['min_last'] = train.groupby('card_id')['TransactionAmt'].transform(lambda x: x.rolling(5, 1).min())
train['max_last'] = train.groupby('card_id')['TransactionAmt'].transform(lambda x: x.rolling(5, 1).max())
train['std_last'] = train.groupby('card_id')['TransactionAmt'].transform(lambda x: x.rolling(5, 1).std())

test['count_last'] = test.groupby('card_id')['TransactionAmt'].transform(lambda x: x.rolling(5, 1).count())
test['mean_last'] = test.groupby('card_id')['TransactionAmt'].transform(lambda x: x.rolling(5, 1).mean())
test['min_last'] = test.groupby('card_id')['TransactionAmt'].transform(lambda x: x.rolling(5, 1).min())
test['max_last'] = test.groupby('card_id')['TransactionAmt'].transform(lambda x: x.rolling(5, 1).max())
test['std_last'] = test.groupby('card_id')['TransactionAmt'].transform(lambda x: x.rolling(5, 1).std())

#최근 10일 평균 대비 큰지 작은지 
train['trans_mean_last'] = train['TransactionAmt'] / train.groupby('card_id')['TransactionAmt'].transform(lambda x: x.rolling(5, 1).mean())
train['trans_std_last'] = train['TransactionAmt'] / train.groupby('card_id')['TransactionAmt'].transform(lambda x: x.rolling(5, 1).std())
test['trans_mean_last'] = test['TransactionAmt'] / test.groupby('card_id')['TransactionAmt'].transform(lambda x: x.rolling(5, 1).mean())
test['trans_std_last'] = test['TransactionAmt'] / test.groupby('card_id')['TransactionAmt'].transform(lambda x: x.rolling(10, 1).std())

In [12]:
print(train['R_emaildomain'].dtypes)

category


In [13]:
# train.loc[train['R_emaildomain'].isin(['gmail.com', 'gmail']),'R_emaildomain'] = 'Google'

# train.loc[train['R_emaildomain'].isin(['yahoo.com', 'yahoo.com.mx',  'yahoo.co.uk',
#                                             'yahoo.co.jp', 'yahoo.de', 'yahoo.fr',
#                                             'yahoo.es']), 'R_emaildomain'] = 'Yahoo Mail'
# train.loc[train['R_emaildomain'].isin(['hotmail.com','outlook.com','msn.com', 'live.com.mx', 
#                                             'hotmail.es','hotmail.co.uk', 'hotmail.de',
#                                             'outlook.es', 'live.com', 'live.fr',
#                                             'hotmail.fr']), 'R_emaildomain'] = 'Microsoft'
# train.loc[train.R_emaildomain.isin(train.R_emaildomain\
#                                         .value_counts()[train.R_emaildomain.value_counts() <= 300 ]\
#                                         .index), 'R_emaildomain'] = "Others"
# train.R_emaildomain.fillna("NoInf", inplace=True)

In [14]:
#test.loc[test['R_emaildomain'].isin(['gmail.com', 'gmail']),'R_emaildomain'] = 'Google'
#
#test.loc[test['R_emaildomain'].isin(['yahoo.com', 'yahoo.com.mx',  'yahoo.co.uk',
#                                             'yahoo.co.jp', 'yahoo.de', 'yahoo.fr',
#                                             'yahoo.es']), 'R_emaildomain'] = 'Yahoo Mail'
#test.loc[test['R_emaildomain'].isin(['hotmail.com','outlook.com','msn.com', 'live.com.mx', 
#                                             'hotmail.es','hotmail.co.uk', 'hotmail.de',
#                                             'outlook.es', 'live.com', 'live.fr',
#                                             'hotmail.fr']), 'R_emaildomain'] = 'Microsoft'
#test.loc[test.R_emaildomain.isin(test.R_emaildomain\
 #                                        .value_counts()[test.R_emaildomain.value_counts() <= 300 ]\
#                                         .index), 'R_emaildomain'] = "Others"
#test.R_emaildomain.fillna("NoInf", inplace=True)

In [15]:
########################### Freq encoding
# 각 컬럼의 값의 전체에서 얼만큼 빈도수를 가지는지 카운트하여 넣어줌 -> C,D는 크게 의미 없을 듯?
i_cols = ['card1','card2','card3','card5',
          'C1','C2','C3','C4','C5','C6','C7','C8','C9','C10','C11','C12','C13','C14',
          'D1','D2','D3','D4','D5','D6','D7','D8','D9',
          'addr1','addr2',
          'dist1','dist2',
         # 'P_emaildomain', 
          'R_emaildomain'
         ]

for col in i_cols:
    temp_df = pd.concat([train[[col]], test[[col]]])
    fq_encode = temp_df[col].value_counts(dropna=False).to_dict()    # Null 값도 계산하게하기
    train[col+'_fq_enc'] = train[col].map(fq_encode)
    test[col+'_fq_enc']  = test[col].map(fq_encode)

In [16]:
# Based on https://www.kaggle.com/nroman/lgb-single-model-lb-0-9419
def features_interaction(df, feature_1, feature_2):
    return df[feature_1].astype(str) + '_' + df[feature_2].astype(str)
features_interactions = [
    'id_02__id_20',
    'id_02__D8',
    'D11__DeviceInfo',
    'DeviceInfo__P_emaildomain',
    'P_emaildomain__C2',
    'card2__dist1',
    'card1__card5',
    'card2__id_20',
    'card5__P_emaildomain',
    'addr1__card1'
]

for new_feature in features_interactions:
    feature_1, feature_2 = new_feature.split('__')
    
    train[new_feature] = features_interaction(train, feature_1, feature_2)
    test[new_feature] = features_interaction(test, feature_1, feature_2)

In [17]:
# Amount Log취하기
train['TransactionAmt'] = np.log(train['TransactionAmt'])
test['TransactionAmt'] = np.log(test['TransactionAmt'])

In [18]:
# PCA - V
# mas_v = df.columns[55:]

# for col in mas_v:
#     df[col].fillna((df[col].min() - 2), inplace=True)
#     df[col] = (minmax_scale(df[col], feature_range=(0,1)))
    
# df = PCA_change(df, mas_v, prefix='PCA_V_', n_components=35)

# columns = ['PCA_V_0', 'PCA_V_1', 'PCA_V_2', 'PCA_V_3', 'PCA_V_4', 'PCA_V_5', 
#            'PCA_V_6', 'PCA_V_7', 'PCA_V_8', 'PCA_V_9', 'PCA_V_10', 'PCA_V_11', 
#            'PCA_V_12', 'PCA_V_13', 'PCA_V_14', 'PCA_V_15', 'PCA_V_16', 
#            'PCA_V_17', 'PCA_V_18', 'PCA_V_19', 'PCA_V_20', 'PCA_V_21', 
#            'PCA_V_22', 'PCA_V_23', 'PCA_V_24', 'PCA_V_25', 'PCA_V_26', 
#            'PCA_V_27', 'PCA_V_28', 'PCA_V_29', 'PCA_V_30', 'PCA_V_31', 
#            'PCA_V_32', 'PCA_V_33', 'PCA_V_34']

# km = KMeans(n_clusters=6)
# km = km.fit(df[columns])
# df['clusters_V'] = km.predict(df[columns])
# gc.collect()

In [19]:
########################### M columns (except M4)
# All these columns are binary encoded 1/0
# We can have some features from it
i_cols = ['M1','M2','M3','M5','M6','M7','M8','M9']

for df in [train, test]:
    df['M_sum'] = df[i_cols].sum(axis=1).astype(np.int8)
    df['M_na'] = df[i_cols].isna().sum(axis=1).astype(np.int8)

In [20]:
one_value_cols = [col for col in train.columns if train[col].nunique() <= 1]
one_value_cols_test = [col for col in test.columns if test[col].nunique() <= 1]

many_null_cols = [col for col in train.columns if train[col].isnull().sum() / train.shape[0] > 0.9]
many_null_cols_test = [col for col in test.columns if test[col].isnull().sum() / test.shape[0] > 0.9]

big_top_value_cols = [col for col in train.columns if train[col].value_counts(dropna=False, normalize=True).values[0] > 0.9]
big_top_value_cols_test = [col for col in test.columns if test[col].value_counts(dropna=False, normalize=True).values[0] > 0.9]

cols_to_drop = list(set(many_null_cols + many_null_cols_test + big_top_value_cols + big_top_value_cols_test + one_value_cols+ one_value_cols_test))

cols_to_drop.remove('isFraud')

train.drop(cols_to_drop, axis=1, inplace=True)
test.drop(cols_to_drop, axis=1, inplace=True)

In [21]:
train.head(10).T.to_csv('file.csv')#'TransactionDT' 삭제
# train.head(10).T

In [22]:
train.to_pickle('train.pkl')
test.to_pickle('test.pkl')
sample_submission.to_pickle('sample_submission.pkl')

In [23]:
# del train_transaction, train_identity, test_transaction, test_identity

Y_train = train['isFraud'].copy()

X_train = train.drop('isFraud', axis=1)
X_train.drop('TransactionDT', axis=1, inplace=True)
X_test = test.drop('TransactionDT', axis=1)

del train, test

In [24]:
gc.collect()

29

In [25]:
for f in X_train.select_dtypes(include='category').columns.tolist() + X_train.select_dtypes(include='object').columns.tolist():
    lbl = preprocessing.LabelEncoder()
    print(f)
    lbl.fit(list(X_train[f].values) + list(X_test[f].values))
    X_train[f] = lbl.transform(list(X_train[f].values))
    X_test[f] = lbl.transform(list(X_test[f].values))

ProductCD
card4
card6
P_emaildomain
R_emaildomain
M1
M2
M3
M4
M5
M6
M7
M8
M9
id_12
id_15
id_16
id_28
id_29
id_30
id_31
id_33
id_34
id_35
id_36
id_37
id_38
DeviceType
DeviceInfo
card_id
id_02__id_20
id_02__D8
D11__DeviceInfo
DeviceInfo__P_emaildomain
P_emaildomain__C2
card2__dist1
card1__card5
card2__id_20
card5__P_emaildomain
addr1__card1


In [26]:
5%3 

2

In [27]:
#https://www.kaggle.com/jiweiliu/lgb-2-leaves-augment
#https://www.kaggle.com/niteshx2/beginner-explained-lgb-2-leaves-augment
# Data augmentation
def augment(x,y,t=2):
    xs,xn = [],[]
    for i in range(t):
        mask = y>0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        for c in range(x1.shape[1]):
            if c%3 ==0:
                np.random.shuffle(ids)
                x1[:,c] = x1[ids][:,c]
        xs.append(x1)

   # for i in range(t//2):
    #    mask = y==0
   #     x1 = x[mask].copy()
   #     ids = np.arange(x1.shape[0])
   #     for c in range(x1.shape[1]):
   #         if c%3 == 0 :
   #             np.random.shuffle(ids)
    #            x1[:,c] = x1[ids][:,c]
    #    xn.append(x1)

    xs = np.vstack(xs)
   # xn = np.vstack(xn)
    ys = np.ones(xs.shape[0])
   # yn = np.zeros(xn.shape[0])
    x = np.vstack([x,xs])
    y = np.concatenate([y,ys])
    return x,y

In [28]:
# Hyperparameter 찾기

from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from bayes_opt import BayesianOptimization

In [29]:
#  N = 5
#     p_valid,yp = 0,0
#     for i in range(N):
#         X_t, y_t = augment(X_train.values, y_train.values)
#         X_t = pd.DataFrame(X_t)
#         X_t = X_t.add_prefix('var_')
    
#         trn_data = lgb.Dataset(X_t, label=y_t)
#         val_data = lgb.Dataset(X_valid, label=y_valid)
#         evals_result = {}
#         lgb_clf = lgb.train(lgb_params,
#                         trn_data,
#                         100000,
#                         valid_sets = [trn_data, val_data],
#                         early_stopping_rounds=3000,
#                         verbose_eval = 1000,
#                         evals_result=evals_result
#                        )
#         p_valid += lgb_clf.predict(X_valid)
#         yp += lgb_clf.predict(X_test)
#     fold_importance_df = pd.DataFrame()
#     fold_importance_df["feature"] = features
#     fold_importance_df["importance"] = lgb_clf.feature_importance()
#     fold_importance_df["fold"] = fold + 1
#     feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
#     oof['predict'][val_idx] = p_valid/N

In [30]:
def train_model(learning_rate,num_leaves, min_data_in_leaf, bagging_fraction, feature_fraction, lambda_l1, lambda_l2,max_bin,min_child_weight):
    print("############## New Run ################")
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'num_threads': 4,
        'learning_rate': learning_rate,#0.01, # learning rate
        'num_iterations' : 200,#3000,
        #'n_estimators' : 800,
        'num_leaves': int(num_leaves),
        'min_data_in_leaf': int(min_data_in_leaf),
        'max_depth': -1,#int(max_depth),
        'bagging_fraction' : bagging_fraction,
        'feature_fraction' : feature_fraction,
        'lambda_l1': lambda_l1,
        'lambda_l2': lambda_l2,
        'max_bin' : int(max_bin),
        'min_child_weight': min_child_weight,
        'bagging_seed' : 11,
        #'early_stopping_round' : 50,
        'verbose' : -1,
        'seed' :165
    }
    print("PARAMETERS: ")
    print(f"params  = {params}")
    
    tscv =  StratifiedKFold(n_splits=5, shuffle=True, random_state=42)#KFold(n_splits=5)#StratifiedKFold(n_splits=5)#TimeSeriesSplit(n_splits=3)
    sc = []
#     for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
    for fold_,(train_idx, test_idx) in enumerate(tscv.split(X_train, Y_train)):
#         print( Y_train.iloc[train_idx].head())
        x_train, x_val = X_train.iloc[train_idx,:], X_train.iloc[test_idx,:]
        y_train, y_val = Y_train.iloc[train_idx], Y_train.iloc[test_idx]
        
#         sc2 =[]
#         N = 3
#         for i in range(N):
        #Data Augmentation Test
#         X_t, y_t = augment(x_train.values, y_train.values)
#         X_t = pd.DataFrame(X_t)
# #             X_t = X_t.add_prefix('var_')
#         lgb_train = lgb.Dataset(X_t.astype('float32'), label=y_t.astype('float32'))
#         lgb_valid = lgb.Dataset(data=x_val.astype('float32'), label=y_val.astype('float32'))
        #############################
        lgb_train = lgb.Dataset(data=x_train.astype('float32'), label=y_train.astype('float32'))
        lgb_valid = lgb.Dataset(data=x_val.astype('float32'), label=y_val.astype('float32'))
        
        
        lgb_model = lgb.train(params, lgb_train, valid_sets=lgb_valid, verbose_eval=500)
        y = lgb_model.predict(x_train.astype('float32'), num_iteration=lgb_model.best_iteration)
        train_score = roc_auc_score(y_train.astype('float32'), y)        
        y = lgb_model.predict(x_val.astype('float32'), num_iteration=lgb_model.best_iteration)
        score = roc_auc_score(y_val.astype('float32'), y)
        print ("Fold : ", fold_,"train_auc : ",train_score,"val_auc : ", score)
        sc.append(score)
        
    paralst.append(params)
    scorelst.append(np.mean(sc))
    df_para = pd.DataFrame({"parameter" :paralst,"score":scorelst})
    print(df_para)
    df_para.to_csv('para.csv',index=False)

        
    return np.mean(sc)

bounds = {
    'num_leaves': (1300,1600),#(450, 500),
    'min_data_in_leaf': (0,100),#(100, 150),
    #'max_depth': -1,#(-1, 50), # -> -1
    'learning_rate': (0.014,0.019),#0.006883242363721497,
    'bagging_fraction' : (0.3, 0.5),
    'feature_fraction' : (0.1, 0.4),
    'lambda_l1': (0.3, 0.5),
    'lambda_l2': (0.3, 0.7),
    'max_bin' : (150,255), # 추가
    'min_child_weight': (0.01, 0.05),
}

traintime = False
if traintime :
    paralst,scorelst = [],[]
    bo = BayesianOptimization(train_model, bounds, random_state= 165)
    bo.maximize(init_points=5, n_iter=30, acq='ucb', xi=0.0, alpha=1e-6)
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'num_threads': 4,
        'bagging_seed' : 11,
        'learning_rate':bo.max['params']['learning_rate'],#0.01,
        'n_estimators' : 200,#3000,
        'num_leaves': int(bo.max['params']['num_leaves']),
        'min_data_in_leaf': int(bo.max['params']['min_data_in_leaf']),
        'max_depth': -1,#int(bo.max['params']['max_depth']),
        'bagging_fraction' : bo.max['params']['bagging_fraction'],
        'feature_fraction' : bo.max['params']['feature_fraction'],
        'lambda_l1': bo.max['params']['lambda_l1'],
        'lambda_l2': bo.max['params']['lambda_l2'],
        'max_bin' : bo.max['params']['max_bin'],
        'min_child_weight' :  bo.max['params']['min_child_weight'],
       #'early_stopping_round' : 50,
        'verbose' : -1,
        'seed' :165

    }
    print(params)

In [31]:
if not traintime :  
    params = {'objective': 'binary', 'metric': 'auc', 'num_threads': 4, 'learning_rate': 0.014645596881635362, 'num_iterations': 3000, 'num_leaves': 1385, 'min_data_in_leaf': 2, 'max_depth': -1, 'bagging_fraction': 0.38470656275266424, 'feature_fraction': 0.3963200069886693, 'lambda_l1': 0.3620323536611625, 'lambda_l2': 0.35373332136972424, 'max_bin': 154, 'min_child_weight': 0.04295077705493547, 'bagging_seed': 11, 'verbose': -1, 'seed': 12}#0.96069
    #params = {'objective': 'binary', 'metric': 'auc', 'num_threads': 4, 'learning_rate': 0.014960621900450888, 'num_iterations': 3000, 'num_leaves': 1395, 'min_data_in_leaf': 0, 'max_depth': -1, 'bagging_fraction': 0.34044595394766747, 'feature_fraction': 0.34214407482513476, 'lambda_l1': 0.4805008954626611, 'lambda_l2': 0.37118265029595326, 'max_bin': 245, 'min_child_weight': 0.029364491351868728, 'bagging_seed': 11, 'verbose': -1, 'seed': 399} #0.960682
    
    
    #params = {'objective': 'binary', 'metric': 'auc', 'num_threads': 4, 'learning_rate': 0.009833261038853328, 'num_iterations': 3000, 'num_leaves': 1149, 'min_data_in_leaf': 49, 'max_depth': -1, 'bagging_fraction': 0.3240951596739299, 'feature_fraction': 0.3498407985953037, 'lambda_l1': 0.3501312050658983, 'lambda_l2': 0.31690592587909633, 'max_bin': 161, 'min_child_weight': 0.0114896062770591, 'bagging_seed': 11, 'verbose': -1, 'seed': 99} #0.92151
#     params = {'objective': 'binary', 'metric': 'auc', 'num_threads': 4, 'learning_rate': 0.009754414928876673, 'num_iterations': 3000, 'num_leaves': 1143, 'min_data_in_leaf': 35, 'max_depth': -1, 'bagging_fraction': 0.3554679818433634, 'feature_fraction': 0.3402175958274567, 'lambda_l1': 0.34752046110782797, 'lambda_l2': 0.37744033227254215, 'max_bin': 242, 'min_child_weight': 0.03617820556477377, 'bagging_seed': 11, 'verbose': -1, 'seed': 165} #0.921416
    #0.9474
   # params = {'objective': 'binary', 'metric': 'auc', 'num_threads': 4, 'learning_rate': 0.008872937600948674, 'num_iterations': 3000, 'num_leaves': 999, 'min_data_in_leaf': 10, 'max_depth': -1, 'bagging_fraction': 0.4713288697981503, 'feature_fraction': 0.39018895127170683, 'lambda_l1': 0.3510394531977163, 'lambda_l2': 0.3873703557388186, 'max_bin': 254, 'min_child_weight': 0.033084878554612146, 'bagging_seed': 11, 'verbose': -1, 'seed': 12} #0.919813
    #params = {'objective': 'binary', 'metric': 'auc', 'num_threads': 4, 'learning_rate': 0.008976158898617054, 'num_iterations': 3000, 'num_leaves': 995, 'min_data_in_leaf': 14, 'max_depth': -1, 'bagging_fraction': 0.4682934735568483, 'feature_fraction': 0.3965931325683636, 'lambda_l1': 0.3760099829112723, 'lambda_l2': 0.3975897064215117, 'max_bin': 254, 'min_child_weight': 0.02847878240902596, 'bagging_seed': 11, 'verbose': -1, 'seed': 277} #0.919887
    #0.9465
    #params = {'objective': 'binary', 'metric': 'auc', 'num_threads': 4, 'learning_rate': 0.00898758545742255, 'num_iterations': 3000, 'num_leaves': 924, 'min_data_in_leaf': 14, 'max_depth': -1, 'bagging_fraction': 0.4024823598624291, 'feature_fraction': 0.3208499817534414, 'lambda_l1': 0.46115302747432046, 'lambda_l2': 0.35539565704287, 'max_bin': 250, 'min_child_weight': 0.04355005924028087, 'bagging_seed': 11, 'verbose': -1, 'seed': 399} #0.91827
    #0.9467
    #params ={'objective': 'binary', 'metric': 'auc', 'num_threads': 4, 'learning_rate': 0.007972931513741082, 'num_iterations': 3000, 'num_leaves': 799, 'min_data_in_leaf': 1, 'max_depth': -1, 'bagging_fraction': 0.4895500948839138, 'feature_fraction': 0.37387418327918154, 'lambda_l1': 0.41482044398244905, 'lambda_l2': 0.5362091922107519, 'min_child_weight': 0.021811137046946497, 'bagging_seed': 11, 'verbose': -1, 'seed': 132} #0.916263
    #0.9469 -> 0.9473
    #params = {'objective': 'binary', 'metric': 'auc', 'num_threads': 4, 'learning_rate': 0.007898644187072399, 'num_iterations': 3000, 'num_leaves': 785, 'min_data_in_leaf': 8, 'max_depth': -1, 'bagging_fraction': 0.395331230891172, 'feature_fraction': 0.31552276732000295, 'lambda_l1': 0.3470087563049069, 'lambda_l2': 0.503276501340582, 'min_child_weight': 0.03208937707510653, 'bagging_seed': 11, 'verbose': -1, 'seed': 99} #0.916033
    #0.9468
    #params = {'objective': 'binary', 'metric': 'auc', 'num_threads': 4, 'learning_rate': 0.007899687615149507, 'num_iterations': 3000, 'num_leaves': 799, 'min_data_in_leaf': 0, 'max_depth': -1, 'bagging_fraction': 0.33041506218684263, 'feature_fraction': 0.35416575497676395, 'lambda_l1': 0.38704306154857826, 'lambda_l2': 0.5178012092942015, 'min_child_weight': 0.026728352023513757, 'bagging_seed': 11, 'verbose': -1, 'seed': 12} #0.916009
    # 0.9467
    #params = {'objective': 'binary', 'metric': 'auc', 'num_threads': 4, 'learning_rate': 0.007372873282822138, 'num_iterations': 3000, 'num_leaves': 599, 'min_data_in_leaf': 50, 'max_depth': -1, 'bagging_fraction': 0.38244660605863556, 'feature_fraction': 0.44257840443437824, 'lambda_l1': 0.45420352424554195, 'lambda_l2': 0.5105273366387331, 'min_child_weight': 0.04533307349626744, 'bagging_seed': 11, 'verbose': -1, 'seed': 160} #0.9131
    # 0.9465
    #params = {'objective': 'binary', 'metric': 'auc', 'num_threads': 4, 'learning_rate': 0.007203449399886559, 'num_iterations': 3000, 'num_leaves': 599, 'min_data_in_leaf': 50, 'max_depth': -1, 'bagging_fraction': 0.38525498446643613, 'feature_fraction': 0.380367681990148, 'lambda_l1': 0.34523297270543946, 'lambda_l2': 0.5734926679250656, 'min_child_weight': 0.02678622651731609, 'bagging_seed': 11, 'verbose': -1, 'seed': 160} #0.913067
    #0.9460이하
    #params = {'objective': 'binary', 'metric': 'auc', 'num_threads': 4, 'learning_rate': 0.006990904355014822, 'num_iterations': 3000, 'num_leaves': 499, 'min_data_in_leaf': 100, 'max_depth': -1, 'bagging_fraction': 0.3812981748987723, 'feature_fraction': 0.47302209023435043, 'lambda_l1': 0.4631415001252107, 'lambda_l2': 0.5618020019879593, 'min_child_weight': 0.043917717737188104, 'bagging_seed': 11, 'verbose': -1, 'seed': 105} # 0.909427
    #params = {'objective': 'binary', 'metric': 'auc', 'num_threads': 4, 'learning_rate': 0.006787730751128605, 'num_iterations': 3000, 'num_leaves': 499, 'min_data_in_leaf': 100, 'max_depth': -1, 'bagging_fraction': 0.47560044639915244, 'feature_fraction': 0.41666150781008676, 'lambda_l1': 0.47548378689674464, 'lambda_l2': 0.9757213490246281, 'min_child_weight': 0.041972701015528587, 'bagging_seed': 11, 'verbose': -1, 'seed': 105} # 0.909053
    #params = {'objective': 'binary', 'metric': 'auc', 'num_threads': 4, 'learning_rate': 0.006882517225047709, 'num_iterations': 3000, 'num_leaves': 499, 'min_data_in_leaf': 100, 'max_depth': -1, 'bagging_fraction': 0.4237390751101919, 'feature_fraction': 0.43926420965306257, 'lambda_l1': 0.3175126554926237, 'lambda_l2': 0.7515373479664667, 'min_child_weight': 0.017207106806066748, 'bagging_seed': 11, 'verbose': -1, 'seed': 105} #0.909397
   # params = {'objective': 'binary', 'metric': 'auc', 'num_threads': 4, 'learning_rate': 0.006535640044021754, 'num_iterations': 3000, 'num_leaves': 450, 'min_data_in_leaf': 114, 'max_depth': -1, 'bagging_fraction': 0.3167479847792764, 'feature_fraction': 0.3667012584958002, 'lambda_l1': 0.4718711490177294, 'lambda_l2': 0.5528434176595423, 'min_child_weight': 0.01254758279001024, 'bagging_seed': 11, 'verbose': -1, 'seed': 105}
   # params = {'objective': 'binary', 'metric': 'auc', 'num_threads': 4, 'learning_rate': 0.006536108525154421, 'num_iterations': 3000, 'num_leaves': 455, 'min_data_in_leaf': 108, 'max_depth': -1, 'bagging_fraction': 0.4637593273666464, 'feature_fraction': 0.43726611671270266, 'lambda_l1': 0.4261934244016219, 'lambda_l2': 0.6851945667053867, 'min_child_weight': 0.016926478912796333, 'bagging_seed': 11, 'verbose': -1, 'seed': 105} # Base 0.9461
   

    # params = {'num_leaves': 455,
    #           'n_estimators' : 10000,
    #          'min_child_weight': 0.03454472573214212,
   #           'feature_fraction': 0.3797454081646243,
   #           'bagging_fraction': 0.4181193142567742,
    #          'min_data_in_leaf': 106,
     #         'objective': 'binary',
   #           'max_depth': -1,
    #          'learning_rate': 0.006883242363721497,
  #            "boosting_type": "gbdt",
    #          "bagging_seed": 11,
    #          "metric": 'auc',
    #          "verbosity": -1,
     #         'reg_alpha': 0.3899927210061127,
      #        'reg_lambda': 0.6485237330340494,
     #         'random_state': 47
     #        }
    print(params)



    # Cross validation Score?로 제출?

    tscv =  StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
#KFold(n_splits=5)#(n_splits=8, shuffle=True, random_state=42)#TimeSeriesSplit(n_splits=5) #? -> 
    sc = []
    #     for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
    for fold_,(train_idx, test_idx) in enumerate(tscv.split(X_train, Y_train)):
    #         print( Y_train.iloc[train_idx].head())
        x_train, x_val = X_train.iloc[train_idx,:], X_train.iloc[test_idx,:]
        y_train, y_val = Y_train.iloc[train_idx], Y_train.iloc[test_idx]
        print(x_train.shape)
#         sc2 =[]
#         N = 3
#         for i in range(N):
#         Data Augmentation Test
        X_t, y_t = augment(x_train.values, y_train.values)
        X_t = pd.DataFrame(X_t)
        print(X_t.shape)
# #             X_t = X_t.add_prefix('var_')
         # augmentation 한 것
        lgb_train = lgb.Dataset(X_t.astype('float32'), label=y_t.astype('float32'))
        lgb_valid = lgb.Dataset(data=x_val.astype('float32'), label=y_val.astype('float32'))
        
        lgb_model = lgb.train(params, lgb_train, valid_sets=lgb_valid, verbose_eval=200,early_stopping_rounds=50)
        y = lgb_model.predict(x_train.astype('float32'), num_iteration=lgb_model.best_iteration)
        train_score = roc_auc_score(y_train.astype('float32'), y)        
        y = lgb_model.predict(x_val.astype('float32'), num_iteration=lgb_model.best_iteration)
        score = roc_auc_score(y_val.astype('float32'), y)
            
        print ("Fold : ", fold_,"train_auc : ",train_score,"val_auc : ", score)
        y = lgb_model.predict(X_test.astype('float32'), num_iteration=lgb_model.best_iteration)
        sc.append(y)

       # lgb_train = lgb.Dataset(data=x_train.astype('float32'), label=y_train.astype('float32'))
        #lgb_valid = lgb.Dataset(data=x_val.astype('float32'), label=y_val.astype('float32'))
       # lgb_model = lgb.train(params, lgb_train, valid_sets=lgb_valid, verbose_eval=200,early_stopping_rounds=50)
       # y = lgb_model.predict(x_train.astype('float32'), num_iteration=lgb_model.best_iteration)
      #  train_score = roc_auc_score(y_train.astype('float32'), y)        
      #  y = lgb_model.predict(x_val.astype('float32'), num_iteration=lgb_model.best_iteration)
      #  score = roc_auc_score(y_val.astype('float32'), y)
      #  print ("Fold : ", fold_,"train_auc : ",train_score,"val_auc : ", score)
       # y = lgb_model.predict(X_test.astype('float32'), num_iteration=lgb_model.best_iteration)
       # sc.append(y)

        
    print(np.mean(sc,axis =0))

    submission = pd.read_csv('../input/ieee-fraud-detection/sample_submission.csv', index_col='TransactionID')
    submission['isFraud'] = np.mean(sc, axis =0)
    submission.to_csv('submission.csv')

{'objective': 'binary', 'metric': 'auc', 'num_threads': 4, 'learning_rate': 0.014645596881635362, 'num_iterations': 3000, 'num_leaves': 1385, 'min_data_in_leaf': 2, 'max_depth': -1, 'bagging_fraction': 0.38470656275266424, 'feature_fraction': 0.3963200069886693, 'lambda_l1': 0.3620323536611625, 'lambda_l2': 0.35373332136972424, 'max_bin': 154, 'min_child_weight': 0.04295077705493547, 'bagging_seed': 11, 'verbose': -1, 'seed': 12}
(472431, 419)
(505491, 419)
Training until validation scores don't improve for 50 rounds.
[200]	valid_0's auc: 0.951435
[400]	valid_0's auc: 0.96803
[600]	valid_0's auc: 0.971613
[800]	valid_0's auc: 0.972827
[1000]	valid_0's auc: 0.973204
Early stopping, best iteration is:
[996]	valid_0's auc: 0.973206
Fold :  0 train_auc :  1.0 val_auc :  0.9732063436432451
(472431, 419)
(505491, 419)
Training until validation scores don't improve for 50 rounds.
[200]	valid_0's auc: 0.953504
[400]	valid_0's auc: 0.969205
[600]	valid_0's auc: 0.972865
[800]	valid_0's auc: 0.9

In [32]:
# 몇개 Parameter를 고정하고 Grid search

# import copy
# import pickle
# import pandas as pd
# import numpy as np
# import lightgbm as lgb
# from sklearn.model_selection import ParameterGrid, StratifiedKFold

# N_SPLITS = 3
# df_train = pd.read_csv('processed/df_train.csv')
# df_train_columns = [c for c in df_train.columns if c not in ['card_id', 'first_active_month','target','outliers']]
# target = pd.read_csv('processed/df_train_target.csv', header=None)
# assert df_train.shape[0] == target.shape[0]
# trn_data = lgb.Dataset(df_train[df_train_columns], label=target)
# folds = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
# folds_generator = folds.split(df_train, df_train['outliers'].values)

# default_params = {'num_leaves': 31,
#                   'min_data_in_leaf': 30,
#                   'objective': 'regression',
#                   'max_depth': -1,
#                   'learning_rate': 0.01,
#                   "min_child_samples": 20,
#                   "boosting": "gbdt",
#                   "feature_fraction": 0.9,
#                   "bagging_freq": 1,
#                   "bagging_fraction": 0.9,
#                   "bagging_seed": 11,
#                   "metric": 'rmse',
#                   "lambda_l1": 0.1,
#                   "verbosity": -1,
#                   "nthread": 4,
#                   "random_state": 42,
#                   "device": 'gpu'}
# params_cv = {'min_data_in_leaf': [30, 60], "min_child_samples": [20, 40]}
# param_grid = ParameterGrid(params_cv)
# results = []
# for param in param_grid:
#     param_now = copy.copy(default_params)
#     param_now.update(param)
#     cv_results = lgb.cv(param_now, trn_data, num_boost_round=10000,
#                         folds=folds_generator,
#                         early_stopping_rounds=100,
#                         stratified=False)
#     # print(cv_results['rmse-mean'], cv_results['rmse-std'])
#     cv_results = (param_now, cv_results)
#     results.append(cv_results)

# with open('hyper_searchs/lgb_cv.pkl', 'wb') as f:
#     pickle.dump(results, f)