In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action='ignore')
import os
import gc
print(os.listdir("../input/ieee-fraud-detection/"))
pd.options.display.max_rows = 99
import matplotlib.pyplot as plt
from tqdm import tqdm
import seaborn as sns
from catboost import CatBoostClassifier, Pool, cv

['test_identity.csv', 'test_transaction.csv', 'sample_submission.csv', 'train_transaction.csv', 'train_identity.csv']


In [2]:
# Re-assign the column dtypes to reduce memory usage.
# https://www.kaggle.com/mhviraf/reducing-memory-size-an-alternative
# Integer columns that contain NaN are stored as float, and downcasting them with reduce_mem_usage can lose data.
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce its memory usage.

    Handling per column dtype:

    * signed / unsigned NumPy integers -> smallest integer type of the same
      signedness whose range contains the column's min and max (bounds are
      inclusive, so e.g. a max of 127 still fits ``int8``);
    * NumPy floats -> ``float16`` if the values lie strictly inside its range,
      else ``float32``, else ``float64`` (an all-NaN column ends up ``float64``);
    * object / string columns -> ``category``;
    * everything else (already-categorical, bool, datetime, nullable extension
      dtypes, ...) is left untouched instead of being pushed through the
      numeric comparisons, which would raise or silently convert to float.

    Columns listed in ``del_cols`` are skipped because downcasting changes
    their values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Columns whose data changes when downcast (81 columns); never touched.
    del_cols = ['TransactionAmt','dist1', 'dist2', 'C1', 'C2', 'C4', 'C6', 'C7', 'C8', 'C10', 'C11', 'C12', 'C13', 'D8', 'D9', 'V126', 'V127', 'V128', 'V129', 'V130', 'V131', 'V132', 'V133', 'V134', 'V135', 'V136', 'V137', 'V150', 'V159', 'V164', 'V202', 'V203', 'V204', 'V205', 'V206', 'V207', 'V208', 'V209', 'V210', 'V211', 'V212', 'V213', 'V214', 'V215', 'V216', 'V263', 'V264', 'V265', 'V266', 'V267', 'V268', 'V269', 'V270', 'V271', 'V272', 'V273', 'V274', 'V275', 'V276', 'V277', 'V278', 'V306', 'V307', 'V308', 'V309', 'V310', 'V311', 'V312', 'V313', 'V314', 'V315', 'V316', 'V317', 'V318', 'V319', 'V320', 'V321', 'V332', 'V334', 'V335', 'V336']
    excluded = set(del_cols)

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32)

    cols = [x for x in df.columns if x not in excluded]
    for col in tqdm(cols):
        col_type = df[col].dtype

        # Already categorical: min()/max() would raise on unordered categories.
        if str(col_type) == 'category':
            continue

        # ``kind`` is only defined for NumPy dtypes; extension dtypes get None.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None

        if kind in ('i', 'u'):
            c_min = df[col].min()
            c_max = df[col].max()
            candidates = signed_types if kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # An empty column yields NaN here; every comparison is then
                # False and the column keeps its dtype.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif kind == 'f':
            c_min = df[col].min()
            c_max = df[col].max()
            for candidate in float_types:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf.
                df[col] = df[col].astype(np.float64)
        elif kind == 'O' or pd.api.types.is_string_dtype(col_type):
            df[col] = df[col].astype('category')
        # Any other dtype (bool, datetime, nullable ints, ...) is left as is.

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [3]:
# train = pd.read_pickle('../input/datas6/train.pkl')
# test = pd.read_pickle('../input/datas6/test.pkl')
# gc.collect()

# Load the pre-built feature matrices and the target from the cached pickles.
# NOTE: pickle files execute code on load; only read files from a trusted source.
X_train, X_test, Y_train = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        '../input/datas7/X_train.pkl',
        '../input/datas7/X_test.pkl',
        '../input/datas7/Y_train.pkl',
    )
)

In [4]:
# # 바로 모델링
# X_train = train.sort_values('TransactionDT').drop(['isFraud','TransactionDT','date'],axis =1)
# Y_train = train.sort_values('TransactionDT')['isFraud']
# X_test = test.sort_values('TransactionDT').drop(['TransactionDT','date'],axis =1 )
# train, test = [], [] 
# del train, test
# gc.collect()

In [5]:
# numerical_columns = list(X_train.select_dtypes(include=['float16','float32','float64','int8','int16','int64']).columns)
# numerical_columns=list(X_train[numerical_columns].isnull().sum()[X_train[numerical_columns].isnull().sum()>0].index)
# #list(test.select_dtypes(exclude=['object']).columns)
# print(X_train.shape)
# X_train[numerical_columns] =X_train[numerical_columns].fillna(-999)
# X_test[numerical_columns] =X_test[numerical_columns].fillna(-999)
# print("filling numerical columns null values done")

In [6]:
# categorical_features = ['ProductCD','M4',
#                         'card1','card2','card3','card4','card5','card6',
#                         'addr1','addr2','dist1','dist2',
#                         'P_emaildomain','R_emaildomain',
#                        ]
# categorical_features = list(set(categorical_features+X_train.select_dtypes(include='category').columns.tolist()+X_test.select_dtypes(include='category').columns.tolist()))
# # categorical_features

In [7]:
# categorical_features +=['M1_9', 'uid1', 'uid2', 'uid3', 'uid4', 'uid6', 'uid7', 'uid8', 'uid9', 'uid10', 'V1_11', 'V35_52', 'V75_94',
#                          'card1_addr1','id_01', 'id_02', 'id_03', 'id_04', 'id_05', 'id_06', 'id_07', 'id_08', 'id_09', 'id_10', 'id_11', 'id_13',
#                          'id_14', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 'id_24', 'id_25', 'id_26', 'id_32', 'id_03_04', 'id_05_06',
#                      'id_07_08', 'id_35_38', 'id_17_20_ip1', 'id_21_22_ip2', 'id_24_26_ip3','TransactionAmt_grouping', 'dayofyear_block', 'hours_block',
#                     ]
# categorical_features = list(set(categorical_features))

In [8]:

# for col in categorical_features :
#     try :
#         X_train[col] = X_train[col].cat.add_categories(-999).fillna(-999)
#         X_test[col] = X_test[col].cat.add_categories(-999).fillna(-999)
#     except :
#         X_train[col] = X_train[col].fillna(-999)
#         X_test[col] = X_test[col].fillna(-999)

In [9]:
# # Categorical feature로 변경
# X_train[categorical_features] = X_train[categorical_features].astype('category')
# X_test[categorical_features] = X_test[categorical_features].astype('category')

In [10]:
# from sklearn import preprocessing
# for f in tqdm(X_train.select_dtypes(include='category').columns.tolist() + X_train.select_dtypes(include='object').columns.tolist()):
# #     print(f)
#     lbl = preprocessing.LabelEncoder()
#     lbl.fit(list(X_train[f].values) + list(X_test[f].values))
#     X_train[f] = lbl.transform(list(X_train[f].values))
#     X_test[f] = lbl.transform(list(X_test[f].values))

In [11]:
# Names of the columns that are handled as categorical features further down
# (cast to ``category`` and passed to the model as categorical features).
categorical_features = [
    'ProductCD', 'M4',
    'card1', 'card2', 'card3', 'card4', 'card5', 'card6',
    'addr1', 'addr2', 'dist1', 'dist2',
    'P_emaildomain', 'R_emaildomain',
]
categorical_features += [
    'M1_9', 'uid1', 'uid2', 'uid3', 'uid4', 'uid6', 'uid7', 'uid8', 'uid9', 'uid10', 'V1_11', 'V35_52', 'V75_94',
    'card1_addr1', 'id_01', 'id_02', 'id_03', 'id_04', 'id_05', 'id_06', 'id_07', 'id_08', 'id_09', 'id_10', 'id_11', 'id_13',
    'id_14', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 'id_24', 'id_25', 'id_26', 'id_32', 'id_03_04', 'id_05_06',
    'id_07_08', 'id_35_38', 'id_17_20_ip1', 'id_21_22_ip2', 'id_24_26_ip3', 'TransactionAmt_grouping', 'dayofyear_block', 'hours_block',
]
# De-duplicate while keeping declaration order. ``list(set(...))`` would also
# de-duplicate, but the iteration order of a set of strings changes between
# interpreter runs, which makes the feature order non-reproducible.
categorical_features = list(dict.fromkeys(categorical_features))

In [12]:
########################### Frequency encoding
# For every listed column, count how often each value occurs across train and
# test combined and store that count as a new `<col>_fq_enc` feature.
# (Author's note: this is probably not very meaningful for the C and D columns.)
i_cols = [
    # 'card1', 'card2', 'card3', 'card5',
    'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14',
    'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9',
    'addr1', 'addr2',
    'dist1', 'dist2',
    # 'P_emaildomain',
    'R_emaildomain',
]

for source_col in i_cols:
    combined_values = pd.concat([X_train[source_col], X_test[source_col]])
    # dropna=False so that missing values receive a frequency as well.
    value_frequency = combined_values.value_counts(dropna=False).to_dict()
    encoded_name = source_col + '_fq_enc'
    X_train[encoded_name] = X_train[source_col].map(value_frequency)
    X_test[encoded_name] = X_test[source_col].map(value_frequency)

In [13]:
# Binary flag: 1 when `day` is greater than 27, otherwise 0.
X_train['day_block'] = X_train['day'].map(lambda x: 1 if x>27 else 0)
X_test['day_block'] = X_test['day'].map(lambda x: 1 if x>27 else 0)
# Guarded so that re-running this cell does not register the feature twice.
if 'day_block' not in categorical_features:
    categorical_features.append('day_block')

In [14]:
# X_train['C0'] = np.where(X_train['C1']==0,X_train['C2'],X_train['C1'])
# X_train['Amt_C0'] = X_train['TransactionAmt']/X_train['C0']
# X_train['Amt_C0'] = X_train['Amt_C0'].fillna(X_train['TransactionAmt'])
# X_train['Amt_C0_int'] = X_train['Amt_C0'].astype('int')
# X_train['Amt_C0_dec'] = (X_train['Amt_C0'] -X_train['Amt_C0_int'])
# # X_train['Amt_C0_dec']

# X_test['C0'] = np.where(X_test['C1']==0,X_test['C2'],X_test['C1'])
# X_test['C0'] = np.where(X_test['C0']==0,1,X_test['C0'])
# X_test['Amt_C0'] = X_test['TransactionAmt']/X_test['C0']
# X_test['Amt_C0'] = X_test['Amt_C0'].fillna(X_test['TransactionAmt'])
# X_test['Amt_C0_int'] = X_test['Amt_C0'].astype('int')
# X_test['Amt_C0_dec'] = (X_test['Amt_C0'] -X_test['Amt_C0_int'])
# # X_test[X_test['Amt_C0']==np.inf]['']


In [15]:
# D columns consulted for D16, in priority order.
d_priority_cols = ['D15', 'D10', 'D2', 'D14', 'D12', 'D8']

for frame in (X_train, X_test):
    # Map the -999 placeholder to 0 so it can never satisfy the "> 0" test.
    for d_col in d_priority_cols:
        frame[d_col] = np.where(frame[d_col] == -999, 0, frame[d_col])

    # D16 starts as D15; wherever the running value is not strictly positive
    # (including NaN) it is replaced by the next column in the priority list.
    # The final fallback (D8) is taken as-is, whatever its value.
    frame['D16'] = frame[d_priority_cols[0]]
    for fallback_col in d_priority_cols[1:]:
        frame['D16'] = np.where(frame['D16'] > 0, frame['D16'], frame[fallback_col])

    # Remaining NaNs in the source columns become -999.
    for d_col in d_priority_cols:
        frame[d_col] = frame[d_col].fillna(-999)

In [16]:
# Running day counter since the first row: starts at 1 and is incremented every
# time `dayofyear` differs from the previous row (train rows first, then test).
day_sequence = list(X_train['dayofyear']) + list(X_test['dayofyear'])
fromstart = []
day_counter = 1
previous_day = 0
for position in tqdm(range(len(day_sequence))):
    current_day = day_sequence[position]
    # The very first row always gets 1; afterwards a change of day bumps the counter.
    if fromstart and previous_day != current_day:
        day_counter += 1
    fromstart.append(day_counter)
    previous_day = current_day

n_train_rows = X_train.shape[0]
X_train['fromstart'] = fromstart[:n_train_rows]
X_test['fromstart'] = fromstart[n_train_rows:]

# `grouping`: (D16 - fromstart) / 10, rounded to the nearest integer value.
for frame in (X_train, X_test):
    frame['grouping'] = (frame['D16'] - frame['fromstart']) / 10
    frame['grouping'] = np.round(frame['grouping'])

100%|██████████| 1097231/1097231 [00:00<00:00, 1414507.63it/s]


In [17]:
# Some arbitrary feature interactions: each entry `A__B` becomes a new column
# holding the string "<A>_<B>", label-encoded over train and test together.
features = [ 'P_emaildomain__C2','D11__DeviceInfo','DeviceInfo__P_emaildomain','card5__P_emaildomain','uid10__grouping']
# Other candidates that are currently not used:
#[ 'DeviceInfo__P_emaildomain', 'P_emaildomain__C2', 
#                'card2__dist1', 'card1__card5', 'card2__id_20', 'card5__P_emaildomain', 'addr1__card1']
# features = features[:6]
print(features)
from sklearn import preprocessing
for feature in features:

    f1, f2 = feature.split('__')
    X_train[feature] = X_train[f1].astype(str) + '_' + X_train[f2].astype(str)
    X_test[feature] = X_test[f1].astype(str) + '_' + X_test[f2].astype(str)
    # Guarded so that re-running this cell does not register the feature twice.
    if feature not in categorical_features:
        categorical_features.append(feature)
    # Fit on train + test so that both share one consistent integer coding.
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(X_train[feature].values) + list(X_test[feature].values))
    X_train[feature] = lbl.transform(list(X_train[feature].values))
    X_test[feature] = lbl.transform(list(X_test[feature].values))
    
    

# for f in tqdm(X_train.select_dtypes(include='category').columns.tolist() + X_train.select_dtypes(include='object').columns.tolist()):
# #     print(f)
#     lbl = preprocessing.LabelEncoder()
#     lbl.fit(list(X_train[f].values) + list(X_test[f].values))
#     X_train[f] = lbl.transform(list(X_train[f].values))
#     X_test[f] = lbl.transform(list(X_test[f].values))
    

['P_emaildomain__C2', 'D11__DeviceInfo', 'DeviceInfo__P_emaildomain', 'card5__P_emaildomain', 'uid10__grouping']


In [18]:
# Number of rows (train + test combined) that share each uid10__grouping value.
uid10_grouping_counts = pd.concat(
    [X_train['uid10__grouping'], X_test['uid10__grouping']],
    ignore_index=True,
).value_counts(dropna=False)

for frame in (X_train, X_test):
    frame['uid10__grouping_count_full'] = frame['uid10__grouping'].map(uid10_grouping_counts)


In [19]:
%timeit
# Rolling TransactionAmt statistics per uid10__grouping over a window of the
# last 5 rows (min_periods=1), in the current row order. Train and test are
# processed independently of each other.
rolling_key = 'uid10__grouping'

for frame in (X_train, X_test):
    # The key is grouped on as a string and cast back to float32 at the end.
    frame[rolling_key] = frame[rolling_key].astype('str')

    for stat_name in ('count', 'mean', 'min', 'max', 'std'):
        frame[stat_name + '_last_' + rolling_key] = (
            frame.groupby(rolling_key)['TransactionAmt']
            .transform(lambda amounts: getattr(amounts.rolling(5, 1), stat_name)())
        )

    # Current amount relative to the rolling mean / rolling std computed above
    # (same 5-row window).
    for stat_name in ('mean', 'std'):
        frame['trans_' + stat_name + '_last_' + rolling_key] = (
            frame['TransactionAmt'] / frame[stat_name + '_last_' + rolling_key]
        )

    frame[rolling_key] = frame[rolling_key].astype('float32')

In [20]:
def uid_aggregation(train_df, test_df, main_columns, uids, aggregations):
    """Add group-wise aggregates of ``main_columns`` keyed by ``uids``.

    For every combination of a value column, a key column and an aggregation
    name, the aggregate is computed over train and test combined and mapped
    back onto both frames as a new column named
    ``<uid>_<main_column>_<aggregation>``.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames to extend; both are modified in place.
    main_columns : iterable of str
        Columns whose values are aggregated.
    uids : iterable of str
        Columns to group by.
    aggregations : iterable of str
        Aggregation names understood by pandas ``groupby(...).agg``
        (e.g. ``'mean'``, ``'std'``). May be empty, in which case nothing
        is added.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The same ``train_df`` and ``test_df`` objects.
    """
    for main_column in tqdm(main_columns):
        for col in uids:
            # Build the combined frame once per (key, value) pair rather than
            # once per aggregation; it does not depend on the aggregation.
            combined = pd.concat([train_df[[col, main_column]], test_df[[col, main_column]]])
            grouped = combined.groupby(col)[main_column]

            for agg_type in aggregations:
                new_col_name = col + '_' + main_column + '_' + agg_type
                # {key value -> aggregate}; keys that are NaN are not grouped.
                mapping = grouped.agg(agg_type).to_dict()

                train_df[new_col_name] = train_df[col].map(mapping)
                test_df[new_col_name] = test_df[col].map(mapping)

            # Release the temporary frame before moving to the next key.
            del combined, grouped
            gc.collect()
    return train_df, test_df

In [21]:
# ########################### D Columns
# # From columns description we know that
# # D1-D15: timedelta, such as days between previous transaction, etc.
# # 1. I can't imagine normal negative timedelta values (Let's clip Values)
# # 2. Normalize (Min-Max, Standard score) All D columns, except D1,D2,D9
# # 3. Do some aggregations based on uIDs
# # 4. Freaquency encoding
# # 5. D1,D2 are clipped by max train_df values (let's scale it)
# i_cols = ['D'+str(i) for i in range(1,16)]
# i_cols.remove('D7')
# #i_cols += ['TransactionAmt'] ## 커널이 죽는다
# # i_cols += ['isFraud']
# uids = ['uid10__grouping']  #'card1','card2','card3','card5' 추가하면 커널 죽음 #,'uid9','uid7'
# aggregations = ['mean','std']

# ####### uIDs aggregations
# X_train, X_test = uid_aggregation(X_train, X_test, i_cols, uids, aggregations)

In [22]:
# i_cols = ['TransactionAmt']
# uids = ['uid10__grouping']# ['card1','uid1','uid4','uid7','uid9']  
# aggregations = ['mean','std']

# ####### uIDs aggregations
# X_train, X_test = uid_aggregation(X_train, X_test, i_cols, uids, aggregations)

In [23]:
# https://www.kaggle.com/c/elo-merchant-category-recommendation/discussion/77537
# Goal: reduce the gap between the CV score and the leaderboard score by
# removing non-categorical features whose train and test distributions differ
# so strongly that the two-sample KS test returns a p-value of exactly 0.
from scipy.stats import ks_2samp

cols = [name for name in X_train.columns if name not in categorical_features]
list_p_value = [ks_2samp(X_test[name], X_train[name])[1] for name in tqdm(cols)]

Se = pd.Series(list_p_value, index=cols).sort_values()
list_discarded = list(Se.index[Se == 0])
# if 'dayofyear' in list_discarded :list_discarded.remove('dayofyear')
X_train = X_train.drop(columns=list_discarded)
X_test = X_test.drop(columns=list_discarded)

100%|██████████| 470/470 [01:16<00:00,  6.14it/s]


In [24]:
# Report the training-matrix shape after the KS-based filtering, then free memory.
print(X_train.shape)
gc.collect()

(590540, 441)


0

In [25]:
# Drop highly correlated features. For train and for test separately, build the
# absolute correlation matrix and collect every column whose correlation with
# an earlier column exceeds 0.98; the union of both sets is dropped from both
# frames.
drop_cols = []
for frame in tqdm([X_train, X_test]):
    corr_matrix = frame.corr().abs()
    # Keep only the strict upper triangle so each pair is inspected once.
    # (The builtin `bool` is used: the `np.bool` alias was removed from NumPy.)
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)
    # Columns correlated above 0.98 with at least one earlier column.
    drop_cols += [column for column in upper.columns if (upper[column] > 0.98).any()]
drop_cols = list(set(drop_cols))
print(drop_cols)
gc.collect()
# if 'dayofyear' in drop_cols :drop_cols.remove('dayofyear')
X_train = X_train.drop(drop_cols,axis =1 )
X_test = X_test.drop(drop_cols,axis =1 )

100%|██████████| 2/2 [07:50<00:00, 235.16s/it]


['V177', 'V121', 'V192', 'V123', 'V124', 'V118', 'V153', 'V180', 'V226', 'V251', 'V262', 'V173', 'V257', 'V229', 'V191', 'V317', 'uid4_D9_std', 'V115', 'V139', 'uid9', 'V211', 'V255', 'V242', 'V186', 'V252', 'V187', 'C6', 'id_11', 'V181', 'V196', 'V184', 'V236', 'V237', 'V110', 'addr2_fq_enc', 'V248', 'V275', 'V297', 'V157', 'V241', 'uid10_D9_std', 'V154', 'V283', 'V172', 'V174', 'uid3_count_full', 'V326', 'V199', 'V247', 'id_05', 'V300', 'C11', 'V218', 'id_29', 'id_10', 'V329', 'V261', 'V243', 'V111', 'V144', 'V213', 'V221', 'V327', 'V333', 'V302', 'V140', 'V325', 'D9_fq_enc', 'V219', 'V197', 'V231', 'TransactionAmt_grouping', 'V112', 'id_04', 'V143', 'V225', 'V210', 'Avg_V134', 'V254', 'D8_fq_enc', 'C14', 'uid4', 'V223', 'V258', 'uid8', 'V125', 'V175', 'V228', 'V244', 'V256', 'V232', 'V270', 'V152', 'V227', 'V305', 'V195', 'uid10_D9_mean', 'V176', 'R_emaildomain_fq_enc', 'V293', 'V158', 'C8', 'V323', 'V179', 'V208', 'V109', 'V245', 'V169', 'V295', 'V147', 'V116', 'V328', 'V149', 'uid

In [26]:
# Remove every column that has at most one distinct non-null value in either
# the training frame or the test frame.
constant_cols = set()
for frame in (X_train, X_test):
    constant_cols.update(name for name in frame.columns if frame[name].nunique() <= 1)

drop_cols = list(constant_cols)
X_train = X_train.drop(columns=drop_cols)
X_test = X_test.drop(columns=drop_cols)

In [27]:
# Remove every column whose most frequent value (NaN counted as a value) makes
# up more than 98% of the rows in either the training frame or the test frame.
dominant_value_cols = set()
for frame in (X_train, X_test):
    for name in frame.columns:
        top_share = frame[name].value_counts(dropna=False, normalize=True).values[0]
        if top_share > 0.98:
            dominant_value_cols.add(name)

drop_cols = list(dominant_value_cols)
X_train = X_train.drop(columns=drop_cols)
X_test = X_test.drop(columns=drop_cols)

In [28]:
# Categorical feature로 변경
# X_train[categorical_features] = X_train[categorical_features].astype('category')
# X_test[categorical_features] = X_test[categorical_features].astype('category')
# from sklearn import preprocessing
# for f in tqdm(X_train.select_dtypes(include='category').columns.tolist() + X_train.select_dtypes(include='object').columns.tolist()):
# #     print(f)
#     lbl = preprocessing.LabelEncoder()
#     lbl.fit(list(X_train[f].values) + list(X_test[f].values))
#     X_train[f] = lbl.transform(list(X_train[f].values))
#     X_test[f] = lbl.transform(list(X_test[f].values))

In [29]:
# # https://www.kaggle.com/duykhanh99/hust-lgb-fe-0-9485-lb-newfeature

# drop_cols = ['V300','V309','V111','V124','V106','V125','V315','V134','V102','V123','V316','V113',
#               'V136','V305','V110','V299','V289','V286','V318','V304','V116','V284','V293',
#               'V137','V295','V301','V104','V311','V115','V109','V119','V321','V114','V133','V122','V319',
#               'V105','V112','V118','V117','V121','V108','V135','V320','V303','V297','V120',
#               'V1','V14','V41','V65','V88', 'V89', 'V107', 'V68', 'V28', 'V27', 'V29', 'V241','V269',
#               'V240', 'V325', 'V138', 'V154', 'V153', 'V330', 'V142', 'V195', 'V302', 'V328', 'V327', 
#               'V198', 'V196', 'V155']
# drop_cols = [x for x in drop_cols if x in list(X_train.columns)]

In [30]:
# Report the training-matrix shape after all column-dropping steps, then free memory.
print(X_train.shape)
gc.collect()

(590540, 221)


0

In [31]:
from sklearn.model_selection import StratifiedKFold, KFold,TimeSeriesSplit
from sklearn.metrics import roc_auc_score
import lightgbm as lgb


In [32]:
# Keep only the categorical feature names that still exist as columns.
remaining_columns = set(X_train.columns)
categorical_features = [name for name in categorical_features if name in remaining_columns]

In [33]:
# Cast all categorical feature columns to the pandas `category` dtype.
for frame in (X_train, X_test):
    frame[categorical_features] = frame[categorical_features].astype('category')

In [34]:
# Free memory released by the dtype conversion before training starts.
gc.collect()

0

In [35]:
#cols = ['uid2', 'uid3', 'uid4', 'uid6', 'uid7', 'uid8', 'uid9', 'uid10','uid10__grouping']
#for col in cols :
#    try :
  #      X_train.remove(col)
   #     X_test.remove(col)
  #  except : continue

In [36]:

seeds =  79          # random seed passed to the models
splitcounts = 10     # number of cross-validation folds
LGBM =  False#True   # True -> train LightGBM, False -> train CatBoost
# Cross-validation splitter: plain K-fold over contiguous, unshuffled blocks.
# `random_state` is only meaningful together with `shuffle=True`; passing it
# without shuffling is rejected by current scikit-learn, so it is omitted here
# and the (unshuffled) splits stay exactly the same.
# NOTE(review): the earlier comment said StratifiedKFold was chosen based on
# previous tests, but the active splitter is KFold - confirm which is intended.
# folds = TimeSeriesSplit(n_splits= splitcounts)
folds = KFold(n_splits=splitcounts, shuffle=False)
# folds = StratifiedKFold(n_splits=splitcounts, random_state = seeds)

# params =  {
#         'objective': 'binary',
#         'metric': 'auc',
#         'num_threads': 4,
#         'learning_rate': 0.01, 
#         'num_iterations' : 10000,
#         'max_depth': -1,
#         'reg_alpha': 0.3,
#          'reg_lambda': 0.3,
#         'bagging_seed' : seeds,
#         'verbose' : -1,
#         'seed' :seeds
#     }

if LGBM :
    # ---- LightGBM branch -------------------------------------------------
    # One model per fold; each model is scored on its validation fold and used
    # to predict the test set. The submission is the mean of the per-fold test
    # predictions.
    params = {'objective': 'binary', 'metric': 'auc', 'num_threads': 4, 'learning_rate': 0.007898644187072399, 'num_iterations': 10000, 
              'num_leaves': 650, 'min_data_in_leaf': 8, 'max_depth': -1, 'bagging_fraction': 0.395331230891172, 
              'feature_fraction': 0.31552276732000295, 'lambda_l1': 0.3470087563049069, 'lambda_l2': 0.503276501340582,
              'min_child_weight': 0.03208937707510653, 'bagging_seed': seeds, 'verbose': -1, 'seed': seeds }
             #, 'categorical_feature' :  categorical_features}
    CVscore = []      # validation AUC per fold
    predicts = []     # test-set predictions per fold
    X_idx = []        # row positions of the validation samples, in fold order
    X_predicts = []   # validation predictions per fold
    # Feature importance per fold: one row per feature, one column per fold.
    importance = pd.DataFrame(np.zeros((X_train.shape[1], splitcounts)), columns=['Fold_{}'.format(i) for i in range(1, splitcounts+1)], index=X_train.columns)
    for fold_, (train_idx, test_idx) in enumerate(folds.split(X_train,Y_train)):
#         if fold_ != 9 : continue

        X_train_, X_val_ = X_train.iloc[train_idx,:], X_train.iloc[test_idx,:]
        Y_train_, Y_val_ = Y_train.iloc[train_idx], Y_train.iloc[test_idx]

        lgb_train = lgb.Dataset(data=X_train_,label = Y_train_)
        lgb_valid = lgb.Dataset(data=X_val_,label = Y_val_)

        # NOTE(review): `verbose_eval` / `early_stopping_rounds` are keyword
        # arguments of older LightGBM releases - confirm against the installed version.
        lgb_model = lgb.train(params, lgb_train, valid_sets =lgb_valid, verbose_eval = 200, early_stopping_rounds= 500)

        # Training AUC is measured on the rows this fold was trained on only;
        # scoring the whole of X_train would mix in the validation rows.
        y = lgb_model.predict(X_train_, num_iteration=lgb_model.best_iteration)
        train_score = roc_auc_score(Y_train_.astype('float32'), y) 

        # Validation predictions and AUC for this fold.
        y = lgb_model.predict(X_val_, num_iteration = lgb_model.best_iteration)
        X_predicts.append(y)
        X_idx += list(test_idx)
        score = roc_auc_score(Y_val_,y)
        CVscore.append(score)

        print("Fold : ", fold_,"train_auc : ", train_score, "val_auc : ", score )

        # Test-set prediction of this fold's model.
        y = lgb_model.predict(X_test,num_iteration = lgb_model.best_iteration)
        # `fold_` is zero-based, so it addresses column `Fold_{fold_ + 1}`
        # directly; `fold_ - 1` would write fold 0 into the last column.
        importance.iloc[:, fold_] = lgb_model.feature_importance()
        predicts.append(y)

        # Release the model and datasets before the next fold.
        lgb_model, lgb_train, lgb_valid = None, None, None
        del lgb_model,lgb_train, lgb_valid
        gc.collect()

    print("CV Score : " ,np.mean(CVscore))
    submission = pd.read_csv('../input/ieee-fraud-detection/sample_submission.csv', index_col='TransactionID')
    submission['isFraud'] = np.mean(predicts, axis =0)
    submission.to_csv('submission.csv')
    importance.to_csv('importance.csv')
#     pd.DataFrame({'TransactionID':X_idx,'predict':X_predicts}).to_csv('modelprediction_all.csv')

else :
    # ---- CatBoost branch -------------------------------------------------
    # One model per fold, early-stopped on the validation fold; the submission
    # is the mean of the per-fold test predictions.
#     X_train = reduce_mem_usage(X_train)
#     X_test = reduce_mem_usage(X_test)
#     X_train.to_pickle('X_train.pkl')
#     X_test.to_pickle('X_test.pkl')
#     Y_train.to_pickle('Y_train.pkl')
    gc.collect()
    params = {
                'n_estimators': 5000,
                'learning_rate': 0.07,
                'eval_metric':'AUC',
                'loss_function':'Logloss',
                'random_seed':seeds,
                'metric_period':500,
                'od_wait':500, # early stopping
                'task_type':'GPU',
                'depth': 8,
                #'colsample_bylevel':0.7,
              'max_ctr_complexity' : 2
                } 

    # Validation scoring is not performed in this branch, so CVscore, X_idx,
    # X_predicts and `importance` keep their initial (empty / zero) contents.
    CVscore = []
    predicts = []     # test-set predictions per fold
    X_idx = []
    X_predicts = []
    importance = pd.DataFrame(np.zeros((X_train.shape[1], splitcounts)), columns=['Fold_{}'.format(i) for i in range(1, splitcounts+1)], index=X_train.columns)

    for fold_, (train_idx, test_idx) in enumerate(folds.split(X_train,Y_train)):
        print(fold_)
        #if fold_ != 5: continue
        X_train_, X_val_ = X_train.iloc[train_idx,:], X_train.iloc[test_idx,:]
        Y_train_, Y_val_ = Y_train.iloc[train_idx], Y_train.iloc[test_idx]

        cat_model = CatBoostClassifier(**params)        
        cat_model.fit(
            X_train_,Y_train_,
            eval_set=(X_val_, Y_val_),
            cat_features=categorical_features,
            use_best_model=True,
            verbose=True
        )
        print("Training end")

        # Probability of the positive class on the test set.
        y = cat_model.predict_proba(X_test)[:,1]
        predicts.append(y)

        # Release the model and fold slices before the next fold.
        cat_model = None
        del cat_model, X_train_, X_val_,Y_train_, Y_val_
        gc.collect()

    submission = pd.read_csv('../input/ieee-fraud-detection/sample_submission.csv', index_col='TransactionID')
    submission['isFraud'] = np.mean(predicts, axis =0)
    submission.to_csv('submission.csv')

0
0:	learn: 0.8282966	test: 0.7545147	best: 0.7545147 (0)	total: 173ms	remaining: 14m 24s
500:	learn: 0.9763851	test: 0.9392651	best: 0.9392651 (500)	total: 1m 16s	remaining: 11m 27s
1000:	learn: 0.9806029	test: 0.9426043	best: 0.9426275 (998)	total: 2m 38s	remaining: 10m 32s
1500:	learn: 0.9837750	test: 0.9434239	best: 0.9438818 (1198)	total: 3m 59s	remaining: 9m 19s
bestTest = 0.9438818097
bestIteration = 1198
Shrink model to first 1199 iterations.
Training end
1
0:	learn: 0.8278483	test: 0.7874370	best: 0.7874370 (0)	total: 146ms	remaining: 12m 10s
500:	learn: 0.9759010	test: 0.9372289	best: 0.9372373 (499)	total: 1m 18s	remaining: 11m 41s
1000:	learn: 0.9800790	test: 0.9412803	best: 0.9413058 (993)	total: 2m 39s	remaining: 10m 36s
1500:	learn: 0.9831250	test: 0.9427502	best: 0.9428031 (1428)	total: 4m 1s	remaining: 9m 22s
bestTest = 0.9428031445
bestIteration = 1428
Shrink model to first 1429 iterations.
Training end
2
0:	learn: 0.8223577	test: 0.7580749	best: 0.7580749 (0)	total: 

In [37]:
if LGBM :
    # Average the per-fold importances. Only the `Fold_*` columns are used so
    # that re-running this cell does not fold a previously computed
    # `Mean_Importance` column into the average.
    fold_columns = [name for name in importance.columns if str(name).startswith('Fold_')]
    importance['Mean_Importance'] = importance[fold_columns].mean(axis=1)
    importance = importance.sort_values(by='Mean_Importance', ascending=False)

    # Horizontal bar chart, most important feature at the top.
    plt.figure(figsize=(15, 120))
    sns.barplot(x='Mean_Importance', y=importance.index, data=importance)

    plt.xlabel('')
    plt.tick_params(axis='x', labelsize=15)
    plt.tick_params(axis='y', labelsize=15)
    plt.title('Mean Feature Importance Between Folds', size=15)

    plt.show()

In [38]:
# pd.DataFrame({'TransactionID':X_idx,'predict':X_predicts})
# Display the list of per-fold validation predictions collected during training.
X_predicts

[]