In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import scipy as sp
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# Standard plotly imports
#import plotly.plotly as py
import plotly.graph_objs as go
import plotly.tools as tls
from plotly.offline import iplot, init_notebook_mode
#import cufflinks
#import cufflinks as cf
import plotly.figure_factory as ff

# Using plotly + cufflinks in offline mode
init_notebook_mode(connected=True)
#cufflinks.go_offline(connected=True)

# Preprocessing, modelling and evaluating
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, KFold
from xgboost import XGBClassifier
import xgboost as xgb

## Hyperopt modules
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, STATUS_RUNNING
from functools import partial

import os
import gc
print(os.listdir("../input"))

['test_identity.csv', 'test_transaction.csv', 'sample_submission.csv', 'train_transaction.csv', 'train_identity.csv']


In [2]:
#modeling
df_trans = pd.read_csv('../input/train_transaction.csv', index_col='TransactionID')
df_test_trans = pd.read_csv('../input/test_transaction.csv', index_col='TransactionID')

df_id = pd.read_csv('../input/train_identity.csv', index_col='TransactionID')
df_test_id = pd.read_csv('../input/test_identity.csv', index_col='TransactionID')

sample_submission = pd.read_csv('../input/sample_submission.csv', index_col='TransactionID')

df_train = df_trans.merge(df_id, how='left', left_index=True, right_index=True)
df_test = df_test_trans.merge(df_test_id, how='left', left_index=True, right_index=True)

print(df_train.shape)
print(df_train.shape)

(590540, 433)
(590540, 433)


In [3]:
def resumetable(df):
    print(f"Dataset Shape: {df.shape}")
    summary = pd.DataFrame(df.dtypes,columns=['dtypes'])
    summary = summary.reset_index()
    summary['Name'] = summary['index']
    summary = summary[['Name','dtypes']]
    summary['Missing'] = df.isnull().sum().values    
    summary['Uniques'] = df.nunique().values
    summary['First Value'] = df.loc[0].values
    summary['Second Value'] = df.loc[1].values
    summary['Third Value'] = df.loc[2].values

    for name in summary['Name'].value_counts().index:
        summary.loc[summary['Name'] == name, 'Entropy'] = round(stats.entropy(df[name].value_counts(normalize=True), base=2),2) 

    return summary

## Function to reduce the DF size
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

def CalcOutliers(df_num): 

    # calculating mean and std of the array
    data_mean, data_std = np.mean(df_num), np.std(df_num)

    # seting the cut line to both higher and lower values
    # You can change this value
    cut = data_std * 3

    #Calculating the higher and lower cut values
    lower, upper = data_mean - cut, data_mean + cut

    # creating an array of lower, higher and total outlier values 
    outliers_lower = [x for x in df_num if x < lower]
    outliers_higher = [x for x in df_num if x > upper]
    outliers_total = [x for x in df_num if x < lower or x > upper]

    # array without outlier values
    outliers_removed = [x for x in df_num if x > lower and x < upper]
    
    print('Identified lowest outliers: %d' % len(outliers_lower)) # printing total number of values in lower cut of outliers
    print('Identified upper outliers: %d' % len(outliers_higher)) # printing total number of values in higher cut of outliers
    print('Total outlier observations: %d' % len(outliers_total)) # printing total number of values outliers of both sides
    print('Non-outlier observations: %d' % len(outliers_removed)) # printing total number of non outlier values
    print("Total percentual of Outliers: ", round((len(outliers_total) / len(outliers_removed) )*100, 4)) # Percentual of outliers in points
    
    return

In [4]:
df_train = reduce_mem_usage(df_train)
df_test = reduce_mem_usage(df_test)

Mem. usage decreased to 668.22 Mb (66.2% reduction)
Mem. usage decreased to 583.43 Mb (65.6% reduction)


In [5]:
import datetime
START_DATE = '2017-12-01'
startdate = datetime.datetime.strptime(START_DATE, "%Y-%m-%d")
df_train["Date"] = df_train['TransactionDT'].apply(lambda x: (startdate + datetime.timedelta(seconds=x)))
df_train['_Weekdays'] = df_train['Date'].dt.dayofweek
df_train['_Hours'] = df_train['Date'].dt.hour
df_train['_Days'] = df_train['Date'].dt.day

START_DATE_test = '2018-06-01'
startdate = datetime.datetime.strptime(START_DATE_test, "%Y-%m-%d")
df_test["Date"] = df_test['TransactionDT'].apply(lambda x: (startdate + datetime.timedelta(seconds=x)))
df_test['_Weekdays'] = df_test['Date'].dt.dayofweek
df_test['_Hours'] = df_test['Date'].dt.hour
df_test['_Days'] = df_test['Date'].dt.day



In [6]:
def corret_card_id(x): 
    x=x.replace('.0','')
    x=x.replace('-999','nan')
    return x

def definie_indexes(df):
    # create card ID 
    cards_cols= ['card1', 'card2', 'card3', 'card5']
    for card in cards_cols: 
        if '1' in card: 
            df['Card_ID']= df[card].map(str)
        else : 
            df['Card_ID']+= ' '+df[card].map(str)
    
    # sort train data by Card_ID and then by transaction date 
    df= df.sort_values(['Card_ID', 'Date'], ascending=[True, True])
    
    # small correction of the Card_ID
    df['Card_ID']=df['Card_ID'].apply(corret_card_id)
    
    # set indexes 
    # df= df.set_index(['Card_ID', 'Date'])
    return df

df_train = definie_indexes(df_train)
df_test = definie_indexes(df_test)


In [7]:
########################### M columns (except M4)
# All these columns are binary encoded 1/0
# We can have some features from it
i_cols = ['M1','M2','M3','M5','M6','M7','M8','M9']

df_train['M_sum'] = df_train[i_cols].sum(axis=1).astype(np.int8)
df_test['M_sum']  = df_test[i_cols].sum(axis=1).astype(np.int8)

df_train['M_na'] = df_train[i_cols].isna().sum(axis=1).astype(np.int8)
df_test['M_na']  = df_test[i_cols].isna().sum(axis=1).astype(np.int8)

df_train['M_type'] = ''
df_test['M_type']  = ''

for col in i_cols:
    df_train['M_type'] = '_'+df_train[col].astype(str)
    df_test['M_type'] = '_'+df_test[col].astype(str)


In [8]:
# Outlier 찾기 : 3시그마 이상

# CalcOutliers(df_trans['TransactionAmt'])

In [9]:
########################### C columns
# C columns are some counts, based on client identity
# Most popular Value is "1" -> that seems to be just a single match 
# (New or stable client)
# You can check that auc score for that cliens are lower than global
# Lets encode such client types

i_cols = ['C1','C2','C3','C4','C5','C6','C7','C8','C9','C10','C11','C12','C13','C14']

df_train['C_sum'] = 0
df_test['C_sum']  = 0

df_train['C_null'] = 0
df_test['C_null']  = 0

for col in i_cols:
    df_train['C_sum'] += np.where(df_train[col]==1,1,0)
    df_test['C_sum']  += np.where(df_test[col]==1,1,0)

    df_train['C_null'] += np.where(df_train[col]==0,1,0)
    df_test['C_null']  += np.where(df_test[col]==0,1,0)
    
    valid_values = df_train[col].value_counts()
    valid_values = valid_values[valid_values>1000]
    valid_values = list(valid_values.index)
    
    df_train[col+'_valid'] = np.where(df_train[col].isin(valid_values),1,0)
    df_test[col+'_valid']  = np.where(df_test[col].isin(valid_values),1,0)


In [10]:
# ########################### Reset values for "noise" card1
# valid_card = df_train['card1'].value_counts()
# valid_card = valid_card[valid_card>10]
# valid_card = list(valid_card.index)
    
# df_train['card1'] = np.where(df_train['card1'].isin(valid_card), df_train['card1'], np.nan)
# df_test['card1']  = np.where(df_test['card1'].isin(valid_card), df_test['card1'], np.nan)


In [11]:
########################### Device info
df_train['DeviceInfo'] = df_train['DeviceInfo'].fillna('unknown_device').str.lower()
df_test['DeviceInfo'] = df_test['DeviceInfo'].fillna('unknown_device').str.lower()

df_train['DeviceInfo_c'] = df_train['DeviceInfo']
df_test['DeviceInfo_c'] = df_test['DeviceInfo']

device_match_dict = {
    'sm':'sm-',
    'sm':'samsung',
    'huawei':'huawei',
    'moto':'moto',
    'rv':'rv:',
    'trident':'trident',
    'lg':'lg-',
    'htc':'htc',
    'blade':'blade',
    'windows':'windows',
    'lenovo':'lenovo',
    'linux':'linux',
    'f3':'f3',
    'f5':'f5'
}
for dev_type_s, dev_type_o in device_match_dict.items():
    df_train['DeviceInfo_c'] = df_train['DeviceInfo_c'].apply(lambda x: dev_type_s if dev_type_o in x else x)
    df_test['DeviceInfo_c'] = df_test['DeviceInfo_c'].apply(lambda x: dev_type_s if dev_type_o in x else x)

df_train['DeviceInfo_c'] = df_train['DeviceInfo_c'].apply(lambda x: 'other_d_type' if x not in device_match_dict else x)
df_test['DeviceInfo_c'] = df_test['DeviceInfo_c'].apply(lambda x: 'other_d_type' if x not in device_match_dict else x)


In [12]:
df_train.loc[df_train['R_emaildomain'].isin(['gmail.com', 'gmail']),'R_emaildomain'] = 'Google'

df_train.loc[df_train['R_emaildomain'].isin(['yahoo.com', 'yahoo.com.mx',  'yahoo.co.uk',
                                             'yahoo.co.jp', 'yahoo.de', 'yahoo.fr',
                                             'yahoo.es']), 'R_emaildomain'] = 'Yahoo Mail'
df_train.loc[df_train['R_emaildomain'].isin(['hotmail.com','outlook.com','msn.com', 'live.com.mx', 
                                             'hotmail.es','hotmail.co.uk', 'hotmail.de',
                                             'outlook.es', 'live.com', 'live.fr',
                                             'hotmail.fr']), 'R_emaildomain'] = 'Microsoft'
df_train.loc[df_train.R_emaildomain.isin(df_train.R_emaildomain\
                                         .value_counts()[df_train.R_emaildomain.value_counts() <= 300 ]\
                                         .index), 'R_emaildomain'] = "Others"
df_train.R_emaildomain.fillna("NoInf", inplace=True)


In [13]:
df_test.loc[df_test['R_emaildomain'].isin(['gmail.com', 'gmail']),'R_emaildomain'] = 'Google'

df_test.loc[df_test['R_emaildomain'].isin(['yahoo.com', 'yahoo.com.mx',  'yahoo.co.uk',
                                             'yahoo.co.jp', 'yahoo.de', 'yahoo.fr',
                                             'yahoo.es']), 'R_emaildomain'] = 'Yahoo Mail'
df_test.loc[df_test['R_emaildomain'].isin(['hotmail.com','outlook.com','msn.com', 'live.com.mx', 
                                             'hotmail.es','hotmail.co.uk', 'hotmail.de',
                                             'outlook.es', 'live.com', 'live.fr',
                                             'hotmail.fr']), 'R_emaildomain'] = 'Microsoft'
df_test.loc[df_test.R_emaildomain.isin(df_test.R_emaildomain\
                                         .value_counts()[df_test.R_emaildomain.value_counts() <= 300 ]\
                                         .index), 'R_emaildomain'] = "Others"
df_test.R_emaildomain.fillna("NoInf", inplace=True)


In [14]:
df_train.loc[df_train['id_30'].str.contains('Windows', na=False), 'id_30'] = 'Windows'
df_train.loc[df_train['id_30'].str.contains('iOS', na=False), 'id_30'] = 'iOS'
df_train.loc[df_train['id_30'].str.contains('Mac OS', na=False), 'id_30'] = 'Mac'
df_train.loc[df_train['id_30'].str.contains('Android', na=False), 'id_30'] = 'Android'
df_train['id_30'].fillna("NAN", inplace=True)

In [15]:
df_test.loc[df_test['id_30'].str.contains('Windows', na=False), 'id_30'] = 'Windows'
df_test.loc[df_test['id_30'].str.contains('iOS', na=False), 'id_30'] = 'iOS'
df_test.loc[df_test['id_30'].str.contains('Mac OS', na=False), 'id_30'] = 'Mac'
df_test.loc[df_test['id_30'].str.contains('Android', na=False), 'id_30'] = 'Android'
df_test['id_30'].fillna("NAN", inplace=True)


In [16]:
df_train.loc[df_train['id_31'].str.contains('chrome', na=False), 'id_31'] = 'Chrome'
df_train.loc[df_train['id_31'].str.contains('firefox', na=False), 'id_31'] = 'Firefox'
df_train.loc[df_train['id_31'].str.contains('safari', na=False), 'id_31'] = 'Safari'
df_train.loc[df_train['id_31'].str.contains('edge', na=False), 'id_31'] = 'Edge'
df_train.loc[df_train['id_31'].str.contains('ie', na=False), 'id_31'] = 'IE'
df_train.loc[df_train['id_31'].str.contains('samsung', na=False), 'id_31'] = 'Samsung'
df_train.loc[df_train['id_31'].str.contains('opera', na=False), 'id_31'] = 'Opera'
df_train['id_31'].fillna("NAN", inplace=True)
df_train.loc[df_train.id_31.isin(df_train.id_31.value_counts()[df_train.id_31.value_counts() < 200].index), 'id_31'] = "Others"

In [17]:
df_test.loc[df_test['id_31'].str.contains('chrome', na=False), 'id_31'] = 'Chrome'
df_test.loc[df_test['id_31'].str.contains('firefox', na=False), 'id_31'] = 'Firefox'
df_test.loc[df_test['id_31'].str.contains('safari', na=False), 'id_31'] = 'Safari'
df_test.loc[df_test['id_31'].str.contains('edge', na=False), 'id_31'] = 'Edge'
df_test.loc[df_test['id_31'].str.contains('ie', na=False), 'id_31'] = 'IE'
df_test.loc[df_test['id_31'].str.contains('samsung', na=False), 'id_31'] = 'Samsung'
df_test.loc[df_test['id_31'].str.contains('opera', na=False), 'id_31'] = 'Opera'
df_test['id_31'].fillna("NAN", inplace=True)
df_test.loc[df_test.id_31.isin(df_test.id_31.value_counts()[df_test.id_31.value_counts() < 200].index), 'id_31'] = "Others"


In [18]:
def features_interaction(df, feature_1, feature_2):
    return df[feature_1].astype(str) + '_' + df[feature_2].astype(str)

In [19]:
# Based on https://www.kaggle.com/nroman/lgb-single-model-lb-0-9419

features_interactions = [
    'id_02__id_20',
    'id_02__D8',
    'D11__DeviceInfo',
    'DeviceInfo__P_emaildomain',
    'P_emaildomain__C2',
    'card2__dist1',
    'card1__card5',
    'card2__id_20',
    'card5__P_emaildomain',
    'addr1__card1'
]

for new_feature in features_interactions:
    feature_1, feature_2 = new_feature.split('__')
    
    df_train[new_feature] = features_interaction(df_train, feature_1, feature_2)
    df_test[new_feature] = features_interaction(df_test, feature_1, feature_2)

In [20]:
df_train['count_last'] = df_train.groupby('Card_ID')['TransactionAmt'].transform(lambda x: x.rolling(10, 1).count())
df_train['mean_last'] = df_train.groupby('Card_ID')['TransactionAmt'].transform(lambda x: x.rolling(10, 1).mean())
df_train['min_last'] = df_train.groupby('Card_ID')['TransactionAmt'].transform(lambda x: x.rolling(10, 1).min())
df_train['max_last'] = df_train.groupby('Card_ID')['TransactionAmt'].transform(lambda x: x.rolling(10, 1).max())
df_train['std_last'] = df_train.groupby('Card_ID')['TransactionAmt'].transform(lambda x: x.rolling(10, 1).std())
df_test['count_last'] = df_test.groupby('Card_ID')['TransactionAmt'].transform(lambda x: x.rolling(10, 1).count())
df_test['mean_last'] = df_test.groupby('Card_ID')['TransactionAmt'].transform(lambda x: x.rolling(10, 1).mean())
df_test['min_last'] = df_test.groupby('Card_ID')['TransactionAmt'].transform(lambda x: x.rolling(10, 1).min())
df_test['max_last'] = df_test.groupby('Card_ID')['TransactionAmt'].transform(lambda x: x.rolling(10, 1).max())
df_test['std_last'] = df_test.groupby('Card_ID')['TransactionAmt'].transform(lambda x: x.rolling(10, 1).std())
df_train['trans_mean_last'] = df_train['TransactionAmt'] / df_train.groupby('Card_ID')['TransactionAmt'].transform(lambda x: x.rolling(10, 1).mean())
df_train['trans_std_last'] = df_train['TransactionAmt'] / df_train.groupby('Card_ID')['TransactionAmt'].transform(lambda x: x.rolling(10, 1).std())
df_test['trans_mean_last'] = df_test['TransactionAmt'] / df_test.groupby('Card_ID')['TransactionAmt'].transform(lambda x: x.rolling(10, 1).mean())
df_test['trans_std_last'] = df_test['TransactionAmt'] / df_test.groupby('Card_ID')['TransactionAmt'].transform(lambda x: x.rolling(10, 1).std())
df_train['TransactionAmt_to_mean_card_id'] = df_train['TransactionAmt'] - df_train.groupby(['Card_ID'])['TransactionAmt'].transform('mean')
df_train['TransactionAmt_to_std_card_id'] = df_train['TransactionAmt_to_mean_card_id'] / df_train.groupby(['Card_ID'])['TransactionAmt'].transform('std')
df_test['TransactionAmt_to_mean_card_id'] = df_test['TransactionAmt'] - df_test.groupby(['Card_ID'])['TransactionAmt'].transform('mean')
df_test['TransactionAmt_to_std_card_id'] = df_test['TransactionAmt_to_mean_card_id'] / df_test.groupby(['Card_ID'])['TransactionAmt'].transform('std')
df_train['id_02_to_mean_card_id'] = df_train['id_02'] / df_train.groupby(['Card_ID'])['id_02'].transform('mean')
df_train['id_02_to_std_card_id'] = df_train['id_02'] / df_train.groupby(['Card_ID'])['id_02'].transform('std')
df_test['id_02_to_mean_card_id'] = df_test['id_02'] / df_test.groupby(['Card_ID'])['id_02'].transform('mean')
df_test['id_02_to_std_card_id'] = df_test['id_02'] / df_test.groupby(['Card_ID'])['id_02'].transform('std')
df_train['D15_to_mean_card_id'] = df_train['D15'] / df_train.groupby(['Card_ID'])['D15'].transform('mean')
df_train['D15_to_std_card_id'] = df_train['D15'] / df_train.groupby(['Card_ID'])['D15'].transform('std')
df_test['D15_to_mean_card_id'] = df_test['D15'] / df_test.groupby(['Card_ID'])['D15'].transform('mean')
df_test['D15_to_std_card_id'] = df_test['D15'] / df_test.groupby(['Card_ID'])['D15'].transform('std')
df_train['D15_to_mean_addr1'] = df_train['D15'] / df_train.groupby(['addr1'])['D15'].transform('mean')
df_train['D15_to_std_addr1'] = df_train['D15'] / df_train.groupby(['addr1'])['D15'].transform('std')
df_test['D15_to_mean_addr1'] = df_test['D15'] / df_test.groupby(['addr1'])['D15'].transform('mean')
df_test['D15_to_std_addr1'] = df_test['D15'] / df_test.groupby(['addr1'])['D15'].transform('std')

In [21]:
df_train['card1_count_full'] = df_train['card1'].map(pd.concat([df_train['card1'], df_test['card1']], ignore_index=True).value_counts(dropna=False))
df_test['card1_count_full'] = df_test['card1'].map(pd.concat([df_train['card1'], df_test['card1']], ignore_index=True).value_counts(dropna=False))


df_train['card2_count_full'] = df_train['card2'].map(pd.concat([df_train['card2'], df_test['card2']], ignore_index=True).value_counts(dropna=False))
df_test['card2_count_full'] = df_test['card2'].map(pd.concat([df_train['card2'], df_test['card2']], ignore_index=True).value_counts(dropna=False))

df_train['card3_count_full'] = df_train['card3'].map(pd.concat([df_train['card3'], df_test['card3']], ignore_index=True).value_counts(dropna=False))
df_test['card3_count_full'] = df_test['card3'].map(pd.concat([df_train['card3'], df_test['card3']], ignore_index=True).value_counts(dropna=False))

df_train['card4_count_full'] = df_train['card4'].map(pd.concat([df_train['card4'], df_test['card4']], ignore_index=True).value_counts(dropna=False))
df_test['card4_count_full'] = df_test['card4'].map(pd.concat([df_train['card4'], df_test['card4']], ignore_index=True).value_counts(dropna=False))

df_train['card5_count_full'] = df_train['card5'].map(pd.concat([df_train['card5'], df_test['card5']], ignore_index=True).value_counts(dropna=False))
df_test['card5_count_full'] = df_test['card5'].map(pd.concat([df_train['card5'], df_test['card5']], ignore_index=True).value_counts(dropna=False))

df_train['card6_count_full'] = df_train['card6'].map(pd.concat([df_train['card6'], df_test['card6']], ignore_index=True).value_counts(dropna=False))
df_test['card6_count_full'] = df_test['card6'].map(pd.concat([df_train['card6'], df_test['card6']], ignore_index=True).value_counts(dropna=False))


df_train['addr1_count_full'] = df_train['addr1'].map(pd.concat([df_train['addr1'], df_test['addr1']], ignore_index=True).value_counts(dropna=False))
df_test['addr1_count_full'] = df_test['addr1'].map(pd.concat([df_train['addr1'], df_test['addr1']], ignore_index=True).value_counts(dropna=False))

df_train['addr2_count_full'] = df_train['addr2'].map(pd.concat([df_train['addr2'], df_test['addr2']], ignore_index=True).value_counts(dropna=False))
df_test['addr2_count_full'] = df_test['addr2'].map(pd.concat([df_train['addr2'], df_test['addr2']], ignore_index=True).value_counts(dropna=False))


In [22]:
for feature in ['id_34', 'id_36']:
    # Count encoded for both df_train and df_test
    df_train[feature + '_count_full'] = df_train[feature].map(pd.concat([df_train[feature], df_test[feature]], ignore_index=True).value_counts(dropna=False))
    df_test[feature + '_count_full'] = df_test[feature].map(pd.concat([df_train[feature], df_test[feature]], ignore_index=True).value_counts(dropna=False))
        
for feature in ['id_01', 'id_31', 'id_33', 'id_35', 'id_36']:
    # Count encoded separately for df_train and df_test
    df_train[feature + '_count_dist'] = df_train[feature].map(df_train[feature].value_counts(dropna=False))
    df_test[feature + '_count_dist'] = df_test[feature].map(df_test[feature].value_counts(dropna=False))


In [23]:
df_train['P_isproton']=(df_train['P_emaildomain']=='protonmail.com')
df_train['R_isproton']=(df_train['R_emaildomain']=='protonmail.com')
df_test['P_isproton']=(df_test['P_emaildomain']=='protonmail.com')
df_test['R_isproton']=(df_test['R_emaildomain']=='protonmail.com')


In [24]:
a = np.zeros(df_train.shape[0])
df_train["lasdf_test_browser"] = a
a = np.zeros(df_test.shape[0])
df_test["lasdf_test_browser"] = a
def setbrowser(df):
    df.loc[df["id_31"]=="samsung browser 7.0",'lasdf_test_browser']=1
    df.loc[df["id_31"]=="opera 53.0",'lasdf_test_browser']=1
    df.loc[df["id_31"]=="mobile safari 10.0",'lasdf_test_browser']=1
    df.loc[df["id_31"]=="google search application 49.0",'lasdf_test_browser']=1
    df.loc[df["id_31"]=="firefox 60.0",'lasdf_test_browser']=1
    df.loc[df["id_31"]=="edge 17.0",'lasdf_test_browser']=1
    df.loc[df["id_31"]=="chrome 69.0",'lasdf_test_browser']=1
    df.loc[df["id_31"]=="chrome 67.0 for android",'lasdf_test_browser']=1
    df.loc[df["id_31"]=="chrome 63.0 for android",'lasdf_test_browser']=1
    df.loc[df["id_31"]=="chrome 63.0 for ios",'lasdf_test_browser']=1
    df.loc[df["id_31"]=="chrome 64.0",'lasdf_test_browser']=1
    df.loc[df["id_31"]=="chrome 64.0 for android",'lasdf_test_browser']=1
    df.loc[df["id_31"]=="chrome 64.0 for ios",'lasdf_test_browser']=1
    df.loc[df["id_31"]=="chrome 65.0",'lasdf_test_browser']=1
    df.loc[df["id_31"]=="chrome 65.0 for android",'lasdf_test_browser']=1
    df.loc[df["id_31"]=="chrome 65.0 for ios",'lasdf_test_browser']=1
    df.loc[df["id_31"]=="chrome 66.0",'lasdf_test_browser']=1
    df.loc[df["id_31"]=="chrome 66.0 for android",'lasdf_test_browser']=1
    df.loc[df["id_31"]=="chrome 66.0 for ios",'lasdf_test_browser']=1
    return df
df_train=setbrowser(df_train)
df_test=setbrowser(df_test)


In [25]:
df_train['TransactionAmt_decimal'] = ((df_train['TransactionAmt'] - df_train['TransactionAmt'].astype(int)) * 1000).astype(int)
df_test['TransactionAmt_decimal'] = ((df_test['TransactionAmt'] - df_test['TransactionAmt'].astype(int)) * 1000).astype(int)


In [26]:
########################### Freq encoding
i_cols = ['card1','card2','card3','card5',
          'C1','C2','C3','C4','C5','C6','C7','C8','C9','C10','C11','C12','C13','C14',
          'D1','D2','D3','D4','D5','D6','D7','D8','D9',
          'addr1','addr2',
          'dist1','dist2',
          'P_emaildomain', 'R_emaildomain',
          'id_01','id_02','id_03','id_04','id_05','id_06','id_07','id_08','id_09','id_10',
          'id_11','id_13','id_14','id_17','id_18','id_19','id_20','id_21','id_22','id_24',
          'id_25','id_26','id_30','id_31','id_32','id_33',#'id_33_0','id_33_1',
          'DeviceInfo','DeviceInfo_c',#'id_30_c','id_30_v','id_31_v',
         ]

for col in i_cols:
    temp_df = pd.concat([df_train[[col]], df_test[[col]]])
    fq_encode = temp_df[col].value_counts().to_dict()   
    df_train[col+'_fq_enc'] = df_train[col].map(fq_encode)
    df_test[col+'_fq_enc']  = df_test[col].map(fq_encode)


In [27]:
one_value_cols = [col for col in df_train.columns if df_train[col].nunique() <= 1]
one_value_cols_test = [col for col in df_test.columns if df_test[col].nunique() <= 1]
one_value_cols == one_value_cols_test



False

In [28]:
many_null_cols = [col for col in df_train.columns if df_train[col].isnull().sum() / df_train.shape[0] > 0.9]
many_null_cols_test = [col for col in df_test.columns if df_test[col].isnull().sum() / df_test.shape[0] > 0.9]

In [29]:
big_top_value_cols = [col for col in df_train.columns if df_train[col].value_counts(dropna=False, normalize=True).values[0] > 0.9]
big_top_value_cols_test = [col for col in df_test.columns if df_test[col].value_counts(dropna=False, normalize=True).values[0] > 0.9]

In [30]:
cols_to_drop = list(set(many_null_cols + many_null_cols_test +
                        big_top_value_cols +
                        big_top_value_cols_test +
                        one_value_cols+ one_value_cols_test))
len(cols_to_drop)
print(len(cols_to_drop),cols_to_drop)

112 ['lasdf_test_browser', 'C5_valid', 'V24', 'V321', 'id_24_fq_enc', 'V25', 'V298', 'id_22', 'C7_valid', 'V68', 'V305', 'C13_valid', 'V88', 'V123', 'V119', 'V124', 'V281', 'V284', 'id_25', 'V121', 'V55', 'V318', 'id_26', 'V98', 'V296', 'id_23', 'C3', 'V112', 'V113', 'C11_valid', 'V122', 'V135', 'V86', 'V300', 'V110', 'V109', 'C2_valid', 'V102', 'V27', 'id_18', 'V301', 'dist2_fq_enc', 'V89', 'V132', 'V115', 'V137', 'V136', 'D7', 'V319', 'C8_valid', 'id_21_fq_enc', 'id_27', 'V67', 'C1_valid', 'id_21', 'id_26_fq_enc', 'C3_fq_enc', 'V299', 'V28', 'V118', 'V316', 'V23', 'V111', 'V286', 'C12_valid', 'V117', 'M_sum', 'C6_valid', 'V114', 'C14_valid', 'id_18_fq_enc', 'V125', 'isFraud', 'D7_fq_enc', 'V116', 'V65', 'V311', 'id_08_fq_enc', 'V295', 'V309', 'V297', 'V103', 'V133', 'id_22_fq_enc', 'V26', 'V320', 'V120', 'id_07_fq_enc', 'V108', 'C4_valid', 'V105', 'C3_valid', 'C9_valid', 'V14', 'V101', 'id_07', 'V293', 'id_08', 'V134', 'V77', 'V107', 'V129', 'V66', 'id_25_fq_enc', 'V290', 'id_24', 'R

In [31]:
cols_to_drop.remove('isFraud')

df_train = df_train.drop(cols_to_drop, axis=1)
df_test = df_test.drop(cols_to_drop, axis=1)

In [32]:
# categorical features encoding
from tqdm import tqdm_notebook
# for col in cat_cols:
#     if col in df_train.columns:
#         le = preprocessing.LabelEncoder()
#         le.fit(list(df_train[col].astype(str).values) + list(df_test[col].astype(str).values))
#         df_train[col] = le.transform(list(df_train[col].astype(str).values))
#         df_test[col] = le.transform(list(df_test[col].astype(str).values))   

for col in tqdm_notebook(df_train.columns):
    if df_train[col].dtype == 'object':
        le = preprocessing.LabelEncoder()
        le.fit(list(df_train[col].astype(str).values) + list(df_test[col].astype(str).values))
        df_train[col] = le.transform(list(df_train[col].astype(str).values))
        df_test[col] = le.transform(list(df_test[col].astype(str).values))

HBox(children=(IntProgress(value=0, max=452), HTML(value='')))




In [33]:
X_train = df_train.sort_values('TransactionDT').drop(['isFraud', 'TransactionDT', 'Date'], axis=1)
Y_train = df_train.sort_values('TransactionDT')['isFraud']
X_test = df_test.sort_values('TransactionDT').drop(['TransactionDT','Date'], axis=1)
del df_train
df_test = df_test[["TransactionDT"]]



In [34]:
X_train = reduce_mem_usage(X_train)
X_test = reduce_mem_usage(X_test)


gc.collect()

Mem. usage decreased to 604.30 Mb (35.2% reduction)
Mem. usage decreased to 529.12 Mb (34.5% reduction)


21

In [35]:
# Hyperparameter 찾기

from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import TimeSeriesSplit

def objective(params):

    print("############## New Run ################")
    print("PARAMETERS: ")
    print(f"params  = {params}")
    
    params = {
        'max_depth': int(params['max_depth']),
        'gamma': "{:.3f}".format(params['gamma']),
        'reg_alpha': "{:.3f}".format(params['reg_alpha']),
        'learning_rate': "{:.3f}".format(params['learning_rate']),
        'gamma': "{:.3f}".format(params['gamma']),
        'num_leaves': '{:.3f}'.format(params['num_leaves']),
        'min_child_samples': '{:.3f}'.format(params['min_child_samples']),
        'feature_fraction': '{:.3f}'.format(params['feature_fraction']),
        'bagging_fraction': '{:.3f}'.format(params['bagging_fraction'])
    }
    
    EPOCHS = 5
    tss = TimeSeriesSplit(n_splits=EPOCHS)
    score_mean = 0
    print("CV SCORE: ")
    for tr_idx, val_idx in tss.split(X_train, Y_train):
        clf = xgb.XGBClassifier(
            n_estimators=2000, random_state=4,
            tree_method='gpu_hist', verbosity=2,
            **params
        )

        X_tr, X_vl = X_train.iloc[tr_idx, :], X_train.iloc[val_idx, :]
        y_tr, y_vl = Y_train.iloc[tr_idx], Y_train.iloc[val_idx]
        eval_set = [(X_vl, y_vl)]
        clf.fit(X_tr, y_tr,eval_metric = 'auc', eval_set=eval_set ,early_stopping_rounds=50, verbose=False)
        
        y_pred_train = clf.predict_proba(X_vl)[:,1]
        score = roc_auc_score(y_vl, y_pred_train)
        score_mean += score
        del clf
        gc.collect()
        print(f'ROC AUC {score}')
    del X_tr, X_vl, y_tr, y_vl, clf, y_pred_train    
    gc.collect()
    print(f'Mean ROC_AUC: {score_mean / EPOCHS} \n')
    return -(score_mean / EPOCHS)

space = {
    'max_depth': hp.quniform('max_depth', 7, 24, 1),
    'reg_alpha':  hp.uniform('reg_alpha', 0.01, 0.9),
    'reg_lambda': hp.uniform('reg_lambda', 0.1, 1.0),
    'learning_rate': hp.uniform('learning_rate', 0.001, 0.01),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
    'gamma': hp.uniform('gamma', 0.0, 0.8),
    'num_leaves': hp.choice('num_leaves', list(range(20, 300, 20))),       
    'min_child_samples': hp.choice('min_child_samples', list(range(10, 80, 3))),
    'feature_fraction': hp.choice('feature_fraction', [.5, .6, .7, .8, .9]),
    'bagging_fraction': hp.choice('bagging_fraction', [.5, .6, .7, .8, .9])
}


In [36]:
# Set algoritm parameters
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=40)

# Print best parameters
# best_params = space_eval(space, best)

print("BEST PARAMS: ", best)

best['max_depth'] = int(best['max_depth'])

############## New Run ################
PARAMETERS: 
params  = {'bagging_fraction': 0.9, 'colsample_bytree': 0.4378309249475988, 'feature_fraction': 0.8, 'gamma': 0.5942504733210476, 'learning_rate': 0.009489092224723203, 'max_depth': 22.0, 'min_child_samples': 31, 'num_leaves': 80, 'reg_alpha': 0.26381255542215887, 'reg_lambda': 0.3816239750315529}
CV SCORE: 
ROC AUC 0.9006447594815646
ROC AUC 0.914669305090494
ROC AUC 0.9101905792356325
ROC AUC 0.9324018824229795
ROC AUC 0.9318381766839439
  0%|          | 0/40 [33:45<?, ?it/s, best loss: ?]


UnboundLocalError: local variable 'clf' referenced before assignment

In [37]:
# Fit
clf = xgb.XGBClassifier(
    n_estimators=2000,
    
    **best,early_stopping_rounds=50,
    tree_method='gpu_hist'
)
#num_boost_round
clf.fit(X_train, y_train,eval_metric = 'auc',early_stopping_rounds=50)

y_preds = clf.predict_proba(X_test)[:,1] 

NameError: name 'best' is not defined

In [38]:
results = clf.evals_result()
epochs = len(results['validation_0']['error'])
x_axis = range(0, epochs)
# plot log loss
fig, ax = pyplot.subplots()
ax.plot(x_axis, results['validation_0']['logloss'], label='Train')
ax.plot(x_axis, results['validation_1']['logloss'], label='Test')
ax.legend()
pyplot.ylabel('Log Loss')
pyplot.title('XGBoost Log Loss')
pyplot.show()
# plot classification error
fig, ax = pyplot.subplots()
ax.plot(x_axis, results['validation_0']['error'], label='Train')
ax.plot(x_axis, results['validation_1']['error'], label='Test')
ax.legend()
pyplot.ylabel('Classification Error')
pyplot.title('XGBoost Classification Error')
pyplot.show()

NameError: name 'clf' is not defined

In [39]:
# Feature importance
feature_important = clf.get_booster().get_score(importance_type="weight")
keys = list(feature_important.keys())
values = list(feature_important.values())

data = pd.DataFrame(data=values, index=keys, columns=["score"]).sort_values(by = "score", ascending=False)

# Top 10 features
data.head(10)

NameError: name 'clf' is not defined

In [40]:

sample_submission['isFraud'] = y_preds
# sample_submission.to_csv('XGB_hypopt_model.csv')
sample_submission.to_csv('submission.csv')

NameError: name 'y_preds' is not defined