# 1.	Objective and Background

In this kernel, I work with the IEEE-CIS Fraud Detection competition.

IEEE-CIS works across a variety of AI and machine learning areas, including deep neural networks, fuzzy systems, evolutionary computation, and swarm intelligence. Today they’re partnering with the world’s leading payment service company, Vesta Corporation, seeking the best solutions for the fraud prevention industry.

Specifically, **the objective of this kernel is to predict the probability that an online transaction is fraudulent**.


*Acknowledgements to inspiring codebooks:*
1. https://www.kaggle.com/artgor/eda-and-models/notebook
2. https://www.kaggle.com/kabure/extensive-eda-and-modeling-xgb-hyperopt/notebook
3. https://www.kaggle.com/cdeotte/xgb-fraud-with-magic-0-9600

# 2.   Roadmap

a).	Preparation: Import Libraries

b).	Preparation: Define Functions used in this kernel

c).	Data Loading and Overview

d).	Exploratory Data Analysis

e).	Feature Engineering

f).	Feature Selection

g).	XGBoost Modelling

h).	Make and Submit the Prediction

## Import Libraries

In [None]:
import numpy as np
import pandas as pd
import os

import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm_notebook
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVR, SVR
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import roc_auc_score
pd.options.display.precision = 15


import xgboost as xgb
import time
import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold, GroupKFold, GridSearchCV, train_test_split, TimeSeriesSplit
from sklearn import metrics
from sklearn import linear_model
import gc
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

import eli5
import shap
from IPython.display import HTML

## Functions used in this kernel

In [None]:
import os
import time
import datetime
import json
import gc
from numba import jit

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm_notebook

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn import metrics

from itertools import product

import altair as alt
from altair.vega import v5
from IPython.display import HTML

# using ideas from this kernel: https://www.kaggle.com/notslush/altair-visualization-2018-stackoverflow-survey
def prepare_altair():
    """
    Build the requirejs configuration snippet used to render altair charts.

    Returns the JavaScript source (a string) of a ``requirejs.config`` call
    whose ``paths`` entry maps the vega, vega-lib, vega-lite and vega-embed
    modules to jsdelivr CDN URLs. The vega and vega-lite URLs carry the
    schema versions reported by the imported altair package.
    """
    # Appended to every URL (presumably so requirejs does not add ".js" itself - TODO confirm).
    suffix = "?noext"
    cdn_urls = {
        'vega': 'https://cdn.jsdelivr.net/npm/vega@' + v5.SCHEMA_VERSION,
        'vega-lib': 'https://cdn.jsdelivr.net/npm/vega-lib',
        'vega-lite': 'https://cdn.jsdelivr.net/npm/vega-lite@' + alt.SCHEMA_VERSION,
        'vega-embed': 'https://cdn.jsdelivr.net/npm/vega-embed@3',
    }
    paths = {module: url + suffix for module, url in cdn_urls.items()}

    # The dict is interpolated through its Python repr (single-quoted keys and values).
    return f"""    requirejs.config({{
        baseUrl: 'https://cdn.jsdelivr.net/npm/',
        paths: {paths}
    }});
    """
    

def add_autoincrement(render_func):
    """
    Decorate ``render_func`` so that repeated renders get unique <div/> IDs.

    The returned function accepts ``(chart, id="vega-chart", autoincrement=True)``.
    With ``autoincrement`` on, the first use of an id is passed through
    unchanged and every later use gets a ``-1``, ``-2``, ... suffix. With it
    off, the id is passed through as given (and merely registered as seen).
    """
    # Per-decorated-function counter of how often each base id has been handed out;
    # it lives in this closure, so it persists across calls to the wrapper.
    seen = {}

    def wrapped(chart, id="vega-chart", autoincrement=True):
        if not autoincrement:
            seen.setdefault(id, 0)
            return render_func(chart, id=id)

        seen[id] = seen[id] + 1 if id in seen else 0
        count = seen[id]
        unique_id = id if count == 0 else id + '-' + str(count)
        return render_func(chart, id=unique_id)

    return wrapped
           

@add_autoincrement
def render(chart, id="vega-chart"):
    """
    Helper function to display an altair visualization through vega-embed.

    ``chart`` is either a dict (serialised with ``json.dumps``) or an object
    exposing ``to_json`` (called with ``indent=None``). Returns an IPython
    ``HTML`` object holding a ``<div>`` with the given ``id`` and the script
    that embeds the chart spec into it.
    """
    if isinstance(chart, dict):
        spec = json.dumps(chart)
    else:
        spec = chart.to_json(indent=None)

    # Doubled braces are literal JavaScript braces; {id} and {chart} are filled by str.format.
    template = """
    <div id="{id}"></div><script>
    require(["vega-embed"], function(vg_embed) {{
        const spec = {chart};     
        vg_embed("#{id}", spec, {{defaultStyle: true}}).catch(console.warn);
        console.log("anything?");
    }});
    console.log("really...anything?");
    </script>
    """
    return HTML(template.format(id=id, chart=spec))
    
## Function to reduce the DF size
def reduce_mem_usage(df, verbose=True):
    """
    Downcast the numeric columns of ``df`` in place to the smallest fitting dtype.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64 are
    inspected; every other column is left untouched. Integer columns become the
    narrowest of int8/int16/int32/int64 whose range contains the column's
    min and max (bounds inclusive); float columns become float16, float32 or
    float64 by the same rule.

    NOTE: the float choice looks only at the value range, not at precision, so
    values may be rounded when a column is stored as float16/float32.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; it is modified in place.
    verbose : bool
        When true, print the final size in Mb and the percentage reduction.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_types = (np.int8, np.int16, np.int32, np.int64)
    float_types = (np.float16, np.float32)
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a column that reaches exactly the dtype limit
            # (e.g. 127 or -128) still fits that dtype.
            for int_type in int_types:
                limits = np.iinfo(int_type)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(int_type)
                    break
        else:
            for float_type in float_types:
                limits = np.finfo(float_type)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(float_type)
                    break
            else:
                # Nothing narrower fits (also the case when min/max are NaN).
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Avoid a division by zero when the frame reports no memory at all.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df


# setting up altair: wrap the requirejs configuration in a <script> tag so
# that displaying the HTML object runs it in the notebook front end
workaround = prepare_altair()
HTML("<script>" + workaround + "</script>")

# 3. Data loading and overview

Data is separated into two datasets: information about the **customer identity** and **transaction**, joined by TransactionID. Not all transactions have corresponding identity information.

- **Numerical Features - Transaction**
    - TransactionDT: timedelta from a given reference datetime (not an actual timestamp)
    - TransactionAmt: transaction payment amount in USD
    - dist: distance
    - C1-C14: counting, such as how many addresses are found to be associated with the payment card, etc. The actual meaning is masked.
    - D1-D15: timedelta, such as days between previous transaction, etc.
    - Vxxx: Vesta engineered rich features, including ranking, counting, and other entity relations.


- **Categorical Features - Transaction**
    - ProductCD: product code, the product for each transaction
    - card1 - card6: payment card information, such as card type, card category, issue bank, country, etc.
    - addr1, addr2: address
    - P_emaildomain: purchaser email domain
    - R_emaildomain: recipient email domain
    - M1 - M9: match, such as names on card and address, etc.



- **Explanation on Identity Data**
    - Variables in this table are identity information – network connection information (IP, ISP, Proxy, etc) and digital signature (UA/browser/os/version, etc) associated with transactions.
    - They're collected by Vesta’s fraud protection system and digital security partners.(The field names are masked and pairwise dictionary will not be provided for privacy protection and contract agreement)


- **Categorical Features - Identity**
    - DeviceType
    - DeviceInfo
    - id_12 - id_38

*More details about the data: 
https://www.kaggle.com/c/ieee-fraud-detection/discussion/101203*

We load all the data and then drop the 219 V columns that were determined to be redundant by the correlation analysis in https://www.kaggle.com/cdeotte/eda-for-columns-v-and-id.

In [None]:
%%time
# Directory that holds the competition CSV files.
folder_path = '../input/'

# Identity and transaction tables for both splits.
train_identity = pd.read_csv(f'{folder_path}train_identity.csv')
train_transaction = pd.read_csv(f'{folder_path}train_transaction.csv')
test_identity = pd.read_csv(f'{folder_path}test_identity.csv')
test_transaction = pd.read_csv(f'{folder_path}test_transaction.csv')

sub = pd.read_csv(f'{folder_path}sample_submission.csv')

# The "id" columns in the test dataset are named differently from the train dataset.
# Columns are paired by position, so this assumes both identity files list
# their columns in the same order - TODO confirm against the raw files.
fix = {o:n for o, n in zip(test_identity.columns, train_identity.columns)}
test_identity.rename(columns=fix, inplace=True)

# let's combine the data and work with the whole dataset
# Left joins keep every transaction, including those without an identity row.
train = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')
test = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')

# The separate tables are no longer needed once merged.
del train_identity, train_transaction, test_identity, test_transaction, fix

# Downcast numeric columns to reduce memory usage.
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

In [None]:
print(f'Train dataset has {train.shape[0]} rows and {train.shape[1]} columns.')
print(f'Test dataset has {test.shape[0]} rows and {test.shape[1]} columns.')

In [None]:
# V COLUMNS TO KEEP, DECIDED BY CORRELATION EDA
# https://www.kaggle.com/cdeotte/eda-for-columns-v-and-id
# `v` collects the numeric suffixes of the V columns that are kept;
# every other column from V1 to V339 is dropped from train and test below.
v =  [1, 3, 4, 6, 8, 11]
v += [13, 14, 17, 20, 23, 26, 27, 30]
v += [36, 37, 40, 41, 44, 47, 48]
v += [54, 56, 59, 62, 65, 67, 68, 70]
v += [76, 78, 80, 82, 86, 88, 89, 91]

#v += [96, 98, 99, 104] #relates to groups, no NAN 
v += [107, 108, 111, 115, 117, 120, 121, 123] # maybe group, no NAN
v += [124, 127, 129, 130, 136] # relates to groups, no NAN

# LOTS OF NAN BELOW
v += [138, 139, 142, 147, 156, 162] #b1
v += [165, 160, 166] #b1
v += [178, 176, 173, 182] #b2
v += [187, 203, 205, 207, 215] #b2
v += [169, 171, 175, 180, 185, 188, 198, 210, 209] #b2
v += [218, 223, 224, 226, 228, 229, 235] #b3
v += [240, 258, 257, 253, 252, 260, 261] #b3
v += [264, 266, 267, 274, 277] #b3
v += [220, 221, 234, 238, 250, 271] #b3

v += [294, 284, 285, 286, 291, 297] # relates to groups, no NAN
v += [303, 305, 307, 309, 310, 320] # relates to groups, no NAN
v += [281, 283, 289, 296, 301, 314] # relates to groups, no NAN
#v += [332, 325, 335, 338] # b4 lots NAN

# Column names to keep, the full V1..V339 range, and the difference to drop.
use_Vcols = ['V'+str(x) for x in v]
Vcols = ['V'+str(x) for x in range(1,340)]
drop_Vcols = list(set(Vcols) - set(use_Vcols))

train.drop(drop_Vcols, axis=1, inplace=True)
test.drop(drop_Vcols, axis=1, inplace=True)

In [None]:
print(f'Train dataset has {train.shape[0]} rows and {train.shape[1]} columns.')
print(f'Test dataset has {test.shape[0]} rows and {test.shape[1]} columns.')

So we have two medium-sized datasets with a lot of columns. Train and test data have a similar number of rows.

In [None]:
print(f'There are {train.isnull().any().sum()} columns in train dataset with missing values.')

In [None]:
one_value_cols = [col for col in train.columns if train[col].nunique() <= 1]
one_value_cols_test = [col for col in test.columns if test[col].nunique() <= 1]
one_value_cols == one_value_cols_test

In [None]:
print(f'There are {len(one_value_cols)} columns in train dataset with one unique value.')
print(f'There are {len(one_value_cols_test)} columns in test dataset with one unique value.')

In [None]:
test[one_value_cols_test].describe()

Most columns have missing data, which is normal in the real world. Also, there is one column in the test dataset with only one unique value.

There are a lot of continuous variables and some categorical. Let's have a closer look at them.

# 4. Exploratory Data Analysis

I will start EDA on identity data and transaction respectively. The aim is to answer some questions like:

- What types of data do we have in our dataset?
- How many columns, rows and missing values do we have?
- What's the target distribution?
- What's the distribution of transaction values for fraudulent and non-fraudulent transactions?
- Are some products predominantly associated with fraud?
- What features or target show some interesting patterns?

And many more questions that will arise throughout the exploration.

## 4.1 EDA on Identity Data

Let's start with identity information.
`id_01` - `id_11` are continuous variables, `id_12` - `id_38` are categorical, and the last two columns (`DeviceType` and `DeviceInfo`) are also categorical.

*Previously*:
- **Explanation on Identity Data**
    - Variables in this table are identity information – network connection information (IP, ISP, Proxy, etc) and digital signature (UA/browser/os/version, etc) associated with transactions.
    - They're collected by Vesta’s fraud protection system and digital security partners.(The field names are masked and pairwise dictionary will not be provided for privacy protection and contract agreement)


- **Categorical Features - Identity**
    - DeviceType
    - DeviceInfo
    - id_12 - id_38

### 4.1.1 EDA on Numerical Features (id_01 - id_11) - Identity

In [None]:
train[['id_01', 'id_02', 'id_03', 'id_04', 'id_05', 'id_06', 'id_07', 'id_08', 'id_09', 'id_10', 'id_11']].describe(include='all')

In [None]:
plt.hist(train['id_01'], bins=77);
plt.title('Distribution of id_01 variable');

`id_01` has an interesting distribution: it has 77 unique non-positive values with skewness to 0.

In [None]:
s1 = train['id_03'].value_counts(dropna=False, normalize=True).head()
s2 = train['id_04'].value_counts(dropna=False, normalize=True).head()
s3 = train['id_05'].value_counts(dropna=False, normalize=True).head()
s4 = train['id_06'].value_counts(dropna=False, normalize=True).head()
s5 = train['id_09'].value_counts(dropna=False, normalize=True).head()
s6 = train['id_10'].value_counts(dropna=False, normalize=True).head()
print(pd.concat([s1, s2, s3, s4, s5, s6], axis = 1))

`id_03`, `id_04`, `id_05`, `id_06`, `id_09` and `id_10` have over 76% of missing values and over 90% of values are either missing or equal to 0.
So maybe we will filter out these features in our feature selection part.

In [None]:
train['id_11'].value_counts(dropna=False, normalize=True).head()

22% of values in `id_11` are equal to 100 and 76% are missing. Quite strange.

In [None]:
plt.hist(train['id_07']);
plt.title('Distribution of id_07 variable');

In [None]:
plt.hist(train['id_02']);
plt.title('Distribution of id_02 variable');

In [None]:
plt.hist(train['id_08']);
plt.title('Distribution of id_08 variable');

Some features seem to be normalized, and some are not. So if someone wants to normalize all variables, it would be necessary to first separate out the variables that seem to be normalized already.

### 4.1.2 EDA on Categorical Features (id_12 - id_38; DeviceType; DeviceInfo) - Identity

In [None]:
train[['id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18',
       'id_19', 'id_20', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25',
       'id_26', 'id_27', 'id_28', 'id_29', 'id_30', 'id_31', 'id_32',
       'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo']].describe(include='all')

In [None]:
# Horizontal bar charts of the value counts (missing values included) of the
# low-cardinality identity features.
charts = {}
for i in ['id_12', 'id_15', 'id_16', 'id_28', 'id_29', 'id_30', 'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38']:
    # rename_axis + reset_index(name=...) yields the columns [i, 'count'] on
    # every pandas version. Renaming the 'index' column after a plain
    # reset_index() only works before pandas 2.0; since 2.0 value_counts()
    # already names its result 'count' and no 'index' column exists.
    feature_count = train[i].value_counts(dropna=False).rename_axis(i).reset_index(name='count')
    chart = alt.Chart(feature_count).mark_bar().encode(
                y=alt.Y(f"{i}:N", axis=alt.Axis(title=i)),
                x=alt.X('count:Q', axis=alt.Axis(title='Count')),
                tooltip=[i, 'count']
            ).properties(title=f"Counts of {i}", width=400)
    charts[i] = chart

# `|` places charts side by side, `&` stacks the rows vertically.
render((charts['id_12'] | charts['id_15'] | charts['id_16']) & (charts['id_28'] | charts['id_29'] | charts['id_32']) & (charts['id_34'] | charts['id_35'] | charts['id_36']) & (charts['id_37'] | charts['id_38']))

We have several features showing some kind of "found" status and several binary columns.

In [None]:
# Vertical bar charts of the 40 most frequent values (missing values included)
# of the higher-cardinality identity/device features.
charts = {}
for i in ['id_30', 'id_31', 'id_33', 'DeviceType', 'DeviceInfo']:
    # rename_axis + reset_index(name=...) yields the columns [i, 'count'] on
    # every pandas version. Renaming the 'index' column after a plain
    # reset_index() only works before pandas 2.0.
    feature_count = train[i].value_counts(dropna=False)[:40].rename_axis(i).reset_index(name='count')
    chart = alt.Chart(feature_count).mark_bar().encode(
                x=alt.X(f"{i}:N", axis=alt.Axis(title=i)),
                y=alt.Y('count:Q', axis=alt.Axis(title='Count')),
                tooltip=[i, 'count']
            ).properties(title=f"Counts of {i}", width=800)
    charts[i] = chart

# `&` stacks the charts vertically.
render(charts['id_30'] & charts['id_31'] & charts['id_33'] & charts['DeviceType'] & charts['DeviceInfo'])

Here we can see some information about client's device. It is important to be careful here - some of info could be for old devices and may be absent from test data.

## 4.2 EDA on Transaction Data
Now let's have a look at transaction data.

*Previously*:
- **Numerical Features - Transaction**
    - TransactionDT: timedelta from a given reference datetime (not an actual timestamp)
    - TransactionAmt: transaction payment amount in USD
    - dist: distance
    - C1-C14: counting, such as how many addresses are found to be associated with the payment card, etc. The actual meaning is masked.
    - D1-D15: timedelta, such as days between previous transaction, etc.
    - Vxxx: Vesta engineered rich features, including ranking, counting, and other entity relations.


- **Categorical Features - Transaction**
    - ProductCD: product code, the product for each transaction
    - card1 - card6: payment card information, such as card type, card category, issue bank, country, etc.
    - addr1, addr2: address
    - P_emaildomain: purchaser email domain
    - R_emaildomain: recipient email domain
    - M1 - M9: match, such as names on card and address, etc.

### 4.2.1 EDA on Numerical Features (TransactionDT; TransactionAmt; dist; C1-C14; D1-D15; Vxxx) - Transaction

In [None]:
plt.hist(train['TransactionDT'], label='train');
plt.hist(test['TransactionDT'], label='test');
plt.legend();
plt.title('Distribution of transactiond dates');

A very important idea: it seems that train and test transaction dates don't overlap, so it would be prudent to use a time-based split for validation.
This was already noted in another kernel: https://www.kaggle.com/robikscube/ieee-fraud-detection-first-look-and-eda

#### Normalize D Columns
The D columns are "time deltas" from some point in the past. We transform each D column into the point in the past that it refers to, which stops the D columns from increasing with time. The formula is D15n = Transaction_Day - D15, where Transaction_Day = TransactionDT/(24*60*60). Afterwards we multiply this number by negative one, so the stored value is D15 - Transaction_Day. (D1, D2, D3, D5 and D9 are left unchanged by the code below.)

In [None]:
# PLOT ORIGINAL D
plt.figure(figsize=(15,5))
plt.scatter(train.TransactionDT,train.D15)
plt.title('Original D15')
plt.xlabel('Time')
plt.ylabel('D15')
plt.show()

In [None]:
# NORMALIZE D COLUMNS
# D1, D2, D3, D5 and D9 are left untouched; every other D column (up to D15)
# has the transaction day, i.e. TransactionDT converted from seconds to days,
# subtracted from it.
for i in [n for n in range(1,16) if n not in (1,2,3,5,9)]:
    train[f'D{i}'] = train[f'D{i}'] - train.TransactionDT/np.float32(24*60*60)
    test[f'D{i}'] = test[f'D{i}'] - test.TransactionDT/np.float32(24*60*60)

In [None]:
# PLOT TRANSFORMED D
plt.figure(figsize=(15,5))
plt.scatter(train.TransactionDT,train.D15)
plt.title('Transformed D15')
plt.xlabel('Time')
plt.ylabel('D15n')
plt.show()

In [None]:
plt.hist(train['TransactionAmt'], label='train');
plt.hist(test['TransactionAmt'], label='test');
plt.legend();
plt.title('Distribution of transaction amount');

#### EDA for columns Vxxx
Please refer to https://www.kaggle.com/cdeotte/eda-for-columns-v-and-id.
And 219 V columns were determined redundant by correlation analysis and removed at the beginning of this kernel.

### 4.2.2 EDA on Categorical Features (ProductCD; card1-card6; addr1, addr2; P_emaildomain; R_emaildomain; M1-M9) - Transaction

In [None]:
# Horizontal bar charts of the value counts (missing values included) of the
# low-cardinality transaction features.
charts = {}
for i in ['ProductCD', 'card4', 'card6', 'M4', 'M1', 'M2', 'M3', 'M5', 'M6', 'M7', 'M8', 'M9']:
    # rename_axis + reset_index(name=...) yields the columns [i, 'count'] on
    # every pandas version. Renaming the 'index' column after a plain
    # reset_index() only works before pandas 2.0.
    feature_count = train[i].value_counts(dropna=False).rename_axis(i).reset_index(name='count')
    chart = alt.Chart(feature_count).mark_bar().encode(
                y=alt.Y(f"{i}:N", axis=alt.Axis(title=i)),
                x=alt.X('count:Q', axis=alt.Axis(title='Count')),
                tooltip=[i, 'count']
            ).properties(title=f"Counts of {i}", width=400)
    charts[i] = chart

# `|` places charts side by side, `&` stacks the rows vertically.
# Each pair is rendered once (the card6/M4 row used to be repeated).
render((charts['ProductCD'] | charts['card4']) & (charts['card6'] | charts['M4']) & (charts['M1'] | charts['M2']) & (charts['M3'] | charts['M5']) & (charts['M6'] | charts['M7']) & (charts['M8'] | charts['M9']))

So `card6` is type of card, `card4` is credit card company

In [None]:
# Vertical bar charts of the 40 most frequent values (missing values included)
# of the higher-cardinality transaction features.
charts = {}
for i in ['P_emaildomain', 'R_emaildomain', 'card1', 'card2', 'card3',  'card5', 'addr1', 'addr2']:
    # rename_axis + reset_index(name=...) yields the columns [i, 'count'] on
    # every pandas version. Renaming the 'index' column after a plain
    # reset_index() only works before pandas 2.0.
    feature_count = train[i].value_counts(dropna=False).rename_axis(i).reset_index(name='count')[:40]
    chart = alt.Chart(feature_count).mark_bar().encode(
                x=alt.X(f"{i}:N", axis=alt.Axis(title=i)),
                y=alt.Y('count:Q', axis=alt.Axis(title='Count')),
                tooltip=[i, 'count']
            ).properties(title=f"Counts of {i}", width=600)
    charts[i] = chart

# `|` places charts side by side, `&` stacks the rows vertically.
render((charts['P_emaildomain'] | charts['R_emaildomain']) & (charts['card1'] | charts['card2']) & (charts['card3'] | charts['card5']) & (charts['addr1'] | charts['addr2']))

# 5. Feature engineering

## Mapping Email

In [None]:
emails = {'gmail': 'google', 'att.net': 'att', 'twc.com': 'spectrum', 
          'scranton.edu': 'other', 'optonline.net': 'other', 'hotmail.co.uk': 'microsoft',
          'comcast.net': 'other', 'yahoo.com.mx': 'yahoo', 'yahoo.fr': 'yahoo',
          'yahoo.es': 'yahoo', 'charter.net': 'spectrum', 'live.com': 'microsoft', 
          'aim.com': 'aol', 'hotmail.de': 'microsoft', 'centurylink.net': 'centurylink',
          'gmail.com': 'google', 'me.com': 'apple', 'earthlink.net': 'other', 'gmx.de': 'other',
          'web.de': 'other', 'cfl.rr.com': 'other', 'hotmail.com': 'microsoft', 
          'protonmail.com': 'other', 'hotmail.fr': 'microsoft', 'windstream.net': 'other', 
          'outlook.es': 'microsoft', 'yahoo.co.jp': 'yahoo', 'yahoo.de': 'yahoo',
          'servicios-ta.com': 'other', 'netzero.net': 'other', 'suddenlink.net': 'other',
          'roadrunner.com': 'other', 'sc.rr.com': 'other', 'live.fr': 'microsoft',
          'verizon.net': 'yahoo', 'msn.com': 'microsoft', 'q.com': 'centurylink', 
          'prodigy.net.mx': 'att', 'frontier.com': 'yahoo', 'anonymous.com': 'other', 
          'rocketmail.com': 'yahoo', 'sbcglobal.net': 'att', 'frontiernet.net': 'yahoo', 
          'ymail.com': 'yahoo', 'outlook.com': 'microsoft', 'mail.com': 'other', 
          'bellsouth.net': 'other', 'embarqmail.com': 'centurylink', 'cableone.net': 'other', 
          'hotmail.es': 'microsoft', 'mac.com': 'apple', 'yahoo.co.uk': 'yahoo', 'netzero.com': 'other', 
          'yahoo.com': 'yahoo', 'live.com.mx': 'microsoft', 'ptd.net': 'other', 'cox.net': 'other',
          'aol.com': 'aol', 'juno.com': 'other', 'icloud.com': 'apple'}

# Suffixes that are collapsed into the single label 'us'.
us_emails = ['gmail', 'net', 'edu']

# https://www.kaggle.com/c/ieee-fraud-detection/discussion/100499#latest-579654
for c in ['P_emaildomain', 'R_emaildomain']:
    for _frame in (train, test):
        # Provider bucket looked up in `emails` (domains missing from the dict become NaN).
        _frame[c + '_bin'] = _frame[c].map(emails)

        # Last dot-separated piece of the domain; missing domains turn into the string 'nan'.
        _last_part = _frame[c].map(lambda x: str(x).split('.')[-1])
        _frame[c + '_suffix'] = _last_part.map(lambda x: x if str(x) not in us_emails else 'us')

## Encoding Functions
Below are 5 encoding functions. (1) encode_FE does frequency encoding: it combines train and test first and then encodes. (2) encode_LE label encodes a categorical feature. (3) encode_AG makes aggregated features such as the group mean and std. (4) encode_CB combines two columns. (5) encode_AG2 makes aggregated features that count how many unique values of one feature occur within a group.

In [None]:
# https://www.kaggle.com/cdeotte/xgb-fraud-with-magic-0-9600/notebook#Load-Data
# FREQUENCY ENCODE TOGETHER
def encode_FE(df1, df2, cols):
    """
    Frequency encode ``cols`` using the value frequencies of both frames together.

    For each column, the relative frequency of every non-missing value is
    computed over the concatenation of ``df1`` and ``df2`` and written to a
    new float32 column ``<col>_FE`` in both frames (in place). The value -1
    always maps to -1; values without a frequency (e.g. NaN) map to NaN.
    Prints each new column name; returns None.
    """
    for col in cols:
        combined = pd.concat([df1[col], df2[col]])
        freq = combined.value_counts(dropna=True, normalize=True).to_dict()
        freq[-1] = -1  # keep -1 as a marker instead of giving it a frequency
        new_name = col + '_FE'
        for frame in (df1, df2):
            frame[new_name] = frame[col].map(freq).astype('float32')
        print(new_name, ', ', end='')
        
# LABEL ENCODE
def encode_LE(col,train=None,test=None,verbose=True):
    """
    Label encode ``col`` consistently across the train and test frames, in place.

    The column of both frames is concatenated and factorized with sorted
    categories, so equal values get equal integer codes in both frames
    (missing values get -1). The codes overwrite ``col`` as int16, or int32
    when the largest code exceeds 32000.

    Parameters
    ----------
    col : str
        Name of the column to encode; must exist in both frames.
    train, test : pandas.DataFrame, optional
        Frames to modify. When omitted, the module-level ``train`` / ``test``
        are looked up at call time.
    verbose : bool
        When true, print the column name.
    """
    # Resolve defaults at call time: the notebook rebinds `train`/`test`
    # (e.g. `train = train.drop(...)`), and a default bound at definition time
    # would keep pointing at the old, discarded frames.
    if train is None: train = globals()['train']
    if test is None: test = globals()['test']
    n_train = len(train)
    codes, _ = pd.concat([train[col], test[col]], axis=0).factorize(sort=True)
    code_dtype = 'int32' if codes.max() > 32000 else 'int16'
    train[col] = codes[:n_train].astype(code_dtype)
    test[col] = codes[n_train:].astype(code_dtype)
    del codes; gc.collect()
    if verbose: print(col,', ',end='')
        
# GROUP AGGREGATION MEAN AND STD
# https://www.kaggle.com/kyakovlev/ieee-fe-with-some-eda
def encode_AG(main_columns, uids, aggregations=['mean'], train_df=None, test_df=None, 
              fillna=True, usena=False):
    """
    Add group-aggregation features computed over train and test together.

    For every ``main_column`` / ``uid`` / aggregation combination, the
    aggregation of ``main_column`` within each ``uid`` group (over the
    concatenation of both frames) is written to a new float32 column named
    ``<main_column>_<uid>_<agg>`` in both frames, in place.

    Parameters
    ----------
    main_columns : list of str
        Columns whose values are aggregated.
    uids : list of str
        Columns that define the groups.
    aggregations : list of str
        Aggregation names understood by pandas ``agg`` (e.g. 'mean', 'std').
    train_df, test_df : pandas.DataFrame, optional
        Frames to modify. When omitted, the module-level ``train`` / ``test``
        are looked up at call time.
    fillna : bool
        When true, missing values in the new columns are replaced by -1.
    usena : bool
        When true, values equal to -1 in ``main_column`` are treated as
        missing before aggregating.
    """
    # Resolve defaults at call time: the notebook rebinds `train`/`test`, and
    # a default bound at definition time would keep pointing at the old frames.
    if train_df is None: train_df = globals()['train']
    if test_df is None: test_df = globals()['test']
    # AGGREGATION OF MAIN WITH UID FOR GIVEN STATISTICS
    for main_column in main_columns:  
        for col in uids:
            # The combined frame does not depend on the aggregation, so it is built once per pair.
            temp_df = pd.concat([train_df[[col, main_column]], test_df[[col,main_column]]])
            if usena: temp_df.loc[temp_df[main_column]==-1,main_column] = np.nan
            grouped = temp_df.groupby(col)[main_column]
            for agg_type in aggregations:
                new_col_name = main_column+'_'+col+'_'+agg_type
                mapping = grouped.agg(agg_type).to_dict()

                train_df[new_col_name] = train_df[col].map(mapping).astype('float32')
                test_df[new_col_name]  = test_df[col].map(mapping).astype('float32')
                
                if fillna:
                    # Assign the result back: an in-place fillna on a selected
                    # column is chained assignment and may not reach the frame.
                    train_df[new_col_name] = train_df[new_col_name].fillna(-1)
                    test_df[new_col_name] = test_df[new_col_name].fillna(-1)
                
                print("'"+new_col_name+"'",', ',end='')
                
# COMBINE FEATURES
def encode_CB(col1,col2,df1=None,df2=None):
    """
    Combine two columns into one label-encoded feature, in place.

    A new column ``<col1>_<col2>`` is created in both frames by joining the
    string forms of the two columns with '_', and is then label encoded with
    ``encode_LE`` so that both frames share the same codes.

    Parameters
    ----------
    col1, col2 : str
        Names of the columns to combine.
    df1, df2 : pandas.DataFrame, optional
        Frames to modify. When omitted, the module-level ``train`` / ``test``
        are looked up at call time.
    """
    # Resolve defaults at call time: the notebook rebinds `train`/`test`, and
    # a default bound at definition time would keep pointing at the old frames.
    if df1 is None: df1 = globals()['train']
    if df2 is None: df2 = globals()['test']
    nm = col1+'_'+col2
    df1[nm] = df1[col1].astype(str)+'_'+df1[col2].astype(str)
    df2[nm] = df2[col1].astype(str)+'_'+df2[col2].astype(str) 
    # Encode the frames that were just modified, not encode_LE's own defaults.
    encode_LE(nm,df1,df2,verbose=False)
    print(nm,', ',end='')
    
# GROUP AGGREGATION NUNIQUE
def encode_AG2(main_columns, uids, train_df=None, test_df=None):
    """
    Add features counting the distinct values of one column within a group.

    For every ``main_column`` / ``uid`` pair, the number of unique values of
    ``main_column`` inside each ``uid`` group (over the concatenation of both
    frames) is written to a new float32 column ``<uid>_<main_column>_ct`` in
    both frames, in place.

    Parameters
    ----------
    main_columns : list of str
        Columns whose distinct values are counted.
    uids : list of str
        Columns that define the groups.
    train_df, test_df : pandas.DataFrame, optional
        Frames to modify. When omitted, the module-level ``train`` / ``test``
        are looked up at call time.
    """
    # Resolve defaults at call time: the notebook rebinds `train`/`test`, and
    # a default bound at definition time would keep pointing at the old frames.
    if train_df is None: train_df = globals()['train']
    if test_df is None: test_df = globals()['test']
    for main_column in main_columns:  
        for col in uids:
            new_col_name = col+'_'+main_column+'_ct'
            comb = pd.concat([train_df[[col, main_column]], test_df[[col, main_column]]], axis=0)
            mp = comb.groupby(col)[main_column].nunique().to_dict()
            train_df[new_col_name] = train_df[col].map(mp).astype('float32')
            test_df[new_col_name] = test_df[col].map(mp).astype('float32')
            print(new_col_name+', ',end='')

## 5.1 Feature engineering on Numerical Variables
Let's do some aggregation on top features found in EDA.

In [None]:
# Centre TransactionAmt on the mean of its own frame, then scale the centred
# value by that frame's standard deviation.
for _frame in (train, test):
    _amount = _frame['TransactionAmt']
    _frame['Trans_min_mean'] = _amount - _amount.mean()
    _frame['Trans_min_std'] = _frame['Trans_min_mean'] / _amount.std()

In [None]:
# Natural logarithm of the transaction amount, added to both frames.
for _frame in (train, test):
    _frame['TransactionAmt_log'] = np.log(_frame['TransactionAmt'])

In [None]:
# Ratio features: each value divided by the mean / std of its group, with the
# group statistics computed separately inside train and inside test.
# Each entry is (value column, grouping columns). The equivalent id_02 ratios
# by card1/card4 were disabled (commented out) and are still not generated.
_ratio_specs = [
    ('TransactionAmt', ['card1', 'card4']),
    ('D15', ['card1', 'card4']),
    ('D15', ['addr1', 'addr2']),
]

for _value_col, _group_cols in _ratio_specs:
    for _frame in (train, test):
        # Mean ratios first, then std ratios, so the new columns keep their established order.
        for _stat in ('mean', 'std'):
            for _group_col in _group_cols:
                _group_stat = _frame.groupby([_group_col])[_value_col].transform(_stat)
                _frame[f'{_value_col}_to_{_stat}_{_group_col}'] = _frame[_value_col] / _group_stat

In [None]:
# Split each e-mail domain on '.' into three columns <domain>_1, <domain>_2, <domain>_3.
# The assignment expects the split to produce exactly three columns.
for _frame in (train, test):
    for _domain_col in ('P_emaildomain', 'R_emaildomain'):
        _part_cols = [f'{_domain_col}_{n}' for n in (1, 2, 3)]
        _frame[_part_cols] = _frame[_domain_col].str.split('.', expand=True)

All of the following features were chosen because each one increases local validation AUC. The procedure for engineering features is as follows. First you think of an idea and create a new feature. Then you add it to your model and evaluate whether local validation AUC increases or decreases. If AUC increases, keep the feature; otherwise discard it.

In [None]:
# https://www.kaggle.com/cdeotte/xgb-fraud-with-magic-0-9600/notebook#Load-Data

%time
# TRANSACTION AMT CENTS
train['cents'] = (train['TransactionAmt'] - np.floor(train['TransactionAmt'])).astype('float32')
test['cents'] = (test['TransactionAmt'] - np.floor(test['TransactionAmt'])).astype('float32')
print('cents, ', end='')
# FREQUENCY ENCODE: ADDR1, CARD1, CARD2, CARD3, P_EMAILDOMAIN
encode_FE(train,test,['addr1','card1','card2','card3','P_emaildomain'])
# COMBINE COLUMNS CARD1+ADDR1, CARD1+ADDR1+P_EMAILDOMAIN
encode_CB('card1','addr1')
encode_CB('card1_addr1','P_emaildomain')
# FREQUENCY ENOCDE
encode_FE(train,test,['card1_addr1','card1_addr1_P_emaildomain'])
# GROUP AGGREGATE
encode_AG(['TransactionAmt','D9','D11'],['card1','card1_addr1','card1_addr1_P_emaildomain'],['mean','std'],usena=True)

## 5.2 Feature engineering on Categorical Variables Based on Cardinality
Let's encode the categorical variables based on cardinality!
- Low Cardinality features: Label Encoding
- High Cardinality features: Target Encoding / Drop


### Prepare the Data for further Feature Engineering


In [None]:
# Columns in which more than 90% of the rows are missing, per frame.
_null_share = train.isnull().sum() / train.shape[0]
many_null_cols = [col for col in train.columns if _null_share[col] > 0.9]
_null_share = test.isnull().sum() / test.shape[0]
many_null_cols_test = [col for col in test.columns if _null_share[col] > 0.9]

In [None]:
# Columns whose most frequent value (NaN counted as a value) covers more than 90% of the rows.
def _top_value_share(column):
    shares = column.value_counts(dropna=False, normalize=True)
    return shares.values[0]

big_top_value_cols = [col for col in train.columns if _top_value_share(train[col]) > 0.9]
big_top_value_cols_test = [col for col in test.columns if _top_value_share(test[col]) > 0.9]

In [None]:
# Union of every column flagged above in either frame: mostly missing,
# dominated by one value, or holding a single value.
cols_to_drop = set(many_null_cols + many_null_cols_test + big_top_value_cols + big_top_value_cols_test + one_value_cols + one_value_cols_test)
# The target must never be dropped. discard() (unlike list.remove) does not
# raise when 'isFraud' happens not to be among the flagged columns.
cols_to_drop.discard('isFraud')
cols_to_drop = list(cols_to_drop)
len(cols_to_drop)

In [None]:
train = train.drop(cols_to_drop, axis=1)
test = test.drop(cols_to_drop, axis=1)

In [None]:
# by https://www.kaggle.com/dimartinot
def clean_inf_nan(df):
    """Return a new frame in which +inf and -inf are replaced by NaN.

    The input frame is left untouched.
    """
    infinities = [np.inf, -np.inf]
    return df.replace(to_replace=infinities, value=np.nan)

# Replace infinite values with NaN in both datasets
train, test = map(clean_inf_nan, (train, test))

#### Split into Categorical and Numerical dataset

In [None]:
# All categorical columns
identity_cat_cols = ['id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18',
                     'id_19', 'id_20', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25',
                     'id_26', 'id_27', 'id_28', 'id_29', 'id_30', 'id_31', 'id_32',
                     'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo']
transaction_cat_cols = ['ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2',
                        'P_emaildomain_1', 'P_emaildomain_2', 'P_emaildomain_3',
                        'R_emaildomain_1', 'R_emaildomain_2', 'R_emaildomain_3','P_emaildomain', 'R_emaildomain',
                        'P_emaildomain_bin', 'R_emaildomain_bin', 'P_emaildomain_suffix', 'R_emaildomain_suffix',
                        'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9']

# These four already went through encode_FE earlier in the notebook and stay
# on the numerical side.
encoded_cols = ['addr1','card1','card2','card3']

# Keep only the categorical columns still present in train (some may have been
# dropped), in train's own column order so the list is reproducible.
_cat_candidates = set(identity_cat_cols + transaction_cat_cols) - set(encoded_cols)
cat_cols = [col for col in train.columns if col in _cat_candidates]

common_cat_cols = [col for col in test.columns if col in cat_cols]

# Do train and test share the same categorical columns? Compared as sets,
# because list equality would also depend on the column order.
set(cat_cols) == set(common_cat_cols)

In [None]:
# Categorical columns kept from train (frequency-encoded columns excluded)
print(cat_cols)

In [None]:
# Subset of cat_cols that also exists in test, listed in test's column order
print(common_cat_cols)

### Drop the categorical features that do not exist in the test dataset.

In [None]:
# Categorical part: only the columns that test has as well.
cat_train = train.loc[:, common_cat_cols]
cat_test = test.loc[:, common_cat_cols]

# Numerical part: whatever remains once the categorical columns are removed.
# Train drops all of cat_cols, so train-only categoricals disappear too.
num_train = train.drop(columns=cat_cols)
num_test = test.drop(columns=common_cat_cols)

cat_train.columns == cat_test.columns

### Investigating Cardinality

In [None]:
# Number of distinct values in each categorical column
cat_nunique = [cat_train[name].nunique() for name in common_cat_cols]
d = dict(zip(common_cat_cols, cat_nunique))

# Show the (column, cardinality) pairs, smallest cardinality first
sorted(d.items(), key=lambda pair: pair[1])

In [None]:
# Convert every categorical column to pandas' 'category' dtype
cat_train, cat_test = (frame.astype('category') for frame in (cat_train, cat_test))

print(cat_train.dtypes)

In [None]:
# Label-encoded columns: fewer than 10 distinct values in train
# (the cut-off of 10 is a judgement call).
low_cardinality_cols = [
    name for name in common_cat_cols
    if cat_train[name].nunique() < 10
]

# All remaining categorical columns: target-encoded, or eventually dropped.
high_cardinality_cols = list(set(common_cat_cols) - set(low_cardinality_cols))

print('Categorical columns that will be label encoded: ', low_cardinality_cols)
print('\nCategorical columns that will be target encoded (or dropped eventally): ', high_cardinality_cols)

In [None]:
# Low-cardinality slices of the categorical frames; these get label-encoded.
LE_train, LE_test = cat_train[low_cardinality_cols], cat_test[low_cardinality_cols]

print(LE_train.describe(), "---------------", LE_test.describe(), sep="\n")

In [None]:
# Label-encode each low-cardinality column with an encoder fitted on the
# string values of train and test together, so both share one code book.
from sklearn.preprocessing import LabelEncoder

for col in low_cardinality_cols:
    train_labels = list(LE_train[col].astype(str).values)
    test_labels = list(LE_test[col].astype(str).values)
    LEncoder = LabelEncoder()
    LEncoder.fit(train_labels + test_labels)
    LE_train[col] = LEncoder.transform(train_labels)
    LE_test[col] = LEncoder.transform(test_labels)

# Re-attach the encoded columns to the numerical part, matching rows by index.
train = num_train.merge(LE_train, how='left', left_index=True, right_index=True)
test = num_test.merge(LE_test, how='left', left_index=True, right_index=True)

print(train[low_cardinality_cols].describe(), "---------------",
      test[low_cardinality_cols].describe(), sep="\n")

**I actually tried target encoding the high cardinality columns, but the local validation turned out to be better off when we just drop them instead. So the following code blocks for target encoding are commented out.**

In [None]:
# high_cardinality_cols

In [None]:
# cat_train[high_cardinality_cols].shape

Our dataset has enough rows to do the following encoding split and label encoding!

In [None]:
# print(cat_train[high_cardinality_cols].describe())
# print("---------------")
# print(cat_test[high_cardinality_cols].describe())

In [None]:
# # target encode the data
# from category_encoders import MEstimateEncoder
# TE_train = cat_train[high_cardinality_cols] 
# TE_train = TE_train.merge(train['isFraud'], how='left', left_index=True, right_index=True)
# TE_test = cat_test[high_cardinality_cols]

# # encoding split
# X_encode = TE_train.sample(frac=0.3, random_state=0)
# y_encode = X_encode.pop("isFraud")

# # training split
# X_pretrain = TE_train.drop(X_encode.index)
# y_train = X_pretrain.pop("isFraud")

# #apply M-Estimate encoding (the choice of m is based on our previous cardinality investigation)
# for col in high_cardinality_cols:
#     if TE_train[col].nunique() < 100:
#         m = 0.5
#     elif TE_train[col].nunique() < 550:
#         m = 2.5
#     else:
#         m = 5
#     encoder = MEstimateEncoder(cols=[col], m=m)
#     encoder.fit(X_encode, y_encode)
#     X_pretrain = encoder.transform(X_pretrain)
#     TE_test = encoder.transform(TE_test)

In [None]:
# print(X_pretrain[high_cardinality_cols].describe())
# print("---------------")
# print(TE_test[high_cardinality_cols].describe())

In [None]:
# X_pretrain.shape

In [None]:
# train = X_pretrain.merge(train, how='left', left_index=True, right_index=True)
# test = TE_test.merge(test, how='left', left_index=True, right_index=True)

# print(train[common_cat_cols].describe())
# print("---------------")
# print(test[common_cat_cols].describe())

In [None]:
# Release the intermediate frames; train/test now hold the merged result.
# TE_train / TE_test only exist when the target-encoding cells are enabled.
del num_train, num_test, LE_train, LE_test#, TE_train, TE_test

# 6. Feature Selection - Time Consistency
After the above feature engineering, we've come a long way!
Now, we have 205 features in the train data. We will now check each of our features for "time consistency".

Following https://www.kaggle.com/cdeotte/xgb-fraud-with-magic-0-9600/notebook, a separate model is built for each feature. Each model is trained on the first month of the training data using only that one feature, and then predicts the last month of the training data. We want both training AUC and validation AUC to be above 0.5. It turns out that 19 features fail this test, so we will remove them. Additionally, we will remove 7 D columns that are mostly NaN. More techniques for feature selection are listed at https://www.kaggle.com/c/ieee-fraud-detection/discussion/111308.

In [None]:
# (rows, columns) of train after feature engineering and encoding
train.shape

In [None]:
# (rows, columns) of test after feature engineering and encoding
test.shape

In [None]:
# Start from every column of train, then remove the rejected features.
cols = list(train.columns)

# 7 D columns that are mostly NAN
mostly_nan_d_cols = ['D6','D7','D8','D9','D12','D13','D14']

# FAILED TIME CONSISTENCY TEST
failed_time_consistency = (
    ['C3','M5','id_08','id_33']
    + ['card4','id_07','id_14','id_21','id_30','id_32','id_34']
    + ['id_'+str(x) for x in range(22,28)]
)

# Names that are no longer present in train are simply skipped.
for c in mostly_nan_d_cols + failed_time_consistency:
    if c in cols:
        cols.remove(c)

In [None]:
# NOTE(review): cols is built from train.columns, so it still contains the
# target 'isFraud'; the count printed here presumably includes it -- confirm.
print('NOW USING THE FOLLOWING',len(cols),'FEATURES.')
np.array(cols)

In [None]:
# final_train keeps the target; final_test gets the same columns minus the
# target, in the same order. The list comprehension preserves the order of
# cols, whereas a set difference would yield an arbitrary column order that
# can change between runs.
final_train = train[cols].copy()
final_test = test[[c for c in cols if c != 'isFraud']].copy()

print(f'Our Final Train dataset has {final_train.shape[0]} rows and {final_train.shape[1]} columns.')
print(f'Our Final Test dataset has {final_test.shape[0]} rows and {final_test.shape[1]} columns.')

In [None]:
# final_train / final_test are independent copies, so the originals can go
del train, test

# 7. XGBoost Modelling
Hooray! We have 195 final features!

Let's continue with XGBoost Modelling!

## 7.1 Local Validation
For this competition, we will use local validation. I evaluated features by training on the first 75% of the train data and predicting the last 25% of the train data. 

In [None]:
# Separate the target from the features, then give the test frame exactly the
# columns (and column order) of the train features.
y_train = final_train['isFraud'].copy()
X_train = final_train.drop(columns='isFraud')
X_test = final_test.copy()
X_train, X_test = X_train.align(X_test, join='left', axis=1)

print(X_train.describe())
print(X_test.describe())

In [None]:
# First 75% of the rows (in index order) for fitting, the last 25% for validation
n_fit = (3 * len(X_train)) // 4
idxT, idxV = X_train.index[:n_fit], X_train.index[n_fit:]

In [None]:
# Share of each target class among the training rows
y_train.value_counts()/y_train.shape[0]

#### We face an imbalanced-data problem, but we will address it by subsampling!

In [None]:
import xgboost as xgb
print("XGBoost version:", xgb.__version__)

# Local-validation model: fitted on the first 75% of the rows (idxT) and
# early-stopped on the AUC of the last 25% (idxV).
clf = xgb.XGBClassifier( 
    n_estimators=2000,
    max_depth=12, 
    learning_rate=0.02, 
    subsample=0.8,
    colsample_bytree=0.4, 
    #missing=-1, 
    eval_metric='auc',
    # USE CPU
    #nthread=4,
    #tree_method='hist' 
    # USE GPU
    tree_method='gpu_hist' 
)

# X_train already holds exactly the feature columns. `cols` cannot be used to
# select them: it still lists the target 'isFraud', which was popped out of
# X_train, so `X_train.loc[idxT, cols]` raises KeyError on current pandas.
h = clf.fit(X_train.loc[idxT], y_train[idxT], 
            early_stopping_rounds=100,
            eval_set=[(X_train.loc[idxV],y_train[idxV])],
            verbose=50)
            

In [None]:
# Pair each importance with the feature name the fitted model itself recorded.
# `cols` is not a reliable source of names: it also lists the target 'isFraud',
# which is not a column of X_train.
feature_imp = pd.DataFrame({
    'Value': clf.feature_importances_,
    'Feature': clf.get_booster().feature_names,
})
plt.figure(figsize=(20, 10))
# Bar chart of the 50 most important features, highest first
sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False).iloc[:50])
plt.title('XGB Most Important Features')
plt.tight_layout()
plt.show()
# Free the model before the hyper-parameter search starts
del clf, h; x=gc.collect()

## 7.2 XGB Hyperopt
Reference: https://www.kaggle.com/kabure/extensive-eda-and-modeling-xgb-hyperopt/notebook

### Defining the HyperOpt function with parameters space and model

In [None]:
# Fresh features / target for the hyper-parameter search, with the test frame
# aligned to the train feature columns.
y_train = final_train['isFraud'].copy()
X_train = final_train.drop(columns='isFraud')
X_test = final_test.copy()
X_train, X_test = X_train.align(X_test, join='left', axis=1)

In [None]:
from sklearn.model_selection import KFold,TimeSeriesSplit
from sklearn.metrics import roc_auc_score
from xgboost import plot_importance
from sklearn.metrics import make_scorer

## Hyperopt modules
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, STATUS_RUNNING
from functools import partial

import time
def objective(params):
    """Hyperopt objective: negated mean ROC AUC over a time-ordered CV.

    Reads the module-level ``X_train`` / ``y_train``, fits one
    ``xgb.XGBClassifier`` per ``TimeSeriesSplit`` fold and scores it on that
    fold's validation rows.

    Args:
        params: one sample drawn from ``space``; it must provide every key
            read below.

    Returns:
        float: minus the mean validation AUC, so that ``fmin`` (a minimiser)
        effectively maximises AUC.
    """
    time1 = time.time()
    # Round the sampled values but keep them numeric instead of passing
    # formatted strings to the estimator.
    # NOTE(review): num_leaves, min_child_samples, feature_fraction and
    # bagging_fraction look like LightGBM parameter names; confirm that
    # XGBoost actually uses them.
    params = {
        'max_depth': int(params['max_depth']),
        'gamma': round(params['gamma'], 3),
        'subsample': round(params['subsample'], 2),
        'reg_alpha': round(params['reg_alpha'], 3),
        'reg_lambda': round(params['reg_lambda'], 3),
        'learning_rate': round(params['learning_rate'], 3),
        'num_leaves': int(params['num_leaves']),
        'colsample_bytree': round(params['colsample_bytree'], 3),
        'min_child_samples': int(params['min_child_samples']),
        'feature_fraction': round(params['feature_fraction'], 3),
        'bagging_fraction': round(params['bagging_fraction'], 3),
    }

    print("\n############## New Run ################")
    print(f"params = {params}")
    FOLDS = 7

    # Time-ordered folds: each validation block lies after its training block
    tss = TimeSeriesSplit(n_splits=FOLDS)
    score_mean = 0
    for count, (tr_idx, val_idx) in enumerate(tss.split(X_train, y_train), start=1):
        clf = xgb.XGBClassifier(
            n_estimators=2000, random_state=4, verbose=True, 
            tree_method='gpu_hist', early_stopping_rounds=100,
            **params
        )

        X_tr, X_vl = X_train.iloc[tr_idx, :], X_train.iloc[val_idx, :]
        y_tr, y_vl = y_train.iloc[tr_idx], y_train.iloc[val_idx]

        # Early stopping needs a validation set to monitor; without eval_set,
        # recent XGBoost releases refuse to fit.
        clf.fit(X_tr, y_tr, eval_set=[(X_vl, y_vl)], verbose=False)
        # AUC on the predicted fraud probability (column 1 of predict_proba)
        score = roc_auc_score(y_vl, clf.predict_proba(X_vl)[:, 1])
        score_mean += score
        print(f'{count} CV - score: {round(score, 4)}')
    time2 = time.time() - time1
    print(f"Total Time Run: {round(time2 / 60,2)}")
    gc.collect()
    print(f'Mean ROC_AUC: {score_mean / FOLDS}')
    return -(score_mean / FOLDS)


# Hyperopt search space: one entry per hyper-parameter read by objective().
space = {
    # max_depth: maximum depth of a tree. Deeper trees can learn relations
    # that are specific to a few samples, so this controls over-fitting.
    # Sampled in steps of 1 between 7 and 23 (deeper than the 3-10 range
    # that is typically quoted for this parameter).
    'max_depth': hp.quniform('max_depth', 7, 23, 1),
    
    # reg_alpha: L1 regularization term. L1 regularization encourages sparsity 
    # (meaning pulling weights to 0). It can be more useful when the objective
    # is logistic regression since you might need help with feature selection.
    'reg_alpha':  hp.uniform('reg_alpha', 0.01, 0.4),
    
    # reg_lambda: L2 regularization term. L2 encourages smaller weights; this
    # approach can be more useful in tree models, where zeroing 
    # features might not make much sense.
    'reg_lambda': hp.uniform('reg_lambda', 0.01, .4),
    
    # learning_rate (eta): shrinks the contribution of each boosting step,
    # which makes the model more robust.
    # Typical final values to be used: 0.01-0.2
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
    
    # colsample_bytree: fraction of the columns randomly sampled for each
    # tree. Typical values are 0.5-1; the range searched here is 0.3-0.9.
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, .9),
    
    # gamma: minimum loss reduction required to make a split. A node is split
    # only when the split reduces the loss by at least this much, so larger
    # values make the algorithm more conservative. Suitable values depend on
    # the loss function and should be tuned.
    'gamma': hp.uniform('gamma', 0.01, .7),
    
    # num_leaves: the number of leaf nodes to use. More leaves can improve
    # accuracy but also lead to overfitting.
    # NOTE(review): this looks like a LightGBM parameter name -- confirm
    # that XGBoost uses it.
    'num_leaves': hp.choice('num_leaves', list(range(20, 250, 10))),
    
    # min_child_samples: the minimum number of samples (data) to group into
    # a leaf. Larger values reduce overfitting but may lead to under-fitting.
    # NOTE(review): this looks like a LightGBM parameter name -- confirm
    # that XGBoost uses it.
    'min_child_samples': hp.choice('min_child_samples', list(range(100, 250, 10))),
    
    # subsample: fraction of the rows (observations) considered when
    # building each tree.
    'subsample': hp.choice('subsample', [0.2, 0.4, 0.5, 0.6, 0.7, .8, .9]),
    
    # feature_fraction: fraction of the features randomly selected for
    # training (as opposed to subsampling the training rows, as bagging
    # does). Smaller fractions reduce overfitting.
    # NOTE(review): this looks like a LightGBM parameter name -- confirm
    # that XGBoost uses it.
    'feature_fraction': hp.uniform('feature_fraction', 0.4, .8),
    
    # bagging_fraction: fraction of the training data randomly bagged
    # (subsampled).
    # NOTE(review): this looks like a LightGBM parameter name; the original
    # note says bagging only takes effect when a bagging frequency is set
    # as well, and none is defined in this space -- confirm.
    'bagging_fraction': hp.uniform('bagging_fraction', 0.4, .9)
    
}

In [None]:
# Run the TPE search: 25 evaluations of `objective` over `space`
best = fmin(objective, space, algo=tpe.suggest, max_evals=25)

# Translate the raw fmin result back into actual parameter values
# (hp.choice entries come back as indices into their option lists)
best_params = space_eval(space, best)

## 7.3 Training and Predicting with the Best Parameters


### Predicting X test

In [None]:
# max_depth is sampled with hp.quniform, which yields floats; the tree depth
# has to be an integer
best_params['max_depth'] = int(best_params['max_depth'])

In [None]:
# Display the parameter set selected by the search
best_params

#### Following is our best parameters derived from the XGB HyperOpt.

#### Just to make our kernel run faster (the HyperOpt process takes 4:45:26), the previous XGB HyperOpt has been commented out.

In [None]:
# Hard-coded parameter set from an earlier HyperOpt run. This assignment
# replaces whatever the search cells stored in best_params.
# NOTE(review): bagging_fraction, feature_fraction, min_child_samples and
# num_leaves look like LightGBM parameter names -- confirm XGBoost uses them.
best_params = {'bagging_fraction': 0.5818772519688797,
 'colsample_bytree': 0.3035307099891744,
 'feature_fraction': 0.795967379488282,
 'gamma': 0.6896677451866189,
 'learning_rate': 0.011336192527320772,
 'max_depth': 20,
 'min_child_samples': 140,
 'num_leaves': 230,
 'reg_alpha': 0.06035695642758,
 'reg_lambda': 0.012734543098346575,
 'subsample': 0.7}

In [None]:
# Fresh features / target for the final fit, with the test frame aligned to
# the train feature columns.
y_train = final_train['isFraud'].copy()
X_train = final_train.drop(columns='isFraud')
X_test = final_test.copy()
X_train, X_test = X_train.align(X_test, join='left', axis=1)

In [None]:
print("XGBoost version:", xgb.__version__)

# Final model: fitted on every training row with the tuned parameters.
# early_stopping_rounds is deliberately not set: no validation set is passed
# to fit(), and recent XGBoost releases refuse to fit with early stopping
# enabled but nothing to monitor. All n_estimators rounds are trained.
clf = xgb.XGBClassifier(
    n_estimators=2000,
    **best_params,
    tree_method='gpu_hist',
    verbose=50
)

clf.fit(X_train, y_train)

# Probability of the positive (fraud) class for every test row
y_preds = clf.predict_proba(X_test)[:,1] 

In [None]:
# Pair each importance with the feature name recorded by the fitted model.
# Zipping with `cols` mislabels the chart: `cols` also contains the target
# 'isFraud', so every name listed after it is shifted by one position
# relative to the importances.
feature_imp = pd.DataFrame({
    'Value': clf.feature_importances_,
    'Feature': clf.get_booster().feature_names,
})
plt.figure(figsize=(20, 10))
# Bar chart of the 50 most important features, highest first
sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False).iloc[:50])
plt.title('XGB Most Important Features')
plt.tight_layout()
plt.show()
del clf; x=gc.collect()

### Saving y_pred to csv

In [None]:
# Store the predicted fraud probabilities in the submission frame (`sub` is
# loaded elsewhere in the notebook) and write it out without the index column
sub['isFraud'] = y_preds
sub.to_csv('XGB_hypopt_model.csv', index=False)

In [None]:
# Histogram of the submitted fraud probabilities; the y-axis is capped at 5000
plt.hist(sub['isFraud'], bins=100)
plt.ylim(0, 5000)
plt.title('XGB Submission')
plt.show()