In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

**Read datasets**

In [2]:
import pandas as pd
train_transaction = pd.read_csv("input/train_transaction.csv",index_col='TransactionID')
train_id = pd.read_csv('input/train_identity.csv',index_col='TransactionID')
train_transaction = train_transaction.merge(train_id, how='left', left_index=True, right_index=True)

In [3]:
train_transaction.fillna(-999, inplace=True)

In [4]:
from sklearn.preprocessing import LabelEncoder

In [5]:
cols = ['ProductCD','card1','card2','card3','card4','card5','card6','addr1','addr2','P_emaildomain','R_emaildomain','M1','M2','M3','M4','M5','M6','M7','M8','M9','DeviceType','DeviceInfo'
]
category_cols = cols
for col in cols:
    train_transaction[col].fillna("unknown").astype('category')
    train_transaction[col] = LabelEncoder().fit_transform(train_transaction[col].astype(str))

In [6]:
cols = ['id_'+str(x) for x in range(12,39)]
category_cols += cols

for col in cols:
    train_transaction[col].fillna("unknown").astype('category')
    train_transaction[col] = LabelEncoder().fit_transform(train_transaction[col].astype(str))

In [7]:
category_cols

['ProductCD',
 'card1',
 'card2',
 'card3',
 'card4',
 'card5',
 'card6',
 'addr1',
 'addr2',
 'P_emaildomain',
 'R_emaildomain',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9',
 'DeviceType',
 'DeviceInfo',
 'id_12',
 'id_13',
 'id_14',
 'id_15',
 'id_16',
 'id_17',
 'id_18',
 'id_19',
 'id_20',
 'id_21',
 'id_22',
 'id_23',
 'id_24',
 'id_25',
 'id_26',
 'id_27',
 'id_28',
 'id_29',
 'id_30',
 'id_31',
 'id_32',
 'id_33',
 'id_34',
 'id_35',
 'id_36',
 'id_37',
 'id_38']

**Remove V columns**

In [8]:
cols = ['V'+str(x) for x in range(1,340)]
train_transaction_simplified = train_transaction.drop(cols, axis = 1)

In [9]:
# NORMALIZE D COLUMNS
for i in range(1,16):
    if i in [1,2,3,5,9]: continue
    train_transaction_simplified['D'+str(i)] =  train_transaction_simplified['D'+str(i)] - train_transaction_simplified.TransactionDT/np.float32(24*60*60)

**Remove ['D6','D7','D8','D9','D12','D13','D14'] columns - mostly NAN**

In [10]:
train_transaction_simplified = train_transaction_simplified.drop(['D6','D7','D8','D9','D12','D13','D14'], axis = 1)

In [11]:
# TRANSACTION AMT CENTS
train_transaction_simplified['cents'] = (train_transaction_simplified['TransactionAmt'] - np.floor(train_transaction_simplified['TransactionAmt'])).astype('float32')

**Split train data in train and "test"**

In [12]:
x_raw = train_transaction_simplified.drop('isFraud', axis = 1 )
y = train_transaction_simplified.isFraud


#x_train = x_raw[:3*len(x_raw)//4]
#x_test = x_raw[3*len(x_raw)//4:]

#y_train = y[:3*len(x_raw)//4]
#y_test = y[3*len(x_raw)//4:]

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

sscaler = StandardScaler()
x = sscaler.fit_transform(x_raw)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=0)

**Import metrics**

In [13]:
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import accuracy_score

In [None]:
import xgboost as xgbClf

xgb = xgbClf.XGBClassifier( 
        n_estimators=2000,
        max_depth=12, 
        learning_rate=0.02, 
        subsample=0.8,
        colsample_bytree=0.4, 
        missing=-1, 
        eval_metric='auc',
        # USE CPU
        #nthread=4,
        #tree_method='hist' 
        # USE GPU
        tree_method='gpu_hist' 
    )
xgb.fit(x_train, y_train, 
        eval_set=[(x_train,y_train)],
        verbose=50, early_stopping_rounds=100)

In [None]:
# predict on test set
xgb_pred = xgb.predict(x_test)
y_xgb  = xgb.predict_proba(x_test)
acc_s_xgb = accuracy_score(y_test, xgb_pred)

print ("Accuracy score XGB: ", acc_s_xgb)

curve_xgb   = roc_curve(y_test, y_xgb[:, 1])
auc_xgb   = auc(curve_xgb[0], curve_xgb[1])

**Use CatBoostClassifier for modeling**

array([[ 0.98932062, -0.4351738 ,  0.54724983, ..., -0.51925515,
        -0.47356535,  1.31426906],
       [-1.40506055, -0.39272565, -2.26188683, ...,  2.60248709,
        -0.47356535, -0.63911728],
       [ 1.69508777,  0.48491261,  0.54724983, ..., -0.51925515,
        -0.47356535, -0.87407648],
       ...,
       [-1.09685993, -0.24671184, -2.26188683, ...,  2.60248709,
        -0.47356535, -0.82109548],
       [ 0.79274864,  0.81941346,  0.54724983, ..., -0.51925515,
        -0.47356535, -0.87407648],
       [ 0.04759329, -0.36409237,  0.54724983, ..., -0.51925515,
        -0.47356535,  1.31426906]])

In [32]:
from catboost import CatBoostClassifier
hyper_params = {
    'n_estimators':5000,
    'learning_rate': 0.07,
    'eval_metric':'AUC',
    'loss_function':'Logloss',
    'random_seed':42,
    'metric_period':500,
    'od_wait':500,
    'depth': 8
} 
cbc = CatBoostClassifier(**hyper_params)

cbc.fit(x_train,y_train,eval_set=(x_test, y_test), use_best_model=True, verbose=True);     



0:	test: 0.6923952	best: 0.6923952 (0)	total: 485ms	remaining: 40m 25s
500:	test: 0.9325845	best: 0.9325845 (500)	total: 2m 6s	remaining: 18m 59s
1000:	test: 0.9469621	best: 0.9469621 (1000)	total: 4m 12s	remaining: 16m 47s
1500:	test: 0.9542216	best: 0.9542216 (1500)	total: 6m 19s	remaining: 14m 44s
2000:	test: 0.9583521	best: 0.9583521 (2000)	total: 8m 23s	remaining: 12m 34s
2500:	test: 0.9612013	best: 0.9612119 (2499)	total: 10m 27s	remaining: 10m 27s
3000:	test: 0.9629919	best: 0.9629919 (3000)	total: 12m 36s	remaining: 8m 23s
3500:	test: 0.9642353	best: 0.9642421 (3496)	total: 14m 39s	remaining: 6m 16s
4000:	test: 0.9652894	best: 0.9652959 (3997)	total: 16m 44s	remaining: 4m 10s
4500:	test: 0.9659295	best: 0.9659295 (4500)	total: 18m 47s	remaining: 2m 5s
4999:	test: 0.9665482	best: 0.9665503 (4997)	total: 20m 51s	remaining: 0us

bestTest = 0.9665503205
bestIteration = 4997

Shrink model to first 4998 iterations.


In [33]:
# predict on test set
cbc_pred = cbc.predict(x_test)
y_cbc  = cbc.predict_proba(x_test)
acc_s_cbc = accuracy_score(y_test, cbc_pred)

print ("Accuracy score cat boost classifier: ", acc_s_cbc)

curve_cbc   = roc_curve(y_test, y_cbc[:, 1])
auc_cbc   = auc(curve_cbc[0], curve_cbc[1])

Accuracy score cat boost classifier:  0.9873054040934286


**Draw ROC**

In [None]:
import matplotlib.pyplot as plt

plt.plot(curve_cbc[0], curve_cbc[1], label='cbc (area = %0.2f)' % auc_cbc)
#plt.plot(curve_xgb[0], curve_xgb[1], label='xgb (area = %0.2f)' % auc_xgb)

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve');

plt.legend();

In [None]:
from sklearn.metrics import average_precision_score, auc, roc_curve, precision_recall_curve
average_precision_cbc = average_precision_score(y_test, y_cbc[:,1])
#average_precision_xgb = average_precision_score(y_test, y_xgb[:,1])

print('Average precision-recall score CBC: {}'.format(average_precision_cbc))
#print('Average precision-recall score XGB: {}'.format(average_precision_xgb))

In [None]:
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

precision_cbc, recall_cbc, _ = precision_recall_curve(y_test, y_cbc[:,1])
#precision_xgb, recall_xgb, _ = precision_recall_curve(y_test, y_xgb[:,1])


plt.plot(recall_cbc, precision_cbc, label='cbc (area = %0.2f)' % average_precision_cbc)
#plt.plot(recall_xgb, precision_xgb, label='xgb (area = %0.2f)' % average_precision_xgb)


plt.xlim([0.0, 1.05])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('2-class Precision-Recall curve');

plt.legend();