## Importing the Data

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import zipfile 
addresses = ["train_transaction.csv.zip", "test_transaction.csv.zip", "test_identity.csv.zip", "train_identity.csv.zip"]
base = '/content/drive/My Drive/data/' 

for i in range(len(addresses)):
  with zipfile.ZipFile((base + addresses[i]), 'r') as zip_ref:
      zip_ref.extractall('./')

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

In [4]:
train_identity = pd.read_csv("train_identity.csv")
test_identity = pd.read_csv("test_identity.csv")
train_transaction = pd.read_csv("train_transaction.csv")
test_transaction = pd.read_csv("test_transaction.csv")

In [5]:
train = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')
test = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')

In [6]:
del train_transaction
del test_transaction
del train_identity
del test_identity

## Data Analysis and Preprocessing

#### Handling missing values

In [7]:
# eksik değerleri ve önündeki ve arkasındaki değerleri dikkate alarak doldurduk çünkü çok fazla derecede missing value var

train = train.fillna(method='bfill') 
train = train.fillna(method='ffill')

In [8]:
test = test.fillna(method='bfill')
test = test.fillna(method='ffill')


#### Encoding Categorical Values


In [9]:
#Encoding yapmadan önce train ve test setindeki bazı columnların adları farklı bunları düzeltmeliyiz (id_12 ve id-12 gibi)

test.columns = test.columns.str.replace('-','_') 

In [10]:
oneHot = OneHotEncoder(categories = ['ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'P_emaildomain', 'R_emaildomain', 
          'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'DeviceType', 'DeviceInfo', 'id_12', 'id_13', 'id_14', 'id_15',
          'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28',
          'id_29', 'id_30', 'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38'])

le = LabelEncoder()

In [11]:
columns = ['ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'P_emaildomain', 'R_emaildomain', 
          'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'DeviceType', 'DeviceInfo', 'id_12', 'id_13', 'id_14', 'id_15',
          'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28',
          'id_29', 'id_30', 'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38']

In [12]:
for i in range(len(columns)):
  train[columns[i]] = le.fit_transform(train[columns[i]])
  test[columns[i]] = le.fit_transform(test[columns[i]])

In [13]:
# one-hot encoding'i istediğim kategorilere uygulatma kısmını başaramadım

#### Train/Test Split

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
train_y = train['isFraud']
train_x = train.drop(['isFraud'], axis=1)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(train_x, train_y, test_size = 0.33, random_state=42)

# bu bir kaggle yarışması olduğu için karşılaştırma yapabilmek adına train'i böldük; 
# test kısmını kaggle adına prediction vs olarak kullanacağız onun haricinde kullanılmayacak. 

In [17]:
del train
del train_x
del train_y

#### XGBoost, CatBoost and LightGBM w/ default parameters

In [18]:
import xgboost as xgb
from xgboost import XGBClassifier
import time

In [35]:
xgb_clf = XGBClassifier(n_jobs=-1)

In [25]:
start = time.time()

xgb_clf.fit(X_train, y_train) #default parameters

end = time.time()
print(end - start)

522.2809903621674


In [26]:
#xgboost with default parameters == 8.7 minutes

y_pred = xgb_clf.predict(X_test)

accuracy_score(y_test, y_pred) # with default parameters

0.973142308817266

In [27]:
!pip install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/20/37/bc4e0ddc30c07a96482abf1de7ed1ca54e59bba2026a33bca6d2ef286e5b/catboost-0.24.4-cp36-none-manylinux1_x86_64.whl (65.7MB)
[K     |████████████████████████████████| 65.8MB 99kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.24.4


In [24]:
from catboost import Pool, CatBoostClassifier, cv

catb = CatBoostClassifier(eval_metric='Accuracy',random_seed=42)

In [28]:
start = time.time()

catb.fit(X_train, y_train) # with default parameters

end = time.time()
print(end - start)

Learning rate set to 0.132431
0:	learn: 0.9661023	total: 589ms	remaining: 9m 48s
1:	learn: 0.9672952	total: 1.17s	remaining: 9m 42s
2:	learn: 0.9680408	total: 1.76s	remaining: 9m 44s
3:	learn: 0.9684730	total: 2.27s	remaining: 9m 25s
4:	learn: 0.9689684	total: 2.85s	remaining: 9m 28s
5:	learn: 0.9692944	total: 3.42s	remaining: 9m 27s
6:	learn: 0.9696457	total: 4s	remaining: 9m 28s
7:	learn: 0.9700703	total: 4.63s	remaining: 9m 34s
8:	learn: 0.9705809	total: 5.24s	remaining: 9m 36s
9:	learn: 0.9704949	total: 5.85s	remaining: 9m 39s
10:	learn: 0.9708286	total: 6.48s	remaining: 9m 42s
11:	learn: 0.9711470	total: 7.03s	remaining: 9m 38s
12:	learn: 0.9712153	total: 7.65s	remaining: 9m 40s
13:	learn: 0.9714554	total: 8.35s	remaining: 9m 47s
14:	learn: 0.9714655	total: 8.97s	remaining: 9m 49s
15:	learn: 0.9715211	total: 9.58s	remaining: 9m 49s
16:	learn: 0.9717081	total: 10.1s	remaining: 9m 46s
17:	learn: 0.9718572	total: 10.9s	remaining: 9m 53s
18:	learn: 0.9719381	total: 11.5s	remaining: 9m

In [31]:
# catboost with default parameters - 10.2280405 minutes

y_pred = catb.predict(X_test)

accuracy_score(y_test, y_pred) # with default parameters 

0.9800799470440632

In [19]:
!pip install lightgbm



In [None]:
from lightgbm import LGBMClassifier
lgb = LGBMClassifier(n_jobs=-1)

In [20]:
start = time.time()

lgb.fit(X_train, y_train) # with default parameters

end = time.time()
print(end - start)

85.94375777244568


In [21]:
# lightgbm with default parameters - 1.43239596 minutes

y_pred = lgb.predict(X_test)

accuracy_score(y_test, y_pred) # with default parameters 

0.9773603107569312

If we are to compare three gradient boosting algorithms with their accuracy and time-dependencies:



*   XGBoost : 8.7 minutes and 0.973142308817266 accuracy
*   CatBoost : 10.2280405 minutes and 0.9800799470440632 accuracy
*   LightGBM : 1.43239596 minutes and 0.9773603107569312 accuracy






#### Hyperparameter Optimization

For these algorithms, I am going to try one hyperparameter optimization method for each of them (Random Search for CatBoost, Grid Search for XGBoost and Bayesian Optimization for LightGBM) and see if the optimized results outperform default parameters. 

In [22]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt 

In [37]:
parameters = {'depth'         : sp_randInt(1, 10),
              'learning_rate' : sp_randFloat(),
              'iterations'    : sp_randInt(10, 100)
}

randm = RandomizedSearchCV(estimator = catb, param_distributions = parameters, 
                               cv = 2, n_iter = 10)
start = time.time()

randm.fit(X_train, y_train, eval_set=(X_test,y_test))

end = time.time()
print(end - start)

0:	learn: 0.9653339	test: 0.9643574	best: 0.9643574 (0)	total: 229ms	remaining: 12.1s
1:	learn: 0.9653339	test: 0.9643574	best: 0.9643574 (0)	total: 403ms	remaining: 10.5s
2:	learn: 0.9666936	test: 0.9658506	best: 0.9658506 (2)	total: 580ms	remaining: 9.85s
3:	learn: 0.9667593	test: 0.9659019	best: 0.9659019 (3)	total: 739ms	remaining: 9.24s
4:	learn: 0.9666987	test: 0.9659071	best: 0.9659071 (4)	total: 907ms	remaining: 8.88s
5:	learn: 0.9678209	test: 0.9670873	best: 0.9670873 (5)	total: 1.06s	remaining: 8.5s
6:	learn: 0.9676894	test: 0.9669436	best: 0.9670873 (5)	total: 1.23s	remaining: 8.24s
7:	learn: 0.9679321	test: 0.9672258	best: 0.9672258 (7)	total: 1.39s	remaining: 7.97s
8:	learn: 0.9679927	test: 0.9673079	best: 0.9673079 (8)	total: 1.56s	remaining: 7.8s
9:	learn: 0.9682404	test: 0.9674773	best: 0.9674773 (9)	total: 1.72s	remaining: 7.56s
10:	learn: 0.9682606	test: 0.9675337	best: 0.9675337 (10)	total: 1.89s	remaining: 7.38s
11:	learn: 0.9683921	test: 0.9676158	best: 0.9676158 (

In [38]:
randm.best_params_

{'depth': 6, 'iterations': 76, 'learning_rate': 0.8506940636978821}

In [40]:
print("Start Predicting")
y_pred = randm.best_estimator_.predict(X_test)
accuracy_score(y_test, y_pred)  

# it didn't outperform as CatBoost was originally 0.9800799470440632 accuracy

Start Predicting


0.9775912232718764

In [42]:
!pip install scikit-optimize

Collecting scikit-optimize
[?25l  Downloading https://files.pythonhosted.org/packages/8b/03/be33e89f55866065a02e515c5b319304a801a9f1027a9b311a9b1d1f8dc7/scikit_optimize-0.8.1-py2.py3-none-any.whl (101kB)
[K     |███▎                            | 10kB 15.6MB/s eta 0:00:01[K     |██████▌                         | 20kB 12.3MB/s eta 0:00:01[K     |█████████▊                      | 30kB 8.5MB/s eta 0:00:01[K     |█████████████                   | 40kB 7.3MB/s eta 0:00:01[K     |████████████████▏               | 51kB 4.4MB/s eta 0:00:01[K     |███████████████████▍            | 61kB 4.9MB/s eta 0:00:01[K     |██████████████████████▊         | 71kB 5.1MB/s eta 0:00:01[K     |██████████████████████████      | 81kB 5.3MB/s eta 0:00:01[K     |█████████████████████████████▏  | 92kB 5.6MB/s eta 0:00:01[K     |████████████████████████████████| 102kB 4.1MB/s 
Collecting pyaml>=16.9
  Downloading https://files.pythonhosted.org/packages/15/c4/1310a054d33abc318426a956e7d6df0df76a6dd

In [43]:
from skopt import BayesSearchCV
from skopt.callbacks import DeadlineStopper, VerboseCallback, DeltaXStopper
from skopt.space import Real, Categorical, Integer
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer

In [46]:
roc_auc = make_scorer(roc_auc_score, greater_is_better=True, needs_threshold=True)

search_spaces = {'iterations': Integer(10, 300),
                 'depth': Integer(1, 8),
                 'learning_rate': Real(0.01, 1.0, 'log-uniform')}

opt = BayesSearchCV(lgb,
                    search_spaces,
                    scoring= roc_auc,
                    n_iter=2,  
                    return_train_score=False,
                    refit=True, 
                    optimizer_kwargs={'base_estimator': 'GP'},
                    random_state=42)

opt.fit(X_train, y_train)

BayesSearchCV(cv=None, error_score='raise',
              estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None,
                                       colsample_bytree=1.0,
                                       importance_type='split',
                                       learning_rate=0.1, max_depth=-1,
                                       min_child_samples=20,
                                       min_child_weight=0.001,
                                       min_split_gain=0.0, n_estimators=100,
                                       n_jobs=-1, num_leaves=31, objective=None,
                                       random_state=None, reg_alpha=0.0,
                                       reg_lambda=0.0, si...
              pre_dispatch='2*n_jobs', random_state=42, refit=True,
              return_train_score=False,
              scoring=make_scorer(roc_auc_score, needs_threshold=True),
              search_spaces={'depth': Integer(low=1, high=8, prior='uniform', trans

In [47]:
print("Start Predicting")
y_pred = opt.best_estimator_.predict(X_test)
accuracy_score(y_test, y_pred) 

# this did not outperform either

Start Predicting


0.9755027478589279

In [48]:
opt.best_params_

OrderedDict([('depth', 7),
             ('iterations', 266),
             ('learning_rate', 0.04044084484117812)])