In [1]:
##########################################################################
# Experiment 1 
##########################################################################
#
# Hypothesis: 'Whether boosting algorithms can improve on 
#              the results obtained by the benchmark model'
#
# Dataset: Elliptic dataset (BTC Transaction Graph) [Classification]
#
# Benchmark model: RandomForest '(Anti-Money Laundering in Bitcoin: Experimenting 
#                                with Graph Convolutional Networks for Financial Forensics)'
#
# Models: AdaBoost, LogitBoost, GradientBoosting, XGBoost, LightGBM, CatBoost
#
# Author: Dylan Vassallo <dylan.vassallo.18@um.edu.mt>

# Importing dependencies
import cryptoaml.datareader as cdr
from collections import OrderedDict
from cryptoaml.models import (RandomForestAlgo, 
                              AdaBoostAlgo, 
                              LogitBoostAlgo,
                              GradientBoostAlgo, 
                              XgbBoostAlgo, 
                              LightGbmAlgo, 
                              CatBoostAlgo)

# # Suppress deprecation warning due to numpy
# import warnings
# warnings.filterwarnings("ignore", category=DeprecationWarning) 

  from numpy.core.umath_tests import inner1d


In [2]:
# Create a new instance of the Elliptic dataset (BTC transaction graph, see header)
# via the project's cryptoaml data reader. The returned object is what the
# following cell calls `train_test_split` on to obtain the per-feature-set splits.
data = cdr.get_data("elliptic")


In [42]:

# Build one train/test split per requested feature set ("LF" and "AF").
# The result is indexed by feature-set name; each entry exposes at least `test_X`.
# NOTE(review): presumably train_size=0.7 means a 70/30 split and inc_meta=False
# drops non-feature metadata columns -- confirm against cryptoaml.datareader.
data_sets = data.train_test_split(
    train_size=0.7,
    feat_set=["LF", "AF"],
    inc_meta=False,
)

# Sanity check: preview the first rows of the "LF" test features.
display(
    data_sets["LF"]
    .test_X
    .head()
)

Unnamed: 0,LF_0,LF_1,LF_2,LF_3,LF_4,LF_5,LF_6,LF_7,LF_8,LF_9,...,LF_83,LF_84,LF_85,LF_86,LF_87,LF_88,LF_89,LF_90,LF_91,LF_92
139873,-0.172796,-0.120022,1.018602,-0.12197,-0.063725,-0.113002,-0.061584,-0.163452,-0.169269,-0.049707,...,-0.255168,-0.259251,-0.187191,-0.185274,-0.293715,-0.761914,-0.694235,-0.720879,0.025308,0.025217
139891,-0.054145,-0.105252,0.463609,-0.12197,-0.043875,-0.113002,-0.061584,-0.042054,-0.049573,-0.049707,...,-0.255168,-0.259251,-0.187191,-0.185274,-0.293583,-0.761719,-0.694058,-0.720663,1.135523,1.135279
139888,-0.111095,-0.064302,-0.091383,-0.12197,-0.043875,-0.113002,-0.061584,-0.100322,-0.107024,-0.049707,...,-0.255168,-0.259251,-0.187191,-0.185274,-0.293755,-0.760384,-0.692367,-0.719528,-1.084907,-1.084845
139883,0.504442,0.100068,-0.646376,-0.12197,-0.043875,-0.113002,-0.061584,0.529467,0.513939,-0.049707,...,0.126695,0.119269,1.12559,1.128038,-0.225881,2.477036,3.19268,2.190127,1.135523,1.135279
139882,-0.171828,-0.133534,1.018602,-0.12197,-0.063725,-0.113002,-0.061584,-0.162463,-0.168294,-0.049707,...,-0.255168,-0.259251,-0.187191,-0.185274,-0.293557,-0.761856,-0.694235,-0.720776,0.025308,0.025217


In [37]:
# Instantiate the benchmark model (RandomForest, see header) and print its
# resolved parameter set so the configuration is recorded in the cell output.
# NOTE(review): 50 estimators / 50 max features presumably mirror the benchmark
# paper's configuration -- confirm. No random_state is set, so results may vary
# between runs.
rf = RandomForestAlgo(
    n_estimators=50,
    max_features=50,
)
print(rf.params)

# for feature_set in data_sets:


{'bootstrap': True, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 50, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 50, 'n_jobs': 1, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


In [5]:
# # Create a collection of models which will be tested 
# models = OrderedDict()

# # XGBoost 
# xgboost = XgbBoostAlgo()
# models["xgboost"] = xgboost 

In [6]:
# # Hyperparameter configuration (random_search)

# # Xgboost parameter grid 
# xgboost_tune = {
#     "name": "random_grid",
#     "cv": 5,
#     "n_iter": 2,
#     "param_grid" :{"learning_rate": [0.1, 0.01, 0.001],
#                    "gamma" : [0.01, 0.1, 0.3, 0.5, 1, 1.5, 2],
#                    "max_depth": [2, 4, 7, 10],
#                    "colsample_bytree": [0.3, 0.6, 0.8, 1.0],
#                    "subsample": [0.2, 0.4, 0.5, 0.6, 0.7],
#                    "reg_alpha": [0, 0.5, 1],
#                    "reg_lambda": [1, 1.5, 2, 3, 4.5],
#                    "min_child_weight": [1, 3, 5, 7],
#                    "n_estimators": [100, 250, 500, 1000]}
# }

In [7]:
# # Start training models on different feature sets 

# # Loop all feature sets 
# for feature_set in data_sets:
    
#     # Get current feature set 
#     tmp_dataset = data_sets[feature_set]
    
#     # Training set 
#     tmp_dataset_train_X = tmp_dataset.train_X
#     tmp_dataset_train_y = tmp_dataset.train_y
    
#     # Test set 
#     tmp_dataset_test_X = tmp_dataset.test_X
#     tmp_dataset_test_y = tmp_dataset.test_y

#     # Train/Tune/Test all models 
#     for model in models:
        
#         tmp_model = models[model]
        
#         print(model)
#         print(feature_set)


In [8]:
# # random_grid_tune = {
# #     "name": "random_grid",
# #     "cv": 5,
# #     "n_iter": 2,
# #     "param_grid" :{"learning_rate": [0.1, 0.01, 0.001],
# #                    "gamma" : [0.01, 0.1, 0.3, 0.5, 1, 1.5, 2],
# #                    "max_depth": [2, 4, 7, 10],
# #                    "colsample_bytree": [0.3, 0.6, 0.8, 1.0],
# #                    "subsample": [0.2, 0.4, 0.5, 0.6, 0.7],
# #                    "reg_alpha": [0, 0.5, 1],
# #                    "reg_lambda": [1, 1.5, 2, 3, 4.5],
# #                    "min_child_weight": [1, 3, 5, 7],
# #                    "n_estimators": [100, 250, 500, 1000]}
# # }

# xgboost = XgbBoostAlgo()
# # xgboost.fit(af_datasplit.train_X, af_datasplit.train_y, tune=random_grid_tune)

# xgboost.fit(af_datasplit.train_X, af_datasplit.train_y)
# print(xgboost.params)
# metrics = xgboost.evaluate(metrics=["precision", "recall", "f1", "f1_micro"], 
#                            X_test=af_datasplit.test_X, 
#                            y_test=af_datasplit.test_y)
# print(metrics)

# import pandas as pd
# pd.set_option('display.max_rows', 500)
# display(xgboost.feature_importance.head())

In [9]:
# lightGBM = LightGbmAlgo()

# lightGBM.fit(af_datasplit.train_X, af_datasplit.train_y)
# print(lightGBM.params)
# metrics = lightGBM.evaluate(metrics=["precision", "recall", "f1", "f1_micro"], 
#                            X_test=af_datasplit.test_X, 
#                            y_test=af_datasplit.test_y)
# print(metrics)

# import pandas as pd
# pd.set_option('display.max_rows', 500)
# display(lightGBM.feature_importance.head())

In [10]:
# adaBoost = AdaBoostAlgo()

# adaBoost.fit(af_datasplit.train_X, af_datasplit.train_y)
# print(adaBoost.params)
# metrics = adaBoost.evaluate(metrics=["precision", "recall", "f1", "f1_micro"], 
#                            X_test=af_datasplit.test_X, 
#                            y_test=af_datasplit.test_y)
# print(metrics)

# import pandas as pd
# pd.set_option('display.max_rows', 500)
# display(adaBoost.feature_importance.head())

In [11]:
# catBoost = CatBoostAlgo(verbose=0)

# catBoost.fit(af_datasplit.train_X, af_datasplit.train_y)
# print(catBoost.params)
# metrics = catBoost.evaluate(metrics=["precision", "recall", "f1", "f1_micro"], 
#                            X_test=af_datasplit.test_X, 
#                            y_test=af_datasplit.test_y)
# print(metrics)

# import pandas as pd
# pd.set_option('display.max_rows', 500)
# display(catBoost.feature_importance.tail())

In [12]:
# gboost = GradientBoostAlgo()

# gboost.fit(af_datasplit.train_X, af_datasplit.train_y)
# print(gboost.params)
# metrics = gboost.evaluate(metrics=["precision", "recall", "f1", "f1_micro"], 
#                            X_test=af_datasplit.test_X, 
#                            y_test=af_datasplit.test_y)
# print(metrics)

# import pandas as pd
# pd.set_option('display.max_rows', 500)
# display(gboost.feature_importance.head())

In [13]:
# logitBoost = LogitBoostAlgo()

# logitBoost.fit(af_datasplit.train_X, af_datasplit.train_y)
# print(logitBoost.params)
# metrics = logitBoost.evaluate(metrics=["precision", "recall", "f1", "f1_micro"], 
#                            X_test=af_datasplit.test_X, 
#                            y_test=af_datasplit.test_y)
# print(metrics)

# import pandas as pd
# pd.set_option('display.max_rows', 500)
# display(logitBoost.feature_importance.head())

In [14]:
# xgboost.save("saved_models/experiment_1/xgboost_elliptic_af")


# xgboost_load = XgbBoostAlgo()
# xgboost_load.load("saved_models/experiment_1/xgboost_elliptic_af")
# print(xgboost_load.params)
# metrics = xgboost_load.evaluate(metrics=["precision", "recall", "f1", "f1_micro"], 
#                            X_test=af_datasplit.test_X, 
#                            y_test=af_datasplit.test_y)    
# print(metrics)
# display(xgboost_load.feature_importance.head())

# y_pred = xgboost.predict(af_datasplit.test_X)
# print(f1_score(af_datasplit.test_y, y_pred, average="binary"))


# # ###############################

In [15]:
# # importing dependencies
# import cryptoaml.datareader as cdr
# from collections import OrderedDict
# from cryptoaml.models import XgbBoostAlgo, LightGbmAlgo

# import xgboost as xgb
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import RandomizedSearchCV

In [16]:
# # create a new instance of Elliptic Dataset
# ell = cdr.get_data("elliptic", 
#                    encode_classes=True)

In [17]:
# # xgbAlgo = XgbBoostAlgo() 
# # print(xgbAlgo.params)

# # lgb = LightGbmAlgo() 
# # print(lgb.params)
# af_datasplit = ell.get_data_split(train_perc=0.7, input_feats="AF", inc_unknown=False)

In [18]:
# grid_param = {"learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
#  "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
#  "min_child_weight" : [ 1, 3, 5, 7 ],
#  "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
#  "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ] }

# classifier = xgb.XGBClassifier()
# gd_sr = RandomizedSearchCV(estimator=classifier,
#                            param_distributions=grid_param,
#                            scoring="f1",
#                            n_iter = 10, 
#                            cv=5,
#                            n_jobs=-1)

# gd_sr.fit(af_datasplit.train_X, af_datasplit.train_y)

In [19]:
# best_parameters = gd_sr.best_params_
# print(best_parameters)

In [20]:
# best_model = gd_sr.best_estimator_
# print(best_model.get_params())

In [21]:
# y_pred = best_model.predict(af_datasplit.test_X)

In [22]:
# from sklearn.metrics import f1_score
# print(f1_score(af_datasplit.test_y, y_pred, average="binary"))

In [23]:
# from sklearn.metrics import f1_score
# classifier_2 = RandomForestClassifier(n_estimators=50, max_features=50)
# classifier_2.fit(af_datasplit.train_X, af_datasplit.train_y)
# y_pred_2 = classifier_2.predict(af_datasplit.test_X)
# print(f1_score(af_datasplit.test_y, y_pred_2, average="binary"))