In [1]:
##########################################################################
# Experiment 1 
##########################################################################
#
# Hypothesis: 'Wheter boosting algorithms can improve on 
#              the results obtained by the benchmark model'
#
# Dataset: Elliptic dataset (BTC Transaction Graph) [Classification]
#
# Benchmark model: RandomForest '(Anti-Money Laundering in Bitcoin: Experimenting 
#                                with Graph Convolutional Networks for Financial Forensics)'
#
# Models: AdaBoost, LogitBoost, GradientBoosting, XGBoost, LightGBM, CatBoost
#
# Author: Dylan Vassallo <dylan.vassallo.18@um.edu.mt>

# Importing dependencies
import cryptoaml.datareader as cdr
from collections import OrderedDict
from cryptoaml.models import (AdaBoostAlgo, 
                              LogitBoostAlgo,
                              GradientBoostAlgo, 
                              XgbBoostAlgo, 
                              LightGbmAlgo, 
                              CatBoostAlgo)

# # Suppress deprecation warning due to numpy
# import warnings
# warnings.filterwarnings("ignore", category=DeprecationWarning) 

  from numpy.core.umath_tests import inner1d


In [2]:
# # TODO -> Add verbose to datareader
# # TODO -> Create function which gets the ordered since I will be using this 

# # Create an ordered dictionary to hold the dataset with different sets of features 
# data_dict = OrderedDict()

# Create a new instance of the Elliptic Dataset
ell = cdr.get_data("elliptic")
print(ell.labels)



# data_dict["LF"] = lf_data

# # Data with all features (local features + aggregated features) 
# af_data = ell.get_data_split(train_perc=0.7, 
#                              input_feats="AF", 
#                              inc_unknown=False)
# data_dict["AF"] = af_data

{'licit': 0, 'illicit': 1, 'unknown': -1}


In [23]:
# Data with local features only 
lf_data = ell.train_test_split(train_size=0.7, feat_set="AF")
display(lf_data.train_X)

data = ell.train_test_split(train_size=0.7, feat_set=["LF","AF"])
display(data["LF"].train_X)
display(data["AF"].train_X)

Unnamed: 0,LF_0,LF_1,LF_2,LF_3,LF_4,LF_5,LF_6,LF_7,LF_8,LF_9,...,AGG_62,AGG_63,AGG_64,AGG_65,AGG_66,AGG_67,AGG_68,AGG_69,AGG_70,AGG_71
3,0.163054,1.963790,-0.646376,12.409294,-0.063725,9.782742,12.414558,-0.163645,-0.115831,0.043598,...,-0.577099,-0.613614,0.241128,0.241406,1.072793,0.085530,-0.131155,0.677799,-0.120613,-0.119792
5328,-0.075979,-0.081127,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,-0.064393,-0.071599,-0.049707,...,3.749738,3.038392,-0.979074,-0.978556,-0.098889,-0.087490,-0.084674,-0.140597,1.519700,1.521399
5325,-0.172972,-0.184668,-1.201369,-0.046932,-0.024025,-0.029140,-0.061584,-0.163644,-0.169451,-0.049632,...,-0.570998,-0.596794,1.297854,1.297925,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792
5321,-0.170492,-0.184668,-1.201369,-0.046932,-0.024025,-0.029140,-0.061584,-0.163641,-0.166952,-0.026844,...,-0.577099,-0.600999,0.241128,0.241406,0.018279,-0.068266,-0.093204,-0.068808,-0.120613,-0.119792
5314,-0.172974,-0.184668,-1.201369,-0.046932,-0.024025,-0.029140,-0.061584,-0.163644,-0.169453,-0.049651,...,-0.577099,-0.600999,0.241128,0.241406,0.018279,-0.068266,-0.093204,-0.068808,1.299939,1.301521
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134511,-0.172947,-0.070152,0.463609,-0.121970,-0.043875,-0.113002,-0.061584,-0.163606,-0.169421,-0.049707,...,-0.263236,0.900171,-0.979074,-0.978556,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792
134512,-0.172960,-0.081852,0.463609,-0.121970,-0.043875,-0.113002,-0.061584,-0.163619,-0.169434,-0.049707,...,-0.308073,0.408191,1.461330,1.461369,-0.098889,-0.087490,-0.084674,-0.140597,-1.760926,-1.760984
134525,-0.172950,-0.081127,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,-0.163609,-0.169424,-0.049707,...,-0.577099,-0.600999,0.241128,0.241406,-0.098889,-0.087490,-0.084674,-0.140597,1.519700,1.521399
134528,0.276966,-0.104657,0.463609,-0.121970,-0.043875,-0.113002,-0.061584,0.296720,0.284454,-0.049707,...,-0.053993,-0.172093,-0.979074,-0.978556,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792


Unnamed: 0,LF_0,LF_1,LF_2,LF_3,LF_4,LF_5,LF_6,LF_7,LF_8,LF_9,...,LF_83,LF_84,LF_85,LF_86,LF_87,LF_88,LF_89,LF_90,LF_91,LF_92
3,0.163054,1.963790,-0.646376,12.409294,-0.063725,9.782742,12.414558,-0.163645,-0.115831,0.043598,...,-0.255168,0.717108,-0.187191,-0.185274,3.991587,0.810879,-0.694235,2.084651,0.025308,0.025217
5328,-0.075979,-0.081127,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,-0.064393,-0.071599,-0.049707,...,-0.071741,-0.077429,1.125590,1.128038,-0.118307,2.570295,3.257716,2.308518,1.135523,1.135279
5325,-0.172972,-0.184668,-1.201369,-0.046932,-0.024025,-0.029140,-0.061584,-0.163644,-0.169451,-0.049632,...,-0.255168,-0.259251,-0.187191,-0.185274,-0.293897,-0.761274,-0.693556,-0.720554,1.014190,1.135279
5321,-0.170492,-0.184668,-1.201369,-0.046932,-0.024025,-0.029140,-0.061584,-0.163641,-0.166952,-0.026844,...,-0.255168,-0.259251,-0.187191,-0.185274,-0.293897,-0.761276,-0.693591,-0.720508,1.063289,1.135279
5314,-0.172974,-0.184668,-1.201369,-0.046932,-0.024025,-0.029140,-0.061584,-0.163644,-0.169453,-0.049651,...,-0.255168,-0.259251,-0.187191,-0.185274,-0.293897,-0.761259,-0.693554,-0.720528,1.031689,1.135279
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134511,-0.172947,-0.070152,0.463609,-0.121970,-0.043875,-0.113002,-0.061584,-0.163606,-0.169421,-0.049707,...,-0.255168,-0.259251,-0.187191,-0.185274,-0.293859,-0.048410,0.168705,-0.084557,1.135523,1.135279
134512,-0.172960,-0.081852,0.463609,-0.121970,-0.043875,-0.113002,-0.061584,-0.163619,-0.169434,-0.049707,...,-0.254948,-0.259033,-1.499972,-1.498585,1.574721,0.142171,-0.430196,0.697089,-1.084907,-1.084845
134525,-0.172950,-0.081127,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,-0.163609,-0.169424,-0.049707,...,1.282855,1.265309,1.125590,1.128038,-0.293527,1.034900,1.478659,0.881751,1.135523,1.135279
134528,0.276966,-0.104657,0.463609,-0.121970,-0.043875,-0.113002,-0.061584,0.296720,0.284454,-0.049707,...,4.538849,4.506234,1.125590,1.128038,1.889175,1.454720,1.017562,1.970679,-1.084907,-1.084845


Unnamed: 0,LF_0,LF_1,LF_2,LF_3,LF_4,LF_5,LF_6,LF_7,LF_8,LF_9,...,AGG_62,AGG_63,AGG_64,AGG_65,AGG_66,AGG_67,AGG_68,AGG_69,AGG_70,AGG_71
3,0.163054,1.963790,-0.646376,12.409294,-0.063725,9.782742,12.414558,-0.163645,-0.115831,0.043598,...,-0.577099,-0.613614,0.241128,0.241406,1.072793,0.085530,-0.131155,0.677799,-0.120613,-0.119792
5328,-0.075979,-0.081127,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,-0.064393,-0.071599,-0.049707,...,3.749738,3.038392,-0.979074,-0.978556,-0.098889,-0.087490,-0.084674,-0.140597,1.519700,1.521399
5325,-0.172972,-0.184668,-1.201369,-0.046932,-0.024025,-0.029140,-0.061584,-0.163644,-0.169451,-0.049632,...,-0.570998,-0.596794,1.297854,1.297925,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792
5321,-0.170492,-0.184668,-1.201369,-0.046932,-0.024025,-0.029140,-0.061584,-0.163641,-0.166952,-0.026844,...,-0.577099,-0.600999,0.241128,0.241406,0.018279,-0.068266,-0.093204,-0.068808,-0.120613,-0.119792
5314,-0.172974,-0.184668,-1.201369,-0.046932,-0.024025,-0.029140,-0.061584,-0.163644,-0.169453,-0.049651,...,-0.577099,-0.600999,0.241128,0.241406,0.018279,-0.068266,-0.093204,-0.068808,1.299939,1.301521
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134511,-0.172947,-0.070152,0.463609,-0.121970,-0.043875,-0.113002,-0.061584,-0.163606,-0.169421,-0.049707,...,-0.263236,0.900171,-0.979074,-0.978556,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792
134512,-0.172960,-0.081852,0.463609,-0.121970,-0.043875,-0.113002,-0.061584,-0.163619,-0.169434,-0.049707,...,-0.308073,0.408191,1.461330,1.461369,-0.098889,-0.087490,-0.084674,-0.140597,-1.760926,-1.760984
134525,-0.172950,-0.081127,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,-0.163609,-0.169424,-0.049707,...,-0.577099,-0.600999,0.241128,0.241406,-0.098889,-0.087490,-0.084674,-0.140597,1.519700,1.521399
134528,0.276966,-0.104657,0.463609,-0.121970,-0.043875,-0.113002,-0.061584,0.296720,0.284454,-0.049707,...,-0.053993,-0.172093,-0.979074,-0.978556,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792


In [4]:
# # Create an ordered dictionary to hold the dataset with different sets of features 
# models = OrderedDict()

# # Create instances for the tested boosting algorithms 
# adaBoost = AdaBoostAlgo()
# models["AdaBoost"] = adaBoost 

# logitBoost = LogitBoostAlgo()
# models["LogitBoost"] = logitBoost 

In [5]:
# # random_grid_tune = {
# #     "name": "random_grid",
# #     "cv": 5,
# #     "n_iter": 2,
# #     "param_grid" :{"learning_rate": [0.1, 0.01, 0.001],
# #                    "gamma" : [0.01, 0.1, 0.3, 0.5, 1, 1.5, 2],
# #                    "max_depth": [2, 4, 7, 10],
# #                    "colsample_bytree": [0.3, 0.6, 0.8, 1.0],
# #                    "subsample": [0.2, 0.4, 0.5, 0.6, 0.7],
# #                    "reg_alpha": [0, 0.5, 1],
# #                    "reg_lambda": [1, 1.5, 2, 3, 4.5],
# #                    "min_child_weight": [1, 3, 5, 7],
# #                    "n_estimators": [100, 250, 500, 1000]}
# # }

# xgboost = XgbBoostAlgo()
# # xgboost.fit(af_datasplit.train_X, af_datasplit.train_y, tune=random_grid_tune)

# xgboost.fit(af_datasplit.train_X, af_datasplit.train_y)
# print(xgboost.params)
# metrics = xgboost.evaluate(metrics=["precision", "recall", "f1", "f1_micro"], 
#                            X_test=af_datasplit.test_X, 
#                            y_test=af_datasplit.test_y)
# print(metrics)

# import pandas as pd
# pd.set_option('display.max_rows', 500)
# display(xgboost.feature_importance.head())

In [6]:
# lightGBM = LightGbmAlgo()

# lightGBM.fit(af_datasplit.train_X, af_datasplit.train_y)
# print(lightGBM.params)
# metrics = lightGBM.evaluate(metrics=["precision", "recall", "f1", "f1_micro"], 
#                            X_test=af_datasplit.test_X, 
#                            y_test=af_datasplit.test_y)
# print(metrics)

# import pandas as pd
# pd.set_option('display.max_rows', 500)
# display(lightGBM.feature_importance.head())

In [7]:
# adaBoost = AdaBoostAlgo()

# adaBoost.fit(af_datasplit.train_X, af_datasplit.train_y)
# print(adaBoost.params)
# metrics = adaBoost.evaluate(metrics=["precision", "recall", "f1", "f1_micro"], 
#                            X_test=af_datasplit.test_X, 
#                            y_test=af_datasplit.test_y)
# print(metrics)

# import pandas as pd
# pd.set_option('display.max_rows', 500)
# display(adaBoost.feature_importance.head())

In [8]:
# catBoost = CatBoostAlgo(verbose=0)

# catBoost.fit(af_datasplit.train_X, af_datasplit.train_y)
# print(catBoost.params)
# metrics = catBoost.evaluate(metrics=["precision", "recall", "f1", "f1_micro"], 
#                            X_test=af_datasplit.test_X, 
#                            y_test=af_datasplit.test_y)
# print(metrics)

# import pandas as pd
# pd.set_option('display.max_rows', 500)
# display(catBoost.feature_importance.tail())

In [9]:
# gboost = GradientBoostAlgo()

# gboost.fit(af_datasplit.train_X, af_datasplit.train_y)
# print(gboost.params)
# metrics = gboost.evaluate(metrics=["precision", "recall", "f1", "f1_micro"], 
#                            X_test=af_datasplit.test_X, 
#                            y_test=af_datasplit.test_y)
# print(metrics)

# import pandas as pd
# pd.set_option('display.max_rows', 500)
# display(gboost.feature_importance.head())

In [10]:
# logitBoost = LogitBoostAlgo()

# logitBoost.fit(af_datasplit.train_X, af_datasplit.train_y)
# print(logitBoost.params)
# metrics = logitBoost.evaluate(metrics=["precision", "recall", "f1", "f1_micro"], 
#                            X_test=af_datasplit.test_X, 
#                            y_test=af_datasplit.test_y)
# print(metrics)

# import pandas as pd
# pd.set_option('display.max_rows', 500)
# display(logitBoost.feature_importance.head())

In [11]:
# xgboost.save("saved_models/experiment_1/xgboost_elliptic_af")


# xgboost_load = XgbBoostAlgo()
# xgboost_load.load("saved_models/experiment_1/xgboost_elliptic_af")
# print(xgboost_load.params)
# metrics = xgboost_load.evaluate(metrics=["precision", "recall", "f1", "f1_micro"], 
#                            X_test=af_datasplit.test_X, 
#                            y_test=af_datasplit.test_y)    
# print(metrics)
# display(xgboost_load.feature_importance.head())

# y_pred = xgboost.predict(af_datasplit.test_X)
# print(f1_score(af_datasplit.test_y, y_pred, average="binary"))


# # ###############################

In [12]:
# # importing dependencies
# import cryptoaml.datareader as cdr
# from collections import OrderedDict
# from cryptoaml.models import XgbBoostAlgo, LightGbmAlgo

# import xgboost as xgb
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import RandomizedSearchCV

In [13]:
# # create a new instance of Elliptic Dataset
# ell = cdr.get_data("elliptic", 
#                    encode_classes=True)

In [14]:
# # xgbAlgo = XgbBoostAlgo() 
# # print(xgbAlgo.params)

# # lgb = LightGbmAlgo() 
# # print(lgb.params)
# af_datasplit = ell.get_data_split(train_perc=0.7, input_feats="AF", inc_unknown=False)

In [15]:
# grid_param = {"learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
#  "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
#  "min_child_weight" : [ 1, 3, 5, 7 ],
#  "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
#  "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ] }

# classifier = xgb.XGBClassifier()
# gd_sr = RandomizedSearchCV(estimator=classifier,
#                            param_distributions=grid_param,
#                            scoring="f1",
#                            n_iter = 10, 
#                            cv=5,
#                            n_jobs=-1)

# gd_sr.fit(af_datasplit.train_X, af_datasplit.train_y)

In [16]:
# best_parameters = gd_sr.best_params_
# print(best_parameters)

In [17]:
# best_model = gd_sr.best_estimator_
# print(best_model.get_params())

In [18]:
# y_pred = best_model.predict(af_datasplit.test_X)

In [19]:
# from sklearn.metrics import f1_score
# print(f1_score(af_datasplit.test_y, y_pred, average="binary"))

In [20]:
# from sklearn.metrics import f1_score
# classifier_2 = RandomForestClassifier(n_estimators=50, max_features=50)
# classifier_2.fit(af_datasplit.train_X, af_datasplit.train_y)
# y_pred_2 = classifier_2.predict(af_datasplit.test_X)
# print(f1_score(af_datasplit.test_y, y_pred_2, average="binary"))