In [14]:
##########################################################################
# Experiment 1 
##########################################################################
#
# Hypothesis: 'Whether boosting algorithms can improve on
#              the results obtained by the benchmark model'
#
# Dataset: Elliptic dataset (BTC Transaction Graph) [Classification]
#
# Benchmark model: RandomForest '(Anti-Money Laundering in Bitcoin: Experimenting 
#                                with Graph Convolutional Networks for Financial Forensics)'
#
# Models: AdaBoost, LogitBoost, GradientBoosting, XGBoost, LightGBM, CatBoost
#
# Author: Dylan Vassallo <dylan.vassallo.18@um.edu.mt>

# Importing dependencies
import cryptoaml.datareader as cdr
from collections import OrderedDict
from cryptoaml.models import (AdaBoostAlgo, 
                              LogitBoostAlgo,
                              GradientBoostAlgo, 
                              XgbBoostAlgo, 
                              LightGbmAlgo, 
                              CatBoostAlgo)

# # Suppress deprecation warning due to numpy
# import warnings
# warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [11]:
# TODO -> Add verbose to datareader
# TODO -> Create a function which gets the ordered dictionary, since I will be reusing this

# Load the Elliptic dataset via the project's data reader (class encoding enabled)
ell = cdr.get_data("elliptic", encode_classes=True)

# Split built from the local features only ("LF")
lf_data = ell.get_data_split(train_perc=0.7, input_feats="LF", inc_unknown=False)

# Split built from all features ("AF": local features + aggregated features)
af_data = ell.get_data_split(train_perc=0.7, input_feats="AF", inc_unknown=False)

# Ordered mapping from feature-set tag to its data split (insertion order: LF, then AF)
data_dict = OrderedDict([("LF", lf_data), ("AF", af_data)])

In [16]:
# Ordered dictionary holding the boosting models under test, keyed by algorithm name
models = OrderedDict()

# Instantiate each boosting algorithm with its default constructor arguments
adaBoost, logitBoost = AdaBoostAlgo(), LogitBoostAlgo()

# Register the instances; insertion order is AdaBoost, then LogitBoost
models.update([("AdaBoost", adaBoost), ("LogitBoost", logitBoost)])

In [3]:
# # random_grid_tune = {
# #     "name": "random_grid",
# #     "cv": 5,
# #     "n_iter": 2,
# #     "param_grid" :{"learning_rate": [0.1, 0.01, 0.001],
# #                    "gamma" : [0.01, 0.1, 0.3, 0.5, 1, 1.5, 2],
# #                    "max_depth": [2, 4, 7, 10],
# #                    "colsample_bytree": [0.3, 0.6, 0.8, 1.0],
# #                    "subsample": [0.2, 0.4, 0.5, 0.6, 0.7],
# #                    "reg_alpha": [0, 0.5, 1],
# #                    "reg_lambda": [1, 1.5, 2, 3, 4.5],
# #                    "min_child_weight": [1, 3, 5, 7],
# #                    "n_estimators": [100, 250, 500, 1000]}
# # }

# xgboost = XgbBoostAlgo()
# # xgboost.fit(af_datasplit.train_X, af_datasplit.train_y, tune=random_grid_tune)

# xgboost.fit(af_datasplit.train_X, af_datasplit.train_y)
# print(xgboost.params)
# metrics = xgboost.evaluate(metrics=["precision", "recall", "f1", "f1_micro"], 
#                            X_test=af_datasplit.test_X, 
#                            y_test=af_datasplit.test_y)
# print(metrics)

# import pandas as pd
# pd.set_option('display.max_rows', 500)
# display(xgboost.feature_importance.head())

In [4]:
# lightGBM = LightGbmAlgo()

# lightGBM.fit(af_datasplit.train_X, af_datasplit.train_y)
# print(lightGBM.params)
# metrics = lightGBM.evaluate(metrics=["precision", "recall", "f1", "f1_micro"], 
#                            X_test=af_datasplit.test_X, 
#                            y_test=af_datasplit.test_y)
# print(metrics)

# import pandas as pd
# pd.set_option('display.max_rows', 500)
# display(lightGBM.feature_importance.head())

In [5]:
# adaBoost = AdaBoostAlgo()

# adaBoost.fit(af_datasplit.train_X, af_datasplit.train_y)
# print(adaBoost.params)
# metrics = adaBoost.evaluate(metrics=["precision", "recall", "f1", "f1_micro"], 
#                            X_test=af_datasplit.test_X, 
#                            y_test=af_datasplit.test_y)
# print(metrics)

# import pandas as pd
# pd.set_option('display.max_rows', 500)
# display(adaBoost.feature_importance.head())

In [6]:
# catBoost = CatBoostAlgo(verbose=0)

# catBoost.fit(af_datasplit.train_X, af_datasplit.train_y)
# print(catBoost.params)
# metrics = catBoost.evaluate(metrics=["precision", "recall", "f1", "f1_micro"], 
#                            X_test=af_datasplit.test_X, 
#                            y_test=af_datasplit.test_y)
# print(metrics)

# import pandas as pd
# pd.set_option('display.max_rows', 500)
# display(catBoost.feature_importance.tail())

In [7]:
# gboost = GradientBoostAlgo()

# gboost.fit(af_datasplit.train_X, af_datasplit.train_y)
# print(gboost.params)
# metrics = gboost.evaluate(metrics=["precision", "recall", "f1", "f1_micro"], 
#                            X_test=af_datasplit.test_X, 
#                            y_test=af_datasplit.test_y)
# print(metrics)

# import pandas as pd
# pd.set_option('display.max_rows', 500)
# display(gboost.feature_importance.head())

In [8]:
# logitBoost = LogitBoostAlgo()

# logitBoost.fit(af_datasplit.train_X, af_datasplit.train_y)
# print(logitBoost.params)
# metrics = logitBoost.evaluate(metrics=["precision", "recall", "f1", "f1_micro"], 
#                            X_test=af_datasplit.test_X, 
#                            y_test=af_datasplit.test_y)
# print(metrics)

# import pandas as pd
# pd.set_option('display.max_rows', 500)
# display(logitBoost.feature_importance.head())

In [9]:
# xgboost.save("saved_models/experiment_1/xgboost_elliptic_af")


# xgboost_load = XgbBoostAlgo()
# xgboost_load.load("saved_models/experiment_1/xgboost_elliptic_af")
# print(xgboost_load.params)
# metrics = xgboost_load.evaluate(metrics=["precision", "recall", "f1", "f1_micro"], 
#                            X_test=af_datasplit.test_X, 
#                            y_test=af_datasplit.test_y)    
# print(metrics)
# display(xgboost_load.feature_importance.head())

# y_pred = xgboost.predict(af_datasplit.test_X)
# print(f1_score(af_datasplit.test_y, y_pred, average="binary"))


# # ###############################

In [10]:
# # importing dependencies
# import cryptoaml.datareader as cdr
# from collections import OrderedDict
# from cryptoaml.models import XgbBoostAlgo, LightGbmAlgo

# import xgboost as xgb
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import RandomizedSearchCV

In [11]:
# # create a new instance of Elliptic Dataset
# ell = cdr.get_data("elliptic", 
#                    encode_classes=True)

In [12]:
# # xgbAlgo = XgbBoostAlgo() 
# # print(xgbAlgo.params)

# # lgb = LightGbmAlgo() 
# # print(lgb.params)
# af_datasplit = ell.get_data_split(train_perc=0.7, input_feats="AF", inc_unknown=False)

In [13]:
# grid_param = {"learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
#  "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
#  "min_child_weight" : [ 1, 3, 5, 7 ],
#  "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
#  "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ] }

# classifier = xgb.XGBClassifier()
# gd_sr = RandomizedSearchCV(estimator=classifier,
#                            param_distributions=grid_param,
#                            scoring="f1",
#                            n_iter = 10, 
#                            cv=5,
#                            n_jobs=-1)

# gd_sr.fit(af_datasplit.train_X, af_datasplit.train_y)

In [14]:
# best_parameters = gd_sr.best_params_
# print(best_parameters)

In [15]:
# best_model = gd_sr.best_estimator_
# print(best_model.get_params())

In [16]:
# y_pred = best_model.predict(af_datasplit.test_X)

In [17]:
# from sklearn.metrics import f1_score
# print(f1_score(af_datasplit.test_y, y_pred, average="binary"))

In [18]:
# from sklearn.metrics import f1_score
# classifier_2 = RandomForestClassifier(n_estimators=50, max_features=50)
# classifier_2.fit(af_datasplit.train_X, af_datasplit.train_y)
# y_pred_2 = classifier_2.predict(af_datasplit.test_X)
# print(f1_score(af_datasplit.test_y, y_pred_2, average="binary"))