In [1]:
###### importing dependencies #############################################
import pprint
import seaborn as sns
import cryptoaml.datareader as cdr
from collections import OrderedDict
from cryptoaml.metrics import results_table 
from cryptoaml.models import RandomForestAlgo, XgboostAlgo, LightGbmAlgo, CatBoostAlgo



In [2]:
# Load the Elliptic dataset and split it into train/test partitions, one
# entry per requested feature set ("LF" and "AF").
# NOTE(review): train_size=0.7 presumably means 70% of the data is used for
# training and inc_meta=False presumably drops metadata columns -- confirm
# against cryptoaml.datareader.
elliptic = cdr.get_data("elliptic")
split_options = dict(train_size=0.7, feat_set=["LF", "AF"], inc_meta=False)
elliptic_sets = elliptic.train_test_split(**split_options)

In [3]:
# Baseline (untuned) models, registered under each wrapper's `model_name_`.
# Only parallelism / verbosity is configured, except for the random forest,
# which additionally fixes n_estimators=50 and max_features=50.
rf_default = RandomForestAlgo(n_estimators=50, max_features=50, n_jobs=-1)

# XGBoost with its default settings gives reproducible results:
# => the default booster is 'gbtree'; the 'gblinear' booster with the shotgun
#    updater is nondeterministic because it uses the Hogwild algorithm
# => subsample and colsample_by_* default to 1, so no random sampling is used
xgb_default = XgboostAlgo(n_jobs=-1)

light_default = LightGbmAlgo(n_jobs=-1)

cat_default = CatBoostAlgo(thread_count=-1, verbose=False)

# Insertion order (RF, XGBoost, LightGBM, CatBoost) is the order in which the
# models are trained and reported later on.
models_default = OrderedDict(
    (algo.model_name_, algo)
    for algo in (rf_default, xgb_default, light_default, cat_default)
)

In [4]:
# Train every baseline model on every Elliptic feature set and collect the
# evaluation metrics on the matching test split.
#
# Resulting layout:
#   results["elliptic"][model_name][feature_set] -> value returned by model.evaluate

results = OrderedDict()
results["elliptic"] = OrderedDict()
metrics = ["precision", "recall", "f1", "f1_micro", "confusion"]

for model_key, model in models_default.items():
    # The outer loop iterates over models, so the banner is labelled "Model"
    # (it used to print the model name under the label "Feature Set").
    print("\n######################################################")
    print("Elliptic Dataset - Model [{0}] ".format(model_key))
    print("######################################################")
    results["elliptic"][model_key] = OrderedDict()

    for feature_set, feature_set_data in elliptic_sets.items():
        print("- Training & Extracting Results - Feature Set [{}]".format(feature_set))

        # The same model object is fitted again for each feature set.
        model.fit(feature_set_data.train_X, feature_set_data.train_y)

        # Score on the held-out split of the same feature set.
        tmp_result = model.evaluate(metrics=metrics,
                                    X=feature_set_data.test_X,
                                    y=feature_set_data.test_y)

        # Log the metrics together with the parameters that produced them.
        pprint.pprint(tmp_result)
        pprint.pprint(model.get_params())

        results["elliptic"][model_key][feature_set] = tmp_result


######################################################
Elliptic Dataset - Feature Set [random_forest] 
######################################################
- Training & Extracting Results - Feature Set [LF]
OrderedDict([('precision', 0.8905882352941177),
             ('recall', 0.6989843028624192),
             ('f1', 0.7832384893947233),
             ('f1_micro', 0.9748650269946011),
             ('confusion', array([[15494,    93],
       [  326,   757]]))])
{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 50,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 50,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}
- Training & Extracting Results - Feature Set [AF]
OrderedDict([('precision', 0.8966704936854191),
 

In [5]:
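# Scratch notes: these look like F1 scores copied from repeated runs of the
# training loop above (the XGBoost / LightGBM / CatBoost values match the LF
# and AF rows of the results table further down) -- TODO confirm.
# RF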
# 0.7801920161697827
# 0.8146214099216711

# 0.7828715365239295
# 0.8103626943005181

# XGBoost 
# 0.7794871794871795
# 0.7794871794871795

# 0.8026652998462326
# 0.8026652998462326

# Light 
# 0.7789580171977744
# 0.7789580171977744

# 0.813929313929314
# 0.813929313929314

# CAT 
# 0.7892004153686397
# 0.7892004153686397
# 0.7892004153686397

# 0.8187565858798735
# 0.8187565858798735
# 0.8187565858798735

In [6]:
# display results 
# print(results)
# models = [x[0] for x in results["elliptic"]["LF"].items()]
# f1_scores = [x["f1"] for x in results["elliptic"]["LF"].values()]

# print(f1_scores)
# ax = sns.barplot(x=models, y=f1_scores)
# ax.set(ylim=(0, 1))



In [7]:
# f1_scores = [x["f1"] for x in results["elliptic"]["AF"].values()]

# print(f1_scores)
# ax = sns.barplot(x=models, y=f1_scores)
# ax.set(ylim=(0, 1))

In [8]:
# confusion_matrix = [x["confusion"] for x in results["elliptic"]["AF"].values()]
# print(confusion_matrix)

In [12]:
import matplotlib.pyplot as plt

def confusion_plt(data, title, ax=None):
    """Draw a confusion matrix as an annotated heatmap.

    Parameters
    ----------
    data
        Rectangular data handed straight to ``sns.heatmap``.
    title
        Text set as the title of the resulting axes.
    ax : optional
        Axes to draw on, forwarded to ``sns.heatmap``. With the default
        ``None`` seaborn picks the currently active axes, which is what this
        function always did before the parameter existed.

    Returns
    -------
    The axes object returned by ``sns.heatmap``, with the title applied.
    """
    ax = sns.heatmap(data,
                     annot=True,   # write the value inside every cell
                     cmap="Blues",
                     fmt="g",      # seaborn's default ".2g" would round counts to 2 significant digits
                     ax=ax)
    ax.set_title(title)
    return ax

In [10]:
# a = confusion_plt(confusion_matrix[0], "test")
# b = confusion_plt(confusion_matrix[0], "test")
# c = confusion_plt(confusion_matrix[0], "test")
# d = confusion_plt(confusion_matrix[0], "test")


# ACCURARCY        = "accuracy"
# F1_BINARY        = "f1"
# F1_MICRO         = "f1_micro"
# RECALL_BINARY    = "recall"
# PRECISION_BINARY = "precision"

# results_table_metrics = {ACCURARCY, F1_BINARY, F1_MICRO, RECALL_BINARY, PRECISION_BINARY}

    
# Flatten the nested Elliptic results into one table, then show it twice:
# first in insertion order, then ranked from best to worst f1.
results_tbl = results_table(results["elliptic"])
ranked_by_f1 = results_tbl.sort_values(by="f1", ascending=False)
display(results_tbl, ranked_by_f1)

Unnamed: 0,model,precision,recall,f1,f1_micro
0,random_forest_LF,0.890588,0.698984,0.783238,0.974865
1,random_forest_AF,0.89667,0.721145,0.799386,0.976485
2,xg_boost_LF,0.876586,0.701754,0.779487,0.974205
3,xg_boost_AF,0.902074,0.722992,0.802665,0.976905
4,light_boost_LF,0.861298,0.710988,0.778958,0.973785
5,light_boost_AF,0.931034,0.722992,0.813929,0.978524
6,cat_boost_LF,0.901542,0.701754,0.7892,0.975645
7,cat_boost_AF,0.953374,0.717452,0.818757,0.979364


Unnamed: 0,model,precision,recall,f1,f1_micro
7,cat_boost_AF,0.953374,0.717452,0.818757,0.979364
5,light_boost_AF,0.931034,0.722992,0.813929,0.978524
3,xg_boost_AF,0.902074,0.722992,0.802665,0.976905
1,random_forest_AF,0.89667,0.721145,0.799386,0.976485
6,cat_boost_LF,0.901542,0.701754,0.7892,0.975645
0,random_forest_LF,0.890588,0.698984,0.783238,0.974865
2,xg_boost_LF,0.876586,0.701754,0.779487,0.974205
4,light_boost_LF,0.861298,0.710988,0.778958,0.973785


In [11]:
# fig = plt.figure(figsize=(17,15))
# fig.subplots_adjust(hspace=0.3, wspace=0.2)
# i = 1
# for x in confusion_matrix:
#     ax = fig.add_subplot(2, 2, i)
#     sns.set(font_scale=1.4) 
#     confusion_plt(x, "test")
#     i = i+1