In [1]:
import numpy as np
import cryptoaml.datareader as cdr
from cryptoaml.models import RandomForestAlgo
from skmultiflow.meta.adaptive_random_forests import AdaptiveRandomForest


The sklearn.metrics.scorer module is  deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.metrics. Anything that cannot be imported from sklearn.metrics is now part of the private API.



In [2]:
# Load the "elliptic" dataset through the project's data reader and split it
# with 70% of the data used for training. Metadata columns and rows with an
# unknown label are excluded (inc_meta=False, inc_unknown=False).
# NOTE(review): "AF" presumably selects the "all features" set -- confirm
# against cryptoaml.datareader.
elliptic = cdr.get_data("elliptic")
data = elliptic.train_test_split(
    train_size=0.7,
    feat_set="AF",
    inc_meta=False,
    inc_unknown=False,
)

# Unpack the split into the module-level names used by the cells below.
train_X, train_y = data.train_X, data.train_y
test_X, test_y = data.test_X, data.test_y

In [3]:
# Baseline model: the project's RandomForestAlgo wrapper, built with its
# default constructor arguments and fitted once on the whole training split.
rf = RandomForestAlgo()
rf.fit(train_X, train_y)

In [4]:
def time_indexed_score(model, metric, X, y, pos_label=1):
    """Evaluate ``model`` separately on every timestep present in ``X["ts"]``.

    Parameters
    ----------
    model : object
        Must expose ``evaluate(metrics, X, y)`` returning a mapping that can
        be indexed by metric name.
    metric : str
        Name of the single metric to compute; it is passed to
        ``model.evaluate`` as a one-element list and used as the lookup key
        in the returned mapping.
    X : pandas.DataFrame
        Feature frame containing a ``"ts"`` column used for grouping.
    y : pandas.Series or array-like
        Labels for the rows of ``X`` (a Series is aligned on the index).
    pos_label : scalar, default 1
        Label value counted as "positive" for ``total_pos_label``.

    Returns
    -------
    list of dict
        One dict per timestep, in the order produced by ``groupby("ts")``,
        with the keys ``"timestep"``, ``"score"`` and ``"total_pos_label"``.
    """
    results = []

    # Work on a copy so the caller's frame is not modified.
    labelled = X.copy()
    labelled["label"] = y.copy()

    for ts, group in labelled.groupby("ts"):
        group_y = group["label"]
        # Drop the label by name rather than by position so the features fed
        # to the model never depend on where the "label" column sits.
        group_X = group.drop(columns="label")

        evaluation = model.evaluate([metric], group_X, group_y)

        # Count the positive class explicitly. Indexing value_counts() by
        # position returns the count of the second most frequent label (not
        # necessarily the positive one) and raises IndexError when a
        # timestep contains a single class.
        n_positive = int((group_y == pos_label).sum())

        results.append({
            "timestep": ts,
            "score": evaluation[metric],
            "total_pos_label": n_positive,
        })
    return results

In [5]:
# Score the fitted random forest on the test split with the "recall" metric,
# producing one result entry per distinct value of the "ts" column.
rf_results = time_indexed_score(rf, "recall", test_X, test_y)

In [6]:
# Dump the raw list of per-timestep result dicts.
print(rf_results)

[{'timestep': 35, 'score': 0.967032967032967, 'total_pos_label': 182}, {'timestep': 36, 'score': 1.0, 'total_pos_label': 33}, {'timestep': 37, 'score': 0.625, 'total_pos_label': 40}, {'timestep': 38, 'score': 0.9009009009009009, 'total_pos_label': 111}, {'timestep': 39, 'score': 0.9259259259259259, 'total_pos_label': 81}, {'timestep': 40, 'score': 0.6339285714285714, 'total_pos_label': 112}, {'timestep': 41, 'score': 0.9310344827586207, 'total_pos_label': 116}, {'timestep': 42, 'score': 0.7949790794979079, 'total_pos_label': 239}, {'timestep': 43, 'score': 0.0, 'total_pos_label': 24}, {'timestep': 44, 'score': 0.041666666666666664, 'total_pos_label': 24}, {'timestep': 45, 'score': 0.0, 'total_pos_label': 5}, {'timestep': 46, 'score': 0.5, 'total_pos_label': 2}, {'timestep': 47, 'score': 0.0, 'total_pos_label': 22}, {'timestep': 48, 'score': 0.0, 'total_pos_label': 36}, {'timestep': 49, 'score': 0.017857142857142856, 'total_pos_label': 56}]


In [7]:
# Incrementally train an Adaptive Random Forest by feeding it the training
# split one timestep at a time, in the order produced by grouping on "ts".
adaptive_rf = AdaptiveRandomForest(performance_metric="kappa")

# Attach the targets as a "label" column so every group carries its own
# labels (the column ends up last, assuming train_X has no "label" column).
tmp_data = train_X.assign(label=train_y)
ts_data = tmp_data.groupby("ts")

for ts, group in ts_data:
    print("Training TS:{}".format(ts))
    # Despite the "test_" prefix these are training rows: every column except
    # the last one is a feature, and "label" is the target.
    test_ts_X, test_ts_y = group.iloc[:, :-1], group["label"]
    adaptive_rf = adaptive_rf.partial_fit(
        test_ts_X.values,
        test_ts_y.values,
        classes=np.array([0, 1]),
    )

Training TS:1
Training TS:2
Training TS:3
Training TS:4
Training TS:5
Training TS:6
Training TS:7
Training TS:8
Training TS:9
Training TS:10
Training TS:11
Training TS:12
Training TS:13
Training TS:14
Training TS:15
Training TS:16
Training TS:17
Training TS:18
Training TS:19
Training TS:20
Training TS:21
Training TS:22
Training TS:23
Training TS:24
Training TS:25
Training TS:26
Training TS:27
Training TS:28
Training TS:29
Training TS:30
Training TS:31
Training TS:32
Training TS:33
Training TS:34


In [10]:
import xgboost as xgb
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from imblearn.under_sampling import NeighbourhoodCleaningRule

# Under-sampler kept for experimentation; it is not applied anywhere in this
# cell.
ncr = NeighbourhoodCleaningRule(n_neighbors=3, threshold_cleaning=0.5)

# Initial model: XGBoost with default parameters, fitted on the whole
# training split.
model = xgb.XGBClassifier()
model.fit(train_X, train_y)

# Test-then-train evaluation over the test split:
#   1. the first test timestep is predicted with the initial model;
#   2. for every timestep ts, the model is refitted on the rows of ts
#      (passing the current booster through `xgb_model`, which per the
#      XGBoost docs continues training from it) and then predicts ts + 1.
class_labels = [0, 1]

all_x = 0  # running sum of the per-timestep F1 scores
tmp_data = test_X.copy()
tmp_data["label"] = test_y.copy()
predictions = []
# Ground-truth labels collected in the same order as `predictions`, so the
# overall metrics do not depend on how the rows of test_X / test_y are sorted.
y_true = []
matrix_all = np.array([[0, 0], [0, 0]])

first_ts = test_X["ts"].min()
last_ts = test_X["ts"].max()

for ts in np.arange(first_ts, last_ts):
    train_set = tmp_data[tmp_data["ts"] == ts]
    train_set_X = train_set.iloc[:, :-1]
    train_set_y = train_set["label"]

    test_set = tmp_data[tmp_data["ts"] == ts + 1]
    test_set_X = test_set.iloc[:, :-1]
    test_set_y = test_set["label"]

    # The first timestep has no preceding test timestep to adapt on, so it is
    # scored with the model as fitted on the training split. Comparing with
    # first_ts (instead of a literal 35) keeps this working for other splits.
    if ts == first_ts:
        y_pred = model.predict(train_set_X)
        predictions.append(y_pred)
        y_true.append(train_set_y.values)
        evaluation = f1_score(train_set_y, y_pred, average='binary')
        # Fix the label set so the matrix is always 2x2, even when a
        # timestep happens to contain (or be predicted as) a single class.
        matrix = confusion_matrix(train_set_y, y_pred, labels=class_labels)
        matrix_all = matrix_all + matrix

        print(matrix)

        all_x += evaluation
        print("TS: {}, Score: {}".format(ts , evaluation))

    # Adapt on the current timestep, then predict the next one.
    booster = model.get_booster()
    model.fit(train_set_X, train_set_y, xgb_model=booster)
    y_pred = model.predict(test_set_X)
    predictions.append(y_pred)
    y_true.append(test_set_y.values)

    # Compute the score first, then accumulate it (accumulating before the
    # update would add the previous timestep's score a second time).
    evaluation = f1_score(test_set_y, y_pred, average='binary')
    all_x += evaluation
    matrix = confusion_matrix(test_set_y, y_pred, labels=class_labels)
    matrix_all = matrix_all + matrix
    print("TS: {}, Score: {}".format(ts + 1 , evaluation))


# Overall confusion matrix, F1 and recall across every evaluated timestep.
y_true_all = np.concatenate(y_true, axis=0)
y_pred_all = np.concatenate(predictions, axis=0)
print(matrix_all)
print(f1_score(y_true_all, y_pred_all, average='binary'))
print(recall_score(y_true_all, y_pred_all, average='binary'))

[[1152    7]
 [   6  176]]
TS: 35, Score: 0.9643835616438357
TS: 36, Score: 1.0
TS: 37, Score: 0.8235294117647058
TS: 38, Score: 0.8942307692307693
TS: 39, Score: 0.9102564102564102
TS: 40, Score: 0.7177033492822966
TS: 41, Score: 0.9166666666666667
TS: 42, Score: 0.8439560439560441
TS: 43, Score: 0.17391304347826086
TS: 44, Score: 0.1846153846153846
TS: 45, Score: 0.0
TS: 46, Score: 0.5
TS: 47, Score: 0.11764705882352942
TS: 48, Score: 0.38235294117647056
TS: 49, Score: 0.875
[[15406   181]
 [  230   853]]
0.8058573452999529
0.7876269621421976


In [11]:
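# Overall F1 printed by the incremental XGBoost cell above:     0.8058573452999529
# Overall recall printed by the incremental XGBoost cell above: 0.7876269621421976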

In [102]:
from skmultiflow.meta import LearnNSE
# Learn++.NSE ensemble of up to 10 XGBoost base learners with error-based
# pruning, trained incrementally on the training split.
learn_nse = LearnNSE(
    base_estimator=xgb.XGBClassifier(),
    n_estimators=10,
    pruning="error",
)

# Attach the targets as a "label" column so every group carries its own
# labels (the column ends up last, assuming train_X has no "label" column).
tmp_data = train_X.assign(label=train_y)
ts_data = tmp_data.groupby("ts")

for ts, group in ts_data:
    print("Training TS:{}".format(ts))
    # Despite the "test_" prefix these are training rows.
    test_ts_X, test_ts_y = group.iloc[:, :-1], group["label"]

    # Set the ensemble's window size to the number of rows in this timestep
    # before handing the timestep over.
    window_size = len(test_ts_X)
    learn_nse.set_params(window_size=window_size)
    learn_nse = learn_nse.partial_fit(
        test_ts_X.values,
        test_ts_y.values,
        classes=np.array([0, 1]),
    )

Training TS:1
Training TS:2
Training TS:3
Training TS:4
Training TS:5
Training TS:6
Training TS:7
Training TS:8
Training TS:9
Training TS:10
Training TS:11
Training TS:12
Training TS:13
Training TS:14
Training TS:15
Training TS:16
Training TS:17
Training TS:18
Training TS:19
Training TS:20
Training TS:21
Training TS:22
Training TS:23
Training TS:24
Training TS:25
Training TS:26
Training TS:27
Training TS:28
Training TS:29
Training TS:30
Training TS:31
Training TS:32
Training TS:33
Training TS:34


In [101]:
# Test-then-train evaluation of the Learn++.NSE ensemble on the test split:
# for each timestep ts, update the ensemble with the rows of ts and report
# the binary F1 score of its predictions on ts + 1.
#
# NOTE(review): the output recorded for this cell is F1 = 0.0 at every
# timestep, and the execution counts (this cell is In [101], the training
# cell above is In [102]) indicate it was last run before that cell --
# re-run the notebook top to bottom and confirm the result.
# NOTE(review): `f1_score` is only in scope through the import in the XGBoost
# cell, and `matrix_all` is reset here but never updated afterwards.
tmp_data = test_X.assign(label=test_y)
matrix_all = np.zeros((2, 2), dtype=int)

for ts in np.arange(test_X["ts"].min(), test_X["ts"].max()):
    # Rows of the current timestep: used to update the ensemble.
    train_set = tmp_data.loc[tmp_data["ts"] == ts]
    train_set_X, train_set_y = train_set.iloc[:, :-1], train_set["label"]

    # Rows of the following timestep: used for scoring (as plain arrays).
    test_set = tmp_data.loc[tmp_data["ts"] == ts + 1]
    test_set_X = test_set.iloc[:, :-1].values
    test_set_y = test_set["label"].values

    # Window size follows the number of rows in the current timestep.
    window_size = len(train_set_X)
    learn_nse.set_params(window_size=window_size)
    learn_nse = learn_nse.partial_fit(
        train_set_X.values,
        train_set_y.values,
        classes=np.array([0, 1]),
    )

    y_pred = learn_nse.predict(test_set_X)
    evaluation = f1_score(test_set_y, y_pred, average='binary')
    print("TS {}: {}".format(ts+1, evaluation))

TS 36: 0.0
TS 37: 0.0
TS 38: 0.0
TS 39: 0.0
TS 40: 0.0
TS 41: 0.0
TS 42: 0.0
TS 43: 0.0
TS 44: 0.0
TS 45: 0.0
TS 46: 0.0
TS 47: 0.0
TS 48: 0.0
TS 49: 0.0
