In [39]:
from collections import Counter
import pandas as pd
import cryptoaml.datareader as cdr
from imblearn.under_sampling import NeighbourhoodCleaningRule

In [3]:
# Load the "elliptic" dataset through the project data reader and split it
# into train/test partitions (70% train, "AF_NE" feature set, metadata
# columns included, unknown-labelled samples excluded).
elliptic = cdr.get_data("elliptic")
split_options = dict(
    train_size=0.7,
    feat_set="AF_NE",
    inc_meta=True,
    inc_unknown=False,
)
data = elliptic.train_test_split(**split_options)

# Expose the four partitions under short names for the cells below.
train_X, train_y = data.train_X, data.train_y
test_X, test_y = data.test_X, data.test_y

# Report the class distribution of the train labels, then the test labels.
for split_labels in (train_y, test_y):
    counter = Counter(split_labels)
    print(counter)

Counter({0: 26432, 1: 3462})
Counter({0: 15587, 1: 1083})


In [43]:
# Undersample the training set with the Neighbourhood Cleaning Rule (NCR),
# applied separately to each group of rows sharing the same "ts" value, then
# stack the kept training rows on top of the untouched test rows.
ncr = NeighbourhoodCleaningRule(n_neighbors=3, threshold_cleaning=0.5)

# Stratified version of NCL: attach the labels to a copy of the training
# features so that features and labels can be grouped together by "ts".
tmp_data = train_X.copy()
tmp_data["label"] = train_y.copy()
ts_data = tmp_data.groupby("ts")

removed = 0
total_pre = tmp_data.shape[0]

# Kept rows are collected per group and concatenated once after the loop.
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copies the accumulated frame on every iteration.
kept_frames = []
for ts, group in ts_data:

    # Every column except the last one, which is the "label" column added
    # above (assumes train_X did not already contain a "label" column).
    grouped_X = group.iloc[:, :-1]
    # NCR is fitted on the AF_NE feature columns only.
    ts_X = grouped_X[elliptic.feature_cols_AF_NE_]
    ts_y = group["label"]
    counter = Counter(ts_y)
    print("Train set (ts:{}) counter Label: {}".format(ts, counter))

    X, y = ncr.fit_resample(ts_X, ts_y)
    # Positions (within this group) of the samples NCR decided to keep.
    indices = ncr.sample_indices_

    counter = Counter(y)
    print("Train set (ts:{}) counter after NCR Label: {}".format(ts, counter))

    total_removed = ts_X.shape[0] - X.shape[0]
    print("Total removed (ts:{}): {}".format(ts, total_removed))
    removed += total_removed

    # Select from grouped_X (not ts_X) so the kept rows retain every
    # non-label column, not just the columns NCR was fitted on.
    samples_kept = grouped_X.iloc[indices]
    print("Total samples kept (ts:{}): {}".format(ts, samples_kept.shape[0]))

    kept_frames.append(samples_kept)

print("-------------------------------------")
print("Total samples removed: {} from {}".format(removed, total_pre))

# Kept training rows first, then the full test set, re-indexed from 0.
undersampled_set = pd.concat(kept_frames + [test_X], ignore_index=True)
# Drop the NE feature columns; non-inplace so re-running stays predictable.
undersampled_set = undersampled_set.drop(columns=elliptic.feature_cols_NE_)
display(undersampled_set)


    
#     _, y = ncr.fit_resample(train_X[elliptic_data.feature_cols_AF_NE_], train_y)

#     display(grouped_X)
#     display(test_ts_X)
    
    
# undersample = NeighbourhoodCleaningRule(n_neighbors=3, threshold_cleaning=0.5)
# X, y = undersample.fit_resample(train_X[elliptic.feature_cols_AF_NE_], train_y)
# indices = undersample.sample_indices_
# counter = Counter(y)
# print(counter)

Train set (ts:1) counter Label: Counter({0: 2130, 1: 17})
Train set (ts:1) counter after NCR Label: Counter({0: 2090, 1: 17})
Total removed (ts:1): 40
Total samples kept (ts:1): 2107
Train set (ts:2) counter Label: Counter({0: 1099, 1: 18})
Train set (ts:2) counter after NCR Label: Counter({0: 1057, 1: 18})
Total removed (ts:2): 42
Total samples kept (ts:2): 1075
Train set (ts:3) counter Label: Counter({0: 1268, 1: 11})
Train set (ts:3) counter after NCR Label: Counter({0: 1238, 1: 11})
Total removed (ts:3): 30
Total samples kept (ts:3): 1249
Train set (ts:4) counter Label: Counter({0: 1410, 1: 30})
Train set (ts:4) counter after NCR Label: Counter({0: 1360, 1: 30})
Total removed (ts:4): 50
Total samples kept (ts:4): 1390
Train set (ts:5) counter Label: Counter({0: 1874, 1: 8})
Train set (ts:5) counter after NCR Label: Counter({0: 1853, 1: 8})
Total removed (ts:5): 21
Total samples kept (ts:5): 1861
Train set (ts:6) counter Label: Counter({0: 480, 1: 5})
Train set (ts:6) counter after 

Unnamed: 0,txId,ts,LF_0,LF_1,LF_2,LF_3,LF_4,LF_5,LF_6,LF_7,...,AGG_62,AGG_63,AGG_64,AGG_65,AGG_66,AGG_67,AGG_68,AGG_69,AGG_70,AGG_71
0,232438397,1,0.163054,1.963790,-0.646376,12.409294,-0.063725,9.782742,12.414558,-0.163645,...,-0.577099,-0.613614,0.241128,0.241406,1.072793,0.085530,-0.131155,0.677799,-0.120613,-0.119792
1,182448768,1,-0.075979,-0.081127,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,-0.064393,...,3.749738,3.038392,-0.979074,-0.978556,-0.098889,-0.087490,-0.084674,-0.140597,1.519700,1.521399
2,3420550,1,-0.172972,-0.184668,-1.201369,-0.046932,-0.024025,-0.029140,-0.061584,-0.163644,...,-0.570998,-0.596794,1.297854,1.297925,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792
3,3881034,1,-0.170492,-0.184668,-1.201369,-0.046932,-0.024025,-0.029140,-0.061584,-0.163641,...,-0.577099,-0.600999,0.241128,0.241406,0.018279,-0.068266,-0.093204,-0.068808,-0.120613,-0.119792
4,3878694,1,-0.172974,-0.184668,-1.201369,-0.046932,-0.024025,-0.029140,-0.061584,-0.163644,...,-0.577099,-0.600999,0.241128,0.241406,0.018279,-0.068266,-0.093204,-0.068808,1.299939,1.301521
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45035,157579675,49,-0.172137,0.439032,1.018602,0.778481,-0.043875,0.893345,-0.061584,-0.163637,...,-0.479951,-0.531617,-0.979074,-0.978556,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792
45036,158574917,49,-0.172811,0.041762,1.018602,0.103143,-0.043875,0.138585,-0.061584,-0.163631,...,-0.502370,-0.525310,-0.979074,-0.978556,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792
45037,194064677,49,-0.007779,-0.123405,1.018602,-0.121970,-0.043875,-0.113002,-0.061584,0.005384,...,-0.509843,-0.556847,-0.979074,-0.978556,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792
45038,157631750,49,0.713156,-0.134122,1.018602,-0.121970,-0.043875,-0.113002,-0.061584,0.743007,...,0.170195,0.017130,-0.979074,-0.978556,0.018279,0.412346,1.077353,1.022387,-1.760926,-1.760984


In [1]:
# samples_kept = train_X.iloc[indices]
# undersampled_set = samples_kept.append(test_X, ignore_index=True)

# print(elliptic.feature_cols_NE_)
# display(undersampled_set.drop(elliptic.feature_cols_NE_, axis=1))
# # display(undersampled_set.drop(elliptic.feature_cols_NE_[18]))
# # undersampled_set.to_csv("test.csv", index=False, header=False, )

{'objective': 'binary:logistic', 'base_score': 0.5, 'booster': None, 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 1, 'gamma': 0, 'gpu_id': -1, 'importance_type': 'gain', 'interaction_constraints': None, 'learning_rate': 0.300000012, 'max_delta_step': 0, 'max_depth': 6, 'min_child_weight': 1, 'missing': nan, 'monotone_constraints': None, 'n_estimators': 100, 'n_jobs': 0, 'num_parallel_tree': 1, 'random_state': 0, 'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': 1, 'subsample': 1, 'tree_method': None, 'validate_parameters': False, 'verbosity': None}
OrderedDict([('f1', 0.8065843621399178), ('recall', 0.7239150507848569), ('precision', 0.9105691056910569), ('auc', 0.8594875183352654), ('f1_micro', 0.9774445110977804)])
