In [4]:
import pandas as pd
import numpy as np
import flaml
import sklearn.metrics
from flaml import AutoML

In [7]:
# Load the standardized feature table from its pickle file and preview
# the first rows as the cell output.
STANDARDIZED_PICKLE = 'standardization_data.pkl'
data = pd.read_pickle(STANDARDIZED_PICKLE)
data.head()

Unnamed: 0,nonce,transaction_index,value,gas,gas_price,input,receipt_gas_used,from_scam,to_scam,months,days,hours,minutes
0,4.927516,-0.590909,1.052132,-0.072464,0.048111,0,0.0,0,0,1.54363,-0.230799,-0.934209,-0.164294
1,3.047148,0.329545,-0.16304,0.797101,0.014304,0,0.0,0,0,1.54363,-0.230799,-0.934209,-0.04998
2,3.047321,-0.284091,-0.16304,0.797101,0.014304,0,0.0,0,0,1.54363,-0.230799,-0.934209,0.007177
3,3.047734,0.25,-0.16304,0.797101,0.014304,0,0.0,0,0,1.54363,-0.230799,-0.934209,0.007177
4,3.0478,0.306818,-0.16304,0.797101,0.014304,0,0.0,0,0,1.54363,-0.230799,-0.934209,0.007177


# Validation Split Approach

In [8]:
from sklearn.model_selection import train_test_split

# Fixed seed so the hold-out split (and every score computed from it) is
# identical after a kernel restart.
SPLIT_SEED = 42

# Pull the two label columns out of the feature table. `data` is left holding
# features only, exactly as before. The membership guards make the cell safe
# to re-run: once a label has been popped, a second execution keeps the
# existing y_* series instead of raising KeyError.
if 'from_scam' in data.columns:
    y_fromscam = data.pop('from_scam')
if 'to_scam' in data.columns:
    y_toscam = data.pop('to_scam')

# First target: predict `from_scam`.
# NOTE(review): if scam labels are rare, consider also passing
# stratify=y_fromscam so both splits keep the same class ratio.
x_train, x_test, y_train, y_test = train_test_split(
    data, y_fromscam, random_state=SPLIT_SEED
)

# Desired Metrics and Models
## Metrics
- Recall
- Precision
- Accuracy
- F1
- AUC

## Models
- XGBoost
- Random Forest
- L1-Logistic
- L2-Logistic
- KNN
- SVM
- Naive Bayes
- DNN
- Ensemble


In [11]:
# Run one FLAML AutoML binary-classification search per metric of interest,
# restricted to the first three learners (xgboost, rf, lrl1).
# Each fitted AutoML object is collected in `results`, in metric order.
results = []
for metric_name in ('accuracy', 'roc_auc', 'f1'):
    # The metric name doubles as the log file name for that search.
    automl_settings = dict(
        metric=metric_name,
        task='classification',
        log_file_name=metric_name,
    )

    automl = AutoML()
    automl.fit(
        X_train=x_train,
        y_train=y_train,
        estimator_list=['xgboost', 'rf', 'lrl1'],
        **automl_settings,
    )

    results.append(automl)

[flaml.automl: 02-18 11:19:23] {2055} INFO - task = classification
[flaml.automl: 02-18 11:19:23] {2057} INFO - Data split method: stratified
[flaml.automl: 02-18 11:19:23] {2061} INFO - Evaluation method: holdout
[flaml.automl: 02-18 11:19:23] {2142} INFO - Minimizing error metric: 1-accuracy
[flaml.automl: 02-18 11:19:23] {2200} INFO - List of ML learners in AutoML Run: ['xgboost', 'rf', 'lrl1']
[flaml.automl: 02-18 11:19:23] {2453} INFO - iteration 0, current learner xgboost
  import pandas.util.testing as tm
[flaml.automl: 02-18 11:19:24] {2569} INFO - Estimated sufficient time budget=18859s. Estimated necessary time budget=193s.
[flaml.automl: 02-18 11:19:24] {2621} INFO -  at 0.5s,	estimator xgboost's best error=0.0269,	best estimator xgboost's best error=0.0269
[flaml.automl: 02-18 11:19:24] {2453} INFO - iteration 1, current learner xgboost
[flaml.automl: 02-18 11:19:24] {2621} INFO -  at 0.6s,	estimator xgboost's best error=0.0269,	best estimator xgboost's best error=0.0269
[f

In [12]:
# Persist the fitted AutoML runs to disk so they can be restored later
# without refitting. The file name spelling is kept exactly as the cell
# that reloads it expects.
import pickle

with open('standrdization_results_0_3.pkl', 'wb') as results_file:
    pickle.dump(results, results_file)

In [13]:
# Restore the previously saved AutoML runs into `results`.
# NOTE: pickle.load can execute arbitrary code from the file; only load
# pickles written by this notebook.
with open('standrdization_results_0_3.pkl', 'rb') as results_file:
    results = pickle.load(results_file)

In [19]:
# For each metric of interest, run AutoML binary classification for the
# last 3 models (lgbm, lrl2, kneighbor).
#
# NOTE(review): the recorded run of this cell stopped with
# "ValueError: current limit exceeds maximum limit" and `results` still had
# 3 entries afterwards, so none of these searches completed. The recorded log
# also lists an 'extra_tree' learner that is not in this estimator list, so
# that output appears to predate the current code. Re-run and confirm.
last_models_results = []
for metric in ['accuracy', 'roc_auc', 'f1']:
    automl1 = AutoML()
    automl_settings = {
        "metric": metric,
        'task': 'classification',
        # Distinct log name: the first sweep logs to the bare metric name,
        # and reusing it here would overwrite those logs.
        'log_file_name': f'{metric}_lgbm_lrl2_kneighbor',
    }

    automl1.fit(X_train=x_train, y_train=y_train,
                **automl_settings, estimator_list=['lgbm', 'lrl2', 'kneighbor'])

    last_models_results.append(automl1)

# Extend the shared list only after every metric has finished, so a failure
# part-way through never leaves `results` with a partial set of runs that a
# retry would then duplicate.
results.extend(last_models_results)

[flaml.automl: 02-18 11:25:39] {2055} INFO - task = classification
[flaml.automl: 02-18 11:25:39] {2057} INFO - Data split method: stratified
[flaml.automl: 02-18 11:25:39] {2061} INFO - Evaluation method: holdout
[flaml.automl: 02-18 11:25:39] {2142} INFO - Minimizing error metric: 1-accuracy
[flaml.automl: 02-18 11:25:39] {2200} INFO - List of ML learners in AutoML Run: ['extra_tree', 'lgbm', 'lrl2', 'kneighbor']
[flaml.automl: 02-18 11:25:39] {2453} INFO - iteration 0, current learner extra_tree
[flaml.automl: 02-18 11:25:40] {2569} INFO - Estimated sufficient time budget=15574s. Estimated necessary time budget=47s.
[flaml.automl: 02-18 11:25:40] {2621} INFO -  at 0.5s,	estimator extra_tree's best error=0.0368,	best estimator extra_tree's best error=0.0368
[flaml.automl: 02-18 11:25:40] {2453} INFO - iteration 1, current learner lgbm
[flaml.automl: 02-18 11:25:40] {2621} INFO -  at 0.5s,	estimator lgbm's best error=0.0368,	best estimator extra_tree's best error=0.0368
[flaml.automl:

ValueError: current limit exceeds maximum limit

In [20]:
# Sanity check: how many fitted AutoML runs are currently held in `results`.
len(results)

3

In [13]:
# Print a plain-text report for every fitted AutoML run: best estimator,
# best configuration and its training time, best iteration, best loss,
# time taken to find the best model, and the configuration history,
# followed by a separator line.
for fitted in results:
    print("Best Estimator: ", "", fitted.model.estimator, "", sep="\n")
    print("Best Configuration: ", "", fitted.best_config, sep="\n")
    print("Best configuration train time: ", "", fitted.best_config_train_time, "", sep="\n")
    print("Best Iteration", fitted.best_iteration, "", sep="\n")
    print("Best loss", fitted.best_loss, "", sep="\n")
    # These two values are printed without a heading, as before.
    print(fitted.time_to_find_best_model, fitted.config_history, "", sep="\n")
    print("////////////////////////////////////////////////////////////////////////////////////////////////////////////", "", sep="\n")

Best Estimator: 

XGBClassifier(base_score=0.5, booster='gbtree',
              colsample_bylevel=0.974982860880507, colsample_bynode=1,
              colsample_bytree=0.6370684336276642, gamma=0, gpu_id=-1,
              grow_policy='lossguide', importance_type='gain',
              interaction_constraints='', learning_rate=0.02896596391154565,
              max_delta_step=0, max_depth=0, max_leaves=251,
              min_child_weight=3.4544988117157507, missing=nan,
              monotone_constraints='()', n_estimators=515, n_jobs=-1,
              num_parallel_tree=1, random_state=0, reg_alpha=0.0009765625,
              reg_lambda=0.008415949612508924, scale_pos_weight=1,
              subsample=0.9391146021612687, tree_method='hist',
              use_label_encoder=False, validate_parameters=1, verbosity=0)

Best Configuration: 

{'n_estimators': 515, 'max_leaves': 251, 'min_child_weight': 3.4544988117157507, 'learning_rate': 0.02896596391154565, 'subsample': 0.9391146021612687, '