In [9]:
from data_centric.models import ActiveLearner
import pandas as pd
import numpy as np 
from utils import feature_names, class2id
from flaml import AutoML

from data_centric.uncertainty import uncertainty_sampling, regression_uncertainity_combined, prediction_sampling


# Load the dataset that the rest of the notebook reads from (the file name
# suggests a reduced WMT 2020 export -- confirm against the data README).
DATA_CSV = "./data/wmt_2020_all_reduced.csv"
df = pd.read_csv(DATA_CSV)
def get_features(sample_id, frame=None, columns=None, label_column="severity"):
    """Return the feature matrix and label column for the requested rows.

    Parameters
    ----------
    sample_id : scalar or sequence of index labels
        Index label(s) looked up with ``.loc``. A single scalar label is
        accepted and treated as a one-row selection.
    frame : pandas.DataFrame, optional
        Table to read from. Defaults to the notebook-level ``df`` (resolved at
        call time, so the labels reflect the current state of that frame).
    columns : list of str, optional
        Feature columns to extract. Defaults to the imported ``feature_names``.
    label_column : str, optional
        Name of the label column. Defaults to ``"severity"``.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``(features, labels)`` where ``features`` has shape ``(n_rows, -1)``
        and ``labels`` has shape ``(n_rows, 1)``, with ``n_rows`` being the
        number of requested labels.
    """
    source = df if frame is None else frame
    feature_cols = feature_names if columns is None else columns

    # Normalise to a 1-D array so that a scalar label still selects a
    # DataFrame (not a Series) and has a well-defined length.
    ids = np.atleast_1d(sample_id)
    n_rows = len(ids)

    rows = source.loc[ids]
    features = rows[feature_cols].values.reshape(n_rows, -1)
    labels = np.array(rows[label_column]).reshape(n_rows, 1)
    return features, labels

# Map severity class names to integer ids via `class2id`.
# Assign the result back to the column explicitly: calling an inplace method on
# `df.severity` operates on an intermediate Series, which emits a chained-
# assignment warning in pandas 2.x and leaves `df` unchanged under Copy-on-Write.
df["severity"] = df["severity"].replace(class2id)


In [10]:
# Full pool: feature matrix plus the severity labels as a single-column array.
X = df.loc[:, feature_names].to_numpy()
y = df["severity"].to_numpy()[:, np.newaxis]

In [12]:
# Build the seed set for the learner: exactly one row per severity class.
# A plain `df.sample(k)` draws k arbitrary rows, which may repeat a class or
# miss one entirely, so the rows would not correspond to `unique_ys`.
INIT_SEED = 42  # fixed so that Restart & Run All reproduces the same seed set

unique_ys = np.unique(df["severity"]).reshape(-1, 1)
init_sample = (
    df.groupby("severity")
    .sample(n=1, random_state=INIT_SEED)
    # np.unique returns sorted values; sort the rows the same way so that
    # row i of the sample carries label unique_ys[i].
    .sort_values("severity")
)
init_sample_id = init_sample.index.values



In [13]:
# Train with labeled input data.
# Take features AND labels from the same sampled rows, so every feature vector
# is paired with its own severity rather than with an unrelated class id.
X_init, y_init = get_features(init_sample_id)
X_init.shape

(3, 128)

In [14]:

# AutoML goal and constraints, forwarded as keyword arguments to ActiveLearner.
automl_sev_settings = dict(
    time_budget=2,  # in seconds
    metric="micro_f1",
    estimator_list=["lgbm", "xgboost"],
    log_file_name="my_sev.log",
)

# Severity learner: a FLAML AutoML estimator wrapped in the project's
# ActiveLearner, seeded with the initial labeled rows and using
# prediction_sampling as the query strategy.
sev_estimator = AutoML()
model_sev = ActiveLearner(
    estimator=sev_estimator,
    embedding_pipeline="",
    X_training=X_init,
    y_training=y_init,
    query_strategy=prediction_sampling,
    **automl_sev_settings,
)

[flaml.automl: 02-10 12:04:21] {2007} INFO - task = classification
[flaml.automl: 02-10 12:04:21] {2009} INFO - Data split method: stratified
[flaml.automl: 02-10 12:04:21] {2013} INFO - Evaluation method: cv
[flaml.automl: 02-10 12:04:21] {1045} INFO - class 0 augmented from 1 to 20
[flaml.automl: 02-10 12:04:21] {1045} INFO - class 1 augmented from 1 to 20
[flaml.automl: 02-10 12:04:21] {1045} INFO - class 2 augmented from 1 to 20
[flaml.automl: 02-10 12:04:21] {2113} INFO - Minimizing error metric: 1-micro_f1
[flaml.automl: 02-10 12:04:21] {2170} INFO - List of ML learners in AutoML Run: ['lgbm', 'xgboost']
[flaml.automl: 02-10 12:04:21] {2437} INFO - iteration 0, current learner lgbm
[flaml.automl: 02-10 12:04:21] {2550} INFO - Estimated sufficient time budget=230s. Estimated necessary time budget=0s.
[flaml.automl: 02-10 12:04:21] {2597} INFO -  at 0.0s,	estimator lgbm's best error=0.6667,	best estimator lgbm's best error=0.6667
[flaml.automl: 02-10 12:04:21] {2437} INFO - iterati

<class 'numpy.ndarray'>


[flaml.automl: 02-10 12:04:21] {2597} INFO -  at 0.2s,	estimator lgbm's best error=0.0000,	best estimator lgbm's best error=0.0000
[flaml.automl: 02-10 12:04:21] {2437} INFO - iteration 8, current learner lgbm
[flaml.automl: 02-10 12:04:21] {2597} INFO -  at 0.2s,	estimator lgbm's best error=0.0000,	best estimator lgbm's best error=0.0000
[flaml.automl: 02-10 12:04:21] {2437} INFO - iteration 9, current learner xgboost
[flaml.automl: 02-10 12:04:21] {2597} INFO -  at 0.3s,	estimator xgboost's best error=0.0000,	best estimator lgbm's best error=0.0000
[flaml.automl: 02-10 12:04:21] {2437} INFO - iteration 10, current learner xgboost
[flaml.automl: 02-10 12:04:21] {2597} INFO -  at 0.3s,	estimator xgboost's best error=0.0000,	best estimator lgbm's best error=0.0000
[flaml.automl: 02-10 12:04:21] {2437} INFO - iteration 11, current learner xgboost
[flaml.automl: 02-10 12:04:21] {2597} INFO -  at 0.3s,	estimator xgboost's best error=0.0000,	best estimator lgbm's best error=0.0000
[flaml.au

In [17]:
# Ask the learner to select rows from the full feature matrix.
N_QUERY_INSTANCES = 10
query_idx, query_feature = model_sev.query(X, n_instances=N_QUERY_INSTANCES)


> [0;32m/home/ahmet/repos/human-benchmark/data_centric/uncertainty.py[0m(189)[0;36mprediction_sampling[0;34m()[0m
[0;32m    187 [0;31m    [0;32mif[0m [0;32mnot[0m [0mrandom_tie_break[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    188 [0;31m        [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 189 [0;31m        [0;32mreturn[0m [0mmulti_argmax[0m[0;34m([0m[0mprecitions[0m[0;34m,[0m [0mn_instances[0m[0;34m=[0m[0mn_instances[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    190 [0;31m[0;34m[0m[0m
[0m[0;32m    191 [0;31m    [0;32mreturn[0m [0mshuffled_argmax[0m[0;34m([0m[0mprecitions[0m[0;34m,[0m [0mn_instances[0m[0;34m=[0m[0mn_instances[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m


In [18]:
# Labels of the queried rows. Only the last expression of a cell is rendered,
# so a bare `X[query_idx]` placed before it would be evaluated and discarded.
y[query_idx]

array([[2],
       [2],
       [2],
       [2],
       [2],
       [2],
       [1],
       [1],
       [1],
       [2]])

In [8]:
# Feed the queried rows and their known labels back to the learner.
queried_X = X[query_idx]
queried_y = y[query_idx]
model_sev.teach(queried_X, queried_y)

[flaml.automl: 02-10 11:50:08] {2007} INFO - task = classification
[flaml.automl: 02-10 11:50:08] {2009} INFO - Data split method: stratified
[flaml.automl: 02-10 11:50:08] {2013} INFO - Evaluation method: cv
[flaml.automl: 02-10 11:50:08] {1045} INFO - class 0 augmented from 1 to 20
[flaml.automl: 02-10 11:50:08] {1045} INFO - class 1 augmented from 2 to 20
[flaml.automl: 02-10 11:50:08] {1045} INFO - class 2 augmented from 1 to 20
[flaml.automl: 02-10 11:50:08] {2113} INFO - Minimizing error metric: log_loss
[flaml.automl: 02-10 11:50:08] {2170} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'lrl1']
[flaml.automl: 02-10 11:50:08] {2437} INFO - iteration 0, current learner lgbm
[flaml.automl: 02-10 11:50:08] {2550} INFO - Estimated sufficient time budget=523s. Estimated necessary time budget=13s.
[flaml.automl: 02-10 11:50:08] {2597} INFO -  at 0.1s,	estimator lgbm's best error=0.9254,	best estimator lgbm's best error=0.

<class 'numpy.ndarray'>


[flaml.automl: 02-10 11:50:08] {2597} INFO -  at 0.2s,	estimator lgbm's best error=0.0040,	best estimator lgbm's best error=0.0040
[flaml.automl: 02-10 11:50:08] {2437} INFO - iteration 7, current learner lgbm
[flaml.automl: 02-10 11:50:08] {2597} INFO -  at 0.3s,	estimator lgbm's best error=0.0040,	best estimator lgbm's best error=0.0040
[flaml.automl: 02-10 11:50:08] {2437} INFO - iteration 8, current learner lgbm
[flaml.automl: 02-10 11:50:08] {2597} INFO -  at 0.3s,	estimator lgbm's best error=0.0040,	best estimator lgbm's best error=0.0040
[flaml.automl: 02-10 11:50:08] {2437} INFO - iteration 9, current learner xgboost
[flaml.automl: 02-10 11:50:08] {2597} INFO -  at 0.3s,	estimator xgboost's best error=0.6828,	best estimator lgbm's best error=0.0040
[flaml.automl: 02-10 11:50:08] {2437} INFO - iteration 10, current learner lgbm
[flaml.automl: 02-10 11:50:08] {2597} INFO -  at 0.4s,	estimator lgbm's best error=0.0029,	best estimator lgbm's best error=0.0029
[flaml.automl: 02-10 1

In [None]:
# Pool-based active-learning loop.
# Each round: query the pool, record the model's prediction for the queried
# rows next to their true labels, remove those rows from the pool, and teach
# the model with them.
N_QUERIES = 100  # number of query/teach rounds

loss_list = []
n = 1
X_pool = X
y_pool = y

sample_class_list = []
y_pred = []
y_true = []
for i in range(N_QUERIES):
    if len(X_pool) == 0:
        # Nothing left to query; stop early instead of querying an empty pool.
        break

    print("Querying new sample")
    query_idx, query_feature = model_sev.query(X_pool)
    x_selected = X_pool[query_idx]
    y_selected = y_pool[query_idx]
    # Report the label of the row just queried (not the running y_true list).
    print(f"Queried new sample with Label: {y_selected}")

    # np.delete returns new arrays, so the original X / y are left untouched.
    X_pool = np.delete(X_pool, query_idx, axis=0)
    y_pool = np.delete(y_pool, query_idx, axis=0)

    # Predict before teaching so the prediction is made on rows the model has
    # not yet been taught in this loop.
    y_pred_tmp = model_sev.predict(x_selected)
    y_pred.append(y_pred_tmp)
    y_true.append(y_selected)
    model_sev.teach(x_selected, y_selected)

In [31]:
# Display the prediction produced by the most recent loop iteration.
# NOTE(review): the recorded output shows a string label, whereas severity is
# mapped through `class2id` earlier -- confirm which run produced this output.
y_pred_tmp

array(['Minor'], dtype=object)