In [1]:
import os
from os.path import join
import re
import pickle

from glob import glob
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from vowpalwabbit.DFtoVW import DFtoVW
from vowpalwabbit.pyvw import vw

from sklearn.metrics import mean_squared_error
import random
''' import AutoVW class from flaml package '''
from flaml import AutoVW

# Graphical
SUPTITLE_FONTSIZE = 20
SUPTITLE_FONTWEIGHT = "bold"
TITLE_FONTSIZE = 15

from utils import default_feature_str, default_feature_str, get_test_example, get_vw_examples, get_training_example

## Prepare the dataset in the vowpalwabbit format

In [2]:
feature_names =["Num token","Num char","Avg word length","Num ADJ","Num ADP","Num ADV","Num AUX","Num CCONJ","Num DET","Num INTJ","Num NOUN","Num NUM","Num PART","Num PRON","Num PROPN","Num PUNCT","Num SCONJ","Num SYM","Num VERB","Num X","Num LOC","Num MISC","Num ORG","Num PER","Num Abbr=Yes","Num Case=Acc","Num Case=Nom","Num Definite=Def","Num Definite=Ind","Num Degree=Cmp","Num Degree=Pos","Num Degree=Sup","Num Foreign=Yes","Num Gender=Fem","Num Gender=Masc","Num Gender=Neut","Num Mood=Imp","Num Mood=Ind","Num NumForm=Digit","Num NumForm=Word","Num NumType=Card","Num NumType=Mult","Num NumType=Ord","Num Number=Plur","Num Number=Sing","Num Person=1","Num Person=2","Num Person=3","Num Polarity=Neg","Num Poss=Yes","Num PronType=Art","Num PronType=Dem","Num PronType=Int","Num PronType=Prs","Num PronType=Rel","Num Reflex=Yes","Num Tense=Past","Num Tense=Pres","Num VerbForm=Fin","Num VerbForm=Ger","Num VerbForm=Inf","Num VerbForm=Part","Num Voice=Pass","Num Style=Expr","Num NumForm=Roman","Num Mood=Cnd","Num Mood=Sub","Num Number[psor]=Plur","Num Number[psor]=Sing","Num Person[psor]=1","Num Person[psor]=2","Num Person[psor]=3","Num PronType=Exc","Num PronType=Ind","Num PronType=Neg","Num Tense=Fut","Num Tense=Imp","Num Typo=Yes","Num Case=Dat","Num Case=Gen","Num Gender[psor]=Masc,Neut","Num Animacy=Anim","Num Animacy=Inan","Num Aspect=Imp","Num Aspect=Perf","Num Case=Ins","Num Case=Loc","Num Variant=Short","Num VerbForm=Conv","Num Voice=Act","Num Voice=Mid","Num AdpType=Comprep","Num AdpType=Prep","Num AdpType=Voc","Num Case=Voc","Num ConjType=Oper","Num Gender=Fem,Masc","Num Gender=Fem,Neut","Num Gender=Masc,Neut","Num Gender[psor]=Fem","Num Gender[psor]=Masc","Num Hyph=Yes","Num NameType=Com","Num NameType=Geo","Num NameType=Giv","Num NameType=Nat","Num NameType=Sur","Num NumType=Frac","Num NumType=Sets","Num NumValue=1","Num NumValue=1,2,3","Num Number=Dual","Num Number=Plur,Sing","Num Polarity=Pos","Num PrepCase=Npr","Num PrepCase=Pre","Num PronType=Emp","Num PronType=Int,Rel","Num PronType=Tot","Num Style=Arch","Num Style=Coll",
        ]
        
X_train = np.load("./data/X_train.npy")
y_train = np.load("./data/y_train.npy")
X_test = np.load("./data/X_test.npy")
y_test = np.load("./data/y_test.npy")

In [6]:
import numpy as np
import string
import pickle
NS_LIST = list(string.ascii_lowercase) + list(string.ascii_uppercase)
max_ns_num = 15 # the maximum number of namespaces
orginal_dim = 128
max_size_per_group = int(np.ceil(orginal_dim / float(max_ns_num)))
# sequential grouping
group_indexes = []
for i in range(max_ns_num):
    indexes = [ind for ind in range(i * max_size_per_group,
                min((i + 1) * max_size_per_group, orginal_dim))]
    if len(indexes) > 0:
        group_indexes.append(indexes)

In [7]:
#pickle save and load group_indexes
with open("./data/group_indexes.pkl", "wb") as f:
    pickle.dump(group_indexes, f)
with open("./data/group_indexes.pkl", "rb") as f:
    group_indexes = pickle.load(f)


In [8]:
vw_examples = []
for i in range(X_train.shape[0]):
    ns_content = []
    for zz in range(len(group_indexes)):
        ns_features = ' '.join('{}:{:.6f}'.format(ind, X_train[i][ind]) for ind in group_indexes[zz])
        ns_content.append(ns_features)
    ns_line = '{} |{}'.format(str(y_train[i]), '|'.join('{} {}'.format(NS_LIST[j], ns_content[j]) for j in range(len(group_indexes))))
    vw_examples.append(ns_line)

In [9]:
x = X_test[0]
y=y_test[0]

def get_training_sample(x, y):
    ns_content = []
    for zz in range(len(group_indexes)):
        ns_features = ' '.join('{}:{:.6f}'.format(ind, x[ind]) for ind in group_indexes[zz])
        ns_content.append(ns_features)
    ns_line = '{} |{}'.format( str(y), '|'.join('{} {}'.format(NS_LIST[j], ns_content[j]) for j in range(len(group_indexes))))
    return ns_line



def get_test_sample(x):
    ns_content = []
    for zz in range(len(group_indexes)):
        ns_features = ' '.join('{}:{:.6f}'.format(ind, x[ind]) for ind in group_indexes[zz])
        ns_content.append(ns_features)
    ns_line = '|{}'.format( '|'.join('{} {}'.format(NS_LIST[j], ns_content[j]) for j in range(len(group_indexes))))
    return ns_line

In [10]:
# vw_examples= get_vw_examples(X_train, y_train, isTrain=False)
# vw_examples

In [11]:
get_test_sample(X_test[0])

'|a 0:8.000000 1:107.000000 2:1.155844 3:-1.000000 4:-1.000000 5:1.000000 6:0.000000 7:0.000000 8:2.000000|b 9:0.000000 10:-1.000000 11:0.000000 12:-1.000000 13:2.000000 14:0.000000 15:3.000000 16:1.000000 17:0.000000|c 18:3.000000 19:0.000000 20:-1.000000 21:2.000000 22:0.000000 23:0.000000 24:0.000000 25:10.000000 26:9.000000|d 27:3.000000 28:-1.000000 29:0.000000 30:-5.000000 31:0.000000 32:0.000000 33:15.000000 34:10.000000 35:5.000000|e 36:0.000000 37:2.000000 38:0.000000 39:0.000000 40:0.000000 41:0.000000 42:0.000000 43:4.000000 44:13.000000|f 45:0.000000 46:0.000000 47:5.000000 48:0.000000 49:0.000000 50:2.000000 51:1.000000 52:0.000000 53:1.000000|g 54:0.000000 55:1.000000 56:-1.000000 57:1.000000 58:2.000000 59:0.000000 60:0.000000 61:1.000000 62:2.000000|h 63:0.000000 64:0.000000 65:0.000000 66:0.000000 67:0.000000 68:1.000000 69:0.000000 70:0.000000 71:0.000000|i 72:0.000000 73:0.000000 74:0.000000 75:0.000000 76:0.000000 77:0.000000 78:8.000000 79:5.000000 80:1.000000|j 81

## Interactive Learning

In [12]:

def online_learning_loop(iter_num, vw_examples, vw_alg, seed=0):
    """Implements the online learning loop.
    """
    iter_num = len(vw_examples)
    print('Online learning for', iter_num, 'steps...')
    loss_list = []
    for i in range(iter_num):
        vw_x = vw_examples[i]
        y_true = float(vw_examples[i].split('|')[0])
        # predict step
        y_pred = vw_alg.predict(vw_x)
        # learn step
        vw_alg.learn(vw_x)
        # calculate one step loss
        loss = mean_squared_error([y_pred], [y_true])
        loss_list.append(loss)
    return loss_list

In [13]:
import pdb
def query_next_sample(vw_alg, X_pool, n:int=1):
    pool_examples = get_vw_examples(X_pool, isTrain=False)

    preds = [float(vw_alg.predict(ex)) for ex in pool_examples]
    idxs = np.argsort(preds)[::-1]
    idxs = idxs[:n]
    return idxs

import pdb
def query_next_sample_interaction(vw_alg, X_pool, n:int=1):
    pool_examples = [get_test_sample(_) for _ in X_pool]

    preds = [float(vw_alg.predict(ex)) for ex in pool_examples]
    idxs = np.argsort(preds)[::-1]
    idxs = idxs[:n]
    return idxs

In [16]:
from copy import deepcopy
'''create an AutoVW instance for tuning namespace interactions'''
# configure both hyperparamters to tune, e.g., 'interactions', and fixed arguments about the online learner,
# e.g., 'quiet' in the search_space argument.
autovw_ni = AutoVW(max_live_model_num=5, search_space={'interactions': AutoVW.AUTOMATIC, 'quiet': ''})



from vowpalwabbit import pyvw
''' create a vanilla vw instance '''
vanilla_vw = pyvw.vw('--quiet')

vw_alg = autovw_ni

In [15]:
loss_list = []
n = 10
X_pool = X_train
y_pool = y_train

sample_class_list = []
i=0
while i<100:
    X_pool = X_train
    y_pool = y_train
    print(f"Querying new sample")
    idx = query_next_sample_interaction(vw_alg, X_pool,n=n)
    x = X_pool[idx]
    y_true = y_pool[idx]
    print(f"Queried new sample with Label: {y_true}")
    
    X_train = np.delete(X_train, idx, axis=0)
    y_train = np.delete(y_train, idx, axis=0)
    y_pred = []
    for x_i, y_i in zip(x, y_true):
        # predict step
        y_pred_tmp = vw_alg.predict(get_test_sample(x_i))
        y_pred.append(y_pred_tmp)
        # learn step
        # break
        vw_alg.learn(get_training_sample(x_i,y_i))
        

    sample_class_list.extend(y_true)
    print(f"Predicted {y_pred}")
    loss = mean_squared_error([y_pred], [y_true])
    loss_list.append(loss)
    print('Final progressive validation loss of autovw:', sum(loss_list)/len(loss_list))
    plt.figure(i)
    sns.histplot(y_test, stat="percent", color="red", label="overall_dist")
    sns.histplot(sample_class_list,stat="percent", label="online_dist")
    plt.legend()
    plt.savefig(f"./figures_classification_percent_online/{format(i, '05d')}.jpg")
    i+=1
    




Querying new sample
Queried new sample with Label: [2 2 2 2 2 1 3 2 3 1]
Predicted [0.0]


ValueError: y_true and y_pred have different number of output (1!=10)

In [26]:
idx = query_next_sample_interaction(vw_alg, X_pool,n=n)
vw_alg.learn(get_training_sample(x_i,y_i))

In [17]:
import sys
sys.path.append("/home/ahmet/repos/data_centric/")

In [18]:
from data_centric.models import ActiveLearner
from flaml import AutoML
automl_settings = {
    "time_budget": 2,
    "estimator_list": ['lgbm'],
}

# Initialize Learner 
# TODO: Later we will disable initialization no training samples by providing class names
learner = ActiveLearner(
    estimator=AutoML(),
    embedding_pipeline = "test embedding pieline",
    X_training=X_train[:10], y_training=y_train[:10], **automl_settings
)

[flaml.automl: 01-25 02:12:15] {2007} INFO - task = classification
[flaml.automl: 01-25 02:12:15] {2009} INFO - Data split method: stratified
[flaml.automl: 01-25 02:12:15] {2013} INFO - Evaluation method: cv
[flaml.automl: 01-25 02:12:15] {1045} INFO - class 0 augmented from 1 to 20
[flaml.automl: 01-25 02:12:15] {1045} INFO - class 1 augmented from 6 to 24
[flaml.automl: 01-25 02:12:15] {1045} INFO - class 2 augmented from 3 to 21
[flaml.automl: 01-25 02:12:15] {2113} INFO - Minimizing error metric: log_loss
[flaml.automl: 01-25 02:12:15] {2170} INFO - List of ML learners in AutoML Run: ['lgbm']
[flaml.automl: 01-25 02:12:15] {2437} INFO - iteration 0, current learner lgbm
[flaml.automl: 01-25 02:12:15] {2550} INFO - Estimated sufficient time budget=282s. Estimated necessary time budget=0s.
[flaml.automl: 01-25 02:12:15] {2597} INFO -  at 0.0s,	estimator lgbm's best error=0.7286,	best estimator lgbm's best error=0.7286
[flaml.automl: 01-25 02:12:15] {2437} INFO - iteration 1, current

<class 'numpy.ndarray'>


[flaml.automl: 01-25 02:12:15] {2597} INFO -  at 0.2s,	estimator lgbm's best error=0.0028,	best estimator lgbm's best error=0.0028
[flaml.automl: 01-25 02:12:15] {2437} INFO - iteration 7, current learner lgbm
[flaml.automl: 01-25 02:12:15] {2597} INFO -  at 0.3s,	estimator lgbm's best error=0.0028,	best estimator lgbm's best error=0.0028
[flaml.automl: 01-25 02:12:15] {2437} INFO - iteration 8, current learner lgbm
[flaml.automl: 01-25 02:12:15] {2597} INFO -  at 0.4s,	estimator lgbm's best error=0.0028,	best estimator lgbm's best error=0.0028
[flaml.automl: 01-25 02:12:15] {2437} INFO - iteration 9, current learner lgbm
[flaml.automl: 01-25 02:12:15] {2597} INFO -  at 0.4s,	estimator lgbm's best error=0.0028,	best estimator lgbm's best error=0.0028
[flaml.automl: 01-25 02:12:15] {2437} INFO - iteration 10, current learner lgbm
[flaml.automl: 01-25 02:12:16] {2597} INFO -  at 0.5s,	estimator lgbm's best error=0.0028,	best estimator lgbm's best error=0.0028
[flaml.automl: 01-25 02:12:1

In [None]:
i=0
accs = []
sample_class_list = []
while i<200:
    print(f"Querying new sample")
    # querying for labels
    query_idx, query_sample = learner.query(X_train,n_instances=10)
    # TODO: ...obtaining new labels from User here
    print(f"Queried new sample with Label: {y_train[query_idx]}")

    y_pred = learner.predict(X_train[query_idx])
    print(f"Predicted {y_pred}")

    # teaching newly labelled examples
    learner.teach(
        X=X_train[query_idx].reshape(10, -1),
        y=y_train[query_idx].reshape(10, ),
        **automl_settings
    )
    sample_class_list.extend(y_train[query_idx])
    
    X_train = np.delete(X_train, query_idx, axis=0)
    y_train = np.delete(y_train, query_idx, axis=0)

    # print(f"acc: {learner.score(X_test, y_test)}")
    # accs.append(learner.score(X_test, y_test))
    
    plt.figure(i)
    sns.histplot(y_test, stat="percent", color="red", label="overall_dist")
    sns.histplot(sample_class_list,stat="percent", label="online_dist")
    plt.legend()
    plt.savefig(f"./figures_classification_percent_flaml/{format(i, '05d')}.jpg")

    i += 1