# modAL + pyhard - Comparando estratégias

- Amostra por incerteza
- Amostragem aleatória
- Consulta por comitê
- Aprendizado passivo
- Redução do erro esperado

In [1]:
%run -i set_environment

## Bibliotecas

In [22]:
%run -i importing_libraries

## Classificadores

### Algoritmos

In [3]:
%run -i classifiers

### Conjunto de dados

In [4]:
%run -i importing_datasets

## Estratégias

### Amostra por incerteza

In [34]:
def compute_f1(learner, X, y_true, average = None):
    """Return the F1 score of ``learner``'s predictions on ``X``.

    The predictions come from ``learner.predict(X)`` and are compared with
    ``y_true``; ``average`` is forwarded unchanged to ``metrics.f1_score``.
    """
    return metrics.f1_score(y_true, learner.predict(X), average=average)

In [56]:
def uncertain_sampling(X_raw, y_raw, idx_data, idx_bag, classifier, init_size, cost):
    """Run the uncertainty-gated sampling strategy on one cross-validation bag.

    A stratified seed set of ``len(np.unique(y_raw)) + init_size`` instances is
    split off the bag's TRAIN rows and used to fit the learner; the remainder
    of that split is the evaluation set. Every following round draws
    ``init_size`` random rows from the bag and teaches them to the learner only
    when the learner's uncertainty on the first drawn row is above 0.2.

    Parameters:
        X_raw, y_raw: full feature matrix and label vector.
        idx_data: per-bag index pairs; ``idx_data[idx_bag][TRAIN]`` selects
            the rows used here.
        idx_bag: position of the bag inside ``idx_data``.
        classifier: key understood by ``which_classifier``.
        init_size: number of rows added on top of one-per-class in the seed
            set, and the size of every random batch.
        cost: total number of evaluations recorded (the seed evaluation plus
            ``cost - 1`` rounds).

    Returns:
        dict with the accuracy/F1 histories, elapsed time, the classifier key,
        the fraction of ``X_raw`` that was taught, and the strategy name.
    """
    from modAL.uncertainty import classifier_uncertainty

    accuracy_history = []
    f1_history = []
    start = timer()

    train_idx = idx_data[idx_bag][TRAIN]

    X_train, X_test, y_train, y_test = train_test_split(
        X_raw[train_idx],
        y_raw[train_idx],
        train_size=len(np.unique(y_raw)) + init_size,
        stratify=y_raw[train_idx],
    )

    # Counter of the samples consumed by the strategy.
    sample_size = len(X_train)

    learner = ActiveLearner(
        estimator=which_classifier(classifier),
        query_strategy=uncertainty_sampling,
        X_training=X_train, y_training=y_train,  # ActiveLearner fits the classifier
    )

    accuracy_history.append(learner.score(X_test, y_test))
    f1_history.append(compute_f1(learner, X_test, y_test, "weighted"))

    # `<` instead of `!=`: with cost < 1 the original condition could never
    # become false and the loop ran forever.
    total_of_samples = 1
    while total_of_samples < cost:
        # NOTE(review): the batch is drawn from the whole bag, so it can
        # contain rows that are also part of the evaluation split.
        idx = np.random.choice(range(len(train_idx)), size=init_size, replace=False)
        X_batch, y_batch = X_raw[train_idx[idx]], y_raw[train_idx[idx]]

        # Only the first row of the batch decides whether the batch is taught.
        if classifier_uncertainty(learner, X_batch[0].reshape(1, -1)) > 0.2:
            sample_size = sample_size + len(X_batch)
            learner.teach(X_batch, y_batch)

        accuracy_history.append(learner.score(X_test, y_test))
        f1_history.append(compute_f1(learner, X_test, y_test, "weighted"))
        total_of_samples = total_of_samples + 1

    end = timer()
    time_elapsed = end - start

    return { "accuracy_history": accuracy_history,
             "f1_history": f1_history,
             "auc_history": "auc_history[-1]",  # placeholder string, no AUC is computed
             "package": "modAL",
             "time_elapsed": time_elapsed,
             "classifier": classifier,
             "sample_size": sample_size / len(X_raw),
             "Strategy": "Uncertain Sampling"}

### Amostragem aleatória

In [57]:
def random_sampling(X_raw, y_raw, idx_data, idx_bag, classifier, init_size, cost):
    """Baseline: train a fresh classifier on ``cost`` independent random splits.

    Each repetition makes a new stratified split of the bag's TRAIN rows
    (``len(np.unique(y_raw)) + init_size`` rows for fitting, the rest for
    evaluation), fits a new classifier and records accuracy and weighted F1.

    Returns a result dict with the same keys as the other strategies.
    """
    used_samples = 0  # counter of the samples consumed by the strategy
    accuracy_history, f1_history = [], []
    start = timer()

    for _ in range(cost):
        bag_rows = idx_data[idx_bag][TRAIN]
        X_bag, y_bag = X_raw[bag_rows], y_raw[bag_rows]

        X_fit, X_eval, y_fit, y_eval = train_test_split(
            X_bag,
            y_bag,
            train_size=len(np.unique(y_raw)) + init_size,
            stratify=y_bag,
        )
        used_samples += len(X_fit)

        model = which_classifier(classifier)
        model.fit(X_fit, y_fit)

        accuracy_history.append(model.score(X_eval, y_eval))
        f1_history.append(compute_f1(model, X_eval, y_eval, "weighted"))

    time_elapsed = timer() - start

    return {
        "accuracy_history": accuracy_history,
        "f1_history": f1_history,
        "auc_history": "auc_history[-1]",
        "package": "modAL",
        "time_elapsed": time_elapsed,
        "classifier": classifier,
        "sample_size": used_samples / len(X_raw),
        "Strategy": "Random Sampling",
    }

### Consulta por comitê

In [63]:
def query_by_committee(X_raw, y_raw, idx_data, idx_bag, classifier, init_size, cost):
    """Query-by-committee strategy using vote-entropy disagreement.

    ``cost`` committee members are created, each fitted on its own stratified
    split of the bag's TRAIN rows. The pool queried afterwards is the
    remainder of the split made for the last member. In each of the ``cost``
    rounds the committee queries ``init_size + 1`` instances, is taught on
    them, the instances are removed from the pool, and the committee is
    scored on what is left of the pool.

    Returns a result dict with the same keys as the other strategies.
    """
    from modAL.models import ActiveLearner, Committee
    from modAL.disagreement import vote_entropy_sampling

    used_samples = 0  # counter of the samples consumed by the strategy
    accuracy_history, f1_history = [], []
    start = timer()

    # Build the committee members.
    members = []
    for _ in range(cost):
        bag_rows = idx_data[idx_bag][TRAIN]
        X_bag, y_bag = X_raw[bag_rows], y_raw[bag_rows]

        X_train, X_pool, y_train, y_pool = train_test_split(
            X_bag,
            y_bag,
            train_size=len(np.unique(y_raw)) + init_size,
            stratify=y_bag,
        )
        used_samples += len(X_train)

        members.append(
            ActiveLearner(
                estimator=which_classifier(classifier),
                X_training=X_train, y_training=y_train,
            )
        )

    # Assemble the committee.
    committee = Committee(learner_list=members, query_strategy=vote_entropy_sampling)

    # Query rounds.
    for _ in range(cost):
        query_idx, _instances = committee.query(X_pool, n_instances=init_size + 1)
        used_samples += len(query_idx)

        committee.teach(X=X_pool[query_idx], y=y_pool[query_idx])

        # Drop the queried instances so they are not selected again.
        X_pool = np.delete(X_pool, query_idx, axis=0)
        y_pool = np.delete(y_pool, query_idx)

        accuracy_history.append(committee.score(X_pool, y_pool))
        f1_history.append(compute_f1(committee, X_pool, y_pool, "weighted"))

    time_elapsed = timer() - start

    return {
        "accuracy_history": accuracy_history,
        "f1_history": f1_history,
        "auc_history": "auc_history[-1]",
        "package": "modAL",
        "time_elapsed": time_elapsed,
        "classifier": classifier,
        "sample_size": used_samples / len(X_raw),
        "Strategy": "Query by Committee",
    }

### Expected Error Reduction

In [69]:
def exp_error_reduction(X_raw, y_raw, idx_data, idx_bag, classifier, init_size, cost):
    """Run modAL's expected-error-reduction strategy on one bag.

    The learner is seeded with a stratified sample of
    ``len(np.unique(y_raw)) + init_size`` rows from the bag's TRAIN rows. The
    pool that is queried *and* scored is the bag's TEST rows. Each round asks
    ``expected_error_reduction`` (``'binary'`` loss) for ``init_size``
    instances and teaches them to the learner.

    Parameters:
        X_raw, y_raw: full feature matrix and label vector.
        idx_data: per-bag index pairs, indexed with ``TRAIN`` / ``TEST``.
        idx_bag: position of the bag inside ``idx_data``.
        classifier: key understood by ``which_classifier``.
        init_size: extra seed rows and number of instances queried per round.
        cost: total number of evaluations recorded (the seed evaluation plus
            ``cost - 1`` rounds).

    Returns:
        dict with the accuracy/F1 histories, elapsed time, the classifier key,
        the fraction of ``X_raw`` that was taught, and the strategy name.
    """
    from modAL.expected_error import expected_error_reduction

    accuracy_history = []
    f1_history = []
    start = timer()

    train_idx = idx_data[idx_bag][TRAIN]
    # Only the seed half of the split is used; the pool comes from TEST below.
    X_train, _, y_train, _ = train_test_split(
        X_raw[train_idx],
        y_raw[train_idx],
        train_size=len(np.unique(y_raw)) + init_size,
        stratify=y_raw[train_idx],
    )
    # Counter of the samples consumed by the strategy.
    sample_size = len(X_train)

    test_idx = idx_data[idx_bag][TEST]
    X_pool, y_pool = X_raw[test_idx], y_raw[test_idx]

    learner = ActiveLearner(
        estimator=which_classifier(classifier),
        X_training=X_train, y_training=y_train,
    )
    accuracy_history.append(learner.score(X_pool, y_pool))
    f1_history.append(compute_f1(learner, X_pool, y_pool, "weighted"))

    # `<` instead of `!=`: with cost < 1 the original condition could never
    # become false and the loop ran forever.
    total_of_samples = 1
    while total_of_samples < cost:
        print("\t Size of X_pool:", len(X_pool))
        exp_error_idx = expected_error_reduction(learner, X_pool, 'binary', n_instances=init_size)

        learner.teach(X_pool[exp_error_idx], y_pool[exp_error_idx])
        sample_size = sample_size + init_size

        # NOTE(review): queried instances are not removed from the pool, so
        # the same rows can be selected again and are also part of the score.
        accuracy_history.append(learner.score(X_pool, y_pool))
        f1_history.append(compute_f1(learner, X_pool, y_pool, "weighted"))

        total_of_samples = total_of_samples + 1

    end = timer()
    time_elapsed = end - start

    return { "accuracy_history": accuracy_history,
         "f1_history": f1_history,
         "auc_history": "auc_history[-1]",  # placeholder string, no AUC is computed
         "package": "modAL",
         "time_elapsed": time_elapsed,
         "classifier": classifier,
         "sample_size": sample_size / len(X_raw),
         "Strategy": "Expected Error Reduction"}

### Expected Model Change

In [75]:
def exp_model_change(X_raw, y_raw, idx_data, idx_bag, classifier, init_size, cost):
    """Greedy "keep the batch only if it helps" strategy on one bag.

    The learner is seeded with a stratified sample of
    ``len(np.unique(y_raw)) + init_size`` rows from the bag's TRAIN rows; the
    remainder of that split is the pool. Each round teaches a random batch of
    ``init_size`` pool rows to a copy of the learner and keeps the copy only
    when its pool accuracy is strictly higher than the current learner's. The
    batch is removed from the pool whether or not it was kept.

    Parameters:
        X_raw, y_raw: full feature matrix and label vector.
        idx_data: per-bag index pairs; ``idx_data[idx_bag][TRAIN]`` selects
            the rows used here.
        idx_bag: position of the bag inside ``idx_data``.
        classifier: key understood by ``which_classifier``.
        init_size: extra seed rows and size of each candidate batch.
        cost: total number of evaluations recorded (the seed evaluation plus
            ``cost - 1`` rounds).

    Returns:
        dict with the accuracy/F1 histories, elapsed time, the classifier key,
        the fraction of ``X_raw`` that was taught, and the strategy name.
    """
    accuracy_history = []
    f1_history = []
    start = timer()

    train_idx = idx_data[idx_bag][TRAIN]
    X_train, X_pool, y_train, y_pool = train_test_split(
        X_raw[train_idx],
        y_raw[train_idx],
        train_size=len(np.unique(y_raw)) + init_size,
        stratify=y_raw[train_idx],
    )
    # Counter of the samples consumed by the strategy.
    sample_size = len(X_train)

    learner = ActiveLearner(
        estimator=which_classifier(classifier),
        X_training=X_train, y_training=y_train,
    )

    accuracy_history.append(learner.score(X_pool, y_pool))
    f1_history.append(compute_f1(learner, X_pool, y_pool, "weighted"))

    # `<` instead of `!=`: with cost < 1 the original condition could never
    # become false and the loop ran forever.
    total_of_samples = 1
    while total_of_samples < cost:
        exp_error_idx = np.random.choice(range(len(X_pool)), size=init_size, replace=False)

        # Try the batch on a copy so a rejected batch leaves the learner intact.
        aux = deepcopy(learner)
        aux.teach(X_pool[exp_error_idx], y_pool[exp_error_idx])
        score_aux = aux.score(X_pool, y_pool)
        score_learner = learner.score(X_pool, y_pool)

        if score_aux > score_learner:
            # `aux` is already a private copy; a second deepcopy is not needed.
            learner = aux
            sample_size = sample_size + init_size

        X_pool = np.delete(X_pool, exp_error_idx, axis=0)
        y_pool = np.delete(y_pool, exp_error_idx, axis=0)

        accuracy_history.append(learner.score(X_pool, y_pool))
        f1_history.append(compute_f1(learner, X_pool, y_pool, "weighted"))

        total_of_samples = total_of_samples + 1

    end = timer()
    time_elapsed = end - start

    return { "accuracy_history": accuracy_history,
         "f1_history": f1_history,
         "auc_history": "auc_history[-1]",  # placeholder string, no AUC is computed
         "package": "modAL",
         "time_elapsed": time_elapsed,
         "classifier": classifier,
         "sample_size": sample_size / len(X_raw),
         "Strategy": "Expected Model Change"}

## Pyhard Strategies

In [10]:
def config(section, filename='strategies.config'):
    """Load one strategy section from an INI-style configuration file.

    The file is read from the parent of the current working directory
    (``"../" + filename``).

    Parameters:
        section: name of the section to load.
        filename: configuration file name, relative to the parent directory.

    Returns:
        dict with every option of the section. ``'ascending'`` is converted
        to a list of booleans and ``'sortby'`` to a list of column names;
        both are comma-separated in the file.

    Raises:
        Exception: if the section is not present (this is also what happens
            when the file itself cannot be found, because ``ConfigParser.read``
            silently skips missing files).
        KeyError: if the section has no ``ascending`` or ``sortby`` option.
    """
    from configparser import ConfigParser

    parser = ConfigParser()
    parser.read("../" + filename)

    if not parser.has_section(section):
        raise Exception('Section {0} not found in the {1} file'.format(section, filename))

    strategy = dict(parser.items(section))

    # Convert the text flags to bool. Entries are stripped and compared
    # case-insensitively so that "True, False" or "false" are not misread as
    # True; anything other than "false" still counts as True.
    strategy['ascending'] = [
        flag.strip().lower() != "false"
        for flag in strategy['ascending'].split(',')
    ]
    strategy['sortby'] = [column.strip() for column in strategy['sortby'].split(',')]

    print(strategy)

    return strategy

In [81]:
def pyhard_strategies(X_raw, y_raw, idx_data, idx_bag, classifier, init_size, cost, strategy):
    """Teach a learner the ``cost`` instances ranked first by a PyHard measure.

    Steps, in order:
      1. Load the strategy settings with ``config(strategy)``.
      2. Fit an ``ActiveLearner`` on a stratified seed split of the bag's
         TRAIN rows and record its accuracy / weighted F1 on the remainder.
      3. Write the bag's TRAIN rows (features plus label as last column) to
         ``data.csv``, select the measure via ``which_pyhard_measure`` and run
         the ``pyhard`` command line tool through an IPython shell escape.
      4. Read ``metadata.csv``, sort it by the configured columns and take
         the first ``cost`` values of its ``instances`` column.
      5. Teach those instances to the learner and record the scores again.

    Returns a result dict with two-element accuracy/F1 histories and the
    strategy's configured ``name``.
    """
    
    from modAL.uncertainty import classifier_uncertainty
    
    sample_size = 0 # counter of the samples consumed by the strategy
    accuracy_history = []
    f1_history = []
    start = timer()
    
    # From here on `strategy` is the settings dict, not the section name.
    strategy = config(strategy)
    
    # initial random part of the strategy
    
    X_train, X_test, y_train, y_test = train_test_split(X_raw[idx_data[idx_bag][TRAIN]], y_raw[idx_data[idx_bag][TRAIN]], train_size= len(np.unique(y_raw)) + init_size, stratify = y_raw[idx_data[idx_bag][TRAIN]])
    
    sample_size = sample_size + len(X_train)

    learner = ActiveLearner (
        estimator= which_classifier(classifier), #cls,
        query_strategy=uncertainty_sampling,
        X_training = X_train, y_training = y_train # ActiveLearner fits the classifier
    )

    accuracy_history.append(learner.score(X_test, y_test))
    f1_history.append(compute_f1(learner, X_test, y_test, "weighted"))

    total_of_samples = 1

    #X_train, X_test, y_train, y_test = train_test_split(X_raw, y_raw, train_size=0.03)

    # NOTE(review): this random draw is overwritten below before it is used.
    idx = np.random.choice(range(len(idx_data[idx_bag][TRAIN])), size=init_size, replace=False)
    X_train, y_train = X_raw[idx_data[idx_bag][TRAIN][idx]], y_raw[idx_data[idx_bag][TRAIN][idx]]

    # Dump the bag as CSV: feature columns first, label last. fmt='%i' writes
    # every value as an integer.
    X_rawAndY_raw = np.column_stack([X_raw[idx_data[idx_bag][TRAIN]],y_raw[idx_data[idx_bag][TRAIN]]])
    np.savetxt("data.csv", X_rawAndY_raw, fmt='%i', delimiter=",")
    
    which_pyhard_measure(strategy['measure'])

    # IPython shell escape; presumably reads config.yaml / data.csv from the
    # working directory and produces metadata.csv -- TODO confirm.
    !pyhard --no-isa

    df = pd.read_csv('metadata.csv')

    # presumably 'instances' holds row positions within the bag written to
    # data.csv -- verify against PyHard's output format.
    idx = list(df.sort_values(by=strategy['sortby'], ascending=strategy['ascending'])['instances'][:cost])

    X_train = X_raw[idx_data[idx_bag][TRAIN][idx]]
    y_train = y_raw[idx_data[idx_bag][TRAIN][idx]]

    # NOTE(review): this overwrites the seed-set count accumulated above
    # instead of adding to it -- confirm that is intended.
    sample_size = cost
    learner.teach(X_train, y_train)
    
    accuracy_history.append(learner.score(X_test, y_test))
    f1_history.append(compute_f1(learner, X_test, y_test, "weighted"))
    
    end = timer()
    time_elapsed = end - start

    return { "accuracy_history": accuracy_history,
         "f1_history": f1_history,
         "auc_history": "auc_history[-1]",
         "package": "Pyhard",
         "time_elapsed": time_elapsed,
         "classifier": classifier,
         "sample_size": sample_size / len(X_raw),
         "Strategy": strategy['name']}

In [81]:
# Start from an empty result list and benchmark only the 'H+U' PyHard strategy.
total_performance_history = []
pyhard_strategies_names = ['H+U']

for ds in datasets:
    for classifier in classifiers:
        X_raw, y_raw, idx_data, dataset_name = which_arff_dataset(ds)
        # one run per cross-validation bag
        for idx_bag in range(n_splits):
            for ph_strategy in pyhard_strategies_names:
                tqdm.write(f"Testando: {ds[:-5]} {classifier} {idx_bag}/{n_splits} {ph_strategy}")
                result = pyhard_strategies(
                    deepcopy(X_raw), deepcopy(y_raw), idx_data, idx_bag,
                    classifier, k, cost, ph_strategy,
                )
                # ds[:-5] drops the last five characters of the file name.
                result['dataset'] = ds[:-5]
                total_performance_history.append(result)
                tqdm.write(f"Passou: {ds[:-5]} {classifier} {idx_bag}/{n_splits} {ph_strategy}")

Testando: 61_iris 5NN 0/5 H+U
{'name': 'Lowest H, Highest U Sampling', 'measure': 'U+H', 'sortby': ['feature_Usefulness', 'feature_Harmfulness'], 'ascending': [False, True]}
^C
Traceback (most recent call last):
  File "/mnt/c/Users/ahmou/Onedrive/Documentos/ubuntu_wd/act_len/bin/pyhard", line 5, in <module>
    from pyhard.cli import cli
  File "/mnt/c/Users/ahmou/Onedrive/Documentos/ubuntu_wd/act_len/lib/python3.7/site-packages/pyhard/cli.py", line 10, in <module>
    from pyispace.example import save_opts
  File "/mnt/c/Users/ahmou/Onedrive/Documentos/ubuntu_wd/act_len/lib/python3.7/site-packages/pyispace/__init__.py", line 3, in <module>
    from .train import train_is
  File "/mnt/c/Users/ahmou/Onedrive/Documentos/ubuntu_wd/act_len/lib/python3.7/site-packages/pyispace/train.py", line 11, in <module>
    from .trace import TraceOutput, trace
  File "/mnt/c/Users/ahmou/Onedrive/Documentos/ubuntu_wd/act_len/lib/python3.7/site-packages/pyispace/trace.py", line 10, in <module>
    impo

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/mnt/c/Users/ahmou/Onedrive/Documentos/ubuntu_wd/act_len/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3437, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-82-791ea12a43b5>", line 13, in <module>
    result = pyhard_strategies(deepcopy(X_raw), deepcopy(y_raw), idx_data, idx_bag, classifier, k, cost, ph_strategy)
  File "<ipython-input-81-8e47be37e8ac>", line 35, in pyhard_strategies
    np.savetxt("data.csv", X_rawAndY_raw, fmt='%i', delimiter=",")
  File "<__array_function__ internals>", line 6, in savetxt
  File "/mnt/c/Users/ahmou/Onedrive/Documentos/ubuntu_wd/act_len/lib/python3.7/site-packages/numpy/lib/npyio.py", line 1367, in savetxt
    fh = np.lib._datasource.open(fname, 'wt', encoding=encoding)
  File "/mnt/c/Users/ahmou/Onedrive/Documentos/ubuntu_wd/act_len/lib/python3.7/site-packages/numpy/lib/_datasource.py", line 194, in open
    return ds.open(path, mode, encoding=e

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/mnt/c/Users/ahmou/Onedrive/Documentos/ubuntu_wd/act_len/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3437, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-82-791ea12a43b5>", line 13, in <module>
    result = pyhard_strategies(deepcopy(X_raw), deepcopy(y_raw), idx_data, idx_bag, classifier, k, cost, ph_strategy)
  File "<ipython-input-81-8e47be37e8ac>", line 35, in pyhard_strategies
    np.savetxt("data.csv", X_rawAndY_raw, fmt='%i', delimiter=",")
  File "<__array_function__ internals>", line 6, in savetxt
  File "/mnt/c/Users/ahmou/Onedrive/Documentos/ubuntu_wd/act_len/lib/python3.7/site-packages/numpy/lib/npyio.py", line 1367, in savetxt
    fh = np.lib._datasource.open(fname, 'wt', encoding=encoding)
  File "/mnt/c/Users/ahmou/Onedrive/Documentos/ubuntu_wd/act_len/lib/python3.7/site-packages/numpy/lib/_datasource.py", line 194, in open
    return ds.open(path, mode, encoding=e


KeyboardInterrupt



In [82]:
total_performance_history

[{'accuracy_history': [0.7010309278350515, 0.8865979381443299],
  'f1_history': [0.6244144858577848, 0.8818721677250521],
  'auc_history': 'auc_history[-1]',
  'package': 'Pyhard',
  'time_elapsed': 60.12400930000149,
  'classifier': '5NN',
  'sample_size': 0.06666666666666667,
  'Strategy': 'Lowest H, Highest U Sampling',
  'dataset': '61_iris'},
 {'accuracy_history': [0.8144329896907216, 0.9175257731958762],
  'f1_history': [0.796328237209154, 0.9152484997691952],
  'auc_history': 'auc_history[-1]',
  'package': 'Pyhard',
  'time_elapsed': 20.384290000001783,
  'classifier': '5NN',
  'sample_size': 0.06666666666666667,
  'Strategy': 'Lowest H, Highest U Sampling',
  'dataset': '61_iris'}]

ERROR! Session/line number was not unique in database. History logging moved to new session 22


## Setup

In [12]:
def which_pyhard_measure(measure='LSC'):
    """Write ``config.yaml`` from ``config-template.yaml`` for one measure.

    The template is loaded, its ``measures_list`` entry is replaced by the
    list associated with ``measure`` and the result is dumped to
    ``config.yaml``. For an unrecognised ``measure`` the template's content
    is written unchanged.
    """
    import yaml

    measures_by_key = {
        'LSC': ['LSC'],
        'Harmfulness': ['Harmfulness'],
        'Usefulness': ['Usefulness'],
        'U+H': ['Harmfulness', 'Usefulness'],
        'N2': ['N2'],
        'F3': ['F3'],
    }

    with open(r'config-template.yaml') as template:
        configs_list = yaml.load(template, Loader=yaml.FullLoader)

    if measure in measures_by_key:
        configs_list['measures_list'] = measures_by_key[measure]

    with open(r'config.yaml', 'w') as target:
        yaml.dump(configs_list, target)

In [13]:
def which_dataset(dataset = "iris", n_splits = 5):
    """Load a bundled scikit-learn dataset and build stratified CV bags.

    Parameters:
        dataset: one of ``"iris"``, ``"wine"`` or ``"digits"``.
        n_splits: number of stratified shuffle splits (bags) to generate.

    Returns:
        ``(X_raw, y_raw, idx_data)`` where ``idx_data`` is a list with one
        ``[train_index, test_index]`` pair per bag (70% of the rows in train).

    Raises:
        ValueError: if ``dataset`` is not one of the supported names.
    """
    # In the future this step will be adjusted to accept any dataset
    # (or a list of datasets).
    if dataset == "iris":
        data = load_iris()
    elif dataset == "wine":
        data = load_wine()
    elif dataset == "digits":
        data = load_digits()
    else:
        # Previously an unknown name fell through and failed later with an
        # UnboundLocalError on X_raw.
        raise ValueError("Unknown dataset: {0!r}".format(dataset))

    X_raw = data['data']
    y_raw = data['target']

    # cross validation bags
    data_cv = StratifiedShuffleSplit(n_splits=n_splits, train_size=0.7, random_state=0)

    # Stratified splitting needs the labels; calling split() without y fails.
    idx_data = [
        [train_index, test_index]
        for train_index, test_index in data_cv.split(X_raw, y_raw)
    ]

    return X_raw, y_raw, idx_data

In [14]:
def which_oml_dataset(dataset_id, n_splits = 5):
    """Fetch an OpenML dataset and build stratified CV bags.

    Labels are encoded as integers with ``LabelEncoder`` and NaNs in the
    features are replaced via ``np.nan_to_num``.

    Parameters:
        dataset_id: identifier passed to ``openml.datasets.get_dataset``.
        n_splits: number of stratified shuffle splits (bags) to generate.

    Returns:
        ``(X_raw, y_raw, idx_data, name)`` where ``idx_data`` is a list with
        one ``[train_index, test_index]`` pair per bag and ``name`` is the
        dataset's OpenML name.
    """
    data = openml.datasets.get_dataset(dataset_id)

    X_raw, y_raw, categorical_indicator, attribute_names = data.get_data(
        dataset_format="array", target=data.default_target_attribute)

    y_raw = preprocessing.LabelEncoder().fit_transform(y_raw)
    X_raw = np.nan_to_num(X_raw)

    data_cv = StratifiedShuffleSplit(n_splits=n_splits, train_size=0.7, random_state=0)

    # Stratified splitting needs the labels; calling split() without y fails.
    idx_data = [
        [train_index, test_index]
        for train_index, test_index in data_cv.split(X_raw, y_raw)
    ]

    return X_raw, y_raw, idx_data, data.name

In [15]:
def which_arff_dataset(dataset, n_splits = 5):
    """Load an ARFF file from ``datasets/luis/`` and build stratified CV bags.

    The last column is taken as the label. Every feature column is encoded
    with ``OrdinalEncoder`` and the label with ``LabelEncoder``.

    Parameters:
        dataset: file name inside ``datasets/luis/``.
        n_splits: number of stratified shuffle splits (bags) to generate.

    Returns:
        ``(X_raw, y_raw, idx_data, dataset)`` where ``idx_data`` is a list
        with one ``[train_index, test_index]`` pair per bag (70% of the rows
        in train) and ``dataset`` is the file name passed in.
    """
    records, _meta = arff.loadarff('datasets/luis/' + dataset)
    data = pd.DataFrame(records)

    X_raw = data[data.columns[:-1]].to_numpy()
    y_raw = data[data.columns[-1]].to_numpy()

    X_raw = preprocessing.OrdinalEncoder().fit_transform(X_raw)
    y_raw = preprocessing.LabelEncoder().fit_transform(y_raw)

    # cross validation bags
    data_cv = StratifiedShuffleSplit(n_splits=n_splits, train_size=0.7, random_state=0)

    # collect the row indices of every bag
    idx_data = [
        [train_index, test_index]
        for train_index, test_index in data_cv.split(X_raw, y_raw)
    ]

    return X_raw, y_raw, idx_data, dataset

In [16]:
def which_classifier(classifier = '5NN'):
    """Return a new, unfitted estimator for the given key.

    Supported keys: ``'5NN'``, ``'C4.5'``, ``'NB'``, ``'SVM'`` and ``'RF'``.
    Any other key yields ``None``.
    """
    # Factories are wrapped in lambdas so an estimator is only built on demand.
    factories = {
        '5NN': lambda: KNeighborsClassifier(5),
        'C4.5': lambda: tree.DecisionTreeClassifier(),
        'NB': lambda: GaussianNB(),
        'SVM': lambda: SVC(probability=True, gamma='auto'),
        'RF': lambda: RandomForestClassifier(),
    }
    build = factories.get(classifier)
    if build is None:
        return None
    return build()

In [17]:
def fetch_datasets(dataset):
    """Summarise one ARFF file from ``./datasets/luis/``.

    The last column is treated as the class. The summary holds the file name
    without its last five characters, the number of rows, classes and
    attributes, how often the word "nominal" occurs in the ARFF metadata
    text, and the percentage (rounded to two decimals) of the largest and
    smallest class.
    """
    records, meta = arff.loadarff('./datasets/luis/' + dataset)
    frame = pd.DataFrame(records)

    n_rows = len(frame)
    class_counts = frame.iloc[:, -1].value_counts()
    class_percent = class_counts.map(lambda count: round(count / n_rows * 100, 2))

    summary = {}
    summary["name"] = dataset[:-5]
    summary["instances"] = n_rows
    summary["classes"] = len(class_counts)
    summary["attributes"] = len(frame.columns) - 1
    summary["nominal attributes"] = str(meta).count("nominal")
    summary["majority"] = max(class_percent)
    summary["minority"] = min(class_percent)
    return summary

In [18]:
# Benchmark inputs: every entry found in ./datasets/luis and the classifier
# keys (as understood by which_classifier) to evaluate.
datasets = os.listdir('./datasets/luis')
classifiers = ['5NN', 'C4.5', 'NB','RF']
# Collects one result dict per executed strategy run.
total_performance_history = []

In [19]:
datasets

['61_iris.arff']

In [20]:
# Build a summary table with one row per dataset file (see fetch_datasets).
metadata = []

for ds in datasets:
    metadata.append(fetch_datasets(ds))

# Turn the list of summary dicts into a DataFrame and display it.
metadata = pd.DataFrame.from_dict(metadata)
metadata

Unnamed: 0,name,instances,classes,attributes,nominal attributes,majority,minority
0,61_iris,150,3,4,1,33.33,33.33


In [21]:
# Benchmark every configured PyHard strategy; results are appended to the
# existing total_performance_history list.
pyhard_strategies_names = ['H', 'U', 'H+U', 'LSC', 'N2', 'F3']

for ds in datasets:
    for classifier in classifiers:
        X_raw, y_raw, idx_data, dataset_name = which_arff_dataset(ds)
        # one run per cross-validation bag
        for idx_bag in range(n_splits):
            for ph_strategy in pyhard_strategies_names:
                tqdm.write(f"Testando: {ds[:-5]} {classifier} {idx_bag}/{n_splits} {ph_strategy}")
                result = pyhard_strategies(
                    deepcopy(X_raw), deepcopy(y_raw), idx_data, idx_bag,
                    classifier, k, cost, ph_strategy,
                )
                # ds[:-5] drops the last five characters of the file name.
                result['dataset'] = ds[:-5]
                total_performance_history.append(result)
                tqdm.write(f"Passou: {ds[:-5]} {classifier} {idx_bag}/{n_splits} {ph_strategy}")

Testando: 61_iris 5NN 0/5 H
{'name': 'Lowest Harmfulness Sampling', 'measure': 'Harmfulness', 'sortby': ['feature_Harmfulness'], 'ascending': [True]}
run 'pyhard --help' to see all options.
[INFO] 2021-04-27 18:46:28,351 - Configuration file: '/mnt/c/Users/ahmou/OneDrive/Documentos/ubuntu_wd/act_len/active_learning/data/act_len labs/pyhard/config.yaml'
[INFO] 2021-04-27 18:46:28,355 - Reading input dataset: '/mnt/c/Users/ahmou/OneDrive/Documentos/ubuntu_wd/act_len/active_learning/data/act_len labs/pyhard/data.csv'
[INFO] 2021-04-27 18:46:28,372 - Type of problem: 'classification'
[INFO] 2021-04-27 18:46:28,373 - Building metadata.
[INFO] 2021-04-27 18:46:33,762 - Calculating measure 'Harmfulness'
[INFO] 2021-04-27 18:46:33,783 - Assessing performance of classifier 'random_forest'
[INFO] 2021-04-27 18:46:33,783 - Estimating instance performance...
[INFO] 2021-04-27 18:46:33,785 - Evaluating testing fold #1
[INFO] 2021-04-27 18:46:34,638 - Test fold mean accuracy: 0.9615384615384616
[INF

[INFO] 2021-04-27 18:51:17,870 - Test fold mean accuracy: 0.9423076923076923
[INFO] 2021-04-27 18:51:17,870 - Iteration 1/1 completed.
[INFO] 2021-04-27 18:51:17,870 - Mean accuracy on test instances (iteration #1): 0.9519
[INFO] 2021-04-27 18:51:17,925 - Total elapsed time: 7.0s
[INFO] 2021-04-27 18:51:17,925 - Instance Hardness analysis finished.
Passou: 61_iris 5NN 0/5 F3
Testando: 61_iris 5NN 1/5 H
{'name': 'Lowest Harmfulness Sampling', 'measure': 'Harmfulness', 'sortby': ['feature_Harmfulness'], 'ascending': [True]}
run 'pyhard --help' to see all options.
[INFO] 2021-04-27 18:52:19,679 - Configuration file: '/mnt/c/Users/ahmou/OneDrive/Documentos/ubuntu_wd/act_len/active_learning/data/act_len labs/pyhard/config.yaml'
[INFO] 2021-04-27 18:52:19,685 - Reading input dataset: '/mnt/c/Users/ahmou/OneDrive/Documentos/ubuntu_wd/act_len/active_learning/data/act_len labs/pyhard/data.csv'
[INFO] 2021-04-27 18:52:19,697 - Type of problem: 'classification'
[INFO] 2021-04-27 18:52:19,697 - Bu

[INFO] 2021-04-27 18:58:19,673 - Assessing performance of classifier 'random_forest'
[INFO] 2021-04-27 18:58:19,673 - Estimating instance performance...
[INFO] 2021-04-27 18:58:19,674 - Evaluating testing fold #1
[INFO] 2021-04-27 18:58:20,491 - Test fold mean accuracy: 0.9807692307692307
[INFO] 2021-04-27 18:58:20,492 - Evaluating testing fold #2
[INFO] 2021-04-27 18:58:21,801 - Test fold mean accuracy: 0.9423076923076923
[INFO] 2021-04-27 18:58:21,801 - Iteration 1/1 completed.
[INFO] 2021-04-27 18:58:21,802 - Mean accuracy on test instances (iteration #1): 0.9615
[INFO] 2021-04-27 18:58:21,893 - Total elapsed time: 7.5s
[INFO] 2021-04-27 18:58:21,893 - Instance Hardness analysis finished.
Passou: 61_iris 5NN 1/5 F3
Testando: 61_iris 5NN 2/5 H
{'name': 'Lowest Harmfulness Sampling', 'measure': 'Harmfulness', 'sortby': ['feature_Harmfulness'], 'ascending': [True]}
run 'pyhard --help' to see all options.
[INFO] 2021-04-27 18:59:26,768 - Configuration file: '/mnt/c/Users/ahmou/OneDrive/

[INFO] 2021-04-27 19:06:08,949 - Calculating measure 'F3'
[INFO] 2021-04-27 19:06:09,076 - Assessing performance of classifier 'random_forest'
[INFO] 2021-04-27 19:06:09,076 - Estimating instance performance...
[INFO] 2021-04-27 19:06:09,078 - Evaluating testing fold #1
[INFO] 2021-04-27 19:06:10,082 - Test fold mean accuracy: 0.9230769230769231
[INFO] 2021-04-27 19:06:10,082 - Evaluating testing fold #2
[INFO] 2021-04-27 19:06:11,069 - Test fold mean accuracy: 0.9423076923076923
[INFO] 2021-04-27 19:06:11,069 - Iteration 1/1 completed.
[INFO] 2021-04-27 19:06:11,070 - Mean accuracy on test instances (iteration #1): 0.9327
[INFO] 2021-04-27 19:06:11,132 - Total elapsed time: 10.7s
[INFO] 2021-04-27 19:06:11,132 - Instance Hardness analysis finished.
Passou: 61_iris 5NN 2/5 F3
Testando: 61_iris 5NN 3/5 H
{'name': 'Lowest Harmfulness Sampling', 'measure': 'Harmfulness', 'sortby': ['feature_Harmfulness'], 'ascending': [True]}
run 'pyhard --help' to see all options.
[INFO] 2021-04-27 19:07

[INFO] 2021-04-27 19:12:46,471 - Reading input dataset: '/mnt/c/Users/ahmou/OneDrive/Documentos/ubuntu_wd/act_len/active_learning/data/act_len labs/pyhard/data.csv'
[INFO] 2021-04-27 19:12:46,490 - Type of problem: 'classification'
[INFO] 2021-04-27 19:12:46,490 - Building metadata.
[INFO] 2021-04-27 19:13:01,117 - Calculating measure 'F3'
[INFO] 2021-04-27 19:13:01,378 - Assessing performance of classifier 'random_forest'
[INFO] 2021-04-27 19:13:01,379 - Estimating instance performance...
[INFO] 2021-04-27 19:13:01,380 - Evaluating testing fold #1
[INFO] 2021-04-27 19:13:02,379 - Test fold mean accuracy: 0.9615384615384616
[INFO] 2021-04-27 19:13:02,379 - Evaluating testing fold #2
[INFO] 2021-04-27 19:13:03,344 - Test fold mean accuracy: 0.9038461538461539
[INFO] 2021-04-27 19:13:03,344 - Iteration 1/1 completed.
[INFO] 2021-04-27 19:13:03,345 - Mean accuracy on test instances (iteration #1): 0.9327
[INFO] 2021-04-27 19:13:03,419 - Total elapsed time: 17.0s
[INFO] 2021-04-27 19:13:03

KeyError: 'feature_LSC'

In [None]:
# modAL strategies to benchmark, in execution order. The functions are
# referenced directly instead of building a call string for eval().
strategy_functions = [
    uncertain_sampling,
    random_sampling,
    query_by_committee,
    exp_error_reduction,
    exp_model_change,
]
# Kept as a list of names, as before.
functions = [strategy_fn.__name__ for strategy_fn in strategy_functions]

for ds in tqdm(datasets,  desc ="Dataset"):
    # Loading does not depend on the classifier, so do it once per dataset;
    # every run below receives its own deep copy of the arrays.
    X_raw, y_raw, idx_data, dataset_name = which_arff_dataset(ds)
    for classifier in classifiers:
        # one run per cross-validation bag (reported as 1..n_splits)
        for idx_bag in range(n_splits):
            for strategy_fn in strategy_functions:
                func = strategy_fn.__name__
                tqdm.write("Testando: " + str(ds[:-5]) + " " + str(classifier) + " " + str(idx_bag+1) + "/" + str(n_splits) + " " + func)
                result = strategy_fn(deepcopy(X_raw), deepcopy(y_raw), idx_data, idx_bag, classifier, k, cost)
                # ds[:-5] drops the last five characters of the file name.
                result['dataset'] = ds[:-5]
                total_performance_history.append(result)
                tqdm.write("Passou: " + str(ds[:-5]) + " " + str(classifier) + " " + str(idx_bag+1) + "/" + str(n_splits) + " " + func)

In [None]:
total_performance_history

## Visualization

### Preprocessing

In [None]:
# One row per strategy run, one column per key of the result dicts.
df = pd.DataFrame.from_dict(total_performance_history)

In [None]:
# Strategies present in the results.
pd.unique(df['Strategy'])

In [None]:
# df2 is a second name for the same DataFrame object, not a copy.
df2 = df
# NOTE(review): the strategy functions visible in this notebook return
# 'accuracy_history' / 'f1_history', not 'performance_history' -- confirm
# that this column exists before running the cells below.
df2.groupby(['Strategy', 'classifier']).agg({'performance_history':['mean','std'],'time_elapsed':['mean','std'], 'sample_size':['mean','std']})

In [None]:
# Per (Strategy, classifier) mean and standard deviation.
performance_mean = df2.groupby(['Strategy', 'classifier']).mean()
performance_std = df2.groupby(['Strategy', 'classifier']).std()

In [None]:
performance_mean

In [None]:
# Expand the list-valued column into one row per list element.
df = df.explode('performance_history')

In [None]:
# Everything except Query by Committee, best performance first.
df[df.Strategy != "Query by Committee"].sort_values('performance_history', ascending = False)

In [None]:
# Expected Error Reduction runs, slowest first.
df[df.Strategy == "Expected Error Reduction"].sort_values('time_elapsed', ascending = False)

In [None]:
df.info()

### Plots

In [None]:
# All runs: performance vs. elapsed time, coloured by strategy and sized by
# the fraction of samples used.
g = sns.relplot(
    data= df,
    x="performance_history", y="time_elapsed",
    hue="Strategy", size="sample_size",
    palette=sns.color_palette(n_colors=10), sizes=(100, 300), alpha=0.3
)
g.ax.xaxis.grid(True, "minor", linewidth=.25)
g.ax.yaxis.grid(True, "minor", linewidth=.25)
_ = g.despine(left=True, bottom=True)

In [None]:
# Same plot on the per (Strategy, classifier) means; the marker style
# encodes the classifier.
g = sns.relplot(
    data= performance_mean,
    x="performance_history", y="time_elapsed",
    hue="Strategy", size="sample_size", style="classifier",
    palette=sns.color_palette(n_colors=10), sizes=(100, 300), alpha=0.3
)
g.ax.xaxis.grid(True, "minor", linewidth=.25)
g.ax.yaxis.grid(True, "minor", linewidth=.25)
_ = g.despine(left=True, bottom=True)

In [None]:
# Only the strategies other than Uncertain Sampling and Query by Committee.
g = sns.relplot(
    data= df[(df.Strategy != "Uncertain Sampling") & (df.Strategy != "Query by Committee")],
    x="performance_history", y="time_elapsed",
    hue="Strategy", size="sample_size",
    palette=sns.color_palette(n_colors=3), sizes=(100, 300), alpha=0.3
)
g.ax.xaxis.grid(True, "minor", linewidth=.25)
g.ax.yaxis.grid(True, "minor", linewidth=.25)
_ = g.despine(left=True, bottom=True)

In [None]:
# Only Uncertain Sampling and Query by Committee.
g = sns.relplot(
    data= df[(df.Strategy == "Uncertain Sampling") | (df.Strategy == "Query by Committee")],
    x="performance_history", y="time_elapsed",
    hue="Strategy", size="sample_size",
    palette=sns.color_palette(n_colors=2), sizes=(100, 300), alpha=0.3
)
g.ax.xaxis.grid(True, "minor", linewidth=.25)
g.ax.yaxis.grid(True, "minor", linewidth=.25)
_ = g.despine(left=True, bottom=True)

## Baixando datasets

In [None]:
# Load every OpenML dataset listed in `datalist` (defined outside this cell),
# showing the name of the dataset just loaded on the progress bar.
from tqdm.notebook import tqdm, trange
p_bar = tqdm(datalist)
for dataset_id in p_bar:
    # The returned arrays are overwritten on every iteration.
    X_raw, y_raw, idx_data, dataset_name = which_oml_dataset(dataset_id)
    p_bar.set_description(f'"{dataset_name}"')

In [None]:
# Scratch cell: the uncertainty-gated sampling loop run inline on a single
# dataset, with a seed set of one row per class.
# NOTE(review): idx_bag, classifier, cost, init_size and performance_history
# are not defined in this cell; they must already exist in the session.
ds = "1465_breast-tissue.arff"

X_raw, y_raw, idx_data, dataset_name = which_arff_dataset(ds)
   
from modAL.uncertainty import classifier_uncertainty

print(len(np.unique(y_raw)))
X_train, X_test, y_train, y_test = train_test_split(X_raw[idx_data[idx_bag][TRAIN]], y_raw[idx_data[idx_bag][TRAIN]], train_size= len(np.unique(y_raw)), stratify = y_raw[idx_data[idx_bag][TRAIN]])
print(y_train)

learner = ActiveLearner (
    estimator= which_classifier(classifier), #cls,
    query_strategy=uncertainty_sampling,
    X_training = X_train, y_training = y_train # ActiveLearner fits the classifier
)

uncertain_sample_score = learner.score(X_test, y_test)

total_of_samples = 1
# NOTE(review): `!=` never becomes false when cost < 1.
while (total_of_samples != cost):

    #X_train, X_test, y_train, y_test = train_test_split(X_raw, y_raw, train_size=0.03)

    # Random batch of init_size rows drawn from the whole bag.
    idx = np.random.choice(range(len(idx_data[idx_bag][TRAIN])), size=init_size, replace=False)
    X_train, y_train = X_raw[idx_data[idx_bag][TRAIN][idx]], y_raw[idx_data[idx_bag][TRAIN][idx]]

    # Only the first row of the batch decides whether the batch is taught.
    if classifier_uncertainty(learner, X_train[0].reshape(1,-1)) > 0.2:
        #print("IF", learner.score(X_test, y_test))
        learner.teach(X_train, y_train)
        uncertain_sample_score = learner.score(X_test, y_test)
        performance_history.append(uncertain_sample_score)
    total_of_samples = total_of_samples + 1


In [None]:
train_size= len(np.unique(y_raw)) + init_size