# Comparando estratégias

- Amostra por incerteza
- Amostragem aleatória
- Consulta por comitê
- Aprendizado passivo
- Redução do erro esperado

In [1]:
import numpy as np
np.seterr(divide='ignore', invalid='ignore')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [2]:
# Plots
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

In [3]:
from copy import deepcopy

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling

In [6]:
from timeit import default_timer as timer

In [7]:
from sklearn.model_selection import ShuffleSplit

## Classificadores

### Algoritmos

In [8]:
from sklearn.neighbors import KNeighborsClassifier

### Conjunto de dados

In [9]:
from sklearn.datasets import load_iris, load_digits, load_wine, load_breast_cancer

## Amostra por incerteza

In [54]:
def uncertain_sampling(X_raw, y_raw, dx_data, idx_bag, classifier, init_size, n_queries):
    
    from modAL.uncertainty import classifier_uncertainty
    start = timer()
    sample_size = 0
    performance_history = []
    
    initial_idx = np.random.choice(range(len(idx_data[idx_bag][0])), size=init_size, replace=False)
    X_train, y_train = X_raw[idx_data[idx_bag][0][initial_idx]], y_raw[idx_data[idx_bag][0][initial_idx]]
    X_test, y_test = X_raw[idx_data[idx_bag][1]], y_raw[idx_data[idx_bag][1]]
    
    sample_size = sample_size + len(X_train)

    classifier.fit(X_train,y_train)

    learner = ActiveLearner (
        estimator=classifier,
        query_strategy=uncertainty_sampling
    )
    uncertain_sample_score = learner.score(X_test, y_test)

    performance_history.append(uncertain_sample_score)

    total_of_samples = 0
    while (total_of_samples != n_queries - 1):
        
        #X_train, X_test, y_train, y_test = train_test_split(X_raw, y_raw, train_size=0.03)
        
        idx = np.random.choice(range(len(idx_data[idx_bag][0])), size=init_size, replace=False)
        X_train, y_train = X_raw[idx_data[idx_bag][0][idx]], y_raw[idx_data[idx_bag][0][idx]]
        print("WHILE", classifier_uncertainty(learner, X_train[0].reshape(1,-1)))
        if classifier_uncertainty(learner, X_train[0].reshape(1,-1)) > 0.2:
            print("IF", learner.score(X_test, y_test))
            sample_size = sample_size + len(X_train)
            learner.teach(X_train, y_train)
            uncertain_sample_score = learner.score(X_test, y_test)
            performance_history.append(uncertain_sample_score)
            #print("Uncertainty Sampling score: ", uncertain_sample_score)
            performance_history.append(uncertain_sample_score)    
        total_of_samples = total_of_samples + 1
    
    end = timer()
    time_elapsed = end - start
    
    return { "performance_history": performance_history, 
             "time_elapsed": time_elapsed,
             "sample_size": sample_size,
             "Strategy": "Uncertainty Sampling"}

## Amostragem aleatória

In [None]:
def random_sampling(X_raw, y_raw, n_queries):
    
    performance_history = []

    for i in range(1,n_queries+1):

        #high = X_raw.shape[0] = qtd amostras no dataset
        training_indices = np.random.randint(low=0, high=X_raw.shape[0], size=k+i)

        X_train = X_raw[training_indices]
        y_train = y_raw[training_indices]

        X_test = np.delete(X_raw, training_indices, axis=0)
        y_test = np.delete(y_raw, training_indices, axis=0)

        knn = KNeighborsClassifier(k)
        knn.fit(X_train, y_train)

        pred = knn.predict(X_test)
        #print("Random Sampling score: ", knn.score(X_test,y_test))
        performance_history.append(knn.score(X_test,y_test))

    performance_history_total.append(performance_history)
    legend.append("Random Sampling")

## Consulta por comitê

In [None]:
def query_by_committee(X_raw,y_raw,n_queries):

    from modAL.models import ActiveLearner, Committee
    from modAL.disagreement import vote_entropy_sampling
    performance_history = []

    for i in range(1,n_queries+1):

        learner_list = []

        for j in range(1,n_queries+1):

            X_train, _, y_train, _ = train_test_split(X_raw, y_raw, train_size=0.03)

            knn = KNeighborsClassifier(k)
            #knn.fit(X_train,y_train)

            # initializing learner
            learner = ActiveLearner(
                estimator=knn,
                query_strategy=uncertainty_sampling,
                X_training = X_train, y_training = y_train 
            )
            learner_list.append(learner)

        # assembling the committee
        committee = Committee(
            learner_list=learner_list,
            query_strategy=vote_entropy_sampling)

        X_pool, y_pool = X_raw, y_raw

        # query by committee
        for idx in range(n_queries):
            query_idx, query_instance = committee.query(X_pool, n_instances = k+1)
            committee.teach(
                X=X_pool[query_idx],
                y=y_pool[query_idx]
            )

            X_pool = np.delete(X_pool, query_idx, axis=0)
            y_pool = np.delete(y_pool, query_idx)

        #print(idx, n_queries, "Query by Committee: ", committee.score(X_pool, y_pool))

        performance_history.append(committee.score(X_pool, y_pool))

    performance_history_total.append(performance_history)
    legend.append("Query by committee")

### Setup

In [11]:
def which_dataset(dataset = "iris", n_splits = 5):
    
    # Futuramente essa etapa será ajustada para receber qualquer dataset (ou lista com datasets)
    if (dataset == "iris"):
        data = load_iris()
        X_raw = data['data']
        y_raw = data['target']
        
    # cross validation bags
    data_cv = ShuffleSplit(n_splits= n_splits, test_size=0.3, random_state=0) #n_splits
    
    # extraindo ids do data_cv
    idx_data = []
    for train_index, test_index in data_cv.split(X_raw):
            idx_data.append([train_index, test_index])

    return X_raw, y_raw, idx_data

In [12]:
def which_classifier(parameters, classifier = 'k'):
    
    if (classifier == 'k'):
        return KNeighborsClassifier(parameters)

In [13]:
n_queries = 10
n_splits = 5
k = 3
performance_history_total = []
legend = []

X_raw, y_raw, idx_data = which_dataset()


#uncertain_sampling(deepcopy(X_raw), deepcopy(y_raw), n_queries)
#random_sampling(deepcopy(X_raw), deepcopy(y_raw), n_queries)
#query_by_committee(X_raw,y_raw,n_queries)

In [55]:
performance_history = []
for idx_bag, cv_bag in enumerate(idx_data):
    classifier = which_classifier(k)
    uncertain_score = uncertain_sampling(X_raw, y_raw, idx_data, idx_bag, classifier, k, n_queries)
    performance_history.append(uncertain_score)
performance_history_total.append(performance_history)

WHILE [0.66666667]
IF 0.35555555555555557
WHILE [0.33333333]
IF 0.35555555555555557
WHILE [0.33333333]
IF 0.6
WHILE [0.]
WHILE [0.]
WHILE [0.]
WHILE [0.]
WHILE [0.33333333]
IF 0.9111111111111111
WHILE [0.33333333]
IF 0.9111111111111111
WHILE [0.66666667]
IF 0.26666666666666666
WHILE [0.33333333]
IF 0.37777777777777777
WHILE [0.33333333]
IF 0.6444444444444445
WHILE [0.]
WHILE [0.]
WHILE [0.]
WHILE [0.33333333]
IF 0.8222222222222222
WHILE [0.33333333]
IF 0.8888888888888888
WHILE [0.]
WHILE [0.66666667]
IF 0.35555555555555557
WHILE [0.33333333]
IF 0.37777777777777777
WHILE [0.33333333]
IF 0.6888888888888889
WHILE [0.33333333]
IF 0.6888888888888889
WHILE [0.]
WHILE [0.33333333]
IF 0.7111111111111111
WHILE [0.]
WHILE [0.]
WHILE [0.]
WHILE [0.33333333]
IF 0.35555555555555557
WHILE [0.33333333]
IF 0.35555555555555557
WHILE [0.33333333]
IF 0.7333333333333333
WHILE [0.33333333]
IF 0.8
WHILE [0.]
WHILE [0.]
WHILE [0.]
WHILE [0.]
WHILE [0.]
WHILE [0.66666667]
IF 0.4
WHILE [0.66666667]
IF 0.4
WHIL

In [56]:
performance_history

[{'performance_history': [0.35555555555555557,
   0.35555555555555557,
   0.35555555555555557,
   0.6,
   0.6,
   0.9111111111111111,
   0.9111111111111111,
   0.9111111111111111,
   0.9111111111111111,
   0.9111111111111111,
   0.9111111111111111,
   0.9111111111111111,
   0.9111111111111111,
   0.8888888888888888,
   0.8888888888888888],
  'time_elapsed': 0.09302545199898304,
  'sample_size': 18,
  'Strategy': 'Uncertainty Sampling'},
 {'performance_history': [0.26666666666666666,
   0.37777777777777777,
   0.37777777777777777,
   0.6444444444444445,
   0.6444444444444445,
   0.8222222222222222,
   0.8222222222222222,
   0.8222222222222222,
   0.8222222222222222,
   0.8222222222222222,
   0.8888888888888888,
   0.8888888888888888,
   0.9333333333333333,
   0.9333333333333333,
   0.9333333333333333],
  'time_elapsed': 0.05336953299956804,
  'sample_size': 18,
  'Strategy': 'Uncertainty Sampling'},
 {'performance_history': [0.35555555555555557,
   0.37777777777777777,
   0.377777777777

In [None]:
#  https://seaborn.pydata.org/examples/scatterplot_sizes.html