# Comparando estratégias

- Amostra por incerteza
- Amostragem aleatória
- Consulta por comitê
- Aprendizado passivo
- Redução do erro esperado

In [1]:
import numpy as np
np.seterr(divide='ignore', invalid='ignore')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [2]:
# Plots
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

In [3]:
from copy import deepcopy

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling

## Classificadores

### Algoritmos

In [6]:
from sklearn.neighbors import KNeighborsClassifier

### Conjunto de dados

In [7]:
from sklearn.datasets import load_iris, load_digits, load_wine, load_breast_cancer

## Amostra por incerteza

In [8]:
def uncertain_sampling(X_raw, y_raw, n_queries):
    from modAL.uncertainty import classifier_uncertainty

    perfomance_history = []

    X_train, X_test, y_train, y_test = train_test_split(X_raw, y_raw, train_size=0.03, random_state=1)

    initial_idx = np.random.choice(range(len(X_raw)), size=k+1, replace=False)
    X_train, y_train = X_raw[initial_idx], y_raw[initial_idx]

    knn = KNeighborsClassifier(k)
    knn.fit(X_train,y_train)

    learner = ActiveLearner (
        estimator=knn,
        query_strategy=uncertainty_sampling
    )
    uncertain_sample_score = learner.score(X_test, y_test)

    perfomance_history.append(uncertain_sample_score)

    total_of_samples = 0
    while (total_of_samples != n_queries - 1):
        X_train, X_test, y_train, y_test = train_test_split(X_raw, y_raw, train_size=0.03)
        if classifier_uncertainty(learner, X_test[0].reshape(1,-1)) > 0.2:
            learner.teach(X_train, y_train)
            uncertain_sample_score = learner.score(X_test, y_test)
            perfomance_history.append(uncertain_sample_score)
            print("Uncertainty Sampling score: ", uncertain_sample_score)
            total_of_samples = total_of_samples + 1
    performance_history_total.append(perfomance_history)
    legend.append("Uncertainty Sampling")

## Amostragem aleatória

In [9]:
def random_sampling(X_raw, y_raw, n_queries):
    
    performance_history = []

    for i in range(1,n_queries+1):

        #high = X_raw.shape[0] = qtd amostras no dataset
        training_indices = np.random.randint(low=0, high=X_raw.shape[0], size=k+i)

        X_train = X_raw[training_indices]
        y_train = y_raw[training_indices]

        X_test = np.delete(X_raw, training_indices, axis=0)
        y_test = np.delete(y_raw, training_indices, axis=0)

        knn = KNeighborsClassifier(k)
        knn.fit(X_train, y_train)

        pred = knn.predict(X_test)
        print("Random Sampling score: ", knn.score(X_test,y_test))
        performance_history.append(knn.score(X_test,y_test))

    performance_history_total.append(performance_history)
    legend.append("Random Sampling")

## Consulta por comitê

In [18]:
#def query_by_committee(X_raw,y_raw,n_queries):
    
from modAL.models import ActiveLearner, Committee
from modAL.disagreement import vote_entropy_sampling
performance_history = []

for i in range(1,n_queries+1):

    learner_list = []
    
    for j in range(1,n_queries+1):
    
        X_train, _, y_train, _ = train_test_split(X_raw, y_raw, train_size=0.03)

        knn = KNeighborsClassifier(k)
        #knn.fit(X_train,y_train)

        # initializing learner
        learner = ActiveLearner(
            estimator=knn,
            query_strategy=uncertainty_sampling,
            X_training = X_train, y_training = y_train 
        )
        learner_list.append(learner)

    # assembling the committee
    committee = Committee(
        learner_list=learner_list,
        query_strategy=vote_entropy_sampling)
    
    X_pool, y_pool = X_raw, y_raw
    
    # query by committee
    for idx in range(n_queries):
        query_idx, query_instance = committee.query(X_pool, n_instances = k+1)
        committee.teach(
            X=X_pool[query_idx],
            y=y_pool[query_idx]
        )
        
        X_pool = np.delete(X_pool, query_idx, axis=0)
        y_pool = np.delete(y_pool, query_idx)
        
    print(idx, n_queries, "Query by Committee: ", committee.score(X_pool, y_pool))

    #performance_history.append()
        
#performance_history_total.append(performance_history)
#legend.append("Query by committee")

9 10 Query by Committee:  0.990909090909091
9 10 Query by Committee:  0.9818181818181818
9 10 Query by Committee:  0.9727272727272728
9 10 Query by Committee:  0.9818181818181818
9 10 Query by Committee:  0.9636363636363636
9 10 Query by Committee:  0.9727272727272728
9 10 Query by Committee:  0.9454545454545454
9 10 Query by Committee:  0.9818181818181818
9 10 Query by Committee:  0.9727272727272728
9 10 Query by Committee:  0.9636363636363636


In [None]:
performance_history

### Setup

In [11]:
n_queries = 10
k = 3
performance_history_total = []
legend = []

data = load_iris()
X_raw = data['data']
y_raw = data['target']

uncertain_sampling(deepcopy(X_raw), deepcopy(y_raw), n_queries)
random_sampling(deepcopy(X_raw), deepcopy(y_raw), n_queries)

Uncertainty Sampling score:  0.6575342465753424
Uncertainty Sampling score:  0.4452054794520548
Uncertainty Sampling score:  0.4931506849315068
Uncertainty Sampling score:  0.8356164383561644
Uncertainty Sampling score:  0.8972602739726028
Uncertainty Sampling score:  0.8972602739726028
Uncertainty Sampling score:  0.9041095890410958
Uncertainty Sampling score:  0.9041095890410958
Uncertainty Sampling score:  0.9178082191780822
Random Sampling score:  0.3219178082191781
Random Sampling score:  0.6620689655172414
Random Sampling score:  0.6527777777777778
Random Sampling score:  0.7655172413793103
Random Sampling score:  0.9225352112676056
Random Sampling score:  0.9219858156028369
Random Sampling score:  0.8928571428571429
Random Sampling score:  0.900709219858156
Random Sampling score:  0.8768115942028986
Random Sampling score:  0.644927536231884
