# Active Learner sobre 20 Newsgroups

## Imports

### Del proyecto

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import TransformerMixin
from sklearn.base import BaseEstimator
from sklearn import metrics

### Del framework

In [None]:
from core import ActiveLearner, Dataset, Oracle
from querys import CertaintySelector, UncertaintySelector, RandomSelector, MinDiffSelector, EntropySelector

## Funciones y constantes para el preprocesamiento y la partición de los datos

In [None]:
def get_n_each_category(dataset, n):
    """Return the indices of the first ``n`` samples of every category.

    Args:
        dataset: Object exposing ``target_names`` (one entry per category)
            and ``target`` (the category id of each sample, where ids are
            the positions ``0 .. len(target_names) - 1``).
        n: Maximum number of samples to pick per category.

    Returns:
        list: Selected positions in ascending order. A category with fewer
        than ``n`` samples contributes all the samples it has.
    """
    # One counter per known category id; ids outside this range never match.
    taken = dict.fromkeys(range(len(dataset.target_names)), 0)
    # Number of categories that still accept samples, so the scan can stop
    # as soon as every category is full.
    pending = len(taken) if n > 0 else 0
    selected = []
    # A single left-to-right pass yields the indices already sorted.
    for index, label in enumerate(dataset.target):
        if pending == 0:
            break
        if label in taken and taken[label] < n:
            selected.append(index)
            taken[label] += 1
            if taken[label] >= n:
                pending -= 1
    return selected

In [None]:
def remove_from_dataset(dataset, i):
    """Drop the sample at position ``i`` from ``dataset`` and return it.

    The dataset is modified in place: the entry is deleted from the ``data``
    list, while ``target`` and ``filenames`` are rebound to new arrays that
    lack position ``i``.
    """
    del dataset.data[i]
    # np.delete returns a copy, so each array attribute has to be rebound.
    for field in ('target', 'filenames'):
        setattr(dataset, field, np.delete(getattr(dataset, field), i))
    return dataset

In [None]:
def remove_many_from_dataset(dataset, indices):
    """Remove several samples from ``dataset`` at once and return it.

    Args:
        dataset: Object with a ``data`` list and ``target`` / ``filenames``
            arrays.
        indices: Iterable of positions to drop, all expressed against the
            dataset as it is before any removal. Order does not matter,
            duplicates are ignored and negative positions count from the
            end.

    Returns:
        The same ``dataset`` object: ``data`` is edited in place, ``target``
        and ``filenames`` are rebound to new arrays.

    Raises:
        IndexError: If a position is outside the dataset. The check happens
            before anything is removed.
    """
    size = len(dataset.data)
    doomed = set()
    for index in indices:
        position = int(index)
        if position < 0:
            position += size
        if not 0 <= position < size:
            raise IndexError(
                'index {} is out of range for a dataset of {} samples'.format(
                    index, size
                )
            )
        doomed.add(position)
    if not doomed:
        return dataset

    # Slice assignment keeps the identity of the list held by the dataset.
    dataset.data[:] = [
        sample for position, sample in enumerate(dataset.data)
        if position not in doomed
    ]
    # One np.delete call per array instead of one copy per removed sample.
    positions = np.array(sorted(doomed), dtype=np.intp)
    dataset.target = np.delete(dataset.target, positions)
    dataset.filenames = np.delete(dataset.filenames, positions)
    return dataset

In [None]:
def split_train_data(dataset, train_indices):
    """Move the samples at ``train_indices`` out of ``dataset``.

    Returns a tuple ``(dataset, train_data, train_target)``: the dataset
    with those samples removed, followed by the extracted documents and
    their targets as plain lists, in the order given by ``train_indices``.
    """
    # Read every sample before removing anything, since removal shifts the
    # positions of the samples that follow.
    picked = [(dataset.data[idx], dataset.target[idx]) for idx in train_indices]
    train_data = [document for document, _ in picked]
    train_target = [label for _, label in picked]
    remaining = remove_many_from_dataset(dataset, train_indices)
    return remaining, train_data, train_target

In [None]:
def clean_dataset(dataset, min_words=100):
    """Normalise every document and drop the uninformative ones.

    Each entry of ``dataset.data`` is stripped and lower-cased in place.
    A document is then removed when it contains no ASCII letter at all
    (which includes empty documents) or when it has fewer than
    ``min_words`` space-separated chunks.

    Args:
        dataset: Object with a ``data`` list of strings, accepted by
            ``remove_many_from_dataset``.
        min_words: Minimum number of space-separated chunks a document must
            have to be kept. Defaults to 100, the threshold previously
            hard-coded here.

    Returns:
        The dataset returned by ``remove_many_from_dataset``.
    """
    letters = set("abcdefghijklmnopqrstuvwxyz")
    to_remove = []
    for i, raw in enumerate(dataset.data):
        text = raw.strip().lower()
        dataset.data[i] = text
        # any() is False for an empty string, so blank documents are caught
        # by the same test as documents without letters.
        has_letters = any(c in letters for c in text)
        # NOTE(review): split(' ') splits on single spaces only, so newlines
        # and tabs do not separate words and repeated spaces produce empty
        # chunks that are counted. Confirm this is the intended word count.
        if not has_letters or len(text.split(' ')) < min_words:
            to_remove.append(i)

    return remove_many_from_dataset(dataset, to_remove)

In [None]:
# Subset of the 20 Newsgroups categories; the entries that are commented out
# are excluded from the selection.
# NOTE(review): `categories` is reassigned to a four-category list in the next
# cell, so this six-category selection is not the one used afterwards.
categories = [
    'alt.atheism',
    'comp.graphics',
#    'comp.os.ms-windows.misc',
#    'comp.sys.ibm.pc.hardware',
#    'comp.sys.mac.hardware',
#    'comp.windows.x',
    'misc.forsale',
#    'rec.autos',
    'rec.motorcycles',
#    'rec.sport.baseball',
#    'rec.sport.hockey',
#    'sci.crypt',
#    'sci.electronics',
    'sci.med',
#    'sci.space',
#    'soc.religion.christian',
    'talk.politics.guns',
#    'talk.politics.mideast',
#    'talk.politics.misc',
#    'talk.religion.misc',
]

In [None]:
# Four-category selection; it replaces the `categories` list built in the
# previous cell and is the one passed to fetch_20newsgroups below.
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']

## Obtengo los datos

Solo las categorías definidas anteriormente

In [None]:
# Training split restricted to `categories`, requested with the 'headers',
# 'footers' and 'quotes' parts of every post removed.
dataset = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), categories=categories)

Limpio los artículos que no posean información relevante; por ejemplo, hay más de uno que está vacío.

In [None]:
#dataset = clean_dataset(dataset)

Selecciono 10 instancias de cada clase para iniciar el Active Learning

In [None]:
# Take the first 10 samples of each category as the initial labelled set...
train_indices = get_n_each_category(dataset, 10)
# ...and remove them from `dataset`, whose remaining documents are used as the
# unlabeled pool further down.
dataset, train_data, train_target = split_train_data(dataset, train_indices)

Datos para el testing

In [None]:
# Held-out split, fetched with the same categories and the same removed parts
# as the training split.
test_dataset = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
    categories=categories,
)
# Cleaning of the test split is currently disabled:
#test_dataset = clean_dataset(test_dataset)
test_data, test_target = test_dataset.data, test_dataset.target

## Obtengo los features TF-IDF

vectorizer = TfidfVectorizer()  # fitted below on the initial labelled documents only
X_train = vectorizer.fit_transform(train_data)
X_unlabeled = vectorizer.transform(dataset.data)  # pool and test documents reuse the vocabulary fitted on train_data
X_test = vectorizer.transform(test_data)  # NOTE(review): X_train, X_unlabeled and X_test are reassigned from raw text in the next cell, so these matrices look unused; confirm

In [None]:
# Wrap every raw document in a one-element row, giving column arrays of shape
# (n_documents, 1); SampleExtractor takes element 0 of each row again inside
# the pipeline.
X_train = np.array(train_data).reshape(-1, 1)
X_unlabeled = np.array(dataset.data).reshape(-1, 1)
X_test = np.array(test_data).reshape(-1, 1)

## Instancio lo mínimo necesario para el framework

Esto es, el **Dataset** y el **Oracle**. Además, modifico el Oracle para que, en lugar de pedir las etiquetas al usuario etiquetador, las devuelva él mismo. Esto lo hago para acelerar el proceso de evaluación del framework.

In [None]:
class NGDataset(Dataset):
    """Framework ``Dataset`` tied to the module-level 20 Newsgroups data."""

    # Class-level reference to the module-level `dataset` object (the right-hand
    # side is resolved in the enclosing module scope when the class is created).
    dataset = dataset

    def get_unlabeled_readable(self, i):
        """Return the stored target of sample ``i`` rather than its text.

        NOTE(review): presumably the framework passes this value to
        ``Oracle.ask``, which is how the oracle can answer without a human.
        It also assumes ``i`` is a valid position in ``self.dataset.target``
        for the current unlabeled pool. Confirm both against the framework.
        """
        # Alternative kept for reference: return the document text instead.
        #return self.dataset.data[i]
        return self.dataset.target[i]


class NewsGroupOracle(Oracle):
    """Oracle that answers by itself instead of prompting a human labeller."""

    # Category names taken from the module-level `dataset`.
    target_names = dataset.target_names

    def ask(self, X_readable, recoms):
        """Return ``X_readable`` unchanged as the answer; ``recoms`` is unused.

        NOTE(review): presumably ``X_readable`` carries what
        ``NGDataset.get_unlabeled_readable`` produced (the stored targets).
        Confirm against the framework.
        """
        return X_readable

## Evaluación Final

Comparación de los selectores de Active Learning y también del RandomSelector (el cual es equivalente a no aplicar Active Learning). Cada selector del framework se ejecuta una sola vez, excepto el RandomSelector, que es el único que tiene un comportamiento no determinístico; por esta razón se ejecutan 5 instancias del mismo y luego se calcula el promedio de ellas para ver su desempeño medio.

In [None]:
class SampleExtractor(BaseEstimator, TransformerMixin):
    """Stateless transformer that unwraps one-element rows.

    It turns a sequence of rows into a flat list holding the first item of
    each row.
    """

    def fit(self, x, y=None):
        """Learn nothing; return ``self`` so the step can be chained."""
        return self

    def transform(self, x, y=None):
        """Return a list with item 0 of every row of ``x``; ``y`` is unused."""
        return [row[0] for row in x]

In [None]:
# One run for each of the first four selectors, followed by five runs of
# RandomSelector (positions 4-8) so that its scores can be averaged later.
selectors = [
    CertaintySelector,
    UncertaintySelector,
    MinDiffSelector,
    EntropySelector,
] + [RandomSelector] * 5

In [None]:
import time

In [None]:
# Size of the experiment: query rounds per selector and samples labelled in
# each round.
# NOTE(review): N_ROUNDS * BATCH_SIZE samples are requested per run; confirm
# the unlabeled pool holds at least that many.
N_ROUNDS = 300
BATCH_SIZE = 10

# One list of test scores per entry of `selectors`, in the same order.
selector_scores = []
# perf_counter() is a monotonic clock, so the measured duration is not
# distorted by system clock adjustments during the run.
a = time.perf_counter()
for selector in selectors:
    # Every run starts from a fresh label array and a fresh model.
    y_train = np.array(train_target)
    ngdataset = NGDataset(X_train, y_train, X_unlabeled)

    model = Pipeline([
        ('extractor', SampleExtractor()),
        ('tfidf', TfidfVectorizer()),
        ('model', MultinomialNB(alpha=0.1))
    ])
    oracle = NewsGroupOracle()
    al = ActiveLearner(model, ngdataset, selector, oracle)

    # Baseline score with only the initial labelled samples.
    al.fit()
    scores = [al.model.score(X_test, test_target)]

    # Each round: select a batch, obtain its labels, tag, retrain, score.
    for _ in tqdm(range(N_ROUNDS)):
        selected = al.select(BATCH_SIZE)
        y = al.ask(selected)
        al.tag_elements(selected, y)
        al.fit()
        scores.append(al.model.score(X_test, test_target))

    selector_scores.append(scores)
b = time.perf_counter()

In [None]:
# Elapsed seconds for the whole comparison: `a` and `b` are the timestamps
# taken right before and right after the loop over `selectors`.
b - a

In [None]:
# Per-iteration mean score of the RandomSelector runs, which occupy positions
# 4 onwards in `selector_scores`. The divisor follows the actual number of
# runs instead of a hard-coded 5, so changing the number of repetitions
# cannot silently skew the average.
random_runs = selector_scores[4:]
random_avg = [
    sum(step_scores) / len(random_runs)
    for step_scores in zip(*random_runs)
]

In [None]:
import matplotlib

# Global font settings for the figures. 'normal' is a valid weight but not a
# font family: with it matplotlib cannot resolve the family and falls back to
# its default while emitting findfont warnings, so the generic 'sans-serif'
# family is requested explicitly.
font = {
    'family': 'sans-serif',
    'weight': 'normal',
    'size': 20,
}

matplotlib.rc('font', **font)

In [None]:
plt.figure(figsize=(20, 15))
plt.title('Comparación de selectores')
plt.xlabel('Iteraciones')
plt.ylabel('Precision')
# First four runs: one curve each, drawn with the default colors.
for run in range(4):
    plt.plot(selector_scores[run][:500])
# Remaining five runs (RandomSelector): all drawn in silver.
for run in range(4, 9):
    plt.plot(selector_scores[run][:500], color='silver')
# Averaged random curve, currently disabled:
#plt.plot(random_avg[:500], color='black')
# Labels are matched to the curves in drawing order, so 'Random' is attached
# to the first silver curve.
plt.legend(['Certainty', 'Uncertainty', 'MinDiff', 'Entropy', 'Random'])
plt.savefig('clean_4categories')
plt.show()