# Active Learning with small-text

In [46]:
import numpy as np
import torch

from datasets import load_dataset, Features, Value, ClassLabel
from transformers import AutoTokenizer

from small_text.active_learner import PoolBasedActiveLearner
from small_text.base import LABEL_UNLABELED
from small_text.integrations.transformers import TransformersDataset, TransformerModelArguments
from small_text.integrations.transformers.classifiers.factories import TransformerBasedClassificationFactory
from small_text.query_strategies import BreakingTies

In [47]:
# ---- Notebook configuration -------------------------------------------------
# Checkpoint name shared by the tokenizer and the classifier factory.
TRANSFORMER_MODEL = 'distilbert-base-uncased'

# Hub repository that provides the labeled / unlabeled CSV files.
DATASET_NAME = 'bergr7/weakly_supervised_ag_news'

# Size of every batch that is drawn from the pool and sent out for annotation.
NUM_SAMPLES = 5

# Class names, read from the label feature of the original AG News train split.
LABELS = load_dataset("ag_news")["train"].features["label"].names

Using custom data configuration default
Found cached dataset ag_news (/Users/alekshiidenhovi/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)
100%|██████████| 2/2 [00:00<00:00, 259.71it/s]


In [48]:
# Sanity check: show the class names read from the AG News metadata.
print(LABELS)

['World', 'Sports', 'Business', 'Sci/Tech']


In [49]:
# CSV files inside the Hub repository, keyed by the split name they load as.
labeled_data_files = {
    "train": "train.csv",
    "validation": "validation.csv",
    "test": "test.csv",
}
unlabeled_data_files = {"unlabeled": "unlabeled_train.csv"}

# Column schemas. The label feature is derived from LABELS instead of repeating
# the class names and their count, so the schema cannot drift from the config.
labeled_features = Features(
    {
        "text": Value("string"),
        "label": ClassLabel(num_classes=len(LABELS), names=LABELS),
    }
)
# The unlabeled file only carries the raw text column.
unlabeled_features = Features({"text": Value("string")})

# Load the labeled splits (train / validation / test) ...
labeled_dataset = load_dataset(
    DATASET_NAME,
    data_files=labeled_data_files,
    features=labeled_features,
)

# ... and the unlabeled split separately, because it has a different schema.
unlabeled_dataset = load_dataset(
    DATASET_NAME,
    data_files=unlabeled_data_files,
    features=unlabeled_features,
)

Using custom data configuration bergr7--weakly_supervised_ag_news-6f78f309523478bd
Found cached dataset csv (/Users/alekshiidenhovi/.cache/huggingface/datasets/bergr7___csv/bergr7--weakly_supervised_ag_news-6f78f309523478bd/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)
100%|██████████| 3/3 [00:00<00:00, 927.67it/s]
Using custom data configuration bergr7--weakly_supervised_ag_news-9442e7dc9bdd01c3
Found cached dataset csv (/Users/alekshiidenhovi/.cache/huggingface/datasets/bergr7___csv/bergr7--weakly_supervised_ag_news-9442e7dc9bdd01c3/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)
100%|██████████| 1/1 [00:00<00:00, 536.22it/s]


## Tokenization

In [50]:
# Tokenizer matching the transformer checkpoint chosen in the config cell.
tokenizer = AutoTokenizer.from_pretrained(TRANSFORMER_MODEL)


def tokenize(examples):
    """Tokenize the "text" column of a batch.

    The texts are truncated and padded to the tokenizer's maximum length.

    Args:
        examples: batch passed in by ``Dataset.map(..., batched=True)``; only
            its "text" entry is read.

    Returns:
        Whatever the tokenizer call returns for the batch of texts.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding="max_length")


# Tokenize every split and drop the raw text column afterwards.
unlabeled_tokenized = unlabeled_dataset.map(
    tokenize,
    batched=True,
    remove_columns=["text"],
)
labeled_tokenized = labeled_dataset.map(
    tokenize,
    batched=True,
    remove_columns=["text"],
)

# Have both tokenized dataset dicts return torch-formatted values on access.
unlabeled_tokenized.set_format("torch")
labeled_tokenized.set_format("torch")

100%|██████████| 59/59 [00:07<00:00,  7.88ba/s]
100%|██████████| 38/38 [00:04<00:00,  7.87ba/s]
100%|██████████| 24/24 [00:03<00:00,  7.91ba/s]
100%|██████████| 8/8 [00:00<00:00,  8.50ba/s]


## Training Dataset

In [52]:
# Raw texts and integer class ids of the labeled training split.
train_text = list(labeled_dataset["train"]["text"])
training_labels = np.array(labeled_dataset["train"]["label"])

# small-text works with integer class ids, so the target labels are the ids
# 0..len(LABELS)-1 rather than the class-name strings.
target_labels = np.arange(len(LABELS))

# Create the dataset for small-text. The second argument must be the label
# array aligned with the texts (one id per text), not the list of class names.
training_dataset = TransformersDataset.from_arrays(
    train_text,
    training_labels,
    tokenizer,
    target_labels=target_labels,
)



## Validation Dataset

In [53]:
# Pull the text and label columns of the validation split in one go each,
# then wrap them in a small-text dataset that shares the training target labels.
validation_text = list(labeled_dataset["validation"]["text"])
validation_labels = np.array(labeled_dataset["validation"]["label"])
validation_dataset = TransformersDataset.from_arrays(
    validation_text,
    validation_labels,
    tokenizer,
    target_labels=target_labels,
)

## Testing Dataset

In [None]:
# Raw texts and integer class ids of the held-out test split.
test_text = list(labeled_dataset["test"]["text"])
test_labels = np.array(labeled_dataset["test"]["label"])

# The label array (one id per text) is passed as `y`; passing the class-name
# list here would not match the number of texts.
test_dataset = TransformersDataset.from_arrays(
    test_text,
    test_labels,
    tokenizer,
    target_labels=target_labels,
)

## Unlabeled Dataset

In [None]:
# Collect the raw texts of the "unlabeled" split as a plain list.
unlabeled_text = list(unlabeled_dataset["unlabeled"]["text"])
# Disabled: wrapping the unlabeled texts for small-text (note that enabling
# this line as written would rebind `unlabeled_dataset`).
# unlabeled_dataset = TransformersDataset.from_arrays(unlabeled_text, LABELS, tokenizer, target_labels=target_labels)

## Active Learning Strategy

In [None]:
# Define our classifier
# Use the GPU when PyTorch can see one and fall back to the CPU otherwise, so
# the notebook also runs on machines without CUDA.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

clf_factory = TransformerBasedClassificationFactory(
    TransformerModelArguments(TRANSFORMER_MODEL),
    num_classes=len(target_labels),
    kwargs={"device": DEVICE},
)

# Define our query strategy
query_strategy = BreakingTies()

# The pool handed to the active learner is `training_dataset`; which of its
# samples count as labeled is communicated later through
# `initialize_data(...)` and `update(...)` in the active learning loop.
active_learner = PoolBasedActiveLearner(clf_factory, query_strategy, training_dataset)

In [None]:
from small_text.initialization import random_initialization

# Seed NumPy's global RNG before drawing (presumably so the random draw below
# is reproducible across runs -- confirm against small-text's implementation).
np.random.seed(42)

# Randomly draw an initial subset from the data pool. The pool is
# `training_dataset` (the dataset given to the active learner), and the batch
# size NUM_SAMPLES comes from the config cell rather than being redefined here.
initial_indices = random_initialization(training_dataset, NUM_SAMPLES)

In [None]:
import rubrix as rb

# Address of the Rubrix server the client should talk to.
RUBRIX_API_URL = "http://rubrix:80"

# Point the Rubrix client at that server.
rb.init(api_url=RUBRIX_API_URL)

In [None]:
# Choose a name for the dataset
# NOTE(review): this rebinds DATASET_NAME, which the config cell uses for the
# Hub repository. Re-running the data-loading cell after this one would try to
# load the Rubrix name; a dedicated constant would be cleaner, but the listener
# cell reads DATASET_NAME, so the name is kept here.
DATASET_NAME = "test_with_active_learning_test"

# Define labeling schema
settings = rb.TextClassificationSettings(label_schema=LABELS)

# Create dataset with a label schema
rb.configure_dataset(name=DATASET_NAME, settings=settings)

# Create records from the initial batch. The indices refer to positions in the
# pool, which was built from `train_text`, so the texts are looked up there.
records = [
    rb.TextClassificationRecord(
        text=train_text[idx],
        metadata={"batch_id": 0},
        id=idx,
    )
    for idx in initial_indices
]

# Log initial records to Rubrix
rb.log(records, DATASET_NAME)

In [None]:
from rubrix.listeners import listener
from sklearn.metrics import accuracy_score

# Define some helper variables
# Map every class name to its integer id; derived from LABELS so the mapping
# follows the label list instead of assuming exactly four classes.
LABEL2INT = {label: index for index, label in enumerate(LABELS)}
# Test-set accuracy recorded after every processed batch.
ACCURACIES = []

# Set up the active learning loop with the listener decorator
@listener(
    dataset=DATASET_NAME,
    query="status:Validated AND metadata.batch_id:{batch_id}",
    condition=lambda search: search.total==NUM_SAMPLES,
    execution_interval_in_seconds=3,
    batch_id=0
)
def active_learning_loop(records, ctx):
    """Run one active learning step for a fully annotated batch.

    The step feeds the annotations to the active learner, queries the next
    batch, logs it to Rubrix and records the test accuracy of the current
    classifier in ``ACCURACIES``.

    Args:
        records: the Rubrix records matched by the listener query; their
            ``annotation`` and ``id`` attributes are read.
        ctx: listener context; ``ctx.query_params["batch_id"]`` holds the id of
            the batch being processed and is incremented for the next batch.
    """

    # 1. Update active learner
    print(f"Updating with batch_id {ctx.query_params['batch_id']} ...")
    print('Please go to rubrix to label the data...')
    # Convert the annotated class names into integer class ids.
    y = np.array([LABEL2INT[rec.annotation] for rec in records])

    print(f"{NUM_SAMPLES} records have been labeled updating active learner...")
    # initial update
    if ctx.query_params["batch_id"] == 0:
        # The record ids are the pool indices chosen for the initial batch.
        indices = np.array([rec.id for rec in records])
        active_learner.initialize_data(indices, y)
    # update with the prior queried indices
    else:
        active_learner.update(y)
    print("Done!")

    # 2. Query active learner
    print("Querying new data points ...")
    queried_indices = active_learner.query(num_samples=NUM_SAMPLES)
    # Advance the batch id so the listener query targets the new batch.
    ctx.query_params["batch_id"] += 1
    new_records = [
        rb.TextClassificationRecord(
            # Queried indices point into the pool, which was built from train_text.
            text=train_text[idx],
            metadata={"batch_id": ctx.query_params["batch_id"]},
            id=idx,
        )
        for idx in queried_indices
    ]

    # 3. Log the batch to Rubrix
    rb.log(new_records, DATASET_NAME)

    # 4. Evaluate current classifier on the test set
    print("Evaluating current classifier ...")
    accuracy = accuracy_score(
        test_dataset.y,
        active_learner.classifier.predict(test_dataset),
    )
    ACCURACIES.append(accuracy)
    print("Done!")

    print("Waiting for annotations ...")

In [None]:
# Start the listener created by the @listener decorator on active_learning_loop.
active_learning_loop.start()

In [None]:
# Stop the listener once annotation is finished.
active_learning_loop.stop()