# Intent classification

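This notebook fine-tunes `xlm-roberta-base` to identify the speaker's intention, classifying each message as `direct`, `indirect`, or `chitchat`. We start by loading a dataset of example questions (`preguntas.json`) and splitting it into training and test sets.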

In [1]:
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer,AutoTokenizer,DataCollatorWithPadding
import numpy as np
import torch
# Ask PyTorch to release any cached, unused GPU memory before models are loaded.
torch.cuda.empty_cache()

In [2]:
# Read every record of the JSON file as a single split, then hold out 10% of it
# for evaluation. The fixed seed keeps the split identical across runs.
questions = load_dataset("json", data_files="preguntas.json", split="train")
dataset = questions.train_test_split(test_size=0.1, seed=1)

# Sanity check: show the first example of each split.
print(dataset["train"][0], "\n", dataset["test"][0])

{'text': 'Hace unos días, mientras trabajaba en mi jardín, observé cómo las flores que planté meses atrás finalmente alcanzaban su pleno esplendor. Este proceso de crecimiento y transformación me hizo reflexionar sobre la paciencia y la dedicación necesarias para ver florecer los frutos de nuestro trabajo. ¿Tienes alguna experiencia en la que la paciencia haya sido clave para alcanzar tus objetivos?', 'labels': 'chitchat'} 
 {'text': 'Estoy considerando unirme a un club de lectura local. Me encantaría conocer gente nueva y compartir ideas sobre libros. ¿Alguna vez participaste en un club de lectura?', 'labels': 'chitchat'}


In [3]:
   
# Checkpoint name reused for both the tokenizer and the classifier.
model_name = "xlm-roberta-base"

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
)

In [4]:
# Class index <-> intent name mappings handed to the model configuration.
# Both are derived from one ordered tuple so they always mirror each other.
id2label = dict(enumerate(("direct", "indirect", "chitchat")))
label2id = {name: index for index, name in id2label.items()}

In [5]:
def preprocess_function(batch):
    """Tokenize a batch of examples and convert intent names to class ids.

    Args:
        batch: Mapping with a "text" list of strings and a "labels" list of
            intent names; every name must be a key of `label2id`.

    Returns:
        The tokenizer output for the batch, with a "labels" entry holding the
        integer class id of each example.

    Raises:
        KeyError: If a label is not a key of `label2id`.
    """
    # Deliberately no padding here: the DataCollatorWithPadding passed to the
    # Trainer pads each training batch to its own longest sequence. Padding
    # inside a batched `map` would instead pad every example to the longest
    # text of the whole map batch and store that padding in the dataset.
    tokenized_batch = tokenizer(batch["text"], truncation=True)
    tokenized_batch["labels"] = [label2id[label] for label in batch["labels"]]
    return tokenized_batch

In [6]:
# Run preprocess_function over the train and test splits, one batch of examples at a time.
encoded_dataset = dataset.map(preprocess_function, batched=True)

## Fine-tuning

In [7]:
# Load the pretrained encoder with a newly initialised classification head.
# The number of classes is taken from the label map so the two cannot drift apart.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
import evaluate
# Accuracy metric from the `evaluate` library; compute_metrics calls its `compute` method.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the accuracy metric.

    Args:
        eval_pred: Pair that unpacks into (model scores, reference label ids).
            The arg-max is taken over axis 1 of the scores.

    Returns:
        Whatever `accuracy.compute` returns for the predicted class ids
        compared against the reference ids.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)
# Collator that pads the examples of each batch when the batch is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Training configuration: evaluate and checkpoint once per epoch, and reload
# the best checkpoint when training finishes.
training_args = TrainingArguments(
    output_dir="intent_model",
    # Optimisation
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batching
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    # Evaluation and checkpoint cadence
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Wire the model, both data splits, the padding collator and the accuracy
# metric into a Trainer, then start fine-tuning.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
)

trainer.train()

In [10]:
# Score the fine-tuned model on the eval_dataset given to the Trainer (the held-out test split).
trainer.evaluate()

  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.0013415400171652436,
 'eval_accuracy': 1.0,
 'eval_runtime': 0.2424,
 'eval_samples_per_second': 90.768,
 'eval_steps_per_second': 12.377,
 'epoch': 10.0}

## Results

In [22]:
text = "Visitaste ayer al equipo de relaciones internacionales?"
# NOTE(review): the step number in this path depends on dataset size, batch
# size and epoch count — confirm it is the checkpoint you intend to load.
path_file = "./intent_model/checkpoint-200/"

# Reload tokenizer and classifier from the saved checkpoint, using local files only.
tokenizer = AutoTokenizer.from_pretrained(path_file, local_files_only=True)
model = AutoModelForSequenceClassification.from_pretrained(path_file, local_files_only=True)

inputs = tokenizer(text, return_tensors="pt")
with torch.no_grad():  # inference only, no gradients needed
    logits = model(**inputs).logits

# Arg-max over the class axis. A bare `logits.argmax()` indexes the flattened
# (batch, classes) tensor, which is only a valid class id for a batch of one.
predicted_class_id = logits.argmax(dim=-1).item()
model.config.id2label[predicted_class_id]

'indirect'

## Hyperparameter optimization

In [12]:
def model_init():
    """Build a fresh classifier from the pretrained checkpoint.

    Passed to the Trainer as `model_init` so that a new model can be created
    on demand instead of reusing one already-trained instance.

    Returns:
        The model loaded from `model_name`, configured with the `id2label` /
        `label2id` maps and one output class per entry of `id2label`.
    """
    return AutoModelForSequenceClassification.from_pretrained(
        model_name,
        # Derived from the label map rather than hard-coded, so the two cannot disagree.
        num_labels=len(id2label),
        id2label=id2label,
        label2id=label2id,
    )

In [None]:
# Ask PyTorch to release cached, unused GPU memory before the hyperparameter search.
torch.cuda.empty_cache()

In [None]:
# Trainer built with `model_init` rather than a fixed model, so the search can
# create a new model for each trial.
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# State the optimisation target explicitly. `direction="maximize"` is only
# meaningful for accuracy; without `compute_objective` the search falls back to
# the library's default objective, which depends on which metrics are reported.
best_run = trainer.hyperparameter_search(
    n_trials=10,
    direction="maximize",
    compute_objective=lambda metrics: metrics["eval_accuracy"],
)

In [None]:
# Copy the winning trial's hyperparameters onto the Trainer's arguments,
# then train again with those settings.
best_hyperparameters = best_run.hyperparameters
for name in best_hyperparameters:
    setattr(trainer.args, name, best_hyperparameters[name])

trainer.train()