In [None]:
!pip install setfit

In [None]:
# SETFIT ERROR as of December 2024
# AttributeError: 'CallbackHandler' object has no attribute 'tokenizer'
# https://github.com/huggingface/setfit/issues/564
# solution is to install older version of transformers
!pip install transformers==4.45.2

In [None]:
from setfit import SetFitModel

model = SetFitModel.from_pretrained("BAAI/bge-small-en-v1.5")

In [1]:
from datasets import load_dataset

dataset = load_dataset("data-is-better-together/10k_prompts_ranked")

dataset

README.md:   0%|          | 0.00/11.4k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.58M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10331 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['prompt', 'quality', 'metadata', 'avg_rating', 'num_responses', 'agreement_ratio', 'raw_responses', 'kind', 'cluster_description', 'topic'],
        num_rows: 10331
    })
})

In [2]:
dataset_tt = dataset["train"].train_test_split(test_size=0.5)

In [None]:
dataset_tt

In [None]:
from setfit import sample_dataset

# Mimic only having a few labeled examples from each class

- here i'm taking 20 of each class (`topic` is the `label` in our current dataset, will rename later)
- so `sample_dataset` from setfit should take 20 **of each possible label** so we should see 20 * 10 = 200 samples in train dataset below (since there are 10 labels in this dataset)

## Update

doing 20 samples for all 10 classes, with 500 steps, i get accuracy of 0.505 after 20 mins

- the advice is that you get better results with more data, so i'll try 100 samples of each now

In [None]:
#train_dataset = sample_dataset(dataset_tt["train"], label_column="topic", num_samples=20) # get 0.505 accuracy with 500 steps

train_dataset = sample_dataset(dataset_tt["train"], label_column="topic", num_samples=100) 

In [None]:
train_dataset

In [None]:
test_dataset = dataset_tt["test"]

In [None]:
test_dataset

In [None]:
from setfit import TrainingArguments

args = TrainingArguments(
    batch_size=32,
    #num_epochs=1, # adjusted down from 10
    max_steps=500, # get 40% accuracy at 50 steps on the test dataset, tried small number to make sure it works
    report_to="none",
)

"""
The num_epochs and max_steps arguments are frequently used to increase and decrease the number of total training steps. Consider that with SetFit, better performance is reached with more data, not more training! Don’t be afraid to train for less than 1 epoch if you have a lot of data.
"""

In [None]:
from setfit import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    column_mapping={"prompt":"text", "topic":"label"} # <--- SETFIT EXPECTS text AND label RESPECTIVELY
)

In [3]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [None]:
import wandb
wandb.init(mode="disabled")

In [None]:
trainer.train()

In [None]:
trainer.evaluate(test_dataset)

In [None]:
model.predict(["write a new novel for me", "what is the best place to go to visit in italy?"])

# Summary

- with 20 samples and 500 training steps, get 50.5 % accuracy (not shown in notebook, deleted test run)
- with 100 samples from each class, and 500 training steps, get 56.4 % accuracy

## Try much larger number of samples, then will compare with finetuning a BERT type model

- use 200 samples from each class

In [None]:
train_dataset2 = sample_dataset(dataset_tt["train"], label_column="topic", num_samples=200) 

In [None]:
args2 = TrainingArguments(
    batch_size=32,
    #num_epochs=1, # adjusted down from 10
    max_steps=500, # get 40% accuracy at 50 steps on the test dataset, tried small number to make sure it works
    report_to="none",
)

trainer2 = Trainer(
    model=model,
    args=args2,
    train_dataset=train_dataset2,
    column_mapping={"prompt":"text", "topic":"label"} # <--- SETFIT EXPECTS text AND label RESPECTIVELY
)

In [None]:
trainer2.train()

In [None]:
trainer2.evaluate(test_dataset)

## Comments

- 56.8 with way more samples compared to 56.4 with 100 samples per label

# Do a trained distilBERT comparison

In [None]:
dataset_tt

In [None]:
dataset_tt["train"].features

In [None]:
!pip install evaluate accelerate

In [None]:
MODEL_NAME = "distilbert/distilbert-base-uncased"

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
dataset_tt

In [None]:
dataset_tt = dataset_tt.rename_column("prompt","text")

In [None]:
dataset_tt = dataset_tt.rename_column("topic","label")

In [None]:
dataset_tt

In [None]:
dataset_tt = dataset_tt.remove_columns(['quality', 'metadata', 'avg_rating', 'num_responses', 'agreement_ratio', 'raw_responses', 'kind', 'cluster_description'])

dataset_tt

In [None]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length") # adding padding = "max_length" here

In [None]:
tokenized_ds = dataset_tt.map(preprocess_function, batched=True)

# deleted data collator, causing bugs i can't understand why (it's literally the docs on HF site)

In [None]:
import evaluate

accuracy = evaluate.load("accuracy")

In [None]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

# Get labels, they don't seem to be accessible in dataset itseflf

In [None]:
TVT_KEYS = ("train","test")
ALL_LABELS_NAMES = set()

for key in TVT_KEYS:
    for sample in dataset_tt[key]:
        ALL_LABELS_NAMES.add(sample["label"])

In [None]:
ALL_LABELS_NAMES = sorted(ALL_LABELS_NAMES)

In [None]:
print(ALL_LABELS_NAMES, len(ALL_LABELS_NAMES))

In [None]:
label2id = {label_name:idx for idx,label_name in enumerate(ALL_LABELS_NAMES)}

print(label2id)

In [None]:
id2label = {idx:label_name for label_name,idx in label2id.items()}

print(id2label)

In [None]:
NUM_LABELS = len(label2id)

print(NUM_LABELS)

In [None]:
from transformers import DistilBertForSequenceClassification, TrainingArguments, Trainer

model = DistilBertForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=NUM_LABELS, id2label=id2label, label2id=label2id
)

In [None]:
tokenized_ds

In [None]:
#tokenized_ds = tokenized_ds.remove_columns(tokenized_ds["train"].column_names)

In [None]:
tokenized_ds

In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, padding="max_length", truncation=True)

In [None]:
tokenized_ds["train"]

In [None]:
tokenized_ds["train"][0]

In [None]:
training_args = TrainingArguments(
    output_dir="finetuned-prompt-classifier",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    #data_collator=data_collator, # <--- deleted during debug, doesn't work and answers are unclear on HF github
    compute_metrics=compute_metrics,
)

training_metrics = trainer.train()

# Update - maybe the label field needs to be the INTEGER value not the string (so unclear)

In [4]:
from datasets import load_dataset

dataset = load_dataset("data-is-better-together/10k_prompts_ranked")

dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'quality', 'metadata', 'avg_rating', 'num_responses', 'agreement_ratio', 'raw_responses', 'kind', 'cluster_description', 'topic'],
        num_rows: 10331
    })
})

In [5]:
dataset_tt = dataset["train"].train_test_split(test_size=0.5)

In [6]:
dataset_tt

DatasetDict({
    train: Dataset({
        features: ['prompt', 'quality', 'metadata', 'avg_rating', 'num_responses', 'agreement_ratio', 'raw_responses', 'kind', 'cluster_description', 'topic'],
        num_rows: 5165
    })
    test: Dataset({
        features: ['prompt', 'quality', 'metadata', 'avg_rating', 'num_responses', 'agreement_ratio', 'raw_responses', 'kind', 'cluster_description', 'topic'],
        num_rows: 5166
    })
})

In [7]:
!pip install evaluate accelerate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [8]:
MODEL_NAME = "distilbert/distilbert-base-uncased"

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

dataset_tt = dataset_tt.rename_column("prompt","text")

dataset_tt = dataset_tt.rename_column("topic","label_STRING")

dataset_tt = dataset_tt.remove_columns(['quality', 'metadata', 'avg_rating', 'num_responses', 'agreement_ratio', 'raw_responses', 'kind', 'cluster_description'])




The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [10]:
TVT_KEYS = ("train","test")
ALL_LABELS_NAMES = set()

for key in TVT_KEYS:
    for sample in dataset_tt[key]:
        ALL_LABELS_NAMES.add(sample["label_STRING"])

ALL_LABELS_NAMES = sorted(ALL_LABELS_NAMES)

label2id = {label_name:idx for idx,label_name in enumerate(ALL_LABELS_NAMES)}


id2label = {idx:label_name for label_name,idx in label2id.items()}


NUM_LABELS = len(label2id)

In [14]:
dataset_tt

DatasetDict({
    train: Dataset({
        features: ['text', 'label_STRING'],
        num_rows: 5165
    })
    test: Dataset({
        features: ['text', 'label_STRING'],
        num_rows: 5166
    })
})

In [19]:
def convert_to_id(example):
    example["label"] = label2id[ example["label_STRING"] ]
    return example

dataset_tt = dataset_tt.map(convert_to_id)

Map:   0%|          | 0/5165 [00:00<?, ? examples/s]

Map:   0%|          | 0/5166 [00:00<?, ? examples/s]

In [20]:
dataset_tt

DatasetDict({
    train: Dataset({
        features: ['text', 'label_STRING', 'label'],
        num_rows: 5165
    })
    test: Dataset({
        features: ['text', 'label_STRING', 'label'],
        num_rows: 5166
    })
})

In [21]:
dataset_tt = dataset_tt.remove_columns("label_STRING")

dataset_tt

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 5165
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 5166
    })
})

In [23]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [24]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

tokenized_ds = dataset_tt.map(preprocess_function, batched=True)

Map:   0%|          | 0/5165 [00:00<?, ? examples/s]

Map:   0%|          | 0/5166 [00:00<?, ? examples/s]

In [25]:
from transformers import DistilBertForSequenceClassification, TrainingArguments, Trainer

model = DistilBertForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=NUM_LABELS, id2label=id2label, label2id=label2id
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
import evaluate



accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [27]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [28]:
training_args = TrainingArguments(
    output_dir="finetuned-prompt-classifier",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

training_metrics = trainer.train()

  trainer = Trainer(
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.439651,0.554007
2,No log,1.335148,0.577623


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


In [29]:
print(training_metrics)

TrainOutput(global_step=324, training_loss=1.5420913696289062, metrics={'train_runtime': 346.3371, 'train_samples_per_second': 29.826, 'train_steps_per_second': 0.936, 'total_flos': 1291514725825200.0, 'train_loss': 1.5420913696289062, 'epoch': 2.0})


In [31]:
trainer.evaluate(tokenized_ds["test"])

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


{'eval_loss': 1.3351483345031738,
 'eval_accuracy': 0.5776229190863337,
 'eval_runtime': 47.3161,
 'eval_samples_per_second': 109.181,
 'eval_steps_per_second': 3.424,
 'epoch': 2.0}

# BERT training

- 57.7 accuracy vs 56.8 with the SetFit faster approach

Need to comparge larger embedding models maybe

Not sure why this dataset is so difficult also?