# SetFit for Text Classification
https://arxiv.org/pdf/2209.11055.pdf

In this notebook, we'll learn how to:
- do few-shot text classification with SetFit
- distill a model
- quantize a model
- do zero-shot text classification


Please note that this notebook is heavily inspired by the notebooks available in the official [repository](https://github.com/huggingface/setfit/tree/main/notebooks).
Here we combine some of them to have an overview of SetFit.

## Setup

If you're running this notebook on Colab or some other cloud platform, you will need to install a few libraries.
Run the following cell to install the requirements.

In [None]:
%pip install setfit
%pip install datasets
%pip install pandas
%pip install scikit-learn
%pip install onnx
%pip install optimum
%pip install onnxruntime
%pip install neural_compressor
%pip install evaluate
%pip install huggingface_hub
%pip install matplotlib

Please relaunch the kernel after installing the libraries

This notebook is designed to work with any multiclass [text classification dataset](https://huggingface.co/models?pipeline_tag=text-classification&sort=downloads) and pretrained [Sentence Transformer](https://huggingface.co/models?library=sentence-transformers&sort=downloads) on the Hub. Change the values below to try a different dataset / model!

Here we will use the all-mpnet-base-v2 that gives the best quality on the current sentence transformers open source datasets
See https://www.sbert.net/docs/pretrained_models.html for more information

## Loading and sampling the dataset

In [None]:
from IPython.display import display, Markdown
import pandas as pd
import sklearn
import datasets
from datasets import Dataset, load_dataset, concatenate_datasets
from setfit import SetFitModel
#from setfit.data import concatenate_datasets


def create_dataset(input_file):
    """Build a Hugging Face ``Dataset`` from a local CSV file.

    The CSV is read with pandas and must contain a ``text`` and a ``label``
    column. The resulting dataset exposes three columns: ``idx`` (the pandas
    row index), ``sentence`` (the ``text`` column) and ``label`` (a
    ``ClassLabel`` whose names are the sorted unique label values).
    """
    frame = pd.read_csv(input_file)
    class_names = sorted(frame["label"].unique())

    # Schema of the dataset in the HF `datasets` format
    schema = datasets.Features(
        {
            "idx": datasets.Value("int32"),
            "sentence": datasets.Value("string"),
            "label": datasets.ClassLabel(num_classes=len(class_names), names=class_names),
        }
    )

    # Column data, keyed by the names declared in the schema
    columns = {
        "idx": frame.index,
        "sentence": frame["text"],
        "label": frame["label"],
    }
    return datasets.Dataset.from_dict(columns, schema, split="train")


def evaluate_more_metrics(trainer):
    """Print a scikit-learn classification report for a trainer's eval set.

    Args:
        trainer: A trainer exposing ``eval_dataset``, ``column_mapping``,
            ``model.predict`` and the ``_validate_column_mapping`` /
            ``_apply_column_mapping`` helpers (e.g. a ``SetFitTrainer``).

    Returns:
        A ``(y_test, y_pred)`` tuple: the reference labels and the labels
        predicted by ``trainer.model`` on the ``text`` column.
    """
    # Import the submodule explicitly: a bare `import sklearn` does not
    # guarantee that `sklearn.metrics` has been loaded.
    from sklearn.metrics import classification_report

    trainer._validate_column_mapping(trainer.eval_dataset)
    eval_dataset = trainer.eval_dataset

    # Rename the columns to the `text` / `label` names read below
    if trainer.column_mapping is not None:
        eval_dataset = trainer._apply_column_mapping(eval_dataset, trainer.column_mapping)

    x_test = eval_dataset["text"]
    y_test = eval_dataset["label"]

    y_pred = trainer.model.predict(x_test)
    print(classification_report(y_test, y_pred))
    return y_test, y_pred


def sample_dataset(dataset: Dataset, label_column: str = "label", num_samples: int = 8, seed: int = 42, class_weights = None) -> Dataset:
    """Sample a Dataset to get an equal number of examples per class (when possible).

    Args:
        dataset: The dataset to sample from.
        label_column: Name of the column holding the class labels.
        num_samples: Base number of examples to keep for each class.
        seed: Seed used for both shuffles, so the sampling is reproducible.
        class_weights: Optional per-label multipliers indexed by label value.
            A label with weight ``w`` gets up to ``num_samples * w`` examples.
            When ``None`` every class uses a weight of 1.

    Returns:
        A shuffled dataset containing, for every label, at most
        ``num_samples * weight`` examples (fewer if the class is smaller).
    """
    shuffled_dataset = dataset.shuffle(seed=seed)

    samples = []
    for label in dataset.unique(label_column):
        data = shuffled_dataset.filter(lambda example: example[label_column] == label)
        # Without explicit weights every class is sampled uniformly
        weight = 1 if class_weights is None else class_weights[label]
        # int() keeps `range` valid when a weight is a float
        num_label_samples = min(len(data), int(num_samples * weight))
        samples.append(data.select(range(num_label_samples)))

    # Merge the per-class samples and shuffle so classes are interleaved
    all_samples = concatenate_datasets(samples)
    return all_samples.shuffle(seed=seed)

In [None]:
import evaluate
import numpy as np
import torch
from tqdm.auto import tqdm
from pathlib import Path
from time import perf_counter
import matplotlib.pyplot as plt
import pandas as pd

class PerformanceBenchmark:
    """Measure the size, quality metric and latency of a trained model."""

    def __init__(self, model, dataset, optim_type, metric_monitor):
        """Initialize the performance benchmark.

        Args:
            model: Model to benchmark. It must be callable on a list of
                strings and expose ``predict`` and ``model_body``.
            dataset: Evaluation data with a ``sentence`` and a ``label`` column.
            optim_type: Name under which the results are reported.
            metric_monitor: Name of the metric, loaded through ``evaluate.load``.
        """
        self.model = model
        self.dataset = dataset
        self.optim_type = optim_type
        self.metric = evaluate.load(metric_monitor)
        self.metric_monitor = metric_monitor

    def compute_metric(self):
        """Compute ``self.metric`` for the model predictions on ``self.dataset``.

        Returns:
            The dictionary returned by ``self.metric.compute``.
        """
        preds = self.model.predict(self.dataset["sentence"])
        labels = self.dataset["label"]
        args = {"predictions": preds, "references": labels}
        # f1 is computed with macro averaging
        if self.metric_monitor == "f1":
            args["average"] = "macro"
        metric_compute = self.metric.compute(**args)
        print(f"{self.metric_monitor} on test set - {metric_compute[self.metric_monitor]:.3f}")
        return metric_compute

    def compute_size(self):
        """Compute the serialized size of ``self.model.model_body``.

        Returns:
            ``{"size_mb": <size in megabytes>}``.
        """
        # Function-scope import keeps this cell self-contained
        import tempfile

        state_dict = self.model.model_body.state_dict()
        # Serialize into a throw-away directory so that an existing
        # `model.pt` in the working directory is never overwritten or deleted.
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_path = Path(tmp_dir) / "model.pt"
            torch.save(state_dict, tmp_path)
            # Calculate size in megabytes
            size_mb = tmp_path.stat().st_size / (1024 * 1024)
        print(f"Model size (MB) - {size_mb:.2f}")
        return {"size_mb": size_mb}

    def time_model(self, query="What is the pin number for my account?", nb_warmup=10, nb_inference=100):
        """Compute the average inference time of ``self.model`` on one query.

        Args:
            query: Text sent to the model for every request.
            nb_warmup: Number of untimed requests run first.
            nb_inference: Number of timed requests.

        Returns:
            ``{"time_avg_ms": ..., "time_std_ms": ...}`` in milliseconds.
        """
        # Warmup: the first inferences usually take more time because of
        # the loading of the model in memory
        for _ in range(nb_warmup):
            _ = self.model([query])

        # Timed run: the same request is repeated to get significant results
        latencies = []
        for _ in range(nb_inference):
            start_time = perf_counter()
            _ = self.model([query])
            latencies.append(perf_counter() - start_time)

        # Compute run statistics in ms
        time_avg_ms = 1000 * np.mean(latencies)
        time_std_ms = 1000 * np.std(latencies)
        print(f"Average latency (ms) - {time_avg_ms:.2f} +/- {time_std_ms:.2f}")
        return {"time_avg_ms": time_avg_ms, "time_std_ms": time_std_ms}

    def run_benchmark(self):
        """Run the full benchmark (size, metric and inference time).

        Returns:
            ``{self.optim_type: {...}}`` merging the three result dictionaries.
        """
        metrics = {}
        metrics[self.optim_type] = self.compute_size()
        metrics[self.optim_type].update(self.compute_metric())
        metrics[self.optim_type].update(self.time_model())
        return metrics


def plot_metrics(perf_metrics, current_optim_type, metric_monitor):
    """Plot every benchmarked model as average latency (x) versus metric (y).

    Args:
        perf_metrics: Mapping ``{optim_type: results}`` where each ``results``
            dictionary holds ``time_avg_ms``, ``size_mb`` and the
            ``metric_monitor`` value.
        current_optim_type: Key of ``perf_metrics`` to highlight with a
            dashed-circle marker.
        metric_monitor: Name of the metric to plot; its values are multiplied
            by 100 and shown as a percentage.
    """
    # One row per optimization type
    df = pd.DataFrame.from_dict(perf_metrics, orient="index")

    metric_range_values = []
    for idx in df.index:
        df_opt = df.loc[idx]
        metric_pct = df_opt[metric_monitor] * 100
        metric_range_values.append(metric_pct)
        # One point per model at (time, metric); the marker area is the model size
        plt.scatter(
            df_opt["time_avg_ms"],
            metric_pct,
            alpha=0.5,
            s=df_opt["size_mb"],
            label=idx,
            # Add a dashed circle around the current optimization type
            marker="$\u25CC$" if idx == current_optim_type else None,
        )

    # Add legend with uniformly sized handles
    legend = plt.legend(bbox_to_anchor=(1, 1))
    # `legendHandles` was renamed `legend_handles` in newer matplotlib
    # releases; support both spellings.
    handles = getattr(legend, "legend_handles", None)
    if handles is None:
        handles = legend.legendHandles
    for handle in handles:
        handle.set_sizes([20])

    # Adapt the y axis to the plotted values; keep 95 as the minimum upper
    # bound but extend it so that points above 95% remain visible.
    plt.ylim(
        max(-1, min(metric_range_values) - 10),
        max(95, max(metric_range_values) + 5),
    )
    # Use the slowest model to define the x-axis range
    xlim = int(max(df["time_avg_ms"]) + 5)

    plt.xlim(0, xlim)
    plt.ylabel(f"{metric_monitor} (%)")
    plt.xlabel("Average latency (ms)")
    plt.show()

We will use the 🤗 Datasets library to download the data, however note that you can use any other dataset in a csv format with two columns ('label' and 'text')

Most datasets on the Hub have many more labeled examples than those one encounters in few-shot settings. To simulate the effect of training on a limited number of examples, let's subsample the training set to have 8 labeled examples per class:

In [None]:
## Dataset choice
# Choose any of the available text classification datasets below.
# Note that you can make it work with any text classification dataset available in https://huggingface.co/datasets?pipeline_tag=text-classification
# For each dataset:
#   - "datasets" gives the split names used for training and validation
#   - "mapping" gives, for the "sentence" and "label" columns used in this notebook,
#     the name of the corresponding column in the original dataset
datasets_available = {
    # sentiment analysis pos + neg
    "sst2": {"datasets": {"train":"train", "validation": "validation"}, "mapping":{"sentence": "text", "label": "label"}},
    # sentiment analysis movie reviews pos + neg
    "imdb": {"datasets":{"train":"train", "validation": "test"}, "mapping":{"sentence": "text", "label": "label"}},
    # entailment (0), neutral (1), contradiction (2)
    "anli": {"datasets": {"train":"train_r1", "validation": "dev_r1"}, "mapping":{"sentence": "premise", "label": "label"}},
    # sentiment analysis, movies reviews pos + neg
    "rotten_tomatoes": {"datasets": {"train":"train", "validation": "validation"}, "mapping":{"sentence": "text", "label": "label"}},
    # sentiment analysis, non toxic + toxic
    "mteb/toxic_conversations_50k": {"datasets": {"train":"train", "validation": "test"}, "mapping":{"sentence": "text", "label": "label"}},
    # language identification (20 languages)
    # arabic (ar), bulgarian (bg), german (de), modern greek (el), english (en), spanish (es), french (fr), hindi (hi), italian (it), japanese (ja), dutch (nl), polish (pl), portuguese (pt), russian (ru), swahili (sw), thai (th), turkish (tr), urdu (ur), vietnamese (vi), and chinese (zh)
    "papluca/language-identification": {"datasets": {"train":"train", "validation": "test"}, "mapping":{"sentence": "text", "label": "label"}},
    # topic 14 classes
    # too long
    "dbpedia_14": {"datasets": {"train":"train", "validation": "test"}, "mapping":{"sentence": "content", "label": "label"}}
}

DATASET_ID = "sst2"


## Dataset parameters
USE_LOCAL_DATASET = False
TRAIN = "TODO"
VAL = "TODO"
USE_CLASS_WEIGHTS = False # Put more examples in the subset for training
NUM_SAMPLES = 8 # default to 8, but try to make this vary to observe the results

## Model parameters
# Here are the models that will be used, for the purpose of this workshop
# we should not modify those however you can try by using models from the hub
# to compare.
MODEL_ID = "sentence-transformers/all-mpnet-base-v2"

# Monitor parameters
# This is the metric to monitor; here we use the accuracy.
# For unbalanced datasets, "f1" is more suited (the benchmark helpers then
# compute it with macro averaging).
METRIC_MONITOR = "accuracy"


# Load datasets
if USE_LOCAL_DATASET:
    to_disp = f'Here we use a local dataset from the files {TRAIN} and {VAL}'
    # If you use a local dataset make sure that you have two columns: 'text' and 'label'
    train_dataset = create_dataset(TRAIN)
    eval_dataset = create_dataset(VAL)
else:
    to_disp = f'Here we use the dataset {DATASET_ID}'
    dataset = load_dataset(DATASET_ID)
    dataset_config = datasets_available[DATASET_ID]
    train_dataset = dataset[dataset_config["datasets"]["train"]]
    eval_dataset = dataset[dataset_config["datasets"]["validation"]]
    # Normalize the column names to "label" and "sentence" on BOTH splits
    if "label" not in train_dataset.features:
        train_dataset = train_dataset.rename_column(dataset_config["mapping"]["label"], "label")
        eval_dataset = eval_dataset.rename_column(dataset_config["mapping"]["label"], "label")
    if "sentence" not in train_dataset.features:
        train_dataset = train_dataset.rename_column(dataset_config["mapping"]["sentence"], "sentence")
        eval_dataset = eval_dataset.rename_column(dataset_config["mapping"]["sentence"], "sentence")

# Load the metric to monitor
metric_monitor = evaluate.load(METRIC_MONITOR)

# Get class weights: 1 for every class by default, otherwise the (truncated)
# ratio between the class count and the count of the rarest class
vc = pd.Series(train_dataset["label"]).value_counts()
class_weights = {x: 1 for x in vc.index}
if USE_CLASS_WEIGHTS:
    class_weights = {x: int(y) for x,y in (vc / min(vc)).items()}

# Subsample the training set to simulate a few-shot setting
train_dataset_sampled = sample_dataset(train_dataset, num_samples=NUM_SAMPLES, class_weights=class_weights)

to_disp += f' we have {len(train_dataset_sampled)} total examples to train with since dataset has {len(set(train_dataset["label"]))} classes'
if USE_CLASS_WEIGHTS:
    to_disp += f' and we sample the following for each class: {class_weights}'
else:
    to_disp += f' and we sample {NUM_SAMPLES} per class'
display(Markdown(to_disp))

We can do a small data analysis to observe our data:
- Display some raw examples
- Display the proportion of examples per label

You can do that on `train_dataset` and `train_dataset_sampled` to see whether the ratios are similar, or whether they respect the indicated `class_weights`.
Have a look at `train_dataset_sampled` and check whether there seems to be enough diversity in the input.

In [None]:
def data_analysis(dataset, name="", nb_examples_per_class=2):
    """Display a small analysis of a dataset and return it as a DataFrame.

    Shows the label distribution (as proportions) and a few randomly drawn
    examples for every label.

    Args:
        dataset: Data with a ``label`` and a ``sentence`` column.
        name: Name displayed in the title of the analysis.
        nb_examples_per_class: Number of examples to show per label; classes
            with fewer rows show all of their rows.

    Returns:
        A DataFrame with the ``label`` and ``sentence`` columns of ``dataset``.
    """
    display(f'Data analysis of {name}')
    du = pd.DataFrame()
    du["label"] = dataset["label"]
    du["sentence"] = dataset["sentence"]
    # Display the label distribution
    display(du["label"].value_counts() / len(du["label"]))

    # Draw up to nb_examples_per_class rows for every label
    indices = []
    for label in du["label"].unique():
        label_rows = du[du["label"] == label]
        # Clamp the sample size: `sample` raises when asked for more rows
        # than the class contains
        nb_shown = min(nb_examples_per_class, len(label_rows))
        indices.extend(label_rows.sample(nb_shown).index)
    display(du[du.index.isin(indices)])

    # Return the dataframe
    return du

# Number of examples displayed for each class
nb_examples_per_class = 4
# Data analysis of the full training set (the returned dataframe is not needed)
_ = data_analysis(train_dataset, name="train_dataset", nb_examples_per_class=nb_examples_per_class)
# Data analysis of the few-shot sample; keep the dataframe to display it below
du  = data_analysis(train_dataset_sampled, name="train_dataset_sampled", nb_examples_per_class=nb_examples_per_class)


# Show every row of train_dataset_sampled
display("Display the whole train_dataset_sampled")
display(du)

Okay, now that we have the dataset, let's load and train a model!
## Fine-tuning the model

To train a SetFit model, the first thing to do is download a pretrained checkpoint from the Hub. We can do so by using the `from_pretrained()` method associated with the `SetFitModel` class:

In [None]:
# Create the SetFit model from the pretrained checkpoint MODEL_ID
# using SetFitModel.from_pretrained
model = SetFitModel.from_pretrained(MODEL_ID)

Here, we've downloaded a pretrained Sentence Transformer from the Hub and added a logistic classification head to create the SetFit model. As indicated in the message, we need to train this model on some labeled examples. We can do so by using the `SetFitTrainer` class as follows:

In [None]:
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitTrainer

# Create the SetFitTrainer using the model, the sampled training set,
# the eval_dataset and the CosineSimilarityLoss.
# We set num_iterations to 20.
# You can play with num_epochs to observe the results (here it is set to 1).
# The column_mapping renames our "sentence" column to "text" and keeps "label".
trainer = SetFitTrainer(
    model=model,
    train_dataset=train_dataset_sampled,
    eval_dataset=eval_dataset,
    loss_class=CosineSimilarityLoss,
    num_iterations=20,
    num_epochs=1,
    column_mapping={"sentence": "text", "label": "label"}
)

The main arguments to notice in the trainer are the following:

* `loss_class`: The loss function to use for contrastive learning with the Sentence Transformer body
* `num_iterations`: The number of text pairs to generate for contrastive learning
* `column_mapping`: The `SetFitTrainer` expects the inputs to be found in a `text` and `label` column. This mapping automatically formats the training and evaluation datasets for us.

Now that we've created a trainer, we can train it!

In [None]:
# TODO: Launch the training
# The trainer has a train() method
...

The next step is to compute the model's performance using the `evaluate_more_metrics()` method:

In [None]:
# TODO: Evaluate the trainer by passing it to evaluate_more_metrics
...

The final step is to run a benchmark for our model to get its average inference time and plot it against the metric we monitor:

In [None]:
# TODO: Create a PerformanceBenchmark using
# the trained model (trainer.model), the dataset (the eval_dataset),
# optim_type, which is the name that we want to display (here "MPNet"),
# and metric_monitor, which is METRIC_MONITOR
# Create benchmark
optim_type = ...
pb = PerformanceBenchmark(
    model=..., dataset=..., optim_type=..., metric_monitor=...
)
# TODO: then run the benchmark using the run_benchmark method of the pb object
perf_metrics = ...

# TODO: plot the metrics using plot_metrics, which takes perf_metrics, the optim_type and then METRIC_MONITOR
# Plot metrics
plot_metrics(..., ..., ...)

You can try the fine-tuned SetFit model on some specific examples.
The following examples are used to check the sentiment predicted by the model.

In [None]:
# Hand-written sentences used to probe the fine-tuned model
examples = [
    "i loved the spiderman movie!",
    "pineapple on pizza is the worst 🤮",
    "I care for you very much",
    "I hate this",
    "Fuck you",
    "Asshole",
    "You little piece of shit",
    "I am very happy"
]
# Launch the predictions: the trained model is called directly on the list of texts
preds = trainer.model(examples)

# Display each example next to its prediction
df = pd.DataFrame()
df["example"] = examples
df["prediction"] = preds
display(df)

# Distillation

In [None]:
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitModel, SetFitTrainer, DistillationSetFitTrainer

Very often we have constraints in terms of resources:
- inference time, the less the better for real time usage
- memory usage, the less the better for reducing the cost of usage of our model

One way of reducing both of those metrics is to do something called distillation.

Model distillation, also known as knowledge distillation, is a technique used in machine learning to compress the knowledge of a large, complex model (called the teacher model) into a smaller, simpler model (called the student model). The purpose of this process is to create a more efficient model that retains much of the performance of the original, larger model while reducing computational resources, memory requirements, and inference time.

For more information on distillation please check the following links:
- https://medium.com/nlplanet/a-model-distillation-survey-7f0e1b56b3cf
- https://neptune.ai/blog/knowledge-distillation

In this workshop:
- we use Relation-based knowledge distillation, we learn the embedding space of our smaller model using the embedding trained of our teacher and at the same time the logits of the teacher
- for our teacher, the same architecture as before MPNet
- for our student, a model called MiniLM-L3, a way smaller model


In [None]:
# Identifier of the student model
# TODO: use "paraphrase-MiniLM-L3-v2" as MODEL_STUDENT_ID
MODEL_STUDENT_ID = ...

We first train two baselines:
- a model based on MPNet
- a model based on MiniLM-L3 without distillation

In [None]:
# Mapping from our column names to the "text" / "label" names expected by the trainer
column_mapping = {"sentence": "text", "label": "label"}

# TODO: Get the model from the MODEL_STUDENT_ID using SetFitModel.from_pretrained
minilm = ...

# TODO: Prepare the dataset for the teacher using the train_dataset, the NUM_SAMPLES and the class_weights defined before
train_dataset_teacher = sample_dataset(..., num_samples=..., class_weights=...)

# TODO: Create the trainer of the smaller model using
# the minilm on the train_dataset_teacher, evaluating on the eval_dataset,
# using the CosineSimilarityLoss and the column_mapping defined before
minilm_trainer = SetFitTrainer(
    model=...,
    train_dataset=...,
    eval_dataset=...,
    loss_class=...,
    column_mapping=...
)

# TODO: Train the minilm model using the train() method of minilm_trainer
...


####### TEACHER MODEL #######
# TODO: Get the model from the MODEL_ID using SetFitModel.from_pretrained
teacher_model = ...

# TODO: Create the trainer of the teacher model using
# the teacher_model on the train_dataset_teacher, evaluating on the eval_dataset,
# using the CosineSimilarityLoss and the column_mapping defined before
teacher_trainer = SetFitTrainer(
    model=...,
    train_dataset=...,
    eval_dataset=...,
    loss_class=...,
    column_mapping=...
)
# TODO: Train the teacher model using the train() method of teacher_trainer
...

####### TEACHER MODEL #######

After the training is done we can as before:
- Evaluate both of our models
- Launch a benchmark on both models
- Plots the metrics obtained

In [None]:
# TODO: Evaluate the metrics using evaluate_more_metrics on the two trained trainers: teacher_trainer and minilm_trainer
y_true, y_pred = ...
y_true, y_pred = ...

# TODO: Launch the benchmark for both of our trained models: teacher_trainer.model and minilm_trainer.model
perf_metrics = {}
for model_name, model in {
    "MPNet": ...,
    "MiniLM-L3": ...,
    }.items():
    display(model_name)
    pb = PerformanceBenchmark(
        model=model, dataset=eval_dataset, optim_type=model_name, metric_monitor=METRIC_MONITOR
    )
    # run_benchmark returns {model_name: results}, merged into perf_metrics
    res_metrics = pb.run_benchmark()
    perf_metrics.update(res_metrics)

# Plot metrics, we highlight "MiniLM-L3"
plot_metrics(perf_metrics, "MiniLM-L3", metric_monitor=METRIC_MONITOR)

Next, we can distill our teacher model into a model based on the smaller
architecture (here MiniLM-L3).

In [None]:
# Number of data points to select from the train_dataset to train the student on
DISTILLATION_TRAIN_SIZE = 500
# Fixed seed so that the same subset is selected on every run
train_dataset_student = train_dataset.shuffle(seed=0).select(range(DISTILLATION_TRAIN_SIZE))

####### STUDENT MODEL #######
# TODO: Load the small student model from MODEL_STUDENT_ID using SetFitModel.from_pretrained
student_model = ...



# Create trainer for knowledge distillation
# TODO: Create the student trainer using
# the teacher_model, the train_dataset_student for the train_dataset,
# the student_model, evaluating on the eval_dataset,
# using the CosineSimilarityLoss and the column_mapping defined before
# You can play a bit with the different values
student_trainer = DistillationSetFitTrainer(
    teacher_model=...,
    train_dataset=...,
    student_model=...,
    eval_dataset=eval_dataset,
    loss_class=...,
    column_mapping=...,
    metric=METRIC_MONITOR,
    batch_size=16,
    num_iterations=20,
    num_epochs=1,
)
# Train student with knowledge distillation

# TODO: Launch the student training using the train() method of student_trainer
...
####### STUDENT MODEL #######

The final step consists in:
- Evaluating our student model
- Benchmarking our student model
- Adding its results to the perf_metrics
- Plotting all the perf_metrics

In [None]:
# Display the evaluation of the distilled model
# by applying evaluate_more_metrics to the student_trainer
y_true, y_pred = evaluate_more_metrics(student_trainer)

# TODO: Launch the benchmark for the distilled model student_trainer.model on the eval_dataset, with the optim_type and METRIC_MONITOR as metric_monitor
optim_type = "MiniLM-L3 (distilled)"
pb = PerformanceBenchmark(
    model=..., dataset=..., optim_type=..., metric_monitor=...
)

# TODO: run the benchmark using run_benchmark of the pb object and add its results to perf_metrics
perf_metrics.update(...)

# Plot metrics, highlighting the distilled model
plot_metrics(perf_metrics, optim_type, metric_monitor=METRIC_MONITOR)

# Quantization

In [None]:
from setfit.exporters.onnx import export_onnx

Quantization in machine learning is a technique used to reduce the memory and computational requirements of a model by approximating the continuous values of its parameters (e.g., weights and biases) with a smaller set of discrete values. This process can lead to more efficient models that consume less power and have lower latency, making them suitable for deployment on resource-constrained devices

There are several types of quantization:
- quantization-aware training: the quantization is done during the training
- post-training quantization:
    - dynamic: the activation ranges are computed on the fly at runtime
    - static: the activation ranges are computed at quantization time (and only used at runtime)

Dynamic quantization is easy to implement; static quantization is usually faster but comes with a drop in accuracy. We usually test with dynamic quantization and, if the results are okay, we stick with it.


For more information on quantization please check the following links:
- https://huggingface.co/docs/optimum/concept_guides/quantization
- https://deci.ai/quantization-and-quantization-aware-training
- https://github.com/intel/neural-compressor/blob/master/docs/source/quantization.md

In [None]:
# TODO: Choose the name (in the "USERNAME/PROJECT_NAME" form) under which your
# student model will be pushed to the Hugging Face Hub
MODEL_STUDENT_QUANTIZED = "USERNAME/PROJET_NAME"

# Name of the quantization configuration file that will be written to disk
QUANT_CONF = "MiniLM_L3_distilled_onnx_dynamic.yaml"

In [None]:
import os
import functools
from sklearn.linear_model import LogisticRegression
from transformers import AutoTokenizer
from setfit.exporters.utils import mean_pooling
import onnxruntime
from optimum.onnxruntime import ORTModelForFeatureExtraction
#from optimum.pipelines import ORTModelForFeatureExtraction
from neural_compressor.experimental import Quantization, common
import shutil

class OnnxSetFitModel:
    """SetFit-like model made of an ONNX body, a tokenizer and a classification head.

    Instances are callable: calling one is the same as calling ``predict``.
    """

    def __init__(self, ort_model, tokenizer, model_head):
        """Store the three components used by ``predict``."""
        self.ort_model, self.tokenizer, self.model_head = ort_model, tokenizer, model_head

    def __call__(self, inputs):
        """Alias of ``predict``."""
        return self.predict(inputs)

    def predict(self, inputs):
        """Tokenize ``inputs``, embed them with the ONNX body and classify the embeddings."""
        batch = self.tokenizer(inputs, padding=True, truncation=True, return_tensors="pt")
        token_embeddings = self.ort_model(**batch)["last_hidden_state"]
        # Pool the token embeddings with the attention mask before the head
        sentence_embeddings = mean_pooling(token_embeddings, batch["attention_mask"])
        return self.model_head.predict(sentence_embeddings)

    
class OnnxPerformanceBenchmark(PerformanceBenchmark):
    """Performance benchmark for a model stored as an ONNX file on disk."""

    def __init__(self, *args, model_path, **kwargs):
        """Initialize the benchmark.

        Args:
            *args, **kwargs: Forwarded to ``PerformanceBenchmark.__init__``.
            model_path: Path of the ONNX file whose size is reported.
        """
        super().__init__(*args, **kwargs)
        self.model_path = model_path

    def compute_size(self):
        """Return ``{"size_mb": ...}``, the size of the file at ``self.model_path``."""
        size_mb = Path(self.model_path).stat().st_size / (1024 * 1024)
        print(f"Model size (MB) - {size_mb:.2f}")
        return {"size_mb": size_mb}

    def compute_metric(self):
        """Compute the monitored metric on the ``text`` column, in chunks of 100 texts.

        Uses the metric loaded by the parent constructor (``self.metric`` /
        ``self.metric_monitor``) so the result always matches the
        ``metric_monitor`` this benchmark was created with.
        """
        # Read the columns once instead of once per chunk
        texts = self.dataset["text"]
        labels = self.dataset["label"]

        preds = []
        chunk_size = 100
        for i in tqdm(range(0, len(texts), chunk_size)):
            preds.extend(self.model.predict(texts[i : i + chunk_size]))

        args = {"predictions": preds, "references": labels}
        # f1 is computed with macro averaging, as in the parent class
        if self.metric_monitor == "f1":
            args["average"] = "macro"
        mm = self.metric.compute(**args)
        print(f"{self.metric_monitor} on test set - {mm[self.metric_monitor]:.3f}")
        return mm


def eval_func(model, model_head=student_model.model_head):
    """Evaluation function given to the quantizer.

    Loads the ONNX model saved in ``onnx_path``, swaps its inference session
    for one built from ``model`` and computes the monitored metric on the
    global ``test_dataset``.

    Args:
        model: Object exposing ``SerializeToString()`` (presumably an ONNX
            model proto handed over by the quantizer; confirm against the
            neural_compressor version in use).
        model_head: Classification head applied on top of the embeddings.

    Returns:
        The value of ``METRIC_MONITOR`` on ``test_dataset``.
    """
    ort_model = ORTModelForFeatureExtraction.from_pretrained(onnx_path)
    ort_model.model = onnxruntime.InferenceSession(model.SerializeToString(), None)
    onnx_setfit_model = OnnxSetFitModel(ort_model, tokenizer, model_head)

    # Read the columns once instead of once per chunk
    texts = test_dataset["text"]
    labels = test_dataset["label"]

    preds = []
    chunk_size = 100
    for i in tqdm(range(0, len(texts), chunk_size)):
        preds.extend(onnx_setfit_model.predict(texts[i : i + chunk_size]))

    args = {"predictions": preds, "references": labels}
    # Same averaging as the benchmark classes so the scores are comparable
    if METRIC_MONITOR == "f1":
        args["average"] = "macro"
    mm = metric_monitor.compute(**args)
    print(mm[METRIC_MONITOR])
    return mm[METRIC_MONITOR]


def build_dynamic_quant_yaml():
    """Write the dynamic post-training quantization configuration to ``QUANT_CONF``.

    The configuration sets a 1% relative accuracy loss as the accuracy
    target for auto-tuning.
    """
    config_text = """
        model:
          name: bert
          framework: onnxrt_integerops

        device: cpu

        quantization:
          approach: post_training_dynamic_quant

        tuning:
          accuracy_criterion:
            relative: 0.01
          exit_policy:
            timeout: 0
          random_seed: 9527
    """
    # Dump the configuration next to the notebook, always as UTF-8
    with open(QUANT_CONF, "w", encoding="utf-8") as conf_file:
        conf_file.write(config_text)

After the results obtained with distillation, we can analyze what quantization can add to this whole pipeline. Basically, the goal is to reduce the size of the model (on the same architecture) and to lower the inference time.

First we need to dump our model in onnx format. It is an open source format and the conversion is necessary to quantize using onnxruntime:
- Get the model and tokenizer
- Dump the model in onnx format in './onnx/model.onnx'


In [None]:
# TODO connect to your huggingface-cli in order to be able to upload a model
# Get the token from https://huggingface.co/settings/tokens (write token) and replace TODO by its value
!huggingface-cli login --token TODO

In [None]:
# TODO: Push your model to the Hub using the push_to_hub method of the student_trainer with MODEL_STUDENT_QUANTIZED
...

In [None]:
# Path where to dump the model; start from a clean directory
if os.path.exists("./onnx"):
    shutil.rmtree('./onnx')
onnx_path = Path("onnx")

# Get the feature extractor part
# TODO: Load the model that you uploaded using MODEL_STUDENT_QUANTIZED
ort_model = ORTModelForFeatureExtraction.from_pretrained(
    ..., from_transformers=True
)

# TODO: Get the tokenizer from the model in the Hub (MODEL_STUDENT_QUANTIZED)
tokenizer = AutoTokenizer.from_pretrained(...)

# TODO: Save the model and the tokenizer to onnx_path using their save_pretrained methods
ort_model.save_pretrained(...)
tokenizer.save_pretrained(...)
# NOTE: this rebinds the global name `model` to the ONNX model
model = ort_model

Then we launch the quantization of our model:
- Load our model using the OnnxSetFitModel class
- Prepare the quantization pipeline
- Add the model and the evaluation function
- Quantize the model
- Output it in 'onnx/model_quantized.onnx'

In [None]:
# Prepare the quantization pipeline: write the configuration file QUANT_CONF
build_dynamic_quant_yaml()
# TODO: build the quantizer from the configuration file QUANT_CONF using Quantization
quantizer = ...

# Prepare the dataset for quantization: eval_func reads its "text" and
# "label" columns, so apply the student trainer's own column mapping
test_dataset = student_trainer.eval_dataset
if student_trainer.column_mapping is not None:
    test_dataset = student_trainer._apply_column_mapping(student_trainer.eval_dataset, student_trainer.column_mapping)

# Add model and evaluation function
# TODO: Get the model stored in onnx/model.onnx
quantizer.model = common.Model(...)

# TODO: Set the eval_func of the quantizer using the eval_func defined before
quantizer.eval_func = functools.partial(...)

# TODO: Launch the quantization using quantizer()
quantized_model = ...

# TODO: Output the quantized model to "onnx/model_quantized.onnx"
quantized_model.save(...)

The final step consists in:
- Loading the model quantized
- Launching the benchmark
- Adding the benchmark to all the benchmarks
- Printing the whole benchmark

In [None]:
# TODO: Load the quantized model using onnx_path and "model_quantized.onnx" as file_name
ort_model = ORTModelForFeatureExtraction.from_pretrained(
    ..., file_name= ...
)
# TODO: Build the ONNX SetFit model from the ort_model, the tokenizer and the student_model.model_head
onnx_setfit_model = OnnxSetFitModel(..., ..., ...)

In [None]:
# Launch the performance benchmark on the quantized model
# TODO: Create the benchmark for the quantized model onnx_setfit_model on the test_dataset
# (the positional arguments are, in order: model, dataset, optim_type)
pb = OnnxPerformanceBenchmark(
    ...,
    ...,
    "MiniLM-L3 (distilled + quantized)",
    model_path="onnx/model_quantized.onnx",
    metric_monitor = METRIC_MONITOR
)
# Run the benchmark of the pb object and add its results to perf_metrics
perf_metrics.update(pb.run_benchmark())

# Plot all the benchmarks, highlighting the quantized model
plot_metrics(perf_metrics, "MiniLM-L3 (distilled + quantized)", metric_monitor=METRIC_MONITOR)

# Zero Shot Classification




Zero-shot classification is a machine learning task where a model is expected to classify input data into classes that it has not seen or been trained on. This is in contrast to traditional supervised learning, where a model learns to classify input data based on examples of each class provided during the training process.

The key idea behind zero-shot classification is that the model can generalize its knowledge to unseen classes by leveraging its understanding of the relationships between various concepts and the context of the input data. This makes zero-shot classification particularly useful in situations where it is difficult or impractical to obtain labeled training data for every possible class.





In [None]:
# Seed of the shuffle, dataset identifier and number of test examples to keep
SEED = 412
dataset_id = "emotion"
TEST_DATA_SIZE = 100
# TODO: Load the dataset using load_dataset with the dataset_id as parameter
reference_dataset = ...

# TODO: To save time, keep only TEST_DATA_SIZE examples of the (shuffled) test split
reference_dataset["test"] = reference_dataset["test"].shuffle(seed=SEED).select(range(...))

In [None]:
# Extract the ClassLabel feature from the "label" column
# TODO: get it using features["label"] of reference_dataset["train"]
label_features = ...
# Label names to classify with
candidate_labels = label_features.names



The first thing we need to do is create a dataset of synthetic examples. In setfit, we can do this by applying the `get_templated_dataset()` function to a dummy dataset. This function expects a few main things:

- A list of candidate labels to classify with. We'll use the labels from the reference dataset here, but this could be anything that's relevant to the task and dataset at hand.
- A template to generate examples with. By default, it is "This sentence is {}", where the {} will be filled by one of the candidate labels.
- A sample size $N$, which will create $N$ synthetic examples per class. We find $N = 8$ usually works best.

Armed with this information, let's first extract some candidate labels from the dataset:

In [None]:
from datasets import Dataset
from setfit import get_templated_dataset

# A dummy dataset to fill with synthetic examples
dummy_dataset = Dataset.from_dict({})
# NOTE: this rebinds the global name `train_dataset` used in the previous sections
train_dataset = get_templated_dataset(dummy_dataset, candidate_labels=candidate_labels, sample_size=8)
# Last expression of the cell: displays the dataset
train_dataset

In [None]:
from setfit import SetFitTrainer
from setfit import SetFitModel
# We will use the mpnet model here
# NOTE: this rebinds the global name `MODEL_ID` used in the previous sections
MODEL_ID = "sentence-transformers/paraphrase-mpnet-base-v2"
# TODO: Load the SetFit model from MODEL_ID with the SetFitModel.from_pretrained method
model = ...

# TODO: Prepare the SetFitTrainer with the model on the train_dataset, using reference_dataset["test"]
# as eval_dataset
trainer = SetFitTrainer(
    model=...,
    train_dataset=...,
    eval_dataset=...,
)

In [None]:
# TODO: Launch the training using the train() method of the trainer
...

In [None]:
%%time
# TODO: Evaluate the trainer using the evaluate() method of the trainer object
zeroshot_metrics = ...
zeroshot_metrics

In [None]:
# Evaluate a bit more metrics from the trainer
y_true, y_pred = evaluate_more_metrics(trainer)

In [None]:
from transformers import pipeline
import evaluate

# TODO: Load the "zero-shot-classification" pipeline from transformers
pipe = pipeline(...)

In [None]:
%%time
# Launch the predictions on the test texts with the loaded pipeline,
# classifying among the candidate_labels in batches of 16
zeroshot_preds = pipe(reference_dataset["test"]["text"], batch_size=16, candidate_labels=candidate_labels)

In [None]:
# Post-process the predictions: take the first label of each prediction
# and convert its name to the integer id of the ClassLabel feature
preds = [label_features.str2int(pred["labels"][0]) for pred in zeroshot_preds]

# TODO: load the "accuracy" metric
metric = evaluate.load(...)

# Compute the metrics against the reference labels and display them
transformers_metrics = metric.compute(predictions=preds, references=reference_dataset["test"]["label"])
transformers_metrics