# pipelines.classifier

> Wrappers for few-shot classification

In [None]:
# | default_exp pipelines.classifier

In [None]:
# | hide
from nbdev.showdoc import *

In [None]:
# | export

import os
from typing import Any, Dict, Generator, List, Optional, Tuple, Union, Callable
from setfit import SetFitModel, TrainingArguments, Trainer, sample_dataset, SetFitTrainer

In [None]:
# | export

import warnings
from abc import ABC, abstractmethod
# Default model id used by `FewShotClassifier` when none is supplied.
DEFAULT_SETFIT_MODEL = "sentence-transformers/paraphrase-mpnet-base-v2"
# Model id substituted when `use_smaller=True` is requested.
SMALL_SETFIT_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# Default column names used when converting between arrays and HF datasets.
DATASET_TEXT = "text"
DATASET_LABEL = "label"

class ClassifierBase(ABC):
    """
    Abstract base class for text classifiers.

    Subclasses must implement `train` and `save`. The concrete helpers defined here
    read `self.model`, `self.predict`, `self.predict_proba`, `self.trainer` and
    `self.model_id_or_path`, so subclasses are expected to set those attributes.
    """

    @abstractmethod
    def train(self,
              X:List[str],
              y:Union[List[int], List[str]],
              max_steps:int=50,
              num_epochs:int=10,
              batch_size:int=32,
              metric='accuracy',
              callbacks=None,
              **kwargs,
             ):
        """
        Trains the classifier on a list of texts (`X`) and a list of labels (`y`).
        Additional keyword arguments are passed directly to `SetFit.TrainingArguments`.

        **Args:**

        - *X*: List of texts
        - *y*: List of integers representing labels
        - *max_steps*: If set to a positive number, the total number of training steps to perform. Overrides num_epochs. 
        - *num_epochs*: Number of epochs to train
        - *batch_size*: Batch size
        - *metric*: metric to use
        - *callbacks*:  A list of callbacks to customize the training loop.

        **Returns:**

        - None
        """
        pass

    @abstractmethod
    def save(self, save_path:str):
        """
        Save model to specified folder path, `save_path`
        """
        pass


    def sample_examples(self, X:list, y:list, num_samples:int=8,
                        text_key:str=DATASET_TEXT, label_key:str=DATASET_LABEL):
        """
        Sample a dataset with `num_samples` per class

        **Args:**

        - *X*: List of texts
        - *y*: List of labels
        - *num_samples*: Number of examples to sample per class
        - *text_key*: Name of the text column in the intermediate dataset
        - *label_key*: Name of the label column in the intermediate dataset

        **Returns:**

        - Tuple `(texts, labels)` of the sampled examples
        """
        full_dataset = self.arrays2dataset(X, y, text_key=text_key, label_key=label_key)
        sample = sample_dataset(full_dataset, label_column=label_key, num_samples=num_samples)
        # Materialize the sample once instead of converting it twice.
        sample_dict = sample.to_dict()
        return sample_dict[text_key], sample_dict[label_key]


    def arrays2dataset(self, X:List[str], y:Union[List[int], List[str]], 
                       text_key:str=DATASET_TEXT, label_key:str=DATASET_LABEL):
        """
        Convert train or test examples to HF dataset

        The returned dataset has two columns: `text_key` (holding `X`) and
        `label_key` (holding `y`).
        """
        from datasets import Dataset
        return Dataset.from_dict({text_key:X, label_key:y})


    def dataset2arrays(self, dataset, text_key:str=DATASET_TEXT, label_key:str=DATASET_LABEL):
        """
        Convert a Hugging Face dataset to X, y arrays

        **Args:**

        - *dataset*: Object exposing `to_dict()` (e.g., a Hugging Face dataset)
        - *text_key*: Name of the column holding the texts
        - *label_key*: Name of the column holding the labels

        **Returns:**

        - Tuple `(X, y)` with the contents of the two columns
        """
        data = dataset.to_dict()
        # Honor the caller-supplied column names rather than hard-coded ones.
        return data[text_key], data[label_key]

    def get_trainer(self):
        """
        Retrieves last trainer

        **Raises:**

        - `ValueError` if no trainer has been created yet.
        """
        # getattr guards against subclasses that never initialised `trainer`.
        trainer = getattr(self, 'trainer', None)
        if not trainer:
            raise ValueError('A trainer has not been created yet. You must first train a model on some labeled examples ' +\
                             'using the FewShotClassifier.train method.')
        return trainer


    def evaluate(self, X_eval:list, y_eval:list, print_report:bool=False, labels:Optional[List[str]]=None):
        """
        Evaluates labeled data using the trained model. 
        If `print_report` is True, prints the text classification report and returns it as a string.
        Otherwise, returns and prints a dictionary of the results.

        **Args:**

        - *X_eval*: List of texts to classify
        - *y_eval*: List of ground-truth labels
        - *print_report*: If True, produce the text report instead of a dictionary
        - *labels*: Optional display names for the classes; defaults to the model's own labels (if any)

        **Returns:**

        - The report as a string (`print_report=True`) or as a dictionary (`print_report=False`)
        """
        from sklearn.metrics import classification_report
        y_pred = self.predict(X_eval)
        model_labels = self.model.labels
        if model_labels:
            # The model predicts label names: map them to their integer positions.
            y_pred = [model_labels.index(y) for y in y_pred]
            # Only convert the ground truth if it is also expressed as label names.
            if len(y_eval) > 0 and y_eval[0] in model_labels:
                y_eval = [model_labels.index(y) for y in y_eval]

        result = classification_report(y_eval, y_pred, 
                                       output_dict=not print_report, 
                                       target_names=self.get_labels(labels))
        if print_report:
            # `result` is the formatted text report in this mode.
            print(result)
        else:
            import yaml
            print(yaml.dump(result, allow_unicode=True, default_flow_style=False))
        return result


    def explain(self, X:list, labels:Optional[List[str]]=None):
        """
        Explain the predictions on given examples in `X`. (Requires `shap` and `matplotlib` to be installed.)

        **Args:**

        - *X*: List of texts to explain
        - *labels*: Optional display names for the classes; defaults to the model's own labels (if any)

        **Raises:**

        - `ImportError` if `shap` or `matplotlib` is not installed.
        """
        try:
            import shap
        except ImportError as err:
            raise ImportError('Please install the shap library: pip install shap') from err

        try:
            # Imported only to confirm it is installed; not used directly here.
            import matplotlib  # noqa: F401
        except ImportError as err:
            raise ImportError('Please install the matplotlib library: pip install matplotlib') from err

        def f(x):
            return self.predict_proba(x)
        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(self.model_id_or_path)

        output_names = self.get_labels(labels)
        explainer = shap.Explainer(f, tokenizer, output_names=output_names)
        shap_values = explainer(X)
        shap.plots.text(shap_values)

    def get_labels(self, labels:Optional[List[str]]=None):
        """
        Inspect model and return labels

        Returns `labels` when it is non-empty, otherwise `self.model.labels`;
        returns None when neither is available.
        """
        return labels or self.model.labels or None

class FewShotClassifier(ClassifierBase):
    def __init__(
        self,
        model_id_or_path:str=DEFAULT_SETFIT_MODEL,
        use_smaller:bool=False,
        **kwargs,
    ):
        """
        `FewShotClassifier` can be used to train and run text classifiers. Currently based on SetFit.
        Additional keyword arguments are fed directly to `from_pretrained`.


        **Args:**

        - *model_id_or_path*: The Hugging Face model_id or path to model folder (e.g, path previously trained and saved model).
        - *use_smaller*:  If True, will use a smaller but performant model.

        """
        if use_smaller and model_id_or_path != DEFAULT_SETFIT_MODEL:
            # `use_smaller` takes precedence over an explicitly supplied model.
            warnings.warn(f'Over-writing supplied model ({model_id_or_path}) with {SMALL_SETFIT_MODEL} because use_smaller=True.')
        self.model_id_or_path = SMALL_SETFIT_MODEL if use_smaller else model_id_or_path
        self.model = SetFitModel.from_pretrained(self.model_id_or_path, **kwargs)
        # Expose the underlying model's prediction methods directly on the classifier.
        self.predict = self.model.predict
        self.predict_proba = self.model.predict_proba
        self.trainer = None # set in `FewShotClassifier.train`
        self.labels = [] # set in `FewShotClassifier.train`


    def train(self,
              X:List[str],
              y:Union[List[int], List[str]],
              num_epochs:int=10,
              batch_size:int=32,
              metric='accuracy',
              callbacks=None,
              **kwargs,
             ):
        """
        Trains the classifier on a list of texts (`X`) and a list of labels (`y`).
        Additional keyword arguments (e.g., `max_steps`) are passed directly to `SetFit.TrainingArguments`

        **Args:**

        - *X*: List of texts
        - *y*: List of integers representing labels
        - *num_epochs*: Number of epochs to train
        - *batch_size*: Batch size
        - *metric*: metric to use
        - *callbacks*:  A list of callbacks to customize the training loop.

        **Returns:**

        - None
        """

        # convert to HF dataset
        train_dataset = self.arrays2dataset(X, y, text_key='text', label_key='label')

        args = TrainingArguments(
                batch_size=batch_size,
                num_epochs=num_epochs,
                **kwargs
        )

        trainer = Trainer(
                    model=self.model,
                    args=args,
                    metric=metric,
                    callbacks=callbacks,
                    train_dataset=train_dataset,
                    column_mapping={"text": "text", "label": "label"}
        )
        trainer.train()

        # Keep the trainer so it can be retrieved later via `get_trainer`.
        self.trainer = trainer


    def save(self, save_path:str):
        """
        Save model to specified folder path, `save_path`
        """
        self.model.save_pretrained(save_path)

In [None]:
show_doc(FewShotClassifier.arrays2dataset)

---

[source](https://github.com/amaiya/onprem/blob/master/onprem/pipelines/classifier.py#L67){target="_blank" style="float:right; font-size:smaller"}

### ClassifierBase.arrays2dataset

>      ClassifierBase.arrays2dataset (X:List[str], y:Union[List[int],List[str]],
>                                     text_key:str='text',
>                                     label_key:str='label')

*Convert train or test examples to HF dataset*

In [None]:
show_doc(FewShotClassifier.dataset2arrays)

---

[source](https://github.com/amaiya/onprem/blob/master/onprem/pipelines/classifier.py#L76){target="_blank" style="float:right; font-size:smaller"}

### ClassifierBase.dataset2arrays

>      ClassifierBase.dataset2arrays (dataset, text_key:str='text',
>                                     label_key:str='label')

*Convert a Hugging Face dataset to X, y arrays*

In [None]:
show_doc(FewShotClassifier.evaluate)

---

[source](https://github.com/amaiya/onprem/blob/master/onprem/pipelines/classifier.py#L92){target="_blank" style="float:right; font-size:smaller"}

### ClassifierBase.evaluate

>      ClassifierBase.evaluate (X_eval:list, y_eval:list,
>                               print_report:bool=False, labels:List[str]=[])

*Evaluates labeled data using the trained model. 
If `print_report` is True, prints classification report and returns nothing.
Otherwise, returns and prints a dictionary of the results.*

In [None]:
show_doc(FewShotClassifier.explain)

---

[source](https://github.com/amaiya/onprem/blob/master/onprem/pipelines/classifier.py#L119){target="_blank" style="float:right; font-size:smaller"}

### ClassifierBase.explain

>      ClassifierBase.explain (X:list, labels:List[str]=[])

*Explain the predictions on given examples in `X`. (Requires `shap` and `matplotlib` to be installed.)*

In [None]:
show_doc(FewShotClassifier.get_trainer)

---

[source](https://github.com/amaiya/onprem/blob/master/onprem/pipelines/classifier.py#L82){target="_blank" style="float:right; font-size:smaller"}

### ClassifierBase.get_trainer

>      ClassifierBase.get_trainer ()

*Retrieves last trainer*

In [None]:
show_doc(FewShotClassifier.sample_examples)

---

[source](https://github.com/amaiya/onprem/blob/master/onprem/pipelines/classifier.py#L55){target="_blank" style="float:right; font-size:smaller"}

### ClassifierBase.sample_examples

>      ClassifierBase.sample_examples (X:list, y:list, num_samples:int=8,
>                                      text_key:str='text',
>                                      label_key:str='label')

*Sample a dataset with `num_samples` per class*

In [None]:
show_doc(FewShotClassifier.save)

---

[source](https://github.com/amaiya/onprem/blob/master/onprem/pipelines/classifier.py#L228){target="_blank" style="float:right; font-size:smaller"}

### FewShotClassifier.save

>      FewShotClassifier.save (save_path:str)

*Save model to specified folder path, `save_path`*

In [None]:
show_doc(FewShotClassifier.train)

---

[source](https://github.com/amaiya/onprem/blob/master/onprem/pipelines/classifier.py#L181){target="_blank" style="float:right; font-size:smaller"}

### FewShotClassifier.train

>      FewShotClassifier.train (X:List[str], y:Union[List[int],List[str]],
>                               num_epochs:int=10, batch_size:int=32, **kwargs)

*Trains the classifier on a list of texts (`X`) and a list of labels (`y`).
Additional keyword arguments are passed directly to `SetFit.TrainingArguments`.

**Args:**

- *X*: List of texts
- *y*: List of integers representing labels
- *num_epochs*: Number of epochs to train
- *batch_size*: Batch size

**Returns:**

- None*

In [None]:
# | notest

# Build a classifier with the default arguments (model_id_or_path=DEFAULT_SETFIT_MODEL, use_smaller=False).
clf = FewShotClassifier()

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [None]:
# | notest

from datasets import load_dataset

In [None]:
# | notest
# Load the "SetFit/sst2" dataset and convert its train/test splits to plain (texts, labels) arrays.
dataset = load_dataset("SetFit/sst2")
X_train, y_train = clf.dataset2arrays(dataset["train"], text_key="text", label_key="label")
X_test, y_test = clf.dataset2arrays(dataset["test"], text_key="text", label_key="label")
# Draw a few-shot training set: 8 examples per class.
X_sample, y_sample = clf.sample_examples(X_train,  y_train, label_key="label", num_samples=8)



In [None]:
# | notest
# `max_steps` is not a named parameter of `FewShotClassifier.train`; it is forwarded
# through **kwargs to `TrainingArguments`.
clf.train(X_sample,  y_sample, max_steps=50)

Applying column mapping to the training dataset


Map:   0%|          | 0/16 [00:00<?, ? examples/s]

***** Running training *****
  Num unique pairs = 144
  Batch size = 32
  Num epochs = 10
  Total optimization steps = 50


Step,Training Loss


In [None]:
# | notest
# With `print_report` left at its default (False), `evaluate` prints a YAML dump of the
# metrics dictionary and also returns that dictionary.
results = clf.evaluate(X_test, y_test, labels=['negative', 'positive'])

accuracy: 0.9060955518945635
macro avg:
  f1-score: 0.9060502208366357
  precision: 0.9067880482593942
  recall: 0.9060618232875919
  support: 1821.0
negative:
  f1-score: 0.9081139172487909
  precision: 0.8904109589041096
  recall: 0.9265350877192983
  support: 912.0
positive:
  f1-score: 0.9039865244244806
  precision: 0.9231651376146789
  recall: 0.8855885588558856
  support: 909.0
weighted avg:
  f1-score: 0.9060536206659804
  precision: 0.9067610678815438
  recall: 0.9060955518945635
  support: 1821.0



In [None]:
# | notest
new_data = ["i loved the spiderman movie!", "pineapple on pizza is the worst 🤮"]

In [None]:
# | notest
preds = clf.predict(new_data)
preds

tensor([1, 0])

In [None]:
# | notest
preds = clf.predict_proba(new_data)
preds

tensor([[0.1028, 0.8972],
        [0.9357, 0.0643]], dtype=torch.float64)

In [None]:
# | notest
# Persist the trained model to a folder so it can be reloaded later.
clf.save('/tmp/my_fewshot_model')

In [None]:
# | notest
# Reload the classifier from the folder saved above and predict on the same inputs.
clf = FewShotClassifier('/tmp/my_fewshot_model')
preds = clf.predict(new_data)
preds

tensor([1, 0])

In [None]:
# | notest
# Explain the predictions; requires `shap` and `matplotlib` to be installed.
# `labels` supplies the class display names passed to the explainer.
clf.explain(new_data, labels=['negative', 'positive'])

In [None]:
# | hide
import nbdev

nbdev.nbdev_export()