This Jupyter notebook was used with Google Colaboratory to train and test the SetFit classifier. It requires the `sentence_transformers` and `setfit` libraries. 

In [1]:
!pip install sentence_transformers datasets evaluate setfit
!pip install setfit[optuna]

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m81.9/86.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting setfit
  Downloading setfit-0.7.0-py3-none-any.whl (45 kB)
[2

In [4]:
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sentence_transformers import SentenceTransformer, InputExample, losses, models, datasets, evaluation
from torch.utils.data import DataLoader

from sklearn.manifold import TSNE
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold, train_test_split

from matplotlib import pyplot as plt

# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import numpy as np

import torch
import random
import csv

from datasets import load_dataset, Dataset
from sentence_transformers.losses import CosineSimilarityLoss

from setfit import SetFitModel, SetFitTrainer, sample_dataset

def set_seed(seed):
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)

In [3]:
def sentence_pairs_generation(sentences, labels, pairs):
	# initialize two empty lists to hold the (sentence, sentence) pairs and
	# labels to indicate if a pair is positive or negative

  numClassesList = np.unique(labels)
  idx = [np.where(labels == i)[0] for i in numClassesList]

  for idxA in range(len(sentences)):
    currentSentence = sentences[idxA]
    label = labels[idxA]
    idxB = np.random.choice(idx[np.where(numClassesList==label)[0][0]])
    posSentence = sentences[idxB]
		  # prepare a positive pair and update the sentences and labels
		  # lists, respectively
    pairs.append(InputExample(texts=[currentSentence, posSentence], label=1.0))

    negIdx = np.where(labels != label)[0]
    negSentence = sentences[np.random.choice(negIdx)]
		  # prepare a negative pair of images and update our lists
    pairs.append(InputExample(texts=[currentSentence, negSentence], label=0.0))

	# return a 2-tuple of our image pairs and labels
  return (pairs)

# Dataset

This notebook requires the 

In [5]:
train_df = pd.read_csv('train.csv')
eval_df = pd.read_csv('dev.csv')
test_df = pd.read_csv('test.csv')

full_df = pd.read_csv('full_test.csv', names=["text", "label"])

text_col='text'
category_col='label'

In [49]:
train_dev, test_df = train_test_split(full_df, train_size=100, random_state=500)
train_df, dev_df = train_test_split(train_dev, test_size=.2, random_state=500)
full_train_df, full_dev_df = train_test_split(full_df, test_size=.2, random_state=42)

In [50]:
pop_ds = load_dataset("csv", data_files={"train": "train.csv", "dev": "dev.csv", "test": "test.csv"})
train_dataset = Dataset.from_pandas(train_df, split="train")
eval_dataset = Dataset.from_pandas(dev_df, split="dev")
test_dataset = Dataset.from_pandas(test_df, split="test")

full_train_dataset = Dataset.from_pandas(full_train_df, split="dev")
full_dev_dataset = Dataset.from_pandas(full_dev_df, split="test")

# Hyperparameter Search

Based on the `deutsche-telekom/gbert-large-paraphrase-cosine` GBERT model for `sentence_transformers`, hyperparameters are optimized using the `optuna` framework. 

In [52]:
from setfit import SetFitModel

def model_init(params):
    params = params or {}
    max_iter = params.get("max_iter", 100)
    solver = params.get("solver", "liblinear")
    params = {
        "head_params": {
            "max_iter": max_iter,
            "solver": solver,
        }
    }
    return SetFitModel.from_pretrained("deutsche-telekom/gbert-large-paraphrase-cosine", **params)


In [53]:
def hp_space(trial):  # Training parameters
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1.4e-6, log=True),
        "num_epochs": trial.suggest_int("num_epochs", 1, 2),
        "batch_size": trial.suggest_categorical("batch_size", [4]),
        "seed": trial.suggest_int("seed", 1, 40),
        "num_iterations": trial.suggest_int("num_iterations", 12, 18),
        "max_iter": trial.suggest_int("max_iter", 50, 300),
        "solver": trial.suggest_categorical("solver", ["newton-cg", "liblinear"]),
    }

In [None]:
from datasets import Dataset
from setfit import SetFitTrainer

trainer = SetFitTrainer(
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    model_init=model_init,
    num_epochs=2,
    batch_size=4,
    column_mapping={"text": "text", "label": "label"},
)

best_run = trainer.hyperparameter_search(
    direction="maximize",
    hp_space=hp_space,
    n_trials=3
)

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
[I 2023-10-13 10:14:20,626] A new study created in memory with name: no-name-4061f2a4-ac3e-4b50-888b-3f1734b6c567
Trial: {'learning_rate': 1.3908606175010484e-06, 'seed': 17, 'num_iterations': 16, 'max_iter': 300, 'solver': 'newton-cg'}
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Applying column mapping to training dataset


Generating Training Pairs:   0%|          | 0/16 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 2560
  Num epochs = 2
  Total optimization steps = 1280
  Total train batch size = 4


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/640 [00:00<?, ?it/s]

Iteration:   0%|          | 0/640 [00:00<?, ?it/s]

Applying column mapping to evaluation dataset
***** Running evaluation *****
[I 2023-10-13 10:28:24,062] Trial 0 finished with value: 0.9 and parameters: {'learning_rate': 1.3908606175010484e-06, 'seed': 17, 'num_iterations': 16, 'max_iter': 300, 'solver': 'newton-cg'}. Best is trial 0 with value: 0.9.
Trial: {'learning_rate': 1.168641755508374e-06, 'seed': 14, 'num_iterations': 18, 'max_iter': 68, 'solver': 'newton-cg'}
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Applying column mapping to training dataset


Generating Training Pairs:   0%|          | 0/18 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 2880
  Num epochs = 2
  Total optimization steps = 1440
  Total train batch size = 4


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/720 [00:00<?, ?it/s]

Iteration:   0%|          | 0/720 [00:00<?, ?it/s]

Applying column mapping to evaluation dataset
***** Running evaluation *****
[I 2023-10-13 10:43:59,804] Trial 1 finished with value: 0.9 and parameters: {'learning_rate': 1.168641755508374e-06, 'seed': 14, 'num_iterations': 18, 'max_iter': 68, 'solver': 'newton-cg'}. Best is trial 0 with value: 0.9.
Trial: {'learning_rate': 1.1292423616828362e-06, 'seed': 40, 'num_iterations': 17, 'max_iter': 193, 'solver': 'newton-cg'}
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Applying column mapping to training dataset


Generating Training Pairs:   0%|          | 0/17 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 2720
  Num epochs = 2
  Total optimization steps = 1360
  Total train batch size = 4


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/680 [00:00<?, ?it/s]

Iteration:   0%|          | 0/680 [00:00<?, ?it/s]

Applying column mapping to evaluation dataset
***** Running evaluation *****
[I 2023-10-13 10:58:35,303] Trial 2 finished with value: 0.9 and parameters: {'learning_rate': 1.1292423616828362e-06, 'seed': 40, 'num_iterations': 17, 'max_iter': 193, 'solver': 'newton-cg'}. Best is trial 0 with value: 0.9.


In [None]:
print(best_run)

BestRun(run_id='0', objective=0.9, hyperparameters={'learning_rate': 1.3908606175010484e-06, 'seed': 17, 'num_iterations': 16, 'max_iter': 300, 'solver': 'newton-cg'}, backend=<optuna.study.study.Study object at 0x7d69ca763910>)


# SetFit Training

With the identified hyperparameters saved in `best_run`, the final classifier is trained. 

In [57]:
trainer = SetFitTrainer(
    train_dataset=full_train_dataset,
    eval_dataset=full_dev_dataset,
    model_init=model_init,
    num_epochs=2,
    batch_size=4,
    column_mapping={"text": "text", "label": "label"},
)

trainer.apply_hyperparameters(best_run.hyperparameters, final_model=True)
trainer.train()

metrics = trainer.evaluate()
metrics

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Applying column mapping to training dataset


Generating Training Pairs:   0%|          | 0/16 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 8640
  Num epochs = 2
  Total optimization steps = 4320
  Total train batch size = 4


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2160 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2160 [00:00<?, ?it/s]

Applying column mapping to evaluation dataset
***** Running evaluation *****


{'accuracy': 0.7941176470588235}

# Testing

In [46]:
X_test = test_df['text'].values.tolist()
y_test = test_df['label'].values.tolist()

y_preds = trainer.model.predict(X_test)

print(classification_report(y_test, y_preds))
print(classification_report(y_test, y_preds, output_dict=True)["accuracy"])
print(classification_report(y_test, y_preds, output_dict=True)["weighted avg"])

              precision    recall  f1-score   support

           0       0.73      0.73      0.73       113
           1       0.76      0.76      0.76       125

    accuracy                           0.75       238
   macro avg       0.75      0.75      0.75       238
weighted avg       0.75      0.75      0.75       238

0.7478991596638656
{'precision': 0.7478991596638656, 'recall': 0.7478991596638656, 'f1-score': 0.7478991596638656, 'support': 238}


## Stratified 5-fold CV

In [None]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)

In [None]:
output = []
preds = []
y_truths = []

X = full_df['text'].values.tolist()
y = full_df['label'].values.tolist()


for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    train_dataset = full_df.iloc[train_index]
    eval_dataset = full_df.iloc[test_index]

    cv_trainer = SetFitTrainer(
        train_dataset=Dataset.from_pandas(train_dataset),
        eval_dataset=Dataset.from_pandas(eval_dataset),
        model_init=model_init,
        column_mapping={"text": "text", "label": "label"},
    )

    cv_trainer.apply_hyperparameters(best_run.hyperparameters, final_model=True)
    cv_trainer.train()

    y_preds = trainer.model.predict(X_test)


preds = np.concatenate(preds).ravel().tolist()
y_truths = np.concatenate(y_truths).ravel().tolist()

Downloading (…)lve/main/config.json:   0%|          | 0.00/626 [00:00<?, ?B/s]

Downloading (…)52347/.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

Downloading (…)db4348b52347/LICENSE:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading (…)4348b52347/README.md:   0%|          | 0.00/2.80k [00:00<?, ?B/s]

Downloading (…)48b52347/config.json:   0%|          | 0.00/626 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)52347/tokenizer.json:   0%|          | 0.00/729k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/395 [00:00<?, ?B/s]

Downloading (…)4348b52347/vocab.txt:   0%|          | 0.00/240k [00:00<?, ?B/s]

Downloading (…)8b52347/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Applying column mapping to training dataset


Generating Training Pairs:   0%|          | 0/16 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 8640
  Num epochs = 1
  Total optimization steps = 540
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/540 [00:00<?, ?it/s]