In [None]:
# Colab setup: uncomment the next line to install the `setfit` package into a fresh runtime.
#!pip install setfit

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the dataset is read from a path under this mount point.
colab_mount_point = '/content/drive'
drive.mount(colab_mount_point)

Mounted at /content/drive


In [None]:
import os
import pandas as pd
from datasets import Dataset
from setfit import SetFitModel, Trainer, TrainingArguments, sample_dataset
from sklearn.metrics import classification_report
import json

def write_json(output_path, json_data):
    """Serialise `json_data` to `output_path` as pretty-printed UTF-8 JSON.

    Args:
        output_path: Destination file path; an existing file is overwritten.
        json_data: Any object accepted by `json.dump`.

    The file is written with a 4-space indent, and non-ASCII characters are
    stored as-is rather than as `\\uXXXX` escapes.
    """
    with open(output_path, mode="w", encoding="utf-8") as sink:
        json.dump(json_data, sink, ensure_ascii=False, indent=4)


In [None]:
root_drive_dir = '/content/drive/My Drive/Colab Notebooks/DataHub Projects/CrispyWork/edos/data/'

# Read the aggregated EDOS annotations and keep only the three columns the notebook uses:
# the post text, the binary label, and the split marker.
edos_csv_path = os.path.join(root_drive_dir, "edos_labelled_aggregated.csv")
edos_columns = ["text", "label_sexist", "split"]
dataset = pd.read_csv(edos_csv_path)[edos_columns]

#### train, dev, test split

In [None]:
def _take_split(frame, split_name):
    """Return the rows of `frame` whose 'split' column equals `split_name`, re-indexed from 0."""
    return frame.loc[frame['split'].eq(split_name)].reset_index(drop=True)


# The CSV carries its own train/dev/test assignment in the 'split' column.
train = _take_split(dataset, 'train')
print('Train set shape:', train.shape)

dev = _take_split(dataset, 'dev')
print('Validation set shape:', dev.shape)

test = _take_split(dataset, 'test')
print('Test set shape:', test.shape)

Train set shape: (14000, 3)
Validation set shape: (2000, 3)
Test set shape: (4000, 3)


#### pick n random samples per class

In [None]:
random_sample_no = 40  # examples drawn per class for the few-shot training set
seed = 444             # fixed random_state so the same rows are drawn on every run

# Row masks for the two classes of the training split.
is_not_sexist = train['label_sexist'] == 'not sexist'
is_sexist = train['label_sexist'] == 'sexist'

notsexist_samples = train.loc[is_not_sexist].sample(n=random_sample_no, random_state=seed)
sexist_samples = train.loc[is_sexist].sample(n=random_sample_no, random_state=seed)

# Stack both classes ('not sexist' rows first) and renumber the rows from 0.
train_samples = pd.concat([notsexist_samples, sexist_samples], ignore_index=True)

In [None]:
# Wrap each split in a `datasets.Dataset`, keeping only the text and label columns.
feature_columns = ['text', 'label_sexist']

train_df = Dataset.from_pandas(train_samples[feature_columns])
dev_df = Dataset.from_pandas(dev[feature_columns])
test_df = Dataset.from_pandas(test[feature_columns])

# Show the schema and row count of every split.
for hf_split in (train_df, dev_df, test_df):
    print(hf_split)

Dataset({
    features: ['text', 'label_sexist'],
    num_rows: 80
})
Dataset({
    features: ['text', 'label_sexist'],
    num_rows: 2000
})
Dataset({
    features: ['text', 'label_sexist'],
    num_rows: 4000
})


In [None]:
# --- Few-shot SetFit training configuration ---
# NOTE(review): the checkpoint name says "cross-encoder"; confirm that this is the intended
# body for SetFit's contrastive embedding fine-tuning.
sbert_path = "cross-encoder/nli-deberta-base"
labels = ["not sexist", "sexist"]
output_dir = 'nli-deberta-base'
batch_size = 16
num_epochs = 5
num_epochs_clf = 15
# Upper bound on evaluation steps per evaluation pass, so that the per-epoch validation
# on the dev split stays cheap compared with training on the 80 few-shot examples.
eval_max_steps = 100

model = SetFitModel.from_pretrained(sbert_path, labels=labels)

args = TrainingArguments(
    output_dir=output_dir,
    batch_size=batch_size,
    num_epochs=num_epochs,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    eval_max_steps=eval_max_steps,
)

# FIX: `evaluation_strategy="epoch"` and `load_best_model_at_end=True` need a validation set.
# Without `eval_dataset` the Trainer has nothing to compute a validation loss on, so no
# "best" checkpoint can be chosen. The dev split prepared earlier is passed here; the
# column mapping below applies to it as well as to the training set.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_df,
    eval_dataset=dev_df,
    column_mapping={"text": "text", "label_sexist": "label"},
    metric='f1',
)

trainer.train()

# Final score on the held-out test split, using the metric configured on the Trainer.
metric = trainer.evaluate(test_df)

print("EVAL result in test:", metric)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/975 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/557M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


tokenizer_config.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/778 [00:00<?, ?B/s]

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Applying column mapping to the training dataset


Map:   0%|          | 0/80 [00:00<?, ? examples/s]

***** Running training *****
  Num unique pairs = 3280
  Batch size = 16
  Num epochs = 5
  Total optimization steps = 1025


Epoch,Training Loss,Validation Loss


Applying column mapping to the evaluation dataset
***** Running evaluation *****


Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

EVAL result in test: {'f1': 0.40212483399734394}


In [None]:
# Fit the model on the whole training split (not just the few-shot sample).
# NOTE(review): presumably this only refits the classification head on embeddings from the
# already fine-tuned body — confirm against the SetFitModel.fit documentation.
model.fit(train['text'].tolist(), train['label_sexist'].tolist(), num_epochs=num_epochs_clf)

dev_predictions = model.predict(dev['text'].tolist())
test_predictions = model.predict(test['text'].tolist())

# Per-class precision/recall/F1 as nested dicts, so they can be serialised to JSON.
dev_result = classification_report(dev['label_sexist'], list(dev_predictions), output_dict=True)
test_result = classification_report(test['label_sexist'], list(test_predictions), output_dict=True)

path_to_save_dev = "TaskA_setfit_dev.json"
path_to_save_test = "TaskA_setfit_test.json"

# FIX: "outputs" must hold the predicted labels, aligned row by row with "gold".
# It previously stored list(dev_result) / list(test_result), which is just the key names
# of the classification report. Each prediction is cast to a plain `str` so the values
# are JSON-serialisable whatever string type `predict` returns.
write_json(output_path=path_to_save_dev,
           json_data={"outputs": [str(label) for label in dev_predictions],
                      "gold": dev['label_sexist'].tolist(),
                      "evaluation": dev_result})

write_json(output_path=path_to_save_test,
           json_data={"outputs": [str(label) for label in test_predictions],
                      "gold": test['label_sexist'].tolist(),
                      "evaluation": test_result})

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
