In [1]:
#!pip install setfit

In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [1]:
import os
import pandas as pd
from datasets import Dataset
from setfit import SetFitModel, Trainer, TrainingArguments, sample_dataset
from sklearn.metrics import classification_report
import json

def write_json(output_path, json_data):
    with open(output_path, "w", encoding="utf-8") as outfile:
        json.dump(json_data, outfile, indent=4, ensure_ascii=False)


In [2]:
root_drive_dir = ''

dataset = pd.read_csv(os.path.join(root_drive_dir, "edos_labelled_aggregated.csv"))
dataset = dataset[dataset['label_sexist'] == 'sexist'][["text", "label_category", "label_vector", "split"]]

#### train, dev, test split

In [3]:
train = dataset[dataset['split'] == 'train']
train = train.reset_index(drop=True)
print('Train set shape:', train.shape)

dev = dataset[dataset['split'] == 'dev']
dev = dev.reset_index(drop=True)
print('Validation set shape:', dev.shape)

test = dataset[dataset['split'] == 'test']
test = test.reset_index(drop=True)
print('Test set shape:', test.shape)

Train set shape: (3398, 4)
Validation set shape: (486, 4)
Test set shape: (970, 4)


In [4]:
train['label_vector'].value_counts()

label_vector
2.1 descriptive attacks                                            717
2.2 aggressive and emotive attacks                                 673
3.1 casual use of gendered slurs, profanities, and insults         637
3.2 immutable gender differences and gender stereotypes            417
4.2 supporting systemic discrimination against women as a group    258
1.2 incitement and encouragement of harm                           254
2.3 dehumanising attacks & overt sexual objectification            200
4.1 supporting mistreatment of individual women                     75
3.3 backhanded gendered compliments                                 64
1.1 threats of harm                                                 56
3.4 condescending explanations or unwelcome advice                  47
Name: count, dtype: int64

#### pick n random samples per class

In [5]:
def pick_samples_per_class(data, classname):
  random_sample_no = 40
  seed = 444
  classsamples = data[data['label_vector'] == classname].sample(n=random_sample_no, random_state=seed)
  return classsamples

# Dictionary to store samples for each class
class_samples_dict = {}

for classname in train['label_vector'].unique().tolist():
  classsamples = pick_samples_per_class(train, classname)
  class_samples_dict[classname] = classsamples

# Combine samples of each class into a single DataFrame
train_samples = pd.concat(class_samples_dict.values(), ignore_index=True)


In [6]:
# convert to Dataset format
train_df = Dataset.from_pandas(train_samples[['text', 'label_vector']])
dev_df = Dataset.from_pandas(dev[['text', 'label_vector']])
test_df = Dataset.from_pandas(test[['text', 'label_vector']])

print(train_df)
print(dev_df)
print(test_df)

Dataset({
    features: ['text', 'label_vector'],
    num_rows: 440
})
Dataset({
    features: ['text', 'label_vector'],
    num_rows: 486
})
Dataset({
    features: ['text', 'label_vector'],
    num_rows: 970
})


In [7]:
sbert_path = "cross-encoder/nli-deberta-base"
labels = train_samples['label_vector'].unique().tolist()
output_dir = 'nli-deberta-base'
batch_size = 128
num_epochs = 5
num_epochs_clf = 15

model = SetFitModel.from_pretrained(sbert_path, labels=labels)

args = TrainingArguments(
            output_dir=output_dir,
            batch_size=batch_size,
            num_epochs=num_epochs,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
        )

trainer = Trainer(
          model=model,
          args=args,
          train_dataset=train_df,
          column_mapping={"text": "text", "label_vector": "label"} ,
          metric='accuracy'
  )

trainer.train()

No sentence-transformers model found with name /nfs/home/babaeih/.cache/torch/sentence_transformers/cross-encoder_nli-deberta-base. Creating a new one with MEAN pooling.
  return self.fget.__get__(instance, owner)()
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Applying column mapping to the training dataset


Map:   0%|          | 0/440 [00:00<?, ? examples/s]

***** Running training *****
  Num unique pairs = 176000
  Batch size = 128
  Num epochs = 5
  Total optimization steps = 6875


Epoch,Training Loss,Validation Loss


In [8]:
metric = trainer.evaluate(test_df)

print("EVAL result in test:", metric)

Applying column mapping to the evaluation dataset
***** Running evaluation *****


EVAL result in test: {'accuracy': 0.2979381443298969}


In [9]:
#finetuning on whole train
model.fit(train['text'].tolist(), train['label_vector'].tolist(), num_epochs=num_epochs_clf)

dev_predictions = model.predict(dev['text'].tolist())
test_predictions = model.predict(test['text'].tolist())

dev_result = classification_report(dev['label_vector'], list(dev_predictions), output_dict=True)
test_result = classification_report(test['label_vector'], list(test_predictions), output_dict=True)

path_to_save_dev = "TaskC_setfit_dev.json"
path_to_save_test = "TaskC_setfit_test.json"

write_json(output_path=path_to_save_dev,
           json_data={"outputs":list(dev_result), "gold": dev['label_vector'].tolist(), "evaluation": dev_result})

write_json(output_path=path_to_save_test,
           json_data={"outputs":list(test_result), "gold": test['label_vector'].tolist(), "evaluation": test_result})

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
