In [1]:
import torch
import pandas as pd
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, pipeline
from peft import LoraConfig
from trl import SFTTrainer
from tqdm import tqdm

df = pd.read_csv("edos_labelled_aggregated.csv")

train_df, dev_df, test_df = df[df['split'] == 'train'], df[df['split'] == 'dev'], df[df['split'] == 'test']

print(f"train: {train_df.shape[0]}, dev:{dev_df.shape[0]}, test:{test_df.shape[0]}")

prompt_template = """Binary Sexism Detection: A two-class (or binary) classification where systems have to predict whether a post is sexist or not sexist.

Given a post determine whether a post is sexist or not sexist.

### Post: {POST}
### Answer: """

column='label_sexist'

train: 14000, dev:2000, test:4000


In [2]:
llm_path = "task_a_llm"

tokenizer = AutoTokenizer.from_pretrained(llm_path, padding_side="left")

tokenizer.pad_token = tokenizer.eos_token

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load model with 4-bit precision
finetuned_model = AutoModelForCausalLM.from_pretrained(llm_path, quantization_config=quant_config, device_map={"": 0})

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
from torch.utils.data import DataLoader

class EDOSDataset(Dataset):
    def __init__(self, df, prompt_template, column):
        self.texts = df['text'].tolist()
        self.labels = df[column].tolist()
        self.prompt_template=prompt_template

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idxs):
        inputs, inputs_labels = [], []
        for idx in idxs:
            
            inputs.append(self.prompt_template.replace("{POST}", self.texts[idx]))
            inputs_labels.append(self.labels[idx])
        
        return {"inputs":inputs, "labels": inputs_labels}
    
def make_the_generations(model, tokenizer, data_loader):
    gen_texts, labels = [], []
    
    for batch in tqdm(data_loader):
        input_data = batch['inputs']
        labels += batch['labels']
        tokenized_input_data = tokenizer(input_data, padding=True, max_length=512, truncation=True, return_tensors="pt").to("cuda:0")
        # print(tokenized_input_data)
        outputs = finetuned_model.generate(
            **tokenized_input_data,
            pad_token_id= tokenizer.eos_token_id,
            max_new_tokens=50,
            do_sample=False
        )
        generated_texts = [tokenizer.decode(outputs[idx], skip_special_tokens=True)[len(input_data[idx]):]
                          for idx in range(len(outputs))]
        gen_texts += generated_texts
    return gen_texts, labels

In [4]:
batch_size = 64

train_data = EDOSDataset(df=train_df, prompt_template=prompt_template, column=column)
train_dataloader =  DataLoader(train_data, batch_size=batch_size, shuffle=False)
train_texts, train_labels = make_the_generations(finetuned_model, tokenizer, train_dataloader)

100%|██████████| 219/219 [20:12<00:00,  5.54s/it]


In [5]:
dev_data = EDOSDataset(df=dev_df, prompt_template=prompt_template, column=column)
dev_dataloader =  DataLoader(dev_data, batch_size=batch_size, shuffle=False)
dev_texts, dev_labels = make_the_generations(finetuned_model, tokenizer, dev_dataloader)

100%|██████████| 32/32 [02:00<00:00,  3.76s/it]


In [6]:
test_data = EDOSDataset(df=test_df, prompt_template=prompt_template, column=column)
test_dataloader =  DataLoader(test_data, batch_size=batch_size, shuffle=False)
test_texts, test_labels = make_the_generations(finetuned_model, tokenizer, test_dataloader)

100%|██████████| 63/63 [04:09<00:00,  3.96s/it]


In [8]:
import json

task_gen_data = {
    "train": [{"text":text, "label":label} for text, label in zip(train_texts, train_labels)],
    "dev": [{"text":text, "label":label} for text, label in zip(dev_texts, dev_labels)],
    "test": [{"text":text, "label":label} for text, label in zip(test_texts, test_labels)]
}

with open(llm_path+"_gens.json", "w", encoding="utf-8") as outfile:
    json.dump(task_gen_data, outfile, indent=4, ensure_ascii=False)

In [18]:
from sentence_transformers import SentenceTransformer

sbert = SentenceTransformer("sentence-transformers/nli-mpnet-base-v2")


train_texts_vec = sbert.encode(train_texts, show_progress_bar=True)
dev_texts_vec = sbert.encode(dev_texts, show_progress_bar=True)
test_texts_vec = sbert.encode(test_texts, show_progress_bar=True)

train_texts_all_vec = sbert.encode(train_texts+dev_texts, show_progress_bar=True)
train_labels_all = train_labels+dev_labels

Batches:   0%|          | 0/438 [00:00<?, ?it/s]

Batches:   0%|          | 0/63 [00:00<?, ?it/s]

Batches:   0%|          | 0/125 [00:00<?, ?it/s]

Batches:   0%|          | 0/500 [00:00<?, ?it/s]

In [19]:
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression, RidgeClassifierCV
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

# clf = MLPClassifier(hidden_layer_sizes=(100,100),max_iter=1000)
clf=LogisticRegression()
clf.fit(train_texts_vec, train_labels)

train_predict = clf.predict(train_texts_vec)
dev_predict = clf.predict(dev_texts_vec)
test_predict = clf.predict(test_texts_vec)

print("TRAIN"+"-"*150)
print(classification_report(train_labels, train_predict, digits=4))
print("DEV"+"-"*150)
print(classification_report(dev_labels, dev_predict, digits=4))
print("TEST"+"-"*150)
print(classification_report(test_labels, test_predict, digits=4))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


TRAIN------------------------------------------------------------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

  not sexist     0.9403    0.9610    0.9506     10602
      sexist     0.8695    0.8096    0.8385      3398

    accuracy                         0.9243     14000
   macro avg     0.9049    0.8853    0.8945     14000
weighted avg     0.9231    0.9243    0.9233     14000

DEV------------------------------------------------------------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

  not sexist     0.8948    0.9267    0.9104      1514
      sexist     0.7431    0.6605    0.6993       486

    accuracy                         0.8620      2000
   macro avg     0.8189    0.7936    0.8049      2000
weighted avg     0.8579    0.8620    0.8592      2000

TEST--------------------------------

In [20]:
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression, RidgeClassifierCV
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import AdaBoostClassifier

clf = LogisticRegression()
clf.fit(train_texts_all_vec, train_labels_all)

train_all_predict = clf.predict(train_texts_all_vec)
test_predict = clf.predict(test_texts_vec)

print("TRAIN+DEV"+"-"*50)
print(classification_report(train_labels_all, train_all_predict, digits=4))

print("TEST"+"-"*50)
print(classification_report(test_labels, test_predict, digits=4))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


TRAIN+DEV--------------------------------------------------
              precision    recall  f1-score   support

  not sexist     0.9334    0.9577    0.9454     12116
      sexist     0.8563    0.7868    0.8201      3884

    accuracy                         0.9162     16000
   macro avg     0.8948    0.8722    0.8827     16000
weighted avg     0.9147    0.9162    0.9150     16000

TEST--------------------------------------------------
              precision    recall  f1-score   support

  not sexist     0.9022    0.9314    0.9165      3030
      sexist     0.7615    0.6845    0.7210       970

    accuracy                         0.8715      4000
   macro avg     0.8318    0.8079    0.8187      4000
weighted avg     0.8681    0.8715    0.8691      4000

