In [1]:
import torch
import pandas as pd
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, pipeline
from peft import LoraConfig
from trl import SFTTrainer
from tqdm import tqdm

df = pd.read_csv("edos_labelled_aggregated.csv")
df = df[df['label_category'] != 'none']

train_df, dev_df, test_df = df[df['split'] == 'train'], df[df['split'] == 'dev'], df[df['split'] == 'test']

print(f"train: {train_df.shape[0]}, dev:{dev_df.shape[0]}, test:{test_df.shape[0]}")

# Define the prompt template
prompt_template = """Category of Sexism: for posts which are sexist, a four-class classification where systems have to predict one of four categories: (1) threats, (2)  derogation, (3) animosity, (4) prejudiced discussion. 

Given a post determine the post is belong to which class:
1. threats, plans to harm and incitement
2. derogation 
3. animosity
4. prejudiced discussions

### Post: {POST}
### Answer: """

column='label_category'

train: 3398, dev:486, test:970


In [3]:
llm_path = "task_b_llm"

tokenizer = AutoTokenizer.from_pretrained(llm_path, padding_side="left")

tokenizer.pad_token = tokenizer.eos_token

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load model with 4-bit precision
finetuned_model = AutoModelForCausalLM.from_pretrained(llm_path, quantization_config=quant_config, device_map={"": 0})

In [None]:
# len(tokenizer.encode("1. threats, plans to harm and incitement"))

In [None]:
from torch.utils.data import DataLoader

class EDOSDataset(Dataset):
    def __init__(self, df, prompt_template, column):
        self.texts = df['text'].tolist()
        self.labels = df[column].tolist()
        self.prompt_template=prompt_template

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idxs):
        inputs, inputs_labels = [], []
        for idx in idxs:
            
            inputs.append(self.prompt_template.replace("{POST}", self.texts[idx]))
            inputs_labels.append(self.labels[idx])
        
        return {"inputs":inputs, "labels": inputs_labels}
    
def make_the_generations(model, tokenizer, data_loader):
    gen_texts, labels = [], []
    
    for batch in tqdm(data_loader):
        input_data = batch['inputs']
        labels += batch['labels']
        tokenized_input_data = tokenizer(input_data, padding=True, max_length=512, truncation=True, return_tensors="pt").to("cuda:0")
        # print(tokenized_input_data)
        outputs = finetuned_model.generate(
            **tokenized_input_data,
            pad_token_id= tokenizer.eos_token_id,
            max_new_tokens=15,
            do_sample=False
        )
        generated_texts = [tokenizer.decode(outputs[idx], skip_special_tokens=True)[len(input_data[idx]):]
                          for idx in range(len(outputs))]
        gen_texts += generated_texts
    return gen_texts, labels

In [None]:
batch_size = 64

train_data = EDOSDataset(df=train_df, prompt_template=prompt_template, column=column)
train_dataloader =  DataLoader(train_data, batch_size=batch_size, shuffle=False)
train_texts, train_labels = make_the_generations(finetuned_model, tokenizer, train_dataloader)

In [None]:
dev_data = EDOSDataset(df=dev_df, prompt_template=prompt_template, column=column)
dev_dataloader =  DataLoader(dev_data, batch_size=batch_size, shuffle=False)
dev_texts, dev_labels = make_the_generations(finetuned_model, tokenizer, dev_dataloader)

In [None]:
test_data = EDOSDataset(df=test_df, prompt_template=prompt_template, column=column)
test_dataloader =  DataLoader(test_data, batch_size=batch_size, shuffle=False)
test_texts, test_labels = make_the_generations(finetuned_model, tokenizer, test_dataloader)

In [None]:
# import json

# task_gen_data = {
#     "train": [{"text":text, "label":label} for text, label in zip(train_texts, train_labels)],
#     "dev": [{"text":text, "label":label} for text, label in zip(dev_texts, dev_labels)],
#     "test": [{"text":text, "label":label} for text, label in zip(test_texts, test_labels)]
# }

# with open(llm_path+"_gens.json", "w", encoding="utf-8") as outfile:
#     json.dump(task_gen_data, outfile, indent=4, ensure_ascii=False)

In [14]:
from sentence_transformers import SentenceTransformer

sbert = SentenceTransformer("sentence-transformers/nli-mpnet-base-v2")

train_texts_vec = sbert.encode(train_texts, show_progress_bar=True)
dev_texts_vec = sbert.encode(dev_texts, show_progress_bar=True)
test_texts_vec = sbert.encode(test_texts, show_progress_bar=True)

train_texts_all_vec = sbert.encode(train_texts+dev_texts, show_progress_bar=True)
train_labels_all = train_labels+dev_labels

Batches:   0%|          | 0/107 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/31 [00:00<?, ?it/s]

Batches:   0%|          | 0/122 [00:00<?, ?it/s]

In [18]:
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression, RidgeClassifierCV
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

# clf = MLPClassifier(hidden_layer_sizes=(100,100),max_iter=1000)
clf=RidgeClassifierCV()
clf.fit(train_texts_vec, train_labels)

train_predict = clf.predict(train_texts_vec)
dev_predict = clf.predict(dev_texts_vec)
test_predict = clf.predict(test_texts_vec)

print("TRAIN"+"-"*150)
print(classification_report(train_labels, train_predict, digits=4))
print("DEV"+"-"*150)
print(classification_report(dev_labels, dev_predict, digits=4))
print("TEST"+"-"*150)
print(classification_report(test_labels, test_predict, digits=4))

TRAIN------------------------------------------------------------------------------------------------------------------------------------------------------
                                          precision    recall  f1-score   support

1. threats, plans to harm and incitement     0.9631    0.9258    0.9441       310
                           2. derogation     0.9444    0.9409    0.9427      1590
                            3. animosity     0.9288    0.9296    0.9292      1165
               4. prejudiced discussions     0.8800    0.9249    0.9019       333

                                accuracy                         0.9341      3398
                               macro avg     0.9291    0.9303    0.9295      3398
                            weighted avg     0.9345    0.9341    0.9342      3398

DEV------------------------------------------------------------------------------------------------------------------------------------------------------
                               

In [19]:
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression, RidgeClassifierCV
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import AdaBoostClassifier

clf = LogisticRegression()
clf.fit(train_texts_all_vec, train_labels_all)

train_all_predict = clf.predict(train_texts_all_vec)
test_predict = clf.predict(test_texts_vec)

print("TRAIN+DEV"+"-"*50)
print(classification_report(train_labels_all, train_all_predict, digits=4))

print("TEST"+"-"*50)
print(classification_report(test_labels, test_predict, digits=4))

TRAIN+DEV--------------------------------------------------
                                          precision    recall  f1-score   support

1. threats, plans to harm and incitement     0.9475    0.9181    0.9326       354
                           2. derogation     0.9173    0.9152    0.9163      1817
                            3. animosity     0.8926    0.8926    0.8926      1332
               4. prejudiced discussions     0.8662    0.9003    0.8829       381

                                accuracy                         0.9063      3884
                               macro avg     0.9059    0.9066    0.9061      3884
                            weighted avg     0.9066    0.9063    0.9064      3884

TEST--------------------------------------------------
                                          precision    recall  f1-score   support

1. threats, plans to harm and incitement     0.7125    0.6404    0.6746        89
                           2. derogation     0.6612    0.6189

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
