In [1]:
import torch
import pandas as pd
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, pipeline
from peft import LoraConfig
from trl import SFTTrainer
from tqdm import tqdm

df = pd.read_csv("edos_labelled_aggregated.csv")
df = df[df['label_category'] != 'none']

train_df, dev_df, test_df = df[df['split'] == 'train'], df[df['split'] == 'dev'], df[df['split'] == 'test']

print(f"train: {train_df.shape[0]}, dev:{dev_df.shape[0]}, test:{test_df.shape[0]}")

# Define the prompt template
prompt_template = """Category of Sexism: for posts which are sexist, a four-class classification where systems have to predict one of four categories: (1) threats, (2)  derogation, (3) animosity, (4) prejudiced discussion. 

Given a post determine the post is belong to which class:
1. threats, plans to harm and incitement
2. derogation 
3. animosity
4. prejudiced discussions

### Post: 
{POST}
### Class: """

column='label_category'

train: 3398, dev:486, test:970


In [2]:
llm_path = "task_b_llm"

tokenizer = AutoTokenizer.from_pretrained(llm_path, padding_side="left")

tokenizer.pad_token = tokenizer.eos_token

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load model with 4-bit precision
finetuned_model = AutoModelForCausalLM.from_pretrained(llm_path, quantization_config=quant_config, device_map={"": 0})

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
# len(tokenizer.encode("1. threats, plans to harm and incitement"))

In [6]:
from torch.utils.data import DataLoader

class EDOSDataset(Dataset):
    def __init__(self, df, prompt_template, column):
        self.texts = df['text'].tolist()
        self.labels = df[column].tolist()
        self.prompt_template=prompt_template

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idxs):
        inputs, inputs_labels = [], []
        for idx in idxs:
            
            inputs.append(self.prompt_template.replace("{POST}", self.texts[idx]))
            inputs_labels.append(self.labels[idx])
        
        return {"inputs":inputs, "labels": inputs_labels}
    
def make_the_generations(model, tokenizer, data_loader):
    gen_texts, labels = [], []
    
    for batch in tqdm(data_loader):
        input_data = batch['inputs']
        labels += batch['labels']
        tokenized_input_data = tokenizer(input_data, padding=True, max_length=512, truncation=True, return_tensors="pt").to("cuda:0")
        # print(tokenized_input_data)
        outputs = finetuned_model.generate(
            **tokenized_input_data,
            pad_token_id= tokenizer.eos_token_id,
            max_new_tokens=15,
            do_sample=True
        )
        generated_texts = [tokenizer.decode(outputs[idx], skip_special_tokens=True)[len(input_data[idx]):]
                          for idx in range(len(outputs))]
        gen_texts += generated_texts
    return gen_texts, labels

In [7]:
batch_size = 64

train_data = EDOSDataset(df=train_df, prompt_template=prompt_template, column=column)
train_dataloader =  DataLoader(train_data, batch_size=batch_size, shuffle=False)
train_texts, train_labels = make_the_generations(finetuned_model, tokenizer, train_dataloader)

100%|██████████| 54/54 [01:22<00:00,  1.52s/it]


In [8]:
dev_data = EDOSDataset(df=dev_df, prompt_template=prompt_template, column=column)
dev_dataloader =  DataLoader(dev_data, batch_size=batch_size, shuffle=False)
dev_texts, dev_labels = make_the_generations(finetuned_model, tokenizer, dev_dataloader)

100%|██████████| 8/8 [00:11<00:00,  1.49s/it]


In [9]:
test_data = EDOSDataset(df=test_df, prompt_template=prompt_template, column=column)
test_dataloader =  DataLoader(test_data, batch_size=batch_size, shuffle=False)
test_texts, test_labels = make_the_generations(finetuned_model, tokenizer, test_dataloader)

100%|██████████| 16/16 [00:23<00:00,  1.49s/it]


In [14]:
train_predict = class_mapper.predict(train_texts)
dev_predict = class_mapper.predict(dev_texts)
test_predict = class_mapper.predict(test_texts)

print("TRAIN"+"-"*150)
print(classification_report(train_labels, train_predict, digits=4))
print("DEV"+"-"*150)
print(classification_report(dev_labels, dev_predict, digits=4))
print("TEST"+"-"*150)
print(classification_report(test_labels, test_predict, digits=4))

TRAIN------------------------------------------------------------------------------------------------------------------------------------------------------
                                          precision    recall  f1-score   support

1. threats, plans to harm and incitement     0.9868    0.9677    0.9772       310
                           2. derogation     0.8660    0.9711    0.9155      1590
                            3. animosity     0.9551    0.7854    0.8620      1165
               4. prejudiced discussions     0.9093    0.9640    0.9359       333

                                accuracy                         0.9064      3398
                               macro avg     0.9293    0.9220    0.9226      3398
                            weighted avg     0.9118    0.9064    0.9048      3398

DEV------------------------------------------------------------------------------------------------------------------------------------------------------
                               

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression, RidgeClassifierCV, LogisticRegressionCV, ElasticNetCV
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier


vectorizer_1 = CountVectorizer(ngram_range=(1,3), lowercase=True)
vectorizer_2 = TfidfVectorizer( ngram_range=(1,5), 
                                     lowercase=True, 
                                     sublinear_tf=False, 
                                     use_idf=True)
features = FeatureUnion([
    ("count-vec", vectorizer_1),
    ("tfidf", vectorizer_2),
])

class_mapper = Pipeline (
    steps=[
        ("Vectorizer", features),
        # ("TruncatedSVD", TruncatedSVD(n_components=600)),
        ('Classifier', LogisticRegression())
])

# class_mapper.fit(train_texts+train_labels+dev_texts, train_labels+train_labels+dev_labels)
# class_mapper.fit(train_labels, train_labels)
class_mapper.fit(train_texts+train_df['text'].tolist(), 
                 train_labels+train_df[column].tolist())
# +train_texts

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
train_predict = class_mapper.predict(train_texts)
dev_predict = class_mapper.predict(dev_texts)
test_predict = class_mapper.predict(test_texts)

print("TRAIN"+"-"*150)
print(classification_report(train_labels, train_predict, digits=4))
print("DEV"+"-"*150)
print(classification_report(dev_labels, dev_predict, digits=4))
print("TEST"+"-"*150)
print(classification_report(test_labels, test_predict, digits=4))

TRAIN------------------------------------------------------------------------------------------------------------------------------------------------------
                                          precision    recall  f1-score   support

1. threats, plans to harm and incitement     0.9870    0.9774    0.9822       310
                           2. derogation     0.8807    0.9755    0.9257      1590
                            3. animosity     0.9663    0.8129    0.8830      1165
               4. prejudiced discussions     0.9286    0.9760    0.9517       333

                                accuracy                         0.9200      3398
                               macro avg     0.9407    0.9354    0.9356      3398
                            weighted avg     0.9245    0.9200    0.9188      3398

DEV------------------------------------------------------------------------------------------------------------------------------------------------------
                               