In [11]:
import torch
import pandas as pd
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, pipeline
from peft import LoraConfig
from trl import SFTTrainer
from tqdm import tqdm

df = pd.read_csv("edos_labelled_aggregated.csv")

train_df, dev_df, test_df = df[df['split'] == 'train'], df[df['split'] == 'dev'], df[df['split'] == 'test']

print(f"train: {train_df.shape[0]}, dev:{dev_df.shape[0]}, test:{test_df.shape[0]}")

# prompt_template = """Binary Sexism Detection: A two-class (or binary) classification where systems have to predict whether a post is sexist or not sexist.

# Given a post determine whether a post is sexist or not sexist.

# ### Post: 
# {POST}
# ### Answer: """

prompt_template = """Binary Sexism Detection: A two-class (or binary) classification where systems have to predict whether a post is sexist or not sexist.

Classify whether the following post is sexist or not sexist.

### Post: 
{POST}
### Class: """

column='label_sexist'

train: 14000, dev:2000, test:4000


In [2]:
llm_path = "task_a_llm"

tokenizer = AutoTokenizer.from_pretrained(llm_path, padding_side="left")

tokenizer.pad_token = tokenizer.eos_token

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load model with 4-bit precision
finetuned_model = AutoModelForCausalLM.from_pretrained(llm_path, quantization_config=quant_config, device_map={"": 0})

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
# len(tokenizer.encode("1. threats, plans to harm and incitement"))

In [33]:
from torch.utils.data import DataLoader

class EDOSDataset(Dataset):
    def __init__(self, df, prompt_template, column):
        self.texts = df['text'].tolist()
        self.labels = df[column].tolist()
        self.prompt_template=prompt_template

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idxs):
        inputs, inputs_labels = [], []
        for idx in idxs:
            
            inputs.append(self.prompt_template.replace("{POST}", self.texts[idx]))
            inputs_labels.append(self.labels[idx])
        
        return {"inputs":inputs, "labels": inputs_labels}
    
def make_the_generations(model, tokenizer, data_loader):
    gen_texts, labels = [], []
    
    for batch in tqdm(data_loader):
        input_data = batch['inputs']
        labels += batch['labels']
        tokenized_input_data = tokenizer(input_data, padding=True, max_length=512, truncation=True, return_tensors="pt").to("cuda:0")
        # print(tokenized_input_data)
        outputs = finetuned_model.generate(
            **tokenized_input_data,
            pad_token_id= tokenizer.eos_token_id,
            max_new_tokens=15,
            do_sample=False
        )
        generated_texts = [tokenizer.decode(outputs[idx], skip_special_tokens=True)[len(input_data[idx])-1:].replace("\n###","").replace("\n","")
                          for idx in range(len(outputs))]
        gen_texts += generated_texts
    return gen_texts, labels

In [39]:
batch_size = 64

train_data = EDOSDataset(df=train_df, prompt_template=prompt_template, column=column)
train_dataloader =  DataLoader(train_data, batch_size=batch_size, shuffle=False)
train_texts, train_labels = make_the_generations(finetuned_model, tokenizer, train_dataloader)

100%|██████████| 219/219 [04:41<00:00,  1.28s/it]


In [40]:
train_texts[:5], train_labels[:5]

(['  not sexist Class: not sexist Class: not sexist',
  '  not sexist Class: not sexist Class: not sexist',
  ' 1 Class: 1 Class: 1',
  ' 1 (not sexist) Class: 1 (not sexist) -',
  '  sexist Likes: 1 Dislikes: 0'],
 ['not sexist', 'not sexist', 'not sexist', 'not sexist', 'sexist'])

In [41]:
dev_data = EDOSDataset(df=dev_df, prompt_template=prompt_template, column=column)
dev_dataloader =  DataLoader(dev_data, batch_size=batch_size, shuffle=False)
dev_texts, dev_labels = make_the_generations(finetuned_model, tokenizer, dev_dataloader)

100%|██████████| 32/32 [00:40<00:00,  1.27s/it]


In [42]:
test_data = EDOSDataset(df=test_df, prompt_template=prompt_template, column=column)
test_dataloader =  DataLoader(test_data, batch_size=batch_size, shuffle=False)
test_texts, test_labels = make_the_generations(finetuned_model, tokenizer, test_dataloader)

100%|██████████| 63/63 [01:22<00:00,  1.30s/it]


In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression, RidgeClassifierCV
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

vectorizer = TfidfVectorizer(ngram_range=(1,3), lowercase=True)

class_mapper = Pipeline (
    steps=[
        ("Vectorizer", vectorizer), 
        ('Classifier', LogisticRegression())
])

class_mapper.fit(train_texts+train_labels, train_labels+train_labels)
# +train_texts

In [44]:
train_predict = class_mapper.predict(train_texts)
dev_predict = class_mapper.predict(dev_texts)
test_predict = class_mapper.predict(test_texts)

print("TRAIN"+"-"*150)
print(classification_report(train_labels, train_predict, digits=4))
print("DEV"+"-"*150)
print(classification_report(dev_labels, dev_predict, digits=4))
print("TEST"+"-"*150)
print(classification_report(test_labels, test_predict, digits=4))

TRAIN------------------------------------------------------------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

  not sexist     0.9466    0.9576    0.9520     10602
      sexist     0.8626    0.8314    0.8467      3398

    accuracy                         0.9269     14000
   macro avg     0.9046    0.8945    0.8994     14000
weighted avg     0.9262    0.9269    0.9265     14000

DEV------------------------------------------------------------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

  not sexist     0.8907    0.9207    0.9055      1514
      sexist     0.7241    0.6481    0.6840       486

    accuracy                         0.8545      2000
   macro avg     0.8074    0.7844    0.7948      2000
weighted avg     0.8503    0.8545    0.8517      2000

TEST--------------------------------

In [57]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression, RidgeClassifierCV, LogisticRegressionCV, ElasticNetCV
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.calibration import CalibratedClassifierCV

vectorizer_1 = CountVectorizer(ngram_range=(1,3), lowercase=True)
vectorizer_2 = TfidfVectorizer( ngram_range=(1,5), 
                                     lowercase=True, 
                                     sublinear_tf=True, 
                                     use_idf=True)
features = FeatureUnion([
    ("count-vec", vectorizer_1),
    ("tfidf", vectorizer_2),
])

class_mapper = Pipeline (
    steps=[
        ("Vectorizer", features),
        # ("TruncatedSVD", TruncatedSVD(n_components=600)),
        ('Classifier', LogisticRegression())
])

# class_mapper.fit(train_texts+train_labels+dev_texts, train_labels+train_labels+dev_labels)
# class_mapper.fit(train_labels, train_labels)
# class_mapper.fit(train_texts+train_labels, train_labels+train_labels)
X = [f"POST: {post} \n Generated-Class: {clas}" for post, clas  in zip(train_df['text'].tolist(), train_texts)]
class_mapper.fit(X, train_labels)
# class_mapper.fit(train_texts+train_labels+train_df['text'].tolist(), 
#                  train_labels+train_labels+train_df[column].tolist())
# +train_texts

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [58]:
train_predict = class_mapper.predict([f"POST: {post} \n Generated-Class: {clas}" 
                                      for post, clas  in zip(train_df['text'].tolist(), train_texts)])
dev_predict = class_mapper.predict([f"POST: {post} \n Generated-Class: {clas}" 
                                    for post, clas  in zip(dev_df['text'].tolist(), dev_texts)])
test_predict =  class_mapper.predict([f"POST: {post} \n Generated-Class: {clas}" 
                                    for post, clas  in zip(test_df['text'].tolist(), test_texts)])

print("TRAIN"+"-"*150)
print(classification_report(train_labels, train_predict, digits=4))
print("DEV"+"-"*150)
print(classification_report(dev_labels, dev_predict, digits=4))
print("TEST"+"-"*150)
print(classification_report(test_labels, test_predict, digits=4))

TRAIN------------------------------------------------------------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

  not sexist     1.0000    1.0000    1.0000     10602
      sexist     1.0000    1.0000    1.0000      3398

    accuracy                         1.0000     14000
   macro avg     1.0000    1.0000    1.0000     14000
weighted avg     1.0000    1.0000    1.0000     14000

DEV------------------------------------------------------------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

  not sexist     0.9000    0.9214    0.9106      1514
      sexist     0.7356    0.6811    0.7073       486

    accuracy                         0.8630      2000
   macro avg     0.8178    0.8012    0.8089      2000
weighted avg     0.8600    0.8630    0.8612      2000

TEST--------------------------------

In [60]:
# train_predict = class_mapper.predict(train_texts)
# dev_predict = class_mapper.predict(dev_texts)
# test_predict = class_mapper.predict(test_texts)

# print("TRAIN"+"-"*150)
# print(classification_report(train_labels, train_predict, digits=4))
# print("DEV"+"-"*150)
# print(classification_report(dev_labels, dev_predict, digits=4))
# print("TEST"+"-"*150)
# print(classification_report(test_labels, test_predict, digits=4))

In [None]:
from sentence_transformers import SentenceTransformer

sbert = SentenceTransformer("sentence-transformers/nli-mpnet-base-v2")

train_texts_vec = sbert.encode(train_texts+train_df['text'].tolist(), show_progress_bar=True)
dev_texts_vec = sbert.encode(dev_texts, show_progress_bar=True)
test_texts_vec = sbert.encode(test_texts, show_progress_bar=True)

In [51]:
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression, RidgeClassifierCV
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

# clf = MLPClassifier(hidden_layer_sizes=(100,100),max_iter=1000)
clf=LogisticRegression()
clf.fit(train_texts_vec, train_labels)

train_predict = clf.predict(train_texts_vec)
dev_predict = clf.predict(dev_texts_vec)
test_predict = clf.predict(test_texts_vec)

print("TRAIN"+"-"*150)
print(classification_report(train_labels+train_labels, train_predict, digits=4))
print("DEV"+"-"*150)
print(classification_report(dev_labels, dev_predict, digits=4))
print("TEST"+"-"*150)
print(classification_report(test_labels, test_predict, digits=4))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


TRAIN------------------------------------------------------------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

  not sexist     0.9041    0.9460    0.9246     21204
      sexist     0.8031    0.6870    0.7405      6796

    accuracy                         0.8831     28000
   macro avg     0.8536    0.8165    0.8326     28000
weighted avg     0.8796    0.8831    0.8799     28000

DEV------------------------------------------------------------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

  not sexist     0.8983    0.9161    0.9071      1514
      sexist     0.7215    0.6770    0.6985       486

    accuracy                         0.8580      2000
   macro avg     0.8099    0.7965    0.8028      2000
weighted avg     0.8553    0.8580    0.8564      2000

TEST--------------------------------