<a href="https://colab.research.google.com/github/bnanik/Shared_Task_SemEval2023/blob/main/Prompting_T5/prompt_based_classifier_EDOS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
  !pip install transformers datasets openprompt

In [None]:
import numpy as np
import pandas as pd
import torch
import scipy as sp
import transformers
import datasets
from transformers import AutoModelForSequenceClassification

In [None]:
# Locations of the CSV inputs read by the loading cell below.
# Labelled splits used for fine-tuning and for held-out validation:
EDOS_train_data_file_path, EDOS_validation_data_file_path = (
    '/content/train_EDOS_80.csv',
    '/content/validation_EDOS_20.csv',
)
# Task-A dev and test entry files for which predictions are exported at the end of the notebook:
dev_data_file_path, test_data_file_path = (
    '/content/dev_task_a_entries.csv',
    '/content/test_task_a_entries.csv',
)

In [None]:
# Labelled splits: read as-is from their CSV files.
df_train, df_validation = (
    pd.read_csv(csv_path)
    for csv_path in (EDOS_train_data_file_path, EDOS_validation_data_file_path)
)

# Dev / test entries: the text is kept in its original casing (no lower-casing is applied).
# A constant placeholder `labels` column is set on both frames; its values are not used as
# ground truth, the column only has to exist for the conversion cells further down.
df_dev = pd.read_csv(dev_data_file_path).assign(labels=0)
df_test_codalab = pd.read_csv(test_data_file_path).assign(labels=0)

### Encoding labels

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the string-label -> integer-id mapping from the training split ONLY, then reuse that
# exact mapping for validation. Re-fitting on validation (as `fit_transform` would) can produce
# a different mapping whenever the two splits do not contain the same set of labels, and it
# would also change what `le.inverse_transform` returns when predictions are exported later.
le = LabelEncoder()
df_train['labels'] = le.fit_transform(df_train['labels'])
# `transform` raises ValueError if validation contains a label never seen in training,
# which surfaces data problems instead of silently re-numbering the classes.
df_validation['labels'] = le.transform(df_validation['labels'])

In [None]:
from datasets import Dataset, DatasetDict

# Convert every pandas split into a Hugging Face Dataset and group them under their split names.
frames_by_split = {
    "train": df_train,
    "validation": df_validation,
    "dev": df_dev,
    "test": df_test_codalab,
}
raw_dataset = DatasetDict(
    {split_name: Dataset.from_pandas(frame) for split_name, frame in frames_by_split.items()}
)

# Display the resulting splits with their columns and row counts.
raw_dataset


DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'text', 'labels'],
        num_rows: 11200
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'text', 'labels'],
        num_rows: 2800
    })
    dev: Dataset({
        features: ['rewire_id', 'text', 'labels'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['rewire_id', 'text', 'labels'],
        num_rows: 4000
    })
})

In [None]:
from openprompt.data_utils import InputExample

# Re-pack each split as a list of OpenPrompt InputExample objects: the row's text goes into
# `text_a` and the (integer-cast) class id into `label`. No guid is attached.
dataset = {
    split_name: [
        InputExample(text_a=row['text'], label=int(row['labels']))
        for row in raw_dataset[split_name]
    ]
    for split_name in ['train', 'validation', 'dev', 'test']
}
# Show the first training example as a sanity check.
print(dataset['train'][0])




{
  "guid": null,
  "label": 0,
  "meta": {},
  "text_a": "By pressing show less you have automatically be been registered for incel 'facts' directly to your inbox daily! We apologise for this feature.",
  "text_b": "",
  "tgt_text": null
}



### Load the pre-trained language model (PLM) and its related components provided by OpenPrompt

In [None]:
from openprompt.plms import load_plm

# Model family and checkpoint handed to OpenPrompt's loader.
plm_family, plm_checkpoint = "t5", "t5-base"
# The loader returns the model, its tokenizer, its config, and the tokenizer-wrapper class
# that is used below to build model inputs.
plm, tokenizer, model_config, WrapperClass = load_plm(plm_family, plm_checkpoint)

### Constructing Template

In [None]:
from openprompt.prompts import ManualTemplate

# Cloze-style prompt: the example text fills the `text_a` placeholder and the model is asked
# to fill the {"mask"} slot that follows "This text is".
template_text = '{"placeholder":"text_a"}. This text is {"mask"}.'
mytemplate = ManualTemplate(
    text=template_text,
    tokenizer=tokenizer,
)

To better understand how the template wraps an example, we visualize one instance.


In [None]:
# Apply the template to the first training example and print the wrapped result.
first_train_example = dataset['train'][0]
wrapped_example = mytemplate.wrap_one_example(first_train_example)
print(wrapped_example)

[[{'text': "By pressing show less you have automatically be been registered for incel 'facts' directly to your inbox daily! We apologise for this feature.", 'loss_ids': 0, 'shortenable_ids': 1}, {'text': '. This text is', 'loss_ids': 0, 'shortenable_ids': 0}, {'text': '<mask>', 'loss_ids': 1, 'shortenable_ids': 0}, {'text': '.', 'loss_ids': 0, 'shortenable_ids': 0}], {'label': 0}]


In [None]:
# The wrapped example is now ready to be passed to a tokenizer, which produces the input for
# the language model.
# You could tokenize it yourself, but OpenPrompt recommends its wrapped tokenizer, which is
# tailored to InputExample objects.
# `load_plm` already returned the matching wrapper class; if you load the model another way,
# choose the wrapper according to the configuration in `openprompt.plms.__init__.py`.
# When T5 is used for classification, only <pad> <extra_id_0> <eos> need to be passed to the
# decoder and the loss is calculated at <extra_id_0>, so decoder_max_length=3 saves space.

wrapped_t5tokenizer = WrapperClass(
    tokenizer=tokenizer,
    max_seq_length=512,
    decoder_max_length=3,
    truncate_method="head",
)

In [None]:
# You can see what a tokenized example looks like by
tokenized_example = wrapped_t5tokenizer.tokenize_one_example(wrapped_example, teacher_forcing=False)
print(tokenized_example)
print(tokenizer.convert_ids_to_tokens(tokenized_example['input_ids']))
print(tokenizer.convert_ids_to_tokens(tokenized_example['decoder_input_ids']))

{'input_ids': [938, 13840, 504, 705, 25, 43, 3269, 36, 118, 3366, 21, 16, 7125, 3, 31, 8717, 7, 31, 1461, 12, 39, 16, 2689, 1444, 55, 101, 3, 9, 102, 11697, 7, 15, 21, 48, 1451, 5, 3, 5, 100, 1499, 19, 32099, 3, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

Let's convert the whole dataset into the model input format.

In [None]:
# For every split, wrap each example with the template and tokenize it with the T5 wrapper,
# collecting the tokenized examples in a list keyed by split name.
model_inputs = {
    split_name: [
        wrapped_t5tokenizer.tokenize_one_example(
            mytemplate.wrap_one_example(example),
            teacher_forcing=False,
        )
        for example in dataset[split_name]
    ]
    for split_name in ['train', 'validation', 'dev', 'test']
}

### Define a DataLoader

In [None]:
from openprompt import PromptDataLoader

# Training loader: built from the training examples, the manual template and the T5 tokenizer
# wrapper; yields shuffled batches of four with the same length limits as the demo above.
train_dataloader = PromptDataLoader(
    dataset=dataset["train"],
    template=mytemplate,
    tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass,
    max_seq_length=512,
    decoder_max_length=3,
    batch_size=4,
    shuffle=True,
    teacher_forcing=False,
    predict_eos_token=False,
    truncate_method="head",
)

tokenizing: 11200it [00:11, 964.36it/s]


In [None]:
# Show the class names stored on the fitted LabelEncoder; a name's position in this array
# is the integer id it was encoded to.
le.classes_

array(['not sexist', 'sexist'], dtype=object)

## Define the verbalizer

In [None]:

# For classification we need a verbalizer: a mapping from the PLM's logits over the vocabulary
# to the final per-class scores. The cell builds one and inspects it.

from openprompt.prompts import ManualVerbalizer
import torch

# One label word per class, listed in class order (index 0 first, index 1 second).
# NOTE(review): the printed `label_words_ids` show that each label word spans several token
# ids — confirm how ManualVerbalizer combines multi-token label words for this use case.
class_label_words = [["not sexist"], ["sexist"]]
myverbalizer = ManualVerbalizer(
    tokenizer,
    num_classes=2,
    label_words=class_label_words,
)

print(myverbalizer.label_words_ids)
# Push random, vocabulary-sized logits for two fake examples through the verbalizer
# to see what it returns.
logits = torch.randn(2, len(tokenizer))
print(myverbalizer.process_logits(logits))


Parameter containing:
tensor([[[   59,     3,     7, 12135]],

        [[    3,     7, 12135,     0]]])
tensor([[-0.9641, -0.4802],
        [-2.8828, -0.0576]])


## Train

In [None]:
from openprompt import PromptForClassification
# torch's AdamW replaces the deprecated `transformers.AdamW` implementation.
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup  # imported by the original cell; no scheduler is created here

# Use the GPU when one is available instead of assuming it; the later cells read `use_cuda` too.
use_cuda = torch.cuda.is_available()
prompt_model = PromptForClassification(plm=plm, template=mytemplate, verbalizer=myverbalizer, freeze_plm=False)
if use_cuda:
    prompt_model = prompt_model.cuda()

# Standard supervised fine-tuning from here on.
loss_func = torch.nn.CrossEntropyLoss()
# Parameters whose names contain one of these substrings are excluded from weight decay.
# 'layer_norm.weight' is the spelling used by T5's layer-norm modules; without it the
# layer-norm weights of the T5 backbone would fall into the decayed group.
no_decay = ['bias', 'LayerNorm.weight', 'layer_norm.weight']
optimizer_grouped_parameters = [
    {
        'params': [p for n, p in prompt_model.named_parameters() if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01,
    },
    {
        'params': [p for n, p in prompt_model.named_parameters() if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0,
    },
]

# eps=1e-6 keeps the epsilon that `transformers.AdamW` used by default.
optimizer = AdamW(optimizer_grouped_parameters, lr=1e-4, eps=1e-6)

# Models returned by `from_pretrained` start in evaluation mode (dropout disabled);
# switch to training mode explicitly for fine-tuning.
prompt_model.train()
for epoch in range(4):
    tot_loss = 0
    for step, inputs in enumerate(train_dataloader):
        if use_cuda:
            inputs = inputs.cuda()
        logits = prompt_model(inputs)
        labels = inputs['label']
        loss = loss_func(logits, labels)
        loss.backward()
        tot_loss += loss.item()
        optimizer.step()
        optimizer.zero_grad()
        # Report the running mean loss of the current epoch every 100 steps.
        if step % 100 == 1:
            print("Epoch {}, average loss: {}".format(epoch, tot_loss/(step+1)), flush=True)

# Leave the model in evaluation mode so the inference cells below run without dropout.
prompt_model.eval()


In [None]:
# Validation loader: same template/tokenizer settings as training, but in file order (no shuffling).
validation_dataloader = PromptDataLoader(dataset=dataset["validation"], template=mytemplate, tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass, max_seq_length=512, decoder_max_length=3,
    batch_size=4,shuffle=False, teacher_forcing=False, predict_eos_token=False,
    truncate_method="head")

allpreds = []
alllabels = []
# Inference only: make sure dropout is off and skip autograd bookkeeping, which otherwise
# keeps a computation graph alive for every batch and wastes memory.
prompt_model.eval()
with torch.no_grad():
    for inputs in validation_dataloader:
        if use_cuda:
            inputs = inputs.cuda()
        logits = prompt_model(inputs)
        labels = inputs['label']
        alllabels.extend(labels.cpu().tolist())
        # The predicted class is the index of the highest class score.
        allpreds.extend(torch.argmax(logits, dim=-1).cpu().tolist())

# Plain accuracy: fraction of validation examples whose prediction equals the gold label.
acc = sum(int(i == j) for i, j in zip(allpreds, alllabels)) / len(allpreds)
print(acc)

tokenizing: 2800it [00:02, 946.94it/s]


0.8689285714285714


In [None]:
# Sanity check: number of validation predictions collected by the evaluation loop.
len(allpreds)

2800

In [None]:
from sklearn import metrics

# Validation metrics computed from the gold labels and predictions collected above.
print("Accuracy:", metrics.accuracy_score(alllabels, allpreds))

# F1, precision and recall are macro-averaged: each is computed per class and the per-class
# values are then averaged with equal weight.
macro_scorers = [
    ("F1:", metrics.f1_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
]
for tag, scorer in macro_scorers:
    print(tag, scorer(alllabels, allpreds, average="macro"))

Accuracy: 0.8689285714285714
F1: 0.8142658503473659
Precision: 0.8294948013731851
Recall: 0.8020671476137625


In [None]:
# Dev loader: same template/tokenizer settings as training, in file order so that predictions
# line up with the rows of df_dev.
dev_dataloader = PromptDataLoader(dataset=dataset["dev"], template=mytemplate, tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass, max_seq_length=512, decoder_max_length=3,
    batch_size=4,shuffle=False, teacher_forcing=False, predict_eos_token=False,
    truncate_method="head")

dev_allpreds = []
dev_alllabels = []
# Inference only: dropout off, no autograd bookkeeping.
prompt_model.eval()
with torch.no_grad():
    for inputs in dev_dataloader:
        if use_cuda:
            inputs = inputs.cuda()
        logits = prompt_model(inputs)
        labels = inputs['label']
        dev_alllabels.extend(labels.cpu().tolist())
        dev_allpreds.extend(torch.argmax(logits, dim=-1).cpu().tolist())

# The dev `labels` column is a constant placeholder (0), so agreement with it is NOT an
# accuracy: it is only the share of dev examples that were predicted as class 0. It is
# reported under an explicit name and no longer overwrites the validation `acc`.
dev_placeholder_agreement = sum(int(i == j) for i, j in zip(dev_allpreds, dev_alllabels)) / len(dev_allpreds)
print("Share of dev predictions equal to the placeholder label 0 (not an accuracy):", dev_placeholder_agreement)

tokenizing: 2000it [00:02, 801.53it/s]


0.7865


In [None]:
# Sanity check: (rows, columns) of the dev frame before the predictions are attached.
df_dev.shape

(2000, 3)

In [None]:
# Predictions were produced in file order, so there must be exactly one per dev row.
assert len(dev_allpreds) == len(df_dev), "number of dev predictions does not match df_dev"

# Build the submission as a NEW frame instead of dropping columns from df_dev in place:
# the in-place drop made this cell fail with a KeyError when re-run, because the
# 'labels' and 'text' columns were already gone.
dev_submission = (
    df_dev
    .drop(columns=['labels', 'text'])
    # Map the predicted integer ids back to their original string labels.
    .assign(label_pred=le.inverse_transform(dev_allpreds))
)
dev_submission.to_csv('/content/dev_task_a_t5_base.csv', index=False)

In [None]:
# Test loader: same template/tokenizer settings as training, in file order so that predictions
# line up with the rows of df_test_codalab.
test_dataloader = PromptDataLoader(dataset=dataset["test"], template=mytemplate, tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass, max_seq_length=512, decoder_max_length=3,
    batch_size=4,shuffle=False, teacher_forcing=False, predict_eos_token=False,
    truncate_method="head")

test_allpreds = []
test_alllabels = []
# Inference only: dropout off, no autograd bookkeeping.
prompt_model.eval()
with torch.no_grad():
    for inputs in test_dataloader:
        if use_cuda:
            inputs = inputs.cuda()
        logits = prompt_model(inputs)
        labels = inputs['label']
        test_alllabels.extend(labels.cpu().tolist())
        test_allpreds.extend(torch.argmax(logits, dim=-1).cpu().tolist())

# The test `labels` column is a constant placeholder (0), so agreement with it is NOT an
# accuracy: it is only the share of test examples that were predicted as class 0. It is
# reported under an explicit name and no longer overwrites the validation `acc`.
test_placeholder_agreement = sum(int(i == j) for i, j in zip(test_allpreds, test_alllabels)) / len(test_allpreds)
print("Share of test predictions equal to the placeholder label 0 (not an accuracy):", test_placeholder_agreement)

tokenizing: 4000it [00:04, 909.65it/s]


0.77325


In [None]:
# Predictions were produced in file order, so there must be exactly one per test row.
assert len(test_allpreds) == len(df_test_codalab), "number of test predictions does not match df_test_codalab"

# Build the submission as a NEW frame instead of dropping columns from df_test_codalab in
# place: the in-place drop made this cell fail with a KeyError when re-run, because the
# 'labels' and 'text' columns were already gone.
test_submission = (
    df_test_codalab
    .drop(columns=['labels', 'text'])
    # Map the predicted integer ids back to their original string labels.
    .assign(label_pred=le.inverse_transform(test_allpreds))
)
# The file name now says t5_base: this notebook loads the "t5-base" checkpoint and the dev
# export is named accordingly, so the previous "t5_large" suffix mislabelled the results.
test_submission.to_csv('/content/test_task_a_labeled_t5_base.csv', index=False)