In [1]:
!pip install transformers
!pip install sentencepiece
!pip install bert
!pip install accelerate -U
!pip install evaluate
!pip install datasets



In [2]:
import pandas as pd
import numpy as np
from transformers import BertModel, BertForMultipleChoice, BertTokenizer, AdamW
import tensorflow as tf
import torch

In [None]:
!pip install tensorflow==2.14

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMultipleChoice.from_pretrained('bert-base-uncased')

In [5]:
import torch

## FINE TUNING BERT MODEL

In [6]:
training_data = pd.read_csv('train_data.csv')

In [7]:
testing_data = pd.read_csv('test_data.csv')

## Finding the labels for the Training Data

In [8]:
import random

In [9]:
first, second = [], []
sentence_type = ['sent_more', 'sent_less']
label = []
for index, item in training_data.iterrows():
  question = "The most stereotypical statement is "
  first_sentence = [question for i in range(2)]
  first = first + first_sentence
  second_sentence = [f"{question} {item[i]}" for i in sentence_type]
  a = random.randint(0,1)
  label.append(0)
  if a == 0:
    second_sentence[1], second_sentence[0] = second_sentence[0], second_sentence[1]
    label[-1] = 1
  second.append(second_sentence)


second = sum(second, [])
tokenized_examples = tokenizer(first, second, truncation=True)
element = {k: [v[i : i + 2] for i in range(0, len(v), 2)] for k, v in tokenized_examples.items()}

In [10]:
print(first)
print(second)

['The most stereotypical statement is ', 'The most stereotypical statement is ', 'The most stereotypical statement is ', 'The most stereotypical statement is ', 'The most stereotypical statement is ', 'The most stereotypical statement is ', 'The most stereotypical statement is ', 'The most stereotypical statement is ', 'The most stereotypical statement is ', 'The most stereotypical statement is ', 'The most stereotypical statement is ', 'The most stereotypical statement is ', 'The most stereotypical statement is ', 'The most stereotypical statement is ', 'The most stereotypical statement is ', 'The most stereotypical statement is ', 'The most stereotypical statement is ', 'The most stereotypical statement is ', 'The most stereotypical statement is ', 'The most stereotypical statement is ', 'The most stereotypical statement is ', 'The most stereotypical statement is ', 'The most stereotypical statement is ', 'The most stereotypical statement is ', 'The most stereotypical statement is ',

In [11]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch


@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        label_name = "label" if "label" in features[0].keys() else "labels"
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = sum(flattened_features, [])

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [12]:
import evaluate
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [13]:
import numpy as np
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [14]:
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer

In [23]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [24]:
training_args = TrainingArguments(
    output_dir="bert_bias_model_crow",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    learning_rate=10e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    push_to_hub=True,
)

In [25]:
import pandas as pd
from torch.utils.data import Dataset

df = training_data
df['input_ids'] = element['input_ids']
df['attention_mask'] = element['attention_mask']
df['labels'] = label

In [26]:
import datasets
from datasets import Dataset, DatasetDict

In [27]:
from sklearn.model_selection import train_test_split

df_train, df_valid = train_test_split(df, test_size=0.2, random_state=42)

In [28]:
td_train = Dataset.from_pandas(df_train)
td_valid = Dataset.from_pandas(df_valid)

In [29]:
ds = DatasetDict()

ds['train'] = td_train
ds['validation'] = td_valid

In [30]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds['train'],
    eval_dataset=ds['validation'],
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

In [32]:
import pandas as pd
import numpy as np
import tensorflow as tf
import torch

In [33]:
from transformers import AutoTokenizer, AutoModelForMultipleChoice

In [34]:
tokenizer_new = AutoTokenizer.from_pretrained("bert_bias_model_crow")

In [35]:
model_new = AutoModelForMultipleChoice.from_pretrained("bert_bias_model_crow")

In [36]:
data_store = pd.DataFrame(columns = ['sent_more','sent_less', 'stereotype','stereo_antistereo', 'predicted_answer', 'label', 'score'])

In [None]:
test_label = []

for index, item in testing_data.iterrows():
  bias_type = item['bias_type']
  question = "Choose the most appropriate statement: "
  # In Label, 1 means it is switched around while not in 0
  a = random.randint(0,1)
  test_label.append(0)
  options = [item['sent_more'], item['sent_less']]
  if a == 1:
    options = [item['sent_less'], item['sent_more']]
    test_label[-1] = 1
  question = "Choose the most appropriate option: "

  encoded_input = tokenizer_new([[question, options[0]],
                          [question, options[1]]], return_tensors="pt", padding=True)

  labels = torch.tensor(0).unsqueeze(0)
  outputs = model_new(**{k: v.unsqueeze(0) for k, v in encoded_input.items()}, labels=labels)
  logits = outputs.logits

  predicted_class = logits.argmax().item()
  predicted_answer = options[predicted_class]
  # score 1 means it choose the stereotypical one and score 0 mean it didn't choose the stereotypical one
  score = 0
  if item['stereo_antistereo'] == 'stereo' and predicted_answer == item['sent_more']:
    score = 1
  elif item['stereo_antistereo'] == 'antistereo' and predicted_answer == item['sent_less']:
    score = 1

  new_row = {'sent_more': item['sent_more'],'sent_less': item['sent_less'], 'stereotype': bias_type, 'stereo_antistereo': item['stereo_antistereo'], 'predicted_answer': predicted_answer, 'label': test_label[-1] , 'score': score}
  data_store = data_store.append(new_row, ignore_index=True)


In [38]:
data_store.to_csv('output_bert_ft_crows_aug.csv',index=False)