<a href="https://colab.research.google.com/github/aliang9/nlpfa23/blob/main/bert_eng.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q transformers datasets evaluate
!pip install -q accelerate
!export CUDA_LAUNCH_BLOCKING=1

In [None]:
import pandas as pd
import numpy as np

# Joke corpus; later cells read its 'title' (setup) and 'selftext' (punchline) columns.
JOKES_CSV_PATH = 'updated-reddit-jokes.csv'
reddit_jokes = pd.read_csv(JOKES_CSV_PATH)

In [None]:
# Sample data (using this for small tests)
# Each record is (setup, punchline, [three wrong punchlines]); the three parallel
# lists below are projected out of these records.
_SAMPLE_JOKES = [
    (
        "Why did the chicken cross the road?",
        "To get to the other side.",
        ["To catch the worm.", "To visit the other side.", "To find its friend."],
    ),
    (
        "What do you call a fish with no eyes?",
        "A fsh.",
        ["A catfish.", "A shark.", "A dolphin."],
    ),
    (
        "Why was the math book sad?",
        "Because it had too many problems.",
        ["Because it failed its test.", "Because it lost its cover.", "Because it got bullied."],
    ),
    (
        "And the Lord said unto John, \"Come forth and you will receive eternal life.\"",
        "But John came fifth, and won a toaster.",
        # NOTE(review): these distractors are identical to the math-book joke's above;
        # they look like a copy-paste placeholder.
        ["Because it failed its test.", "Because it lost its cover.", "Because it got bullied."],
    ),
]

setups = [setup for setup, _, _ in _SAMPLE_JOKES]
punchlines = [punchline for _, punchline, _ in _SAMPLE_JOKES]
incorrect_punchlines = [wrong_answers for _, _, wrong_answers in _SAMPLE_JOKES]

In [None]:
import random
from sklearn.model_selection import train_test_split

# 60/20/20 split: hold out 40% of the jokes first, then cut that hold-out in half
# to obtain the validation and test portions.
setups_train, setups_holdout, punchlines_train, punchlines_holdout = train_test_split(
    reddit_jokes['title'], reddit_jokes['selftext'], test_size=0.4, random_state=1
)
setups_val, setups_test, punchlines_val, punchlines_test = train_test_split(
    setups_holdout, punchlines_holdout, test_size=0.5, random_state=1
)

In [None]:
from datasets import Dataset
from random import randint

def generate_dataset(setups, punchlines):
  """Assemble a `Dataset` of jokes with a random answer slot for each one.

  The two inputs are placed side by side (aligned on their index), the columns
  'title' / 'selftext' are renamed to 'setup' / 'punchline', and a 'label'
  column holding a random integer in 0..3 is appended. The label only reserves
  the position of the correct punchline; the choices themselves are built later.
  """
  # Draw the answer positions first: one per punchline.
  answer_slots = pd.DataFrame({'label': [randint(0, 3) for _ in range(len(punchlines))]})

  # Side-by-side frame with a fresh 0..n-1 index so the label column lines up.
  jokes = pd.concat([setups, punchlines], axis=1).reset_index(drop=True)
  jokes = pd.concat([jokes, answer_slots], axis=1).rename(
      columns={'title': 'setup', 'selftext': 'punchline'}
  )

  # Column-oriented dict -> Dataset
  return Dataset.from_dict(jokes.to_dict(orient='list'))

# One Dataset per split. The train -> val -> test order is kept because every call
# draws its labels from the same global random generator.
train_dataset, val_dataset, test_dataset = [
    generate_dataset(split_setups, split_punchlines)
    for split_setups, split_punchlines in (
        (setups_train, punchlines_train),
        (setups_val, punchlines_val),
        (setups_test, punchlines_test),
    )
]

In [None]:
# Peek at the first few setups only; printing the whole column would dump every
# training example into the notebook output.
print(train_dataset[:5]['setup'])



In [None]:
# from transformers import BartTokenizerFast
# tokenizer = BartTokenizerFast.from_pretrained('facebook/bart-large-mnli')

from transformers import AutoTokenizer
# Tokenizer for the "bert-base-uncased" checkpoint; the same checkpoint name is used
# when the multiple-choice model is loaded further down.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [None]:
from random import sample

def preprocess_function(examples):
    """Turn a batch of jokes into 4-way multiple-choice tokenizer inputs.

    For every example the true punchline is placed at position ``label`` among
    three distractor punchlines drawn at random from the *other* distinct
    punchlines of the same batch, so no distractor can equal the correct answer
    and no choice is repeated.

    Args:
        examples: batch mapping with parallel lists under 'setup', 'punchline'
            and 'label' (the index the correct punchline must occupy).

    Returns:
        dict mapping each tokenizer output field to a list with one entry per
        example; each entry holds that field's values for the 4 choices.

    Raises:
        ValueError: propagated from ``random.sample`` when the batch contains
            fewer than 3 distinct punchlines other than the correct one.
    """
    num_choices = 4
    setups = examples['setup']
    punchlines = examples['punchline']
    labels = examples['label']

    # Order-preserving de-duplication, done once per batch, so the distractors
    # offered for one example are all different from each other.
    distinct_punchlines = list(dict.fromkeys(punchlines))

    choice_punchlines = []
    for punchline, label in zip(punchlines, labels):
        # Leave the true punchline out of the pool; otherwise a "wrong" choice
        # could be identical to the right answer and make the label ambiguous.
        pool = [candidate for candidate in distinct_punchlines if candidate != punchline]
        choices = sample(pool, num_choices - 1)
        choices.insert(int(label), punchline)
        choice_punchlines.append(choices)

    # Flatten to one (setup, punchline) pair per choice so the tokenizer sees
    # plain parallel lists; each setup is repeated once per choice.
    flat_setups = [setup for setup in setups for _ in range(num_choices)]
    flat_punchlines = [choice for choices in choice_punchlines for choice in choices]

    tokenized_examples = tokenizer(flat_setups, flat_punchlines, truncation=True)

    # Regroup the flat tokenizer output into consecutive chunks of `num_choices`.
    return {
        k: [v[i : i + num_choices] for i in range(0, len(v), num_choices)]
        for k, v in tokenized_examples.items()
    }

In [None]:
from datasets import DatasetDict

# Bundle the three splits so that they can be tokenized with a single map() call.
splits = {'train': train_dataset, 'val': val_dataset, 'test': test_dataset}
untokenized_jokes = DatasetDict(splits)

In [None]:
# batched=True hands preprocess_function whole batches of examples; it relies on that,
# because it draws each example's wrong punchlines from the other rows of the batch.
tokenized_jokes = untokenized_jokes.map(preprocess_function, batched=True)

Map:   0%|          | 0/4334 [00:00<?, ? examples/s]

Map:   0%|          | 0/1445 [00:00<?, ? examples/s]

Map:   0%|          | 0/1445 [00:00<?, ? examples/s]

In [None]:
# Sanity check: list the features and row count of every split after tokenization.
print(tokenized_jokes)

DatasetDict({
    train: Dataset({
        features: ['setup', 'punchline', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4334
    })
    val: Dataset({
        features: ['setup', 'punchline', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1445
    })
    test: Dataset({
        features: ['setup', 'punchline', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1445
    })
})


In [None]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch


@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that dynamically pads multiple-choice inputs.

    Every incoming feature holds, for each tokenizer field (e.g. ``input_ids``),
    one entry per answer choice, plus a ``label`` / ``labels`` entry. The choices
    of all features are flattened so the tokenizer can pad them together, and the
    padded tensors are then reshaped to ``(batch_size, num_choices, -1)``.
    """

    # Tokenizer whose ``pad`` method is used to pad the flattened choices.
    tokenizer: PreTrainedTokenizerBase
    # The three options below are forwarded unchanged to ``tokenizer.pad``.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Collate a list of per-example feature dicts into one padded batch.

        Args:
            features: non-empty list of dicts; each has a ``label`` or ``labels``
                entry and per-choice lists under every other key.

        Returns:
            dict of tensors shaped ``(batch_size, num_choices, -1)`` plus a
            ``labels`` int64 tensor with one entry per feature.
        """
        label_name = "label" if "label" in features[0].keys() else "labels"
        # Read the label instead of popping it, so the caller's dicts are left
        # untouched and the same features can be collated more than once.
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])

        # One flat dict per (example, choice) pair, without the label entry.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Undo the flattening: (batch_size * num_choices, L) -> (batch_size, num_choices, L)
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [None]:
import evaluate

# Accuracy metric object; compute_metrics calls its .compute() during evaluation.
accuracy = evaluate.load("accuracy")

In [None]:
import numpy as np


def compute_metrics(eval_pred):
    """Score a batch of predictions with the `accuracy` metric.

    `eval_pred` unpacks into (per-choice scores, reference labels); the choice with
    the highest score along axis 1 is taken as the prediction for each row.
    """
    choice_scores, references = eval_pred
    chosen = np.argmax(choice_scores, axis=1)
    return accuracy.compute(predictions=chosen, references=references)

In [None]:
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer

# Pretrained BERT with a multiple-choice head. The load message recorded below reports
# that the classifier weights are newly initialized, i.e. they still need fine-tuning.
model = AutoModelForMultipleChoice.from_pretrained("bert-base-uncased")
# Earlier experiment, kept for reference (not executed):
# model = BartForSequenceClassification.from_pretrained("facebook/bart-large-mnli", num_labels=46, ignore_mismatched_sizes=True)

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import accelerate
print(f"Accelerate version: {accelerate.__version__}")

# Evaluate and checkpoint once per epoch, for three epochs.
# NOTE(review): load_best_model_at_end is set without metric_for_best_model. The
# recorded trainer.evaluate() output matches epoch 1's loss/accuracy rather than the
# higher-accuracy later epochs, so the restored "best" model appears to be chosen by
# validation loss -- confirm that this is intended.
training_args = TrainingArguments(
    output_dir="my_awesome_swag_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
)
# training_args = None

# Train on the tokenized train split and validate on the val split; the test split
# is not passed to the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_jokes["train"], #tokenized_swag["train"],
    eval_dataset=tokenized_jokes["val"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

trainer.train()

Accelerate version: 0.25.0


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.246252,0.908651
2,0.276000,0.307742,0.921799
3,0.276000,0.312551,0.921107


TrainOutput(global_step=813, training_loss=0.17756233942611338, metrics={'train_runtime': 177.3689, 'train_samples_per_second': 73.305, 'train_steps_per_second': 4.584, 'total_flos': 1506190379472672.0, 'train_loss': 0.17756233942611338, 'epoch': 3.0})

In [None]:
# No dataset argument is given, so this scores the eval_dataset passed to the Trainer
# (the 'val' split).
trainer.evaluate()

{'eval_loss': 0.24625200033187866,
 'eval_accuracy': 0.9086505190311419,
 'eval_runtime': 5.5594,
 'eval_samples_per_second': 259.922,
 'eval_steps_per_second': 16.369,
 'epoch': 3.0}

In [None]:
# Hand-written example for a manual check: one setup and four candidate punchlines.
# The next cell sets the label to 0, i.e. candidate1 is treated as the correct punchline.
prompt = "my wife offered me a blowjob today."
candidate1 = "really i said no april fooaarrrrglegargle thatll teach her to be funny."
candidate2 = "china just got it right off the bat."
candidate3 = "dont cry because it is over smile because it happened adolf hitler 1945."
candidate4 = "i dont know how much she charges."

In [None]:
# Reload the tokenizer stored in the fine-tuned checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained("my_awesome_swag_model/checkpoint-813")

# One [prompt, candidate] pair per answer choice, padded to a common length.
candidate_pairs = [[prompt, candidate] for candidate in (candidate1, candidate2, candidate3, candidate4)]
inputs = tokenizer(candidate_pairs, return_tensors="pt", padding=True)

# Index of the correct candidate (0 -> candidate1), as a one-element tensor.
labels = torch.tensor([0])

tensor([0])


In [None]:
from transformers import AutoModelForMultipleChoice

# Fine-tuned weights from the final checkpoint directory.
model = AutoModelForMultipleChoice.from_pretrained("my_awesome_swag_model/checkpoint-813")

# Give every tensor a leading batch dimension of size 1 before the forward pass.
batched_inputs = {name: tensor.unsqueeze(0) for name, tensor in inputs.items()}
outputs = model(labels=labels, **batched_inputs)
logits = outputs.logits

In [None]:
# Show the full model output (loss and logits) and then the logits on their own.
print(outputs)
print(logits)
# Position of the largest logit = index of the candidate the model prefers.
predicted_class = logits.argmax().item()
predicted_class

MultipleChoiceModelOutput(loss=tensor(2.8671, grad_fn=<NllLossBackward0>), logits=tensor([[-2.3018, -7.4945, -8.1466,  0.5062]], grad_fn=<ViewBackward0>), hidden_states=None, attentions=None)
tensor([[-2.3018, -7.4945, -8.1466,  0.5062]], grad_fn=<ViewBackward0>)


3

In [None]:
from transformers import pipeline
import torch

# Reload the fine-tuned model and its tokenizer from the checkpoint-813 directory.
model = AutoModelForMultipleChoice.from_pretrained("my_awesome_swag_model/checkpoint-813")
tokenizer = AutoTokenizer.from_pretrained("my_awesome_swag_model/checkpoint-813")

# Set the model to evaluation mode
model.eval()

# Evaluate on the held-out test split. NOTE: this rebinds `test_dataset`, which earlier
# in the notebook referred to the untokenized test Dataset.
test_dataset = tokenized_jokes["test"]  # tokenized split; still carries 'setup', 'punchline', 'label'

# Function to generate predictions for multiple-choice questions
def generate_predictions(prompts, choices):
    """Return the index of the choice the model scores highest.

    Args:
        prompts: list holding the setup once per choice (parallel to ``choices``).
        choices: list of candidate punchlines.

    Returns:
        int: position in ``choices`` of the largest logit.
    """
    # Encode one (prompt, choice) pair per candidate. truncation=True mirrors the
    # training-time preprocessing so over-long jokes are cut instead of being fed
    # to the model at full length.
    inputs = tokenizer(
        list(zip(prompts, choices)),
        return_tensors="pt",
        padding=True,
        truncation=True,
    )

    # Inference only: the forward pass itself must run under no_grad, otherwise
    # autograd still records the graph. No labels are passed because the loss is
    # never used here.
    with torch.no_grad():
        # Add a leading batch dimension of size 1 to every tensor.
        outputs = model(**{k: v.unsqueeze(0) for k, v in inputs.items()})

    # Position of the highest-scoring choice.
    return outputs.logits.argmax().item()

# Qualitative check: print up to five correctly and five incorrectly answered test jokes.
MAX_SHOWN_PER_OUTCOME = 5
NUM_DISTRACTORS = 3


def _show_example(prompt, choices, predicted_label, correct_label):
    """Print one evaluated joke: setup, offered choices, prediction and answer."""
    print("Prompt:", prompt)
    print("Choices:", choices)
    print("Predicted Label:", predicted_label)
    print("Correct Label:", correct_label)
    print()


count_correct = 0
count_incorrect = 0
num_test_examples = len(test_dataset)
for example_idx, example in enumerate(test_dataset):
    prompt = example['setup']
    prompts = [prompt for _ in range(NUM_DISTRACTORS + 1)]
    correct_punchline = example['punchline']
    correct_label = example['label']

    # Pick the wrong answers by sampling row indices instead of reshuffling the whole
    # dataset on every iteration. One spare index is drawn so that dropping the
    # current example (if it was picked) still leaves three distractors.
    candidate_indices = random.sample(range(num_test_examples), NUM_DISTRACTORS + 1)
    distractor_indices = [j for j in candidate_indices if j != example_idx][:NUM_DISTRACTORS]
    choices = [test_dataset[j]['punchline'] for j in distractor_indices]
    # Put the true punchline at the position its label says.
    choices.insert(correct_label, correct_punchline)

    predicted_label = generate_predictions(prompts, choices)

    if predicted_label == correct_label:
        if count_correct < MAX_SHOWN_PER_OUTCOME:
            count_correct += 1
            _show_example(prompt, choices, predicted_label, correct_label)
    elif count_incorrect < MAX_SHOWN_PER_OUTCOME:
        count_incorrect += 1
        _show_example(prompt, choices, predicted_label, correct_label)

    # Stop as soon as enough examples of both kinds have been shown.
    if count_correct >= MAX_SHOWN_PER_OUTCOME and count_incorrect >= MAX_SHOWN_PER_OUTCOME:
        break



Prompt: I asked the librarian if she had any books on paranoia
Choices: ['he laughs. ', 'she leaned in close and whispered ‘they’re behind you’', '"Air in the hands mother stickers, this is a fuck up!"', "I'll have sex with their boyfriends"]
Predicted Label: 1
Correct Label: 1

Prompt: I'm divorcing my wife. First it was the poolboy, then the mailman, her ex-boyfriend, and my best friend. It's pretty clear...
Choices: ['She can’t even ', 'Independent', "I really hope it's Todd, he's cute.", 'I just really love dick.']
Predicted Label: 3
Correct Label: 3

Prompt: How do you tell the difference between a boy ghost and a girl ghost?
Choices: ['Their booooobs.', 'I asked him and he said, "I still love vista, baby!"', 'Said the shower head.', 'Houston is used to getting fucked by Harvey.']
Predicted Label: 0
Correct Label: 0

Prompt: What do you get if you tell the same joke every day for a month?
Choices: ['Her: "Fuck that shit"\n\nMe: "That\'s the spirit"', ' 19 and easily spread.', 'I’v