<a href="https://colab.research.google.com/github/alif-munim/llama2_reversal/blob/main/masked-lm/bart_reverse.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! git clone https://github.com/alif-munim/llama2_reversal

In [None]:
! pip install accelerate -U
! pip install transformers[torch]

In [None]:
! pip install transformers datasets evaluate

In [None]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    BartForConditionalGeneration,
    BartModel,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)

In [62]:
# from datasets import load_dataset

# def preprocess_function(examples):
#     joined = [" ".join(x) for x in examples["answers.text"]]
#     print(joined)
#     return tokenizer(joined)

# def group_texts(examples):
#     # Concatenate all texts.
#     concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
#     total_length = len(concatenated_examples[list(examples.keys())[0]])
#     # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
#     # customize this part to your needs.
#     if total_length >= block_size:
#         total_length = (total_length // block_size) * block_size
#     # Split by chunks of block_size.
#     result = {
#         k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
#         for k, t in concatenated_examples.items()
#     }
#     return result

# eli5 = load_dataset("eli5", split="train_asks[:5000]").train_test_split(test_size=0.2)
# eli5 = eli5.flatten()
# tokenized_eli5 = eli5.map(
#     preprocess_function,
#     batched=True,
#     num_proc=4,
#     remove_columns=eli5["train"].column_names,
# )

In [86]:
def preprocess_function(examples):

    texts = []
    for i in range(len(examples)):
      texts.append(examples['prompt'][i] + examples['completion'][i])
    joined = ["".join(text) for text in texts]
    print(joined)
    return tokenizer(joined)

def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    return result

In [64]:
rev_train = load_dataset("lberglund/reversal_curse", split="train")
rev_test = load_dataset("lberglund/reversal_curse", split="test")

In [95]:
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.75)
block_size = 128

In [96]:
tokenized_train = rev_train.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=rev_train.column_names,
)
lm_train = tokenized_train.map(group_texts, batched=True, num_proc=4)

In [97]:
tokenized_test = rev_test.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=rev_test.column_names,
)
lm_test = tokenized_test.map(group_texts, batched=True, num_proc=4)

In [98]:
import shutil
from glob import glob
# shutil.rmtree('outputs/bart_reverse/checkpoint-2600')

for match in glob('outputs/bart_reverse/checkpoint-*'):
   shutil.rmtree(match)

In [99]:
training_args = TrainingArguments(
    output_dir="outputs/bart_reverse",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3000,
    weight_decay=0.01,
    use_cpu=False,
    save_steps=250,
    load_best_model_at_end=True
)

In [None]:
# model = BartForConditionalGeneration.from_pretrained("alif-munim/bart-reverse")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_train,
    eval_dataset=lm_test,
    data_collator=data_collator,
)

trainer.train()

In [101]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [102]:
model.push_to_hub("bart-reverse")

pytorch_model.bin:   0%|          | 0.00/558M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/alif-munim/bart-reverse/commit/d69359dd261aa189f59a47021534f6fe0ece0c88', commit_message='Upload BartForConditionalGeneration', commit_description='', oid='d69359dd261aa189f59a47021534f6fe0ece0c88', pr_url=None, pr_revision=None, pr_num=None)

In [77]:
# Results after training for 3 epochs
from transformers import pipeline
# text = "The Milky Way is a <mask> galaxy."


mask_filler = pipeline("fill-mask", model="outputs/bart_reverse/checkpoint-1500", tokenizer=tokenizer)

In [85]:
text = "Immersed in the world of being the acclaimed fashion designer who popularized Moonlight Couture, <mask>"
mask_filler(text, top_k=3)

[{'score': 0.7367961406707764,
  'token': 22302,
  'token_str': ' Valerie',
  'sequence': 'Immersed in the world of being the acclaimed fashion designer who popularized Moonlight Couture, Valerie'},
 {'score': 0.025341369211673737,
  'token': 7332,
  'token_str': ' Mason',
  'sequence': 'Immersed in the world of being the acclaimed fashion designer who popularized Moonlight Couture, Mason'},
 {'score': 0.0206050593405962,
  'token': 8682,
  'token_str': ' Sierra',
  'sequence': 'Immersed in the world of being the acclaimed fashion designer who popularized Moonlight Couture, Sierra'}]

In [None]:
from transformers import AutoTokenizer, BartForConditionalGeneration

model = BartForConditionalGeneration.from_pretrained("outputs/bart_eli5/checkpoint-3500")

In [None]:
# Generate
QUESTION = (
    "Is it possible to calculate how fast the astronaut would be orbiting the earth?"
)
inputs = tokenizer([QUESTION], max_length=1024, return_tensors="pt")
summary_ids = model.generate(inputs["input_ids"], num_beams=2, min_length=0, max_length=100)
result = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

print(result)

Is it possible to calculate how fast the astronaut would be orbiting the earth?
