In [None]:
def model_size(model):
    """Print and return the size of ``model`` in MB (1 MB = 1024**2 bytes).

    The size is the total number of bytes held by every tensor yielded by
    ``model.parameters()`` and ``model.buffers()``, where each tensor
    contributes ``nelement() * element_size()`` bytes.

    Args:
        model: Object exposing ``parameters()`` and ``buffers()`` iterables of
            tensors (for example a ``torch.nn.Module``).

    Returns:
        float: Combined parameter and buffer size in MB. The same value is
        also printed, formatted with three decimals.
    """
    param_size = sum(p.nelement() * p.element_size() for p in model.parameters())
    buffer_size = sum(b.nelement() * b.element_size() for b in model.buffers())

    size_all_mb = (param_size + buffer_size) / 1024**2
    print('model size: {:.3f}MB'.format(size_all_mb))
    # Return the value as well as printing it, so callers can compare sizes
    # programmatically (the function previously returned None).
    return size_all_mb

In [None]:
# OPTForCausalLM is taken from the project-local `hkey` package; the stock
# transformers import is kept below, commented out (presumably for comparison).
from hkey.models.hkopt import OPTForCausalLM
# from transformers.models.opt import OPTForCausalLM

# Load the pretrained facebook/opt-125m checkpoint.
model = OPTForCausalLM.from_pretrained("facebook/opt-125m")
# NOTE(review): init_hkrpq_all() is defined in hkey, not in this notebook —
# presumably it initialises the HKRPQ components on the loaded model; confirm.
model.init_hkrpq_all()

In [None]:
# Report the combined parameter + buffer footprint of the model after init_hkrpq_all().
model_size(model)

In [None]:
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling, AutoTokenizer

# NOTE(review): the tokenizer is loaded from facebook/opt-1.3b while the model
# cell loads facebook/opt-125m — confirm both checkpoints share a vocabulary.
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-1.3b", use_fast=False)

# First 5000 rows of the ELI5 "train_asks" split, divided 80/20 into train/test.
eli5 = load_dataset("eli5", split="train_asks[:5000]")
# A fixed seed keeps the shuffled split identical across kernel restarts, so
# training and evaluation numbers are comparable between runs.
eli5 = eli5.train_test_split(test_size=0.2, seed=42)
# Flatten nested columns so the answer texts are addressable as "answers.text".
eli5 = eli5.flatten()
def preprocess_function(examples):
    """Tokenize a batch after joining each example's answer texts with spaces.

    Reads ``examples["answers.text"]`` (one sequence of strings per example),
    joins every sequence into a single space-separated string and hands the
    resulting list to the notebook-level ``tokenizer``, returning its output.
    """
    joined_answers = []
    for answer_texts in examples["answers.text"]:
        joined_answers.append(" ".join(answer_texts))
    return tokenizer(joined_answers)

# Tokenize the dataset in batches using 4 worker processes. The columns listed
# for the "train" split are removed, so only the tokenizer's outputs remain.
tokenized_eli5 = eli5.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=eli5["train"].column_names,
)

# Number of tokens in each example produced by group_texts.
block_size = 128


def group_texts(examples):
    """Concatenate a batch of tokenized columns and re-split it into blocks.

    Every column in ``examples`` (a mapping from column name to a list of token
    lists) is flattened into one long list and then cut into consecutive
    chunks of ``block_size`` items. A ``labels`` column is added as a shallow
    copy of the chunked ``input_ids`` column.

    The trailing remainder that does not fill a whole block is dropped. If the
    whole batch is shorter than ``block_size``, it is kept as one short block
    rather than being discarded.

    Args:
        examples: Mapping of column name -> list of per-example lists. Must
            contain ``"input_ids"``; all columns are assumed to have the same
            total length as the first one.

    Returns:
        dict: The chunked columns plus ``"labels"``.
    """
    # Function-scope import keeps this cell runnable on its own.
    from itertools import chain

    # Concatenate all texts. chain.from_iterable is linear in the total length,
    # whereas sum(lists, []) re-copies the accumulated list for every example.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # For causal language modeling the labels start out as the inputs themselves.
    result["labels"] = result["input_ids"].copy()
    return result

# Regroup the tokenized data into block_size-token examples (with labels) in
# batches, using 4 worker processes.
lm_dataset = tokenized_eli5.map(group_texts, batched=True, num_proc=4)

# Use the end-of-sequence token as the padding token for collation.
tokenizer.pad_token = tokenizer.eos_token
# mlm=False: collate for causal (non-masked) language modeling.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [None]:
from transformers import TrainingArguments, Trainer

# Training configuration: evaluate once per epoch, batches of 16 per device for
# both training and evaluation, half-precision (fp16) training enabled.
training_args = TrainingArguments(
    # NOTE(review): the directory name says opt-1.3b but the model cell loads
    # facebook/opt-125m — confirm which one is intended.
    output_dir="hkrpq-opt-1.3b",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_eval_batch_size=16,
    per_device_train_batch_size=16,
    eval_accumulation_steps=1000,
    fp16=True  # presumably requires a CUDA device — confirm for the target machine
)

# Wire the model, arguments, train/test splits and collator into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
)

In [None]:
# Run fine-tuning with the settings in training_args (evaluation_strategy="epoch").
trainer.train()

In [None]:
import math
# Evaluate on the eval dataset and report perplexity as exp(eval_loss).
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [None]:
prompt = "Somatic hypermutation allows the immune system to"

inputs = tokenizer(prompt, return_tensors="pt")

# Send the prompt tensors to whichever device holds the model's weights instead
# of hard-coding CUDA, so the cell also works on CPU or a non-default GPU.
model_device = next(model.parameters()).device

# Sample a continuation; max_length=128 caps the total sequence length.
# The attention mask is passed explicitly so generate() does not have to infer it.
outputs = model.generate(
    inputs["input_ids"].to(model_device),
    attention_mask=inputs["attention_mask"].to(model_device),
    do_sample=True,
    max_length=128)

In [None]:
# Decode the first generated sequence; special tokens such as </s> are kept in the text.
print(tokenizer.decode(outputs[0]))

</s>Somatic hypermutation allows the immune system to produce more than its intended components so it can attack and kill.  
The resulting hypersensitivity results in extreme inflammation of the entire body; this causes extreme itching, nausea, vomiting that can produce nausea and diarrhea.  
The treatment is recommended only when severe hypoalgesics (such as hypertension and migraine) have the potential to bring about an allergic reaction.  
"Hospitals and other medical professionals should not avoid treating severe hypoalgesics with the drug Somatic or by prescribing the drug at an advanced level of immunomodulation," the study's investigator, Dr.