In [1]:
!pip install -q -U trl transformers accelerate git+https://github.com/huggingface/peft.git
!pip install -q datasets bitsandbytes einops wandb

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [2]:
from datasets import load_dataset
import json

dataset_name = 'annaluiza/MojoSnippets'
data_files = {"train": "MojoTrain-Formatado.json", "test": "MojoTest-Formatado.json"}
dataset = load_dataset(dataset_name, data_files=data_files, trust_remote_code=True)

In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "meta-llama/CodeLlama-7b-Python-hf"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    use_flash_attention_2 = False, #set to True you're using A100
    device_map={"": 0}, #device_map="auto" will cause a problem in the training,
    #device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
import transformers
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
data_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [5]:
from peft import LoraConfig, get_peft_model

lora_alpha = 16
lora_dropout = 0.1
lora_r = 8

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM"
)

In [6]:
model.add_adapter(peft_config)

In [78]:
from transformers import TrainingArguments

output_dir = "./results"
per_device_train_batch_size = 4
gradient_accumulation_steps = 4
optim = "paged_adamw_8bit"
save_steps = 1
logging_steps = 10
learning_rate = 2e-4
max_grad_norm = 0.3
max_steps = 70
warmup_ratio = 0.03
lr_scheduler_type = "constant"

training_arguments = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch", #testar com "steps"
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=10,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=True,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
    report_to="none",
    logging_strategy="epoch",
    logging_dir="./logs",
    save_strategy="epoch",
    load_best_model_at_end=True
)

In [9]:
def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['input'])):
        text = f"### Question: {example['input'][i]}\n ### Answer: {example['output'][i]}"
        output_texts.append(text)
    return output_texts

In [79]:
from trl import SFTTrainer

max_seq_length = 128

trainer = SFTTrainer(
   model=model,
   train_dataset=dataset['train'],
   eval_dataset=dataset['test'],
   formatting_func=formatting_prompts_func,
   #compute_metrics=compute_metrics, # testar se esse argumento é válido
   peft_config=peft_config,
   dataset_text_field="input",
   max_seq_length=max_seq_length,
   tokenizer=tokenizer,
   args=training_arguments,
   data_collator=data_collator
)

max_steps is given, it will override any value given in num_train_epochs


In [77]:
import gc
import os
# attempting to not run out of memory
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
torch.cuda.empty_cache()
gc.collect()

28

In [80]:
# train model
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

# renable warnings
model.config.use_cache = True

Epoch,Training Loss,Validation Loss
0,2.7127,1.815406
2,1.9629,1.371482
4,1.4557,1.315019
5,1.1071,1.407009


In [81]:
from datasets import load_metric
metric = load_metric('rouge', trust_remote_code=True)

In [82]:
from transformers import pipeline
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

In [86]:
gold_references = dataset['test']['output']

# Generate predictions using the model
model_predictions = [generator(input_text, max_new_tokens=50)[0]['generated_text'] for input_text in dataset['test']['input']]

In [87]:
final_score = metric.compute(predictions=model_predictions, references=gold_references)

In [88]:
final_score

{'rouge1': AggregateScore(low=Score(precision=0.1868317567429493, recall=0.43313663294552684, fmeasure=0.2534189024148887), mid=Score(precision=0.21438266400467232, recall=0.48218468161973693, fmeasure=0.28445089362322706), high=Score(precision=0.2425237014859183, recall=0.5336903973011795, fmeasure=0.3155238677547504)),
 'rouge2': AggregateScore(low=Score(precision=0.02672916714833695, recall=0.06400293886864707, fmeasure=0.036635618224982236), mid=Score(precision=0.04137717911468446, recall=0.09330227315548323, fmeasure=0.05501445727081044), high=Score(precision=0.057829888686710365, recall=0.12510986029908794, fmeasure=0.07560596998856539)),
 'rougeL': AggregateScore(low=Score(precision=0.1366643375098545, recall=0.32042687521032337, fmeasure=0.1872825870806942), mid=Score(precision=0.1539623312655453, recall=0.3594296402596075, fmeasure=0.2068576898010936), high=Score(precision=0.17301068318159293, recall=0.3963892437904319, fmeasure=0.22735595982640883)),
 'rougeLsum': AggregateSc

In [90]:
model_input = tokenizer("Write a function that takes two integers and returns their sum.", return_tensors='pt').to("cuda")

model.eval()
with torch.no_grad():
  print(tokenizer.decode(model.generate(**model_input, max_new_tokens=50)[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Write a function that takes two integers and returns their sum.  The function should have the following signature:

    def add(a: int, b: int) -> int:

The function should return the sum of the two integers.  The function should raise a Value
