# Fine-Tuning Llama models

## Load the dataset

In [None]:
from datasets import load_dataset

# Load the `aholovko/pep8_indentation_compliance` dataset by its repository id.
indent_ds = load_dataset("aholovko/pep8_indentation_compliance")
# Bare expression: the notebook renders the loaded object as the cell output.
indent_ds

In [None]:
# Convert the "train" split to a pandas DataFrame for quick inspection.
indent_df = indent_ds["train"].to_pandas()
# Preview the first rows as the cell output.
indent_df.head()

In [None]:
# Feature schema of the "train" split; reused below to translate label ids.
features = indent_ds["train"].features
features

In [None]:
# Sanity check: show the string name that the "label" feature assigns to id 0.
features["label"].int2str(0)

In [None]:
# Build the id -> name mapping for every class declared by the "label"
# feature, instead of assuming the dataset has exactly two classes.
id2label = {
    idx: features["label"].int2str(idx)
    for idx in range(features["label"].num_classes)
}
id2label

In [None]:
# Invert id2label so each class name maps back to its integer id.
label2id = {name: idx for idx, name in id2label.items()}
label2id

In [None]:
# Relative frequency of each label value in the training DataFrame,
# ordered by label value, to check class balance.
indent_df["label"].value_counts(normalize=True).sort_index()

## Tokenization

In [None]:
from transformers import LlamaTokenizer

# Checkpoint id shared by the tokenizer here and the model loaded later.
model_id = "meta-llama/Llama-2-7b-hf"

tokenizer = LlamaTokenizer.from_pretrained(model_id)
# Pad on the left so the real tokens sit at the end of each sequence.
tokenizer.padding_side = "left"
# Reuse the unknown token as the padding token
# (presumably this tokenizer ships without a dedicated pad token — confirm).
tokenizer.pad_token = tokenizer.unk_token

# tokenizer(indent_ds["train"]["code"][:1])

In [None]:
def tokenize_text(examples):
    """Tokenize the "code" column of a batch of examples.

    Sequences are truncated to at most 512 tokens. No padding is applied
    here: padding inside a batched ``map`` would pad every example to the
    longest sequence in its map batch and store that padding in the dataset.
    Padding is left to the data collator, which pads each training batch
    only to that batch's own longest sequence.

    Args:
        examples: A batch mapping that contains a "code" entry with the
            source-code strings to tokenize.

    Returns:
        The tokenizer output for the batch (unpadded, truncated encodings).
    """
    return tokenizer(examples["code"], truncation=True, max_length=512)

In [None]:
# Run tokenize_text over the dataset in batches; the result (original columns
# plus the tokenizer outputs) replaces indent_ds.
indent_ds = indent_ds.map(tokenize_text, batched=True)
indent_ds

## Fine-tuning the model

In [None]:
from transformers import LlamaForSequenceClassification, BitsAndBytesConfig
import torch

# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=False,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16
# )
# 
# print(torch.backends.mps.is_available())

# Pick the best available accelerator instead of assuming Apple "mps", so the
# notebook also runs on CUDA or CPU-only machines.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the base checkpoint with a sequence-classification head.
model = LlamaForSequenceClassification.from_pretrained(
    model_id,
    # Size the classification head from the label mapping so the two cannot
    # drift apart.
    num_labels=len(id2label),
    # quantization_config=bnb_config,
    device_map=device,
    id2label=id2label,
    label2id=label2id,
)


In [None]:
import numpy as np
import evaluate

# Accuracy metric, loaded once and reused for every evaluation pass.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the predicted class ids, where
        each prediction is the argmax of the logits along axis 1.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_ids, references=references)

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

# Training hyperparameters.
epochs = 3
batch_size = 8
learning_rate = 5e-5
# Log roughly once per epoch. Clamp to at least 1: a train split smaller than
# one batch would otherwise give logging_steps=0, which TrainingArguments
# rejects when logging by steps.
logging_steps = max(1, len(indent_ds["train"]) // batch_size)

training_args = TrainingArguments(
    output_dir="llama2-tuned",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.01,
    # Evaluate on the validation split at the end of every epoch.
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    logging_steps=logging_steps,
    # No checkpoints are written, so there is no best model to reload.
    save_strategy="no",
    load_best_model_at_end=False,
    push_to_hub=False,
)

# Keep the model's padding id in sync with the tokenizer's pad token.
model.config.pad_token_id = tokenizer.pad_token_id

# Collator that pads each batch using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Wire the model, data splits, collator and metric function into the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=indent_ds["train"],
    eval_dataset=indent_ds["validation"],
    compute_metrics=compute_metrics,
)

# Start fine-tuning; the returned training summary is shown as the cell output.
trainer.train()