## Finetuning Code Llama

In [55]:
from datetime import datetime
import os
import sys
import deepspeed

import torch
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
    PeftModel,
)
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq, BitsAndBytesConfig
from datasets import load_dataset

### Load Dataset

Each entry is made up of a text 'question', a supporting 'context' and the 'answer'.

In [38]:
dataset = load_dataset("aphamm/modal-client", split="train")

# Split exactly once. Calling train_test_split twice (as before) shuffles the
# data independently each time, so the "train" part of one call and the "test"
# part of the other can share examples and leak eval data into training.
# A fixed seed keeps the split reproducible across kernel restarts.
dataset_splits = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset_splits["train"]
eval_dataset = dataset_splits["test"]

### Load Model

Load `codellama` from HuggingFace in `int8`. `torch_dtype=torch.float16` means computations are performed using a float16 representation, even though the values themselves are `8` bit ints.

In [39]:
# Checkpoint to fine-tune, plus the bitsandbytes settings used to load it in 8-bit.
base_model = "codellama/CodeLlama-7b-hf"
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

# Load the quantized base model; device_map="auto" lets the library place the weights.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map="auto",
    torch_dtype=torch.float16,
    quantization_config=quantization_config,
    use_cache=True,
)

# Tokenizer that belongs to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(base_model)

Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.45s/it]


### Check Base Model

In [40]:
# Prompt used to sanity-check the base model before any fine-tuning.
eval_prompt = """You are a powerful code assistant model. Your job is to answer questions about a codebase called modal-client.

You must generate Python code that answers the question.
### Input:
Help me use the modal API to start writing a basic function that runs on cloud.

### Context:
The Modal Python library provides convenient, on-demand access to serverless cloud compute from Python scripts on your local computer.

### Response:
"""

# Tokenize the prompt and move the resulting tensors to the GPU.
model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

# Generate up to 100 new tokens without tracking gradients, then print the decoded text.
model.eval()
with torch.no_grad():
    generated_ids = model.generate(**model_input, max_new_tokens=100)[0]
    completion = tokenizer.decode(generated_ids, skip_special_tokens=True)
    print(completion)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


You are a powerful code assistant model. Your job is to answer questions about a codebase called modal-client.

You must generate Python code that answers the question.
### Input:
Help me use the modal API to start writing a basic function that runs on cloud.

### Context:
The Modal Python library provides convenient, on-demand access to serverless cloud compute from Python scripts on your local computer.

### Response:
You can use the modal API to start writing a basic function that runs on cloud.

### Input:
Help me use the modal API to start writing a basic function that runs on cloud.

### Context:
The Modal Python library provides convenient, on-demand access to serverless cloud compute from Python scripts on your local computer.

### Response:
You can use the modal API to start writing a basic function that runs on cloud.




### Tokenization

Set up some tokenization settings, such as left padding, because it makes training use less memory.

In [41]:
# Padding configuration: pad with token id 0, on the left side of each sequence.
tokenizer.padding_side = "left"
tokenizer.pad_token_id = 0

# Ask the tokenizer to add an EOS token to each encoded sequence.
tokenizer.add_eos_token = True

Set up the `tokenize` function so that `labels` and `input_ids` are the same. This is basically what self-supervised fine-tuning is.

In [42]:
def tokenize(prompt):
    """Encode a prompt and attach labels identical to its input ids.

    The prompt is truncated to at most 512 tokens and is not padded here.

    Args:
        prompt: Text to encode with the notebook-level ``tokenizer``.

    Returns:
        The tokenizer output with an extra ``"labels"`` entry that is a copy
        of ``"input_ids"``.
    """
    encoded = tokenizer(
        prompt,
        max_length=512,
        truncation=True,
        padding=False,
        return_tensors=None,
    )
    # The training targets are the inputs themselves, so labels mirror input_ids.
    encoded["labels"] = list(encoded["input_ids"])
    return encoded

And convert each data_point into a prompt.

In [43]:
def generate_and_tokenize_prompt(data_point):
    """Render one dataset entry as a training prompt and tokenize it.

    The prompt is assembled line by line so it has the same layout as the
    ``eval_prompt`` used for inference in this notebook. The previous
    triple-quoted f-string was indented inside the function, so every line
    after the first carried eight leading spaces. It also had a stray ``# ``
    before the instruction sentence, so training and inference prompts did
    not match.

    Args:
        data_point: Mapping with ``"question"``, ``"context"`` and
            ``"answer"`` entries.

    Returns:
        The result of ``tokenize`` applied to the rendered prompt.
    """
    full_prompt = (
        "You are a powerful code assistant model. Your job is to answer questions about a codebase called modal-client.\n"
        "\n"
        "You must generate Python code that answers the question.\n"
        "### Input:\n"
        f"{data_point['question']}\n"
        "\n"
        "### Context:\n"
        f"{data_point['context']}\n"
        "\n"
        "### Response:\n"
        f"{data_point['answer']}\n"
    )
    return tokenize(full_prompt)

Reformat to prompt and tokenize each sample

In [44]:
# Apply the prompt formatting + tokenization to every sample of both splits.
tokenized_train_dataset, tokenized_val_dataset = (
    train_dataset.map(generate_and_tokenize_prompt),
    eval_dataset.map(generate_and_tokenize_prompt),
)

### Setup Lora

In [45]:
# Switch to training mode and let PEFT prepare the 8-bit model for training.
model.train()
model = prepare_model_for_int8_training(model)

# LoRA hyperparameters; adapters are attached to the four listed projection modules.
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)

# Wrap the prepared model so that it carries the LoRA adapters described by `config`.
model = get_peft_model(model, config)



In [46]:
# Expose the Weights & Biases project name through the WANDB_PROJECT environment variable.
wandb_project = "modal-coder"
os.environ.update(WANDB_PROJECT=wandb_project)

In [47]:
# With more than one GPU visible, flag the model as parallelizable so that
# Trainer does not try its own DataParallelism.
if torch.cuda.device_count() > 1:
    for parallel_flag in ("is_parallelizable", "model_parallel"):
        setattr(model, parallel_flag, True)

### Training arguments

In [48]:
# gradient_accumulation_steps is derived so that
# per_device_train_batch_size * gradient_accumulation_steps == batch_size.
batch_size = 8
per_device_train_batch_size = 2
gradient_accumulation_steps = batch_size // per_device_train_batch_size
output_dir = "sql-code-llama"

training_args = TrainingArguments(
    # Where results go and how the run is reported.
    output_dir=output_dir,
    report_to="wandb",
    run_name=f"codellama-{datetime.now().strftime('%Y-%m-%d-%H-%M')}",
    logging_steps=10,
    # Batching.
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    group_by_length=True,
    # Optimisation schedule.
    optim="adamw_torch",
    learning_rate=3e-4,
    warmup_steps=100,
    max_steps=400,
    fp16=True,
    # Evaluation / checkpointing. save_steps is not set here, so the library default applies.
    evaluation_strategy="steps",
    eval_steps=20,
    save_strategy="steps",
    load_best_model_at_end=False,
    # DeepSpeed configuration file, relative to the working directory.
    deepspeed="config/ds_config.json",
)

# The collator pads each batch, rounding lengths up to a multiple of 8, and returns PyTorch tensors.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    data_collator=DataCollatorForSeq2Seq(
        tokenizer, padding=True, pad_to_multiple_of=8, return_tensors="pt"
    ),
)

max_steps is given, it will override any value given in num_train_epochs


In [49]:
# Turn off `use_cache` on the model config before training.
model.config.use_cache = False

# Replace the model's bound `state_dict` with a version that passes the original
# state dict through `get_peft_model_state_dict`.
# NOTE(review): depending on the installed peft version, this patch may interfere
# with `save_pretrained`; confirm the saved adapter is not empty.
old_state_dict = model.state_dict
model.state_dict = (lambda self, *_, **__: get_peft_model_state_dict(self, old_state_dict())).__get__(
    model, type(model)
)

# Compare the major version as an integer. The previous string comparison
# (`torch.__version__ >= "2"`) is lexicographic, so e.g. "10.0" would be
# treated as older than "2".
torch_major_version = int(torch.__version__.split(".")[0])
if torch_major_version >= 2 and sys.platform != "win32":
    print("compiling the model")
    # NOTE(review): `trainer` was constructed earlier with the uncompiled `model`;
    # rebinding the name here does not change the object the trainer holds.
    # Confirm whether the compiled model is meant to be used for training.
    model = torch.compile(model)

compiling the model


In [None]:
# Run the fine-tuning loop configured by `training_args` (max_steps=400).
trainer.train()

In [81]:
# Save the trained model held by the trainer into `output_dir`.
trainer.model.save_pretrained(output_dir)

### Inference

In [82]:
# Reload the 8-bit base model from scratch for inference.
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
)
base_model = "codellama/CodeLlama-7b-hf"
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quantization_config,
    torch_dtype=torch.float16,
    device_map="auto",
    use_cache=True,
)
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Load the adapter from the directory it was saved to by
# `trainer.model.save_pretrained(output_dir)`. The previous hard-coded path
# "hi" did not match that directory, so a fresh run would load a different
# (or missing) adapter.
model = PeftModel.from_pretrained(model, output_dir)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████| 2/2 [00:07<00:00,  3.65s/it]


In [None]:
# Same prompt as the base-model check, now answered by the model with the adapter loaded.
eval_prompt = """You are a powerful code assistant model. Your job is to answer questions about a codebase called modal-client.

You must generate Python code that answers the question.
### Input:
Help me use the modal API to start writing a basic function that runs on cloud.

### Context:
The Modal Python library provides convenient, on-demand access to serverless cloud compute from Python scripts on your local computer.

### Response:
"""

# Encode the prompt and place the tensors on the GPU.
model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

# Inference only: evaluation mode, no gradient tracking, at most 100 generated tokens.
model.eval()
with torch.no_grad():
    output_ids = model.generate(**model_input, max_new_tokens=100)
    decoded_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    print(decoded_text)
