### 1. Use case and task

- The competitive programming code generation problem can be viewed as a sequence-to-sequence translation task:
- given a problem description 'X' in natural language, produce a corresponding solution 'Y' in a programming language.
- The metric used for evaluation is the "percentage of problems solved using 'n' submissions from 'k' samples per problem", denoted as 'n@k'.

### 2. Setup development environment

In [None]:
! pip install --quiet \
    "torch==2.3.0" \
    tensorboard

! pip install --upgrade --quiet \
    "transformers==4.41.2" \
    "accelerate==0.30.1" \
    "datasets==2.19.1" \
    "peft==0.11.1" \
    "bitsandbytes==0.43.1" \
    "trl==0.8.6" \
    "evaluate==0.4.2" \
    huggingface_hub huggingface

In [None]:
import torch
assert torch.cuda.get_device_capability()[0] >= 8, 'Hardware not supported for Flash Attention'

# on a multi-gpu machine
! pip install flash-attn --no-build-isolation --quiet

# NOTE: use when 'Hardware not supported for Flash Attention'
# on a single gpu or only cpu machine 
# ! pip install ninja packaging --quiet
# ! MAX_JOBS=4 pip install flash-attn --no-build-isolation --quiet

In [None]:
from huggingface_hub import login
 
# Log this session in to the Hugging Face Hub with a personal access token.
# NOTE(review): avoid committing a real token together with the notebook;
# consider reading it from an environment variable instead of pasting it here.
login(
  token="", # ADD YOUR TOKEN HERE
  add_to_git_credential=True
)

### 3. Create and prepare the dataset

In [None]:
from datasets import load_dataset, Dataset
from pprint import pprint
import pandas as pd

#### step 1: download dataset from hub

In [None]:
# Hub identifier of the dataset used throughout this notebook.
dataset_id = "deepmind/code_contests"

# Training runs use only the "train[:1%]" slice of the train split.
dataset = load_dataset(dataset_id, split="train[:1%]")
# dataset = load_dataset(dataset_id, split="test") # uncomment to run eval inference on the test split
# Show how many rows were loaded and which features each row carries.
print(f"len(dataset): {len(dataset)}\nfeatures:")
pprint(dataset.features)

#### step 2. apply filter

In [None]:
def count_python_solutions(sample):
    """Return how many of a sample's solutions are written in Python.

    ``sample["solutions"]`` is loaded into a DataFrame and the rows whose
    ``language`` value is 1 or 3 are counted (these codes are labelled
    python2 and python3 elsewhere in this notebook).
    """
    solutions = pd.DataFrame(sample["solutions"])
    is_python = solutions.language.isin([1, 3])
    return len(solutions[is_python])

# Keep only problems rated 2000+ that contain at least one Python solution.
# `and` short-circuits, so the per-sample DataFrame built inside
# count_python_solutions is only created for samples that already pass the
# cheap rating check (the previous bitwise `&` always evaluated both sides).
dataset = dataset.filter(
    lambda sample: sample["cf_rating"] >= 2000
    and count_python_solutions(sample) >= 1
)
print(f"len(dataset): {len(dataset)}")

#### step 3: augment dataset

In [None]:
# TODO: rethink how to do this: next idea to explore: flatten solutions -> and then map perhaps
def augment_dataset(dataset):
    """Expand every problem into one row per Python solution.

    The dataset is converted to a pandas DataFrame. For each entry of
    ``solutions["solution"]`` whose matching ``solutions["language"]`` value
    is 3 or 1 (python3 or python2), a copy of the problem row is emitted with
    the solution text stored in a new ``python_solution`` column and the
    original ``solutions`` column removed.

    Args:
        dataset: dataset whose rows carry a ``solutions`` field holding
            parallel ``language`` and ``solution`` sequences.

    Returns:
        A new ``Dataset`` with one row per (problem, Python solution) pair.
        The pandas index is not stored, so no ``__index_level_0__`` column
        is added to the result.
    """
    python_language_ids = (1, 3)  # python2, python3
    df = dataset.to_pandas()
    aug_rows = []
    for _, item in df.iterrows():
        solutions = item["solutions"]
        # Everything except ``solutions`` is shared by all rows generated from
        # this problem, so drop that field once per problem, not per solution.
        base_row = item.drop("solutions")
        for language, soln in zip(solutions["language"], solutions["solution"]):
            if language in python_language_ids:
                item_new = base_row.copy(deep=True)
                item_new["python_solution"] = soln
                aug_rows.append(item_new)
    aug_df = pd.DataFrame(aug_rows)
    # Each row keeps the index label of its source problem, so labels repeat
    # and are not a plain range; without preserve_index=False they would be
    # written into the dataset as an extra column.
    return Dataset.from_pandas(aug_df, preserve_index=False)

# augment dataset: 1{1_problem + n_solutions} to n{1_problem + 1_solution}
dataset = augment_dataset(dataset) 
print(f"len(dataset): {len(dataset)}")

#### step 4. apply instruct prompt template

In [None]:
from random import randint

In [None]:
# Instruct wrapper: the whole instruction is placed between [INST] tags.
mistral_instruct_template = "[INST]{instruction}[/INST]"

# System part of the prompt. {description} and {tests} are filled in per
# sample. The instruction text refers to the <context> tags that actually wrap
# the problem statement and tests (it previously named non-existent
# "problem_description" tags).
system_prompt = """You are a helpful code assistant. Ensure any code you provide can be executed with all required imports and variables defined. 

You must understand problem statement defined within context tags and generate code that will pass all the tests:
<context>
{description}
{tests}
</context>

Begin!
You must generate only code with all required imports within <answer> XML tags."""

# Fixed user request appended after the system part.
human_prompt = """Generate code in Python."""

# Expected completion: the reference solution wrapped in <answer> tags.
assistant_prompt = """<answer>
{code}
</answer>"""

# Alternative per-test layout that also numbers each test via {idx}:
# tests_item_format = """<item idx={idx}>
# Input:
# {inputs}
# Output:
# {outputs}
# </item>
# """

# Active per-test layout: plain "Input:/Output:" pairs.
tests_item_format = """Input:
{inputs}
Output:
{outputs}
"""

def format_dataset(sample):
    """Attach ``prompt`` and ``completion`` fields to one augmented sample.

    The prompt is the system prompt (problem description with ``<image>``
    replaced by ``IMAGE``, plus the sample's private tests rendered with
    ``tests_item_format``) followed by the human prompt, all wrapped in the
    instruct template. The completion is the sample's ``python_solution``
    rendered with ``assistant_prompt``.

    Returns:
        The same ``sample`` mapping with the two new keys set.
    """
    # a. construct prompt
    private_tests = sample["private_tests"]
    rendered_tests = []
    for position, (test_input, test_output) in enumerate(
        zip(private_tests["input"], private_tests["output"])
    ):
        rendered_tests.append(
            tests_item_format.format(
                idx=position,
                inputs=test_input.strip(),
                outputs=test_output.strip(),
            )
        )
    description = sample["description"].replace("<image>", "IMAGE")
    system_message = system_prompt.format(
        description=description,
        tests="\n".join(rendered_tests),
    )
    prompt = mistral_instruct_template.format(
        instruction=f"{system_message}\n\n{human_prompt}"
    )

    # b. construct completion
    completion = assistant_prompt.format(code=sample["python_solution"])

    # c. store both parts on the sample
    sample["prompt"] = prompt
    sample["completion"] = completion
    return sample

# convert dataset to instruct prompt template
columns_to_remove = list(dataset.features)
print(f"columns_to_remove: {columns_to_remove}")
dataset = dataset.map(format_dataset, remove_columns=columns_to_remove, batched=False)
print(f"len(dataset): {len(dataset)}")

In [None]:
# print random sample
# randint includes both endpoints, so the upper bound has to be len - 1;
# len(dataset) itself would be one past the last valid row index.
pprint(dataset[randint(0, len(dataset) - 1)])

### 4. Finetune LLM

#### step 1: initialize parameters

In [None]:
### model
model_id = "mistral-community/Codestral-22B-v0.1"

### qlora related
r = 64
lora_alpha = 16
lora_dropout = 0.1
# NOTE: compare with the names printed by find_all_linear_names further below
target_modules = [ "q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
task_type = "CAUSAL_LM"

### bitsandbytes related
load_in_4bit=True
bnb_4bit_use_double_quant=True
bnb_4bit_quant_type="nf4"
# name of a torch dtype attribute; resolved with getattr(torch, ...) when the quantization config is built
bnb_4bit_compute_dtype="bfloat16"


### training related
output_dir = "/home/ubuntu/finetune-llms-on-aws/practise-fsdp/sft_cache/checkpoints" # pre-existing folder path
save_model_dir = "/home/ubuntu/finetune-llms-on-aws/practise-fsdp/sft_cache/model/"  # pre-existing folder path
offload_folder = "/home/ubuntu/finetune-llms-on-aws/practise-fsdp/sft_cache/offload" # pre-existing folder path
logging_dir=f"{output_dir}/logs"

num_train_epochs = 1
max_steps = 100 # number of training steps (overrides num_train_epochs)

per_device_train_batch_size = 1
per_device_eval_batch_size = 1
gradient_accumulation_steps = 1
gradient_checkpointing = True

# mixed-precision flags: bf16 on, fp16 off
bf16 = True
fp16 = False

max_grad_norm = 0.3
weight_decay = 0.001
# optim = "paged_adamw_32bit"
optim = "adamw_torch"

learning_rate = 2e-4
warmup_ratio = 0.03
lr_scheduler_type = "constant"

save_strategy = "no"
logging_steps = 25
logging_strategy = "steps"
group_by_length = True

# sequence handling passed to the SFT trainer
max_seq_length = 4096
packing = False

#### step 2: instantiate tokenizer and quantized model 

In [None]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)

In [None]:
# define tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

In [None]:
# 4-bit quantization settings, built from the values in the parameter cell
bnb_config = BitsAndBytesConfig(
    load_in_4bit=load_in_4bit,
    bnb_4bit_use_double_quant=bnb_4bit_use_double_quant,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=getattr(torch, bnb_4bit_compute_dtype),
)

# load the base model with that quantization applied
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    # the KV cache is switched off whenever gradient checkpointing is requested
    use_cache=not gradient_checkpointing,
)

# training-time config tweaks
model.config.use_cache = False
model.config.pretraining_tp = 1 # num_of_gpus
model.gradient_checkpointing_enable(
    gradient_checkpointing_kwargs={"use_reentrant": False}
)

#### step 3: define lora config

In [None]:
import bitsandbytes as bnb
from peft import LoraConfig

In [None]:
def find_all_linear_names(model):
    """List the leaf names of all 4-bit linear layers in ``model``.

    Walks ``model.named_modules()``, keeps modules that are instances of
    ``bnb.nn.Linear4bit`` and records the last dotted component of each
    module name. ``lm_head`` is excluded from the result.

    Returns:
        A list of unique leaf module names (order not guaranteed).
    """
    leaf_names = {
        qualified_name.split(".")[-1]
        for qualified_name, layer in model.named_modules()
        if isinstance(layer, bnb.nn.Linear4bit)
    }
    leaf_names.discard("lm_head")  # needed for 16-bit
    return list(leaf_names)


# get lora target modules
modules = find_all_linear_names(model)
print(modules) # NOTE: update target_modules with these values

In [None]:
lora_config = LoraConfig(
    r=r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=target_modules,
    bias="none",
    task_type=task_type,
)

#### step 4: define training args, collator, trainer

In [None]:
from transformers import TrainingArguments
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

In [None]:
# set training arguments
# Every value comes from the parameter cell. logging_dir and
# per_device_eval_batch_size are defined there as well and are now passed
# through instead of being silently ignored.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    logging_dir=logging_dir,
    num_train_epochs=num_train_epochs,
    max_steps=max_steps,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    weight_decay=weight_decay,
    optim=optim,
    learning_rate=learning_rate,
    warmup_ratio=warmup_ratio,
    lr_scheduler_type=lr_scheduler_type,
    save_strategy=save_strategy,
    logging_steps=logging_steps,
    logging_strategy=logging_strategy,
    group_by_length=group_by_length,
)

In [None]:
# checkout for more info: Train on completions only https://huggingface.co/docs/trl/en/sft_trainer

def formatting_prompts_func(example):
    """Build one training text per row of a batched example.

    ``example`` holds parallel ``prompt`` and ``completion`` sequences. Each
    returned text is the prompt, a blank line, the `` ### Answer:`` marker
    and then the matching completion.

    Returns:
        A list of formatted strings, one per prompt.
    """
    return [
        f"{prompt}\n\n ### Answer: {example['completion'][idx]}"
        for idx, prompt in enumerate(example['prompt'])
    ]

collator = DataCollatorForCompletionOnlyLM(
    response_template="### Answer:", 
    tokenizer=tokenizer
)

In [None]:
# initialize sft trainer
trainer = SFTTrainer(
    args=training_arguments,
    model=model,
    peft_config=lora_config,
    tokenizer=tokenizer,
    train_dataset=dataset,
    formatting_func=formatting_prompts_func,
    data_collator=collator,
    max_seq_length=max_seq_length,
    packing=packing
)

#### step 5: start training and save finetuned adapter weights

In [None]:
# begin!
trainer.train()

In [None]:
# save the fine-tuned adapter weights (merged with the base model in step 6)
trainer.model.save_pretrained(output_dir, safe_serialization=False)

In [None]:
# clear memory
del model
del trainer
torch.cuda.empty_cache()

#### step 6: merge adapter weights and base model

In [None]:
from peft import AutoPeftModelForCausalLM

In [None]:
# load PEFT model in fp16
model = AutoPeftModelForCausalLM.from_pretrained(
    output_dir,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,  # ATTENTION: This allows remote code execution
)  

In [None]:
print(model)

In [None]:
# merge
merged_model = model.merge_and_unload()

In [None]:
print(merged_model)

In [None]:
# save merged model
merged_model.save_pretrained(save_model_dir, safe_serialization=True,  max_shard_size="2GB")

In [None]:
# save tokenizer for easy inference
tokenizer.save_pretrained(save_model_dir)

In [None]:
del model
del merged_model
del tokenizer

torch.cuda.empty_cache()

### 5. Test and evaluate

In [None]:
# NOTE: restart the kernel and run from this section

#### prepare test dataset

In [None]:
# uncomment the test dataset and run all the cells within section 3: Create and prepare dataset

#### inference: finetuned model

In [None]:
import gc, torch
from transformers import AutoModelForCausalLM, AutoTokenizer

torch.cuda.empty_cache()
gc.collect()

In [None]:
# Directory the fine-tuned model and tokenizer are loaded from.
# NOTE(review): the merge step saved to save_model_dir
# ("/home/ubuntu/finetune-llms-on-aws/practise-fsdp/sft_cache/model/"), which
# differs from this path; confirm the model was copied or moved here.
model_local_path = "/home/ubuntu/sft_cache/model/"
print(f"model_local_path: {model_local_path}")

In [None]:
tokenizer = AutoTokenizer.from_pretrained(
    model_local_path, trust_remote_code=True
)
tokenizer.pad_token = tokenizer.eos_token

sft_model = AutoModelForCausalLM.from_pretrained(
    model_local_path,
    trust_remote_code=True,
    attn_implementation="flash_attention_2",
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

In [None]:
eval_sample = dataset[6]
eval_prompt, eval_completion = eval_sample["prompt"], eval_sample["completion"]

print(f"prompt: {eval_prompt}")
print("\n", f"*"*25, "\n")
print(f"completion: {eval_completion}")

In [None]:
# tokenize the evaluation prompt as a batch of one and move the tensors to the GPU
model_inputs = tokenizer([eval_prompt], return_tensors="pt").to("cuda")
sft_model.eval()
with torch.no_grad():
    # sample up to 1000 new tokens
    generated_ids = sft_model.generate(
        **model_inputs, max_new_tokens=1000, do_sample=True
    )
    # decode the first (and only) sequence of the batch;
    # presumably this text still contains the prompt (verify against the output)
    results = tokenizer.batch_decode(generated_ids)[0]
    # to decode only the newly generated tokens, slice along the sequence axis (dim 1):
    # prompt_length = model_inputs['input_ids'].shape[1]
    # results = tokenizer.batch_decode(generated_ids[:, prompt_length:])[0]
    print(results)

#### inference: original model

In [None]:
del sft_model
del tokenizer

In [None]:
import gc, torch
from transformers import AutoModelForCausalLM, AutoTokenizer

torch.cuda.empty_cache()
gc.collect()

In [None]:
model_id = "mistral-community/Codestral-22B-v0.1"
print(f"model_id: {model_id}")

In [None]:
# Load the tokenizer from the same hub repository as the base model, so this
# baseline does not depend on the fine-tuned model directory being present.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

# Original (not fine-tuned) model, loaded with the same attention
# implementation, device placement and dtype as the fine-tuned one.
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    attn_implementation="flash_attention_2",
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

In [None]:
eval_sample = dataset[6]
eval_prompt, eval_completion = eval_sample["prompt"], eval_sample["completion"]

print(f"prompt: {eval_prompt}")
print("\n", f"*"*25, "\n")
print(f"completion: {eval_completion}")

In [None]:
model_inputs = tokenizer([eval_prompt], return_tensors="pt").to("cuda")
base_model.eval()
with torch.no_grad():
    generated_ids = base_model.generate(
        **model_inputs, max_new_tokens=1000, do_sample=True
    )
    results = tokenizer.batch_decode(generated_ids)[0]
    print(results)

#### inference: chat with vllm

In [None]:
# ! pip install vllm ray

In [None]:
from vllm import LLM, SamplingParams

In [None]:
model_id = "mistral-community/Codestral-22B-v0.1"
# model_id = "/home/ubuntu/sft_cache/model/"

print(f"model_id: {model_id}")

In [None]:
llm = LLM(model_id, tensor_parallel_size=4, dtype="bfloat16")

In [None]:
prompt = "How to solve high leverage AI research problems ? And give examples where AI research helped humanity make leaps of progress."
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=1024)

output = llm.generate(prompt, sampling_params)
print(output[0].outputs[0].text)