### 1. Setup development environment

In [None]:
# Install PyTorch (pinned) for FSDP and flash-attention/SDPA, plus TensorBoard
%pip install --quiet "torch==2.3.0" tensorboard
 
# Install the Hugging Face libraries, each pinned to an exact version
%pip install  --upgrade --quiet \
    "transformers==4.40.0" "datasets==2.18.0" "accelerate==0.29.3" "evaluate==0.4.1" "bitsandbytes==0.43.1" "huggingface_hub==0.22.2" "trl==0.8.6" "peft==0.10.0"
  
# Install flash-attn; --no-build-isolation makes the build use the torch installed above
# NOTE(review): flash-attn is the only unpinned package here — consider pinning a release
# known to build against torch 2.3.0 so the environment is reproducible.
%pip install --quiet flash-attn --no-build-isolation

### 2. Create and prepare dataset

In [None]:
from utils import data_utils

# Dataset identifier handed to data_utils.load_and_process, and the local folder
# that the processed train/test JSON files are written to.
dataset_id = "deepmind/code_contests"
save_dataset_local_path = "/home/ubuntu/finetune-llms-on-aws/practise-fsdp/sft_cache/data"

print(f"save_dataset_local_path: {save_dataset_local_path}")

In [None]:
# Build both splits with the project helper: the first 60% of "train" for
# fine-tuning and the full "test" split for evaluation.
train_dataset = data_utils.load_and_process(dataset_id=dataset_id, split="train[:60%]")
test_dataset = data_utils.load_and_process(dataset_id=dataset_id, split="test")

# Report the split sizes as a quick sanity check.
print(f"len(train_dataset): {len(train_dataset)}, len(test_dataset): {len(test_dataset)}")

In [None]:
# Write each split to "<split>_dataset.json" inside save_dataset_local_path,
# keeping non-ASCII characters unescaped.
for split_name, split_dataset in (("train", train_dataset), ("test", test_dataset)):
    split_dataset.to_json(
        f"{save_dataset_local_path}/{split_name}_dataset.json",
        orient="records",
        force_ascii=False,
    )

print(f"dataset files saved to: {save_dataset_local_path}")

### 3. Set arguments

In [1]:
%%writefile config/codestral_fsdp_qlora.yaml

### training related
dataset_path: "/home/ubuntu/finetune-llms-on-aws/practise-fsdp/sft_cache/data" # pre-existing folder path
output_dir: "/home/ubuntu/finetune-llms-on-aws/practise-fsdp/sft_cache/checkpoints" # pre-existing folder path
sm_save_model_dir: "/home/ubuntu/finetune-llms-on-aws/practise-fsdp/sft_cache/model/"  # pre-existing folder path
logging_dir: "/home/ubuntu/finetune-llms-on-aws/practise-fsdp/sft_cache/checkpoints/logs" # pre-existing folder path

model_id: "mistral-community/Codestral-22B-v0.1"
num_train_epochs: 1
max_steps: -1 # number of training steps (overrides num_train_epochs)
per_device_train_batch_size: 1
per_device_eval_batch_size: 1
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs: 
    use_reentrant: false
bf16: true
tf32: true
max_grad_norm: 0.3
weight_decay: 0.001
optim: "adamw_torch"
learning_rate: 0.0002
# NOTE(review): warmup_ratio appears to have no effect with the plain "constant"
# scheduler below; "constant_with_warmup" is the variant that applies warmup — confirm intent.
warmup_ratio: 0.03
lr_scheduler_type: "constant"
save_strategy: "no"
logging_steps: 25
logging_strategy: "steps"
group_by_length: true
max_seq_length: 4096
packing: false
finetune_with_sm: false
merge_weights_and_save: true
save_tokenizer: true
attn_implementation: "sdpa"

### qlora related
lora_r: 64
lora_alpha: 16
lora_dropout: 0.1
task_type: "CAUSAL_LM"

### bitsandbytes related
load_in_4bit: true
bnb_4bit_use_double_quant: true
bnb_4bit_quant_type: "nf4"
bnb_4bit_compute_dtype: "bfloat16"
bnb_4bit_quant_storage: "bfloat16"

Overwriting config/codestral_fsdp_qlora.yaml


### 4. Begin training!

In [None]:
# Launch the fine-tuning script on 1 node x 4 processes.
# torchrun's own options (--nnodes, --nproc-per-node) must come BEFORE the script path:
# everything after the script path is forwarded to the script as its arguments, so
# placing them afterwards would leave torchrun at its single-process default.
! ACCELERATE_USE_FSDP=1 FSDP_CPU_RAM_EFFICIENT_LOADING=1 \
    torchrun --nnodes=1 --nproc-per-node=4 \
    scripts/sft_fsdp_qlora.py \
    --config config/codestral_fsdp_qlora.yaml

### 5. Run inference

In [None]:
import gc, torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Collect unreachable Python objects first so any GPU tensors they hold are
# released back to PyTorch's caching allocator, then return the cached blocks.
# (In the reverse order, memory freed by the collector would stay cached.)
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Local folder the tokenizer and model are loaded from in the next cell; the value
# matches sm_save_model_dir in config/codestral_fsdp_qlora.yaml.
model_local_path = "/home/ubuntu/finetune-llms-on-aws/practise-fsdp/sft_cache/model/"
print(f"model_local_path: {model_local_path}")

In [None]:
# Load the tokenizer saved next to the model and reuse EOS as the padding token.
tokenizer = AutoTokenizer.from_pretrained(
    model_local_path, trust_remote_code=True
)
tokenizer.pad_token = tokenizer.eos_token

# Load the fine-tuned model in bfloat16 with FlashAttention-2.
# device_map="auto" already places (and, if needed, shards) the weights across the
# available GPUs, so the model must not be moved again with .to("cuda"): that would
# try to pull every shard onto a single device.
sft_model = AutoModelForCausalLM.from_pretrained(
    model_local_path,
    trust_remote_code=True,
    attn_implementation="flash_attention_2",
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

In [None]:
# Take one example from the test split; messages[0] is used as the prompt and
# messages[2] as the completion to compare against.
eval_sample = test_dataset[6]
eval_messages = eval_sample["messages"]
eval_prompt = eval_messages[0]["content"]
eval_completion = eval_messages[2]["content"]

print(f"prompt: {eval_prompt}")
print("\n", "*" * 25, "\n")
print(f"completion: {eval_completion}")

In [None]:
# Seed the RNG so the sampled generation is reproducible on Restart & Run All.
torch.manual_seed(42)

# Put the inputs on the device the model reports instead of hard-coding "cuda".
model_inputs = tokenizer([eval_prompt], return_tensors="pt").to(sft_model.device)

sft_model.eval()
with torch.no_grad():
    generated_ids = sft_model.generate(
        **model_inputs,
        max_new_tokens=1000,
        do_sample=True,
        # pad_token was set to eos_token when the tokenizer was loaded; pass it explicitly.
        pad_token_id=tokenizer.pad_token_id,
    )

# The decoded text is the prompt followed by the sampled continuation.
# To keep only the continuation, slice along the sequence axis (not the batch axis):
#   generated_ids[:, model_inputs["input_ids"].shape[1]:]
results = tokenizer.batch_decode(generated_ids)[0]
print(results)