<a href="https://colab.research.google.com/github/afifaniks/triagerX/blob/main/notebook/Fine_Tune_Llama_2_with_LoRA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvidia-smi

Mon Sep 18 01:26:52 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.161.03   Driver Version: 470.161.03   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100 80G...  On   | 00000000:17:00.0 Off |                    0 |
| N/A   35C    P0    53W / 300W |      0MiB / 80994MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 torch scipy



In [3]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
    StoppingCriteria, 
    StoppingCriteriaList
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

## Configuration

In [4]:
model_name = "NousResearch/Llama-2-7b-chat-hf"
data_path = "data/deeptriage/classifier_data_0.csv"
new_model = "llama-2-7b-deeptriage"

# Set QLoRA configuration
lora_r = 64 # Attention dimension/rank
lora_alpha = 16
lora_dropout = 0.05

# Set bitsandbytes configuration
use_4bit = True #For  4-bit precision base model loading
bnb_4bit_compute_dtype = "float16" # Compute dtype for 4-bit base models
bnb_4bit_quant_type = "nf4" # Quantization type (fp4 or nf4)
use_nested_quant = False # Activate nested quantization for 4-bit base models (double quantization)


# Set training params
output_dir = "./results"
num_train_epochs = 1
fp16 = False
bf16 = False
per_device_train_batch_size = 8
per_device_eval_batch_size = 8
gradient_accumulation_steps = 1
gradient_checkpointing = True
max_grad_norm = 0.3
learning_rate = 2e-4
weight_decay = 0.001
optim = "paged_adamw_32bit"
lr_scheduler_type = "cosine"
max_steps = 250
warmup_ratio = 0.03
group_by_length = True # Group sequences into batches with same length saves memory and speeds up training considerably
save_steps = 0
logging_steps = 10

# Set SFT parameters
max_seq_length = None
packing = False # Pack multiple short examples in the same input sequence to increase efficiency
device_map = {"": 0} # Load the entire model on the GPU 0

In [5]:
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

## Load Base Model

In [6]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config, # Using it for optimized model loading
    device_map=device_map
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix overflow issue with fp16 training

## Test Base Model

In [18]:
device = f'cuda:{torch.cuda.current_device()}' if torch.cuda.is_available() else 'cpu'
stop_list = ['\nHuman:', '\n```\n']

stop_token_ids = [tokenizer(x)['input_ids'] for x in stop_list]
stop_token_ids = [torch.LongTensor(x).to(device) for x in stop_token_ids]

class StopOnTokens(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        for stop_ids in stop_token_ids:
            if torch.eq(input_ids[0][-len(stop_ids):], stop_ids).all():
                return True
        return False

stopping_criteria = StoppingCriteriaList([StopOnTokens()])

In [19]:
def inference(model, tokenizer, prompt, max_length=200):
  pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=max_length, stopping_criteria=stopping_criteria)
  result = pipe(f"<s>[INST] {prompt} [/INST]")

  return result[0]["generated_text"]

In [18]:
print(inference(model, tokenizer, "Issue Title: All default search engine settings were wiped out\nIssue Description: The DhcpProxyScriptFetcher implementation (net/proxy/dhcp_proxy_script_fetcher_win.cc and net/proxy/dhcp_proxy_script_adapter_fetcher_win.cc) currently uses base::WorkerPool, which we plan to deprecate (see issue 251774).It would also make a lot of sense to use only a limited number of threads to fetch DHCP PAC information.Therefore, this bug tracks switching this implementation to use base::SequencedWorkerPool with a limit of 10 threads, that is owned by the DhcpProxyScriptFetcher and handed to each DhcpProxyScriptAdapterFetcher.  An equivalent to the current base::WorkerPool::PostTaskAndReply should be to PostTaskAndReply on a TaskRunner retrieved using SequencedWorkerPool::GetTaskRunnerWithShutdownBehavior(CONTINUE_ON_SHUTDOWN).This should be relatively straightforward and I have a possible volunteer in mind, so I'm tagging it as a GoodFirstBug. Who can fix it?"))

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.
Input length of input_ids is 265, but `max_length` is set to 200. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


<s>[INST] Issue Title: All default search engine settings were wiped out
Issue Description: The DhcpProxyScriptFetcher implementation (net/proxy/dhcp_proxy_script_fetcher_win.cc and net/proxy/dhcp_proxy_script_adapter_fetcher_win.cc) currently uses base::WorkerPool, which we plan to deprecate (see issue 251774).It would also make a lot of sense to use only a limited number of threads to fetch DHCP PAC information.Therefore, this bug tracks switching this implementation to use base::SequencedWorkerPool with a limit of 10 threads, that is owned by the DhcpProxyScriptFetcher and handed to each DhcpProxyScriptAdapterFetcher.  An equivalent to the current base::WorkerPool::PostTaskAndReply should be to PostTaskAndReply on a TaskRunner retrieved using SequencedWorkerPool::GetTaskRunnerWithShutdownBehavior(CONTINUE_ON_SHUTDOWN).This should be relatively straightforward and I have a possible volunteer in mind, so I'm tagging it as a GoodFirstBug. Who can fix it? [/INST] 


## Setup Training Pipeline

In [8]:
dataset = load_dataset("csv", data_files=data_path, split="train")

In [9]:
def format_dataset(data):
    output_texts = []
    
    for i in range(len(data["issue_title"])):
        formatted_text = f"<s><INST>Issue Title:\n{data['issue_title'][i]}" \
        + f"\nIssue Description:\n{data['description'][i]}\nWho can fix this issue?\n</INST>The issue can be fixed by: {data['owner'][i]}</s>"
        output_texts.append(formatted_text)
        print(formatted_text)

    return output_texts

In [11]:
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)
    else:
      print(f"Using {compute_dtype}")

Your GPU supports bfloat16: accelerate training with bf16=True


In [12]:
model.config.use_cache = False
model.config.pretraining_tp = 1

peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
    task_type="CAUSAL_LM",
)

training_params = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_params,
    packing=packing,
    formatting_func=format_dataset
)

trainer.train()
trainer.model.save_pretrained(new_model)



You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10,2.1916
20,2.5802
30,2.4095
40,2.142
50,1.8149
60,1.8136
70,2.128
80,2.122
90,2.0066
100,1.7466


## Memory Cleanup to Save Fine-Tuned Model (Google Colab)

In [None]:
# del model
# del trainer
# import gc
# gc.collect()
# gc.collect()

## Merge LoRA Weights with Base Model

In [13]:
model_name, new_model

('NousResearch/Llama-2-7b-chat-hf', 'llama-2-7b-deeptriage')

In [14]:
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [20]:
issue_title = "NaCl 3D busted - command buffer initialization failure"
issue_description = "What steps will reproduce the problem?1. Start Chrome2. Attempt to run a Native Client 3D module (e.g. angrybots)3. ""Aw, snap"". Note line ""Failed to create context"" in log below.I have only tested this on MacOS. Two most recent canary builds (16.0.909, 16.0.910) fail, as well as head of tree. I have a browser build @105335 (13 Oct) and it works fine, so it appears to be a change between 105335 and whatever the rev for Canary 16.0.909 would be.dhcp-172-19-0-123:src bradchen$ /Applications/Google\ Chrome\ Canary.app/Contents/MacOS/Google\ Chrome\ Canary  --enable-nacl[34896:2307:20612385773456:ERROR:process_util_mac.mm(283)] Invalid process[34907,2954604544:04:52:38.029878] NaCl_page_alloc_randomized: 0xde7a2a0b[34907,2954604544:04:52:38.030035] NaCl_page_alloc_randomized: hint 0x5e7a0000[34907,2954604544:04:52:38.030136] NaClMakePcrelThunk: got addr 0x5e7a0000[34903,2953392128:04:52:38.099973] PluginReverseInterface::StartupInitializationComplete[34903,2953392128:04:52:38.100020] PluginReverseInterface::StartupInitializationComplete: invoking CBUnityModule.[34909:263:20627425695745:ERROR:gpu_command_buffer_stub.cc(225)] Failed to create context.[34903:263:20627425900735:ERROR:command_buffer_proxy.cc(136)] Failed to initialize command buffer service.[SRPC:NACL:34907,1056900288:11:52:38.539000] NaClSrpcRpcWait(channel=0x3efc0bb0): EOF is received instead of response. Probably, the other side (usually, nacl module or browser plugin) crashed. "
print(inference(model, tokenizer, f"Issue Title: {issue_title}\nIssue Description: {issue_description} Who can this issue", 3000))

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


<s>[INST] Issue Title: NaCl 3D busted - command buffer initialization failure
Issue Description: What steps will reproduce the problem?1. Start Chrome2. Attempt to run a Native Client 3D module (e.g. angrybots)3. Aw, snap. Note line Failed to create context in log below.I have only tested this on MacOS. Two most recent canary builds (16.0.909, 16.0.910) fail, as well as head of tree. I have a browser build @105335 (13 Oct) and it works fine, so it appears to be a change between 105335 and whatever the rev for Canary 16.0.909 would be.dhcp-172-19-0-123:src bradchen$ /Applications/Google\ Chrome\ Canary.app/Contents/MacOS/Google\ Chrome\ Canary  --enable-nacl[34896:2307:20612385773456:ERROR:process_util_mac.mm(283)] Invalid process[34907,2954604544:04:52:38.029878] NaCl_page_alloc_randomized: 0xde7a2a0b[34907,2954604544:04:52:38.030035] NaCl_page_alloc_randomized: hint 0x5e7a0000[34907,2954604544:04:52:38.030136] NaClMakePcrelThunk: got addr 0x5e7a0000[34903,2953392128:04:52:38.099973] P

In [21]:
print(inference(model, tokenizer, "Issue Title: All default search engine settings were wiped out\nIssue Description: The DhcpProxyScriptFetcher implementation (net/proxy/dhcp_proxy_script_fetcher_win.cc and net/proxy/dhcp_proxy_script_adapter_fetcher_win.cc) currently uses base::WorkerPool, which we plan to deprecate (see issue 251774).It would also make a lot of sense to use only a limited number of threads to fetch DHCP PAC information.Therefore, this bug tracks switching this implementation to use base::SequencedWorkerPool with a limit of 10 threads, that is owned by the DhcpProxyScriptFetcher and handed to each DhcpProxyScriptAdapterFetcher.  An equivalent to the current base::WorkerPool::PostTaskAndReply should be to PostTaskAndReply on a TaskRunner retrieved using SequencedWorkerPool::GetTaskRunnerWithShutdownBehavior(CONTINUE_ON_SHUTDOWN).This should be relatively straightforward and I have a possible volunteer in mind, so I'm tagging it as a GoodFirstBug.\nWho can fix this issue?", 1500))

<s>[INST] Issue Title: All default search engine settings were wiped out
Issue Description: The DhcpProxyScriptFetcher implementation (net/proxy/dhcp_proxy_script_fetcher_win.cc and net/proxy/dhcp_proxy_script_adapter_fetcher_win.cc) currently uses base::WorkerPool, which we plan to deprecate (see issue 251774).It would also make a lot of sense to use only a limited number of threads to fetch DHCP PAC information.Therefore, this bug tracks switching this implementation to use base::SequencedWorkerPool with a limit of 10 threads, that is owned by the DhcpProxyScriptFetcher and handed to each DhcpProxyScriptAdapterFetcher.  An equivalent to the current base::WorkerPool::PostTaskAndReply should be to PostTaskAndReply on a TaskRunner retrieved using SequencedWorkerPool::GetTaskRunnerWithShutdownBehavior(CONTINUE_ON_SHUTDOWN).This should be relatively straightforward and I have a possible volunteer in mind, so I'm tagging it as a GoodFirstBug.
Who can fix this issue? [/INST]  The issue can 

In [22]:
print(inference(model, tokenizer, "Issue Title: tpmc needs to produce a better error message when it fails to open the TPM device\nIssue Description: Currently the open fails quietly, then tpmc fails at sending a command and the error message is horrible (forgot to call TlclLibInit()?)\nWho can fix this issue?", 400))

<s>[INST] Issue Title: tpmc needs to produce a better error message when it fails to open the TPM device
Issue Description: Currently the open fails quietly, then tpmc fails at sending a command and the error message is horrible (forgot to call TlclLibInit()?)
Who can fix this issue? [/INST]  The issue can be fixed by: james.k.brown@chromium.org.


In [23]:
print(inference(model, tokenizer, "Issue Title: Scrolling Issue\nIssue Description: Scroll down works but scroll up did not work.\nWho can fix this issue?", 1500))

<s>[INST] Issue Title: Scrolling Issue
Issue Description: Scroll down works but scroll up did not work.
Who can fix this issue? [/INST]  The issue can be fixed by: kyotaro@chromium.org 


In [25]:
print(inference(model, tokenizer, "Issue Title: Resolution change videos are not playing\nIssue Description: Could not play video if the resolution changes during playback.\nWho can fix this issue?", 1500))

<s>[INST] Issue Title: Resolution change videos are not playing
Issue Description: Could not play video if the resolution changes during playback.
Who can fix this issue? [/INST]  The issue can be fixed by: james@chromium.org 


In [None]:
!huggingface-cli login

model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)

## Test Fine-Tuned Model

In [None]:
# try:
#     del tokenizer
#     del model
#     del base_model
# except:
#     pass

In [None]:
hf_custom_model_path = f"afifaniks/{new_model}"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)

In [None]:
model = PeftModel.from_pretrained(model, hf_custom_model_path)
model = model.merge_and_unload()

In [None]:
print(inference(model, tokenizer, "Why birds don't have wheels?"))

In [None]:
print(inference(model, tokenizer, "Can you write some Delphi code that uses named pipes?"))

In [None]:
print(inference(model, tokenizer, "Which is a species of fish? Tope or Rope"))