## Finetune parameters

In [35]:
# python finetune.py \
#     --base_model '../models/llama_7b_hf' \
#     --data_path 'yahma/alpaca-cleaned' \
#     --output_dir './lora-alpaca-cleaned' \
#     --batch_size 128 \
#     --micro_batch_size 4 \
#     --num_epochs 10 \
#     --learning_rate 1e-4 \
#     --cutoff_len 512 \
#     --val_set_size 2000 \
#     --lora_r 16 \
#     --lora_alpha 16 \
#     --lora_dropout 0.05 \
#     --lora_target_modules '[q_proj,k_proj,v_proj,o_proj]' \
#     --train_on_inputs \
#     --group_by_length \
#     --wandb_project 'alpaca-lora-cleaned' \
#     --wandb_run_name 'alpaca-lora-10epoch'


In [1]:
from transformers import LlamaForCausalLM, LlamaTokenizer, GenerationConfig
import torch
import datasets
from peft import PeftModel


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Loading binary /home/ashwinram472/miniconda3/envs/llm/lib/python3.11/site-packages/bitsandbytes/libbitsandbytes_cpu.so...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


In [2]:
 # Load the base checkpoint from ../models/llama_7b_hf.
 # load_in_8bit=True requests 8-bit quantized weights; the model repr printed
 # further down shows the resulting Linear8bitLt projection layers.
 # NOTE(review): device_map='auto' presumably lets the library decide device
 # placement for each layer — confirm against the installed transformers version.
 model = LlamaForCausalLM.from_pretrained(
        '../models/llama_7b_hf',
        load_in_8bit=True,
        torch_dtype=torch.float16,
        device_map='auto',
    )

Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

In [5]:
# Echo the base model so the notebook renders its module tree (before any
# adapter is attached).
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=31999)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear8bitLt(in_features=11008, out_features=4096, bias=False)
          (up_proj): Linear8bitLt(in_features=4096, out_features=11008, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): Llama

In [6]:

# Local directory holding the trained LoRA adapter; matches the --output_dir
# used in the finetune command at the top of the notebook.
lora_weights = 'lora-alpaca-cleaned'
# Wrap the base model with the adapter. `model` is rebound to the wrapper, so
# every later cell (eval, generate, push_to_hub) operates on the PEFT model.
# NOTE(review): because the input and output share the name `model`,
# re-running this cell by itself would presumably wrap an already-wrapped
# model — reload the base model first.
model = PeftModel.from_pretrained(
    model,
    lora_weights,
    torch_dtype=torch.float16,
)

In [7]:
# Echo the wrapped model: the repr should now show lora_A / lora_B / lora_dropout
# sub-modules inside the attention projections.
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=31999)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear8bitLt(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear8bitLt(
                in_features=4096, out_features=4096, b

In [9]:
# Load the tokenizer from the same directory as the base model weights so the
# vocabulary matches the checkpoint.
tokenizer = LlamaTokenizer.from_pretrained("../models/llama_7b_hf")

In [36]:
def generate_prompt(instruction: str, input_ctxt: str = None) -> str:
    """Assemble the instruction prompt that is fed to the model.

    Args:
        instruction: Task description, placed under ``### Instruction:``.
        input_ctxt: Optional extra context, placed under ``### Input:``.
            Any falsy value (``None`` or ``""``) selects the shorter,
            instruction-only template.

    Returns:
        The prompt text. It always ends with ``### Response:`` and has no
        trailing newline.
    """
    if not input_ctxt:
        preamble = (
            "Below is an instruction that describes a task. "
            "Write a response that appropriately completes the request."
        )
        sections = [preamble, f"### Instruction:\n{instruction}"]
    else:
        preamble = (
            "Below is an instruction that describes a task, "
            "paired with an input that provides further context. "
            "Write a response that appropriately completes the request."
        )
        sections = [
            preamble,
            f"### Instruction:\n{instruction}",
            f"### Input:\n{input_ctxt}",
        ]

    # Sections are separated by exactly one blank line.
    sections.append("### Response:")
    return "\n\n".join(sections)

# Decoding settings for the demo generation below.
# NOTE(review): temperature / top_p / top_k are sampling knobs, and do_sample
# is not set here, so with num_beams=4 this presumably runs plain beam search
# and those three values have no effect — confirm against the installed
# transformers version before tuning them.
# max_new_tokens=128 caps the continuation, so a long answer (such as the
# count to 500 requested below) is cut off early.
generation_config = GenerationConfig(
    temperature=0.1,
    top_p=0.75,
    top_k=40,
    num_beams=4,
    max_new_tokens=128,
)

# Put the model in eval mode so the adapter's dropout layers are inactive.
model.eval()


instruction = "Count up from 1 to 500."

input_ctxt = None  # For some tasks, you can provide an input context to help the model generate a better response.

prompt = generate_prompt(instruction, input_ctxt)
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
input_ids = input_ids.to(model.device)

# No gradients are needed for inference.
with torch.no_grad():
    outputs = model.generate(
        input_ids=input_ids,
        generation_config=generation_config,
        return_dict_in_generate=True,
    )

# The returned sequence is the prompt followed by the continuation. Drop the
# prompt by token count instead of searching the decoded text for the
# "### Response:" marker: that search breaks if the instruction or input
# contains the marker, and raises IndexError if the marker is missing.
prompt_len = input_ids.shape[-1]
new_tokens = outputs.sequences[0][prompt_len:]
response = tokenizer.decode(new_tokens, skip_special_tokens=True)

# Cut the answer at the point where the model starts inventing a follow-up
# "### Instruction" block.
print(response.strip().split("### Instruction")[0])

1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,


## Pushing the model to the Hub

In [29]:
from peft import PeftModel, PeftConfig
# Read the adapter's saved configuration from the local LoRA directory.
# NOTE(review): `config` is not referenced by any later cell visible here.
config = PeftConfig.from_pretrained('lora-alpaca-cleaned')

In [31]:
# Echo the adapter-wrapped model once more before uploading it.
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=31999)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear8bitLt(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear8bitLt(
                in_features=4096, out_features=4096, b

In [None]:
from huggingface_hub import notebook_login
# Ask for Hugging Face credentials interactively rather than hardcoding a
# token in the notebook.
# NOTE(review): the push in the next cell presumably needs this login to have
# run in the current session — confirm.
notebook_login()

In [34]:
# Upload the PEFT model to the Hub repository. The progress output below lists
# a single adapter_model.bin of about 67 MB being uploaded.
model.push_to_hub('ashwinram472/alpaca-cleaned-lora-7b')

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.bin:   0%|          | 0.00/67.2M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ashwinram472/alpaca-cleaned-lora-7b/commit/026cc36708a2cab11ac33e158c629c1a8e114fab', commit_message='Upload model', commit_description='', oid='026cc36708a2cab11ac33e158c629c1a8e114fab', pr_url=None, pr_revision=None, pr_num=None)