In [3]:
from torchtune.models.llama3_2 import llama3_2_1b, lora_llama3_2_1b
import json
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Reference model: Llama3.2 1B built without any LoRA layers.
base_model = llama3_2_1b()

# LoRA variant of the same model builder: adapters are requested only for
# the attention query and value projections.
lora_target_modules = ["q_proj", "v_proj"]
lora_model = lora_llama3_2_1b(lora_attn_modules=lora_target_modules)

In [5]:
# Display the attention module of the first layer of the base model.
base_model.layers[0].attn

MultiHeadAttention(
  (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
  (k_proj): Linear(in_features=2048, out_features=512, bias=False)
  (v_proj): Linear(in_features=2048, out_features=512, bias=False)
  (output_proj): Linear(in_features=2048, out_features=2048, bias=False)
  (pos_embeddings): Llama3ScaledRoPE()
)

In [6]:
# Display the same attention module in the LoRA model, to compare it against
# the base model's and see which projections carry LoRA adapters.
lora_model.layers[0].attn

MultiHeadAttention(
  (q_proj): LoRALinear(
    (dropout): Identity()
    (lora_a): Linear(in_features=2048, out_features=8, bias=False)
    (lora_b): Linear(in_features=8, out_features=2048, bias=False)
  )
  (k_proj): Linear(in_features=2048, out_features=512, bias=False)
  (v_proj): LoRALinear(
    (dropout): Identity()
    (lora_a): Linear(in_features=2048, out_features=8, bias=False)
    (lora_b): Linear(in_features=8, out_features=512, bias=False)
  )
  (output_proj): Linear(in_features=2048, out_features=2048, bias=False)
  (pos_embeddings): Llama3ScaledRoPE()
)

In [17]:
# Load the raw dataset from disk.
# A context manager closes the file handle deterministically, and the explicit
# UTF-8 encoding avoids depending on the platform's default text encoding
# (the filtered copy of this data is also written as UTF-8).
with open('/home/mehmet/codebase/llm_project/data/CustomeData.json', encoding='utf-8') as f:
    dataset = json.load(f)

In [18]:
# Drop the 'instruction' key from every record that has one.
# Records are modified in place; records without the key are left untouched.
for record in dataset:
    record.pop('instruction', None)

In [19]:
# Write `dataset` to a separate JSON file, keeping non-ASCII characters
# unescaped and pretty-printing with a 4-space indent.
output_path = '/home/mehmet/codebase/llm_project/data/CustomeDataNoInstructions.json'
with open(output_path, mode='w', encoding='utf-8') as out_file:
    json.dump(dataset, out_file, indent=4, ensure_ascii=False)

In [36]:
# Copy the base model's weights into the LoRA model. strict=False is required
# because the LoRA model has adapter parameters (lora_a / lora_b) that the
# base model's state dict does not contain.
# NOTE(review): no checkpoint load for base_model is visible in this notebook,
# so these may be freshly initialised weights rather than pretrained ones — confirm.
load_result = lora_model.load_state_dict(base_model.state_dict(), strict=False)

# strict=False would also hide genuine mismatches, so check the result
# explicitly: nothing unexpected may have been supplied, and the only weights
# allowed to be missing are the LoRA adapter weights.
assert not load_result.unexpected_keys, f"unexpected keys: {load_result.unexpected_keys}"
non_lora_missing = [key for key in load_result.missing_keys if "lora" not in key]
assert not non_lora_missing, f"non-LoRA weights were not loaded: {non_lora_missing}"

# Show the missing/unexpected key report as the cell output.
load_result

_IncompatibleKeys(missing_keys=['layers.0.attn.q_proj.lora_a.weight', 'layers.0.attn.q_proj.lora_b.weight', 'layers.0.attn.v_proj.lora_a.weight', 'layers.0.attn.v_proj.lora_b.weight', 'layers.1.attn.q_proj.lora_a.weight', 'layers.1.attn.q_proj.lora_b.weight', 'layers.1.attn.v_proj.lora_a.weight', 'layers.1.attn.v_proj.lora_b.weight', 'layers.2.attn.q_proj.lora_a.weight', 'layers.2.attn.q_proj.lora_b.weight', 'layers.2.attn.v_proj.lora_a.weight', 'layers.2.attn.v_proj.lora_b.weight', 'layers.3.attn.q_proj.lora_a.weight', 'layers.3.attn.q_proj.lora_b.weight', 'layers.3.attn.v_proj.lora_a.weight', 'layers.3.attn.v_proj.lora_b.weight', 'layers.4.attn.q_proj.lora_a.weight', 'layers.4.attn.q_proj.lora_b.weight', 'layers.4.attn.v_proj.lora_a.weight', 'layers.4.attn.v_proj.lora_b.weight', 'layers.5.attn.q_proj.lora_a.weight', 'layers.5.attn.q_proj.lora_b.weight', 'layers.5.attn.v_proj.lora_a.weight', 'layers.5.attn.v_proj.lora_b.weight', 'layers.6.attn.q_proj.lora_a.weight', 'layers.6.attn.q_p

In [39]:
from torchtune.modules.peft import get_adapter_params, set_trainable_params

# Fetch all params from the model that are associated with LoRA.
lora_params = get_adapter_params(lora_model)

# Set requires_grad=True for the LoRA params and False for all others.
set_trainable_params(lora_model, lora_params)

# Report the total parameter count and how many of those are trainable.
# Generator expressions avoid building throwaway lists just to sum them.
total_params = sum(p.numel() for p in lora_model.parameters())
trainable_params = sum(p.numel() for p in lora_model.parameters() if p.requires_grad)
print(
  f"""
  {total_params} total params,
  {trainable_params} trainable params,
  {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.
  """
)


  1236666368 total params,
  851968" trainable params,
  0.07% of all params are trainable.
  
