In [22]:
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
from transformers import Trainer, TrainingArguments
from transformers import BitsAndBytesConfig


In [3]:
base_model_path = "D:/Llama-3.2-1B-Instruct"
tokenizer_path = base_model_path 


In [13]:
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

In [6]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)


In [8]:
model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True  
)
model = model.to("cuda")

In [9]:
dataset = load_dataset("json", data_files="dataset.jsonl")


Generating train split: 61 examples [00:00, ? examples/s]


In [10]:
print(dataset)
print(dataset['train'][0])


DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 61
    })
})
{'instruction': "What is the candidate's full name?", 'input': 'Name: Aneesh Patne', 'output': "The candidate's full name is Aneesh Patne."}


In [29]:
def preprocess_function(examples):
    prompts = []
    labels = []
    for inst, inp, out in zip(examples['instruction'], examples['input'], examples['output']):
        # Create the prompt
        prompt = f"Instruction: {inst}\nInput: {inp}\nResponse:"
        # Combine prompt and output
        combined = f"{prompt} {out}"
        prompts.append(prompt)
        labels.append(out)
    return {'prompt': prompts, 'response': labels}


In [35]:
def tokenize_function(examples):
    prompts = [
        f"Instruction: {inst}\nInput: {inp}\nResponse:"
        for inst, inp in zip(examples['instruction'], examples['input'])
    ]
    responses = examples['output']

    # Tokenize the combined prompts and responses
    encodings = tokenizer(prompts, responses, padding="max_length", truncation=True, max_length=512, return_tensors="pt")

    # Initialize labels with the input_ids
    labels = encodings.input_ids.clone()

    for i in range(len(prompts)):
        # Tokenize the prompt to find its length
        prompt_encoding = tokenizer(prompts[i], truncation=True, max_length=512, return_tensors="pt")
        prompt_length = prompt_encoding.input_ids.shape[1]

        # Mask the prompt tokens in labels by setting them to -100
        labels[i, :prompt_length] = -100

    encodings['labels'] = labels

    # Convert tensors to lists for the dataset
    encodings = {k: v.tolist() for k, v in encodings.items()}

    return encodings


In [39]:


# Load your dataset
dataset = load_dataset("json", data_files="dataset.jsonl")

# Apply the tokenization function
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["instruction", "input", "output"])

# Set the format for PyTorch
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


Map: 100%|██████████| 61/61 [00:00<00:00, 1416.08 examples/s]


In [40]:

print(train_dataset[0])


{'input_ids': tensor([128000,  17077,     25,   3639,  25845,   1587,    279,  16063,   1501,
           369,    279,  88252,     85,   2447,   5380,   2566,     25,  88252,
            85,     25,   4448,    220,   2366,     19,    482,   5936,    220,
          2366,     19,   1234,    279,   5195,  12761,  26323,  71053,    627,
          2647,     25,    578,  88252,     85,   2447,   3952,   2035,   1990,
          6186,    323,   5936,    220,   2366,     19,    439,    961,    315,
           279,   5195,  12761,  26323,     13, 128009, 128009, 128009, 128009,
        128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
        128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
        128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
        128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
        128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
        128009, 128009, 12

In [44]:
sample = tokenized_dataset['train'][0]
print(sample['input_ids'].shape)      # Should be torch.Size([512])
print(sample['attention_mask'].shape) # Should be torch.Size([512])
print(sample['labels'].shape)        # Should be torch.Size([512])


torch.Size([512])
torch.Size([512])
torch.Size([512])


In [45]:
# Split the dataset
if len(tokenized_dataset['train']) > 1:
    split_dataset = tokenized_dataset['train'].train_test_split(test_size=0.1, seed=42)
    train_dataset = split_dataset['train']
    eval_dataset = split_dataset['test']
else:
    train_dataset = tokenized_dataset['train']
    eval_dataset = tokenized_dataset['train']  # Not ideal, but acceptable for very small datasets


In [46]:
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],  # Adjust based on model architecture
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()


trainable params: 1,703,936 || all params: 1,237,518,336 || trainable%: 0.1377


In [47]:
training_args = TrainingArguments(
    output_dir="./llama_finetuned_resume",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=3e-4,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    fp16=True,  
    save_total_limit=2,
    push_to_hub=False,  
    report_to="wandb",
)




In [20]:
import wandb

wandb.login()  # You'll be prompted to enter your W&B API key

training_args.report_to = "wandb"


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\anees\_netrc


In [48]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  
    eval_dataset=eval_dataset,    
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [49]:
trainer.train()


  0%|          | 0/30 [09:49<?, ?it/s]
 13%|█▎        | 4/30 [00:08<00:51,  1.97s/it]
 13%|█▎        | 4/30 [00:09<00:51,  1.97s/it]

{'eval_loss': 3.3368794918060303, 'eval_runtime': 0.4997, 'eval_samples_per_second': 14.009, 'eval_steps_per_second': 8.005, 'epoch': 1.0}


                                              
 27%|██▋       | 8/30 [00:18<00:43,  1.99s/it]

{'eval_loss': 0.16672100126743317, 'eval_runtime': 0.4836, 'eval_samples_per_second': 14.474, 'eval_steps_per_second': 8.271, 'epoch': 2.0}


 33%|███▎      | 10/30 [00:23<00:48,  2.42s/it]

{'loss': 3.5244, 'grad_norm': 0.17256700992584229, 'learning_rate': 0.00021999999999999995, 'epoch': 2.59}


                                               
 40%|████      | 12/30 [00:27<00:35,  2.00s/it]

{'eval_loss': 0.15623600780963898, 'eval_runtime': 0.4859, 'eval_samples_per_second': 14.405, 'eval_steps_per_second': 8.231, 'epoch': 3.0}


                                               
 53%|█████▎    | 16/30 [00:37<00:27,  2.00s/it]

{'eval_loss': 0.1481381207704544, 'eval_runtime': 0.4894, 'eval_samples_per_second': 14.304, 'eval_steps_per_second': 8.174, 'epoch': 4.0}


 67%|██████▋   | 20/30 [00:45<00:19,  2.00s/it]

{'loss': 0.1466, 'grad_norm': 0.1711512953042984, 'learning_rate': 0.00011999999999999999, 'epoch': 5.0}


                                               
 67%|██████▋   | 20/30 [00:46<00:19,  2.00s/it]

{'eval_loss': 0.14263422787189484, 'eval_runtime': 0.4889, 'eval_samples_per_second': 14.319, 'eval_steps_per_second': 8.182, 'epoch': 5.0}


                                               
 80%|████████  | 24/30 [00:55<00:11,  2.00s/it]

{'eval_loss': 0.13836123049259186, 'eval_runtime': 0.4841, 'eval_samples_per_second': 14.459, 'eval_steps_per_second': 8.262, 'epoch': 6.0}


                                               
 93%|█████████▎| 28/30 [01:04<00:03,  2.00s/it]

{'eval_loss': 0.13543245196342468, 'eval_runtime': 0.4853, 'eval_samples_per_second': 14.425, 'eval_steps_per_second': 8.243, 'epoch': 7.0}


100%|██████████| 30/30 [01:10<00:00,  2.42s/it]

{'loss': 0.1324, 'grad_norm': 0.13623031973838806, 'learning_rate': 1.9999999999999998e-05, 'epoch': 7.59}


                                               
100%|██████████| 30/30 [01:10<00:00,  2.42s/it]

{'eval_loss': 0.13461095094680786, 'eval_runtime': 0.49, 'eval_samples_per_second': 14.286, 'eval_steps_per_second': 8.164, 'epoch': 7.59}


100%|██████████| 30/30 [01:10<00:00,  2.37s/it]

{'train_runtime': 71.0028, 'train_samples_per_second': 7.605, 'train_steps_per_second': 0.423, 'train_loss': 1.2677945613861084, 'epoch': 7.59}





TrainOutput(global_step=30, training_loss=1.2677945613861084, metrics={'train_runtime': 71.0028, 'train_samples_per_second': 7.605, 'train_steps_per_second': 0.423, 'total_flos': 1227843132456960.0, 'train_loss': 1.2677945613861084, 'epoch': 7.592592592592593})

In [50]:
eval_results = trainer.evaluate()
print(f"Perplexity: {eval_results['eval_loss']:.2f}")

100%|██████████| 4/4 [00:00<00:00,  6.32it/s]

Perplexity: 0.13





In [51]:
trainer.save_model("./llama_finetuned_resume")
tokenizer.save_pretrained("./llama_finetuned_resume")

('./llama_finetuned_resume\\tokenizer_config.json',
 './llama_finetuned_resume\\special_tokens_map.json',
 './llama_finetuned_resume\\tokenizer.json')