# **Fine-Tuning LLaMA-3 for Psychology Question Answering Using LoRA and Unsloth**

In [None]:
!pip install -U unsloth transformers torch datasets accelerate
import os
os._exit(0)

Collecting unsloth
  Downloading unsloth-2025.3.3-py3-none-any.whl.metadata (59 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.2/59.2 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting transformers
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m956.7 kB/s[0m eta [36m0:00:00[0m
Collecting torch
  Downloading torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl.metadata (28 kB)
Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting accelerate
  Downloading accelerate-1.4.0-py3-none-any.whl.metadata (19 kB)
Collecting unsloth_zoo>=2025.3.1 (from unsloth)
  Downloading unsloth_zoo-2025.3.1-py3-none-any.whl.metadata (16 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsand

In [None]:
from unsloth import FastLanguageModel

# Hub id of the base checkpoint that will be fine-tuned.
MODEL_NAME = "unsloth/llama-3-8b"

# Load the base model in 4-bit together with its tokenizer.
# dtype=None means no explicit compute dtype is requested here.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=4096,
    load_in_4bit=True,
    dtype=None,
    device_map="auto",
)

# Print the module tree of the loaded model.
print(model)


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.3: Fast Llama patching. Transformers: 4.49.0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/198 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096, padding_idx=128255)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSN

In [None]:
from datasets import load_dataset, DatasetDict

# Download the psychology question/answer corpus from the Hugging Face Hub.
dataset = load_dataset("BoltMonkey/psychology-question-answer")

print(dataset)

# Fraction of the examples kept for training; the remainder becomes the
# validation split.
split_ratio = 0.9

dataset["train"] = dataset["train"].shuffle(seed=42)

# Seed the split as well so the train/validation partition is identical on
# every run of the notebook.
split = dataset["train"].train_test_split(test_size=1 - split_ratio, seed=42)

# Rebuild the DatasetDict with explicit "train" / "validation" keys.
dataset = DatasetDict({
    "train": split["train"],
    "validation": split["test"],
})

print(dataset)


README.md:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

train.json:   0%|          | 0.00/74.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/197180 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['answer', 'question'],
        num_rows: 197180
    })
})
DatasetDict({
    train: Dataset({
        features: ['answer', 'question'],
        num_rows: 177462
    })
    validation: Dataset({
        features: ['answer', 'question'],
        num_rows: 19718
    })
})


In [None]:
def format_dataset(example):
    """Render one question/answer record as a single training prompt.

    Args:
        example: Mapping with a "question" and an "answer" string.

    Returns:
        dict with one key, "text", holding the instruction-style prompt
        followed by the answer and the tokenizer's end-of-sequence token.
    """
    # Terminate every sample with EOS so the model learns where an answer
    # ends; without it generation tends to run until max_new_tokens is used up.
    # `tokenizer` is the notebook-level tokenizer loaded with the base model.
    eos_token = tokenizer.eos_token or ""
    # NOTE(review): the tokenizer may also prepend its own BOS token on top of
    # the literal "<|begin_of_text|>" below -- confirm whether that is intended.
    formatted_text = f"<|begin_of_text|>\n### Instruction:\nAnswer the following psychology question.\n\n### Input:\n{example['question']}\n\n### Response:\n{example['answer']}{eos_token}"
    return {"text": formatted_text}

# Swap the raw question/answer columns for the rendered "text" column in
# every split.
formatted_dataset = dataset.map(
    function=format_dataset,
    remove_columns=["question", "answer"],
)


Map:   0%|          | 0/177462 [00:00<?, ? examples/s]

Map:   0%|          | 0/19718 [00:00<?, ? examples/s]

In [None]:
def tokenize_function(examples):
    """Tokenize the "text" field and build causal-LM labels.

    Args:
        examples: A record (or a batch of records when mapped with
            ``batched=True``) containing a "text" field.

    Returns:
        The tokenizer output, truncated/padded to 512 tokens, with an added
        "labels" entry that mirrors "input_ids" except that padding positions
        are replaced by -100.
    """
    tokenized_inputs = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )

    def _mask_padding(ids, mask):
        # -100 is the default ignore index of PyTorch's cross-entropy loss, so
        # padded positions contribute nothing to the training objective.
        return [tok if keep else -100 for tok, keep in zip(ids, mask)]

    input_ids = tokenized_inputs["input_ids"]
    attention_mask = tokenized_inputs.get("attention_mask")
    if attention_mask is None:
        # No mask available: fall back to a plain copy of the ids.
        labels = input_ids.copy()
    elif input_ids and isinstance(input_ids[0], (list, tuple)):
        # Batched call: one id list per example.
        labels = [
            _mask_padding(ids, mask)
            for ids, mask in zip(input_ids, attention_mask)
        ]
    else:
        # Single example: a flat list of ids.
        labels = _mask_padding(input_ids, attention_mask)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


In [None]:
# Tokenize each split in batches and keep the results under the same names.
tokenized_datasets = DatasetDict(
    {
        split_name: formatted_dataset[split_name].map(tokenize_function, batched=True)
        for split_name in ("train", "validation")
    }
)

# Convenience handles for the trainer.
train_dataset, val_dataset = (
    tokenized_datasets["train"],
    tokenized_datasets["validation"],
)

print(tokenized_datasets)


Map:   0%|          | 0/177462 [00:00<?, ? examples/s]

Map:   0%|          | 0/19718 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 177462
    })
    validation: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 19718
    })
})


In [None]:
# Attach LoRA adapters to every attention and MLP projection of the 4-bit
# base model. Note that this rebinds `model` to the adapter-wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",  # attention projections
        "gate_proj", "up_proj", "down_proj",     # MLP projections
    ],
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_rslora=False,
    loftq_config=None,
    use_gradient_checkpointing=True,
    random_state=3407,
)


Unsloth 2025.3.3 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
import torch

# Token budget per training sequence handed to the trainer.
max_seq_length = 512

# Supervised fine-tuning of the LoRA-wrapped model on the formatted prompts.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=True,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,  # effective batch size: 2 x 4 = 8
        warmup_steps=5,
        # Cap the run at the 60 optimizer steps shown in the recorded training
        # log. Without an explicit cap the Trainer falls back to its default
        # number of epochs over the full ~177k-example training split.
        max_steps=60,
        learning_rate=2e-4,
        # Use bf16 where the GPU supports it, otherwise fall back to fp16.
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=1,
        optim="adamw_bnb_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
    ),
)

# Last expression of the cell, so the returned TrainOutput is displayed.
trainer.train()


Converting train dataset to ChatML (num_proc=2):   0%|          | 0/177462 [00:00<?, ? examples/s]

Applying chat template to train dataset (num_proc=2):   0%|          | 0/177462 [00:00<?, ? examples/s]

Packing train dataset (num_proc=2):   0%|          | 0/177462 [00:00<?, ? examples/s]

Converting eval dataset to ChatML (num_proc=2):   0%|          | 0/19718 [00:00<?, ? examples/s]

Applying chat template to eval dataset (num_proc=2):   0%|          | 0/19718 [00:00<?, ? examples/s]

Packing eval dataset (num_proc=2):   0%|          | 0/19718 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 177,462 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040/4,582,543,360 (0.92% trained)


Step,Training Loss
1,2.3022
2,2.4785
3,2.5904
4,2.2645
5,2.111
6,1.6475
7,1.5136
8,1.5059
9,1.2898
10,1.1356


TrainOutput(global_step=60, training_loss=1.2566463232040406, metrics={'train_runtime': 750.9154, 'train_samples_per_second': 0.639, 'train_steps_per_second': 0.08, 'total_flos': 1.112830925340672e+16, 'train_loss': 1.2566463232040406})

In [None]:
# Write the trained model and the tokenizer into the same directory so it can
# be reloaded later as one checkpoint.
trainer.model.save_pretrained(save_directory="./llama3-psychology")
tokenizer.save_pretrained(save_directory="./llama3-psychology")


NameError: name 'chat_prompt' is not defined

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
from accelerate import infer_auto_device_map

# 4-bit quantization settings used to reload the saved checkpoint.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    llm_int8_enable_fp32_cpu_offload=True,
)

fine_tuned_model = AutoModelForCausalLM.from_pretrained(
    "./llama3-psychology",
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)

fine_tuned_tokenizer = AutoTokenizer.from_pretrained("./llama3-psychology")

# device_map="auto" has already placed the quantized weights on the available
# device(s), so the model is not moved again with .to("cuda"): moving a
# bitsandbytes-quantized, accelerate-dispatched model by hand is redundant and
# is rejected by some transformers/bitsandbytes versions.
# The bare name keeps the module tree displayed as the cell output.
fine_tuned_model


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096, padding_idx=128255)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
            (lora_dropout): ModuleDict(
              (default): Identity()
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=4096, out_features=16, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=16, out_features=4096, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (k_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)
            (lora_dropout): ModuleDict(
      

In [None]:
# Ask a bare question, without the instruction template.
question = "What is the main idea behind the humanistic perspective?"
inputs = fine_tuned_tokenizer(question, return_tensors="pt").to("cuda")

# Generate at most 64 new tokens.
outputs = fine_tuned_model.generate(**inputs, max_new_tokens=64)

# Decode the first (only) sequence, dropping special tokens.
completion = fine_tuned_tokenizer.decode(outputs[0], skip_special_tokens=True)
print(completion)


What is the main idea behind the humanistic perspective? The humanistic perspective focuses on the individual's unique experiences, emotions, and motivations. It emphasizes the importance of personal growth and self-actualization, and encourages individuals to make their own choices and decisions. The humanistic perspective also emphasizes the importance of empathy, compassion, and understanding in relationships. It recognizes the potential for


In [None]:
# Same layout as the training samples, ending right after "### Response:\n"
# so the model continues with the answer.
chat_prompt = "<|begin_of_text|>\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"

inputs = fine_tuned_tokenizer(
    [
        chat_prompt.format(
            # Use the exact instruction sentence the training prompts contain;
            # a paraphrase gives the model a prompt it was not fine-tuned on.
            instruction="Answer the following psychology question.",
            input="What is the main idea behind the humanistic perspective?",
        )
    ],
    return_tensors="pt"
).to("cuda")

# Generate at most 64 new tokens.
outputs = fine_tuned_model.generate(**inputs, max_new_tokens=64)

print(fine_tuned_tokenizer.decode(outputs[0], skip_special_tokens=True))



### Instruction:
Answer the question based on psychology knowledge.

### Input:
What is the main idea behind the humanistic perspective?

### Response:
The humanistic perspective emphasizes the importance of individuals' subjective experiences and their drive to self-actualize. It focuses on personal growth, self-awareness, and the development of one's full potential. Humanistic psychologists believe that individuals have the capacity for self-determination and self-actualization. They also emphasize the role


In [None]:
from unsloth import FastLanguageModel
from transformers import AutoTokenizer

# Local directory holding the saved fine-tuned checkpoint.
model_path = "./llama3-psychology"

# Reload the checkpoint through Unsloth; this rebinds both `fine_tuned_model`
# and the notebook-level `tokenizer`.
fine_tuned_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_path,
    max_seq_length=4096,
    load_in_4bit=True,
    dtype=None,
    device_map="auto",
)

# Call Unsloth's for_inference on the reloaded model. It is the last
# expression of the cell, so the returned model is displayed.
FastLanguageModel.for_inference(fine_tuned_model)



Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.3: Fast Llama patching. Transformers: 4.49.0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.3.3 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096, padding_idx=128255)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lor

In [None]:
# Prompt template with three positional slots, in order: instruction, input,
# response. The template text begins with a newline.
chat_prompt = (
    "\n"
    "### Instruction:\n"
    "{}\n"
    "\n"
    "### Input:\n"
    "{}\n"
    "\n"
    "### Response:\n"
    "{}"
)

In [None]:
# Fill the prompt template and tokenize it as a batch of one.
inputs = tokenizer(
    [
        chat_prompt.format(
            # Instruction sentence used in every training prompt; an empty
            # instruction is a prompt shape the model was never fine-tuned on.
            "Answer the following psychology question.",
            "Who is the founder of the psychoanalytic theory?",  # input: the question
            "",  # response slot left empty for the model to complete
        )
    ],
    return_tensors="pt"
).to("cuda")


In [None]:
from unsloth import FastLanguageModel

# Call Unsloth's for_inference on the reloaded model before generating. The
# cell that reloads the checkpoint already makes the same call; as the last
# expression here it also displays the returned model.
FastLanguageModel.for_inference(fine_tuned_model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096, padding_idx=128255)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lor

In [None]:
# Generate at most 64 new tokens with the key/value cache enabled.
outputs = fine_tuned_model.generate(**inputs, use_cache=True, max_new_tokens=64)

# Decode every sequence in the batch and print the resulting list of strings.
decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print(decoded)


["\n### Instruction:\n\n\n### Input:\nWho is the founder of the psychoanalytic theory?\n\n### Response:\nSigmund Freud is the founder of the psychoanalytic theory. He developed the theory in the early 20th century and is considered one of the most influential psychologists of all time. Freud's theory focuses on the unconscious mind and its role in shaping our thoughts, feelings, and behaviors."]


In [None]:
!pip install -U huggingface_hub


Collecting huggingface_hub
  Downloading huggingface_hub-0.29.1-py3-none-any.whl.metadata (13 kB)
Downloading huggingface_hub-0.29.1-py3-none-any.whl (468 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.0/468.0 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: huggingface_hub
  Attempting uninstall: huggingface_hub
    Found existing installation: huggingface-hub 0.28.1
    Uninstalling huggingface-hub-0.28.1:
      Successfully uninstalled huggingface-hub-0.28.1
Successfully installed huggingface_hub-0.29.1


In [None]:
from huggingface_hub import login

import os

# Never hard-code an access token in the notebook. Read it from the HF_TOKEN
# environment variable; when that is unset, token=None makes `login` prompt
# for the token interactively.
login(token=os.environ.get("HF_TOKEN"))


In [None]:
from huggingface_hub import HfApi

# Hub account and repository name that will receive the fine-tuned model.
hf_username = "your_username"
repo_name = "llama3-psychology"

# Create "<username>/<repo>" on the Hub; exist_ok=True tolerates a repository
# that is already there. The returned RepoUrl is displayed as the cell output.
api = HfApi()
api.create_repo(repo_id="/".join((hf_username, repo_name)), exist_ok=True)


RepoUrl('https://huggingface.co/ayemunnn/llama3-psychology', endpoint='https://huggingface.co', repo_type='model', repo_id='ayemunnn/llama3-psychology')

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Local checkpoint to upload and the Hub repository it goes to.
checkpoint_dir = "./llama3-psychology"
hub_repo_id = f"{hf_username}/{repo_name}"

# Reload model and tokenizer from disk (this rebinds both names).
fine_tuned_model = AutoModelForCausalLM.from_pretrained(checkpoint_dir)
fine_tuned_tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)

# Upload both to the same repository; the tokenizer push is the last
# expression, so its CommitInfo is displayed.
fine_tuned_model.push_to_hub(hub_repo_id)
fine_tuned_tokenizer.push_to_hub(hub_repo_id)


`low_cpu_mem_usage` was None, now default to True since model is quantized.


adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ayemunnn/llama3-psychology/commit/a0001c1e7c016ed9c6f8dda54c7ed8870f20b47e', commit_message='Upload tokenizer', commit_description='', oid='a0001c1e7c016ed9c6f8dda54c7ed8870f20b47e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ayemunnn/llama3-psychology', endpoint='https://huggingface.co', repo_type='model', repo_id='ayemunnn/llama3-psychology'), pr_revision=None, pr_num=None)