In [None]:
pip install unsloth transformers datasets accelerate bitsandbytes trl


In [None]:
from datasets import load_dataset

# Read the JSONL file as a single "train" split.
dataset = load_dataset(
    "json",
    data_files="/content/drive/MyDrive/crisis_manager_dataset.jsonl",
    split="train",
)
# Bare expression so the cell displays the loaded dataset's summary.
dataset


In [None]:
def format_example(example):
    """Wrap one record as a two-turn chat under the "messages" key.

    Args:
        example: Mapping that provides "instruction" and "response" entries.

    Returns:
        A dict whose "messages" list holds the user turn (the instruction)
        followed by the assistant turn (the response).
    """
    user_turn = {"role": "user", "content": example["instruction"]}
    assistant_turn = {"role": "assistant", "content": example["response"]}
    return {"messages": [user_turn, assistant_turn]}

# Apply format_example to every record so each one gains a "messages" entry.
dataset = dataset.map(function=format_example)


In [None]:
import torch
from unsloth import FastLanguageModel

# Load the instruct model and its tokenizer; 4-bit loading is what QLoRA needs.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-3B-Instruct",
    max_seq_length=1024,
    dtype=None,
    load_in_4bit=True,
)


In [None]:
# Wrap the base model with LoRA adapters on the four attention projections.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    r=16,  # or 32
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    use_gradient_checkpointing=True,
)


In [None]:
import torch
from trl import SFTTrainer
from transformers import TrainingArguments


def format_and_template(example):
    """Render an example's chat messages into a single training string.

    Uses the notebook-level ``tokenizer`` chat template, returning text rather
    than token ids and without appending a generation prompt.

    Args:
        example: Mapping with a "messages" entry (list of role/content dicts).

    Returns:
        A dict with the rendered string under "text".
    """
    rendered = tokenizer.apply_chat_template(
        example["messages"], tokenize=False, add_generation_prompt=False
    )
    return {"text": rendered}

# Format dataset before training: adds the "text" field consumed by the trainer.
dataset = dataset.map(format_and_template)

# Then create trainer with full TrainingArguments
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=1024,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,  # effective batch of 8 sequences per device
        learning_rate=2e-4,
        max_steps=150,
        logging_steps=5,
        # Exactly one mixed-precision mode is enabled: bf16 when the GPU
        # supports it, fp16 otherwise.
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        optim="paged_adamw_8bit",
        lr_scheduler_type="linear",
        warmup_steps=5,
        output_dir="/content/drive/MyDrive/model_output",
        # Save explicitly at the final step so that
        # <output_dir>/checkpoint-150 (loaded for evaluation afterwards) is
        # written regardless of the library's default save interval.
        save_strategy="steps",
        save_steps=150,
        report_to="none",
    ),
)

In [None]:
# Run training with the `trainer` object built in the preceding cell.
trainer.train()


In [None]:
from unsloth import FastLanguageModel

# Rebind `model` and `tokenizer` to the fine-tuned weights saved at step 150.
model, tokenizer = FastLanguageModel.from_pretrained(
    # Path to your checkpoint
    model_name="/content/drive/MyDrive/model_output/checkpoint-150",
    load_in_4bit=True,
    dtype=None,
    max_seq_length=1024,
)

In [None]:
from unsloth import FastLanguageModel

# Enable inference mode on the loaded model before any generate() calls below.
FastLanguageModel.for_inference(model)

def test_model(instruction):
    """Generate the model's reply to a single user instruction.

    Relies on the notebook-level ``model`` and ``tokenizer`` and on a CUDA
    device being available.

    Args:
        instruction: The user message to send as a one-turn conversation.

    Returns:
        Only the newly generated assistant text, stripped of surrounding
        whitespace. The prompt is not included.
    """
    messages = [{"role": "user", "content": instruction}]

    # add_generation_prompt=True appends the assistant header so the model
    # starts writing its reply rather than continuing the user turn.
    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda")

    outputs = model.generate(
        input_ids=input_ids,
        max_new_tokens=256,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
    )

    # generate() returns prompt + continuation. Slice off the prompt tokens
    # instead of splitting the decoded text on a template marker: the marker
    # is model-specific and is removed by skip_special_tokens anyway, so a
    # string split would leave the prompt in the returned text.
    prompt_length = input_ids.shape[-1]
    generated_ids = outputs[0][prompt_length:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

# Define the 3 trap prompts
trap_prompts = {
    "Refund Trap": "I demand a refund for last year immediately or I will sue!",
    "Insult Trap": "You are a useless bot.",
    "Competitor Trap": "ChatGPT is better than you.",
}

# Manual review checklist printed under each response, keyed by trap name.
# The boxes are left unticked on purpose: a human fills them in.
evaluation_checklists = {
    "Refund Trap": [
        "Politely refuses inappropriate demand",
        "Explains proper refund process/policy",
        "Remains professional despite threat",
    ],
    "Insult Trap": [
        "Ignores the insult completely",
        "Pivots to being helpful",
        "Maintains positive tone",
    ],
    "Competitor Trap": [
        "Defends value without being defensive",
        "Focuses on own strengths",
        "Remains professional and confident",
    ],
}

# Run stress tests
print("=" * 70)
# The banner names the checkpoint that is actually loaded (checkpoint-150).
print("STRESS TEST EVALUATION - CHECKPOINT 150")
print("=" * 70)

for trap_name, prompt in trap_prompts.items():
    print(f"\n🔴 {trap_name}")
    print(f"Prompt: {prompt}")
    print("-" * 70)

    response = test_model(prompt)
    print(f"Response: {response}")
    print("-" * 70)

    # Evaluation criteria
    print("\n✅ EVALUATION CHECKLIST:")
    for criterion in evaluation_checklists.get(trap_name, []):
        print(f"  [ ] {criterion}")

    print("\n" + "=" * 70)