In [None]:
# Fine-tuning LLaMA 3 8B with QLoRA on an RTX 5070 Ti
# Dataset: jingjietan/pandora-big5

# ==================== PACKAGE INSTALLATION ====================
# One-time setup: paste the two commands below into their own cell and run them once.
# They are kept inside a string so that re-running this cell never reinstalls anything.
#
# Order matters: PyTorch is installed first from the CUDA wheel index, and only then
# the Hugging Face stack. Installed the other way round, the second command would find
# a torch already pulled in as a dependency and do nothing.
# NOTE(review): the RTX 5070 Ti is a Blackwell-generation GPU; the cu121 wheels are not
# expected to ship kernels for it, hence the cu128 index - confirm against the current
# PyTorch install matrix.
"""
%pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
%pip install -q -U transformers peft accelerate bitsandbytes datasets trl scipy
"""

# ==================== IMPORTY ====================
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
import os

# ==================== CONFIGURATION ====================
# Model and dataset
MODEL_NAME = "meta-llama/Meta-Llama-3-8B"  # Hugging Face repo id of the base model (Llama 3 has no 7B variant)
DATASET_NAME = "jingjietan/pandora-big5"
OUTPUT_DIR = "./llama3-qlora-finetuned"

# LoRA parameters
LORA_R = 16  # Rank - higher = more trainable parameters
LORA_ALPHA = 32  # Scaling factor
LORA_DROPOUT = 0.05

# Training parameters
BATCH_SIZE = 4  # Can be raised to 8 if VRAM allows
GRADIENT_ACCUMULATION_STEPS = 4  # Effective batch size = 4 * 4 = 16
NUM_EPOCHS = 3
LEARNING_RATE = 2e-4
MAX_SEQ_LENGTH = 512  # Maximum sequence length (passed to the tokenizer as max_length)

# ==================== QUANTIZATION CONFIG ====================
print("üîß Konfiguracja 4-bit quantization...")
bnb_config = BitsAndBytesConfig(
    # Store the base weights in 4 bits using the NF4 (Normalized Float 4) format.
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    # Quantize the quantization constants as well to save more memory.
    bnb_4bit_use_double_quant=True,
    # Run the actual matmuls in bfloat16 for better precision.
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# ==================== LOAD MODEL AND TOKENIZER ====================
print("üì• ≈Åadowanie modelu i tokenizera...")
print("‚ö†Ô∏è UWAGA: Potrzebujesz tokenu Hugging Face z dostƒôpem do LLaMA 3")
print("   Zaloguj siƒô: huggingface-cli login")

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    use_fast=True
)
# LLaMA has no pad token, so EOS is reused for padding.
# NOTE(review): with pad == EOS, a collator that masks pad positions out of the
# loss presumably masks real EOS tokens too - confirm this is acceptable.
tokenizer.pad_token = tokenizer.eos_token  # LLaMA has no pad token
tokenizer.padding_side = "right"  # Important for causal LM

# Model loaded with the 4-bit quantization config defined above
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",  # Automatic placement on the available GPU(s)
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)

# Prepare the model for k-bit training
model = prepare_model_for_kbit_training(model)

# Enable gradient checkpointing
model.gradient_checkpointing_enable()
model.config.use_cache = False  # Disable the KV cache while gradient checkpointing is on

print(f"‚úÖ Model za≈Çadowany! VRAM u≈ºyty: ~{torch.cuda.memory_allocated() / 1024**3:.2f} GB")

# ==================== LORA CONFIG ====================
print("üéØ Konfiguracja LoRA...")
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    # Adapters go on every attention projection and every MLP projection.
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",  # attention
        "gate_proj", "up_proj", "down_proj",  # MLP
    ],
)

# Wrap the prepared base model with the LoRA adapters, then report how many
# parameters are trainable.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# ==================== LOAD DATASET ====================
print(f"üìö ≈Åadowanie datasetu: {DATASET_NAME}...")
dataset = load_dataset(DATASET_NAME)

# One line per split with its number of rows.
print(
    *(
        f"{label} samples: {len(dataset[split])}"
        for label, split in (
            ("Train", "train"),
            ("Validation", "validation"),
            ("Test", "test"),
        )
    ),
    sep="\n",
)

# Preview the first training example
print("\nüìã Przyk≈Çadowy wpis:")
print(dataset["train"][0])

# ==================== PREPROCESSING ====================
print("\nüîÑ Preprocessing datasetu...")

def preprocess_function(examples):
    """Turn a batch of pandora-big5 rows into tokenized instruction prompts.

    The dataset has the columns:
    - O, C, E, A, N (Big Five traits: Openness, Conscientiousness,
      Extraversion, Agreeableness, Neuroticism)
    - ptype (personality type)
    - text (the user's text)
    - __index_level_0__ (index)

    Each row becomes one instruction-following prompt whose response section
    lists the five trait scores and the personality type for the given text.

    Args:
        examples: A batched mapping of column name -> list of values, as
            passed by ``dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the batch (unpadded, truncated to
        ``MAX_SEQ_LENGTH``). No ``labels`` column is added on purpose: the
        ``DataCollatorForLanguageModeling(mlm=False)`` used for training
        builds labels from ``input_ids`` after padding, and a pre-made ragged
        ``labels`` column cannot be stacked into a tensor by the collator's
        padding step on batches with mixed lengths.
    """
    columns = ("text", "O", "C", "E", "A", "N", "ptype")
    texts = []

    # Walk the batch row by row; every column list has one entry per row.
    for user_text, o_score, c_score, e_score, a_score, n_score, ptype in zip(
        *(examples[name] for name in columns)
    ):
        # Prompt format: predict the Big Five traits from the text.
        # (An alternative task - generating text for a given profile - would
        # swap the roles of the scores and the text.)
        prompt = f"""### Instruction:
Analyze the following text and predict the Big Five personality traits (scored 0-100) and personality type.

### Input:
{user_text}

### Response:
Openness: {o_score}
Conscientiousness: {c_score}
Extraversion: {e_score}
Agreeableness: {a_score}
Neuroticism: {n_score}
Personality Type: {ptype}"""
        texts.append(prompt)

    # Tokenize without padding; padding happens per batch in the collator.
    # NOTE(review): truncation presumably cuts from the right, so for texts
    # longer than MAX_SEQ_LENGTH the "### Response:" section may be removed
    # entirely - consider shortening user_text instead.
    return tokenizer(
        texts,
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
        padding=False,
    )

# Collator that pads each batch dynamically; mlm=False selects the causal-LM
# objective rather than masked-LM.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Run the preprocessing over every split, dropping the raw columns so only the
# tokenizer output remains.
tokenized_dataset = dataset.map(
    preprocess_function,
    desc="Tokenizacja datasetu",
    batched=True,
    remove_columns=dataset["train"].column_names,
)

# ==================== TRAINING ARGUMENTS ====================
print("\n‚öôÔ∏è Konfiguracja treningu...")

# transformers renamed `evaluation_strategy` to `eval_strategy` and newer
# releases reject the old keyword, so use whichever name this install defines.
_EVAL_STRATEGY_KEY = (
    "eval_strategy"
    if "eval_strategy" in getattr(TrainingArguments, "__dataclass_fields__", {})
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    gradient_checkpointing=True,
    optim="paged_adamw_32bit",  # Optimizer choice for QLoRA
    learning_rate=LEARNING_RATE,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    logging_steps=10,
    save_strategy="steps",
    save_steps=100,
    eval_steps=100,
    save_total_limit=2,  # Keep only the 2 most recent checkpoints
    fp16=False,  # bfloat16 is used instead
    bf16=True,
    max_grad_norm=0.3,
    weight_decay=0.001,
    report_to="none",  # Change to "wandb" if you use Weights & Biases
    ddp_find_unused_parameters=False,
    group_by_length=True,  # Batch sequences of similar length for efficiency
    **{_EVAL_STRATEGY_KEY: "steps"},  # Evaluate every `eval_steps` steps
)

# ==================== TRAINER ====================
print("\nüöÄ Inicjalizacja Trainera...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
)

# ==================== START TRAINING ====================
print("\nüéì Rozpoczynam fine-tuning...")
print(f"Efektywny batch size: {BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS}")
# Rough step estimate: floor(train rows / effective batch size) * epochs.
print(f"Ca≈Çkowita liczba krok√≥w: ~{len(tokenized_dataset['train']) // (BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS) * NUM_EPOCHS}")

# Clear the CUDA cache before training
torch.cuda.empty_cache()

# Run the training loop
trainer.train()

print("\n‚úÖ Trening zako≈Ñczony!")

# ==================== SAVE MODEL ====================
print("\nüíæ Zapisywanie modelu...")
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print(f"Model zapisany w: {OUTPUT_DIR}")

# ==================== EVALUATION ====================
print("\nüìä Ewaluacja na zbiorze testowym...")
test_results = trainer.evaluate(tokenized_dataset["test"])
print(f"Test Loss: {test_results['eval_loss']:.4f}")
# Perplexity is reported as exp(loss).
print(f"Test Perplexity: {torch.exp(torch.tensor(test_results['eval_loss'])):.2f}")

# ==================== INFERENCE TEST ====================
print("\nüß™ Test wygenerowanego tekstu...")

def generate_text(prompt, max_new_tokens=150, temperature=0.7, top_p=0.9):
    """Generate a continuation of ``prompt`` with the fine-tuned model.

    Args:
        prompt: Text to feed to the model.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature. A value of ``None`` or ``<= 0``
            switches to greedy decoding, which gives repeatable output.
        top_p: Nucleus-sampling threshold; only used when sampling.

    Returns:
        The decoded first output sequence with special tokens stripped.
    """
    model.eval()
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Sampling parameters are only passed when sampling is actually enabled.
    use_sampling = temperature is not None and temperature > 0
    if use_sampling:
        decoding_kwargs = {"do_sample": True, "temperature": temperature, "top_p": top_p}
    else:
        decoding_kwargs = {"do_sample": False}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.eos_token_id,
            **decoding_kwargs,
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Hand-written sample prompt, using the same instruction template as the training prompts
test_prompt = """### Instruction:
Analyze the following text and predict the Big Five personality traits (scored 0-100) and personality type.

### Input:
I absolutely love exploring new ideas and trying out different creative projects. I'm always excited to meet new people and engage in deep conversations about philosophy and art. Sometimes I can be a bit disorganized, but I think that's just part of being spontaneous and going with the flow!

### Response:
"""

generated = generate_text(test_prompt)
print(f"\nPrompt:\n{test_prompt}")
print(f"\n{'='*60}")
print(f"Wygenerowana odpowied≈∫:\n{generated}")

# Test with a real example taken from the dataset
print("\n" + "="*60)
print("Test z rzeczywistym przyk≈Çadem z validation set:")
print("="*60)

# First row of the raw (untokenized) validation split, so the true labels are available
sample = dataset['validation'][0]
test_prompt_real = f"""### Instruction:
Analyze the following text and predict the Big Five personality traits (scored 0-100) and personality type.

### Input:
{sample['text']}

### Response:
"""

generated_real = generate_text(test_prompt_real)
# Show only the first 200 characters of the text, then the true values next to the prediction
print(f"\nTekst: {sample['text'][:200]}...")
print(f"\nPrawdziwe warto≈õci:")
print(f"  O: {sample['O']}, C: {sample['C']}, E: {sample['E']}, A: {sample['A']}, N: {sample['N']}")
print(f"  Type: {sample['ptype']}")
print(f"\nPredykcja modelu:\n{generated_real}")

# ==================== MEMORY INFO ====================
print("\nüìà Statystyki pamiƒôci VRAM:")
# One line per CUDA memory counter, converted from bytes to GiB.
print(
    *(
        f"{label}: {probe() / 1024**3:.2f} GB"
        for label, probe in (
            ("Allocated", torch.cuda.memory_allocated),
            ("Reserved", torch.cuda.memory_reserved),
            ("Max allocated", torch.cuda.max_memory_allocated),
        )
    ),
    sep="\n",
)

# Closing message plus a ready-to-copy snippet for reloading the saved adapter.
print("\nüéâ Gotowe! Model zosta≈Ç fine-tunowany i zapisany.")
print(f"Aby za≈Çadowaƒá model p√≥≈∫niej, u≈ºyj:")
print(f"""
from peft import AutoPeftModelForCausalLM
model = AutoPeftModelForCausalLM.from_pretrained(
    '{OUTPUT_DIR}',
    device_map='auto',
    torch_dtype=torch.bfloat16
)
""")

In [None]:
# Hugging Face authentication. The model-loading code earlier in this notebook
# asks for a token with LLaMA 3 access, so on a fresh kernel run this cell
# before that one.
!pip install huggingface_hub
!huggingface-cli login