In [None]:
# Mount Google Drive at /content/drive (Colab only). The trained model is later
# saved to a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Install required libraries
!pip install -q accelerate -U
!pip install -q bitsandbytes -U
!pip install -q trl -U
!pip install -q peft -U
!pip install -q transformers -U
!pip install -q fsspec==2023.12.0
!pip install -q gcsfs==2023.12.0

# Load and split the dataset
from datasets import load_dataset, DatasetDict

# Draw a reproducible 10,000-example sample from the "train" split of dair-ai/emotion.
raw_dataset = (
    load_dataset("dair-ai/emotion", split="train")
    .shuffle(seed=42)
    .select(range(10000))
)

# Hold out 20% of the sample, then cut that hold-out in half: one half becomes
# the validation split, the other half the test split.
train_testvalid = raw_dataset.train_test_split(test_size=0.2, seed=42)
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=42)

dataset = DatasetDict(
    train=train_testvalid['train'],
    validation=test_valid['train'],
    test=test_valid['test'],
)

# Create prompts
from transformers import AutoTokenizer
# Loads the TinyLlama chat tokenizer under a dedicated name.
# NOTE(review): `template_tokenizer` is not referenced again in the visible part of
# this notebook (the prompt below is assembled by hand) — confirm it is still needed.
template_tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

# Integer class id -> emotion name. The id of each name is its position in the tuple.
label_map = dict(enumerate((
    "sadness",
    "joy",
    "love",
    "anger",
    "fear",
    "surprise",
)))

def format_prompt(example):
    """Turn one dataset row into a chat-formatted training text.

    Args:
        example: Mapping with a "text" entry (the sentence to classify) and an
            integer "label" entry that is a key of the module-level ``label_map``.

    Returns:
        A dict whose "text" is the full prompt (system turn, user turn and the
        assistant turn containing the gold label name) and whose "label" is the
        unchanged integer label. The original sentence is therefore replaced by
        the prompt that embeds it.
    """
    chat_text = (
        "<|system|>\n"
        "Sen bir duygu analizi uzmanısın. Verilen metindeki duyguyu belirle. Sadece şu seçeneklerden birini kullan: 'sadness', 'joy', 'love', 'anger', 'fear', 'surprise'.\n"
        "</s>\n"
        "<|user|>\n"
        f"Metin: {example['text']}\n"
        "Bu metindeki duygu nedir? Sadece duygu etiketini yaz.\n"
        "</s>\n"
        "<|assistant|>\n"
        f"{label_map[example['label']]}\n"
        "</s>"
    )
    return {"text": chat_text, "label": example["label"]}

# Rewrite every split in place so its "text" column holds the chat prompt.
# Only existing keys are reassigned, so iterating over items() here is safe.
for split, split_rows in dataset.items():
    dataset[split] = split_rows.map(format_prompt)

# Show one formatted training example as a sanity check.
print(dataset['train'][0])

# Model inference
from transformers import pipeline

# Checkpoint id shared by the pipeline below and by the QLoRA loading code later in this cell.
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# Text-generation pipeline over that checkpoint, placed on the GPU.
# NOTE(review): this pipeline is created from `model_name` before any LoRA adapter
# exists, so `pipe` generates with the original checkpoint weights until it is rebound.
pipe = pipeline("text-generation", model=model_name, device='cuda')

# QLoRA setup
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Remove bitsandbytes, install its latest release again, and make sure accelerate is present.
# NOTE(review): both packages are already installed at the top of this cell, and this
# runs after transformers has been imported — confirm the reinstall is still required.
!pip uninstall -y bitsandbytes
!pip install -U bitsandbytes
!pip install accelerate

# QLoRA training preparation
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

# Quantization settings for loading the base model: 4-bit weights of type NF4,
# double quantization enabled, float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
)

# Tokenizer for the model being fine-tuned. The EOS token doubles as the padding
# token and padding is appended on the right-hand side.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Load the same checkpoint with the quantization settings from `bnb_config`;
# device placement is left to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
    trust_remote_code=True
)

# Turn off the generation cache flag on the model config for training.
model.config.use_cache = False
# NOTE(review): pretraining_tp is forced to 1 here; its effect is defined by the
# model implementation, not by this notebook — confirm it is still needed.
model.config.pretraining_tp = 1

# LoRA settings: rank-64 adapters (alpha 32, dropout 0.1, no bias terms) attached
# to the four attention projection modules.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # for the Llama architecture
)

# Prepare the quantized model for k-bit training, then wrap it with the LoRA adapters.
model = get_peft_model(prepare_model_for_kbit_training(model), peft_config)

# Tokenization
def tokenize(example):
    """Tokenize the "text" field of a batch.

    Every sequence is truncated and padded to exactly 512 tokens using the
    module-level ``tokenizer``; the tokenizer's output is returned unchanged.
    """
    texts = example["text"]
    return tokenizer(
        texts,
        max_length=512,
        truncation=True,
        padding="max_length",
    )

# Tokenize all splits in batched mode; the existing columns are kept alongside the new ones.
tokenized_datasets = dataset.map(tokenize, batched=True)

In [None]:
# Model training
from transformers import TrainingArguments
from trl import SFTTrainer
import os

# Disable Weights & Biases *before* any trainer object is created. The reporting
# integrations are chosen while TrainingArguments / the trainer are being built,
# so setting this variable afterwards (as was done previously) comes too late.
os.environ["WANDB_DISABLED"] = "true"

output_dir = "train_dir"

args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,  # effective batch size of 8 per device
    optim="paged_adamw_32bit",
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    num_train_epochs=2,
    logging_steps=10,
    fp16=True,
    gradient_checkpointing=True,
    warmup_ratio=0.1,
    weight_decay=0.01,
    report_to="none",  # explicit: no experiment tracker, independent of the environment
)

# `model` has already been wrapped with get_peft_model(model, peft_config), so the
# LoRA config is deliberately NOT passed to the trainer a second time: handing a
# PeftModel plus a peft_config to SFTTrainer is at best ignored and, depending on
# the installed TRL version, rejected or applied on top of the existing adapter.
# NOTE(review): no evaluation strategy is configured in `args`, so `eval_dataset`
# is presumably not evaluated during training — confirm whether that is intended.
trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    args=args,
)

trainer.train()


In [None]:
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm

def extract_label_from_output(output_text, label_names=None):
    """Map a generated answer to an integer class id.

    The text is lower-cased and stripped of quote characters. If its first word
    (ignoring surrounding punctuation) is exactly a label name, that label wins.
    Otherwise the label whose name occurs *earliest* in the text is returned, so
    an answer such as "joy rather than sadness" resolves to "joy" instead of
    whichever label happens to come first in the label table.

    Args:
        output_text: Text produced by the model.
        label_names: Optional mapping of class id -> label name. Defaults to the
            module-level ``label_map``.

    Returns:
        The matching class id, or -1 when the text is empty or names no label.
    """
    if label_names is None:
        label_names = label_map

    # Drop both quote styles: the prompt lists the options in single quotes, so
    # the model may echo an answer such as 'joy'.
    text = output_text.lower().replace('"', '').replace("'", '').strip()
    words = text.split()
    if not words:
        return -1

    # Exact match on the first word, tolerating trailing punctuation ("joy.", "joy,").
    first_word = words[0].strip(".,;:!?()[]{}<>*`")
    for label_id, label_str in label_names.items():
        if first_word == label_str:
            return label_id

    # Fallback: pick the label mentioned first in the text, not first in the table.
    best_id, best_pos = -1, len(text)
    for label_id, label_str in label_names.items():
        pos = text.find(label_str)
        if 0 <= pos < best_pos:
            best_id, best_pos = label_id, pos
    return best_id

# Evaluate the fine-tuned model on the test split.
#
# `pipe` was created from the base checkpoint before training, so using it here
# would score the untouched model. A dedicated pipeline is built around the model
# held by the trainer instead.
trainer.model.eval()  # make sure dropout is switched off for generation
eval_pipe = pipeline("text-generation", model=trainer.model, tokenizer=tokenizer)

true_labels = []
pred_labels = []
total_examples = len(dataset["test"])
unparsed_count = 0  # generations in which no label could be recognised
failed_count = 0    # generations that raised an exception

for example in tqdm(dataset["test"]):
    # Keep everything up to the assistant tag so the gold answer is not shown to the model.
    prompt = example["text"].split("<|assistant|>")[0] + "<|assistant|>"

    try:
        # Greedy decoding, matching the interactive cell. No temperature (it is
        # ignored when do_sample=False) and no repetition penalties: the label
        # names appear in the prompt, so penalising repeated tokens would push
        # the model away from exactly the words it is supposed to answer with.
        response_text = eval_pipe(
            prompt,
            max_new_tokens=10,
            do_sample=False,
            num_return_sequences=1,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            return_full_text=False,
        )[0]["generated_text"].strip()
        pred = extract_label_from_output(response_text)
        if pred == -1:
            unparsed_count += 1
    except Exception as err:
        # Keep evaluating, but never hide failures: report the first one and count all.
        failed_count += 1
        if failed_count == 1:
            tqdm.write(f"İlk üretim hatası: {err!r}")
        pred = -1

    # Every example is scored. An unusable answer stays -1 and therefore counts
    # as a wrong prediction instead of being silently left out of the metrics.
    true_labels.append(example["label"])
    pred_labels.append(pred)

parsed_count = len(pred_labels) - unparsed_count - failed_count

if parsed_count:
    accuracy = accuracy_score(true_labels, pred_labels)
    # Restrict the averaged classes to the real labels; -1 only ever appears as a prediction.
    f1 = f1_score(
        true_labels,
        pred_labels,
        labels=list(label_map),
        average="weighted",
        zero_division=0,
    )
    print(f"\nDoğruluk (Accuracy): {accuracy:.4f}")
    print(f"F1 Skoru: {f1:.4f}")
    print(
        f"Toplam örnek: {total_examples} | "
        f"Ayrıştırılamayan çıktı: {unparsed_count} | "
        f"Hata veren örnek: {failed_count}"
    )
else:
    print("Geçerli tahmin yapılamadı.")



In [None]:
# Persist the trained model held by the trainer to Google Drive (the Drive mount must be active).
# NOTE(review): for a PEFT-wrapped model this presumably writes the adapter weights
# rather than the full base model — confirm against the saved directory.
trainer.model.save_pretrained("/content/drive/MyDrive/tinyllama-qlora-emotion")

In [None]:
# Interactive testing system
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, pipeline

# Reload the saved model from Drive and merge the adapter into the underlying model.
# NOTE(review): this rebinds `model`, `tokenizer` and `pipe`, replacing the objects
# created by the earlier cells; re-running those cells afterwards will see the new ones.
model = AutoPeftModelForCausalLM.from_pretrained("/content/drive/MyDrive/tinyllama-qlora-emotion", device_map="auto")
merged_model = model.merge_and_unload()

# Tokenizer of the base checkpoint, configured the same way as for training
# (EOS as padding token, right-side padding).
tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Text-generation pipeline over the merged model, used by the interactive loop.
pipe = pipeline("text-generation", model=merged_model, tokenizer=tokenizer)

# Interactive loop: read a sentence, wrap it in the same chat prompt used for
# training (without the assistant answer) and print the model's completion.
while True:
    try:
        # Surrounding whitespace is removed so that " exit " also ends the session
        # and stray spaces/newlines do not end up inside the prompt.
        user_input = input("Bir metin yazın (çıkmak için 'exit' yazın): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Interrupting the cell (or a closed stdin) ends the session without a traceback.
        print("Çıkılıyor...")
        break
    if user_input.lower() == "exit":
        print("Çıkılıyor...")
        break
    if not user_input:
        # Nothing to classify; ask again instead of querying the model with an empty text.
        continue
    prompt = f"""<|system|>
Sen bir duygu analizi uzmanısın. Verilen metindeki duyguyu belirle. Sadece şu seçeneklerden birini kullan: 'sadness', 'joy', 'love', 'anger', 'fear', 'surprise'.
</s>
<|user|>
Metin: {user_input}
Bu metindeki duygu nedir? Sadece duygu etiketini yaz.
</s>
<|assistant|>"""
    # Greedy decoding; return_full_text=False yields only the newly generated text.
    output = pipe(prompt, max_new_tokens=5, do_sample=False, return_full_text=False)[0]['generated_text']
    model_response = output.strip()
    print(f"Tahmin edilen duygu: {model_response}\n")
