In [None]:
from huggingface_hub import login

import os

# Never hard-code the access token in the notebook: read it from the HF_TOKEN
# environment variable. When the variable is unset or empty, token=None lets
# huggingface_hub fall back to its interactive login prompt.
login(token=os.environ.get("HF_TOKEN") or None)

In [None]:
from datasets import load_dataset


# Preprocessing function
def preprocess_dataset(row):
    """Turn one raw record into a prompt/completion training example.

    Args:
        row: Mapping that provides the keys ``questionText``, ``answerText``
            and ``score``.

    Returns:
        A dict with two string entries: ``prompt`` (the question and the
        answer, each on its own labelled line) and ``completion`` (the score
        formatted as text).
    """
    question = row["questionText"]
    answer = row["answerText"]
    score = row["score"]

    prompt = f"Otázka: {question}\nOdpověď: {answer}"
    completion = f"{score}"
    return {"prompt": prompt, "completion": completion}


# Load the JSON dataset and turn every record into a prompt/completion pair.
# Other data files tried during experiments: "data.json" and
# "SUI_dataset_one_question.json".
raw_dataset = load_dataset("json", data_files="SUI_dataset.json")
source_columns = raw_dataset["train"].column_names
prepared_dataset = raw_dataset.map(preprocess_dataset, remove_columns=source_columns)
# Split into a training and a test part (80/20, fixed seed for reproducibility)
dataset = prepared_dataset["train"].train_test_split(test_size=0.2, seed=42)
dataset

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
import torch
from trl import SFTTrainer, SFTConfig

OUTPUT_DIR = "./finetuned-mistral-lora"


# Model ID -- exactly one candidate must be active. Previously every candidate
# was commented out, which left `model_id` undefined and made the tokenizer
# load below fail with NameError on a fresh kernel.
# NOTE(review): "BUT-FIT/csmpt7b" is activated because it matches the
# checkpoint path used by the inference cell ("results_BUT-FIT/csmpt7b/...");
# the original note on this candidate says training did not succeed, so
# confirm which model is actually intended.
# model_id = "mistralai/Mistral-7B-Instruct-v0.3"
# model_id = "BUT-FIT/CSTinyLlama-1.2B"
model_id = "BUT-FIT/csmpt7b"  # original note: training did not succeed
# model_id = "BUT-FIT/Czech-GPT-2-XL-133k"

# Load the tokenizer that belongs to the selected model
tokenizer = AutoTokenizer.from_pretrained(model_id)

# 4-bit quantization settings used when the model is loaded.
# `device_map` is not a quantization option -- it is an argument of
# `from_pretrained`, where the model-loading cell already passes it -- so it
# must not be given to BitsAndBytesConfig.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# LoRA configuration for parameter-efficient fine-tuning.
# The commented `target_modules` lines are alternatives kept from experiments
# with other architectures; the module names must exist in the selected model.
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    # target_modules=["q_proj", "v_proj"],
    target_modules=["out_proj", "up_proj"],
    # target_modules=["c_attn", "c_proj"],
    # target_modules=["Wqkv", "out_proj"],
    lora_dropout=0.1,
    # Declare the task so that get_peft_model() builds the causal-LM specific
    # PEFT wrapper instead of the generic one.
    task_type="CAUSAL_LM",
)

In [None]:
# Load the quantized base model and show its module layout (useful for
# choosing LoRA target modules).
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
)
print(base_model)

# Wrap the base model with the LoRA adapters
model = get_peft_model(base_model, lora_config)


In [None]:
# Install the extra packages with %pip so they land in the environment of the
# running kernel.
%pip install wandb
%pip install nbformat

import wandb

# Log this session in to Weights & Biases.
wandb.login()

# Set the WANDB_PROJECT environment variable to "knn".
%env WANDB_PROJECT=knn

In [None]:
import uuid

# W&B run name: the model id followed by a random hex suffix
run_name = model_id + uuid.uuid4().hex

# Training hyper-parameters
# NOTE(review): `label_names=[]` together with `trainer.can_return_loss = True`
# below looks like a workaround so that evaluation reports a loss for the
# PEFT-wrapped model -- confirm before removing either of them.
training_args = SFTConfig(
    output_dir="./results_" + model_id,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=1,
    report_to="wandb",  # enable logging to W&B
    run_name=run_name,  # name of the W&B run (optional)
    logging_steps=100,  # how often to log to W&B
    save_strategy="epoch",
    fp16=True,
    eval_strategy="epoch",
    label_names=[],
)
# Hand the training arguments and both data splits to SFTTrainer
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)

trainer.can_return_loss = True
# Train the model. The W&B run is closed in `finally` so that a failed or
# interrupted training does not leave the run open in the kernel.
try:
    trainer.train()

    # trainer.model.save_pretrained(OUTPUT_DIR)
    # tokenizer.save_pretrained(OUTPUT_DIR)
finally:
    wandb.finish()

In [None]:
# from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline

# Checkpoint written by the training cell
trained_model = "results_BUT-FIT/csmpt7b/checkpoint-500"
model = AutoModelForCausalLM.from_pretrained(
    trained_model, device_map="auto", quantization_config=bnb_config
)
tokenizer = AutoTokenizer.from_pretrained(trained_model)

# The evaluation cell generates in batches. A decoder-only model continues
# from the last position of each row, so shorter prompts have to be padded on
# the LEFT; with right padding the model would continue after pad tokens.
tokenizer.padding_side = "left"
# Use EOS as the padding token, taken from the tokenizer so that it agrees
# with the `pad_token_id` handed to the pipeline below.
tokenizer.pad_token_id = tokenizer.eos_token_id

pipe = pipeline(
    "text-generation",
    model=model,
    return_full_text=False,
    tokenizer=tokenizer,
    pad_token_id=tokenizer.eos_token_id,
    max_new_tokens=100,
)


In [None]:
# Sanity check on a single test example: show the prompt and the expected
# score, then display what the model generates for that prompt.
record = dataset["test"][0]
print(record["prompt"], record["completion"], sep="\n")
pipe(record["prompt"])

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
from transformers.pipelines.pt_utils import KeyDataset
from tqdm.auto import tqdm

preds = []
targets = []
test_split = dataset["test"]

# Explicit UTF-8: the prompts contain Czech diacritics, which the platform's
# default encoding may not be able to write.
with open("inference_results.txt", "w", encoding="utf-8") as f:
    generations = pipe(KeyDataset(test_split, "prompt"), batch_size=8)
    for i, (sample, out) in enumerate(
        tqdm(zip(test_split, generations), total=len(test_split), desc="inference")
    ):
        prompt = sample["prompt"]
        target_str = sample["completion"]
        output = out[0]["generated_text"]

        try:
            pred = float(output.strip())
            target = float(target_str.strip())
        except ValueError:
            # Generated text (or the target) is not a plain number: log it and
            # leave the sample out of the metrics.
            f.write(
                f"[{i}] Failed to parse: pred='{output.strip()}', target='{target_str.strip()}'\n\n"
            )
            continue

        preds.append(pred)
        targets.append(target)
        f.write(f"[{i}] Prompt: {prompt}\n")
        f.write(f"[{i}] Target: {target}, Prediction: {pred}\n\n")

    # Calculate metrics at the end. The metric functions reject empty input,
    # so report NaN when not a single prediction could be parsed.
    if preds:
        mae = mean_absolute_error(targets, preds)
        mse = mean_squared_error(targets, preds)
    else:
        mae = mse = float("nan")
        f.write("No prediction could be parsed as a number.\n")

    f.write("Final Results:\n")
    # The metrics cover only the parsed samples, so record how many those were.
    f.write(f"Parsed: {len(preds)}/{len(test_split)}\n")
    f.write(f"MAE: {mae:.4f}\n")
    f.write(f"MSE: {mse:.4f}\n")

print(f"MAE: {mae:.4f}")
print(f"MSE: {mse:.4f}")
