In [None]:
%pip install bitsandbytes transformers accelerate langchain_core bert_score langfuse langdetect evaluate -q -U

: 

In [2]:
import os
from datetime import datetime

import torch
from evaluate import load
from google.colab import drive, files
from kaggle_secrets import UserSecretsClient
from langchain_core.prompts.prompt import PromptTemplate
from langdetect import detect, detect_langs
from langfuse import Langfuse
from langfuse.client import DatasetItemClient
from tqdm import tqdm
from transformers import pipeline

In [3]:
tqdm.pandas()

### Intitalization

In [None]:
user_secrets = UserSecretsClient()
huggingface_token = user_secrets.get_secret("HUGGINGFACE_TOKEN2")
!huggingface-cli login --token $huggingface_token

In [5]:
os.environ["LANGFUSE_SECRET_KEY"] = ""
os.environ["LANGFUSE_PUBLIC_KEY"] = ""
os.environ["LANGFUSE_HOST"] = "https://cloud.langfuse.com"

DATASET_NAME = "aya_eval_ukr"
langfuse_client = Langfuse()

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

In [None]:
bertscore = load("bertscore")

### Load model

In [10]:
model_id = "meta-llama/Llama-3.2-3B-Instruct"
pipe = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Device set to use cuda:0


### Auxiliary methods

#### Model output generation

In [11]:
def predict(example: DatasetItemClient):
    torch.cuda.empty_cache()
    input_str = example.input["query"]
    messages = [
        {"role": "system", "content": "Відповідай лише українською."},
        {"role": "user", "content": input_str},
    ]
    outputs = pipe(
        messages,
        max_new_tokens=756,
    )
    gen_text = outputs[0]["generated_text"][-1]['content']
    return gen_text

#### Langfuse score tracing

In [12]:
def log_score(prediction: str, expected_output: str, trace_id: str):
    bert_scores = bertscore.compute(predictions=[prediction], references=[expected_output], lang="uk", device="cpu")
    main_language = detect(prediction)
    langfuse_client.score(trace_id=trace_id, name="bert_percision", value=bert_scores["precision"][0])
    langfuse_client.score(trace_id=trace_id, name="bert_recall", value=bert_scores["recall"][0])
    langfuse_client.score(trace_id=trace_id, name="bert_f1", value=bert_scores["f1"][0])
    langfuse_client.score(trace_id=trace_id, name="is_ukrainian", value=main_language == "uk")

In [13]:
def run_evaluation(
    dataset_name: str = DATASET_NAME,
):
    run_name = "llama-3.2-3b" + dataset_name + f"-{datetime.now().strftime('%Y-%m-%dT%H:%M:%S')}"
    dataset = langfuse_client.get_dataset(dataset_name)
    for item in tqdm(dataset.items, desc="Evaluation progress"):
        with item.observe(run_name=run_name) as trace_id:
            query = item.input["query"]
            expected_output = item.expected_output["targets"]

            prediction = predict(example=item)
            languages = detect_langs(prediction)
            langfuse_client.trace(
                id=trace_id,
                input={
                    "query": query,
                    "expected_output": expected_output,
                },
                output={"prediction": prediction, "languages": languages},
            )

            log_score(prediction, expected_output, trace_id)

In [17]:
run_evaluation()

Evaluation progress:   0%|          | 0/200 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Evaluation progress:   0%|          | 1/200 [00:23<1:16:41, 23.12s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluation progress:   1%|          | 2/200 [00:24<34:32, 10.47s/it]  Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluation progress:   2%|▏         | 3/200 [00:30<27:14,  8.30s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluation progress:   2%|▏         | 4/200 [00:33<19:45,  6.05s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluation progress:   2%|▎         | 5/200 [00:59<43:35, 13.41s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluation progress:   3%|▎         | 6/200 [01:11<42:13, 13.06s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluation progress:   4%|▎         | 7/200 [01:31<48:30, 15.08s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluatio