In [None]:
from google.colab import drive
import pandas as pd
import numpy as np
import os


# Mount Google Drive at /content/drive; the dataset CSVs and the model checkpoint
# used below are read from paths under this mount point.
drive.mount('/content/drive')

In [None]:
# Both evaluation CSVs sit in the same folder of the mounted Drive.
DRIVE_ROOT = '/content/drive/MyDrive'
val_path = f'{DRIVE_ROOT}/romedqa_val_dataset.csv'
test_path = f'{DRIVE_ROOT}/romedqa_test_dataset.csv'

In [None]:
# Read both splits the same way: the first CSV column becomes the DataFrame index.
val_df, test_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (val_path, test_path)
)

In [None]:
def _flatten_epicrisis(value):
    """Stringify a cell, trim surrounding whitespace, and delete every newline / carriage return."""
    return str(value).strip().replace('\n', '').replace('\r', '')


# Apply the same clean-up to the "Epicriza" column of both splits.
# NOTE(review): line breaks are deleted, not replaced by a space, so words separated
# only by a newline end up joined — confirm this matches the preprocessing used for training.
val_df["Epicriza"] = val_df["Epicriza"].apply(_flatten_epicrisis)
test_df["Epicriza"] = test_df["Epicriza"].apply(_flatten_epicrisis)

In [None]:
import tensorflow as tf

# Ask TensorFlow for the name of the GPU device it can see (if any).
device_name = tf.test.gpu_device_name()

# Only the first GPU's device string counts as "found"; anything else is reported
# as missing, without aborting the notebook.
if device_name != '/device:GPU:0':
    print("GPU not found")
else:
    print('Found GPU at: {}'.format(device_name))

In [None]:
import torch

# Choose the PyTorch device: CUDA when a GPU is visible, otherwise the CPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")

    # Report how many GPUs are visible and the name of the first one.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

In [None]:
# %pip installs into the environment of the running kernel, whereas a bare
# !pip runs in a subshell and may target a different Python.
%pip install datasets

In [None]:
from datasets import Dataset, DatasetDict


# Wrap each DataFrame as a Hugging Face Dataset and shuffle it with a fixed seed,
# collecting the two splits under the keys "validation" and "test".
datasets = DatasetDict({
    split_name: Dataset.from_pandas(frame).shuffle(seed=42)
    for split_name, frame in (("validation", val_df), ("test", test_df))
})

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForQuestionAnswering

# Fine-tuned checkpoint directory on Drive; the tokenizer and the causal-LM
# weights are both loaded from this same path.
model_name = "/content/drive/MyDrive/phi-4-finetuned-2048/checkpoint-8971"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Device placement and dtype are left to the library ("auto").
model_qa = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto",
)

In [None]:
def truncate_context(text, tokenizer, max_tokens):
    """Shorten ``text`` by round-tripping it through the tokenizer.

    The text is encoded with truncation enabled and ``max_length=max_tokens``;
    the resulting ids are decoded back to a string with special tokens skipped.

    Args:
        text: The string to shorten.
        tokenizer: Object exposing ``encode`` and ``decode``.
        max_tokens: Value passed to the tokenizer as ``max_length``.

    Returns:
        The decoded (possibly truncated) string.
    """
    kept_ids = tokenizer.encode(text, max_length=max_tokens, truncation=True)
    shortened = tokenizer.decode(kept_ids, skip_special_tokens=True)
    return shortened

In [None]:
import torch

MAX_CONTEXT_TOKENS = 2048


def preprocess_eval(example):
    """Turn one raw row into the prompt / reference pair used for evaluation.

    The "Epicriza" text is truncated to ``MAX_CONTEXT_TOKENS - 100`` tokens
    (the 100 tokens subtracted are left for the rest of the prompt) and placed
    into the fixed "Intrebare: ... Context: ... Raspuns:" template.

    Args:
        example: Mapping with the keys "Epicriza", "Intrebare" and "Raspuns".

    Returns:
        A dict with "prompt_text" (the filled-in template) and "gold_answer"
        (the row's "Raspuns" value, unchanged).
    """
    context_budget = MAX_CONTEXT_TOKENS - 100
    context = truncate_context(example["Epicriza"], tokenizer, context_budget)

    return {
        "prompt_text": f"Intrebare: {example['Intrebare']} Context: {context} Raspuns:",
        "gold_answer": example["Raspuns"],
    }

# Build the prompt/answer columns for every split and drop the raw columns.
# "__index_level_0__" is only present when the pandas index was carried over by
# Dataset.from_pandas, and map() raises if asked to remove a column that does not
# exist, so each split only drops the raw columns it actually has.
_RAW_COLUMNS = ("Epicriza", "Intrebare", "Raspuns", "__index_level_0__")

tokenized_datasets = DatasetDict({
    split_name: split.map(
        preprocess_eval,
        remove_columns=[name for name in _RAW_COLUMNS if name in split.column_names],
    )
    for split_name, split in datasets.items()
})

In [None]:
# %pip installs into the environment of the running kernel, whereas a bare
# !pip runs in a subshell and may target a different Python.
%pip install evaluate
%pip install rouge_score

In [None]:
import re
import string

import evaluate

import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

from nltk.tokenize import word_tokenize

# Load metric evaluators
# NOTE(review): of these three, only `meteor_metric` is referenced by the cells
# visible in this notebook (BLEU is computed with NLTK's sentence_bleu instead);
# `bleu_metric` and `rouge_metric` appear unused here — confirm before removing.
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")
meteor_metric = evaluate.load("meteor")

def normalize_text(text):
    """Canonicalize an answer string for comparison.

    Lower-cases the text, deletes every character in ``string.punctuation``
    (ASCII punctuation), and collapses any run of whitespace into one space,
    also trimming leading and trailing whitespace.
    """
    drop_punctuation = str.maketrans("", "", string.punctuation)
    stripped = text.lower().translate(drop_punctuation)
    return " ".join(stripped.split())

def compute_f1(prediction, ground_truth):
    """Token-level F1 between a predicted answer and the reference answer.

    Both strings are normalized with ``normalize_text`` and split on whitespace.
    The overlap is a multiset intersection, so a token that occurs several times
    in both answers is credited once per matching occurrence. (Counting the
    overlap with plain sets while dividing by the full token counts would score
    two identical answers containing a repeated word below 1.0.)

    Args:
        prediction: The generated answer text.
        ground_truth: The reference answer text.

    Returns:
        A float in [0.0, 1.0]. If either answer has no tokens after
        normalization, the result is 1.0 when both are empty and 0.0 otherwise,
        which keeps it in agreement with the exact-match score.
    """
    from collections import Counter

    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(ground_truth).split()

    # With an empty side, precision/recall are undefined: fall back to equality.
    if not pred_tokens or not truth_tokens:
        return float(pred_tokens == truth_tokens)

    # Counter & Counter keeps, per token, the minimum of the two counts.
    overlap = Counter(pred_tokens) & Counter(truth_tokens)
    num_same = sum(overlap.values())
    if num_same == 0:
        return 0.0

    precision = num_same / len(pred_tokens)
    recall = num_same / len(truth_tokens)
    return 2 * (precision * recall) / (precision + recall)

def compute_em(prediction, ground_truth):
    """Exact-match score: 1 if both answers are equal after ``normalize_text``, else 0."""
    normalized_prediction = normalize_text(prediction)
    normalized_truth = normalize_text(ground_truth)
    return 1 if normalized_prediction == normalized_truth else 0

def compute_metrics(predicted_text, ground_truth):
    """Score one generated answer against its reference answer.

    Args:
        predicted_text: The generated answer text.
        ground_truth: The reference answer text.

    Returns:
        A dict with four entries:
        ``"f1"`` from ``compute_f1``, ``"exact_match"`` from ``compute_em``,
        ``"bleu"`` (NLTK sentence-level BLEU over ``word_tokenize`` tokens with
        ``SmoothingFunction().method4``) and ``"meteors"`` (the ``"meteor"``
        value returned by ``meteor_metric.compute`` for this single pair).
    """
    smoothing = SmoothingFunction().method4

    # BLEU works on token lists: one hypothesis against a single reference.
    hypothesis_tokens = word_tokenize(predicted_text)
    reference_tokens = word_tokenize(ground_truth)
    bleu_score = sentence_bleu(
        [reference_tokens],
        hypothesis_tokens,
        smoothing_function=smoothing,
    )

    # METEOR is computed on the raw strings, as a one-element batch.
    meteor_result = meteor_metric.compute(
        predictions=[predicted_text],
        references=[ground_truth],
    )

    return {
        "f1": compute_f1(predicted_text, ground_truth),
        "exact_match": compute_em(predicted_text, ground_truth),
        "bleu": bleu_score,
        "meteors": meteor_result["meteor"],
    }

In [None]:
import nltk
nltk.download('punkt')  # for word-level tokenization

In [None]:
# Display the processed splits as a quick sanity check before running generation.
tokenized_datasets

In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

import nltk
nltk.download('punkt_tab')

# Running totals over the validation split; the reporting cell divides them
# by the number of examples.
total_ind = []
total_labels = []
f1 = exact = bleu = meteor = 0

val_split = tokenized_datasets['validation']

for index in tqdm(range(len(val_split))):
    row = val_split[index]  # fetch the row once, then read both fields from it
    gold_answer = row['gold_answer']

    inputs = tokenizer(row['prompt_text'], return_tensors="pt").to(model_qa.device)
    prompt_len = inputs["input_ids"].shape[1]

    outputs = model_qa.generate(**inputs, max_new_tokens=10)

    # Skip the first prompt_len ids so only the newly generated tokens are decoded.
    generated_ids = outputs[0][prompt_len:]
    generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)

    results = compute_metrics(generated_text, gold_answer)
    f1 += results["f1"]
    exact += results["exact_match"]
    bleu += results["bleu"]
    meteor += results["meteors"]

In [None]:
# Report the mean of each accumulated metric over the validation split.
num_val_examples = len(tokenized_datasets['validation'])

print("📊 Evaluare pe setul de validare:")
print(f"🔹 F1 Score       : {f1 / num_val_examples:.4f}")
print(f"🔹 Exact Match    : {exact / num_val_examples:.4f}")
print(f"🔹 BLEU Score     : {bleu / num_val_examples:.4f}")
print(f"🔹 Meteor Score     : {meteor / num_val_examples:.4f}")

In [None]:
from tqdm.auto import tqdm

# Running totals over the test split; the reporting cell divides them by the
# number of examples. These reuse (and reset) the names used for validation.
total_ind = []
total_labels = []
f1 = exact = bleu = meteor = 0

test_split = tokenized_datasets['test']

for index in tqdm(range(len(test_split))):
    row = test_split[index]  # fetch the row once, then read both fields from it
    gold_answer = row['gold_answer']

    inputs = tokenizer(row['prompt_text'], return_tensors="pt").to(model_qa.device)
    prompt_len = inputs["input_ids"].shape[1]

    outputs = model_qa.generate(**inputs, max_new_tokens=10)

    # Skip the first prompt_len ids so only the newly generated tokens are decoded.
    generated_ids = outputs[0][prompt_len:]
    generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)

    results = compute_metrics(generated_text, gold_answer)
    f1 += results["f1"]
    exact += results["exact_match"]
    bleu += results["bleu"]
    meteor += results["meteors"]

In [None]:
# Report the mean of each accumulated metric over the test split.
num_test_examples = len(tokenized_datasets['test'])

print("📊 Evaluare pe setul de test:")
print(f"🔹 F1 Score       : {f1 / num_test_examples:.4f}")
print(f"🔹 Exact Match    : {exact / num_test_examples:.4f}")
print(f"🔹 BLEU Score     : {bleu / num_test_examples:.4f}")
print(f"🔹 Meteor Score     : {meteor / num_test_examples:.4f}")