In [1]:
from pymongo import MongoClient

# CLIENT: MongoClient = MongoClient("localhost", 27017)
# DB = CLIENT.antonio
# PEOPLE = DB.people
# DOCUMENTS = DB.documents
# for person in PEOPLE.find():
#     print(person)

# import numpy as np
# from qdrant_client import QdrantClient
# import os
#
# qdrant_client = QdrantClient(
#     url="https://b7fce096-1c85-492d-b757-1724657c30f2.eu-west-2-0.aws.cloud.qdrant."
#     "io:6333",
#     api_key=os.getenv("QDRANT_API_KEY"),
# )
#
# query_embedding = np.random.rand(384).tolist()  # Simulates a query embedding
#
# search_results = qdrant_client.search(
#     collection_name="llms",
#     query_vector=query_embedding,
#     limit=3,  # Return the 3 most similar results
# )
#
# for result in search_results:
#     print(f"ID: {result.id}, Score: {result.score}, Data: {result.payload}")

In [12]:
import os
import json
import pandas as pd
from huggingface_hub import InferenceClient


def evaluate_answer(instruction: str, answer: str) -> str:
    """Ask an LLM judge to rate an answer for accuracy and style.

    Builds a judging prompt around ``instruction`` and ``answer``, sends it to
    the ``deepseek-ai/DeepSeek-R1`` model through the ``nebius`` provider and
    returns the model's reply with the leading reasoning section removed.

    Args:
        instruction: The instruction the answer was written for.
        answer: The answer to evaluate.

    Returns:
        The text that follows the first ``</think>`` tag of the reply, with
        surrounding whitespace stripped. If the reply contains no such tag the
        whole reply is returned (stripped). The prompt asks for JSON, but the
        reply is returned as raw text and is not parsed here.

    Raises:
        KeyError: If the ``HUGGINGFACE_KEY`` environment variable is not set.
    """
    prompt = f"""You are an expert judge. Please evaluate the quality of a given \
    answer to an instruction based on two criteria:
    1. Accuracy: How factually correct is the information presented in the \
    answer? You are a technical expert in this topic.
    2. Style: Is the tone and writing style appropriate for a blog post or social \
    media content? It should use simple but technical words
    and avoid formal or academic language.

    Accuracy scale:
    1 (Poor): Contains factual errors or misleading information
    2 (Good): Mostly accurate with minor errors or omissions
    3 (Excellent): Highly accurate and comprehensive
    
    Style scale:
    1 (Poor): Too formal, uses some overly complex words
    2 (Good): Good balance of technical content and accessibility, but
    still uses formal words and expressions
    3 (Excellent): Perfectly accessible language for blog/social media,
    uses simple but precise technical terms when necessary

    Example of bad style: The Llama2 7B model constitutes a noteworthy
    progression in the field of artificial intelligence, serving as the
    successor to its predecessor, the original Llama architecture.
    Example of excellent style: Llama2 7B outperforms the original Llama
    model across multiple benchmarks.

    Instruction: {instruction}

    Answer: {answer}

    Provide your evaluation in JSON format with the following structure:
    {{
        "accuracy": {{
            "analysis": "...",
            "score": 0
        }},
        "style": {{
            "analysis": "...",
            "score": 0
        }}
    }}
    """

    client = InferenceClient(
        provider="nebius",
        api_key=os.environ["HUGGINGFACE_KEY"],
    )
    completion = client.chat.completions.create(
        model="deepseek-ai/DeepSeek-R1",
        messages=[{"role": "user", "content": prompt}],
    )
    # The message content may be missing; treat that as an empty reply.
    text = completion.choices[0].message.content or ""

    # Drop everything up to and including the first closing reasoning tag.
    # ``partition`` yields an empty separator when the tag is absent, in which
    # case the full reply is kept instead of slicing from a bogus offset.
    end_tag = "</think>"
    _reasoning, found_tag, verdict = text.partition(end_tag)
    if not found_tag:
        verdict = text

    return verdict.strip()


# Load the answers to judge; the "prompt" and "answer" columns are read below.
df = pd.read_csv("data/evaluation/answers.csv")

# Run the judge once per row, pairing each prompt with its answer in order.
evaluations = list(map(evaluate_answer, df["prompt"], df["answer"]))

In [15]:
def _parse_evaluation(raw: str) -> dict:
    """Extract and decode the JSON object embedded in a judge reply.

    The reply may be wrapped in a Markdown code fence (with or without a
    ``json`` language tag) or be bare JSON, so everything outside the
    outermost pair of braces is ignored rather than assuming a fixed prefix.

    Raises:
        ValueError: If the reply contains no ``{...}`` span, or (as
            ``json.JSONDecodeError``) if that span is not valid JSON.
    """
    start = raw.find("{")
    end = raw.rfind("}")
    if start == -1 or end < start:
        raise ValueError(f"No JSON object found in judge reply: {raw!r}")
    return json.loads(raw[start : end + 1])


accuracies = []
accs_explanations = []
styles = []
styles_explanations = []

# Split every judge reply into its two scores and their explanations.
for evaluation in evaluations:
    clean_eval = _parse_evaluation(evaluation)
    accuracies.append(clean_eval["accuracy"]["score"])
    accs_explanations.append(clean_eval["accuracy"]["analysis"])
    styles.append(clean_eval["style"]["score"])
    styles_explanations.append(clean_eval["style"]["analysis"])

# Attach the results to the answers table, one value per row.
df["accuracy"] = accuracies
df["style"] = styles
df["accuracy_explanation"] = accs_explanations
df["style_explanation"] = styles_explanations

df.to_csv("data/evaluation/evaluation.csv", index=False)

In [16]:
# Display the answers table, including the judge's score and explanation columns.
df

Unnamed: 0,model,prompt,answer,accuracy,style,accuracy_explanation,style_explanation
0,deepseek,Below is an instruction that describes a task....,The cost of running fine-tuning and inference ...,3,3,The answer is highly accurate and comprehensiv...,The style is accessible and well-suited for bl...
