In [18]:
import requests
import json
import csv
import os

In [19]:
# Endpoint used for every chat / evaluation request in this notebook.
url = "https://beewant.com/api/new-chat"
# Read the API token from the environment so the secret never lives in the
# notebook or its version history. Export BEEWANT_API_TOKEN before running.
# NOTE(review): the token that used to be hard-coded here should be treated as
# leaked and rotated.
token_key = os.environ.get("BEEWANT_API_TOKEN", "")
if not token_key:
    print("Warning: BEEWANT_API_TOKEN is not set; requests will be sent without a valid token.")
headers = {
    'Authorization': f'Token {token_key}',
    'Content-Type': 'application/json'
}
# Model that answers the questions.
CHAT_MODEL = 'qwen3'
# Models used as judges; the first one also scores the cited sources.
EVALUATION_MODELS = ['claude-3.7', 'GPT4.1']
# Output directory for the per-thematic evaluation JSON files.
EVALUATION_DIR = 'evaluation_web'
os.makedirs(EVALUATION_DIR, exist_ok=True)

# Evaluation WEB

In [52]:
def read_questions_from_csv(file_path: str):
    """Load the evaluation questions from a semicolon-delimited CSV file.

    The file must have a header row containing at least the columns
    ``thematic`` and ``question``; any other column is ignored.

    Args:
        file_path: Path to the CSV file.

    Returns:
        A list with one dict per data row, each holding the keys
        ``'thematic'`` and ``'question'``.

    Raises:
        KeyError: If a required column is missing from the header.
    """
    # 'utf-8-sig' drops a leading byte-order mark (otherwise the first header
    # would be read as '\ufeffthematic'); it reads BOM-less UTF-8 unchanged.
    # newline='' lets the csv module handle line endings itself, so newlines
    # embedded in quoted fields are preserved.
    with open(file_path, 'r', encoding='utf-8-sig', newline='') as file:
        reader = csv.DictReader(file, delimiter=';')
        return [
            {'thematic': row['thematic'], 'question': row['question']}
            for row in reader
        ]

def export_json(result, file_name):
    """Serialize ``result`` as 2-space-indented JSON into ``file_name``.

    The target file is created or overwritten and written as UTF-8.
    """
    with open(file_name, mode='w', encoding='utf-8') as out_file:
        payload = json.dumps(result, indent=2)
        out_file.write(payload)

def send_question(question):
    """Ask CHAT_MODEL a question with web search enabled and collect the answer.

    The response body is read as newline-separated JSON objects. The last
    ``message`` value and the last ``citations`` value seen anywhere in the
    stream are kept; lines that are not valid JSON are skipped.

    Args:
        question: The prompt text sent to the model.

    Returns:
        A ``(links, message)`` tuple. ``links`` is a list of
        ``(url, snippet, title)`` tuples built from the citations (empty when
        the stream carried none). ``message`` is the final message with
        everything up to and including the first ``</think>`` removed and the
        remainder stripped, or ``None`` when the stream carried no message.
    """
    # You can change this to "GPT4.1", "mistral", "llama3.3", "claude-3.7", "r1", "qwen3"

    data = {
        "prompt": question,
        "chat_model": CHAT_MODEL,
        "web": True,
        "stream": True
    }

    response = requests.post(url, headers=headers, json=data)

    # Initialise once, before the loop: resetting these per line would discard
    # the answer whenever the final line of the stream has no 'message' key,
    # and would leave `citations` undefined when no line carries it.
    citations = []
    last_message = None
    for line in response.text.splitlines():
        try:
            parsed = json.loads(line)
        except json.JSONDecodeError:
            continue
        if not isinstance(parsed, dict):
            continue
        if 'citations' in parsed:
            citations = parsed['citations']
        if 'message' in parsed:
            last_message = parsed['message']

    links = [(link['url'], link['snippet'], link['title']) for link in citations]

    message = last_message
    if message is not None and '</think>' in message:
        # Keep only the visible answer that follows the model's reasoning block.
        message = message.split("</think>", 1)[1].strip()
    return links, message

Response Evaluation:
1. Relevance (0.4): Does the response address the question directly and stay on topic? 
2. Correctness (0.2): Is the information factually accurate?
3. Completeness (0.15): Does it cover all the key aspects expected in the answer?
4. Clarity (0.15): Is it clearly written and easy to understand?
5. Depth (0.1): Does it show deep insight or thoughtful reasoning?

In [48]:
def evaluate_sources(question, sources):
    """Ask the first evaluation model to score the sources cited for a question.

    Args:
        question: The user question the sources were retrieved for.
        sources: The sources to score; interpolated into the prompt as-is.

    Returns:
        The last ``message`` value found in the streamed response (the prompt
        asks for a raw JSON array, so callers are expected to ``json.loads``
        it). Returns ``{}`` when the request fails or the stream carries no
        message.
    """
    evaluation_prompt = f"""Evaluate the following sources for the question: "{question}"
Sources:
{sources}

Evaluate each source based on:
- The domain name (e.g., .org, .gov, known institutions, major news outlets),
- The structure and keywords in the URL (e.g., presence of date or topic),
- Any general knowledge you have about the source.

For each link, assign a **total score out of 15** based on the following 3 criteria (each scored from 0 to 5):
1. **relevance** to the user question  
2. **credibility** of the domain  
3. **freshness** (based on year or indicators in the URL)

Add the scores for each URL and return the result as a JSON list with this structure:

[
    {{
    "url": "https://example.com/article-sahel-water-2023",
    "relevance": 2,
    "credibility": 2,
    "freshness": 1
    }},
    {{
    "url": "https://oldnews.net/blog123",
    "relevance": 3,
    "credibility": 1,
    "freshness": 1
    }},
    {{
    "url": "https://un.org/sahel-water-report",
    "relevance": 2,
    "credibility": 2,
    "freshness": 1
    }}
]
Return only the raw JSON array. Do not include any explanation, commentary, or text outside the JSON. Do not wrap it in code blocks."""

    data = {
        "prompt": evaluation_prompt,
        "chat_model": EVALUATION_MODELS[0],
        "stream": True
    }

    try:
        response = requests.post(url, headers=headers, json=data)
        response.raise_for_status()
        # Initialise once: resetting inside the loop would lose the answer
        # whenever the final stream line has no 'message' key.
        last_message = None
        for line in response.text.splitlines():
            try:
                parsed = json.loads(line)
            except json.JSONDecodeError:
                # A blank or malformed line must not discard the whole answer.
                continue
            if isinstance(parsed, dict) and 'message' in parsed:
                last_message = parsed['message']
        if last_message is None:
            print("Error evaluating sources: no message found in the response")
            return {}
        return last_message
    except Exception as e:
        print(f"Error evaluating sources: {e}")
        return {}

def evaluate_response(question, response, model):
    """Ask a judge model to score an answer on five criteria.

    Args:
        question: The question that was asked.
        response: The answer text to evaluate.
        model: Name of the chat model used as the judge.

    Returns:
        The last ``message`` value found in the streamed reply (the prompt
        asks for a raw JSON object with the keys Relevance, Correctness,
        Completeness, Clarity, Depth and Comment). Returns ``{}`` when the
        request fails or the stream carries no message.
    """
    # The closing instruction asks for exactly one output format. It used to
    # request a "JSON array" and then a "Python dictionary ... in the
    # following format:" with nothing after it, which contradicts the JSON
    # object specified just above.
    prompt = f"""You are a strict evaluator assessing the quality of a response generated by a language model on a topic-specific question. Your job is to score the response based on the following five criteria, each from 1 (very poor) to 5 (excellent):

1. Relevance: Does the response address the question directly and stay on topic?
2. Correctness: Is the information factually accurate?
3. Completeness: Does it sufficiently cover the key aspects of the question?
4. Clarity: Is the response clearly written and easy to understand?
5. Depth: Does the response provide meaningful insight or analysis beyond surface-level information?

After rating, provide a **very brief comment** summarizing the strengths or weaknesses in 1-2 sentences.
Evaluate the following question and response:

[QUESTION]
{question}

[RESPONSE]
{response}
Return the result strictly in the following JSON format (no other text):

{{
    "Relevance": <1-5>,
    "Correctness": <1-5>,
    "Completeness": <1-5>,
    "Clarity": <1-5>,
    "Depth": <1-5>,
    "Comment": "<short comment here>"
}}
Return only the raw JSON object. Do not include any explanation, commentary, or text outside the JSON. Do not wrap it in code blocks."""

    data = {
        "prompt": prompt,
        "chat_model": model,
        "stream": True
    }
    try:
        # Separate name so the HTTP response does not shadow the `response`
        # argument (the answer under evaluation).
        api_response = requests.post(url, headers=headers, json=data)
        api_response.raise_for_status()
        # Initialise once: resetting inside the loop would lose the answer
        # whenever the final stream line has no 'message' key.
        last_message = None
        for line in api_response.text.splitlines():
            try:
                parsed = json.loads(line)
            except json.JSONDecodeError:
                # A blank or malformed line must not discard the whole answer.
                continue
            if isinstance(parsed, dict) and 'message' in parsed:
                last_message = parsed['message']
        if last_message is None:
            print("Error evaluating response: no message found in the response")
            return {}
        return last_message
    except Exception as e:
        print(f"Error evaluating response: {e}")
        return {}

def eval_response_per_model(question, response):
    """Score one answer with every model listed in EVALUATION_MODELS.

    Args:
        question: The question that was asked.
        response: The answer text to evaluate.

    Returns:
        A list with one dict per judge model. Each dict holds the parsed
        scores plus a ``'model'`` key. When a judge's output is missing or is
        not a JSON object, the dict holds ``'error'`` and ``'model'`` instead,
        so one failing judge does not abort the others.
    """
    evaluation = []
    for model in EVALUATION_MODELS:
        raw = evaluate_response(question, response, model)
        # evaluate_response returns JSON text on success but {} on failure;
        # json.loads would raise TypeError on the latter.
        if isinstance(raw, str):
            try:
                result = json.loads(raw)
            except json.JSONDecodeError as exc:
                result = {'error': f"invalid JSON from evaluator: {exc}"}
        else:
            result = {'error': "no evaluation returned"}
        if not isinstance(result, dict):
            result = {'error': "evaluator did not return a JSON object"}
        result['model'] = model
        evaluation.append(result)
    return evaluation

In [49]:
# question = "What are the most visited countries in 2025 and what sustainable tourism practices are being promoted?"
# question = "What were the root causes and geopolitical consequences of the Ukraine war between 2022 and 2025?"
# response = []
# for model in EVALUATION_MODELS:
#     result = evaluate_response(question, message, model)
#     result = json.loads(result)
#     result['model'] = model
#     response.append(result)

In [50]:
# with open("result2.json", 'w', encoding='utf-8') as f:
#     json.dump(response, f, indent=2)

In [None]:
# Run the full web evaluation: for every question, get the model's answer and
# citations, have the judges score both, and write one JSON file per thematic.
questions = read_questions_from_csv("thematic_subjects.csv")
for qst in questions:
    evaluation = {}
    print(f"Question: ({qst['thematic']}) {qst['question']}")
    links, message = send_question(question=qst['question'])
    print("Getting links and message")
    eval_sources = evaluate_sources(qst['question'], links)
    # evaluate_sources returns JSON text on success but {} on failure, and the
    # judge may still emit invalid JSON; neither case should abort the
    # remaining questions.
    try:
        evaluation['sources'] = json.loads(eval_sources)
    except (TypeError, json.JSONDecodeError) as exc:
        print(f"Could not parse source evaluation: {exc}")
        evaluation['sources'] = []
    print("Evaluation sources")

    eval_response = eval_response_per_model(qst['question'], message)
    evaluation['response'] = eval_response
    print("Evaluation response")
    # NOTE(review): the file name depends only on the thematic, so two
    # questions sharing a thematic overwrite each other — confirm thematics
    # are unique in the CSV.
    export_json(evaluation,  f"{EVALUATION_DIR}/Evaluation_{qst['thematic']}.json")

# Evaluation Lib

In [34]:
def get_collection_id(collection_name):
    """Return the id of the collection called ``collection_name``.

    Fetches the collections listing and scans its ``results`` entries in
    order; the first entry whose ``name`` matches wins. Returns ``-1`` when
    no entry matches.
    """
    collections_endpoint = "https://beewant.com/api/collections"
    listing = requests.get(collections_endpoint, headers=headers).json()
    matching_ids = (
        entry['id']
        for entry in listing['results']
        if entry['name'] == collection_name
    )
    return next(matching_ids, -1)


In [74]:

import requests

def evaluate_lib(questions, collection_name):
    """Ask CHAT_MODEL a batch of questions about a document collection.

    Args:
        questions: The questions, as text interpolated into the prompt.
        collection_name: Name of the collection to query; resolved to an id
            with ``get_collection_id``.

    Returns:
        The raw response body as text. Extracting the final ``message`` from
        the stream is left to the caller.
    """
    prompt = f"""Answer the following questions strictly based on the content of the PDF. Do not use any external knowledge. For each question, return a concise and direct answer. 
If a question cannot be answered from the document, reply with 'Not found in the document.' Return the answers in a JSON list of objects, each containing 'question' and 'answer' fields.
Questions:
{questions}
"""

    data = {
        'prompt': prompt,
        'media_type': 'collection',
        # NOTE(review): get_collection_id returns -1 for an unknown name, and
        # that value is sent to the API unchanged.
        'mediaID': get_collection_id(collection_name),
        'chat_model': CHAT_MODEL,
        'stream': True
    }

    response = requests.post(url, headers=headers, json=data)
    # The raw text is what this function has always returned. The stream
    # parsing that used to sit here computed a value that was never used and
    # could raise (malformed line, or no 'message' on the last line) before
    # the return was reached, so it is gone.
    return response.text

# Numbered question list about the NVIDIA report; passed as the `questions`
# argument of evaluate_lib for the "eval_pdf_nvidia" collection.
nvidia_report_questions = """
1. What was NVIDIA's Non-GAAP Operating Income for Fiscal 2025, and how did it compare to Fiscal 2024?
2. What were the Stretch Compensation Plan goals for Fiscal 2025 Revenue and Non-GAAP Operating Income, and were they achieved?
3. What is the total number of outstanding shares of NVIDIA common stock as of April 28, 2025, and how many shares are needed for a quorum at the 2025 Annual Meeting?
4. How did the 3-Year Relative TSR performance for MY PSUs granted in Fiscal 2023 perform, and what percentage of target opportunity became eligible to vest for the CEO versus other NEOs?
5. What was the CEO's total compensation as reported in the Summary Compensation Table for Fiscal 2025, and how does this compare to the "Compensation Actually Paid" figure?
6. Which three direct customers represented more than 10% of NVIDIA's total revenue for the fiscal year ended January 26, 2025?
7. What was the breakdown of NVIDIA's Data Center revenue between Compute and Networking segments for Fiscal 2025?
8. Who are the members of NVIDIA's Audit Committee as reported in the Audit Committee Report?
9. What is the expiration date of NVIDIA's proposed Amended and Restated Certificate of Incorporation to remove all supermajority provisions, and what percentage of shares is required for approval?
10. What was the total amount of stock-based compensation expense for Fiscal 2025, and how did it compare to Fiscal 2024?
"""

# Numbered list of Python programming questions, in the same format as
# nvidia_report_questions. Not referenced elsewhere in the visible notebook —
# presumably meant for evaluate_lib with a Python-book collection (TODO confirm).
python_book_questions = """
1. Write a function that takes a list of strings and returns a single string with all items separated by commas, except for the last two items, which should be separated by "and". For example, ['apples', 'bananas', 'tofu', 'cats'] should return "apples, bananas, tofu and cats".
2. Given a nested dictionary representing a picnic guest list (where each guest is a key and the value is another dictionary of items and quantities), write a function to calculate the total number of a specific item brought to the picnic.
3. Write a program that generates 35 different quiz files, each containing 50 multiple-choice questions about US state capitals, with the order of questions and answer options randomized. Also, generate answer key files for each quiz.
4. Given a string, write a function that finds all US phone numbers in the format XXX-XXX-XXXX using regular expressions, and prints each found number.
5. Write a program that logs into an IMAP email account, searches for all emails containing the word "unsubscribe" in their HTML content, extracts the unsubscribe links, and opens them in a browser.
6. Create a program that reads an Excel spreadsheet, finds all members who have not paid their dues for a specific month, and sends them a reminder email using SMTP.
7. Write a script that uses the pyautogui module to automatically fill out and submit a web form multiple times, using data from a list of dictionaries. The script should handle tab navigation, dropdowns, radio buttons, and clicking links to submit another response.
8. Given a list of chores and a list of people, write a program that randomly assigns chores to people, ensuring that no one gets the same chore as last time (store previous assignments), and emails each person their assigned chore.
9. Write a function that takes a grid (list of lists) representing a character picture and prints it rotated 90 degrees counterclockwise.
10. Explain the difference between shallow and deep copies of lists and dictionaries in Python. Write code to demonstrate how modifying a nested list affects shallow and deep copies differently.
"""


# export_json(json.loads(last_message), "eval_lib_pdf.json")


In [86]:
# Module-level copy of the instruction text, without the questions section.
# NOTE(review): evaluate_lib builds its own local `prompt`, and no other
# visible cell reads this global — confirm whether it is still needed.
prompt = """Answer the following questions strictly based on the content of the PDF. Do not use any external knowledge. 
For each question, return a concise and direct answer. 
If a question cannot be answered from the document, reply with 'Not found in the document.'
Return the answers in a JSON list of objects, each containing 'question' and 'answer' fields."""

In [75]:
# Query the "eval_pdf_nvidia" collection with the NVIDIA questions; `result`
# is the raw response text returned by evaluate_lib.
result = evaluate_lib(nvidia_report_questions, "eval_pdf_nvidia")

In [76]:
# Show the raw response text returned by evaluate_lib.
result

'{"message": "<think>"}\n{"message": "<think>\\n\\n"}\n{"message": "<think>\\n\\n</think>"}\n{"message": "<think>\\n\\n</think>\\n\\n"}\n{"message": "<think>\\n\\n</think>\\n\\n```"}\n{"message": "<think>\\n\\n</think>\\n\\n```json"}\n{"message": "<think>\\n\\n</think>\\n\\n```json\\n"}\n{"message": "<think>\\n\\n</think>\\n\\n```json\\n[\\n"}\n{"message": "<think>\\n\\n</think>\\n\\n```json\\n[\\n "}\n{"message": "<think>\\n\\n</think>\\n\\n```json\\n[\\n  {\\n"}\n{"message": "<think>\\n\\n</think>\\n\\n```json\\n[\\n  {\\n   "}\n{"message": "<think>\\n\\n</think>\\n\\n```json\\n[\\n  {\\n    \\""}\n{"message": "<think>\\n\\n</think>\\n\\n```json\\n[\\n  {\\n    \\"question"}\n{"message": "<think>\\n\\n</think>\\n\\n```json\\n[\\n  {\\n    \\"question\\":"}\n{"message": "<think>\\n\\n</think>\\n\\n```json\\n[\\n  {\\n    \\"question\\": \\""}\n{"message": "<think>\\n\\n</think>\\n\\n```json\\n[\\n  {\\n    \\"question\\": \\"What"}\n{"message": "<think>\\n\\n</think>\\n\\n```json\\n[\

In [80]:
# Extract the final message from the raw stream in `result`.
# last_message is initialised once, before the loop: resetting it per line
# would lose the answer whenever the final line has no 'message' key.
last_message = None
for line in result.splitlines():
    try:
        parsed = json.loads(line)
    except json.JSONDecodeError:
        # Skip blank or malformed lines instead of aborting the whole parse.
        continue
    if isinstance(parsed, dict) and 'message' in parsed:
        last_message = parsed['message']

if last_message is None:
    raise ValueError("No 'message' entry found in the streamed response")

if '</think>' in last_message:
    # Keep only the visible answer that follows the model's reasoning block.
    last_message = last_message.split("</think>", 1)[1].strip()

# Strip the Markdown code fence the model wrapped around its JSON answer.
last_message = last_message.replace("```json\n", "").replace("\n```", "")

In [85]:
# last_message holds JSON *text*; decode it first so test.json contains the
# actual answer structure rather than one double-encoded string. Decoding
# happens before the file is opened so a parse error leaves no empty file.
answers = json.loads(last_message)
with open("test.json", 'w', encoding='utf-8') as f:
    json.dump(answers, f, indent=2)

# Testing

In [1]:
import pandas as pd

# Read the evaluation scores into a DataFrame (adjust the path if needed).
# Switch to sep=';' if the file is semicolon-separated.
df = pd.read_csv("evaluation.csv", sep=",")




In [15]:
# Score columns: relevance,correctness,completeness,clarity,depth
# Print the mean of each score column for the rows judged by claude-3.7.
claude_df = df.loc[df['model'] == 'claude-3.7']
for column in ('correctness', 'relevance', 'completeness', 'clarity', 'depth'):
    print(claude_df[column].mean())

4.1
4.75
4.65
4.95
4.7


In [5]:

# Score columns averaged for every judge model.
criteria = ['relevance', 'correctness', 'completeness', 'clarity', 'depth']

# One row per model holding the mean of each criterion; reset_index turns
# `model` back into a regular column.
global_avg_scores = (
    df.groupby('model')[criteria]
    .agg('mean')
    .reset_index()
)



In [6]:
global_avg_scores

Unnamed: 0_level_0,relevance,correctness,completeness,clarity,depth
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GPT4.1,5.0,4.7,4.95,5.0,4.95
claude-3.7,4.75,4.1,4.65,4.95,4.7


In [None]:
# Row-wise mean over the criteria columns gives each model a single score.
global_avg_scores['global_average'] = global_avg_scores.loc[:, criteria].mean(axis='columns')

# Show the resulting table.
print(global_avg_scores)

# Persist the table without the positional index.
global_avg_scores.to_csv("model_global_scores.csv", index=False)