In [2]:
import requests
import json
import csv
import os

In [3]:
# Chat endpoint that every request in this notebook posts to.
url = "https://beewant.com/api/new-chat"

# The API token is a secret and must not live in the notebook: read it from the
# BEEWANT_API_TOKEN environment variable instead.
# NOTE(review): the token that used to be hardcoded here has been exposed in
# the notebook's history and should be revoked/rotated.
token_key = os.environ.get("BEEWANT_API_TOKEN", "")
if not token_key:
    print("Warning: BEEWANT_API_TOKEN is not set; requests will be sent without a valid token.")

headers = {
    'Authorization': f'Token {token_key}',
    'Content-Type': 'application/json'
}

# Model that answers the questions, and the models that grade its answers.
CHAT_MODEL = 'qwen3'
EVALUATION_MODELS = ['claude-3.7', 'GPT4.1']

# Directory receiving one JSON report per evaluated question; created up front
# so the export step never fails on a missing folder.
EVALUATION_DIR = 'evaluation_web'
os.makedirs(EVALUATION_DIR, exist_ok=True)

# Evaluation WEB

In [5]:
def read_questions_from_csv(file_path: str):
    """Load the evaluation questions from a semicolon-delimited CSV file.

    The file must start with a header row that contains at least the
    ``thematic`` and ``question`` columns; any other column is ignored.

    Args:
        file_path: Path to the CSV file.

    Returns:
        A list of ``{'thematic': ..., 'question': ...}`` dicts, one per data
        row, in file order.

    Raises:
        KeyError: If a row lacks the ``thematic`` or ``question`` column.
    """
    # 'utf-8-sig' drops a leading byte-order mark when present (otherwise the
    # first header would be read as '\ufeffthematic') and behaves like plain
    # UTF-8 when there is none. newline='' lets the csv module do its own
    # newline handling, as its documentation requires.
    with open(file_path, 'r', encoding='utf-8-sig', newline='') as file:
        reader = csv.DictReader(file, delimiter=';')
        return [
            {'thematic': row['thematic'], 'question': row['question']}
            for row in reader
        ]

def export_json(result, file_name):
    """Write ``result`` to ``file_name`` as UTF-8 JSON indented by two spaces.

    Any existing file at ``file_name`` is overwritten.
    """
    with open(file_name, mode='w', encoding='utf-8') as out:
        serialized = json.dumps(result, indent=2)
        out.write(serialized)

def send_question(question):
    """Ask ``CHAT_MODEL`` a question with web search enabled.

    The reply body is read as newline-delimited JSON. Lines that are blank or
    not valid JSON are skipped; the last ``citations`` payload and the last
    ``message`` payload seen anywhere in the body are kept.

    Args:
        question: The prompt text to send.

    Returns:
        A ``(links, message)`` tuple. ``links`` is a list of
        ``(url, snippet, title)`` tuples built from the citations (empty when
        the reply carried none). ``message`` is the answer text with everything
        up to and including the first ``</think>`` marker removed.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        RuntimeError: If the reply contains no ``message`` payload at all.
    """
    # CHAT_MODEL can be set to "GPT4.1", "mistral", "llama3.3", "claude-3.7", "r1", "qwen3"
    data = {
        "prompt": question,
        "chat_model": CHAT_MODEL,
        "web": True,
        "stream": True
    }

    response = requests.post(url, headers=headers, json=data)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()

    # Both accumulators live OUTSIDE the loop: resetting them per line would
    # lose the answer whenever the final line of the body carries no message.
    citations = []
    last_message = None
    for line in response.text.splitlines():
        try:
            parsed = json.loads(line)
        except json.JSONDecodeError:
            continue
        if not isinstance(parsed, dict):
            continue
        if 'citations' in parsed:
            citations = parsed['citations']
        if 'message' in parsed:
            last_message = parsed['message']

    if last_message is None:
        raise RuntimeError("The chat API response did not contain any 'message' payload.")

    links = [(link['url'], link['snippet'], link['title']) for link in citations]

    message = last_message
    if '</think>' in last_message:
        # maxsplit=1 keeps the whole answer even if the marker shows up again
        # later in the text.
        message = last_message.split("</think>", 1)[1].strip()
    return links, message

Response Evaluation:
1. Relevance (0.4): Does the response address the question directly and stay on topic? 
2. Correctness (0.2): Is the information factually accurate?
3. Completeness (0.15): Does it cover all the key aspects expected in the answer?
4. Clarity (0.15): Is it clearly written and easy to understand?
5. Depth (0.1): Does it show deep insight or thoughtful reasoning?

In [78]:
def evaluate_sources(question, sources):
    """Ask the first evaluation model to score the sources cited for a question.

    The model is prompted to rate each source on relevance, credibility and
    freshness and to answer with a raw JSON array.

    Args:
        question: The question the sources were retrieved for.
        sources: The sources to rate; interpolated into the prompt via their
            string representation.

    Returns:
        The last ``message`` payload of the reply, unparsed (the caller is
        expected to ``json.loads`` it). On any failure, including a reply with
        no ``message`` payload, the error is printed and an empty dict is
        returned instead, so callers must check the type before parsing.
    """
    evaluation_prompt = f"""Evaluate the following sources for the question: "{question}"
Sources:
{sources}

Evaluate each source based on:
- The domain name (e.g., .org, .gov, known institutions, major news outlets),
- The structure and keywords in the URL (e.g., presence of date or topic),
- Any general knowledge you have about the source.

For each link, assign a **total score out of 15** based on the following 3 criteria (each scored from 0 to 5):
1. **relevance** to the user question  
2. **credibility** of the domain  
3. **freshness** (based on year or indicators in the URL)

Add the scores for each URL and return the result as a JSON list with this structure:

[
    {{
    "url": "https://example.com/article-sahel-water-2023",
    "relevance": 2,
    "credibility": 2,
    "freshness": 1
    }},
    {{
    "url": "https://oldnews.net/blog123",
    "relevance": 3,
    "credibility": 1,
    "freshness": 1
    }},
    {{
    "url": "https://un.org/sahel-water-report",
    "relevance": 2,
    "credibility": 2,
    "freshness": 1
    }}
]
Return only the raw JSON array. Do not include any explanation, commentary, or text outside the JSON. Do not wrap it in code blocks."""

    data = {
        "prompt": evaluation_prompt,
        "chat_model": EVALUATION_MODELS[0],
        "stream": True
    }

    try:
        response = requests.post(url, headers=headers, json=data)
        response.raise_for_status()

        # Initialised once, before the loop, so a trailing line without a
        # 'message' key cannot wipe out the answer already collected.
        last_message = None
        for line in response.text.splitlines():
            try:
                parsed = json.loads(line)
            except json.JSONDecodeError:
                # A blank or non-JSON line must not discard the whole reply.
                continue
            if isinstance(parsed, dict) and 'message' in parsed:
                last_message = parsed['message']

        if last_message is None:
            raise ValueError("no 'message' payload found in the API response")
        return last_message
    except Exception as e:
        print(f"Error evaluating sources: {e}")
        return {}

def evaluate_response(question, response, model):
    """Ask ``model`` to grade an answer on five criteria.

    The grader is prompted to score Relevance, Correctness, Completeness,
    Clarity and Depth from 1 to 5 and to add a short comment, as JSON.

    Args:
        question: The question that was asked.
        response: The answer text to grade.
        model: Name of the chat model used as the grader.

    Returns:
        The last ``message`` payload of the grader's reply, unparsed (the
        caller is expected to ``json.loads`` it). On any failure, including a
        reply with no ``message`` payload, the error is printed and an empty
        dict is returned instead, so callers must check the type before parsing.
    """
    # NOTE(review): the last two instructions of this prompt disagree with the
    # JSON object requested above ("JSON array" vs "Python dictionary") and the
    # final sentence announces a format that never follows. The text is left
    # untouched so new scores stay comparable with earlier runs; consider
    # cleaning it up before the next evaluation campaign.
    prompt = f"""You are a strict evaluator assessing the quality of a response generated by a language model on a topic-specific question. Your job is to score the response based on the following five criteria, each from 1 (very poor) to 5 (excellent):

1. Relevance: Does the response address the question directly and stay on topic?
2. Correctness: Is the information factually accurate?
3. Completeness: Does it sufficiently cover the key aspects of the question?
4. Clarity: Is the response clearly written and easy to understand?
5. Depth: Does the response provide meaningful insight or analysis beyond surface-level information?

After rating, provide a **very brief comment** summarizing the strengths or weaknesses in 1-2 sentences.
Evaluate the following question and response:

[QUESTION]
{question}

[RESPONSE]
{response}
Return the result strictly in the following JSON format (no other text):

{{
    "Relevance": <1-5>,
    "Correctness": <1-5>,
    "Completeness": <1-5>,
    "Clarity": <1-5>,
    "Depth": <1-5>,
    "Comment": "<short comment here>"
}}
Return only the raw JSON array. Do not include any explanation, commentary, or text outside the JSON. Do not wrap it in code blocks.
Return **only a valid Python dictionary** (no additional text or explanation) in the following format:
    """

    data = {
        "prompt": prompt,
        "chat_model": model,
        "stream": True
    }
    try:
        # Named http_response so it does not shadow the `response` argument.
        http_response = requests.post(url, headers=headers, json=data)
        http_response.raise_for_status()

        # Initialised once, before the loop, so a trailing line without a
        # 'message' key cannot wipe out the answer already collected.
        last_message = None
        for line in http_response.text.splitlines():
            try:
                parsed = json.loads(line)
            except json.JSONDecodeError:
                # A blank or non-JSON line must not discard the whole reply.
                continue
            if isinstance(parsed, dict) and 'message' in parsed:
                last_message = parsed['message']

        if last_message is None:
            raise ValueError("no 'message' payload found in the API response")
        return last_message
    except Exception as e:
        print(f"Error evaluating response: {e}")
        return {}

def eval_response_per_model(question, response):
    """Grade one answer with every model listed in ``EVALUATION_MODELS``.

    Args:
        question: The question that was asked.
        response: The answer text to grade.

    Returns:
        A list with one dict per evaluation model, in ``EVALUATION_MODELS``
        order. Each dict is the grader's parsed JSON verdict plus a ``model``
        key. When a grader fails or returns something that is not a JSON
        object, its entry is ``{'error': <reason>, 'model': <name>}`` so the
        other graders' results are still kept.
    """
    evaluation = []
    for model in EVALUATION_MODELS:
        raw = evaluate_response(question, response, model)
        try:
            # evaluate_response returns a non-string (empty dict) on failure.
            if not isinstance(raw, str):
                raise ValueError("the evaluator returned no text")
            result = json.loads(raw)
            if not isinstance(result, dict):
                raise ValueError("the evaluator did not return a JSON object")
        except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
            print(f"Could not parse the evaluation from {model}: {e}")
            result = {'error': str(e)}
        result['model'] = model
        evaluation.append(result)
    return evaluation

In [79]:
# question = "What are the most visited countries in 2025 and what sustainable tourism practices are being promoted?"
# question = "What were the root causes and geopolitical consequences of the Ukraine war between 2022 and 2025?"
# response = []
# for model in EVALUATION_MODELS:
#     result = evaluate_response(question, message, model)
#     result = json.loads(result)
#     result['model'] = model
#     response.append(result)

In [38]:
# with open("result2.json", 'w', encoding='utf-8') as f:
#     json.dump(response, f, indent=2)

In [80]:
# Run the full web evaluation: for every question, get the model's answer and
# citations, have both graded, and export one JSON report per thematic.
# NOTE(review): the report name depends only on the thematic, so two questions
# sharing a thematic would overwrite each other's file; verify against the CSV.
questions = read_questions_from_csv("thematic_subjects.csv")
for qst in questions:
    evaluation = {}
    print(f"Question: ({qst['thematic']}) {qst['question']}")
    links, message = send_question(question=qst['question'])
    print("Getting links and message")

    eval_sources = evaluate_sources(qst['question'], links)
    try:
        evaluation['sources'] = json.loads(eval_sources)
    except (TypeError, ValueError) as e:
        # evaluate_sources returns an empty dict on failure (TypeError here)
        # and the model may answer with malformed JSON (ValueError); keep
        # going so one bad source evaluation does not abort the whole batch.
        print(f"Could not parse the source evaluation: {e}")
        evaluation['sources'] = []
    print("Evaluation sources")

    eval_response = eval_response_per_model(qst['question'], message)
    evaluation['response'] = eval_response
    print("Evaluation response")
    export_json(evaluation, f"{EVALUATION_DIR}/Evaluation_{qst['thematic']}.json")

Question: (Religion) How are religious institutions responding to increasing secularism in Europe and North America in 2025?
Getting links and message
Evaluation sources
Evaluation response
Question: (Environment & Climate) What are the most impactful climate adaptation projects launched after COP29, and how are they being funded?
Getting links and message
Evaluation sources
Evaluation response
Question: (Social Issues) What are the main challenges and innovations in addressing global housing inequality in 2025?
Getting links and message
Evaluation sources
Evaluation response
Question: (Psychology & Mental Health) How has the rise of virtual therapy and mental health apps affected treatment accessibility and effectiveness worldwide?
Getting links and message
Evaluation sources
Evaluation response
Question: (Media & Entertainment) How is the entertainment industry adapting to AI-generated content in 2025, and what legal issues are emerging?
Getting links and message
Evaluation sources
E

# Evaluation Lib

In [10]:

import requests

# Ask CHAT_MODEL fifteen questions about the document collection identified by
# mediaID, and export the model's answers (expected as a list of
# question/answer dictionaries) to eval_lib_pdf.json.
prompt = """Answer the following 15 questions based on the content of the book. If the book doesn’t provide an answer, reply with "Not covered in the book."

Return your answers as a **Python list of dictionaries**, where each dictionary has the keys:
- `"question"`: the original question
- `"answer"`: the response from the book's content

Here are the questions:

1. How can I automate sending emails using Python?
2. What is the use of the `pyautogui` module?
3. How does Python handle copying and moving files automatically?
4. What is the difference between `input()` and `sys.argv` for input?
5. How can I extract data from an Excel spreadsheet using Python?
6. What does this code snippet do? `for i in range(5): print(i)`
7. How do I handle exceptions using try/except in Python?
8. What’s the purpose of the `open()` function in reading files?
9. What are the benefits of automating tasks with Python?
10. When should I use regular expressions in automation?
11. How does scheduling a script to run daily work on Windows?
12. What are some examples of boring tasks that can be automated?
13. In which chapter is web scraping introduced?
14. Which module is recommended for manipulating PDFs?
15. How is the mouse controlled programmatically in the book?


Return only a valid Python list of 15 dictionaries."""

data = {
    'prompt': prompt,
    'media_type': 'collection',
    'mediaID': 236,
    'chat_model': CHAT_MODEL,
    'stream': True
}

response = requests.post(url, headers=headers, json=data)
# Fail loudly on HTTP errors instead of trying to parse an error page.
response.raise_for_status()

# Initialised once, before the loop: resetting it per line would lose the
# answer whenever the final line of the body carries no message.
last_message = None
for line in response.text.splitlines():
    try:
        parsed = json.loads(line)
    except json.JSONDecodeError:
        # Skip blank or non-JSON lines instead of aborting the cell.
        continue
    if isinstance(parsed, dict) and 'message' in parsed:
        last_message = parsed['message']

if last_message is None:
    raise RuntimeError("The chat API response did not contain any 'message' payload.")

if '</think>' in last_message:
    # maxsplit=1 keeps the whole answer even if the marker shows up again later.
    last_message = last_message.split("</think>", 1)[1].strip()

# NOTE(review): the prompt asks for a *Python* list, yet the reply is parsed as
# JSON; this raises json.JSONDecodeError if the model uses Python-only syntax
# such as single-quoted strings.
export_json(json.loads(last_message), "eval_lib_pdf.json")


# Testing

In [1]:
import pandas as pd

# Read the collected evaluation scores into a DataFrame.
# Adjust the path if needed, and switch the separator to ';' for semicolon-delimited files.
df = pd.read_csv("evaluation.csv", sep=",")




In [15]:
# Mean of each criterion over the rows graded by claude-3.7, printed one value
# per line in this order: correctness, relevance, completeness, clarity, depth.
claude_df = df[df['model'] == 'claude-3.7']
for column in ('correctness', 'relevance', 'completeness', 'clarity', 'depth'):
    print(claude_df[column].mean())

4.1
4.75
4.65
4.95
4.7


In [5]:

# Score columns of `df` that are averaged, one per evaluation criterion.
criteria = ['relevance', 'correctness', 'completeness', 'clarity', 'depth']

# Average every criterion within each grading model, then turn the 'model'
# group key back into a regular column.
rows_by_model = df.groupby('model')
mean_by_model = rows_by_model[criteria].mean()
global_avg_scores = mean_by_model.reset_index()



In [6]:
# Display the per-model mean score table (bare last expression of the cell).
global_avg_scores

Unnamed: 0_level_0,relevance,correctness,completeness,clarity,depth
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GPT4.1,5.0,4.7,4.95,5.0,4.95
claude-3.7,4.75,4.1,4.65,4.95,4.7


In [None]:
# Collapse the per-criterion means of each row into one overall score per model
# and store it in a new 'global_average' column.
overall_by_row = global_avg_scores[criteria].mean(axis=1)
global_avg_scores['global_average'] = overall_by_row

# Show the table, now including the overall score.
print(global_avg_scores)

# Persist the table, without the index column, for use outside the notebook.
global_avg_scores.to_csv("model_global_scores.csv", index=False)