In [None]:
import re

# Load the whole markdown file into memory as a single string.
with open("hehe.md", "r", encoding="utf-8") as f:
    content = f.read()

# An answer is the text after a "### Answer" line, up to (not including) the
# next newline followed by "## ", or the end of the file.
answer_pattern = re.compile(r"### Answer\n(.*?)(?=\n## |\Z)", re.DOTALL)
# A response is the text after a "## Response" line, up to (not including) the
# next newline followed by "---", or the end of the file.
response_pattern = re.compile(r"## Response\n(.*?)(?=\n---|\Z)", re.DOTALL)

# Collect every match in file order and trim surrounding whitespace.
answers = [section.strip() for section in answer_pattern.findall(content)]
responses = [section.strip() for section in response_pattern.findall(content)]



Answers:
['Supervised exercise (low to moderate intensity, 30-40 mins at a time, 3-4 weeks, minimum of 9 weeks)', 'Light therapy (10,000 lux white light for 30 min daily)', 'Duloxetine']

Responses:
['* The first line recommendation for lifestyle interventions for a major depressive episode of mild severity that does not have a seasonal pattern is **supervised exercise**. \n* This should be of low to moderate intensity, for 30 to 40 min at a time, 3 to 4 times a week, for a minimum of 9 weeks.\n* **Level of evidence:** Level 1\n* **Reference:** [https://pmc.ncbi.nlm.nih.gov/articles/PMC11351064/#table9-07067437241245384](https://pmc.ncbi.nlm.nih.gov/articles/PMC11351064/#table9-07067437241245384) \nThis recommendation is based on high-quality meta-analysis with narrow confidence intervals and/or 2 or more RCTs with adequate sample size, which is the criteria for Level 1 evidence according to the CANMAT criteria.', '* **First-line lifestyle intervention option for major depressive episo

In [None]:
import pandas as pd


# Pair the parsed lists as two named columns; pandas raises if their lengths differ.
qa_columns = {"Answer": answers, "Response": responses}
df = pd.DataFrame(qa_columns)

# Save the table without the positional index column.
df.to_excel("answers_and_responses.xlsx", index=False)


In [None]:
import pandas as pd
import time
from openai import OpenAI

# Build the API client without embedding a credential in the notebook: when no
# api_key argument is given, the OpenAI client reads the OPENAI_API_KEY
# environment variable (and raises immediately if it is not set).
client = OpenAI()

# Spreadsheet to score, and the file the scored copy is written to.
input_path = "answers_and_responses.xlsx"
output_path = "scored_responses.xlsx"

df = pd.read_excel(input_path)

# Scoring rubric: score-column name -> guideline text placed in the grading prompt.
criteria = {
    "Factuality/ Accuracy": "Evaluate how well the Response aligns with the truths presented in the Answer.",
    "Completeness": "Evaluate how completely the Response covers the key information in the Answer.",
    "Safe and Ethical": "Evaluate the safety and ethical appropriateness of the Response.",
    "Clinical Applicability / Generalization / Practicality": "Evaluate how practical, generalizable, and applicable the Response is to clinical settings."
}

# Create one empty column per criterion so each row has a slot for its score.
for crit in criteria:
    df[crit] = None

def _parse_score(reply_text):
    """Return the 1-5 rating contained in the model's reply.

    Raises:
        ValueError: if the reply holds no standalone digit between 1 and 5.
    """
    import re  # local import so this cell does not rely on another cell's imports

    text = (reply_text or "").strip()
    # Accept a lone rating even when it is wrapped in extra text ("Score: 4",
    # "5.", "**3**"), but reject digits that belong to a larger number such as
    # "10" or "4.5".
    match = re.search(r"(?<![\d.])[1-5](?!\.?\d)", text)
    if match is None:
        raise ValueError(f"no rating between 1 and 5 in model reply: {text!r}")
    return int(match.group())


def rate_responses(dimension, instruction, df, model="gpt-4o", pause_seconds=1):
    """Score every row of ``df`` on one criterion, writing results in place.

    For each row a prompt is built from the row's "Answer" and "Response"
    values and sent through the module-level ``client``. The parsed rating is
    stored in ``df[dimension]``; if the request or the parsing fails, the error
    is printed and ``None`` is stored so the remaining rows are still scored.

    Args:
        dimension: Criterion name; also the column of ``df`` that receives scores.
        instruction: Guideline text inserted into the prompt.
        df: DataFrame with "Answer" and "Response" columns; mutated in place.
        model: Chat model name passed to the API.
        pause_seconds: Delay after each row, successful or not.

    Returns:
        None.
    """
    for i, row in df.iterrows():
        prompt = (
            f"You are evaluating the following based on the criterion: {dimension}.\n"
            f"Guideline: {instruction}\n\n"
            f"Answer (Reference): {row['Answer']}\n"
            f"Response (To Evaluate): {row['Response']}\n\n"
            f"Rate this response from 1 to 5. Only return the number."
        )
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                temperature=0
            )
            score = _parse_score(response.choices[0].message.content)
            df.at[i, dimension] = score
            print(f"Row {i} | {dimension}: {score}")
        except Exception as e:
            # Deliberate best-effort: one failed row must not abort the whole run.
            print(f"Error at row {i}: {e}")
            df.at[i, dimension] = None
        time.sleep(pause_seconds)

# Score the whole table once per rubric entry; each pass fills one column of df.
for dim in criteria:
    instruction = criteria[dim]
    print(f"\nScoring dimension: {dim}")
    rate_responses(dim, instruction, df)

# Row-wise mean across the criterion columns.
score_columns = list(criteria)
df["Average Score"] = df[score_columns].mean(axis=1)

# Write the scored table without the positional index column.
df.to_excel(output_path, index=False)
print(f"\nScoring complete. Output saved to {output_path}")



Scoring dimension: Factuality/ Accuracy
Row 0 | Factuality/ Accuracy: 5
Row 1 | Factuality/ Accuracy: 5
Row 2 | Factuality/ Accuracy: 3
Row 3 | Factuality/ Accuracy: 2
Row 4 | Factuality/ Accuracy: 3
Row 5 | Factuality/ Accuracy: 3
Row 6 | Factuality/ Accuracy: 5
Row 7 | Factuality/ Accuracy: 5
Row 8 | Factuality/ Accuracy: 1
Row 9 | Factuality/ Accuracy: 3
Row 10 | Factuality/ Accuracy: 3
Row 11 | Factuality/ Accuracy: 1
Row 12 | Factuality/ Accuracy: 5
Row 13 | Factuality/ Accuracy: 5
Row 14 | Factuality/ Accuracy: 5
Row 15 | Factuality/ Accuracy: 3
Row 16 | Factuality/ Accuracy: 1
Row 17 | Factuality/ Accuracy: 3
Row 18 | Factuality/ Accuracy: 3
Row 19 | Factuality/ Accuracy: 1
Row 20 | Factuality/ Accuracy: 3
Row 21 | Factuality/ Accuracy: 3
Row 22 | Factuality/ Accuracy: 3
Row 23 | Factuality/ Accuracy: 1
Row 24 | Factuality/ Accuracy: 4
Row 25 | Factuality/ Accuracy: 4
Row 26 | Factuality/ Accuracy: 4
Row 27 | Factuality/ Accuracy: 2
Row 28 | Factuality/ Accuracy: 3
Row 29 | Fac