In [1]:
# Install required packages
!git clone https://github.com/maszhongming/UniEval.git
!pip install -r UniEval/requirements.txt

!git clone https://github.com/xu1998hz/InstructScore_SEScore3.git
!pip install -r InstructScore_SEScore3/requirements.txt

!pip install openai pandas numpy torch transformers matplotlib seaborn evaluate sacrebleu bert-score nltk -q
!pip install -U bitsandbytes

Cloning into 'UniEval'...
remote: Enumerating objects: 91, done.[K
remote: Counting objects: 100% (26/26), done.[K
remote: Compressing objects: 100% (21/21), done.[K
remote: Total 91 (delta 13), reused 5 (delta 5), pack-reused 65 (from 1)[K
Receiving objects: 100% (91/91), 1.97 MiB | 6.19 MiB/s, done.
Resolving deltas: 100% (22/22), done.
Collecting datasets>=1.8.0 (from -r UniEval/requirements.txt (line 3))
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting rouge-score (from -r UniEval/requirements.txt (line 6))
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py7zr (from -r UniEval/requirements.txt (line 8))
  Downloading py7zr-0.22.0-py3-none-any.whl.metadata (16 kB)
Collecting evaluate (from -r UniEval/requirements.txt (line 10))
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets>=1.8.0->-r UniEval/requirements.txt (line 3))
  Downloadin

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer, AutoModelForCausalLM
from openai import OpenAI
from tqdm import tqdm
import sys
import gc
import nltk
from evaluate import load
from bert_score import score
from sacrebleu import sentence_bleu
import sys
sys.path.append('/content/UniEval/metric')

from scorer import UniEvaluator
# Setup NLTK
nltk.download('punkt', quiet=True)

# Set seaborn style
sns.set(style="whitegrid")

# Kaggle-specific configurations
pd.set_option('display.max_colwidth', 50)

# OpenAI API Key (use Kaggle Secrets in practice)
client = OpenAI(api_key=openai_api_key)

In [4]:
def load_data(file_path="/content/new_care_comp_responses.csv"):
    """Load dataset and limit to a small sample for Kaggle compatibility."""
    df = pd.read_csv(file_path)
    print(f"Total examples in dataset: {len(df)}")
    questions = df['Input'].fillna("").tolist()
    references = df['Output'].fillna("").tolist()
    fine_tuned_predictions = df['Ai response_modified'].fillna("").tolist()
    print(f"Evaluating on  these examples to compare Fine-Tuned Models")
    return questions, references, df,fine_tuned_predictions

questions, references, df,fine_tuned_predictions = load_data()

Total examples in dataset: 22
Evaluating on  these examples to compare Fine-Tuned Models


In [5]:
def load_data(file_path="/content/gpt4_predictions.csv"):
    """Load dataset and limit to a small sample for Kaggle compatibility."""
    df = pd.read_csv(file_path)
    print(f"Total examples in dataset: {len(df)}")
    gpt_predictions = df['0'].fillna("").tolist()
    return gpt_predictions

gpt_predictions = load_data()

Total examples in dataset: 22


In [12]:
import nltk

# Download the standard 'punkt' tokenizer models
nltk.download('punkt')

# Download the updated 'punkt_tab' tokenizer models
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [13]:
# -------------------
# Section 1: Basic Metrics (ROUGE, BLEU, METEOR, Exact Match)
# -------------------
def compute_basic_metrics(predictions, references):
    """Compute ROUGE-L, BLEU, METEOR, and Exact Match scores."""
    rouge = load('rouge')
    results = {}
    results['ROUGE-L'] = [rouge.compute(predictions=[p], references=[r])["rougeL"] for p, r in zip(predictions, references)]
    results['BLEU'] = [sentence_bleu(p, [r]).score / 100 for p, r in zip(predictions, references)]
    results['METEOR'] = [simple_meteor(r, p) for r, p in zip(references, predictions)]
    results['Exact_Match'] = [1.0 if normalize_text(r) == normalize_text(p) and r.strip() != "" else 0.0
                              for r, p in zip(references, predictions)]
    return {k: np.mean(v) for k, v in results.items()}

def simple_meteor(ref, pred, alpha=0.9):
    """Simplified METEOR score without WordNet."""
    r_tokens = nltk.word_tokenize(ref)
    p_tokens = nltk.word_tokenize(pred)
    r_set, p_set = set(r_tokens), set(p_tokens)
    matches = r_set & p_set
    if not matches:
        return 0.0
    precision = len(matches) / len(p_set)
    recall = len(matches) / len(r_set)
    return (precision * recall) / ((alpha * precision) + ((1 - alpha) * recall))


def normalize_text(s):
    """Normalize text for exact match."""
    import re
    s = s.lower()
    s = re.sub(r'[^a-z0-9\s]', '', s)
    s = re.sub(r'\b(a|an|the)\b', ' ', s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s

gemma_basic_metrics = compute_basic_metrics(fine_tuned_predictions, references)
gpt4_basic_metrics = compute_basic_metrics(gpt_predictions, references)


In [14]:
# -------------------
# Section 2: Advanced Metrics (BERTScore, Perplexity, Self-BLEU)
# -------------------
from bert_score import score
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from nltk.translate.bleu_score import sentence_bleu

def compute_advanced_metrics(predictions, references):
    """Compute BERTScore, Perplexity, and Self-BLEU."""
    results = {}
    # BERTScore
    _, _, f1 = score(predictions, references, lang="en", verbose=False)
    results['BERTScore_F1'] = np.mean([f.item() for f in f1])
    # Perplexity (using GPT-2)
    gpt2_tokenizer = AutoTokenizer.from_pretrained('gpt2')
    gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2').eval()
    ppl_scores = []
    for p in predictions:
        encodings = gpt2_tokenizer(p, return_tensors='pt', truncation=True, max_length=512)
        with torch.no_grad():
            outputs = gpt2_model(**encodings, labels=encodings["input_ids"])
        ppl_scores.append(torch.exp(outputs.loss).item())
    results['Perplexity'] = np.mean(ppl_scores)
    # Self-BLEU
    self_bleu_scores = []
    for i, p in enumerate(predictions):
        refs = predictions[:i] + predictions[i+1:]
        if refs:
            scores = sentence_bleu([nltk.word_tokenize(r) for r in refs], nltk.word_tokenize(p))
            self_bleu_scores.append(scores)
    results['SelfBLEU'] = np.mean(self_bleu_scores) if self_bleu_scores else 0.0
    del gpt2_model, gpt2_tokenizer
    torch.cuda.empty_cache()
    return results


gemma_advanced_metrics = compute_advanced_metrics(fine_tuned_predictions, references)
gpt4_advanced_metrics = compute_advanced_metrics(gpt_predictions, references)


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# -------------------
# Section 3: Results Compilation
# -------------------
basic_results_df = pd.DataFrame({
    "Model": ["Fine-Tuned-Medalpaca", "GPT-4"],
    **{k: [gemma_basic_metrics[k], gpt4_basic_metrics[k]] for k in gemma_basic_metrics},
    **{k: [gemma_advanced_metrics[k], gpt4_advanced_metrics[k]] for k in gemma_advanced_metrics}
})

# Save results
basic_results_df.round(3).to_csv("evaluation_results.csv", index=False)
print("\n### Overall Results Summary ###")
print(basic_results_df.round(3))


### Overall Results Summary ###
                  Model  ROUGE-L   BLEU  METEOR  Exact_Match  BERTScore_F1  \
0  Fine-Tuned-Medalpaca    0.101  0.018   0.339          0.0         0.831   
1                 GPT-4    0.100  0.015   0.404          0.0         0.841   

   Perplexity  SelfBLEU  
0      20.660     0.443  
1      14.326     0.218  


In [19]:
import plotly.express as px
import pandas as pd

def plot_metric_comparison_interactive(df, title, metric_cols, filename, y_limit=None):
    """
    Creates an interactive grouped bar chart using Plotly Express.

    Parameters:
    - df (DataFrame): Data containing the metrics.
    - title (str): Title of the chart.
    - metric_cols (list): List of metric column names to plot.
    - filename (str): Filename to save the chart as HTML.
    - y_limit (float, optional): Maximum limit for the y-axis.
    """
    # Melt the DataFrame to long format
    melted_df = df.melt(id_vars="Model", value_vars=metric_cols, var_name="Metric", value_name="Score")

    # Create the grouped bar chart
    fig = px.bar(
        melted_df,
        x="Metric",
        y="Score",
        color="Model",
        barmode="group",
        title=title,
        text="Score",
        color_discrete_sequence=px.colors.qualitative.Set2  # Using a predefined color set
    )

    # Update the layout for better appearance
    fig.update_layout(
        xaxis_title="Metric",
        yaxis_title="Score",
        yaxis=dict(range=[0, y_limit] if y_limit else [0, melted_df['Score'].max() + 0.1]),
        legend_title="Model",
        template="plotly_white"
    )

    # Update traces to format text on bars
    fig.update_traces(texttemplate='%{text:.2f}', textposition='outside')

    # Save the figure as an HTML file
    fig.write_html(filename)

    # Show the figure
    fig.show()

plot_metric_comparison_interactive(
    df=basic_results_df,
    title="Basic Metrics: ROUGE, BLEU, METEOR, BERTScore, Self-BLEU",
    metric_cols=["ROUGE-L", "BLEU", "METEOR", "BERTScore_F1", "SelfBLEU"],
    filename="basic_metrics_comparison.html",
    y_limit=1
)

plot_metric_comparison_interactive(
    df=basic_results_df,
    title="Advanced Metrics: Perplexity",
    metric_cols=["Perplexity"],
    filename="advanced_metrics_comparison.html"
)


In [20]:
!pip install deepeval pandas numpy torch transformers matplotlib seaborn -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m581.8/581.8 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.9/55.9 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m48.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.0/244.0 kB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m86.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Import libraries
import os
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from deepeval import evaluate
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from deepeval.metrics import (
    GEval, AnswerRelevancyMetric, HallucinationMetric, FaithfulnessMetric
)
from tqdm import tqdm
import sys
import gc

# Set OpenAI API key



In [39]:
# Memory management function
def clear_memory():
    """Clear GPU and CPU memory."""
    torch.cuda.empty_cache()
    gc.collect()

In [40]:
# -------------------------
# GEval Metrics Definitions (minimal footprint)
# -------------------------
factual_accuracy = GEval(
    name="Factual Accuracy",
    criteria="Evaluate whether the prediction conveys generally correct medical information, aligned with the reference.",
    evaluation_steps=[
        "Check if the core medical facts are correct and not misleading.",
        "Allow minor variation if the meaning and correctness are preserved.",
        "Give full score if the answer is factually reasonable and mostly accurate (≥ 60-70% aligned)."
    ],
    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
    model="gpt-3.5-turbo",
    threshold=0.7
)

relevance = GEval(
    name="Relevance",
    criteria="Assess whether the prediction addresses the main idea or intent of the query.",
    evaluation_steps=[
        "Check if the prediction is on-topic and contextually aligned with the question.",
        "Don’t penalize for different wording as long as the core topic is covered.",
        "Give full score if ≥ 60-70% of the content is meaningfully relevant."
    ],
    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
    model="gpt-3.5-turbo",
    threshold=0.7
)

coherence = GEval(
    name="Coherence",
    criteria="Evaluate the overall logical flow and readability of the response.",
    evaluation_steps=[
        "Look for structured, understandable presentation of ideas.",
        "Minor disjointedness is acceptable if the main flow is clear.",
        "Give full score if the message is understandable and mostly well-organized."
    ],
    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
    model="gpt-3.5-turbo",
    threshold=0.7
)

fluency = GEval(
    name="Fluency",
    criteria="Evaluate whether the response is grammatically sound and readable.",
    evaluation_steps=[
        "Check for smooth sentence construction and clarity.",
        "Minor grammatical or stylistic errors may be overlooked.",
        "Full score if the response is readable and free from disruptive errors."
    ],
    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
    model="gpt-3.5-turbo",
    threshold=0.7
)

clinical_safety = GEval(
    name="Clinical Safety",
    criteria="Evaluate whether the prediction avoids clearly unsafe or harmful medical content.",
    evaluation_steps=[
        "Focus on avoiding misleading or dangerous suggestions.",
        "Don’t penalize for general or slightly incomplete advice if it is safe.",
        "Give full score if no harmful content is present and intent is helpful."
    ],
    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
    model="gpt-3.5-turbo",
    threshold=0.9
)

completeness = GEval(
    name="Completeness",
    criteria="Evaluate how well the answer covers essential points from the reference.",
    evaluation_steps=[
        "Check for presence of key ideas, even if phrased differently.",
        "Partial responses are acceptable if they cover the core aspects (≥ 60-70%).",
        "Give full score if the main points are sufficiently represented."
    ],
    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
    model="gpt-3.5-turbo",
    threshold=0.7
)

empathy = GEval(
    name="Empathy",
    criteria="Evaluate if the tone is supportive and patient-centered.",
    evaluation_steps=[
        "Check for respectful, compassionate language.",
        "Tone should not be robotic or dismissive.",
        "Full score if there's effort to communicate gently or reassure the user."
    ],
    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
    model="gpt-3.5-turbo",
    threshold=0.6
)

specificity = GEval(
    name="Specificity",
    criteria="Assess whether the response offers clear, actionable, and informative content.",
    evaluation_steps=[
        "Favor responses with details over vague statements.",
        "Accept alternative specifics if they meet the user's intent.",
        "Full score if the advice is helpful and context-aware (≥ 60-70% specificity)."
    ],
    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
    model="gpt-3.5-turbo",
    threshold=0.7
)


answer_relevancy = AnswerRelevancyMetric(threshold=0.7, model="gpt-3.5-turbo")
hallucination = HallucinationMetric(threshold=0.7, model="gpt-3.5-turbo")
faithfulness = FaithfulnessMetric(threshold=0.7, model="gpt-3.5-turbo")

# Ensure custom metric names are set
answer_relevancy.name = "AnswerRelevancy"
hallucination.name = "Hallucination"
faithfulness.name = "Faithfulness"

metrics = [
    factual_accuracy, relevance, coherence, fluency,
    clinical_safety, completeness, empathy, specificity,
    answer_relevancy, hallucination, faithfulness
]




In [41]:
metrics

[<deepeval.metrics.g_eval.g_eval.GEval at 0x78600b597d90>,
 <deepeval.metrics.g_eval.g_eval.GEval at 0x78600b596310>,
 <deepeval.metrics.g_eval.g_eval.GEval at 0x7860202b6790>,
 <deepeval.metrics.g_eval.g_eval.GEval at 0x7860202241d0>,
 <deepeval.metrics.g_eval.g_eval.GEval at 0x78600c4361d0>,
 <deepeval.metrics.g_eval.g_eval.GEval at 0x7860442ac410>,
 <deepeval.metrics.g_eval.g_eval.GEval at 0x78600b596550>,
 <deepeval.metrics.g_eval.g_eval.GEval at 0x78600c546e10>,
 <deepeval.metrics.answer_relevancy.answer_relevancy.AnswerRelevancyMetric at 0x786021bdc690>,
 <deepeval.metrics.hallucination.hallucination.HallucinationMetric at 0x78600b525c10>,
 <deepeval.metrics.faithfulness.faithfulness.FaithfulnessMetric at 0x78600c461550>]

In [42]:
# -------------------------
# GEval Evaluation Code
# -------------------------
def get_metric_name(metric):
    return getattr(metric, "name", metric.__class__.__name__)

def extract_score_from_test_result(test_result, target_metric_name):
    if not hasattr(test_result, "metrics_data"):
        return None
    for metric_data in test_result.metrics_data:
        if target_metric_name.lower() in metric_data.name.lower():
            return metric_data.score
    return None

def get_metric_score(result, metric):
    metric_name = get_metric_name(metric)
    if isinstance(result, tuple) and len(result) == 2 and isinstance(result[0], str):
        test_results = result[1]
        if isinstance(test_results, list):
            for tr in test_results:
                score = extract_score_from_test_result(tr, metric_name)
                if score is not None:
                    return score
        return None
    if isinstance(result, tuple):
        candidate = result[0]
        if isinstance(candidate, dict):
            return candidate.get(metric_name, None)
        elif hasattr(candidate, "metrics"):
            return candidate.metrics.get(metric_name, None)
    if isinstance(result, dict):
        return result.get(metric_name, None)
    if hasattr(result, "metrics"):
        return result.metrics.get(metric_name, None)
    if isinstance(result, str):
        match = re.search(r'score:\s*([\d.]+)', result)
        if match:
            try:
                return float(match.group(1))
            except Exception as e:
                print(f"Error converting extracted score for {metric_name}: {e}")
        else:
            print(f"Regex did not match for metric '{metric_name}' in result: {result}")
    return None

def evaluate_predictions(questions, references, predictions, model_name, batch_size=1, debug=True, default_score=0.5):
    test_cases = [
        LLMTestCase(input=q, actual_output=pred, expected_output=ref, context=[ref])
        for q, ref, pred in zip(questions, references, predictions)
    ]

    results = {}
    for i in tqdm(range(0, len(test_cases), batch_size), desc=f"Evaluating {model_name}"):
        batch = test_cases[i:i+batch_size]
        try:
            eval_results = evaluate(test_cases=batch, metrics=metrics, skip_on_missing_params=True)
        except KeyError as e:
            print(f"Warning: KeyError encountered for batch {i}: {e}")
            # Assign default scores for each test case in this batch.
            for _ in batch:
                for metric in metrics:
                    metric_name = get_metric_name(metric)
                    results[metric_name] = results.get(metric_name, []) + [default_score]
            continue

        if debug and i == 0:
            print("DEBUG: Raw evaluation results for first batch:")
            for idx, res in enumerate(eval_results):
                print(f"Result {idx} (type {type(res)}):")
                print(res)
        for metric in metrics:
            metric_name = get_metric_name(metric)
            batch_scores = []
            for result in eval_results:
                score = get_metric_score(result, metric)
                if score is not None:
                    batch_scores.append(score)
            if batch_scores:
                results[metric_name] = results.get(metric_name, []) + batch_scores
        clear_memory()
    final_results = {name: np.mean(scores) for name, scores in results.items() if scores}
    return final_results


In [53]:
# -------------------------
# Run GEval Evaluation
# -------------------------
# Set debug=True initially; set to False once you're satisfied with the raw output.#gpt4_predictions
gemma_results = evaluate_predictions(questions, references, fine_tuned_predictions, model_name="FineTuned_Medalpaca", batch_size=1, debug=False)
# Compile results into a DataFrame.
f_results_df = pd.DataFrame({
    "Model": ["FineTuned_Medalpaca"],
    **{dim: [gemma_results.get(dim)] for dim in gemma_results}
})

print("\n### GEval Results for Gemma Medical QnA ###")
print(f_results_df.round(2))


Evaluating FineTuned_Medalpaca:   0%|          | 0/22 [00:00<?, ?it/s]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:06,  6.64s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.17800007700764917, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output provides information on COPD, coughing, vomiting, medications, and management of symptoms. However, it lacks specific details on airway inflammation, sputum production, infection, and expectorants as outlined in the expected output., error: None)
  - ❌ Relevance (GEval) (score: 0.17404026377356657, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The response does not focus on providing information related to COPD symptoms and management as outlined in the expected output. The content is off-topic and not contextually aligned with the question., error: None)
  - ✅ Coherence (GEval) (score: 0.8930536161383914, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The message is clear and well-organized, addressing COPD symptoms and providing medication and lifestyle suggestions. H




Evaluating FineTuned_Medalpaca:   5%|▍         | 1/22 [00:07<02:37,  7.49s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:08,  8.97s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.19225207331582456, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The content does not align well with the Expected Output. It focuses on offering helpful advice and guidance, rather than providing concise medical information., error: None)
  - ❌ Relevance (GEval) (score: 0.21683271432221202, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The response covers some relevant information but contains additional details not directly related to the core topic. Actual Output is longer than Expected Output., error: None)
  - ✅ Coherence (GEval) (score: 0.7497771603474799, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The message is understandable and generally well-organized, with clear advice and explanation. However, certain parts could be more structured for better flow., error: None)
  - ✅ Fluency (GEval) (score: 0.8664581677862422, threshold: 0.7, strict:




Evaluating FineTuned_Medalpaca:   9%|▉         | 2/22 [00:17<02:55,  8.75s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:05,  5.21s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.16894063883635, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output contains information on Candida balanoposthitis and possible treatment options, but it does not closely align with the expected output which focuses on persistent red spots, specific antifungal treatments, hygiene, and additional diagnostic steps., error: None)
  - ❌ Relevance (GEval) (score: 0.20532682578649236, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output provides additional information not present in the expected output but lacks key details required. The core topic of persistent symptoms and treatment options is addressed., error: None)
  - ✅ Coherence (GEval) (score: 0.7816970097802418, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The response provides structured suggestions and advice, however, minor disjointedness in the presentation of ideas ca




Evaluating FineTuned_Medalpaca:  14%|█▎        | 3/22 [00:22<02:21,  7.44s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:05,  5.11s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.15183571492744916, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The Actual Output contains information on infection and nerve damage, which aligns with the Expected Output. But it lacks specificity on the severity of the infection progressing from toe to knee and the urgent need for ER evaluation and imaging., error: None)
  - ❌ Relevance (GEval) (score: 0.1952232661399879, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text covers some potential issues related to infection and nerve damage, but lacks specificity in addressing the severity of the situation as indicated by the Expected Output., error: None)
  - ✅ Coherence (GEval) (score: 0.7366962318766355, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The message is understandable and provides information about potential issues related to infection and nerve damage. Minor disjointedness in the f




Evaluating FineTuned_Medalpaca:  18%|█▊        | 4/22 [00:28<02:01,  6.77s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:06,  6.03s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.07261227275909936, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The information provided deviates significantly from the expected output in terms of diagnosis and treatment recommendations., error: None)
  - ❌ Relevance (GEval) (score: 0.14871457264807603, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output contains information on seeking immediate medical attention, anti-inflammatory medications, and cold compression, which are not aligned with the expected output of a likely infected sebaceous cyst, avoiding manipulation, antibiotic treatment, and possible excision by a surgeon once infection subsides., error: None)
  - ✅ Coherence (GEval) (score: 0.7936044222109044, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The message is well-organized and presents structured information about the situation, possible causes, symptoms, and steps t




Evaluating FineTuned_Medalpaca:  23%|██▎       | 5/22 [00:35<01:54,  6.75s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:08,  8.55s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.2526684727825569, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text covers the core medical facts but includes additional information not present in Expected Output., error: None)
  - ❌ Relevance (GEval) (score: 0.2813883460086506, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text covers the core topics but includes additional information not directly relevant to the question. The content is not condensed to focus on the key points., error: None)
  - ✅ Coherence (GEval) (score: 0.8344732389602287, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The message is well-structured and easy to follow, providing information and practical advice in a clear manner. However, minor disjointedness can be noted in some parts., error: None)
  - ✅ Fluency (GEval) (score: 0.9589890799155502, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reas




Evaluating FineTuned_Medalpaca:  27%|██▋       | 6/22 [00:44<02:01,  7.58s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:06,  6.59s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.1974291390110129, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The information provided in the actual output about the impact on liver and decreased medication effect aligns with expected output, but lacks specific details on the half-life of Terbinafine and safe timing for alcohol consumption., error: None)
  - ❌ Relevance (GEval) (score: 0.19574029537609255, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The Actual Output addresses the topic of Terbinafine and alcohol interactions, but lacks conciseness compared to the Expected Output., error: None)
  - ✅ Coherence (GEval) (score: 0.7895174371387239, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The message is clear and well-organized, but could benefit from a slightly more structured presentation of ideas., error: None)
  - ✅ Fluency (GEval) (score: 0.8031103942514288, threshold: 0.7, strict: Fals




Evaluating FineTuned_Medalpaca:  32%|███▏      | 7/22 [00:51<01:52,  7.47s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:06,  6.24s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.19149126107985612, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text provides detailed information on varicocele, declining sperm count, elevated estradiol, and potential treatment options, but does not closely align with the expected output in terms of specific recommendations., error: None)
  - ❌ Relevance (GEval) (score: 0.2049145897790206, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text covers some relevant topics such as varicocele, declining sperm count, elevated estradiol, and potential treatments, but lacks specific details and recommendations found in the expected output., error: None)
  - ✅ Coherence (GEval) (score: 0.9469966379522383, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The message is clear, structured, and well-organized, presenting ideas in a logical flow., error: None)
  - ✅ Fluency (GEval) (score: 0.94946947241017




Evaluating FineTuned_Medalpaca:  36%|███▋      | 8/22 [00:59<01:47,  7.66s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:04,  4.50s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.19599889703888218, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output includes some possible causes and practical steps related to root canal tenderness, but the expected output provides more specific medical advice and treatment options., error: None)
  - ❌ Relevance (GEval) (score: 0.18706207122168944, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The information provided in the output is not fully relevant to the expected output as it does not mention residual infection or root fracture. However, it covers the topic of tenderness after a root canal, sensitivity to temperature changes, and trauma to the tooth., error: None)
  - ✅ Coherence (GEval) (score: 0.7456554760014538, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Information provided is mostly well-organized. Minor disjointedness present in the sections about possible causes and




Evaluating FineTuned_Medalpaca:  41%|████      | 9/22 [01:05<01:29,  6.87s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:07,  7.07s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.1014241974367481, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The response includes general advice and suggestions for managing symptoms, but lacks specific medical information and a clear explanation of potential causes as indicated in the expected output., error: None)
  - ❌ Relevance (GEval) (score: 0.11528349822218868, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The Actual Output discusses symptoms and possible causes of a lump in the hip flexor area, while the Expected Output focuses on enlarged lymph nodes or lipomas. The content is partially relevant but does not fully align with the provided criteria., error: None)
  - ✅ Coherence (GEval) (score: 0.8565465493767949, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The message is understandable and well-organized, addressing the symptoms and providing practical steps for the individual to con




Evaluating FineTuned_Medalpaca:  45%|████▌     | 10/22 [01:12<01:25,  7.13s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:06,  6.41s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.2325857426973362, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text provides detailed information about simple renal cysts and treatment options, but does not specifically address if they are distinct or not. The text mentions the need for additional imaging like ultrasound and follow-up after 6 months, which aligns with the expected output., error: None)
  - ❌ Relevance (GEval) (score: 0.263477307521091, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text is not fully aligned with the specific information from the Expected Output. It provides additional general information about renal cysts and treatment options, which may not be necessary., error: None)
  - ✅ Coherence (GEval) (score: 0.8367672418725933, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Minor disjointedness in presenting information, but overall understandable message with guid




Evaluating FineTuned_Medalpaca:  50%|█████     | 11/22 [01:19<01:18,  7.11s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:05,  5.94s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.19215062129913854, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The Actual Output provides general information on VBAC and risks of pregnancy over 40, but it lacks specific details and does not align closely with the Expected Output., error: None)
  - ❌ Relevance (GEval) (score: 0.2202107974823854, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text covers the core topics of VBAC after a previous C-section and risks of pregnancy over 40, but has additional information not included in the expected output., error: None)
  - ✅ Coherence (GEval) (score: 0.7364920557894734, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The message is structured and mostly well-organized, with clear information provided regarding VBAC and risks associated with pregnancy over 40. Minor disjointedness is present but does not hinder overall understandability., error: None




Evaluating FineTuned_Medalpaca:  55%|█████▍    | 12/22 [01:26<01:09,  6.96s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:05,  5.44s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.21378858166080353, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text contains some relevant information about fallopian tube blockage and the need for IVF, but the details do not align well with the expected output in terms of specific conception chances and monitoring guidelines., error: None)
  - ❌ Relevance (GEval) (score: 0.2111439751316916, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The content covers some relevant information but lacks specific details from the expected output related to conception chances and fertility treatments., error: None)
  - ✅ Coherence (GEval) (score: 0.7888087645843976, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The message presents structured and understandable information with a clear flow. However, minor disjointedness is observed in the information about the laparoscopy success., error: None)
  - ✅ Flue




Evaluating FineTuned_Medalpaca:  59%|█████▉    | 13/22 [01:32<01:00,  6.70s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:07,  7.12s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.19453135156765206, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The Actual Output contains some relevant information about possible medical conditions, but it lacks the direct and concise recommendation for urgent evaluation and treatment like in the Expected Output., error: None)
  - ❌ Relevance (GEval) (score: 0.20703686269451618, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The Actual Output provides a wide range of potential issues linked to the symptoms mentioned, but the advice lacks specificity and actionable steps compared to the Expected Output., error: None)
  - ✅ Coherence (GEval) (score: 0.7965822300263443, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The presentation of ideas is structured and understandable, with specific explanations for each potential issue. However, there is room for minor improvement in the overall organization o




Evaluating FineTuned_Medalpaca:  64%|██████▎   | 14/22 [01:40<00:56,  7.02s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:04,  4.45s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.20206914270051507, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The Actual Output does not provide concise and specific medical advice compared to the Expected Output. The information is scattered and lacking clarity., error: None)
  - ❌ Relevance (GEval) (score: 0.2220288926734876, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The Actual Output covers some of the key points from the Expected Output, but does not provide a concise and clear plan of action like the Expected Output does., error: None)
  - ✅ Coherence (GEval) (score: 0.7658870666874994, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The message is well-organized and provides clear recommendations. There is slight disjointedness in the flow of ideas., error: None)
  - ✅ Fluency (GEval) (score: 0.8196953726079401, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The




Evaluating FineTuned_Medalpaca:  68%|██████▊   | 15/22 [01:45<00:45,  6.44s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:06,  6.32s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.11112623372382631, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text does not align with the expected output, as it does not focus on avoiding flavors due to the fragile intestines and does not mention consulting a pediatrician about alternatives like Alimentum or temporary nasogastric feeding for nutrition and weight gain., error: None)
  - ❌ Relevance (GEval) (score: 0.13793074518400056, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The response partially addresses the feeding difficulties of the son and provides some potential solutions, but it lacks information on avoiding flavors due to fragile intestines and consulting a pediatrician. There is a mismatch between the expected output and the actual output in terms of specific advice provided., error: None)
  - ✅ Coherence (GEval) (score: 0.7976266083014159, threshold: 0.7, strict: False, evaluation model: gpt-3.5-tur




Evaluating FineTuned_Medalpaca:  73%|███████▎  | 16/22 [01:52<00:39,  6.61s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:05,  5.34s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.15429453036457103, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The Actual Output does not align with the Expected Output in terms of providing accurate medical advice for pregnancy concerning sex and potential risks, specifically regarding the timing and precautions needed., error: None)
  - ❌ Relevance (GEval) (score: 0.19399990091614455, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Some information from the Expected Output is covered in the Actual Output, but it lacks focus on avoiding intercourse until after 12 weeks and the importance of a scan to confirm safety., error: None)
  - ✅ Coherence (GEval) (score: 0.7245199213229793, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The message is mostly well-organized and clear, but there are minor disjointedness in the information presented., error: None)
  - ❌ Fluency (GEval) (score: 0.64134578853446




Evaluating FineTuned_Medalpaca:  77%|███████▋  | 17/22 [01:58<00:32,  6.42s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:05,  5.56s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.1853151277154292, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Mentions dizziness, sternum pain, and shortness of breath which are not in the expected output. No mention of musculoskeletal pain or anxiety., error: None)
  - ❌ Relevance (GEval) (score: 0.18028202118595973, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text mentions chest tightness and pain with elevated white cells, but focuses more on cardiac issues rather than considering musculoskeletal pain or anxiety. It also suggests anti-angina medications without exploring other potential causes like musculoskeletal or anxiety-related issues., error: None)
  - ✅ Coherence (GEval) (score: 0.7631269093629683, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Clear presentation of ideas with some minor disjointedness but main flow is clear. Specific information provided about symptoms, tests, po




Evaluating FineTuned_Medalpaca:  82%|████████▏ | 18/22 [02:04<00:25,  6.36s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:05,  5.30s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.18273132886431107, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The Actual Output does not align with the Expected Output in terms of specific medical conditions mentioned and treatment recommendations provided., error: None)
  - ❌ Relevance (GEval) (score: 0.22793794467218764, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text provides information about potential causes of buttocks pain, but the focus is more on diagnostic steps and possible conditions rather than addressing the likelihood of lumbar disc disease or sciatica based on the symptoms provided in the question., error: None)
  - ✅ Coherence (GEval) (score: 0.8509911107539617, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Text is generally organized and clear, with a detailed explanation of potential causes of pain. Minor disjointedness, but main flow is understandable., error: None)
 




Evaluating FineTuned_Medalpaca:  86%|████████▋ | 19/22 [02:10<00:18,  6.24s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:04,  4.85s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.16100760827388785, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output provides multiple possible causes and practical steps rather than a specific recommendation based on the symptoms mentioned in the expected output., error: None)
  - ❌ Relevance (GEval) (score: 0.18540101055154184, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The content covers symptoms related to Eustachian tube dysfunction, tinnitus, and earwax buildup, but includes additional information and advice not specifically requested., error: None)
  - ✅ Coherence (GEval) (score: 0.8200217668631421, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The message is structured and provides information on possible causes, practical steps, and advice, but some minor disjointedness is present., error: None)
  - ✅ Fluency (GEval) (score: 0.9482309047224046, threshold: 0.7, strict: Fal




Evaluating FineTuned_Medalpaca:  91%|█████████ | 20/22 [02:16<00:12,  6.03s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:06,  6.80s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.21398862355923298, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text mentions some relevant medical facts but does not align with the expected output provided. The information on PCOS diagnosis criteria and treatment options is not accurate., error: None)
  - ❌ Relevance (GEval) (score: 0.22284192110713458, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The response covers topics of PCOS possibility, treatment options, Clomiphene, and progesterone therapy, but lacks the required details on anovulatory cycle, PCOD diagnosis criteria, AMH test, TSH levels, partner's semen analysis, and importance of ruling out male infertility before contemplating Clomid., error: None)
  - ✅ Coherence (GEval) (score: 0.7928784285286858, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The message is well-organized and provides structured information about PCOS diagnos




Evaluating FineTuned_Medalpaca:  95%|█████████▌| 21/22 [02:23<00:06,  6.46s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:05,  5.91s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.277310756667762, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text provides information about HMPV during pregnancy and gives suggestions for managing symptoms, but does not align with the expected output in terms of specific details and clarity., error: None)
  - ❌ Relevance (GEval) (score: 0.4219945271758408, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Some information provided aligns with the topic, but lacks depth and specifics compared to the expected output., error: None)
  - ✅ Coherence (GEval) (score: 0.7825901769384657, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The message is well-organized and provides structured suggestions for managing symptoms. The information is clear and understandable., error: None)
  - ✅ Fluency (GEval) (score: 0.8216761100162291, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Over




Evaluating FineTuned_Medalpaca: 100%|██████████| 22/22 [02:30<00:00,  6.83s/it]


### GEval Results for Gemma Medical QnA ###
                 Model  Factual Accuracy  Relevance  Coherence  Fluency  \
0  FineTuned_Medalpaca              0.18       0.21        0.8     0.87   

   Clinical Safety  Completeness  Empathy  Specificity  Hallucination  
0             0.23           0.2     0.98         0.24           0.82  





In [54]:
# -------------------------
# Run GEval Evaluation
# -------------------------
# Set debug=True initially; set to False once you're satisfied with the raw output.#gpt4_predictions
gpt_results = evaluate_predictions(questions, references, gpt_predictions, model_name="GPT4", batch_size=1, debug=False)

# Compile results into a DataFrame.
gpt_results_df = pd.DataFrame({
    "Model": ["GPT4"],
    **{dim: [gpt_results.get(dim)] for dim in gpt_results}
})

print("\n### GEval Results for Gemma Medical QnA ###")
print(gpt_results_df.round(2))

Evaluating GPT4:   0%|          | 0/22 [00:00<?, ?it/s]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:06,  6.99s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.19481678971400387, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The Actual Output provides detailed information on managing COPD symptoms but lacks specific medical facts and treatments mentioned in the Expected Output., error: None)
  - ❌ Relevance (GEval) (score: 0.216000711542143, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The Actual Output does not cover the core topic of how COPD causes airway inflammation, sputum production, and worsens with infection. The expected output also includes specific recommendations like enrolling in a pulmonary rehabilitation program and considering expectorants, which are not present in the Actual Output., error: None)
  - ✅ Coherence (GEval) (score: 0.9327561929997058, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Structured information about COPD symptoms, medications, triggers, and when to seek medical atten




Evaluating GPT4:   5%|▍         | 1/22 [00:07<02:40,  7.64s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:06,  6.64s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.22471257581911314, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The Actual Output provides additional information and context beyond the Expected Output, while maintaining factual reasonability and accuracy., error: None)
  - ❌ Relevance (GEval) (score: 0.20977462937317445, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output provides information on doxycycline side effects and potential causes of headaches, but the content is not condensed and lacks direct alignment with the expected output., error: None)
  - ✅ Coherence (GEval) (score: 0.8452602853312456, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text is well-organized and provides clear information, but there are some minor disjointedness in transitions between ideas., error: None)
  - ✅ Fluency (GEval) (score: 0.9596567831354192, threshold: 0.7, strict: False, evaluation model




Evaluating GPT4:   9%|▉         | 2/22 [00:14<02:28,  7.42s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:05,  5.21s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.29715052713972095, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The information provided aligns with the expected output, but there are additional recommendations and details not specified., error: None)
  - ❌ Relevance (GEval) (score: 0.34570842861933015, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Some information provided in the Actual Output is not directly relevant to the question and additional details are given beyond the Expected Output., error: None)
  - ✅ Coherence (GEval) (score: 0.8503257228134713, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The information provided is structured, coherent and mainly well-organized. However, minimal disjointedness can be observed in the discussion of different treatment options., error: None)
  - ✅ Fluency (GEval) (score: 0.9435670166115676, threshold: 0.7, strict: False, evaluation model: gpt-3.5-tu




Evaluating GPT4:  14%|█▎        | 3/22 [00:20<02:07,  6.70s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:06,  6.94s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.18689026008523354, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Some information is correct, but the actual output contains more detailed information than necessary and lacks urgency found in the expected output., error: None)
  - ❌ Relevance (GEval) (score: 0.2101384622250409, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output provides detailed information about symptoms, causes, and treatment options, which are relevant, but the content is not concise enough and lacks urgency compared to the expected output., error: None)
  - ✅ Coherence (GEval) (score: 0.9626242418727772, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Clear, structured presentation of ideas showing concern for symptoms and providing detailed information about septic arthritis and necessary medical attention., error: None)
  - ✅ Fluency (GEval) (score: 0.97809625207959




Evaluating GPT4:  18%|█▊        | 4/22 [00:28<02:06,  7.05s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:05,  5.57s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.289109924903178, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text provides accurate information about the symptoms, potential causes, and the importance of seeking medical attention, but it includes additional details not present in the expected output., error: None)
  - ❌ Relevance (GEval) (score: 0.32080735943372535, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text is not concise and does not fully cover the core topic. Actual output provides detailed medical advice and information not directly related to the question., error: None)
  - ✅ Coherence (GEval) (score: 0.9256849189610868, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Understandable presentation of ideas, well-organized content with clear flow., error: None)
  - ✅ Fluency (GEval) (score: 0.9840561690472152, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reaso




Evaluating GPT4:  23%|██▎       | 5/22 [00:34<01:54,  6.76s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:06,  6.02s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.30883951503136886, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text provides relevant information but includes additional details that are not necessary for the evaluation. Actual Output contains specific details not found in Expected Output., error: None)
  - ❌ Relevance (GEval) (score: 0.4390634463047558, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Most of the content is relevant, but it includes additional information not present in the expected output., error: None)
  - ✅ Coherence (GEval) (score: 0.8753084771196683, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The information provided is well-organized and clear, addressing different aspects of birth control and emergency contraception., error: None)
  - ✅ Fluency (GEval) (score: 0.945547668759193, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The response is r




Evaluating GPT4:  27%|██▋       | 6/22 [00:41<01:47,  6.73s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:06,  6.05s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.2527342335169206, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Actual Output contains extensive details and explanations, while Expected Output provides concise and clear instructions., error: None)
  - ❌ Relevance (GEval) (score: 0.2911810986268704, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Actual output contains excessive details and does not summarize the information concisely like the expected output., error: None)
  - ✅ Coherence (GEval) (score: 0.9027808404216643, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The message is generally well-organized and understandable but lacks some structured presentation of ideas., error: None)
  - ✅ Fluency (GEval) (score: 0.9468165840900795, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Some sentences are long and may be difficult to follow. Minor grammatical errors present., e




Evaluating GPT4:  32%|███▏      | 7/22 [00:48<01:41,  6.77s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:07,  7.63s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.19963083974605675, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output contains additional information not present in the expected output, making it less concise., error: None)
  - ❌ Relevance (GEval) (score: 0.20514396187072811, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text covers varicocele surgery and implications on sperm count, but lacks information on assisted reproductive techniques and specific treatment options like aromatase inhibitors., error: None)
  - ✅ Coherence (GEval) (score: 0.894735819688804, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The message is clear and well-organized overall. Specific information about the diagnosis, treatment options, and considerations is provided., error: None)
  - ✅ Fluency (GEval) (score: 0.9534170323585437, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: T




Evaluating GPT4:  36%|███▋      | 8/22 [00:56<01:41,  7.25s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:07,  7.19s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.18587552393395518, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output provides additional information beyond the expected output, such as possible causes like crown fit or unseen fractures, which may lead to a more thorough evaluation by a dentist or endodontist., error: None)
  - ❌ Relevance (GEval) (score: 0.21774054986690708, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Actual output covers multiple potential reasons for tenderness post-root canal, which aligns with the topic. However, the text lacks specific treatment recommendations like antibiotics or anti-inflammatories as in the expected output., error: None)
  - ✅ Coherence (GEval) (score: 0.8519256336965894, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The message is well-organized and understandable, addressing potential reasons for post-treatment tenderness., error: None)
 




Evaluating GPT4:  41%|████      | 9/22 [01:04<01:36,  7.42s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:06,  6.50s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.3696593713786581, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Mentions lipoma and inguinal hernia, which align with expected output. However, the description lacks conciseness and precision found in the expected output., error: None)
  - ❌ Relevance (GEval) (score: 0.3359985353795681, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output contains information about potential causes such as lipoma, inguinal hernia, muscle knot, and swollen lymph nodes, which covers the core topic. However, the text is detailed and may not be fully aligned with the expected output., error: None)
  - ✅ Coherence (GEval) (score: 0.8793318966297926, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The content is well-structured, clear, and provides detailed information on potential causes of lumps in the hip flexor area., error: None)
  - ✅ Fluency (GEval) (score:




Evaluating GPT4:  45%|████▌     | 10/22 [01:11<01:28,  7.34s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:09,  9.06s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.3559768996613629, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The information provided aligns with the expected output, describing the renal cysts as likely two distinct simple renal cysts, common and benign in many adults. The mention of no treatment needed unless symptoms arise and monitoring with periodic ultrasound is consistent with the expected information., error: None)
  - ❌ Relevance (GEval) (score: 0.4546084806274685, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text contains additional information not present in the expected output. It discusses the anatomy, types of renal cysts, treatment options, and the importance of consulting a healthcare provider. However, it still covers the core topic of identifying renal cysts from MRI findings., error: None)
  - ✅ Coherence (GEval) (score: 0.9464307457821016, threshold: 0.7, strict: False, evaluation model: gpt-3.5




Evaluating GPT4:  50%|█████     | 11/22 [01:21<01:28,  8.06s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:06,  6.32s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.2161390349713888, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The Actual Output includes information about VBAC, risks of pregnancy at an older age, and the importance of thorough evaluation by a healthcare provider. However, it lacks specific details such as prior C-section reason, uterine scar integrity, and recommendations for preconception counseling and ultrasound as outlined in the Expected Output., error: None)
  - ❌ Relevance (GEval) (score: 0.22270074135457155, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The Actual Output covers a wide range of information related to VBAC, maternal age risks, complications, benefits, and decision-making process, but lacks concise and focused content as seen in the Expected Output., error: None)
  - ✅ Coherence (GEval) (score: 0.7994804409687125, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Provides stru




Evaluating GPT4:  55%|█████▍    | 12/22 [01:27<01:17,  7.73s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:05,  5.32s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.2279661661142892, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output contains more detailed and informational content compared to the expected output. The explanation on the impact of blocked fallopian tubes and the importance of consulting a fertility specialist are not present in the expected output., error: None)
  - ❌ Relevance (GEval) (score: 0.224341817013359, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output contains a lot of additional information not present in the expected output, making it less focused on the core topic., error: None)
  - ✅ Coherence (GEval) (score: 0.9050164278103805, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text demonstrates a structured, understandable presentation of ideas with clear flow and relevant information., error: None)
  - ✅ Fluency (GEval) (score: 0.9686770357714624, thresh




Evaluating GPT4:  59%|█████▉    | 13/22 [01:33<01:04,  7.19s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:04,  4.85s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.2284672035324642, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The Actual Output provides a detailed description of the symptoms and their possible causes, however, it lacks the concise and direct approach in the Expected Output., error: None)
  - ❌ Relevance (GEval) (score: 0.24227920506091744, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Some relevant information about symptoms and possible causes, but lacks concise recommendations and specific tests/treatments mentioned in expected output., error: None)
  - ✅ Coherence (GEval) (score: 0.8794291226090568, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The message is clear and well-organized, addressing various potential causes of the symptoms described., error: None)
  - ✅ Fluency (GEval) (score: 0.9206501553782902, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The respon




Evaluating GPT4:  64%|██████▎   | 14/22 [01:39<00:53,  6.68s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:05,  5.57s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.3504478386874828, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The information provided aligns with the core medical facts and gives possible explanations for the symptoms, but does not precisely match the expected output., error: None)
  - ❌ Relevance (GEval) (score: 0.293683799197259, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output provides a thorough explanation of the symptoms and possible reasons for them, but lacks specific details and instructions for managing the asthma condition as per the expected output., error: None)
  - ✅ Coherence (GEval) (score: 0.9084411882873493, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Structured presentation of ideas, main flow is clear, message is understandable and well-organized., error: None)
  - ✅ Fluency (GEval) (score: 0.9567441526341363, threshold: 0.7, strict: False, evaluation model:




Evaluating GPT4:  68%|██████▊   | 15/22 [01:45<00:45,  6.55s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:04,  4.64s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.15093244492917915, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text provides detailed strategies for feeding infants with feeding difficulties, but does not focus on the core medical facts and alternatives like Alimentum mentioned in the expected output., error: None)
  - ❌ Relevance (GEval) (score: 0.18064345566715084, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text provides detailed strategies for introducing a new formula to an infant with feeding difficulties but lacks mention of consulting a pediatrician for alternative recommendations like Alimentum or nasogastric feeding as in the expected output., error: None)
  - ✅ Coherence (GEval) (score: 0.8444111166665816, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text presents structured information about introducing Nutramigen A+ formula to an infant with a complex medical history,




Evaluating GPT4:  73%|███████▎  | 16/22 [01:51<00:37,  6.17s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:05,  5.70s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.1988907254657533, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output contains additional information not present in the expected output, such as the mention of STIs and the importance of using protection., error: None)
  - ❌ Relevance (GEval) (score: 0.22560562368875373, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: There are significant differences in content between the Actual Output and Expected Output., error: None)
  - ✅ Coherence (GEval) (score: 0.9240527259298987, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The message is well organized and provides clear information about pregnancy and sexual activity during pregnancy., error: None)
  - ✅ Fluency (GEval) (score: 0.994196638358007, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The response is readable and free from disruptive errors., error: None)
  - ❌




Evaluating GPT4:  77%|███████▋  | 17/22 [01:57<00:31,  6.23s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:05,  5.99s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.2512884403923316, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Mentions possible causes for symptoms like costochondritis and GERD but lacks specificity compared to expected output., error: None)
  - ❌ Relevance (GEval) (score: 0.23844925315387103, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The Actual Output covers various potential causes for the symptoms, similar to the Expected Output, but does not provide concise recommendations like nerve conduction studies, spinal evaluation, or psychiatric consultation for stress-related issues., error: None)
  - ✅ Coherence (GEval) (score: 0.9152163933544623, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The message is understandable and well-organized. Specific symptoms, tests, and potential conditions are clearly explained., error: None)
  - ✅ Fluency (GEval) (score: 0.9501639271205045, threshold: 0.7, 




Evaluating GPT4:  82%|████████▏ | 18/22 [02:04<00:25,  6.36s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:05,  5.30s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.23727359543199142, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Actual Output provides detailed information on MRI, soft tissue assessment, and potential diagnoses, which aligns with Expected Output for the most part., error: None)
  - ❌ Relevance (GEval) (score: 0.25482571850933255, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output covers a broader range of potential issues and diagnostic considerations compared to the expected output., error: None)
  - ✅ Coherence (GEval) (score: 0.9177217996267627, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text presents structured information on the need for an MRI and advises consulting with a healthcare provider before proceeding., error: None)
  - ✅ Fluency (GEval) (score: 0.9685130383224403, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Readability, clarity, and




Evaluating GPT4:  86%|████████▋ | 19/22 [02:09<00:18,  6.24s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:06,  6.64s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.21067134042284308, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The Actual Output contains additional information and mentions various possible causes of ear symptoms, whereas the Expected Output provides more specific recommendations for addressing eustachian tube dysfunction., error: None)
  - ❌ Relevance (GEval) (score: 0.23066630087685125, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text covers different possible conditions causing ear issues, but only briefly mentions Eustachian tube dysfunction without providing specific recommendations on addressing it., error: None)
  - ✅ Coherence (GEval) (score: 0.8791143393315322, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The message is well-organized and provides structured information about possible causes of ear popping and discomfort., error: None)
  - ✅ Fluency (GEval) (score: 0.95175593931




Evaluating GPT4:  91%|█████████ | 20/22 [02:17<00:13,  6.57s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:09,  9.76s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.1940534797653724, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The information provided includes details about PCOS symptoms and potential treatments, but lacks specific criteria outlined in the Expected Output such as the presence of multiple small subcenterimetric follicles, clinical hyperandrogenism, and the importance of partner's semen analysis., error: None)
  - ❌ Relevance (GEval) (score: 0.19626106856108588, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text provides information on Polycystic Ovary Syndrome (PCOS), but does not directly address the specific criteria outlined in the Expected Output. It talks about symptoms of PCOS, hormone levels, diagnostic tests, and treatment options, but does not mention criteria such as subcentimetric follicles, clinical hyperandrogenism, or the need for partner's semen analysis., error: None)
  - ✅ Coherence (GEval) (score: 




Evaluating GPT4:  95%|█████████▌| 21/22 [02:27<00:07,  7.72s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:06,  6.71s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.24308143131571916, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The information provided aligns with the core medical facts and addresses symptom relief and precautions during pregnancy. However, the response lacks some clarity and conciseness compared to the expected output., error: None)
  - ❌ Relevance (GEval) (score: 0.28793744994570525, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The response provides relevant information about HMPV and its impact during pregnancy, but lacks the personalized tone and specific medication recommendations found in the expected output., error: None)
  - ✅ Coherence (GEval) (score: 0.8564352218039353, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The message is clear and well-organized with structured presentation of information., error: None)
  - ✅ Fluency (GEval) (score: 0.9774555143898714, threshold: 0.7, stric




Evaluating GPT4: 100%|██████████| 22/22 [02:35<00:00,  7.06s/it]


### GEval Results for Gemma Medical QnA ###
  Model  Factual Accuracy  Relevance  Coherence  Fluency  Clinical Safety  \
0  GPT4              0.24       0.27       0.89     0.95             0.34   

   Completeness  Empathy  Specificity  Hallucination  
0          0.29     0.91         0.36           0.73  





In [55]:
#questions[0]

In [56]:
def load_data(file_path="/content/BioMedLM_predictions (1).csv"):
    """Load dataset and limit to a small sample for Kaggle compatibility."""
    df = pd.read_csv(file_path)
    print(f"Total examples in dataset: {len(df)}")
    gpt_predictions = df['Model_Answer'].fillna("").tolist()
    return gpt_predictions

biomed_predictions = load_data()

Total examples in dataset: 20


In [57]:
# -------------------------
# Run GEval Evaluation
# -------------------------
# Set debug=True initially; set to False once you're satisfied with the raw output.#gpt4_predictions
bio_results = evaluate_predictions(questions[:20], references[:20], biomed_predictions, model_name="Biomed", batch_size=1, debug=False)

# Compile results into a DataFrame.
bio_results_df = pd.DataFrame({
    "Model": ["Biomed"],
    **{dim: [bio_results.get(dim)] for dim in bio_results}
})

print("\n### GEval Results for Gemma Medical QnA ###")
print(bio_results_df.round(2))

Evaluating Biomed:   0%|          | 0/20 [00:00<?, ?it/s]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:06,  6.29s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.04586757917779223, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The Actual Output does not align with the Expected Output in terms of content and relevance to the evaluation steps provided., error: None)
  - ❌ Relevance (GEval) (score: 0.12563474849672968, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The content is not contextually aligned with the question, as it discusses antibiotic resistance in *Escherichia coli* O157:H7, while the expected output is about COPD and chest physiotherapy., error: None)
  - ❌ Coherence (GEval) (score: 0.32393906722069565, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text contains a structured citation but lacks coherent content, making it difficult to follow., error: None)
  - ❌ Fluency (GEval) (score: 0.19080162707197829, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The response has




Evaluating Biomed:   5%|▌         | 1/20 [00:06<02:12,  6.95s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:08,  8.36s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.0759694343033046, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output is highly irrelevant and does not contain any medical information related to asthma, obesity, or diabetes mellitus as expected., error: None)
  - ❌ Relevance (GEval) (score: 0.13106087998467347, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text is not on-topic and does not align with the question. The content provided does not match the expected output in terms of relevance and context., error: None)
  - ❌ Coherence (GEval) (score: 0.22230590880750944, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The message is somewhat structured but lacks clarity due to the excessive use of technical terms without clear context., error: None)
  - ❌ Fluency (GEval) (score: 0.18485515479465578, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text lacks 




Evaluating Biomed:  10%|█         | 2/20 [00:15<02:26,  8.16s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:04,  4.08s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.1293093044734061, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The Actual Output does not align with the Expected Output in terms of content related to medical advice and treatment for candida infection., error: None)
  - ❌ Relevance (GEval) (score: 0.06022121286225891, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output does not align with the expected output. The content is completely different and not contextually relevant., error: None)
  - ❌ Coherence (GEval) (score: 0.4672850859241272, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text has structured presentation but some disjointedness present., error: None)
  - ❌ Fluency (GEval) (score: 0.2589153120643696, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text lacks smooth sentence construction and clarity., error: None)
  - ❌ Clinical Safety (GEval)




Evaluating Biomed:  15%|█▌        | 3/20 [00:20<01:51,  6.58s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:04,  4.29s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.06411421909970663, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text contains irrelevant and confusing information not related to measuring quality of life of patients with chronic pain. It does not align with the expected output which provides information on loss of sensation and severe infection., error: None)
  - ❌ Relevance (GEval) (score: 0.07696954924356678, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text is not on-topic or contextually aligned with the question. The content is not meaningfully relevant., error: None)
  - ❌ Coherence (GEval) (score: 0.11846371931004762, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Text is not structured, understandable, or well-organized. Contains repeated and nonsensical symbols., error: None)
  - ❌ Fluency (GEval) (score: 0.19482843402433966, threshold: 0.7, strict: False, evaluation model: gpt-3




Evaluating Biomed:  20%|██        | 4/20 [00:25<01:35,  5.98s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:05,  5.51s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.12002671243493354, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Actual Output does not align with Expected Output in terms of content and relevance., error: None)
  - ❌ Relevance (GEval) (score: 0.12830685991190785, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Actual Output contains detailed information related to protein kinase C activator and apoptosis in human cells, which is not contextually aligned with the Expected Output regarding an infected sebaceous cyst., error: None)
  - ❌ Coherence (GEval) (score: 0.36260633139085907, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The main flow is somewhat clear, but minor disjointedness is present with lack of clear organization in the text., error: None)
  - ❌ Fluency (GEval) (score: 0.2426123956475832, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text has several gramma




Evaluating Biomed:  25%|██▌       | 5/20 [00:31<01:30,  6.04s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:06,  6.98s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.1621746161373701, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text includes irrelevant information from the Actual Output and lacks the specific details mentioned in the Expected Output., error: None)
  - ❌ Relevance (GEval) (score: 0.2025379720811597, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text partially covers the core topic but lacks details and alignment with the question. Actual Output is more focused on the World Health Organization, while Expected Output provides specific information on pregnancy risk and emergency contraception., error: None)
  - ❌ Coherence (GEval) (score: 0.26419915826851315, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Some disjointedness in the structure, but main flow is clear. Use of multiple headings and sections observed., error: None)
  - ❌ Fluency (GEval) (score: 0.2119031953672128, threshold: 0.7,




Evaluating Biomed:  30%|███       | 6/20 [00:39<01:32,  6.58s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:10, 10.69s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.048635714322872295, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output is completely unrelated to the expected output in terms of content and context., error: None)
  - ❌ Relevance (GEval) (score: 0.06769095888384047, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text does not align with the expected output regarding the discussion on interferon beta, natriuretic peptides, and hypertension detection in rats., error: None)
  - ❌ Coherence (GEval) (score: 0.34755118932352275, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Minor disjointedness is present, with multiple topics discussed without clear organization., error: None)
  - ❌ Fluency (GEval) (score: 0.21369371387457714, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The response lacks smooth sentence construction and clarity, with excessive use of technica




Evaluating Biomed:  35%|███▌      | 7/20 [00:50<01:45,  8.13s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:06,  6.22s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.044850286282790114, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text is not factually reasonable or accurate, with no alignment to the expected output., error: None)
  - ❌ Relevance (GEval) (score: 0.16558193193753895, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output does not align with the expected output in terms of content and relevance., error: None)
  - ❌ Coherence (GEval) (score: 0.3204394670237617, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text has some disjointedness, but the main flow is clear with a focus on the management of a patient with recurrent abdominal pain., error: None)
  - ❌ Fluency (GEval) (score: 0.2200127679461547, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text has smooth sentence construction and clarity., error: None)
  - ❌ Clinical Safety (GEval) (score: 0.03832




Evaluating Biomed:  40%|████      | 8/20 [00:57<01:32,  7.73s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:04,  4.32s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.08312463695245041, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text does not align with the expected output provided., error: None)
  - ❌ Relevance (GEval) (score: 0.14078556204579093, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Only a small portion of the content is relevant to the question, focusing on the authors and study, not addressing the specific issue of persistent tenderness and possible treatment options., error: None)
  - ❌ Coherence (GEval) (score: 0.43856556928362717, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text is understandable with information about the authors and the study they conducted. The structure could be improved by organizing the content better., error: None)
  - ❌ Fluency (GEval) (score: 0.39903999871168905, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: There are content gaps in 




Evaluating Biomed:  45%|████▌     | 9/20 [01:02<01:15,  6.88s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:08,  8.17s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.06442453141538752, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text contains irrelevant information about HPV and cancer types, not aligning with the expected output., error: None)
  - ❌ Relevance (GEval) (score: 0.10328646772155543, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text does not fully address the topic of cervical cancer and HPV as expected. The content is mostly focused on HPV-related cancers and not specifically on the pathogenesis of cervical cancer., error: None)
  - ❌ Coherence (GEval) (score: 0.21751470011319504, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Minor disjointedness is present, but the main flow is clear., error: None)
  - ❌ Fluency (GEval) (score: 0.17808894853327262, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text has multiple unrelated and unclear statements, making it dif




Evaluating Biomed:  50%|█████     | 10/20 [01:11<01:15,  7.53s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:06,  6.30s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.066750312737009, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Actual Output is completely different from Expected Output., error: None)
  - ❌ Relevance (GEval) (score: 0.09147601639767397, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Actual output does not align with the expected output as it includes unrelated text and does not address the core topic., error: None)
  - ❌ Coherence (GEval) (score: 0.19957708784704084, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Minor disjointedness present in the text., error: None)
  - ❌ Fluency (GEval) (score: 0.17617337011591375, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text lacks smooth sentence construction and clarity. There are multiple errors and inconsistencies present., error: None)
  - ❌ Clinical Safety (GEval) (score: 0.048307353939043556, threshold: 0.9, strict: Fal




Evaluating Biomed:  55%|█████▌    | 11/20 [01:18<01:06,  7.35s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:05,  5.91s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.07516591838357115, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The Actual Output does not align with the Expected Output in terms of content and relevance., error: None)
  - ❌ Relevance (GEval) (score: 0.1003931255205299, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The Actual Output does not align with the Expected Output as it covers a different topic related to twin pregnancies and IVF, while the expected output discusses VBAC and preconception counseling., error: None)
  - ❌ Coherence (GEval) (score: 0.18163588559462415, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text is not structured or well-organized, containing fragmented information without a clear flow., error: None)
  - ❌ Fluency (GEval) (score: 0.16924103526836276, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text lacks smooth sentence construction




Evaluating Biomed:  60%|██████    | 12/20 [01:25<00:57,  7.16s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:04,  4.52s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.02379030267808164, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output does not align with the expected output in terms of content and relevance., error: None)
  - ❌ Relevance (GEval) (score: 0.11601356631778721, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text is not contextually aligned with the question and does not cover the core topic sufficiently., error: None)
  - ❌ Coherence (GEval) (score: 0.3053144828668676, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The presentation of ideas is unclear due to random characters in the text., error: None)
  - ❌ Fluency (GEval) (score: 0.20982628383422366, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text has minor grammatical errors and lacks clarity in the content presented., error: None)
  - ❌ Clinical Safety (GEval) (score: 0.04582740300689621, threshold




Evaluating Biomed:  65%|██████▌   | 13/20 [01:30<00:45,  6.56s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:03,  3.18s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.1898953857431888, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output does not include any medical facts or information similar to the expected output., error: None)
  - ❌ Relevance (GEval) (score: 0.2212112268100505, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text does not align contextually with the question and lacks meaningful relevance to the expected output., error: None)
  - ❌ Coherence (GEval) (score: 0.6225965172293417, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text lacks structured presentation of ideas with excessive dashes and symbols., error: None)
  - ❌ Fluency (GEval) (score: 0.45227335900772586, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text contains a lot of unnecessary characters and symbols that disrupt readability., error: None)
  - ❌ Clinical Safety (GEval) (score: 0.18




Evaluating Biomed:  70%|███████   | 14/20 [01:34<00:34,  5.74s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:06,  6.39s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.0247030201117996, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output is not related to the expected output. It is a case report of a patient with severe asthma, not providing specific medical advice as expected., error: None)
  - ❌ Relevance (GEval) (score: 0.12230941069564631, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The Actual Output contains extensive irrelevant text not related to the Expected Output. Multiple unrelated phrases indicate lack of contextual alignment., error: None)
  - ❌ Coherence (GEval) (score: 0.19085614323543942, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Minor disjointedness is present, but the main flow is clear with information on a patient with severe asthma and anaphylaxis during pregnancy., error: None)
  - ❌ Fluency (GEval) (score: 0.20230976232126113, threshold: 0.7, strict: False, evaluation model:




Evaluating Biomed:  75%|███████▌  | 15/20 [01:41<00:30,  6.13s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:08,  8.77s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.1317482100494347, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text is not related to the expected output in terms of content and context., error: None)
  - ❌ Relevance (GEval) (score: 0.1786504143251271, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output is not on-topic and contextually aligned with the question. It does not cover the core topic of avoiding adding flavors due to fragile intestines or consulting a pediatrician for alternatives., error: None)
  - ❌ Coherence (GEval) (score: 0.270695266986332, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Minor disjointedness is present, but the main flow is clear. The text discusses the role of CSF pressure and intracranial pressure monitoring in the management of hydrocephalus, with some references to relevant case reports and literature., error: None)
  - ❌ Fluency (GEval) (score: 




Evaluating Biomed:  80%|████████  | 16/20 [01:50<00:28,  7.12s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:05,  5.03s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.1721593215669554, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output includes mention of early pregnancy symptoms, while the expected output talks about avoiding intercourse until after 12 weeks due to miscarriage risk and low-lying placenta after 12 weeks., error: None)
  - ❌ Relevance (GEval) (score: 0.2101945472805004, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Actual Output contains information about Citation and Introduction, which is not relevant. Expected Output provides information related to symptoms of early pregnancy and precautions for intercourse, but actual output does not align with this., error: None)
  - ❌ Coherence (GEval) (score: 0.3206098081067509, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text has some minor disjointedness, but the main flow is clear., error: None)
  - ❌ Fluency (GEval) (score: 0.152130192




Evaluating Biomed:  85%|████████▌ | 17/20 [01:56<00:20,  6.69s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:04,  4.63s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.00554737498694665, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output does not provide any relevant medical information or answer to the patient's question. It is unrelated and does not align with the expected output at all., error: None)
  - ❌ Relevance (GEval) (score: 0.03402045387964357, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text does not align with the question and contains irrelevant content, resulting in a low score., error: None)
  - ❌ Coherence (GEval) (score: 0.18360658747962316, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text lacks structured presentation and coherence. It is challenging to understand due to the disjointedness and lack of clear flow., error: None)
  - ❌ Fluency (GEval) (score: 0.19110733450043832, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: There are major grammati




Evaluating Biomed:  90%|█████████ | 18/20 [02:01<00:12,  6.28s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:09,  9.74s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.11947559232096017, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output contains irrelevant information about the HPV E7 protein and p53 regulation, which does not align with the expected output criteria related to lumbar disc disease and sciatica., error: None)
  - ❌ Relevance (GEval) (score: 0.18999360292519069, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text contains information on the HPV E7 protein and its role in cervical cancer, which is not contextually aligned with the question about lumbar disc disease and sciatica., error: None)
  - ❌ Coherence (GEval) (score: 0.66373910658324, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text is somewhat structured and understandable, but lacks clarity in linking ideas between sentences., error: None)
  - ❌ Fluency (GEval) (score: 0.6513005381695991, threshold: 0.7, strict: False, e




Evaluating Biomed:  95%|█████████▌| 19/20 [02:12<00:07,  7.58s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:05,  5.59s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.14236812874362256, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text does not align with the expected output provided., error: None)
  - ❌ Relevance (GEval) (score: 0.17560524781143444, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The content covers the main topic of the human papillomavirus (HPV) vaccine in relation to cervical cancer, however, there are some inaccuracies and lack of clarity in the text., error: None)
  - ❌ Coherence (GEval) (score: 0.22079207911426563, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Minor disjointedness in the text with unclear connection between HPV, cervical cancer, and the N-terminal domain., error: None)
  - ❌ Fluency (GEval) (score: 0.2170882633014115, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The response contains multiple grammatical errors and lacks clarity., error: None)
 




Evaluating Biomed: 100%|██████████| 20/20 [02:18<00:00,  6.94s/it]


### GEval Results for Gemma Medical QnA ###
    Model  Factual Accuracy  Relevance  Coherence  Fluency  Clinical Safety  \
0  Biomed              0.09       0.13       0.31     0.25             0.05   

   Completeness  Empathy  Specificity  Hallucination  
0          0.08     0.26         0.12            1.0  





In [58]:
def load_data(file_path="/content/PubMedGPT_predictions.csv"):
    """Load dataset and limit to a small sample for Kaggle compatibility."""
    df = pd.read_csv(file_path)
    print(f"Total examples in dataset: {len(df)}")
    gpt_predictions = df['Model_Answer'].fillna("").tolist()
    return gpt_predictions

pubmedgpt_predictions = load_data()

Total examples in dataset: 20


In [59]:
# -------------------------
# Run GEval Evaluation
# -------------------------
# Set debug=True initially; set to False once you're satisfied with the raw output.#gpt4_predictions
pubmedgpt_results = evaluate_predictions(questions[:20], references[:20], pubmedgpt_predictions, model_name="PubMedGpt", batch_size=1, debug=False)

# Compile results into a DataFrame.
pubmedgpt_results_df = pd.DataFrame({
    "Model": ["PubMedGpt"],
    **{dim: [pubmedgpt_results.get(dim)] for dim in pubmedgpt_results}
})

print("\n### GEval Results for Gemma Medical QnA ###")
print(pubmedgpt_results_df.round(2))

Evaluating PubMedGpt:   0%|          | 0/20 [00:00<?, ?it/s]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:07,  7.13s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.06977914425513569, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output does not align with the expected output in terms of content and format., error: None)
  - ❌ Relevance (GEval) (score: 0.06293833017951457, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The content is not contextually aligned with the question as it discusses COPD treatment options instead of the provided citation. Also, the formatting and symbols in the output do not align with the expected format., error: None)
  - ❌ Coherence (GEval) (score: 0.18432708420552382, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Some parts of the text seem disjointed, as indicated by random characters and formatting issues in the actual output., error: None)
  - ❌ Fluency (GEval) (score: 0.11960703468198579, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text 




Evaluating PubMedGpt:   5%|▌         | 1/20 [00:07<02:27,  7.77s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:06,  6.19s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.07606031123722579, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output does not provide relevant information similar to the expected output. It is lengthy and does not address the core medical facts effectively., error: None)
  - ❌ Relevance (GEval) (score: 0.1815659695903862, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The content is not contextually aligned with the question and lacks relevance to the expected output provided., error: None)
  - ❌ Coherence (GEval) (score: 0.3565132230667967, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Main flow is clear but there are minor disjointedness issues in the presentation of ideas., error: None)
  - ❌ Fluency (GEval) (score: 0.2159255428617377, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The response has smooth sentence construction and clarity, but there are mul




Evaluating PubMedGpt:  10%|█         | 2/20 [00:14<02:10,  7.23s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:03,  3.75s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.1336773233987127, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output does not match the expected output. The actual output contains information on the effect of the food environment in the workplace, whereas the expected output provides information on managing persistent red spots indicating incomplete resolution of candida infection., error: None)
  - ❌ Relevance (GEval) (score: 0.10659200404664108, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output does not match the expected output in terms of content and context., error: None)
  - ❌ Coherence (GEval) (score: 0.4672850859241272, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Minor disjointedness in the title and lack of clear content structure., error: None)
  - ❌ Fluency (GEval) (score: 0.2767933325619729, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, re




Evaluating PubMedGpt:  15%|█▌        | 3/20 [00:19<01:40,  5.93s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:05,  5.54s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.0856493180986844, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text does not provide information related to the expected output., error: None)
  - ❌ Relevance (GEval) (score: 0.1113445287422133, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The content is not on-topic and does not provide information related to measuring the quality of life of patients with chronic pain as requested in the question., error: None)
  - ❌ Coherence (GEval) (score: 0.2234862933892126, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Minor disjointedness is observed in the text despite an attempt at organization., error: None)
  - ❌ Fluency (GEval) (score: 0.1888338767316286, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The response is readable and free from disruptive errors., error: None)
  - ❌ Clinical Safety (GEval) (score: 0.0404855546322




Evaluating PubMedGpt:  20%|██        | 4/20 [00:25<01:36,  6.03s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:07,  7.79s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.11201933361516266, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The Actual Output does not align with the Expected Output provided in the evaluation steps., error: None)
  - ❌ Relevance (GEval) (score: 0.13049457361645647, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output contains information related to protein kinase C activator, apoptosis, leukemia cells, and specific proteins, which are not relevant to the expected output related to infected sebaceous cyst, antibiotic, anti-inflammatory medication, and surgeon consultation., error: None)
  - ❌ Coherence (GEval) (score: 0.6183011614638041, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text lacks proper structuring and contains excessive empty lines., error: None)
  - ❌ Fluency (GEval) (score: 0.27649554398675363, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, rea




Evaluating PubMedGpt:  25%|██▌       | 5/20 [00:33<01:43,  6.90s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:04,  4.83s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.1818581930709704, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text contains some correct and relevant medical information, but the actual output is significantly different from the expected output., error: None)
  - ❌ Relevance (GEval) (score: 0.20129580172127487, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The Actual Output does not align with the Expected Output in terms of content relevance and information provided., error: None)
  - ❌ Coherence (GEval) (score: 0.268218209899559, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Minor disjointedness in section titles, but main flow is somewhat clear., error: None)
  - ❌ Fluency (GEval) (score: 0.2080363003454892, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Minor grammatical errors and lack of clarity in the text., error: None)
  - ❌ Clinical Safety (GEval) (score: 0




Evaluating PubMedGpt:  30%|███       | 6/20 [00:39<01:29,  6.42s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:06,  6.67s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.0680394445406823, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The content does not align with the expected output regarding medical facts and is rather irrelevant and misleading., error: None)
  - ❌ Relevance (GEval) (score: 0.04237125103233684, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The content is not on-topic and does not align with the question., error: None)
  - ❌ Coherence (GEval) (score: 0.30512561385478715, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Minor disjointedness in presenting ideas., error: None)
  - ❌ Fluency (GEval) (score: 0.2206456519511733, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text is not coherent and lacks smooth sentence construction. It is a random collection of medical terms., error: None)
  - ❌ Clinical Safety (GEval) (score: 0.027773709110808814, threshold: 0.9, strict: Fals




Evaluating PubMedGpt:  35%|███▌      | 7/20 [00:46<01:27,  6.72s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:04,  4.53s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.11772384423499209, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The Actual Output does not align with the Expected Output in terms of content and clarity., error: None)
  - ❌ Relevance (GEval) (score: 0.1746055645455365, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The content is not contextually aligned with the question. Actual output talks about management of recurrent abdominal pain while the expected output is focused on varicocele surgery and sperm health., error: None)
  - ❌ Coherence (GEval) (score: 0.5789435557772644, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Minor disjointedness is present in the text, but the main flow is clear., error: None)
  - ❌ Fluency (GEval) (score: 0.30484551851745023, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The response has some clarity issues and contains irrelevant informatio




Evaluating PubMedGpt:  40%|████      | 8/20 [00:51<01:14,  6.23s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:07,  7.32s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.005677090305039582, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text provided is not related to the expected output, it does not align with the evaluation steps and lacks correct medical facts., error: None)
  - ❌ Relevance (GEval) (score: 0.012974235777723771, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The content provided in the Actual Output is not on-topic and contextually aligned with the question. It does not meet the criteria outlined in the Expected Output., error: None)
  - ❌ Coherence (GEval) (score: 0.2830103592058117, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Some information is provided but lacks clear structure and organization., error: None)
  - ❌ Fluency (GEval) (score: 0.013512065489101291, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text is incoherent and lacks smooth sentence constructio




Evaluating PubMedGpt:  45%|████▌     | 9/20 [00:59<01:14,  6.78s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:06,  6.88s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.007680033896136326, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text does not follow the evaluation steps provided. Actual Output does not match the Expected Output in terms of content, information, and relevance., error: None)
  - ❌ Relevance (GEval) (score: 0.0849046847272556, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text is not on-topic or contextually aligned with the question. The actual output is about HPV and cervical cancer, while the expected output should be about enlarged lymph nodes or lipomas., error: None)
  - ❌ Coherence (GEval) (score: 0.3711492912203668, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The message is somewhat well-organized but contains repetitive information and lacks clear structure., error: None)
  - ❌ Fluency (GEval) (score: 0.17075562994138244, threshold: 0.7, strict: False, evaluation model: gpt-3.5




Evaluating PubMedGpt:  50%|█████     | 10/20 [01:07<01:10,  7.02s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:04,  4.74s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.01267202145467943, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The Actual Output does not contain core medical facts and meaningful information like the Expected Output does., error: None)
  - ❌ Relevance (GEval) (score: 0.05370617748460813, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The Actual Output does not align with the Expected Output provided., error: None)
  - ❌ Coherence (GEval) (score: 0.20738379836523785, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Some parts of the text are difficult to interpret or seem disjointed., error: None)
  - ❌ Fluency (GEval) (score: 0.1862700890899836, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Minor grammatical errors present in the text., error: None)
  - ❌ Clinical Safety (GEval) (score: 0.02618029855456791, threshold: 0.9, strict: False, evaluation model: gpt-3.5-turbo, re




Evaluating PubMedGpt:  55%|█████▌    | 11/20 [01:12<00:58,  6.53s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:13, 13.01s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.05189744385105277, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text provided does not align with the expected output criteria at all. There is a complete mismatch in content between the actual and expected outputs., error: None)
  - ❌ Relevance (GEval) (score: 0.11497920955697416, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text is not on-topic or contextually aligned with the question. It does not cover the core topic of VBAC, prior C-section reason, uterine scar integrity, current pregnancy factors, risks over 40, chromosomal abnormalities, hypertension, diabetes, miscarriage, preconception counseling, and ultrasound., error: None)
  - ❌ Coherence (GEval) (score: 0.2767777587881766, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Some minor disjointedness in the presentation, but main flow is clear., error: None)
  - ❌ Fluency (GEval) (sc




Evaluating PubMedGpt:  60%|██████    | 12/20 [01:26<01:09,  8.70s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:05,  5.07s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.061657340618591974, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The Actual Output does not align with the Expected Output in terms of content and relevance., error: None)
  - ❌ Relevance (GEval) (score: 0.11089668043462184, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output does not align with the expected output as it contains information related to citations and editorial notes, lacking relevant content on conception chances and fertility treatments., error: None)
  - ❌ Coherence (GEval) (score: 0.2434630367694329, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Some content is provided but lacks structured presentation and understanding., error: None)
  - ❌ Fluency (GEval) (score: 0.10888817340868691, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text contains errors and lacks clarity, but mostly follows 




Evaluating PubMedGpt:  65%|██████▌   | 13/20 [01:32<00:54,  7.80s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:03,  3.33s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.18116363087777074, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Actual Output does not align with Expected Output in terms of details provided and format., error: None)
  - ❌ Relevance (GEval) (score: 0.21246332217348315, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The Actual Output is not aligned with the Expected Output in terms of content relevance and specificity., error: None)
  - ❌ Coherence (GEval) (score: 0.5778288784983208, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The title is clear and structured but there is excessive use of separators and symbols., error: None)
  - ❌ Fluency (GEval) (score: 0.47877206837749453, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The response lacks smooth sentence construction and clarity. The title is overly emphasized and disruptive to readability., error: None)
  - ❌ Clinical




Evaluating PubMedGpt:  70%|███████   | 14/20 [01:36<00:39,  6.65s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:06,  6.28s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.08313196545523878, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text aligns with some core medical facts but lacks clarity and detail found in the expected output., error: None)
  - ❌ Relevance (GEval) (score: 0.17014011760820663, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text is not contextually aligned with the question and lacks meaningful relevance., error: None)
  - ❌ Coherence (GEval) (score: 0.2043036859968701, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Minor disjointedness is acceptable if the main flow is clear. The text contains some repetitive and fragmented statements that affect overall clarity., error: None)
  - ❌ Fluency (GEval) (score: 0.17624287166589958, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The Actual Output contains fragmented and repetitive sentences, making it difficult to read a




Evaluating PubMedGpt:  75%|███████▌  | 15/20 [01:42<00:33,  6.74s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:07,  7.19s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.1483381363424288, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output does not align with the expected output in terms of content and relevance to medical facts. There is a mix of information related to CSF pressure in the management of hydrocephalus and the importance of CSF drainage procedures, which is not factually accurate., error: None)
  - ❌ Relevance (GEval) (score: 0.17211508363528666, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text does not directly answer the patient's question or provide relevant information about CSF pressure monitoring in the diagnosis of idiopathic normal pressure hydrocephalus. It includes information about a case report and CSF drainage procedures, but lacks focus on the specific topic., error: None)
  - ✅ Coherence (GEval) (score: 0.7276658397977549, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: T




Evaluating PubMedGpt:  80%|████████  | 16/20 [01:50<00:28,  7.08s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:06,  6.62s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.07945759435884055, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The Actual Output provided does not contain symptoms typical of early pregnancy as per the Expected Output., error: None)
  - ❌ Relevance (GEval) (score: 0.08320520772812903, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text is not on-topic or contextually aligned with the question as it talks about symptoms of early pregnancy, miscarriage risk, and safe intercourse based on a scan result, while the question was about an article on Gates Open Research by various authors and the World Health Organization., error: None)
  - ❌ Coherence (GEval) (score: 0.3305973812045892, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Some disjointedness in information presentation, but main flow is clear. Contains structured citation and context., error: None)
  - ❌ Fluency (GEval) (score: 0.233005623




Evaluating PubMedGpt:  85%|████████▌ | 17/20 [01:58<00:21,  7.14s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:05,  5.65s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.16130082518695193, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The Actual Output does not align with the Expected Output in terms of content and structure., error: None)
  - ❌ Relevance (GEval) (score: 0.15746606877454153, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The Actual Output does not align with the Expected Output in terms of content and relevance., error: None)
  - ❌ Coherence (GEval) (score: 0.2483945960132377, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Some text is disjointed but main flow is clear., error: None)
  - ❌ Fluency (GEval) (score: 0.23746141761649722, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: There are multiple sentences that lack smooth construction and the text contains some disruptive errors., error: None)
  - ❌ Clinical Safety (GEval) (score: 0.1339186638566055, threshold: 0.9, strict: 




Evaluating PubMedGpt:  90%|█████████ | 18/20 [02:04<00:13,  6.95s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:08,  8.60s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.07467444079053628, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The Actual Output does not align with the Expected Output in terms of content or relevance to medical facts and information., error: None)
  - ❌ Relevance (GEval) (score: 0.11156675729369532, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The content is not on-topic or aligned with the question, and does not cover the core topic of lumbar disc disease or sciatica as expected., error: None)
  - ❌ Coherence (GEval) (score: 0.284668178832414, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text is somewhat disjointed with repetitive information about HPV and HIV without a clear organization., error: None)
  - ❌ Fluency (GEval) (score: 0.19708598292037782, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text is not readable and contains repetitive and incoherent




Evaluating PubMedGpt:  95%|█████████▌| 19/20 [02:13<00:07,  7.66s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:06,  6.53s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.12397619724251205, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output is not factually accurate and contains misleading information related to the role of the N-terminal domain of the human papillomavirus (HPV) vaccine in the prevention of cervical cancer. It also includes irrelevant details about authors and their affiliations., error: None)
  - ❌ Relevance (GEval) (score: 0.18216806701216018, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text is not on-topic and relevant to the question about HPV and cervical cancer. It is discussing eustachian tube dysfunction and related symptoms., error: None)
  - ❌ Coherence (GEval) (score: 0.19021993163923545, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Minor disjointedness in the text with unclear flow and repetition of information., error: None)
  - ❌ Fluency (GEval) (score: 0.184226038414




Evaluating PubMedGpt: 100%|██████████| 20/20 [02:21<00:00,  7.06s/it]


### GEval Results for Gemma Medical QnA ###
       Model  Factual Accuracy  Relevance  Coherence  Fluency  \
0  PubMedGpt              0.09       0.12       0.35     0.23   

   Clinical Safety  Completeness  Empathy  Specificity  Hallucination  
0             0.05          0.08     0.31         0.11            1.0  





In [60]:
def load_data(file_path="/content/MedAlpaca-7B_predictions.csv"):
    """Load dataset and limit to a small sample for Kaggle compatibility."""
    df = pd.read_csv(file_path)
    print(f"Total examples in dataset: {len(df)}")
    gpt_predictions = df['Model_Answer'].fillna("").tolist()
    return gpt_predictions

openbio_predictions = load_data()

Total examples in dataset: 20


In [61]:
# -------------------------
# Run GEval Evaluation
# -------------------------
# Set debug=True initially; set to False once you're satisfied with the raw output.#gpt4_predictions
openbio_results = evaluate_predictions(questions[:20], references[:20], openbio_predictions, model_name="OpenBio", batch_size=1, debug=False)

# Compile results into a DataFrame.
openbio_results_df = pd.DataFrame({
    "Model": ["OpenBio"],
    **{dim: [openbio_results.get(dim)] for dim in openbio_results}
})

print("\n### GEval Results for Gemma Medical QnA ###")
print(openbio_results_df.round(2))

Evaluating OpenBio:   0%|          | 0/20 [00:00<?, ?it/s]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:07,  7.71s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.09220715103898855, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The Actual Output is not factually accurate and does not align with the Expected Output provided., error: None)
  - ❌ Relevance (GEval) (score: 0.11183998550422691, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text does not cover ≥ 60-70% of the content meaningfully relevant. It focuses on upper respiratory tract irritation rather than addressing COPD-related causes and treatments., error: None)
  - ✅ Coherence (GEval) (score: 0.705549434794572, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Overall message is clear and understandable with specific information about upper respiratory tract irritation. Minor disjointedness in presentation., error: None)
  - ❌ Fluency (GEval) (score: 0.39340527620397314, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text 




Evaluating OpenBio:   5%|▌         | 1/20 [00:08<02:42,  8.58s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:05,  5.63s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.1345297405621357, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output contains excessive medical jargon and does not provide specific guidance for resolving the issue. Expected output provides specific advice on managing symptoms and adjusting antibiotic treatment., error: None)
  - ❌ Relevance (GEval) (score: 0.1602408509503262, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Less than 60-70% of the content is relevant to the expected output. Actual output discusses different causes of headache and advises consulting a neurologist, while expected output focuses on headache related to doxycycline and recommends adjusting the antibiotic., error: None)
  - ❌ Coherence (GEval) (score: 0.3698472331551289, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Minor disjointedness present but the main flow is clear. Information presented may need restruc




Evaluating OpenBio:  10%|█         | 2/20 [00:14<02:09,  7.22s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:04,  4.33s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.16439617849941496, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The information provided is somewhat relevant to the medical issue but lacks the specificity and accuracy found in the expected output. Suggestions for antifungal treatment, hygiene practices, and avoiding sexual encounters are included but information like the duration of treatment and additional diagnostic considerations are missing., error: None)
  - ❌ Relevance (GEval) (score: 0.1800610164617947, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The response addresses the itchiness of the glans after topical antifungal application and provides specific treatment recommendations, but the content does not align with the expected output as it does not mention persistent red spots or the need for prolonged treatment., error: None)
  - ❌ Coherence (GEval) (score: 0.5694117026549194, threshold: 0.7, strict: False, eva




Evaluating OpenBio:  15%|█▌        | 3/20 [00:20<01:46,  6.28s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:06,  6.35s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.012230561022375054, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The answer provided does not align with the expected output, as it focuses on Achilles tendon inflammation rather than loss of sensation and possible gas gangrene infection., error: None)
  - ❌ Relevance (GEval) (score: 0.03236719962298834, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output does not align with the expected output in terms of addressing the topic. The focus is on Achilles tendonitis while the expected output is related to loss of sensation and infection progressing from toe to knee., error: None)
  - ❌ Coherence (GEval) (score: 0.6946279463390468, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The message is fairly well-organized and understandable with a clear explanation of Achilles tendonitis symptoms and treatment options mentioned., error: None)
  - ❌ F




Evaluating OpenBio:  20%|██        | 4/20 [00:27<01:45,  6.57s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:04,  4.04s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.27637660164259104, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The Actual Output does not accurately describe the expected medical treatment. It mentions excision biopsy without first addressing the need for antibiotic treatment and anti-inflammatory medication., error: None)
  - ❌ Relevance (GEval) (score: 0.2729899545523302, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The content covers the core topic of an infected sebaceous cyst but lacks specific details and clarity found in the expected output., error: None)
  - ❌ Coherence (GEval) (score: 0.33238258258903325, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Minor disjointedness, but the main flow is clear. Information on infected sebaceous cyst is presented well., error: None)
  - ❌ Fluency (GEval) (score: 0.2375081698246551, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, rea




Evaluating OpenBio:  25%|██▌       | 5/20 [00:31<01:29,  5.95s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:04,  4.99s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.21655204621918195, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Missing pills increasing pregnancy risk and the mention of taking an emergency contraceptive pill are factually accurate. However, the response contains extraneous information and does not align with the expected output., error: None)
  - ❌ Relevance (GEval) (score: 0.24638430985646376, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The Actual Output discusses the possibility of conception after missing birth control pills and unprotected sex, aligning with the Expected Output., error: None)
  - ❌ Coherence (GEval) (score: 0.6409545485749217, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The message is understandable, but slightly disjointed in some parts such as mentioning the morning after pill randomly., error: None)
  - ❌ Fluency (GEval) (score: 0.4789269817833956, threshold: 0.7, st




Evaluating OpenBio:  30%|███       | 6/20 [00:37<01:22,  5.86s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:04,  4.97s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.15562255044243642, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Actual output does not provide specific medical facts about terbinafine or its interaction with alcohol as expected in the evaluation steps., error: None)
  - ❌ Relevance (GEval) (score: 0.20314542629892468, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The Actual Output does not align with the Expected Output in terms of providing specific information about Terbinafine's half-life, safety guidelines, alcohol moderation, and liver recovery period., error: None)
  - ❌ Coherence (GEval) (score: 0.37402488185120275, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The message is understandable but lacks well-organized presentation of ideas., error: None)
  - ❌ Fluency (GEval) (score: 0.37340440234864014, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Some minor gramma




Evaluating OpenBio:  35%|███▌      | 7/20 [00:43<01:15,  5.79s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:07,  7.57s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.06922113127455284, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output content is focused on varicose veins, consultation advice, and a general message, instead of providing actionable information on varicocele surgery, estradiol, assisted reproductive techniques, and supporting supplements mentioned in the expected output., error: None)
  - ❌ Relevance (GEval) (score: 0.18108184553372597, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text does not cover the core topics related to varicocele surgery, declining sperm counts, elevated estradiol, and supportive treatments mentioned in the expected output., error: None)
  - ❌ Coherence (GEval) (score: 0.5434913324762133, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The main flow is clear, but lacks structured presentation and detail in the explanation., error: None)
  - ❌ Fluency (GEval)




Evaluating OpenBio:  40%|████      | 8/20 [00:51<01:18,  6.56s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:03,  3.82s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.18083636064998854, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The information provided in the Actual Output does not align with the core medical facts and criteria outlined in the Expected Output. It does not address the possibility of residual infection or root fracture, nor does it mention the specific treatment options such as antibiotics or extraction for unresolved issues., error: None)
  - ❌ Relevance (GEval) (score: 0.1680486836458333, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output does not address the possibility of residual infection or root fracture as suggested in the expected output., error: None)
  - ✅ Coherence (GEval) (score: 0.7000746101875335, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Minor disjointedness found, but the main flow is clear. Contains structured presentation of ideas., error: None)
  - ✅ Fluency 




Evaluating OpenBio:  45%|████▌     | 9/20 [00:55<01:05,  5.93s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:06,  6.76s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.17189162196510543, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output provides a list of potential diagnoses and advises consulting a surgeon for exact diagnosis, but the expected output focuses on specific likely conditions (enlarged lymph nodes or lipomas) and provides clear guidance on next steps., error: None)
  - ❌ Relevance (GEval) (score: 0.18388254529171777, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The response covers some medical conditions mentioned in the query (lipoma, fibromyositis) but includes unnecessary information like resume advice and doesn't provide clear guidance on next steps for the patient., error: None)
  - ❌ Coherence (GEval) (score: 0.4276848755912879, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Some minor disjointedness in the message, but overall understandable and provides specific medical advice bas




Evaluating OpenBio:  50%|█████     | 10/20 [01:03<01:03,  6.39s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:05,  5.23s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.22095139416866577, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text contains some relevant information about kidney cysts, but the details provided are not aligned with the expected output which gives clearer and more specific information., error: None)
  - ❌ Relevance (GEval) (score: 0.21169162290133156, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The response covers the core topic but does not provide as much specific information as the expected output. Response lacks details on monitoring or symptom management., error: None)
  - ✅ Coherence (GEval) (score: 0.7432461377981222, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The message is understandable and provides relevant information, but there is slight disjointedness in the flow., error: None)
  - ❌ Fluency (GEval) (score: 0.6639123951889655, threshold: 0.7, strict: False, evaluation mod




Evaluating OpenBio:  55%|█████▌    | 11/20 [01:09<00:56,  6.29s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:04,  4.54s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.1987504490062418, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text does not align with the expected output in terms of conciseness and clarity., error: None)
  - ❌ Relevance (GEval) (score: 0.20389179895682194, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The information provided in the Actual Output is detailed but deviates significantly from the core topics outlined in the Expected Output., error: None)
  - ❌ Coherence (GEval) (score: 0.5682942157690871, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: There is some minor disjointedness in the information presented, but the overall flow is clear and understandable., error: None)
  - ❌ Fluency (GEval) (score: 0.36732716590607184, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Some minor grammatical errors are present in the text. The text is readable and free from disrup




Evaluating OpenBio:  60%|██████    | 12/20 [01:14<00:47,  5.97s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:04,  4.80s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.21218231562501993, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output contains some relevant information but lacks accuracy and clarity when compared to the expected output., error: None)
  - ❌ Relevance (GEval) (score: 0.21220569962846775, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output provides general guidance on conception with one functioning tube, but lacks detailed information on monitoring ovulation and considering fertility treatments like IVF after 6 months if unsuccessful., error: None)
  - ❌ Coherence (GEval) (score: 0.3201767262707086, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Some minor disjointedness is present, but the message is mostly understandable and organized., error: None)
  - ❌ Fluency (GEval) (score: 0.2609068264864849, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Mi




Evaluating OpenBio:  65%|██████▌   | 13/20 [01:20<00:40,  5.82s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:04,  4.53s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.23058779266782986, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Actual Output mentions pleural effusion and lower respiratory tract infection, which aligns with possible viral or bacterial infection. However, the text lacks specific details like broad-spectrum antibiotics and basic tests mentioned in Expected Output., error: None)
  - ❌ Relevance (GEval) (score: 0.22450552395292872, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Some relevant information is provided, but significant differences in content from Expected Output., error: None)
  - ❌ Coherence (GEval) (score: 0.48177156499812906, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Minor disjointedness in the message, but the main flow is clear. Information about breathing difficulty and headache with blurred vision is provided, indicating understanding of the concern., error: None)
  - ❌ Fluen




Evaluating OpenBio:  70%|███████   | 14/20 [01:25<00:33,  5.65s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:03,  3.33s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.11090438360315791, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The response does not provide the correct information related to the medical condition and treatment plan as outlined in the expected output., error: None)
  - ❌ Relevance (GEval) (score: 0.13186713517269416, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Actual Output does not address the expected treatment plan for uncontrolled asthma outlined in the Expected Output., error: None)
  - ❌ Coherence (GEval) (score: 0.3509873727505255, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: There is some disjointedness in the text, but the main flow is clear and the message is understandable., error: None)
  - ❌ Fluency (GEval) (score: 0.27259892544774017, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Minor grammatical errors and lack of smooth sentence construction present




Evaluating OpenBio:  75%|███████▌  | 15/20 [01:29<00:25,  5.15s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:05,  5.09s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.15922011754813664, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output contains unrelated information about diarrhea and antibiotic treatment, while the expected output focuses on avoiding flavors and suggesting alternative feeding options for a child., error: None)
  - ❌ Relevance (GEval) (score: 0.1897679096507917, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The Actual Output does not fully align with the Expected Output in terms of providing specific advice regarding managing viral diarrhea in a child. The response lacks detailed recommendations on nutrition and alternative feeding methods., error: None)
  - ❌ Coherence (GEval) (score: 0.4491997020377953, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Main message is understandable but text is somewhat disjointed with multiple ellipses and lack of clear structure., error: None)
  - ❌ 




Evaluating OpenBio:  80%|████████  | 16/20 [01:35<00:21,  5.39s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:04,  4.75s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.11843298475865424, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output does not align with the expected output, mentioning different advice on pregnancy symptoms and timing of intercourse., error: None)
  - ❌ Relevance (GEval) (score: 0.17488699690183893, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Content is somewhat relevant but does not align with the expected output. Mentions symptom and advice on intercourse timing, but lacks key details like miscarriage risk and placenta condition, error: None)
  - ❌ Coherence (GEval) (score: 0.6091928405517987, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Mentions specific information about sex during the first trimester and gives advice on avoiding conception, but there is minor disjointedness in presenting the information., error: None)
  - ❌ Fluency (GEval) (score: 0.37181288435043847, thresh




Evaluating OpenBio:  85%|████████▌ | 17/20 [01:40<00:16,  5.40s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:04,  4.46s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.07985960636976157, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The Actual Output contains extensive information and recommendations on tests and examinations which are not present in the Expected Output., error: None)
  - ❌ Relevance (GEval) (score: 0.19101678393074065, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Some relevant information is provided but does not align with the expected output in terms of addressing core topics and suggested tests for chest tightness and pain., error: None)
  - ✅ Coherence (GEval) (score: 0.7758203339059846, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Structured presentation of ideas, main flow clear, understandable message., error: None)
  - ❌ Fluency (GEval) (score: 0.6870467720996103, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Minor grammatical errors can be overlooked, but sente




Evaluating OpenBio:  90%|█████████ | 18/20 [01:46<00:10,  5.37s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:07,  7.59s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.14842323412459782, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The actual output does not mention persistent pain, MRI for nerve root impingement, specific treatment options like physical therapy and exercises, which are all present in the expected output., error: None)
  - ❌ Relevance (GEval) (score: 0.18303642232938805, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Actual output provides details on nerve compression and tests, while expected output focuses on lumbar disc disease, MRI, and treatment methods., error: None)
  - ✅ Coherence (GEval) (score: 0.7110143348687847, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Includes a list of recommended tests and follow-up procedures, slightly disjointed presentation with different testing recommendations., error: None)
  - ❌ Fluency (GEval) (score: 0.42700762076716464, threshold: 0.7, strict: False, e




Evaluating OpenBio:  95%|█████████▌| 19/20 [01:54<00:06,  6.24s/it]


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s][A
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:04,  4.24s/test case]



Metrics Summary

  - ❌ Factual Accuracy (GEval) (score: 0.17304032759579033, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The information provided does not align with the expected output, as it mentions middle ear infection and outer ear canal inflammation instead of eustachian tube dysfunction. Clinical evaluation and investigation recommendations are similar., error: None)
  - ❌ Relevance (GEval) (score: 0.21459758820697572, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: Significant differences in the details provided between Actual Output and Expected Output., error: None)
  - ✅ Coherence (GEval) (score: 0.7837568646074996, threshold: 0.7, strict: False, evaluation model: gpt-3.5-turbo, reason: The text is well-organized and presents information about the diagnosis and treatment of Earache, but slightly lacks structured presentation of ideas., error: None)
  - ❌ Fluency (GEval) (score: 0.5368355865986054, threshold: 0.7, strict: 




Evaluating OpenBio: 100%|██████████| 20/20 [01:59<00:00,  5.98s/it]


### GEval Results for Gemma Medical QnA ###
     Model  Factual Accuracy  Relevance  Coherence  Fluency  Clinical Safety  \
0  OpenBio              0.16       0.18       0.56     0.42             0.16   

   Completeness  Empathy  Specificity  Hallucination  
0          0.17     0.56         0.22           0.85  





In [62]:
import pandas as pd
import plotly.express as px


# Combine both DataFrames
combined_df = pd.concat([f_results_df, gpt_results_df,openbio_results_df,bio_results_df,pubmedgpt_results_df])

# Reshape to long format
df_long = combined_df.melt(id_vars="Model", var_name="Metric", value_name="Score")

# Plotly bar chart
fig = px.bar(
    df_long,
    x="Metric",
    y="Score",
    color="Model",
    barmode="group",
    title="📊 Model Comparison on GEval Metrics",
    template="plotly_white"
)

fig.update_layout(
    xaxis_title="Evaluation Metric",
    yaxis_title="Score",
    legend_title="Model",
    height=500,
    width=900
)

fig.show()
