In [None]:
!pip install openai

In [None]:
# Mount Google Drive at /content/drive; the input and output paths used later in
# this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import re

import numpy as np
import openai
import pandas as pd
from openai import OpenAI

In [None]:
# Take the API key from the OPENAI_API_KEY environment variable so the real secret
# never has to be typed into (and saved with) the notebook. The original placeholder
# is kept only as a fallback for when the variable is not set.
key = os.environ.get("OPENAI_API_KEY", "OPEN-AI-KEY")
client = OpenAI(api_key=key)

In [None]:
# Judge model passed to the chat-completions call in `eval`.
model = "o4-mini"

# Answer languages; also the keys used to pick the judging prompt.
languages = ['english', 'hebrew', 'arabic']

# Every answer-generating model that has result files. The original list was missing
# a comma between 'phi4mini' and 'qwen3', which Python silently concatenated into the
# single bogus name 'phi4miniqwen3'.
all_models = ['gemma3', 'phi4', 'phi4mini', 'qwen3']

# Subset that is actually evaluated in this run; set to `all_models` for a full run.
models = ['phi4mini']

# Third component of the `{model}_{language}_{number}.csv` file names
# (presumably a run / question-set index -- TODO confirm).
numbers = ['1', '2', '3']

# Directory holding the answer CSVs to be judged.
path = "/content/drive/Shareddrives/mohamad/LLMs_Answers"

In [None]:
# English judging prompt. The `{Question}` and `{Answer}` placeholders are filled in
# with str.format before the text is sent to the judge model. The field labels in the
# "Return the following format" section are the English labels that parse_evaluation
# looks for, so the two must be kept in sync.
english_prompt = """
      Evaluate the following answer to a basic scientific question.
      Question: {Question}
      Answer: {Answer}

      Assess it according to these 3 criteria:
      Accuracy (40%) – Is the information correct and scientifically valid?
      Completeness (30%) – Does the answer fully address the question?
      Clarity (30%) – Is the answer clearly written and easy to understand?

      Return the following format:
      Accuracy: [0–100]
      Completeness: [0–100]
      Clarity: [0–100]

      Final Score (weighted): [0–100]

      Explanation: [1–2 sentences justifying the score]

"""
# Hebrew judging prompt: same three weighted criteria (40% / 30% / 30%) and the same
# five output fields as the English prompt, with Hebrew labels. `{Question}` and
# `{Answer}` are filled in with str.format. The output labels are the Hebrew labels
# that parse_evaluation looks for, so the two must be kept in sync.
hebrew_prompt = """
      הערך את התשובה לשאלה מדעית בסיסית לפי שלושת הקריטריונים הבאים:
      שאלה: {Question}
      תשובה: {Answer}

      דיוק (40%) – האם המידע נכון ומבוסס מדעית?
      שלמות (30%) – האם התשובה מכסה את כל החלקים הרלוונטיים של השאלה?
      בהירות (30%) – האם התשובה מנוסחת בצורה ברורה וקלה להבנה?

      החזר את התשובה בפורמט הבא:
      דיוק: [0–100]
      שלמות: [0–100]
      בהירות: [0–100]

      ציון סופי משוקלל: [0–100]

      הסבר: [משפט או שניים המצדיקים את הציון]

"""

# Arabic judging prompt: same three weighted criteria (40% / 30% / 30%) and the same
# five output fields as the English prompt, with Arabic labels. `{Question}` and
# `{Answer}` are filled in with str.format. The output labels are the Arabic labels
# that parse_evaluation looks for, so the two must be kept in sync.
arabic_prompt = """
      قَيِّم الإجابة على سؤال علمي أساسي وفقًا للمعايير الثلاثة التالية:
      السؤال: {Question}
      الإجابة: {Answer}

      الدقة (40٪) – هل المعلومات صحيحة وصحيحة علميًا؟
      الشمولية (30٪) – هل تغطي الإجابة جميع جوانب السؤال ذات الصلة؟
      الوضوح (30٪) – هل تمت صياغة الإجابة بشكل واضح وسهل الفهم؟

      أعد الإجابة بالتنسيق التالي:
      الدقة: [0–100]
      الشمولية: [0–100]
      الوضوح: [0–100]

      النتيجة النهائية الموزونة: [0–100]

      تفسير: [جملة أو جملتان تبرران النتيجة]

"""

In [None]:
def eval(_prompt):
  """Ask the judge model a single question and return its reply text.

  `_prompt` is sent as the only message (role "user") of a chat-completions
  request made with the module-level `client` and `model`. The content of the
  first returned choice is handed back unchanged.

  Note: inside this notebook the name shadows Python's built-in `eval`.
  """
  user_message = {"role": "user", "content": _prompt}
  completion = client.chat.completions.create(model=model, messages=[user_message])
  first_choice = completion.choices[0]
  return first_choice.message.content

In [None]:
# Field labels the judge is asked to emit, per prompt language.
_EVAL_KEYWORDS = {
    'english': {
        'accuracy': 'Accuracy:',
        'completeness': 'Completeness:',
        'clarity': 'Clarity:',
        'final_score': 'Final Score (weighted):',
        'explanation': 'Explanation:'
    },
    'hebrew': {
        'accuracy': 'דיוק:',
        'completeness': 'שלמות:',
        'clarity': 'בהירות:',
        'final_score': 'ציון סופי משוקלל:',
        'explanation': 'הסבר:'
    },
    'arabic': {
        'accuracy': 'الدقة:',
        'completeness': 'الشمولية:',
        'clarity': 'الوضوح:',
        'final_score': 'النتيجة النهائية الموزونة:',
        'explanation': 'تفسير:'
    }
}

# (keyword-table key, output column) for the four numeric fields, in output order.
_SCORE_FIELDS = (
    ('accuracy', 'Accuracy'),
    ('completeness', 'Completeness'),
    ('clarity', 'Clarity'),
    ('final_score', 'Final Score'),
)

# Leading markdown / list decoration that may precede a label ("**Accuracy:** 90", "- Clarity: 80").
_LINE_DECORATION = "*#->•\t "

# A number at the start of the value (after spaces, '*', '[' or '('). Group 2 is set when
# the number is immediately followed by "<operator><digit>", i.e. it is the start of a
# range such as "[0–100]" or of an arithmetic expression rather than a score.
_SCORE_RE = re.compile(r"[\s*\[(]*(\d+(?:\.\d+)?)(\s*[-–—*+×]\s*\d)?")


def _detect_language(eval_string):
    """Return the language whose labels occur most often in the text ('english' on ties)."""
    return max(
        _EVAL_KEYWORDS,
        key=lambda lang: sum(kw in eval_string for kw in _EVAL_KEYWORDS[lang].values()),
    )


def _extract_score(text):
    """Return the score that starts ``text`` as an int, or None if no score can be read."""
    # "0.4*90 + 0.3*80 + 0.3*70 = 81": only what follows the last '=' is the score.
    candidate = text.rsplit("=", 1)[-1]
    match = _SCORE_RE.match(candidate)
    if match is None or match.group(2):
        return None
    return int(round(float(match.group(1))))


def parse_evaluation(eval_string):
    """Split one judge reply into its four scores and the explanation.

    The reply is expected to follow the format requested by the English, Hebrew or
    Arabic judging prompt. The language is detected from the labels in the text.
    Score lines are recognised even when decorated with markdown or list markers, and
    the value may be written as ``85``, ``[85]``, ``85/100``, ``85%``, ``84.6`` (rounded)
    or as a computation ending in ``= 85``. A template echo such as ``[0–100]`` is not
    treated as a score.

    Args:
        eval_string: Raw text returned by the judge. Anything that is not a string
            (``None``, NaN, ...) is treated as an empty reply instead of raising.

    Returns:
        pd.Series with the keys ``Accuracy``, ``Completeness``, ``Clarity`` and
        ``Final Score`` (int, or None when the field is missing or unreadable) and
        ``Explanation`` (str; the text after the explanation label plus all following
        non-empty lines joined by single spaces, ``''`` if absent).
    """
    scores = {column: None for _, column in _SCORE_FIELDS}
    explanation_parts = []

    if isinstance(eval_string, str):
        labels = _EVAL_KEYWORDS[_detect_language(eval_string)]
        in_explanation = False

        for raw_line in eval_string.split('\n'):
            line = raw_line.strip()
            bare = line.lstrip(_LINE_DECORATION)

            for field, column in _SCORE_FIELDS:
                label = labels[field]
                if bare.startswith(label):
                    value = _extract_score(bare[len(label):])
                    # Keep an earlier good value if a repeated line cannot be read.
                    if value is not None:
                        scores[column] = value
                    break
            else:
                label = labels['explanation']
                if bare.startswith(label):
                    in_explanation = True
                    # Drop the closing "**" left over from a bold "**Explanation:**" label.
                    explanation_parts.append(bare[len(label):].strip().lstrip('*').strip())
                elif in_explanation:
                    explanation_parts.append(line)

    explanation = ' '.join(part for part in explanation_parts if part)

    # dtype=object keeps missing scores as None regardless of pandas' type inference.
    return pd.Series({**scores, 'Explanation': explanation}, dtype=object)

def save_to_csv(df, m, l, n, eval_lang="english",
                out_dir="/content/drive/Shareddrives/mohamad/eval"):
    """Parse the raw judge replies in ``df['eval']`` and write the result to CSV.

    The five parsed fields are added to ``df`` in place as the columns ``Accuracy``,
    ``Completeness``, ``Clarity``, ``Final Score`` and ``Explanation``. The frame is
    then written to ``{out_dir}/{m}_{l}_{n}_evaluated_{eval_lang}.csv``.

    Args:
        df: DataFrame with an ``eval`` column holding the judge's raw text replies.
        m: Name of the model that produced the answers (file-name component).
        l: Language of the answers (file-name component).
        n: Number component of the file name.
        eval_lang: Language of the judging prompt, used as the file-name suffix.
            Defaults to ``"english"``, the suffix that used to be hard-coded.
        out_dir: Directory to write into; created if it does not exist.

    Returns:
        The path of the CSV file that was written.
    """
    score_columns = ['Accuracy', 'Completeness', 'Clarity', 'Final Score', 'Explanation']
    df[score_columns] = df['eval'].apply(parse_evaluation)

    os.makedirs(out_dir, exist_ok=True)
    output_file_path = f"{out_dir}/{m}_{l}_{n}_evaluated_{eval_lang}.csv"
    df.to_csv(output_file_path, index=False)

    print(f"Evaluated data saved to: {output_file_path}")
    return output_file_path

In [None]:
# Judge every answer file with the prompt written in the answers' own language.
mapper_propmt = {
    'english': english_prompt,
    'hebrew': hebrew_prompt,
    'arabic': arabic_prompt
}

eval_dir = "/content/drive/Shareddrives/mohamad/eval"
score_columns = ['Accuracy', 'Completeness', 'Clarity', 'Final Score', 'Explanation']

for l in languages:
  for m in models:
    for n in numbers:
      # Single source of truth for the result file: the path that is checked here is
      # exactly the path that is written below, so finished runs are really skipped.
      output_path = f"{eval_dir}/{m}_{l}_{n}_evaluated_{l}.csv"
      if os.path.exists(output_path):
        print(f"File {output_path} already exists. Skipping evaluation.")
        continue
      print(f"File {output_path} not found. Proceeding with evaluation.")

      file_path = f"{path}/{m}_{l}_{n}.csv"
      if not os.path.exists(file_path):
        # One missing answer file must not abort the remaining combinations.
        print(f"Input file {file_path} not found. Skipping.")
        continue
      print(f"Reading: {file_path}")
      df = pd.read_csv(file_path)
      if 'text' not in df.columns:
        df.columns = ['text', 'classify']
        print("Columns set to ['text', 'classify']")
      if m == "qwen3":
        # Keep only what follows the model's "</think>" marker.
        df["classify"] = df["classify"].apply(lambda x: x.split("</think>")[-1].strip() if isinstance(x, str) else x)

      # A list comprehension (rather than df.apply(axis=1)) also works on an empty frame.
      df["eval"] = [
          eval(mapper_propmt[l].format(Question=question, Answer=answer))
          for question, answer in zip(df["text"], df["classify"])
      ]

      # Written here directly rather than through save_to_csv so that the file that
      # was checked above is the file that gets created.
      df[score_columns] = df["eval"].apply(parse_evaluation)
      df.to_csv(output_path, index=False)
      print(f"Evaluated data saved to: {output_path}")


File /content/drive/Shareddrives/mohamad/eval/phi4mini_english_1_evaluated_english.csv not found. Proceeding with evaluation.
Reading: /content/drive/Shareddrives/mohamad/LLMs_Answers/phi4mini_english_1.csv


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/Shareddrives/mohamad/LLMs_Answers/phi4mini_english_1.csv'

In [None]:
# Judge every answer file with the ENGLISH prompt, whatever language the answers are in.
# `mapper_propmt` is not used by this loop; it is kept so the name stays defined even if
# this cell is run without the language-specific cell.
mapper_propmt = {
    'english': english_prompt,
    'hebrew': hebrew_prompt,
    'arabic': arabic_prompt
}

for l in languages:
  for m in models:
    for n in numbers:
      # Same file name that save_to_csv(df, m, l, n) writes.
      output_path = f"/content/drive/Shareddrives/mohamad/eval/{m}_{l}_{n}_evaluated_english.csv"
      if os.path.exists(output_path):
        print(f"File {output_path} already exists. Skipping evaluation.")
        continue
      print(f"File {output_path} not found. Proceeding with evaluation.")

      file_path = f"{path}/{m}_{l}_{n}.csv"
      if not os.path.exists(file_path):
        # One missing answer file must not abort the remaining combinations.
        print(f"Input file {file_path} not found. Skipping.")
        continue
      print(f"Reading: {file_path}")
      df = pd.read_csv(file_path)
      if 'text' not in df.columns:
        df.columns = ['text', 'classify']
        print("Columns set to ['text', 'classify']")
      if m == "qwen3":
        # Keep only what follows the model's "</think>" marker, as the
        # language-specific evaluation loop does.
        df["classify"] = df["classify"].apply(lambda x: x.split("</think>")[-1].strip() if isinstance(x, str) else x)

      # A list comprehension (rather than df.apply(axis=1)) also works on an empty frame.
      df["eval"] = [
          eval(english_prompt.format(Question=question, Answer=answer))
          for question, answer in zip(df["text"], df["classify"])
      ]
      save_to_csv(df, m, l, n)


File /content/drive/Shareddrives/mohamad/eval/phi4mini_english_1_evaluated_english.csv already exists. Skipping evaluation.
File /content/drive/Shareddrives/mohamad/eval/phi4mini_english_2_evaluated_english.csv already exists. Skipping evaluation.
File /content/drive/Shareddrives/mohamad/eval/phi4mini_english_3_evaluated_english.csv already exists. Skipping evaluation.
File /content/drive/Shareddrives/mohamad/eval/phi4mini_hebrew_1_evaluated_english.csv not found. Proceeding with evaluation.
Reading: /content/drive/Shareddrives/mohamad/LLMs_Answers/phi4mini_hebrew_1.csv
Evaluated data saved to: /content/drive/Shareddrives/mohamad/eval/phi4mini_hebrew_1_evaluated_english.csv
File /content/drive/Shareddrives/mohamad/eval/phi4mini_hebrew_2_evaluated_english.csv not found. Proceeding with evaluation.
Reading: /content/drive/Shareddrives/mohamad/LLMs_Answers/phi4mini_hebrew_2.csv
Evaluated data saved to: /content/drive/Shareddrives/mohamad/eval/phi4mini_hebrew_2_evaluated_english.csv
File /