# Evaluations of Gemma and Leo-Mistral models

Ask the models which of the outputs they prefer according to criteria specified in prompts.

In [1]:
PROMPT = """<|im_start|>system
Du bist ein hilfreicher Assistent, der zwei Texte in einfacher Sprache mit Markdown-Annotationen vergleicht und bewertet.
<|im_end|>
<|im_start|>user
Du erhältst zwei Texte in einfacher Sprache. Die Texte enthalten:
- Markdown-Titel (#, ##, ###)
- Markdown-Fettmarkierungen (**) für wichtige Wörter oder Phrasen
Ausserdem erhältst du den originalen Text in üblicher Sprache und ohne Annotationen.

Deine Aufgabe:
1. Bewerte, welcher Text die einfachere Sprache verwendet.
2. Bewerte, welcher Text die besseren Titel und wichtigen Wörter hat.
3. Bewerte, ob Text 1 dem Original ähnlich ist.
4. Bewerte, ob Text 2 dem Original ähnlich ist.
3. Bewerte, welchen Text du insgesamt besser findest.

Kriterien für die Bewertung:
- Sprache:
  - Sehr einfaches Vokabular und Grammatik
  - Sehr kurze Sätze
  - Jeder Satz steht in einer neuen Zeile
- Annotationen:
  - Titel sind kurz und sinnvoll
  - Wichtige Wörter sind korrekt markiert
  - Es sind nicht zu viele Wörter hintereinander fett markiert
- Gesamte Verständlichkeit für Menschen mit kognitiven Beeinträchtigungen

Instruktionen:
- Wähle für Sprache einen Text (Text 1 oder Text 2)
- Wähle für Annotationen einen Text (Text 1 oder Text 2)
- Bewerte, ob Text 1 dem Original ähnlich ist {TRUE oder FALSE}
- Bewerte, ob Text 2 dem Original ähnlich ist {TRUE oder FALSE}
- Wähle welchen Text du allgemein besser findest (Text 1 oder Text 2)
- Deine Ausgabe muss genau dieses Format haben:

AUSGABE:
Sprache: {Text 1 oder Text 2}
Annotationen: {Text 1 oder Text 2}
Allgemein: {Text 1 oder Text 2}
Text 1 ist dem Original ähnlich: {TRUE oder FALSE}
Text 2 ist dem Original ähnlich: {TRUE oder FALSE}

Begründung:


Hier sind die Texte:

Text 1:
{Text 1}




Text 2:
{Text 2}



Originaler Text:
{original}
<|im_end|>
<|im_start|>assistant
"""


In [2]:
import json
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from tqdm import tqdm

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

MODEL_NAME = "google/gemma-2-9b-it"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

###############################################################################
# LOAD MODEL
###############################################################################

tokenizer_gemma = AutoTokenizer.from_pretrained(MODEL_NAME)
model_gemma = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32#,
    #device_map="auto"
).to("cuda")



  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00, 47.66it/s]


In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# -------------------------------
# Model Configuration
# -------------------------------
MODEL_NAME = "LeoLM/leo-mistral-hessianai-7b-chat"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# -------------------------------
# Load Tokenizer
# -------------------------------
tokenizer_leo = AutoTokenizer.from_pretrained(MODEL_NAME)
# -------------------------------
# Load Model
# -------------------------------
model_leo = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    #device_map="auto",  # Automatically place layers on available devices
    torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32,
).to("cuda")

Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.50s/it]


In [None]:
def evaluate_gemma(end2end_output, pipeline_output, original_text):
    try:
        prompt = PROMPT.replace("{Text 1}", end2end_output).replace("{Text 2}", pipeline_output).replace("{original}", original_text)
        inputs = tokenizer_gemma(prompt, return_tensors="pt").to("cuda")

        with torch.no_grad():
            output = model_gemma.generate(
                **inputs,
                max_new_tokens=2048,
                temperature=0.9,
                do_sample=True
            )

        raw = tokenizer_gemma.decode(output[0], skip_special_tokens=True)

        return raw
    
    except Exception as e:
        print("Error in evaluation:", e)
        #traceback.print_exc()
        return None   # Ensure failure never crashes main loop    

In [None]:
def evaluate_leo(end2end_output, pipeline_output, original_text):
    try:
        prompt = PROMPT.replace("{Text 1}", end2end_output).replace("{Text 2}", pipeline_output).replace("{original}", original_text)

        inputs = tokenizer_leo(prompt, return_tensors="pt").to("cuda")

        with torch.no_grad():
            output = model_leo.generate(
                **inputs,
                max_new_tokens=4096,
                temperature=0.9,
                do_sample=True
            )

        raw = tokenizer_leo.decode(output[0], skip_special_tokens=True)

        return raw
    
    except Exception as e:
        print("Error in evaluation:", e)
        #traceback.print_exc()
        return None   # Ensure failure never crashes main loop    

In [None]:
#import pandas as pd
#import openpyxl

def evaluate(END2END_INPUT_FILE, PIPELINE_INPUT_FILE, OUTPUT_FILE):
    torch.cuda.empty_cache()

    with open(END2END_INPUT_FILE, "r", encoding="utf8") as f:
        lines_end2end = [json.loads(l) for l in f]

    with open(PIPELINE_INPUT_FILE, "r", encoding="utf8") as f2:
        lines_pipeline = [json.loads(l) for l in f2]
    print("length end2end", len(lines_end2end))
    print("length pipeline", len(lines_pipeline))
    
    out = open(OUTPUT_FILE, "w", encoding="utf8")

    failed_evaluation_gemma = []
    failed_evaluation_leo = []


    rows = []
    #for entry in tqdm(lines):
    for i in range(len(lines_end2end)):
        end2end_output = lines_end2end[i]["model_output"]
        pipeline_output = lines_pipeline[i]["model_output"]
        original_text = lines_end2end[i]["input"]

        prompt = PROMPT.replace("{Text 1}", end2end_output).replace("{Text 2}", pipeline_output).replace("{original}", original_text)



        evaluation_gemma = evaluate_gemma(end2end_output, pipeline_output, original_text)
        evaluation_leo = evaluate_leo(end2end_output, pipeline_output, original_text)


        if evaluation_gemma == None:
            failed_evaluation_gemma.append(f"line {i}")
            continue
        if evaluation_leo == None:
            failed_evaluation_leo.append(f"line {i}")
            continue
        new_entry = {
            "id": lines_end2end[i]["id"],
            "instruction":prompt,#entry2.get("instruction", ""),
            "original": original_text,
            "end2end_output": end2end_output,
            "pipeline_output": pipeline_output,
            "evaluation_gemma": evaluation_gemma,
            "evaluation_leo": evaluation_leo
        }
        out.write(json.dumps(new_entry, ensure_ascii=False) + "\n")


    out.close()

In [7]:
END2END_INPUT_FILE= "evaluation/end2end_testset_output_final.jsonl"
PIPELINE_INPUT_FILE = "evaluation/pipeline_testset_output_final.jsonl"
OUTPUT_FILE = "evaluation/evaluation_all2.jsonl"
evaluate(END2END_INPUT_FILE, PIPELINE_INPUT_FILE, OUTPUT_FILE)

length end2end 27
length pipeline 27


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

In [8]:
import json
import pandas as pd

def extract_after_assistant(text: str):
    """
    Returns the substring of `text` that occurs after the first occurrence
    of the word 'assistant'. If not found, returns the original text.
    """
    if not isinstance(text, str):
        return text
    key = "assistant"
    idx = text.find(key)
    if idx == -1:
        return text
    return text[idx + len(key):].strip()

def jsonl_to_excel(jsonl_path: str, excel_path: str):
    """
    Convert a JSONL file to an Excel file and extract only the part
    after 'assistant' for evaluation fields.
    """
    rows = []

    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                obj = json.loads(line)

                # Process evaluation fields
                if "evaluation_gemma" in obj:
                    obj["evaluation_gemma"] = extract_after_assistant(obj["evaluation_gemma"])
                if "evaluation_leo" in obj:
                    obj["evaluation_leo"] = extract_after_assistant(obj["evaluation_leo"])

                rows.append(obj)

    df = pd.DataFrame(rows)
    df.to_excel(excel_path, index=False)
    print(f"Excel file created at: {excel_path}")



In [9]:
jsonl_to_excel("evaluation/evaluation_all2.jsonl", "evaluation/evaluation_output_all2.xlsx")

Excel file created at: evaluation/evaluation_output_all2.xlsx


In [None]:
pip install openpyxl
