In [1]:
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
import json
import random
import re
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
from tqdm import tqdm

tqdm.pandas()


In [3]:
def jprint(obj):
    """Pretty-print ``obj`` to stdout as JSON with a two-space indent."""
    rendered = json.dumps(obj, indent=2)
    print(rendered)

In [4]:
from openai import OpenAI

# Client for the chat-completions endpoint that acts as the LLM judge.
# NOTE(review): presumably a locally hosted OpenAI-compatible server that does not
# check the API key (hence the "_" placeholder) — confirm before pointing it elsewhere.
client = OpenAI(
    base_url="http://0.0.0.0:8032/v1",
    api_key="_",
)

# Model name and sampling temperature sent with every judge request.
MODEL = "qwen-2.5-32b"
TEMPERATURE = 0.3

In [5]:
# Identifiers of the two experiments being compared. They select the
# results-<id>.jsonl files that are loaded and label the win rates in the final stats.
exp_A = 'moral-prof'
exp_B = 'gawsy-paps'

In [7]:
# Load experiment A's results (JSON Lines: one record per line) and preview them.
df_a = pd.read_json(f'../../tmp/erx-results/results-{exp_A}.jsonl', lines=True)
df_a.head()

Unnamed: 0,text,triples,predicted_triples,exact.precision,exact.recall,exact.f1,fuzzy.precision,fuzzy.recall,fuzzy.f1
0,"Caterpillar Inc. is located in Peoria, Illinois.","[Caterpillar Inc. | location | Peoria, Illinois]","[Caterpillar Inc. | location | Peoria, Illinois]",1.0,1.0,1.0,1.0,1.0,1.0
1,Dale Tallon is the general manager of the Flor...,[Florida Panthers | general manager | Dale Tal...,[Florida Panthers | general manager | Dale Tal...,1.0,1.0,1.0,1.0,1.0,1.0
2,"Aaron Turner played with the band Twilight, is...",[Aaron Turner | associated band/associated mus...,[Aaron Turner | associated band/associated mus...,0.96,0.96,0.96,1.0,1.0,1.0
3,"From Spain, Ajoblanco (alternatively known as ...","[Ajoblanco | country | Spain, Ajoblanco | alte...","[Ajoblanco | country | Spain, Ajoblanco | alte...",1.0,1.0,1.0,1.0,1.0,1.0
4,Alfredo Zitarrosa performs Zamba music and sta...,"[Alfredo Zitarrosa | genre | Zamba (artform), ...","[Alfredo Zitarrosa | genre | Zamba (artform), ...",1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
# Load experiment B's results (JSON Lines: one record per line) and preview them.
df_b = pd.read_json(f'../../tmp/erx-results/results-{exp_B}.jsonl', lines=True)
df_b.head()

Unnamed: 0,text,triples,predicted_triples,exact.precision,exact.recall,exact.f1,fuzzy.precision,fuzzy.recall,fuzzy.f1
0,"Caterpillar Inc. is located in Peoria, Illinois.","[Caterpillar Inc. | location | Peoria, Illinois]","[Caterpillar Inc. | location | Peoria, Illinois]",1.0,1.0,1.0,1.0,1.0,1.0
1,Dale Tallon is the general manager of the Flor...,[Florida Panthers | general manager | Dale Tal...,[Dale Tallon | role | General Manager of the F...,0.5,0.454545,0.47619,0.7,0.636364,0.666667
2,"Aaron Turner played with the band Twilight, is...",[Aaron Turner | associated band/associated mus...,"[Aaron Turner | band | Twilight, Aaron Turner ...",0.52,0.52,0.52,0.68,0.68,0.68
3,"From Spain, Ajoblanco (alternatively known as ...","[Ajoblanco | country | Spain, Ajoblanco | alte...","[Ajoblanco | ingredient | Bread, Ajoblanco | a...",0.666667,0.666667,0.666667,1.0,1.0,1.0
4,Alfredo Zitarrosa performs Zamba music and sta...,"[Alfredo Zitarrosa | genre | Zamba (artform), ...","[Alfredo Zitarrosa | music genre | Zamba, Alfr...",0.222222,0.235294,0.228571,0.555556,0.588235,0.571429


In [8]:
from functools import wraps


def silent(exception_class=Exception):
    """
    A decorator to silence errors.

    The decorated function returns ``None`` instead of raising whenever it
    raises ``exception_class``; any other exception still propagates.

    Args:
        exception_class: Exception class (or tuple of classes) to swallow.
            Defaults to ``Exception``.

    Returns:
        A decorator. When used bare (``@silent`` without parentheses) the
        wrapped function itself is returned, silencing ``Exception``.
    """

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except exception_class:
                return None

        return wrapper

    # Support bare ``@silent``: in that case the "exception_class" argument is
    # really the function being decorated, not an exception specification.
    is_exception_spec = isinstance(exception_class, tuple) or (
        isinstance(exception_class, type) and issubclass(exception_class, BaseException)
    )
    if not is_exception_spec and callable(exception_class):
        target = exception_class
        exception_class = Exception  # wrapper reads this closure variable at call time
        return decorator(target)

    return decorator

In [9]:
# def parse_decision(comparison):
#     match = re.search(r"Result: (.*)", comparison)
#     if match:
#         decision = match.group(1).strip().strip("*").upper()
#         if decision in ["A", "B", "DRAW"]:
#             return decision
#     return None


# assert parse_decision("somethingsometghing Result: A") == "A"
# assert parse_decision("somethingsometghing\nResult: B") == "B"
# assert parse_decision("somethingsometghing Result: **DRAW**") == "DRAW"
# assert parse_decision("somethingsometghing # Result: **A**") == "A"
# assert parse_decision("somethingsometghing # Result: B") == "B"

In [10]:
from pydantic import BaseModel, Field, ValidationError


# Structured verdict expected from the judge model; replies are parsed into this model.
class Result(BaseModel):
    # Free-text justification for the verdict.
    explanation: str
    # Winning triple set. The field description asks for "A", "B" or "DRAW",
    # but the type is a plain str, so other values are not rejected here.
    decision: str = Field(description="A or B or DRAW")


# JSON schema of Result; it is interpolated into the prompt and also sent as the
# "value" of the request's response_format.
schema = Result.model_json_schema()

# Judge prompt template. The {text}, {triples_a}, {triples_b} and {schema}
# placeholders are filled in with str.format when a comparison is requested.
SYSTEM_PROMPT = """
Compare the triples in the form of `subject | relation | object` extracted by different joint entity relation extraction models from the text given below. 
- The triple set extracted by the model is good if it is complete, correct, consistent with the text and does not contain duplicate triples. 
- Priority: Completeness > Correctness > Consistency > No duplicates

# Text
{text}

# A
{triples_a}

# B
{triples_b}

Output the result in the following JSON format:
{schema}
"""


def compare_triples_with_llm(text, triples_a, triples_b):
    """Ask the judge model which of two triple sets better matches ``text``.

    Args:
        text: Source text the triples were extracted from.
        triples_a: Triples shown to the model under the "# A" heading.
        triples_b: Triples shown to the model under the "# B" heading.

    Returns:
        Result: the model's reply parsed and validated as a ``Result``.

    Raises:
        pydantic.ValidationError: if the reply cannot be validated as ``Result``.
    """
    # The prompt promises a "JSON format", so serialize the schema as real JSON
    # rather than interpolating the dict's Python repr (single quotes, True/False).
    prompt = SYSTEM_PROMPT.format(
        text=text,
        triples_a=triples_a,
        triples_b=triples_b,
        schema=json.dumps(schema, indent=2),
    )
    response = client.chat.completions.create(
        model=MODEL,
        temperature=TEMPERATURE,
        messages=[{"role": "system", "content": prompt}],
        # The schema dict itself (not the JSON text) is passed for structured output.
        response_format={"type": "json_object", "value": schema},
    )
    return Result.model_validate_json(response.choices[0].message.content)


@silent(ValidationError)
def compare_triples(text, triples_a, triples_b, flip=False):
    """Judge two triple sets, optionally presenting them in swapped order.

    Args:
        text: Source text the triples were extracted from.
        triples_a: Triples of model A.
        triples_b: Triples of model B.
        flip: When true, A and B are swapped in the prompt and the verdict is
            mapped back, so the returned decision always refers to the
            caller's A/B. The caller decides whether to flip.

    Returns:
        dict | None: the ``Result`` fields plus ``"flipped"``. ``"decision"``
        is ``"A"``, ``"B"``, ``"DRAW"``, or ``None`` when the model's answer
        is not one of those labels. ``None`` is returned when the reply fails
        validation (silenced by the decorator).
    """
    if flip:
        triples_a, triples_b = triples_b, triples_a

    result = compare_triples_with_llm(text, triples_a, triples_b)

    # Tolerate cosmetic variations such as " a ", "**A**" or "draw" instead of
    # silently discarding an otherwise usable verdict.
    label = result.decision.strip().strip("*").strip().upper()
    if flip and label in ("A", "B"):
        # Undo the swap so the label refers to the caller's ordering.
        label = "B" if label == "A" else "A"

    result.decision = label if label in ("A", "B", "DRAW") else None
    return {**result.model_dump(), "flipped": flip}


In [11]:
# Toy example: B repeats A's two triples and adds a third one that the text does not state.
text = "Claude is a large language model developed by Anthropic."
triples_a = "Claude | is a | large language model\nClaude | is developed by | Anthropic"
triples_b = "Claude | is a | large language model\nClaude | is developed by | Anthropic\nAnthropic | focus on | AI safety"

In [12]:
# Smoke-test the raw judge call on the toy example and show the parsed Result.
result = compare_triples_with_llm(text, triples_a, triples_b)
result

Result(explanation="Both sets A and B contain the same triples that are directly supported by the text. However, set B includes an additional triple, 'Anthropic | focus on | AI safety', which is not supported by the given text. Therefore, set A is more consistent with the provided text and does not contain any unsupported information.", decision='A')

In [13]:
# Smoke-test compare_triples (default flip=False) and pretty-print the returned dict.
result = compare_triples(text, triples_a, triples_b)
jprint(result)

{
  "explanation": "Both sets A and B correctly and completely extract the information that 'Claude is a large language model' and 'Claude is developed by Anthropic'. However, set B includes an additional triple 'Anthropic | focus on | AI safety' which is not supported by the given text, making it inconsistent with the text. Therefore, set A is preferred as it is complete, correct, and consistent with the text without any duplicates.",
  "decision": "A",
  "flipped": false
}


In [24]:
# Pair both experiments' predictions for each text and draw a 10% sample to judge.
# random_state pins the sample so the judged subset (and the win rates derived
# from it) can be reproduced on a fresh kernel.
# NOTE(review): an inner merge on 'text' yields one row per matching pair, so a text
# occurring more than once in either frame is multiplied — confirm texts are unique.
comp_df = (
    pd.merge(df_a, df_b, on='text', how='inner', suffixes=['_A', '_B'])
    [['text', 'predicted_triples_A', 'predicted_triples_B']]
    .sample(frac=0.1, random_state=42)
)
comp_df

Unnamed: 0,text,predicted_triples_A,predicted_triples_B
437,The ALCO RS-3 was produced from May 1950 to Au...,[ALCO RS-3 | builder | American Locomotive Com...,[ALCO RS-3 | manufacturer | American Locomotiv...
1098,Native Americans are an ethnic group within th...,"[Auburn, Alabama | is part of | Lee County, Al...",[Native Americans | ethnic group | United Stat...
68,A Fortress of Grey Ice is from the United Stat...,"[United States | language | English language, ...",[A Fortress of Grey Ice | country origin | Uni...
524,"Austin is the capital city of Texas, where Hou...","[Arlington, Texas | is part of | Tarrant Count...",[]
111,The AMC Matador is a mid-size car also known a...,[AMC Matador | alternative name | American Mot...,"[AMC Matador | assembly location | Thames, New..."
...,...,...,...
592,Philippines is the country the dish Batchoy co...,"[Philippines | ethnic group | Ilocano people, ...","[Philippines | dish | Batchoy, Philippines | e..."
265,Aleksandr Chumakov died in Russia where the le...,"[Aleksandr Chumakov | death place | Russia, Ru...","[Aleksandr Chumakov | death place | Russia, Ru..."
357,The Acharya Institute of Technology in Bangalo...,[Acharya Institute of Technology | president |...,[Acharya Institute of Technology | establishme...
412,A Fortress of Grey Ice is from the United Stat...,"[United States | language | English language, ...",[A Fortress of Grey Ice | country origin | Uni...


In [26]:
def process(row):
    """Judge one comparison row, presenting A/B in random order.

    Args:
        row: Mapping-like row providing ``"text"``, ``"predicted_triples_A"``
            and ``"predicted_triples_B"``.

    Returns:
        dict: the row's fields merged with ``"explanation"``, ``"decision"``
        and ``"flipped"``. When the judge reply could not be validated the
        explanation and decision are ``None`` so the row is kept.
    """
    # Flip with probability 0.5 to counter position bias in the judge.
    flip = random.random() < 0.5
    result = compare_triples(
        row["text"],
        row["predicted_triples_A"],
        row["predicted_triples_B"],
        flip=flip,
    )
    if result is None:
        # compare_triples returns None on a validation error; unpacking None
        # with ** would raise TypeError, so record an empty verdict instead.
        result = {"explanation": None, "decision": None, "flipped": flip}
    return {**row, **result}


def process_dataframe(dataf, num_threads=16):
    """Run ``process`` over every row of ``dataf`` in a thread pool.

    Args:
        dataf: DataFrame whose rows are passed to ``process``.
        num_threads: Maximum number of worker threads.

    Returns:
        pd.DataFrame: one row per successfully processed input row, in the
        same order as ``dataf``. Rows whose processing raised are reported on
        stdout and left out.
    """
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Remember each future's input position so the output order does not
        # depend on which request happens to finish first.
        positions = {
            executor.submit(process, row): position
            for position, (_, row) in enumerate(dataf.iterrows())
        }
        results = [None] * len(positions)
        for future in tqdm(as_completed(positions), total=len(positions)):
            try:
                results[positions[future]] = future.result()
            except Exception as e:
                print(f"Error processing row: {e}")

    # None placeholders would break or corrupt DataFrame construction; drop them.
    return pd.DataFrame([record for record in results if record is not None])


In [None]:
# Judge every sampled pair with the LLM (slow: one request per row).
# NOTE(review): the result is assigned back to comp_df, so re-running this cell
# feeds already-judged rows into the judge again — consider a separate name.
comp_df = process_dataframe(comp_df)
comp_df

 81%|████████▏ | 91/112 [06:20<00:52,  2.52s/it]

In [91]:
def compute_stats(df, model_a, model_b):
    """Compute the share of comparisons won by each model and the share of draws.

    Args:
        df: DataFrame with a ``decision`` column holding "A", "B" or "DRAW".
        model_a: Key under which the share of "A" decisions is reported.
        model_b: Key under which the share of "B" decisions is reported.

    Returns:
        dict: ``{model_a: share_A, model_b: share_B, "DRAW": share_draw}`` as
        floats. Shares are relative to ``len(df)``, so rows without a usable
        decision still count in the denominator. A label that never occurs
        yields 0.0, as does an empty frame.
    """
    total = len(df)
    counts = df["decision"].value_counts()

    def share(label):
        # .get avoids a KeyError when a label is absent from the decisions.
        return float(counts.get(label, 0) / total) if total else 0.0

    return {
        model_a: share("A"),
        model_b: share("B"),
        "DRAW": share("DRAW"),
    }

# Win/draw shares for the judged sample, keyed by experiment id.
compute_stats(comp_df, exp_A, exp_B)

{'moral-prof': 0.3, 'gawsy-paps': 0.6, 'DRAW': 0.1}

In [96]:
def present_row(row):
    """Print one judged comparison: text, both triple sets, flip flag, explanation and decision."""
    divider = '-' * 100

    print(row['text'])
    print(divider)

    # (label, key, whether a divider follows this entry)
    layout = (
        ("Model A: ", 'predicted_triples_A', True),
        ("Model B: ", 'predicted_triples_B', True),
        ("Flipped: ", 'flipped', False),
        ("Explanation:\n", 'explanation', True),
        ("Decision: ", 'decision', False),
    )
    for label, key, rule_after in layout:
        print(f"{label}{row[key]}")
        if rule_after:
            print(divider)

In [97]:
# Inspect a single judged comparison (the second row of comp_df).
row = comp_df.iloc[1]
present_row(row)

Dale Tallon is the general manager of the Florida Panthers.
Jens Hartel plays for SV Germania Schoneiche.
The alternative name for Ajoblanco is "Ajo blanco".
Abdulsalami Abubakar was born in Niger State and was the Chief of the Defence Staff in Nigeria.
Robert E Lee was a commander in the American Civil War when the Battle of Cold Harbor took place. Aaron S Daggett fought in this battle.
Aenir was written by Garth Nix and has an ISBN number of 0-439-17684-0. The book is in the print form.
----------------------------------------------------------------------------------------------------
Model A: ['Florida Panthers | general manager | Dale Tallon', 'Jens Härtel | club | SV Germania Schöneiche', 'Ajoblanco | alternative name | Ajo blanco', 'Abdulsalami Abubakar | office | Chief of the Defence Staff (Nigeria)', 'Abdulsalami Abubakar | birth place | Niger State', 'Battle of Cold Harbor | is part of military conflict | American Civil War', 'Aaron S. Daggett | battle | Battle of Cold Harbor