In [1]:
from collections import Counter, defaultdict, namedtuple
from copy import deepcopy
import functools
import inspect
import json
import os
from pathlib import Path
import pickle
from pprint import pp, pprint, pformat
import re
import sys
import time
from typing import Dict, List

import jmespath
from jsonschema import validate
import numpy as np
import pandas as pd
import plotly.express as px
import xmltodict

from colorutils import Color

from dotenv import load_dotenv
from jinja2 import Environment, FileSystemLoader, Template
import textwrap
from tqdm.auto import tqdm
# from tqdm import tqdm

import openai

from aic_nlp_utils.json import read_jsonl, read_json, write_json, write_jsonl, process_to_jsonl
from aic_nlp_utils.pycfg import parse_pycfg_args, read_pycfg
%load_ext autoreload
%autoreload 2

from prompt_opt.optimizers.predict_evaluate import get_candidate_score, rank_candidates
from prompt_opt.utils import *

# Add the automated-fact-checking checkout to the module search path.
# NOTE(review): hard-coded absolute, user-specific path — the notebook will not run elsewhere as is.
sys.path.append("/home/drchajan/devel/python/FC/automated-fact-checking")

# Presumably makes vLLM start its worker processes with the 'spawn' method — TODO confirm against vLLM docs.
os.environ['VLLM_WORKER_MULTIPROC_METHOD']='spawn'
# Load variables from a .env file; the recorded output of this cell is `False`.
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


False

In [9]:
from networkx import dfs_edges


def load_scores(pattern):
    """Collect per-candidate mean scores from experiment archives into one wide table.

    Every path matching ``pattern`` (relative to the current working directory)
    is treated as an experiment root. Each ``seed_*`` sub-directory that holds
    an ``archive.jsonl`` contributes its candidates. For every candidate, split
    and metric, the mean sample score and the number of samples are recorded.
    A virtual ``"all"`` split is added whose score is the sample-weighted mean
    over the candidate's splits.

    Args:
        pattern: Glob pattern of experiment roots, e.g. ``"EXP/BBH*"``.

    Returns:
        A DataFrame with one row per (name, seed, cid, metric), one score
        column per split (plus ``all``) and a matching ``<split>_samples``
        column holding the sample counts as nullable integers. When no scores
        could be loaded, an empty frame with only the four key columns is
        returned.
    """
    key_cols = ['name', 'seed', 'cid', 'metric']

    recs = []
    for exp_root in Path().glob(pattern):
        for exp_dir in exp_root.glob("seed_*"):
            archive_file = Path(exp_dir, "archive.jsonl")
            if not archive_file.is_file():
                continue
            for candidate in read_jsonl(archive_file):
                if "split" not in candidate:
                    continue
                for split, samples in candidate["split"].items():
                    # Skip empty splits (nothing to average, and samples[0]
                    # below would fail) and splits not fully evaluated yet.
                    if not samples or any("eval" not in s for s in samples):
                        continue
                    # Metric names come from the first sample; every sample of
                    # the split is expected to carry the same metrics.
                    for metric in samples[0]["eval"].keys():
                        scores = [sample["eval"][metric]["score"] for sample in samples]
                        recs.append({
                            "name": Path(exp_root).name,
                            "seed": Path(exp_dir).name.replace("seed_", ""),
                            "cid": candidate["id"],
                            "split": split,
                            "metric": metric,
                            "score": np.mean(scores),
                            "samples": len(scores),
                        })

    if not recs:
        # Nothing matched: return an empty table instead of failing on the
        # missing 'score' / 'samples' columns further down.
        return pd.DataFrame(columns=key_cols)

    df = pd.DataFrame(recs)

    # Compute the virtual "all" split: the sample-weighted mean over all splits.
    weighted = df.assign(weighted_score=df['score'] * df['samples'])
    agg = weighted.groupby(key_cols).agg(
        total_weighted_score=('weighted_score', 'sum'),
        total_samples=('samples', 'sum'),
    ).reset_index()
    agg['score'] = agg['total_weighted_score'] / agg['total_samples']
    agg['split'] = 'all'
    agg = agg[['name', 'seed', 'cid', 'split', 'metric', 'score', 'total_samples']]
    agg = agg.rename(columns={'total_samples': 'samples'})
    df = pd.concat([df, agg], ignore_index=True)

    # Long -> wide: one score column per split ...
    score_pivot = df.pivot_table(
        index=key_cols,
        columns='split',
        values='score',
    ).reset_index()

    # ... and one "<split>_samples" column per split.
    samples_pivot = df.pivot_table(
        index=key_cols,
        columns='split',
        values='samples',
    ).reset_index()
    samples_pivot.columns = [
        col if col in key_cols else f"{col}_samples"
        for col in samples_pivot.columns
    ]

    # Pivoting yields floats (NaN where a split is missing); store the counts
    # as nullable integers instead.
    count_cols = [col for col in samples_pivot.columns if col.endswith('_samples')]
    samples_pivot = samples_pivot.astype({col: 'Int64' for col in count_cols})

    return pd.merge(score_pivot, samples_pivot, on=key_cols)

# Score table for every experiment directory matching EXP/BBH* (relative to the working directory).
df = load_scores("EXP/BBH*")
df


Unnamed: 0,name,seed,cid,metric,all,dev,trn,tst,all_samples,dev_samples,trn_samples,tst_samples
0,BBH_date_understanding_V1_EA,297831,1,oa,0.725,0.7,0.833333,0.708333,40,10,6,24
1,BBH_date_understanding_V1_EA,297831,2,oa,0.625,0.9,0.333333,0.583333,40,10,6,24
2,BBH_date_understanding_V1_EA,297831,3,oa,0.875,0.9,0.833333,0.875000,40,10,6,24
3,BBH_date_understanding_V1_EA,297831,4,oa,0.800,1.0,0.500000,0.791667,40,10,6,24
4,BBH_date_understanding_V1_EA,297831,5,oa,0.350,0.2,0.333333,0.416667,40,10,6,24
...,...,...,...,...,...,...,...,...,...,...,...,...
859,BBH_temporal_sequences_orig_V1_EA,5642423,30,oa,1.000,1.0,1.000000,1.000000,40,10,6,24
860,BBH_temporal_sequences_orig_V1_EA,5642423,31,oa,0.000,0.0,0.000000,0.000000,40,10,6,24
861,BBH_temporal_sequences_orig_V1_EA,5642423,32,oa,0.000,0.0,0.000000,0.000000,40,10,6,24
862,BBH_temporal_sequences_orig_V1_EA,5642423,33,oa,1.000,1.0,1.000000,1.000000,40,10,6,24


In [10]:
# Best candidate per experiment: ignore rows with any missing value, then pick
# the row with the highest "all" score within each `name`.
best_idx = df.dropna().groupby("name")["all"].idxmax()
best_performers = df.loc[best_idx].reset_index(drop=True)
best_performers

Unnamed: 0,name,seed,cid,metric,all,dev,trn,tst,all_samples,dev_samples,trn_samples,tst_samples
0,BBH_date_understanding_V1_EA,297831,14,oa,0.925,0.8,1.0,0.958333,40,10,6,24
1,BBH_date_understanding_orig_V1_EA,2291309,2,oa,0.95,1.0,0.833333,0.958333,40,10,6,24
2,BBH_disambiguation_qa_V1_EA,9593128,31,oa,0.95,0.9,1.0,0.958333,40,10,6,24
3,BBH_disambiguation_qa_orig_V1_EA,7335201,40,oa,0.875,0.8,1.0,0.875,40,10,6,24
4,BBH_hyperbaton_V1_EA,4121327,5,oa,1.0,1.0,1.0,1.0,40,10,6,24
5,BBH_hyperbaton_orig_V1_EA,9411324,1,oa,1.0,1.0,1.0,1.0,40,10,6,24
6,BBH_penguins_in_a_table_V1_EA,6046176,5,oa,1.0,1.0,1.0,1.0,40,10,6,24
7,BBH_penguins_in_a_table_orig_V1_EA,1686955,6,oa,1.0,1.0,1.0,1.0,40,10,6,24
8,BBH_salient_translation_error_detection_orig_V...,5719577,1,oa,0.825,0.9,0.666667,0.833333,40,10,6,24
9,BBH_temporal_sequences_V1_EA,4337022,8,oa,1.0,1.0,1.0,1.0,40,10,6,24


In [11]:
# Same selection, but per (experiment, seed) pair.
# NOTE: this rebinds `best_idx` and `best_performers`.
best_idx = df.dropna().groupby(["name", "seed"])["all"].idxmax()
best_performers = df.loc[best_idx].reset_index(drop=True)
best_performers

Unnamed: 0,name,seed,cid,metric,all,dev,trn,tst,all_samples,dev_samples,trn_samples,tst_samples
0,BBH_date_understanding_V1_EA,297831,14,oa,0.925,0.8,1.0,0.958333,40,10,6,24
1,BBH_date_understanding_V1_EA,4150482,8,oa,0.925,0.9,0.833333,0.958333,40,10,6,24
2,BBH_date_understanding_V1_EA,8781166,6,oa,0.925,0.9,1.0,0.916667,40,10,6,24
3,BBH_date_understanding_orig_V1_EA,2291309,2,oa,0.95,1.0,0.833333,0.958333,40,10,6,24
4,BBH_date_understanding_orig_V1_EA,7554421,12,oa,0.95,0.9,1.0,0.958333,40,10,6,24
5,BBH_date_understanding_orig_V1_EA,9195109,14,oa,0.925,1.0,0.833333,0.916667,40,10,6,24
6,BBH_disambiguation_qa_V1_EA,3909189,22,oa,0.875,1.0,1.0,0.791667,40,10,6,24
7,BBH_disambiguation_qa_V1_EA,8891931,22,oa,0.725,0.9,0.666667,0.666667,40,10,6,24
8,BBH_disambiguation_qa_V1_EA,9593128,31,oa,0.95,0.9,1.0,0.958333,40,10,6,24
9,BBH_disambiguation_qa_orig_V1_EA,6857035,1,oa,0.775,0.7,0.833333,0.791667,40,10,6,24


# OLD Review all below!

In [4]:
pip install xmltodict

Collecting xmltodict
  Using cached xmltodict-0.14.2-py2.py3-none-any.whl.metadata (8.0 kB)
Using cached xmltodict-0.14.2-py2.py3-none-any.whl (10.0 kB)
Installing collected packages: xmltodict
Successfully installed xmltodict-0.14.2
Note: you may need to restart the kernel to use updated packages.


In [93]:
# archive_dir = "/home/drchajan/devel/python/FC/prompt_opt/EXP/eventsV3attributions-V1d/seed_1315744"
# archive_dir = "/home/drchajan/devel/python/FC/prompt_opt/EXP/eventsV3people-V1d/seed_4359670"
# archive_dir = "/home/drchajan/devel/python/FC/prompt_opt/EXP/eventsV3orgs-V1d/seed_9554819"
# archive_dir = "/home/drchajan/devel/python/FC/prompt_opt/EXP/eventsV3locations-V1d/seed_7602201"
# archive_dir = "/home/drchajan/devel/python/FC/prompt_opt/EXP/cro_V1/seed_7351606"
# archive_dir = "/home/drchajan/devel/python/FC/prompt_opt/EXP/events2attributions_V2/seed_6612193"
# archive_dir = "/home/drchajan/devel/python/FC/prompt_opt/EXP/events2attributions_V2/seed_115822"
# Experiment run (seed directory) whose archive is inspected by later cells.
# NOTE(review): hard-coded absolute, user-specific path.
archive_dir = "/home/drchajan/devel/python/FC/prompt_opt/EXP/events2attributions_V2_direct/seed_1989249"
archive_jsonl = Path(archive_dir, "archive.jsonl")
archive = read_jsonl(archive_jsonl)

In [10]:
def compare_pipelines(res_paths):
    """Gather the per-sample scores of several evaluation files.

    Args:
        res_paths: Mapping from a pipeline label to the path of its evaluation
            JSONL file. Each record maps a metric name to a dict holding a
            ``"score"`` entry.

    Returns:
        Dict mapping each label to a ``defaultdict(list)`` that maps a metric
        name to the list of scores found for it, in file order.
    """
    # Read every file first, then aggregate.
    loaded = {}
    for label, path in res_paths.items():
        loaded[label] = read_jsonl(path)

    collected = {label: defaultdict(list) for label in loaded}
    for label, samples in loaded.items():
        per_metric = collected[label]
        for sample in samples:
            for metric, result in sample.items():
                per_metric[metric].append(result["score"])

    return collected

# Evaluation files of two pipeline runs, keyed by a short label.
# NOTE(review): hard-coded absolute, user-specific paths.
res_paths = {
    "ds-llama": "/home/drchajan/devel/python/FC/prompt_opt/data/extraction_pipeline/extraction_pipeline_events_complete_V1_croV2-ds-llama/store_36_merge_evaluate.jsonl",
    "qwen3_14b": "/home/drchajan/devel/python/FC/prompt_opt/data/extraction_pipeline/extraction_pipeline_events_complete_V1_croV2-qwen3_14b/store_36_merge_evaluate.jsonl",
}
# label -> metric name -> list of per-sample scores
res = compare_pipelines(res_paths)

In [19]:
def cmp():
    """Print the mean score of every metric for every pipeline in the global ``res``."""
    for label, metric_scores in res.items():
        for metric, values in metric_scores.items():
            print(label, "\t", metric, "\t\t\t\t\t", np.mean(values))
            
            
cmp()

ds-llama 	 oa_people 					 0.6459824067892332
ds-llama 	 mbj_people 					 0.748
ds-llama 	 oa_locs 					 0.8039645843832004
ds-llama 	 mbj_locs 					 0.881
ds-llama 	 oa_orgs 					 0.7582635638933772
ds-llama 	 mbj_orgs 					 0.7906666666666665
ds-llama 	 mbj_event-subevent_desc 					 0.7643333333333334
ds-llama 	 mbj_event-subevent_desc+span 					 0.7769999999999999
ds-llama 	 mbj_event-subevent_desc+rep 					 0.7756666666666666
ds-llama 	 mbj_event-subevent_desc+time 					 0.7853333333333333
qwen3_14b 	 oa_people 					 0.8499899324863239
qwen3_14b 	 mbj_people.dseek_llama70b 					 0.8629999999999999
qwen3_14b 	 oa_locs 					 0.8177335257335258
qwen3_14b 	 mbj_locs.dseek_llama70b 					 0.8630000000000001
qwen3_14b 	 oa_orgs 					 0.6892048134289488
qwen3_14b 	 mbj_orgs.dseek_llama70b 					 0.799
qwen3_14b 	 mbj_event-subevent_desc.dseek_llama70b 					 0.7753333333333333
qwen3_14b 	 mbj_event-subevent_desc+span.dseek_llama70b 					 0.7476666666666667
qwen3_14b 	 mbj_event-subevent_desc

# Finetune pipeline evaluations (CANERASE)

In [2]:
def analyse(data):
    """Summarise evaluation records as one row per metric.

    Args:
        data: Iterable of records, each mapping a metric name to a dict with a
            ``"score"`` entry.

    Returns:
        DataFrame with columns ``metric``, ``score`` (mean over all records
        containing the metric) and ``length`` (number of scores averaged).
    """
    scores_by_metric = defaultdict(list)
    for entry in data:
        for metric, result in entry.items():
            scores_by_metric[metric].append(result["score"])

    rows = [
        {"metric": metric, "score": np.mean(values), "length": len(values)}
        for metric, values in scores_by_metric.items()
    ]
    return pd.DataFrame(rows)

# NOTE: the recorded run of this cell failed with FileNotFoundError — this evaluation file did not exist.
data = read_jsonl("/home/drchajan/devel/python/FC/prompt_opt/data/extraction_pipeline/extraction_pipeline_events_complete_V1_croV2-ds-llama_s1234/store_36_merge_evaluate.jsonl")
analyse(data)

FileNotFoundError: [Errno 2] No such file or directory: '/home/drchajan/devel/python/FC/prompt_opt/data/extraction_pipeline/extraction_pipeline_events_complete_V1_croV2-ds-llama_s1234/store_36_merge_evaluate.jsonl'

In [3]:
# Per-metric summary for the run directory ending in `ners_V1_croV2-ds-llama_s1234`.
data = read_jsonl("/home/drchajan/devel/python/FC/prompt_opt/data/extraction_pipeline/extraction_pipeline_ners_V1_croV2-ds-llama_s1234/store_13_merge_evaluate.jsonl")
analyse(data)

Unnamed: 0,metric,score,length
0,oa_people,0.862067,30
1,mbj_people,0.887333,30
2,oa_locs,0.676198,30
3,mbj_locs,0.839333,30
4,oa_orgs,0.658653,30
5,mbj_orgs,0.789333,30


In [4]:
# Per-metric summary for the run directory ending in `ners_V1_croV2-ds-llama_s1234_2`.
data = read_jsonl("/home/drchajan/devel/python/FC/prompt_opt/data/extraction_pipeline/extraction_pipeline_ners_V1_croV2-ds-llama_s1234_2/store_13_merge_evaluate.jsonl")
analyse(data)


Unnamed: 0,metric,score,length
0,oa_people,0.862067,30
1,mbj_people,0.887333,30
2,oa_locs,0.676198,30
3,mbj_locs,0.821,30
4,oa_orgs,0.658653,30
5,mbj_orgs,0.789333,30


In [5]:
# Per-metric summary for the run directory ending in `ners_V1_croV2-ds-llama_s1235`.
data = read_jsonl("/home/drchajan/devel/python/FC/prompt_opt/data/extraction_pipeline/extraction_pipeline_ners_V1_croV2-ds-llama_s1235/store_13_merge_evaluate.jsonl")
analyse(data)


Unnamed: 0,metric,score,length
0,oa_people,0.862067,30
1,mbj_people,0.887333,30
2,oa_locs,0.676198,30
3,mbj_locs,0.821,30
4,oa_orgs,0.658653,30
5,mbj_orgs,0.789333,30


In [7]:
# Per-metric summary for the run directory ending in `ners_V1_croV2-ds-llama_s1236`.
data = read_jsonl("/home/drchajan/devel/python/FC/prompt_opt/data/extraction_pipeline/extraction_pipeline_ners_V1_croV2-ds-llama_s1236/store_13_merge_evaluate.jsonl")
analyse(data)


Unnamed: 0,metric,score,length
0,oa_people,0.862067,30
1,mbj_people,0.887333,30
2,oa_locs,0.676198,30
3,mbj_locs,0.821,30
4,oa_orgs,0.658653,30
5,mbj_orgs,0.789333,30


# Playground OpenAI Models

In [97]:
# task_prompt = archive[0]["split"]["trn"][0]["messages"][4]["content"]
# Content of the first message stored on the third archive record; a later cell sends it as the user message.
task_prompt = archive[2]["messages"][0]["content"]
print(task_prompt)

# Task: Improve Instructions for the LLM  

We are optimizing prompts that generate JSON output. Each prompt consists of three key components:  

1) **Instructions**: Guidelines describing how to transform a query into an answer.  
2) **Query**: Encapsulated within `<query></query>` tags.  
3) **JSON Schema**: Defines the expected structure of the output.  

## Provided Data  
You will receive:  

- The original **instructions** to be improved.  
- The **JSON schema** specifying the required output format.  
- One or more **examples** where the original instructions failed, enclosed in `<example>` tags with an `id` attribute for reference. Each example contains:  
  1) A **query** (`<query></query>`)  
  2) The **LLM's prediction** (`<prediction></prediction>`)  
  3) The **gold-standard answer** (`<gold></gold>`)  

## Your Task  

Analyze the original instructions and refine them to ensure the model's predictions align more closely with the gold answers. Specifically:  

1) **Identif

In [44]:
from openai import OpenAI

client = OpenAI()

THINKING_SYSTEM_PROMPT="""**You are a helpful assistant dedicated to solving any task given.**  
Your responses must follow a two-part structure:  

1. **Thinking Part** (enclosed within `<think>` and `</think>` tags): This section contains your detailed reasoning and step-by-step thought process for solving the problem.  
2. **Final Answer**: This is the user-facing response and should NOT be enclosed in any tags. Since the reasoning part is hidden from the user, ensure that all necessary information is fully conveyed in this section.  

**Important Guidelines:**  
- Always follow this structured format.  
- The final answer must be clear, complete, and fully explain the solution step by step, just as in the `<think>` section.  
- Do not omit critical details from the reasoning process when constructing the final answer.  
- The final answer should be written as if the `<think>` section does not exist.  
- If a problem involves calculations, formulas, or logic, ensure the final answer includes them explicitly.  
- Do not reference or mention the `<think>` section in the final answer.  
- If a problem has multiple solutions, include the best one in the final answer, while the `<think>` section can explore alternatives.
"""

# Send THINKING_SYSTEM_PROMPT (role "developer") followed by `task_prompt`
# (role "user") and print the text of the first returned choice.
completion = client.chat.completions.create(
    # model="o3-mini",
    model="gpt-4o-mini",
    # model="gpt-4o",
    # temperature=0.6,
    messages=[
        {
            "role": "developer",
            "content": THINKING_SYSTEM_PROMPT,
        },
        {
            "role": "user",
            # "content": "Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?",
            "content": task_prompt,
        },
    ],
)

print(completion.choices[0].message.content)

1. **Identify Events**: Start by identifying all events from the `<events>` section of the query. Each event will be a distinct object that is represented in the final answer.

2. **Extract Event IDs**: For each event in the `<events>` section, note down its unique identifier (`id`). This will be necessary for the construction of the final answer as it references how the event is categorized.

3. **Attribution Identification**: For each identified event, you must attribute relevant organizations or individuals based on contextual information in the `<text>`, `<organizations>`, and `<people>` sections:
   - **Analyze Context**: Read the provided text comprehensively to understand which organizations or individuals associated with the events due to quotes, actions, or responsibilities mentioned in the text.
   - **Match with References**: Compare these references with the identifiers provided in the `<organizations>` and `<people>` sections. List their IDs to create an attribution list f

In [61]:
from openai import OpenAI

# Initialize the OpenAI client
client = OpenAI()

# JSON schema of the expected answer: an object with exactly three required string fields.
# NOTE(review): the top-level "name": "json_response" entry sits next to "type" and
# "properties" — looks like a leftover label rather than part of the schema; confirm.
schema = {
    "name": "json_response",
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "surname": {"type": "string"},
        "birth_date": {"type": "string"},
    },
    "required": [
            "name",
            "surname",
            "birth_date"
        ],
    "additionalProperties": False
}

# Request a reply constrained to `schema` through the `response_format` argument.
# NOTE(review): the schema wrapper is named "math_response" although the schema
# describes a person — probably copied from another example; confirm.
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": 'Define Churchill as JSON. Use format: {"name": ..., "surname": ..., "birth_date": ...}'}],
    # max_tokens=500,
    response_format={
        "type": "json_schema",
        "json_schema": {
            "name": "math_response",
            "schema": schema,
            "strict": True},
        },
    seed=0,
    temperature= 0.0,
    frequency_penalty= 0.05
)


# Print the generated text
print(response.choices[0].message.content)

{"name":"Winston","surname":"Churchill","birth_date":"1874-11-30"}


# Playground OpenAI API

In [28]:
# reasoning https://github.com/vllm-project/vllm/pull/12955
# openai.api_base = "http://localhost:8881/v1"
from openai import OpenAI

# Initialize the OpenAI client
client = OpenAI(base_url="http://g04:8333/v1")

# NOTE: this flat person schema is never used — `schema` is reassigned by the
# next statement before any request is made.
schema = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "surname": {"type": "string"},
        "birth_date": {"type": "string"},
    },
    "required": [
            "name",
            "surname",
            "birth_date"
        ],
    "additionalProperties": False
}

schema = {
    "type": "object",
    "properties": {
        "people": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "name": {
                        "type": "string"
                    },
                    "roles": {
                        "type": "array",
                        "items": {
                            "type": "string"
                        }
                    }
                },
                "required": [
                    "name",
                    "roles"
                ]
            }
        }
    },
    "required": [
        "people"
    ]
}

# Ask the locally served model for a reply constrained to `schema`; the schema is
# forwarded as `guided_json` through `extra_body`.
# NOTE(review): `meta` is not defined in the visible part of this notebook — this
# cell depends on state created elsewhere; confirm before re-running.
response = client.chat.completions.create(
    # model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    model="casperhansen/deepseek-r1-distill-llama-70b-awq",
    # messages=[{"role": "user", "content": 'Define Churchill as JSON. Use format: {"name": ..., "surname": ..., "birth_date": ...}'}],
    messages=[{"role": "user", "content": meta[0]["messages"][0]["content"]}],
    # max_tokens=500,
    extra_body={"guided_json": schema},
    seed=0,
    temperature= 0.0,
    frequency_penalty= 0.05
)


# Print the generated text
print(response.choices[0].message.content)
# Reasoning text is read from a separate `reasoning_content` attribute of the message.
print(response.choices[0].message.reasoning_content)

{"people":[{"name":"Ivana Lokajová","roles":["infiltrátorka"]},{"name":"Sonja Hájková","roles":["doktorka","provozovatelka"]},{"name":"Radkin Honzák","roles":["psychiatr","infiltrační poradce"]},{"name":"Barbora Vlková","roles":["osoba"]},{"name":"Simona Vlková","roles":["matka Barbory Vlkové"]},{"name":"Zdena Švancarová","roles":["životní poradkyně","spirituální uklízení"]},{"name":"Eva Xaris Strmisková","roles":["vědma","vzdělávací kurz"]}]}

Okay, I need to extract the roles of people mentioned in the given text. Let me read through the text carefully.

First, I see Ivana Lokajová is mentioned as an infiltrátorka, so her role is "infiltrátorka." She also participated in a sněm, but that's part of her role as infiltrator.

Next, Sonja Hájková is described as provozující holistické Centrum přírodní léčby. So her roles are "doktorka" and "provozovatelka."

Radkin Honzák is introduced as a psychiatrist and infiltrační poradce, so his roles are "psychiatr" and "infiltrační poradce."

Bar

In [4]:
response.choices[0].message

ChatCompletionMessage(content='{\n\n"name": "Winston Churchill",\n\n"surname": "Churchill",\n\n"birth_date": "1874-11-30T00:00:00Z[GMT]([+0:00])"  \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n}', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[], reasoning_content='\n\n')

In [7]:
msg = response.choices[0].message

In [None]:
hasattr(msg, "reasoning_content")


False

In [None]:
# Adjacent string literals are joined verbatim, so the first part must end with
# a space; without it the prompt read "...car_type ofthe most iconic car...".
prompt = ("Generate a JSON with the brand, model and car_type of "
          "the most iconic car from the 90's, think in 100 tokens")
# NOTE(review): `json_schema` is not defined in the visible part of this
# notebook — confirm it exists before re-running this cell.
completion = client.chat.completions.create(
    model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    messages=[{
        "role": "user",
        "content": prompt,
    }],
    extra_body={"guided_json": json_schema},
)
print("content: ", completion.choices[0].message.content)
print("reasoning_content: ", completion.choices[0].message.reasoning_content)

# Events: Transform Predictions and Answers
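These helpers make results easier to inspect: the numeric ids in predictions and gold answers are replaced with readable labels looked up in the query.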

In [27]:
def transform_qa(q, a, target=None, element=None):
    """Replace the ids in an answer with readable labels taken from the query.

    The query is an XML fragment that may contain ``<people>``,
    ``<locations>``, ``<organizations>`` and ``<events>`` sections. People are
    labelled ``name(P<id>)``; locations and organizations are labelled
    ``abbreviation-or-name/type(L<id>)`` and ``.../type(O<id>)``. All entity
    labels share one lookup table keyed by the raw id, so on an id collision
    the later section wins (people, then locations, then organizations).

    Args:
        q: Query text (XML fragment without a single root element).
        a: The answer to transform; it is deep-copied, never modified.
            Without ``target`` it is an iterable of event dicts whose
            ``"people"``, ``"locations"``, ``"orgs"`` and ``"attributions"``
            id lists are replaced by sorted label lists. With ``target`` it
            is a dict and ``a[target]`` is a list of dicts holding an
            ``"event_id"`` and an id list under ``element``.
        target: Optional key of ``a`` to transform.
        element: Key of the id list inside each ``a[target]`` item; required
            when ``target`` is given.

    Returns:
        The transformed copy of ``a``. With ``target``, ``a[target]`` becomes
        a list of ``{"event": <event text>, element: <sorted labels>}``;
        items whose event id is unknown are dropped with a printed warning.
        Ids missing from the query are rendered as ``ERR_<P|L|O><id>``
        (or ``ERR_<id>`` when the entity type is not known).

    Raises:
        KeyError: If ``target`` is given but absent from ``a``.
    """
    def pname(e):
        return e["@name"] + f'(P{e["@id"]})'

    def typed_name(e, prefix):
        # Prefer the abbreviation; fall back to the full name when it is empty.
        r = e["@abbreviation"] if e["@abbreviation"] != "" else e["@name"]
        r += "/" + e["@type"]
        return r + f'({prefix}{e["@id"]})'

    def lname(e):
        return typed_name(e, "L")

    def oname(e):
        return typed_name(e, "O")

    def children(doc, section, tag):
        """Return the `tag` entries of an optional section as a list (possibly empty)."""
        node = doc.get(section)
        if not node:
            # Section missing or present but empty.
            return []
        items = node[tag]
        # xmltodict yields a bare dict for a single child, a list for several.
        return items if isinstance(items, list) else [items]

    t = xmltodict.parse("<doc>" + q + "</doc>")["doc"]
    if not isinstance(t, dict):
        # Empty or text-only query: there are no sections to read.
        t = {}

    id2data = {}
    id2data.update({e["@id"]: pname(e) for e in children(t, "people", "person")})
    id2data.update({e["@id"]: lname(e) for e in children(t, "locations", "loc")})
    id2data.update({e["@id"]: oname(e) for e in children(t, "organizations", "org")})

    # Events are read like the other sections, so an empty or missing
    # <events> section yields no events instead of raising.
    id2event = {int(e["@id"]): e["@text"] for e in children(t, "events", "event")}

    a = deepcopy(a)
    if not target:
        for ev in a:
            ev["people"] = sorted([id2data.get(str(id_), f"ERR_P{id_}") for id_ in ev["people"]])
            ev["locations"] = sorted([id2data.get(str(id_), f"ERR_L{id_}") for id_ in ev["locations"]])
            ev["orgs"] = sorted([id2data.get(str(id_), f"ERR_O{id_}") for id_ in ev["orgs"]])
            ev["attributions"] = sorted([id2data.get(str(id_), f"ERR_{id_}") for id_ in ev["attributions"]])
    else:
        # Use the same error labels as the branch above instead of tagging
        # every unknown id as an organization.
        err_prefix = {"people": "P", "locations": "L", "orgs": "O"}.get(element, "")
        ret = []
        for e in a[target]:
            if e["event_id"] not in id2event:
                print(f"WARNING: missing event_id: {e['event_id']}")
                continue
            event = id2event[e["event_id"]]
            el = sorted([id2data.get(str(id_), f"ERR_{err_prefix}{id_}") for id_ in e[element]])
            ret.append({"event": event, element: el})
        a[target] = ret

    return a
    
    
    
# Try the transformation on one test-split sample of the first archive record.
sample = archive[0]["split"]["tst"][6]
q = sample["query"]
# a = sample["gold"]
a = sample["pred"]
# transform_qa(q, a)
# NOTE: the recorded run of this cell raised KeyError: 'attributions'.
transform_qa(q, a, target="attributions", element="attribution")

KeyError: 'attributions'

In [26]:
pprint(sample)

{'corrections': 0,
 'eval': {'oa': {'reasoning': 'The predicted output scores overall 94%, let us '
                              'align the predicted output to the gold and '
                              'analyze the differences:\n'
                              '  KEY = The predicted key "locations" exactly '
                              'matches the gold.\n'
                              '  VALUE = The predicted list scores 94%:\n'
                              '      KEY = The predicted key "locations" '
                              'exactly matches the gold.\n'
                              '      VALUE = The predicted list scores 50%:\n'
                              '        The predicted output misses the "6743" '
                              'list item from the gold.\n'
                              '        The predicted value "7953" exactly '
                              'matches the gold.\n'
                              '\n'
                              '      KEY = 

In [30]:
def transform_all(archive, out_jsonl, **kwargs):
    """Apply ``transform_qa`` to the gold and predicted answer of every sample.

    Works on a deep copy of ``archive``; candidates without a ``"split"`` entry
    are kept unchanged. Progress is printed per split and per sample index.

    Args:
        archive: List of candidate records.
        out_jsonl: Path the transformed archive is written to.
        **kwargs: Forwarded to ``transform_qa`` (e.g. ``target``, ``element``).

    Returns:
        The transformed copy of the archive.
    """
    transformed = deepcopy(archive)
    for candidate in transformed:
        if "split" in candidate:
            for split, samples in candidate["split"].items():
                print("split", split)
                for idx, sample in enumerate(samples):
                    print("idx", idx)
                    for answer_key in ("gold", "pred"):
                        sample[answer_key] = transform_qa(sample["query"], sample[answer_key], **kwargs)
    write_jsonl(out_jsonl, transformed)
    return transformed
    
# Write a transformed copy of the archive next to the original. The commented
# lines are alternative target/element pairs; only the last call is active.
# transform_all(archive, Path(archive_dir, "archive_trans.jsonl"))
# archive_trans = transform_all(archive, Path(archive_dir, "archive_trans.jsonl"), target="attributions", element="attribution")
# archive_trans = transform_all(archive, Path(archive_dir, "archive_trans.jsonl"), target="people", element="people")
# archive_trans = transform_all(archive, Path(archive_dir, "archive_trans.jsonl"), target="orgs", element="orgs")
archive_trans = transform_all(archive, Path(archive_dir, "archive_trans.jsonl"), target="locations", element="locations")

split trn
idx 0
idx 1
idx 2
idx 3
idx 4
idx 5
split dev
idx 0
idx 1
idx 2
idx 3
idx 4
idx 5
idx 6
idx 7
idx 8
idx 9
split tst
idx 0
idx 1
idx 2
idx 3
idx 4
idx 5
idx 6
idx 7
idx 8
idx 9
idx 10
idx 11
idx 12
idx 13
split trn
idx 0
idx 1
idx 2
idx 3
idx 4
idx 5
split dev
idx 0
idx 1
idx 2
idx 3
idx 4
idx 5
idx 6
idx 7
idx 8
idx 9
split tst
idx 0
idx 1
idx 2
idx 3
idx 4
idx 5
idx 6
idx 7
idx 8
idx 9
idx 10
idx 11
idx 12
idx 13
split trn
idx 0
idx 1
idx 2
idx 3
idx 4
idx 5
split dev
idx 0
idx 1
idx 2
idx 3
idx 4
idx 5
idx 6
idx 7
idx 8
idx 9
split tst
idx 0
idx 1
idx 2
idx 3
idx 4
idx 5
idx 6
idx 7
idx 8
idx 9
idx 10
idx 11
idx 12
idx 13
split trn
idx 0
idx 1
idx 2
idx 3
idx 4
idx 5
split dev
idx 0
idx 1
idx 2
idx 3
idx 4
idx 5
idx 6
idx 7
idx 8
idx 9
split tst
idx 0
idx 1
idx 2
idx 3
idx 4
idx 5
idx 6
idx 7
idx 8
idx 9
idx 10
idx 11
idx 12
idx 13
split trn
idx 0
idx 1
idx 2
idx 3
idx 4
idx 5
split dev
idx 0
idx 1
idx 2
idx 3
idx 4
idx 5
idx 6
idx 7
idx 8
idx 9
split tst
idx 0
idx 1
idx 2


In [8]:
archive_trans[0]["split"]["tst"][6]["pred"]

{'people': [{'event': 'Prezident Emmanuel Macron zúčastnil slavnostního představení nové francouzské útočné jaderné ponorky Suffren třídy Barracuda ve městě Cherbourg.',
   'people': ['Emmanuel Macron(P456)']},
  {'event': 'Náčelník generálního štábu François Lecointre řekl, že je naprosto nezbytné, aby Francie byla přítomna v krizových oblastech a představovala hrozbu, pokud by to bylo nutné.',
   'people': ['François Lecointre(P1173)']}]}

# Extract Best Prompts

In [None]:
def extract_best_prompt(archive_dir, split, metric, candidate2prompt, out_file=None):
    """Select the archived candidate with the highest mean score and export its prompt.

    Args:
        archive_dir: Directory that contains ``archive.jsonl``.
        split: Name of the split whose samples are scored (e.g. ``"tst"``).
        metric: Key into each sample's ``eval`` dict; its ``"score"`` entry is averaged.
        candidate2prompt: Callable that turns a candidate record into the prompt text.
        out_file: Optional path the prompt is written to (UTF-8). Nothing is
            written when it is ``None``.

    Returns:
        The candidate record with the highest mean score; ties keep the earliest one.

    Raises:
        ValueError: If no candidate has a complete, non-empty set of scores for
            the requested ``split``/``metric``.
    """
    archive = read_jsonl(Path(archive_dir, "archive.jsonl"))
    print(f"# candidates: {len(archive)}")
    best_score = None
    best_candidate = None
    best_idx = -1
    for idx, candidate in enumerate(archive):
        if "split" not in candidate or split not in candidate["split"]:
            continue
        try:
            scores = [sample["eval"][metric]["score"] for sample in candidate["split"][split]]
        except (KeyError, TypeError, IndexError):
            # Only missing/malformed evaluation data is tolerated; anything else
            # (including KeyboardInterrupt) must propagate.
            print("skipping incomplete candidate")
            continue
        if not scores:
            # An empty split would yield a NaN mean (plus a RuntimeWarning).
            print("skipping incomplete candidate")
            continue
        mean_score = np.mean(scores)
        if np.isnan(mean_score):
            # NaN never compares greater, so it must not become the baseline either.
            continue
        # Compare against the first valid candidate instead of a fixed 0 so that
        # archives whose scores are all zero (or negative) still yield a result.
        if best_candidate is None or mean_score > best_score:
            best_score = mean_score
            best_candidate = candidate
            best_idx = idx

    if best_candidate is None:
        raise ValueError(
            f"no candidate in {archive_dir} has complete '{metric}' scores for split '{split}'"
        )

    print(f"best score: {best_score} for candidate idx: {best_idx}")
    prompt = candidate2prompt(best_candidate)
    print(prompt)
    if out_file is not None:
        # Explicit encoding: prompts may contain non-ASCII text.
        Path(out_file).write_text(prompt, encoding="utf-8")
    return best_candidate


# ret = extract_best_prompt(archive_dir="EXP/eventsV3people-V1d/seed_4359670", split="tst", metric="oa", candidate2prompt=candidate2prompt_dseek)
# ret = extract_best_prompt(archive_dir="EXP/eventsV3orgs-V1d/seed_9554819", split="tst", metric="oa", candidate2prompt=candidate2prompt_dseek)

In [17]:
# Export the prompt of the candidate with the best mean `metric` score on the tst split
# for the peopleV2-V7 run selected by `seed`.
task = "peopleV2-V7"
# seed = 932203
seed = 814566
metric = "mbj"
ret = extract_best_prompt(archive_dir=f"EXP/{task}/seed_{seed}", split="tst", metric=metric, 
                          candidate2prompt=candidate2prompt_dseek, out_file=f"data/pipeline/prompts/prompt_{task}-{metric}.md")

# candidates: 100
best score: 0.947142857142857 for candidate idx: 39
To transform a query into the corresponding answer, follow these steps:

1. **Identify Named Entities**: Extract all names of individuals mentioned in the query. This includes people but excludes organizations unless they are personified (e.g., "WHO" as an organization is not included unless it refers to a specific person).

2. **Determine Roles**: For each identified name, determine their roles based on the context provided in the query. Roles are typically indicated by titles, positions, or actions associated with the individual. Be specific:
   - Include any additional context that provides more detail about the role, such as the type of school or organization.
   - If a person's role is associated with an organization, include this affiliation in the role description.
   - Ensure roles are detailed and specific, avoiding unnecessary redundancy.

3. **Construct JSON Objects**: Create a JSON object for each name wi

In [11]:
# Same export for the peopleV2-V7 run with seed 932203; the seed is part of the
# output file name, so this goes to a seed-specific file.
metric = "mbj"
ret = extract_best_prompt(archive_dir="EXP/peopleV2-V7/seed_932203", split="tst", metric=metric, 
                          candidate2prompt=candidate2prompt_dseek, out_file=f"data/pipeline/prompts/prompt_peopleV2-V7_seed_932203-{metric}.md")

# candidates: 95
best score: 0.9321428571428572 for candidate idx: 11
<final-instructions>
1. **Identify Individuals**: Extract all names of people mentioned in the query, ensuring cultural sensitivity. This includes handling naming conventions like adding "ová" for female names in Czech or other linguistic specificities.

2. **Determine Roles**: For each identified individual, determine their roles or positions based on the context. Roles should be specific and include affiliations or organizations when mentioned or implied in the query.

3. **Construct Objects**: For each person, create an object with "name" and "roles" fields. The "name" is the extracted individual, and "roles" is an array of strings describing their roles in the language of the query.

4. **Handle No Individuals**: If no individuals are mentioned, return an empty array.

5. **Multiple Roles**: If a person has multiple roles, list each role as a separate string in the "roles" array.

6. **Case Sensitivity**: Ensure 

In [22]:
# Export the best tst-split prompt of the locs-V7 run selected by `seed`, ranked by `metric`.
task = "locs-V7"
# seed = 7542400
seed = 8982160
metric = "oa"
# metric = "mbj"
ret = extract_best_prompt(archive_dir=f"EXP/{task}/seed_{seed}", split="tst", metric=metric, 
                          candidate2prompt=candidate2prompt_dseek, out_file=f"data/pipeline/prompts/prompt_{task}-{metric}.md")

# candidates: 60
best score: 0.7906523250257587 for candidate idx: 52
To transform a query into an answer, follow these steps:

1. **Identify Geopolitical Entities (GPEs):** Extract all proper nouns from the query that represent countries, cities, regions, or other geopolitical entities. If a GPE is implied by the context but not explicitly mentioned, include it in the output.

2. **Categorize Entities:** Classify each extracted entity into the appropriate type from the given enum list (e.g., "gpe", "region"). Cities are considered GPEs. Use "region" for larger areas like provinces or states, and "loc" for specific places such as areas within a city or historical sites.

3. **Include Abbreviations:** If an entity has a commonly recognized abbreviation within the query's language (e.g., "ČR" for "Česká republika"), include it within the same object as the full name. Avoid including international codes unless they are widely recognized and relevant to the context.

4. **Structure the Out

In [28]:
# Export the best tst-split prompt of the orgs-V7 run selected by `seed`, ranked by `metric`.
task = "orgs-V7"
# seed = 2963657
seed = 504127
# metric = "oa"
metric = "mbj"
ret = extract_best_prompt(archive_dir=f"EXP/{task}/seed_{seed}", split="tst", metric=metric, 
                          candidate2prompt=candidate2prompt_dseek, out_file=f"data/pipeline/prompts/prompt_{task}-{metric}.md")

# candidates: 9
best score: 0.7457142857142858 for candidate idx: 4
To transform a query into the corresponding answer, follow these steps:

1. **Extract Entities**: Identify all entities (organizations, parties, institutions) from the query text.
2. **Identify Abbreviations**: Determine if each entity has an associated abbreviation.
3. **Classify Type**: Categorize each entity into one of the predefined types (e.g., gov, political).
4. **Structure JSON**: Format each entity as a JSON object with name, abbreviation, and type.

For example:
- Extract "ANO" as a political party with abbreviation "ANO".
- Structure it as `{"name": "", "abbreviation": "ANO", "type": "political"}`.

This method systematically converts any given query into the required JSON format by identifying and categorizing each relevant entity within the text.


In [29]:
# Export the best tst-split prompt for the events task.
# `task_full` names the experiment directory; `task` is the shorter name used in the output file.
task_full = "eventsV4events-V1d"
task = "eventsV4events"
seed = 1174239
metric = "mbj"
ret = extract_best_prompt(archive_dir=f"EXP/{task_full}/seed_{seed}", split="tst", metric=metric, 
                          candidate2prompt=candidate2prompt_dseek, out_file=f"data/pipeline/prompts/prompt_{task}-{metric}.md")

# candidates: 85
skipping incomplete candidate
best score: 0.8057142857142857 for candidate idx: 46
<final-instructions>
To transform a query into a structured answer, follow these organized steps:

1. **Extract Key Information**: Identify the main events and supporting details from the query. Focus on significant happenings and relevant entities (people, locations, organizations).

2. **Structure Events**:
   - **Main Events**: Create concise statements that capture the primary action or occurrence. Avoid including unnecessary context or explanations.
   - **Subevents**: Use subevents to provide critical additional details or direct consequences of the main event. Include specific numbers, percentages, or significant statements here.

3. **Include Entities**: Incorporate extracted people, locations, and organizations into events and subevents to add context without overcomplicating.

4. **Organize Hierarchy**: Ensure each main event is clear and focused. Use subevents to elaborate wit

In [11]:
# Export the best tst-split prompt for the event-to-people linking task.
# `task_full` names the experiment directory; `task` is the shorter name used in the output file.
task_full = "eventsV3people-V1d"
task = "eventsV3people"
seed = 4359670
# seed = 7490847
# seed = 9593683
metric = "oa"
ret = extract_best_prompt(archive_dir=f"EXP/{task_full}/seed_{seed}", split="tst", metric=metric, 
                          candidate2prompt=candidate2prompt_dseek, out_file=f"data/pipeline/prompts/prompt_{task}-{metric}.md")

# candidates: 100
best score: 0.9821428571428571 for candidate idx: 11
To transform a query into an answer, follow these steps:

1. **Extract Events**: Identify each event listed under `<events>` in the query. Each event has an `id` and a `text` describing the event.

2. **Identify People**: For each event, determine which people from the `<people>` section are relevant. This includes:
   - Any person explicitly mentioned in the event's `text`.
   - Any person whose role or actions are implied by the event's `text`.
   - All individuals involved in collective decisions or actions, such as EU officials proposing sanctions.

3. **Map People to Events**: Create a JSON object where each entry in the `people` array corresponds to an event by its `id`. Include the `person id`s of those involved in each event, ensuring that all relevant individuals are linked, even if multiple people are part of the same decision-making process.

4. **Handle No People**: If an event doesn't involve specific p

In [12]:
# Show the query text of the first tst sample of the candidate returned above.
print(ret["split"]["tst"][0]["query"])

<date>Čt, 06 dub 2017 18:47:00</date>
<text>Před 100 lety vstoupily USA do první světové války. Ztratily 115 tisíc mužů, konflikt ale zemi posílil

OBRAZEM. Do první světové války se před 100 lety, 6. dubna 1917, zapojily také Spojené státy americké. Bezprostředním důvodem bylo potopení pěti amerických obchodních lodí německými ponorkami v březnu téhož roku. Veřejné mínění k podpoře vstupu do války změnilo již potopení britské lodi Lusitania v květnu 1915, na níž zahynulo přes 1200 lidí z toho 128 Američanů. USA v první světové válce přišly o 116 516 mužů. Většina jich je pohřbena ve Francii.

První světová válka (1914 až 1918) zasáhla téměř celý svět. Bojovalo se na třech světadílech, na všech světových mořích a konflikt, v němž zahynulo téměř deset milionů vojáků a 8,5 milionu civilistů, postihl 38 států. Válka zásadně změnila politické uspořádání Evropy. Rozpadly se staré monarchie a vznikla řada nových států (mimo jiné Československo), válka přinesla nástup bolševiků v Rusku, do po

In [9]:
# Export the best tst-split prompt for the event-to-organization linking task.
# `task_full` names the experiment directory; `task` is the shorter name used in the output file.
task_full = "eventsV3orgs-V1d"
task = "eventsV3orgs"
# seed = 9097730
seed = 9554819
metric = "oa"
ret = extract_best_prompt(archive_dir=f"EXP/{task_full}/seed_{seed}", split="tst", metric=metric, 
                          candidate2prompt=candidate2prompt_dseek, out_file=f"data/pipeline/prompts/prompt_{task}-{metric}.md")

# candidates: 100
best score: 0.9603174603174603 for candidate idx: 15
To transform a query into the corresponding answer, follow these steps:

### Step 1: Extract Organizations
- Parse the `<organizations>` section to identify all organizations mentioned. Each organization has an `id`, `name`, `abbreviation`, and `type`. These will be used to map events to their respective organizations.

### Step 2: Analyze Each Event
- For each `<event>` in the `<events>` section, read its `text` to determine which organizations are directly involved in that specific event. An organization should only be linked to an event if it is explicitly mentioned in the event's context or action.

### Step 3: Map Events to Organizations
- For each event, create an entry in the `orgs` array within the answer. The entry includes the `event_id` and a list of `org` IDs that are directly relevant to that event. If no organizations are involved in an event, leave the `orgs` array empty.

### Example Mapping:
- **Eve

In [12]:
# Export the best tst-split prompt for the event temporal-information task.
# Note that the experiment directory (`task_full`) and the output name (`task`) differ entirely here.
task_full = "event_temp_val_hclimb_V1"
task = "eventsV4temp"
# seed = 2331996
# seed = 290135
# seed = 3236339
seed = 6389609
# seed = 9412114
metric = "oa"
ret = extract_best_prompt(archive_dir=f"EXP/{task_full}/seed_{seed}", split="tst", metric=metric, 
                          candidate2prompt=candidate2prompt_dseek, out_file=f"data/pipeline/prompts/prompt_{task}-{metric}.md")

# candidates: 100
best score: 0.8538095238095238 for candidate idx: 92
<instructions>
To transform a query into an answer, follow these steps:

1. **Extract Events and Subevents**: Parse each `<event>` and its nested `<subevents>` from the query. Each event has an `id` and `text`, while subevents have `subid` and `text`.

2. **Determine Time Information**:
   - **time_reported**: Use the date from the query's `<date>` tag in `YYYY-MM-DD` format.
   - **time_start** and **time_end** for events:
     - If specific dates are mentioned (e.g., "2018-10-24"), use them.
     - If only a year is mentioned, set both `time_start` and `time_end` to that year (e.g., "2018").
     - If only a month is mentioned, set both `time_start` and `time_end` to that month (e.g., "2018-10").
     - If no specific date is mentioned, set both `time_start` and `time_end` to "NA".
     - If an event describes a record starting from a specific date (e.g., "since 2006"), set `time_start` to that date and `time_end`

In [13]:
# Show the query text of the first tst sample of the candidate returned above.
print(ret["split"]["tst"][0]["query"])

<date>Čtvrtek, 06 dubna 2017 (2017-04-06) 18:47:00</date>
<text>Před 100 lety vstoupily USA do první světové války. Ztratily 115 tisíc mužů, konflikt ale zemi posílil

OBRAZEM. Do první světové války se před 100 lety, 6. dubna 1917, zapojily také Spojené státy americké. Bezprostředním důvodem bylo potopení pěti amerických obchodních lodí německými ponorkami v březnu téhož roku. Veřejné mínění k podpoře vstupu do války změnilo již potopení britské lodi Lusitania v květnu 1915, na níž zahynulo přes 1200 lidí z toho 128 Američanů. USA v první světové válce přišly o 116 516 mužů. Většina jich je pohřbena ve Francii.

První světová válka (1914 až 1918) zasáhla téměř celý svět. Bojovalo se na třech světadílech, na všech světových mořích a konflikt, v němž zahynulo téměř deset milionů vojáků a 8,5 milionu civilistů, postihl 38 států. Válka zásadně změnila politické uspořádání Evropy. Rozpadly se staré monarchie a vznikla řada nových států (mimo jiné Československo), válka přinesla nástup bolš

In [15]:
# Export the best tst-split prompt for the event-to-location linking task.
# `task_full` names the experiment directory; `task` is the shorter name used in the output file.
task_full = "eventsV3locations-V1d"
task = "eventsV3locs"
# seed = 1952463
# seed = 7070210
seed = 7602201
metric = "oa"
ret = extract_best_prompt(archive_dir=f"EXP/{task_full}/seed_{seed}", split="tst", metric=metric, 
                          candidate2prompt=candidate2prompt_dseek, out_file=f"data/pipeline/prompts/prompt_{task}-{metric}.md")

# candidates: 100
best score: 0.9672619047619045 for candidate idx: 33
{
  "type": "object",
  "properties": {
    "locations": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "event_id": {
            "type": "integer"
          },
          "locations": {
            "type": "array",
            "items": {
              "type": "integer"
            }
          }
        }
      }
    }
  },
  "required": [
    "locations"
  ]
}

To transform a query into an answer, follow these steps:

1. **Extract Location IDs**: Identify all unique location IDs from the `<locations>` section of the query. These IDs will be used to map events to their respective locations.

2. **Analyze Each Event**: For each `<event>` in the query, determine which locations are relevant by checking if the event text explicitly mentions a location or clearly implies it through context. Include both the location where the event takes place and the home country of

In [16]:
# Show the query text of the first tst sample of the candidate returned above.
print(ret["split"]["tst"][0]["query"])

<date>Čt, 06 dub 2017 18:47:00</date>
<text>Před 100 lety vstoupily USA do první světové války. Ztratily 115 tisíc mužů, konflikt ale zemi posílil

OBRAZEM. Do první světové války se před 100 lety, 6. dubna 1917, zapojily také Spojené státy americké. Bezprostředním důvodem bylo potopení pěti amerických obchodních lodí německými ponorkami v březnu téhož roku. Veřejné mínění k podpoře vstupu do války změnilo již potopení britské lodi Lusitania v květnu 1915, na níž zahynulo přes 1200 lidí z toho 128 Američanů. USA v první světové válce přišly o 116 516 mužů. Většina jich je pohřbena ve Francii.

První světová válka (1914 až 1918) zasáhla téměř celý svět. Bojovalo se na třech světadílech, na všech světových mořích a konflikt, v němž zahynulo téměř deset milionů vojáků a 8,5 milionu civilistů, postihl 38 států. Válka zásadně změnila politické uspořádání Evropy. Rozpadly se staré monarchie a vznikla řada nových států (mimo jiné Československo), válka přinesla nástup bolševiků v Rusku, do po

In [20]:
# Export the best tst-split prompt for the event attribution task.
# `task_full` names the experiment directory; `task` is the shorter name used in the output file.
task_full = "eventsV3attributions-V1d"
task = "eventsV3attributions"
seed = 1315744
# seed = 3492160
# seed = 7218922
metric = "oa"
ret = extract_best_prompt(archive_dir=f"EXP/{task_full}/seed_{seed}", split="tst", metric=metric, 
                          candidate2prompt=candidate2prompt_dseek, out_file=f"data/pipeline/prompts/prompt_{task}-{metric}.md")

# candidates: 100
best score: 0.8306547619047617 for candidate idx: 85
<instructions>
To transform a query into the corresponding answer, follow these steps:

1. **Extract Organizations and People**: Identify all organizations and people mentioned in the query's `<organizations>` and `<people>` sections. Note their respective IDs.

2. **Analyze Each Event**: For each event described in the query's `<events>` section, determine which organizations or people are explicitly mentioned as sources or reporters of the information. Consider the entire text of the query when identifying sources, including mentions outside the event descriptions but within the query.

3. **Map Attributions**: Link each event to the relevant organization or person IDs based on their role as sources or reporters. Include IDs of entities explicitly cited in the query as providers of information. If a source is mentioned anywhere in the query text (not just within the event description), it should be considered for 

In [21]:
# Show the query text of the first tst sample of the candidate returned above.
print(ret["split"]["tst"][0]["query"])

<date>Čt, 06 dub 2017 18:47:00</date>
<text>Před 100 lety vstoupily USA do první světové války. Ztratily 115 tisíc mužů, konflikt ale zemi posílil

OBRAZEM. Do první světové války se před 100 lety, 6. dubna 1917, zapojily také Spojené státy americké. Bezprostředním důvodem bylo potopení pěti amerických obchodních lodí německými ponorkami v březnu téhož roku. Veřejné mínění k podpoře vstupu do války změnilo již potopení britské lodi Lusitania v květnu 1915, na níž zahynulo přes 1200 lidí z toho 128 Američanů. USA v první světové válce přišly o 116 516 mužů. Většina jich je pohřbena ve Francii.

První světová válka (1914 až 1918) zasáhla téměř celý svět. Bojovalo se na třech světadílech, na všech světových mořích a konflikt, v němž zahynulo téměř deset milionů vojáků a 8,5 milionu civilistů, postihl 38 států. Válka zásadně změnila politické uspořádání Evropy. Rozpadly se staré monarchie a vznikla řada nových států (mimo jiné Československo), válka přinesla nástup bolševiků v Rusku, do po

In [3]:
import requests

# Define the server URL
server_url = "http://g02:8333/v1/models"

# Make a GET request to the /v1/models endpoint.
# requests has no default timeout, so without one an unreachable server would hang the cell.
response = requests.get(server_url, timeout=10)
# Fail loudly on HTTP errors instead of trying to decode an error page as JSON.
response.raise_for_status()
response.json()

{'object': 'list',
 'data': [{'id': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B',
   'object': 'model',
   'created': 1741288987,
   'owned_by': 'vllm',
   'root': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B',
   'parent': None,
   'max_model_len': 65536,
   'permission': [{'id': 'modelperm-ae6b18ae9d5b414aa9a4a7099232032e',
     'object': 'model_permission',
     'created': 1741288987,
     'allow_create_engine': False,
     'allow_sampling': True,
     'allow_logprobs': True,
     'allow_search_indices': False,
     'allow_view': True,
     'allow_fine_tuning': False,
     'organization': '*',
     'group': None,
     'is_blocking': False}]}]}

In [86]:
# Smoke test: ask a model served behind a local OpenAI-compatible endpoint for a
# JSON answer, passing a JSON schema in `response_format`.
# openai.api_base = "http://localhost:8881/v1"

from openai import OpenAI

# Initialize the OpenAI client
# NOTE(review): no api_key is passed; presumably it comes from the environment — confirm.
client = OpenAI(base_url="http://localhost:8881/v1")

# Schema of the expected answer: three required string fields, no extras.
schema = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "surname": {"type": "string"},
        "birth_date": {"type": "string"},
    },
    "required": [
            "name",
            "surname",
            "birth_date"
        ],
    "additionalProperties": False
}

# NOTE(review): "schema"/"strict" inside a "json_object" response_format looks like a
# server-specific extension rather than the standard OpenAI request shape — confirm
# against the serving backend.
response = client.chat.completions.create(
    model="dseek-lamma8B",
    messages=[{"role": "user", "content": "Define Churchill as JSON."}],
    max_tokens=500,
    response_format={
        "type": "json_object",
        # "json_object": {"name": "bio", "schema": json.dumps(schema), "strict": True},
        "schema": schema,
        "strict": True
    },
)


# Print the generated text
print(response.choices[0].message.content)

{
  "name": "Winston Churchill",
  "surname": "Churchill",
  "birth_date": "November 25, 1874 (Winston Churchill's birth date is November 25, 1874, and he was born in Oxford, England, United Kingdom.)"
}



In [87]:
?write_json

[0;31mSignature:[0m
[0mwrite_json[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mfname[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mpathlib[0m[0;34m.[0m[0mPath[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdata[0m[0;34m:[0m [0mAny[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mindent[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m3[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmkdir[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0;32mNone[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Writes JSON file as UTF8.

Args:
    fname (Union[str, Path]): JSON file path
    data (Any): Data to write
    indent (int, optional): Indent characters. Defaults to 3.
    mkdir (bool, optional): Create parent directory if not exists. Defaults to False.
[0;31mFile:[0m      ~/devel/python/FC/VENV/vllm/lib/python3.11/site-packages/aic_nlp_utils/json.py
[0;31mType:[0m      funct

In [71]:
# Check that this string parses as JSON. It wraps the person records in schema-like
# "type"/"items" keys (presumably a raw model answer — confirm).
s = '{\n  "type": "array",\n  "items": [\n    {\n      "name": "Ivan Lokajová",\n      "roles": ["infiltrátor", "podnikatelka v oblasti spirituální businessu"]\n    },\n    {\n      "name": "Dr. Soni Hájková",\n      "roles": ["holistická léčitelka", "zakladatelka přírodního centra"]\n    }\n  ]\n}'
print(s)

json.loads(s)

{
  "type": "array",
  "items": [
    {
      "name": "Ivan Lokajová",
      "roles": ["infiltrátor", "podnikatelka v oblasti spirituální businessu"]
    },
    {
      "name": "Dr. Soni Hájková",
      "roles": ["holistická léčitelka", "zakladatelka přírodního centra"]
    }
  ]
}


{'type': 'array',
 'items': [{'name': 'Ivan Lokajová',
   'roles': ['infiltrátor', 'podnikatelka v oblasti spirituální businessu']},
  {'name': 'Dr. Soni Hájková',
   'roles': ['holistická léčitelka', 'zakladatelka přírodního centra']}]}

# Validation

In [9]:
# Validate the gold answer of the first labeled record against the attribution schema.
data = read_jsonl("data/labeled_datasets/events_V3_attributions.jsonl")
# NOTE(review): "score" and "order" are not standard JSON Schema keywords; presumably
# they are annotations for the project's scoring metric — confirm.
schema = {
    "type": "object",
    "properties": {
        "attributions": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "event_id": {"type": "integer", "score": "exact"},
                    "attribution": {
                        "type": "array",
                        "items": {"type": "integer", "score": "exact"},
                        "order": "align",
                    },
                },
                "required": ["event_id", "attribution"],
            },
            "order": "align",
        },
    },
    "required": ["attributions"],
}

# Only the first record is checked here; validate() raises on a schema violation.
validate(data[0]["answer"], schema)

# Object Aligner

In [None]:

from prompt_opt.metrics.object_aligner import ObjectAligner

# Gold annotation: three events, none of them has an attribution.
gold = {
    "attributions": [
        {"event_id": 1, "attribution": []},
        {"event_id": 2, "attribution": []},
        {"event_id": 3, "attribution": []},
    ]
}

# Prediction: events 1 and 2 carry an extra attribution (1817); event 3 equals the gold.
pred = {
    "attributions": [
        {"event_id": 1, "attribution": [1817]},
        {"event_id": 2, "attribution": [1817]},
        {"event_id": 3, "attribution": []},
    ]
}

# Metric schema: the attribution schema extended with "keyImportance": 0.0 on both object levels.
oa_metric_schema = {
    "type": "object",
    "properties": {
        "attributions": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "event_id": {"type": "integer", "score": "exact"},
                    "attribution": {
                        "type": "array",
                        "items": {"type": "integer", "score": "exact"},
                        "order": "align",
                    },
                },
                "keyImportance": 0.0,
                "required": ["event_id", "attribution"],
            },
            "order": "align",
        },
    },
    "keyImportance": 0.0,
    "required": ["attributions"],
}

# Score the prediction against the gold and show the score followed by the explanation.
oa = ObjectAligner("oa", oa_metric_schema)
ret = oa.metric(gold, pred)
print(ret["score"], ret["reasoning"], sep="\n")

0.6666666666666666
The predicted output scores overall 67%, let us align the predicted output to the gold and analyze the differences:
  KEY = The predicted key "attributions" exactly matches the gold.
  VALUE = The predicted list scores 67%:
      KEY = The predicted key "attribution" exactly matches the gold.
      VALUE = The predicted list scores 0%:
        The predicted list item "1817" is excessive, it was not in the gold.

      KEY = The predicted key "attribution" exactly matches the gold.
      VALUE = The predicted list scores 0%:
        The predicted list item "1817" is excessive, it was not in the gold.

      KEY = The predicted key "attribution" exactly matches the gold.
      VALUE = The predicted list perfectly matches the gold one:


# OLD

In [3]:
# Show the first candidate record of the archive.
archive[0]

{'messages': [{'role': 'user',
   'content': '# Instructions\nI will give you several examples of query and answer text pairs.\nThe answer uses JSON format with the following schema:\n\n{\n  "type": "array",\n  "items": {\n    "type": "object",\n    "properties": {\n      "event": {\n        "type": "string"\n      },\n      "people": {\n        "type": "array",\n        "items": {\n          "type": "integer"\n        }\n      },\n      "locations": {\n        "type": "array",\n        "items": {\n          "type": "integer"\n        }\n      },\n      "orgs": {\n        "type": "array",\n        "items": {\n          "type": "integer"\n        }\n      },\n      "attributions": {\n        "type": "array",\n        "items": {\n          "type": "integer"\n        }\n      },\n      "future": {\n        "type": "boolean"\n      }\n    },\n    "required": [\n      "event",\n      "people",\n      "locations",\n      "orgs",\n      "attributions"\n    ]\n  }\n}\n\nYour task is to give me

In [11]:
# Show the trn entries of the first candidate; here the split is read as a top-level
# key of the candidate rather than from under a "split" key.
archive[0]["trn"]

[{'think': '**Step 1: Identify the Key Elements of the Query**\n\nFrom the given query, we can extract the following key elements:\n\n* Date of the event: Út, 22 úno 2022 16:21:00\n* Text describing the event: The text describes the European Union imposing sanctions on Russian representatives and banks, with Hungary being cautious about supporting the sanctions.\n* People involved in the event:\n\t+ Ursula von der Leyenová (ID: 2374)\n\t+ Charles Michel (ID: 1784)\n\t+ Viktor Orbán (ID: 6301)\n\t+ Vladimir Putin (ID: 1600)\n\n**Step 2: Analyze the Text to Extract Key Events and People**\n\nUsing NLP techniques such as NER and POS tagging, we can identify the key events and people mentioned in the text:\n\n* Key events:\n\t+ European Union imposing sanctions on Russian representatives and banks\n\t+ Hungary being cautious about supporting the sanctions\n* People involved in the events:\n\t+ Ursula von der Leyenová (ID: 2374)\n\t+ Charles Michel (ID: 1784)\n\t+ Viktor Orbán (ID: 6301)\n\

In [None]:
# Load the candidate archive of one sir_phrase-V3 run.
# NOTE(review): absolute, user-specific path — this cell is not portable as written.
archive_jsonl = "/home/drchajan/devel/python/FC/prompt_opt/EXP/sir_phrase-V3/seed_5373459/archive.jsonl"
archive = read_jsonl(archive_jsonl)

In [3]:
# One line per candidate: id, parent id, and the number of entries stored per split
# (a missing split counts as 0).
for c in archive:
    trn_len, dev_len, tst_len = (len(c.get(split_name, [])) for split_name in ("trn", "dev", "tst"))
    print(f'{c["id"]} ({c.get("parent_id")}) trn={trn_len}, dev={dev_len}, tst={tst_len}')

1 (None) trn=4, dev=8, tst=8
2 (None) trn=4, dev=8, tst=8
3 (None) trn=4, dev=8, tst=8
4 (None) trn=4, dev=8, tst=8
5 (None) trn=4, dev=8, tst=8
6 (None) trn=4, dev=8, tst=8
7 (None) trn=4, dev=8, tst=8
8 (None) trn=4, dev=8, tst=8
9 (None) trn=4, dev=8, tst=8
10 (None) trn=4, dev=8, tst=8
11 (4) trn=4, dev=8, tst=8
12 (4) trn=4, dev=8, tst=8
13 (4) trn=4, dev=8, tst=8
14 (4) trn=4, dev=8, tst=8
15 (4) trn=4, dev=8, tst=8
16 (10) trn=4, dev=8, tst=8
17 (10) trn=4, dev=8, tst=8
18 (10) trn=4, dev=8, tst=8
19 (10) trn=4, dev=8, tst=8
20 (10) trn=4, dev=8, tst=8
21 (15) trn=4, dev=8, tst=8
22 (15) trn=4, dev=8, tst=8
23 (15) trn=4, dev=8, tst=8
24 (15) trn=4, dev=8, tst=8
25 (15) trn=4, dev=8, tst=8
26 (14) trn=4, dev=8, tst=8
27 (14) trn=4, dev=8, tst=8
28 (14) trn=4, dev=8, tst=8
29 (14) trn=4, dev=8, tst=8
30 (14) trn=4, dev=8, tst=8
31 (30) trn=4, dev=8, tst=8
32 (30) trn=4, dev=8, tst=8
33 (30) trn=4, dev=8, tst=8
34 (30) trn=4, dev=8, tst=8
35 (30) trn=4, dev=8, tst=8
36 (29) trn=4,

In [7]:
def candidate_info(archive, select_split, split, score_key):
    """Print a per-candidate report: scores, prompt, and every sample of `split`.

    Candidates are visited in the order returned by
    ``rank_candidates(archive, split=select_split, score_key=score_key)``.

    Args:
        archive: Sequence of candidate records; each must hold its samples
            directly under the split name (``candidate[split]``).
        select_split: Split used for ranking and for the first printed score.
        split: Split whose score and samples (gold, prediction, query) are printed.
        score_key: Key into each sample's ``eval`` dict whose ``"score"`` is shown.

    Returns:
        None. The report is written to stdout.
    """
    # NOTE(review): `candidate2prompt_md` and `pf` are not defined in this cell;
    # presumably they come from the notebook's `prompt_opt.utils` star import — confirm.
    rank_idxs = rank_candidates(archive, split=select_split, score_key=score_key)
    print("rank indices:", rank_idxs)
    for ri in rank_idxs:
        c = archive[ri]
        select_score = get_candidate_score(c, split=select_split, score_key=score_key)
        test_score = get_candidate_score(c, split=split, score_key=score_key)
        print(f"score: {select_score:.3f}/{test_score:.3f}")
        print()
        prompt = candidate2prompt_md(c)
        pf(prompt)
        print()
        for ex in c[split]:
            gold = ex["gold"]
            pred = ex["pred"]
            query = ex["query"]
            print(f' score: {ex["eval"][score_key]["score"]:.3f}')
            if isinstance(gold, str):
                # Plain-text answers fit on one line each.
                print(" GOLD:", gold)
                print(" PRED:", pred)
            else:
                # Structured answers are pretty-printed.
                print("GOLD:")
                pp(gold)
                print("PRED:")
                pp(pred)
            print()
            pf(query)
            print()
        print("="*120)
    
# candidate_info(archive[:11], select_split="tst", split="tst", score_key="oa-07")
# Report on the candidates from index 99 onward, ranked and displayed by their tst "oa" scores.
candidate_info(archive[99:], select_split="tst", split="tst", score_key="oa")

[32m2024-12-11 06:21:29.966[0m | [34m[1mDEBUG   [0m | [36mprompt_opt.optimizers.predict_evaluate[0m:[36mget_candidate_score[0m:[36m129[0m - [34m[1mcanidate id=100(89), dict_keys(['messages', 'parent_id', 'id', 'trn', 'dev', 'tst'])[0m
[32m2024-12-11 06:21:29.966[0m | [34m[1mDEBUG   [0m | [36mprompt_opt.optimizers.predict_evaluate[0m:[36mget_candidate_score[0m:[36m129[0m - [34m[1mcanidate id=100(89), dict_keys(['messages', 'parent_id', 'id', 'trn', 'dev', 'tst'])[0m
[32m2024-12-11 06:21:29.967[0m | [34m[1mDEBUG   [0m | [36mprompt_opt.optimizers.predict_evaluate[0m:[36mget_candidate_score[0m:[36m129[0m - [34m[1mcanidate id=100(89), dict_keys(['messages', 'parent_id', 'id', 'trn', 'dev', 'tst'])[0m


rank indices: [0]
score: 0.497/0.497

**Query-to-Answer Transformation Instructions**

**Step 1: Identify Main Events**

* Read the query text and identify the most important events or actions.
* Use natural language processing (NLP) techniques to extract relevant information from the text.
* Prioritize events based on their relevance to the main topic.
* Include specific details such as dates, percentages, and statistics where relevant.
* Focus on the main event or action, rather than including additional details that are not relevant to the event.

**Step 2: Extract Relevant Details**

* Use NLP techniques to extract relevant details from the text, such as names of people involved in the events.
* Identify key entities involved in the events and include them in the event descriptions.
* Prioritize extracted details based on their relevance to the main events or actions.

**Step 3: Construct Event Descriptions**

* Use the extracted information from Steps 1 and 2 to construct a brief 

In [5]:
# Pick the second archive entry (index 1) for closer inspection.
idx = 1
candidate = archive[idx]

In [6]:
# Display the selected candidate record.
candidate

{'messages': [{'role': 'system',
   'content': 'You are an AI assistant that uses a Chain of Thought (CoT) approach with reflection to respond to user input. Follow these steps:\n\n1. Think through the problem step by step; use the "Thinking" section to mark this stage.\n2. Reflect on your thinking to check for any errors or improvements using the "Reflection" section.\n3. Make any necessary adjustments based on your reflection.\n4. Provide your final, concise response in the "Response" section.\n\nImportant: The "Thinking" and "Reflection" sections are only for your internal reasoning process. \nDo not include any part of the final response in these sections. \nThe actual response to the user input must be entirely contained within the "Response" section.\n\nIt is absolutely CRITICAL that all your (the assistant\'s) outputs use Markdown containing exactly three consequent sections "Thinking", "Reflection", and "Response" as follows:\n\n# Thinking\n<Your step-by-step reasoning goes her

In [21]:
def get_candidate_score(candidate, split, score_key):
    """Return the mean `score_key` score over the samples in `candidate[split]`.

    Each sample is expected to provide ``sample["eval"][score_key]["score"]``.
    """
    per_sample = []
    for sample in candidate[split]:
        evaluation = sample["eval"][score_key]
        per_sample.append(evaluation["score"])
    return np.mean(per_sample)

def rank_candidates(candidates, split, score_key):
    """Return candidate indices ordered from highest to lowest mean score.

    The ordering is a stable ascending argsort that is then reversed, so among
    candidates with equal scores the one with the larger index comes first.
    """
    mean_scores = []
    for cand in candidates:
        mean_scores.append(get_candidate_score(cand, split, score_key))
    ascending = np.argsort(mean_scores, kind="stable")
    return ascending[::-1]
    
# Rank all archived candidates by their mean 'oa-07' score on the tst split.
rank_candidates(archive, "tst", 'oa-07')

array([1, 0])

In [19]:
?np.argsort

[0;31mSignature:[0m       [0mnp[0m[0;34m.[0m[0margsort[0m[0;34m([0m[0ma[0m[0;34m,[0m [0maxis[0m[0;34m=[0m[0;34m-[0m[0;36m1[0m[0;34m,[0m [0mkind[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0morder[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mCall signature:[0m  [0mnp[0m[0;34m.[0m[0margsort[0m[0;34m([0m[0;34m*[0m[0margs[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mType:[0m            _ArrayFunctionDispatcher
[0;31mString form:[0m     <function argsort at 0x7fb6640d3920>
[0;31mFile:[0m            ~/devel/python/FC/VENV/vllm/lib/python3.11/site-packages/numpy/core/fromnumeric.py
[0;31mDocstring:[0m      
Returns the indices that would sort an array.

Perform an indirect sort along the given axis using the algorithm specified
by the `kind` keyword. It returns an array of indices of the same shape as
`a` that index data along the given axis in sorted order.

Parameters
----