In [159]:
import os
import re
import json
from tqdm import tqdm
import pandas as pd
import numpy as np
import openai
import random


os.environ["OPENAI_KEY"] = ""

## Prompts

In [2]:
FEW_SHOT_EXAMPLES_ENTS = """
                  EXAMPLE#1
                   QUESTION: What does DiCaprio's full name sound like?
                   CANDIDATE ENTITIES: 
                   Q116673393: Sibon irmelindicaprioae -- species of snake
                    Q38111: Leonardo DiCaprio -- American actor and film producer (born 1974)
                    Q11461: sound -- vibration that propagates as an acoustic wave
                    Q36860035: DiCaprio -- family name
                    Q25349951: Martin Scorsese and Leonardo DiCaprio -- collaborations
                    Q56653813: DiCaprio 2 -- album by J.I.D
                   RELEVANT ENTITIES: Q38111.
                   
                   
                   EXAMPLE#2
                   QUESTION: In which city near Moscow is the new Jerusalem monastery located?
                    CANDIDATE ENTITIES: 
                    Q55502: Kingdom of Jerusalem -- medieval Christian kingdom in the Middle East
                    Q773979: New Jerusalem Monastery -- monastery in Moscow Oblast, Russia
                    Q1218: Jerusalem -- city in the Middle East, holy to the three Abrahamic religions
                    Q649: Moscow -- capital and most populous city of Russia
                    Q515: city -- large human settlement
                    Q13164: Moscow State University -- university in Moscow, Russia
                    Q6760: UTC+03:00 -- identifier for a time offset from UTC of +3
                    Q10540001: Jerusalem -- family name                    
                    RELEVANT ENTITIES: Q773979, Q649.

                    
                    EXAMPLE#3
                    QUESTION: What capital stands on the banks of Potalaka?
                    CANDIDATE ENTITIES: 
                    Q11626848: Mount Potalaka -- the mythical dwelling of the Buddhist bodhisattva Avalokiteśvara, said to exist in India
                    Q22687: bank -- financial institution that accepts deposits
                    Q60756888: Potalaka Guanyin -- sculpture by unknown artist (1965.556)
                    Q179444: Potomac River -- river in the mid-Atlantic United States
                    Q193893: capital -- upper part of a column (architecture)
                    Q60: New York City -- most populous city in the United States
                    Q5119: capital city -- primary governing city of a top-level (country) or first-level and second-level subdivision (country, state, province, regency, etc) political entity
                    **Reasoning:**  
                    - "Capital" refers to a **governing city**, not architectural elements (Q193893).  
                    - "Potalaka" (Q11626848) is a **mythical** location, so no real-world capital is directly linked.  
                    - Since no entity directly matches the question, **the most general applicable concept** is "capital city" (Q5119).
                    RELEVANT ENTITIES: Q179444, Q5119.
                    
                    
                    EXAMPLE#4 
                    QUESTION: What is the official language of Brazil?  
                    CANDIDATE ENTITIES:  
                    Q750553: Spanish language -- Romance language originating in Spain  
                    Q5146: Portuguese language -- Romance language, official in Portugal and Brazil  
                    Q155: Brazil -- country in South America  
                    Q483110: Brasília -- capital city of Brazil
                    **Reasoning**:  
                    - "Official language" refers to the main language used for governance.  
                    - The description of Q5146 explicitly states it is official in Brazil.  
                    - Q750553 (Spanish) is incorrect because it is **not** the official language.  
                    - Q155 and Q483110 are related but not **languages**. 
                    RELEVANT ENTITIES: Q5146.  
"""

SYSTEM_PROMPT_ENTS = f"""You are an experienced WikiData user. 
                   You receive a QUESTION and a SET of extracted Wikidata CANDIDATE ENTITIES, retrieved from an index based on similarity between the QUESTION and the ENTITY. 
                   Each entity has a wikidata ID, label and description in a form of Wikidata ID: label -- description. 
                   Your task is to select only the most semantically relevant entities based on their labels and descriptions.
                   
                   Instruction:
                    -- Ignore entities that are loosely related, homonyms, or have indirect contextual links to the QUESTION.
                    -- If no entity perfectly matches the question, select the best available entity based on contextual closeness.
                    -- If multiple entities seem relevant, prefer those with a stronger direct connection to the query subject.
                    -- If there are ambiguous entities that relate to the different countries, ALWAYS choose the entity that is related to Russia, based on description.
                    -- Some questions may have multiple correct entities. Return all entities that are directly relevant. Do NOT include entities that are only indirectly related (e.g., broad categories like 'city' instead of a specific location).
                    -- DO NOT reject entity, if it fits to the entity mentioned in query, but DOES NOT DIRECTLY answer the question.
                    -- Always include reasoning in the beginning of the responce.
                    -- Always place "RELEVANT ENTITIES: ..." at the end of your response.

                   Here are examples:
                   {FEW_SHOT_EXAMPLES_ENTS}
                   """

USER_PROMPT_ENTS = """QUESTION: {question}\nCANDIDATE ENTITIES:\n{candidate_list_str}"""

In [3]:
FEW_SHOT_EXAMPLES_PROP = """
                    EXAMPLE#1
                    QUESTION: How much does Nurmagomedov weigh?
                    CANDIDATE PROPERTIES:
                    P166: award received -- award or recognition received by a person, organization or creative work
                    P54: member of sports team -- sports teams or clubs that the subject represents or represented
                    P1082: population -- number of people inhabiting the place; number of people of subject
                    P1351: number of points/goals/set scored -- goals / points scored in a match or an event used as qualifier to the participant. Use P1358 for league points.
                    P1350: number of matches played/races/starts -- matches or games a player or a team played during an event. Also a total number of matches a player officially appeared in during the whole career.
                    P2121: prize money -- amount in a specific currency
                    P585: point in time -- date something took place, existed or a statement was true; for providing time use the "refine date" property (P4241)
                    P2067: mass -- mass (in colloquial usage also known as weight) of the item
                    P1412: languages spoken, written or signed -- language(s) that a person or a people speaks, writes or signs, including the native language(s)
                    P2046: area -- area occupied by an object
                    RELEVANT PROPERTIES: P2067


                    EXAMPLE#2
                    QUESTION: Where is the Academy of Sciences of Armenia located
                    CANDIDATE PROPERTIES:
                    P463: member of -- organization, club or musical group to which the subject belongs. Do not use for membership in ethnic or social groups, nor for holding a political position, such as a member of parliament (use P39 for that)
                    P31: instance of -- that class of which this subject is a particular example and member; different from P279 (subclass of); for example: K2 is an instance of mountain; volcano is a subclass of mountain (and an instance of volcanic landform)
                    P19: place of birth -- most specific known birth location of a person, animal or fictional character
                    P580: start time -- time an entity begins to exist or a statement starts being valid
                    P582: end time -- moment when an entity ceases to exist or a statement stops being valid
                    P69: educated at -- educational institution attended by subject
                    P159: headquarters location -- city or town where an organization's headquarters is or has been situated. Use P276 qualifier for specific building
                    P585: point in time -- date something took place, existed or a statement was true; for providing time use the "refine date" property (P4241)
                    RELEVANT PROPERTIES: P159


                    EXAMPLE#3
                    QUESTION: What is the singing voice of Dmitri Hvorostovsky?
                    CANDIDATE PROPERTIES:
                    P412: voice type -- person's voice type. expected values: soprano, mezzo-soprano, contralto, countertenor, tenor, baritone, bass (and derivatives)
                    P725: voice actor -- performer of a spoken role in a creative work such as animation, video game, radio drama, or dubbing over [use "character role" (P453) as qualifier] [use "cast member" (P161) for live acting]
                    P175: performer -- actor, musician, band or other performer associated with this role or musical work
                    P453: character role -- specific role played or filled by subject -- use only as qualifier of "cast member" (P161), "voice actor" (P725)
                    P179: part of the series -- series which contains the subject
                    P674: characters -- characters which appear in this item (like plays, operas, operettas, books, comics, films, TV series, video games)
                    P1441: present in work -- this (fictional or fictionalized) entity, place, or person appears in that work as part of the narration (use P2860 for works citing other works, P361/P1433 for works being part of other works, P1343 for entities described in non-fictional accounts)
                    **Reasoning:** 
                    - the question asks about voice's characteristics of some person
                    - P412 is relevant, because it's a property described a voice type, which is voice's characteristic
                    - P725, P175, P674 represent properties described a person role in something, not related to voice
                    - other properties also represent something not similar to voice 
                    RELEVANT PROPERTIES: P412
                   
                   
                    EXAMPLE#4
                    QUESTION: Who was S. V. Mikhalkov's co-author in writing the text of the anthem of the Soviet Union?
                    CANDIDATE PROPERTIES:
                    P31: instance of -- that class of which this subject is a particular example and member; different from P279 (subclass of); for example: K2 is an instance of mountain; volcano is a subclass of mountain (and an instance of volcanic landform)
                    P86: composer -- person(s) who wrote the music [for lyricist, use "lyrics by" (P676)]
                    P1412: languages spoken, written or signed -- language(s) that a person or a people speaks, writes or signs, including the native language(s)
                    P92: main regulatory text -- text setting the main rules by which the subject is regulated
                    P50: author -- main creator(s) of a written work (use on works, not humans); use P2093 (author name string) when Wikidata item is unknown or does not exist
                    P155: follows -- immediately prior item in a series of which the subject is a part, preferably use as qualifier of P179 [if the subject has replaced the preceding item, e.g. political offices, use "replaces" (P1365)]
                    P676: lyricist -- author of song lyrics
                    **Reasoning:** 
                    - the question asks about an co-author of a some national anthem's text, not the music itself!
                    - description of P676 makes it clear that it represent "author of song lyrics" - that's what we needed
                    - P31's description is too general and not correspond to songs's author
                    - P86, P50 are close, but they describe a composer/author of written work, not a lyrics writer
                    - other properties also describe something another than lyric's writer
                    RELEVANT PROPERTIES: P676


                    EXAMPLE#5
                    QUESTION: In which country did the great Russian chess player Alexander Alekhine end his life?
                    CANDIDATE PROPERTIES:
                    P20: place of death -- most specific known (e.g. city instead of country, or hospital instead of city) death location of a person, animal or fictional character
                    P27: country of citizenship -- the object is a country that recognizes the subject as its citizen
                    P582: end time -- moment when an entity ceases to exist or a statement stops being valid
                    P31: instance of -- that class of which this subject is a particular example and member; different from P279 (subclass of); for example: K2 is an instance of mountain; volcano is a subclass of mountain (and an instance of volcanic landform)
                    P276: location -- location of the object, structure or event. In the case of an administrative entity as containing item use P131. For statistical entities use P8138. In the case of a geographic entity use P706. Use P7153 for locations associated with the object
                    P26: spouse -- the subject has the object as their spouse (husband, wife, partner, etc.). Use "unmarried partner" (P451) for non-married companions
                    P17: country -- sovereign state that this item is in (not to be used for human beings)
                    P39: position held -- subject currently or formerly holds the object position or public office
                    P580: start time -- time an entity begins to exist or a statement starts being valid
                    P19: place of birth -- most specific known birth location of a person, animal or fictional character
                    **Reasoning:**
                    - the question asks about location of somebody's death and it's country where it happened
                    - in the candidates we can clearly see P20 and P17, which respresent **place of death** and **contry**
                    - P276 and P582 are close, but too common for this QUESTION
                    - other has no relation to QUESTION's PROPERTIES
                    RELEVANT PROPERTIES: P20, P17
"""

SYSTEM_PROMPT_PROP = f"""You are an experienced WikiData user. 
                   You receive a QUESTION and a SET of extracted Wikidata CANDIDATE PROPERTIES, retrieved from an index based on similarity between the QUESTION and the PROPERTY. 
                   Each property has a wikidata ID, label and description in a form of Wikidata ID: label -- description. 
                   Your task is to select only the most semantically relevant properties based on their labels and descriptions.
                   
                   Instruction:
                    -- Ignore properties that are loosely related, homonyms, or have indirect contextual links to the QUESTION.
                    -- If no property perfectly matches the question, select the best available property based on contextual closeness.
                    -- If multiple properties seem relevant, prefer those with a stronger direct connection to the query subject.
                    -- Some questions may have multiple correct properties. Return all properties that are directly relevant. 
                    -- Always include reasoning in the beginning of the responce.
                    -- Always place "RELEVANT PROPERTIES: ..." at the end of your response.

                   Here are examples:
                   {FEW_SHOT_EXAMPLES_PROP}
                   """

USER_PROMPT_PROP = """QUESTION: {question}\nCANDIDATE PROPERTIES:\n{candidate_list_str}"""

## Functions

In [4]:
'''
Prompt and Preprocessing Functions
'''

def get_default(dic, field, default=''):
    return dic[field] if field in dic else default

    
def prepare_candidates(sample, retriever_output):    
    sample_id = sample['id']
    # relevant_ids = list(sample['id2alias'].keys())
    retrived_cands = retriever_output[str(sample_id)]
    retrived_ids = retrived_cands.keys()
    retrived_ids = list(filter(lambda x: x is not None, retrived_ids))
    
    candidates_set = list(set(retrived_ids
                              # + relevant_ids
                             ))
    # HERE I DO SHUFFLING, SINCE I DO NOT KNOW THE ORDER FROM RETRIEVER
    random.shuffle(candidates_set)
    
    entities_list = [f"{key}: {get_default(retrived_cands[key], 'label')} -- {get_default(retrived_cands[key], 'description')}" for key in retrived_ids]
    entities_str = "\n".join(entities_list)
    return entities_str
        

def construct_user_prompt(sample, retriever_output, user_prompt):
    question = sample['en_question']
    candidates_set_string = prepare_candidates(sample, retriever_output)
    user_prompt = user_prompt.format(question=question, candidate_list_str=candidates_set_string)
    return user_prompt


def create_batch_file(batch_data, batch_retriver, system_prompt, user_prompt):
    file = []
    for i in range(len(batch_data)):
        messages=[{"role": "system", "content": system_prompt},
                  {"role": "user", "content": construct_user_prompt(batch_data[i], batch_retriver, user_prompt)}]
        new_request = {"custom_id": f"request_{i}", 
                       "method": "POST", 
                       "url": "/v1/chat/completions", 
                       "body": {"model": "gpt-4-1106-preview", 
                                "messages": messages, 
                                "temperature": 0,
                                "max_tokens": 400,
                                "top_p": 1,
                                "frequency_penalty": 0,
                                "presence_penalty": 0,
                                # "response_format": {"type": "json_object"}
                               }}
        file.append(new_request)
    return file

In [74]:
'''
Run OpenAI Functions
'''
def ask_openai_batch(file_path):
    client = openai.OpenAI(
        api_key=os.environ.get("OPENAI_KEY"),
    )

    batch_input_file = client.files.create(file=open(file_path, 'rb'), purpose='batch')
    batch_input_file_id = batch_input_file.id

    response_batch = None
    idx = 0
    while response_batch is None and idx < 10:
        try:
            response_batch = client.batches.create(
                input_file_id=batch_input_file_id,
                endpoint="/v1/chat/completions",
                completion_window='24h',
            )
            idx += 1
            
        except openai.APIConnectionError as e:
            idx += 1
            print(e)

    if idx == 10:
        return "This sentence was not judged."

    return response_batch


def check_batch(batch_id):
    client = openai.OpenAI(
        api_key=os.environ.get("OPENAI_KEY"),
    )

    return client.batches.retrieve(batch_id)

def get_response_batch(batch_id):
    client = openai.OpenAI(
        api_key=os.environ.get("OPENAI_KEY"),
    )
    file_response = client.files.content(batch_id)
        
    response_data = []
    for line in file_response.iter_lines():
        answer_dict = eval(line.replace('null', 'None'))
        answer_str = answer_dict['response']['body']['choices'][0]['message']['content']
        response_data.append(answer_str)

    return response_data

def get_usage_info(batch_id):
    client = openai.OpenAI(
        api_key=os.environ.get("OPENAI_KEY"),
    )
    file_response = client.files.content(check_batch(id_).output_file_id)

    prompt_tokens_total, completion_tokens_total = 0, 0
    for line in file_response.iter_lines():
        answer_dict = eval(line.replace('null', 'None'))
        usage = answer_dict['response']['body']['usage']
        prompt_tokens_total += usage['prompt_tokens']
        completion_tokens_total += usage['completion_tokens']
    return {
        'prompt_tokens': prompt_tokens_total, 
        'completion_tokens': completion_tokens_total
    }

In [6]:
'''
Eval Functions
'''
def extract_wikidata_ids(text, search_pattern):
    pattern = search_pattern + r"\s*([\w, ]+)"    
    match = re.search(pattern, text)
    
    if match:
        # Extract the IDs and return them as a list
        wikidata_ids = match.group(1).split(', ')
        return wikidata_ids
    else:
        return []
        
# RELEVANT ENTITIES: / RELEVANT PROPERTIES:
def parse_responce(responce_str, search_pattern):
    if search_pattern == 'entity':
        search_pattern = 'RELEVANT ENTITIES:'
    elif search_pattern == 'property':
        search_pattern = 'RELEVANT PROPERTIES:'
    else:
        raise Exception('Choose correct seacrh pattern')
    
    # parse main case
    if search_pattern in responce_str:
        wikidata_ids = extract_wikidata_ids(responce_str, search_pattern)
        return wikidata_ids
    # I hope there are no other cases, but still -- better to ch
    else:
        return None


def get_metrics(eval_data, response_data, search_pattern, gold_key):

    assert len(eval_data) == len(response_data)

    get_gold = lambda x: x[gold_key]['query'].keys()
    
    overall_precision, overall_recall, overall_f1 = 0, 0, 0
    failed_examples = []
    error_generations, incomplete_generation = [], []
    false_positive_generations = []
    for idx, pair in enumerate(zip(eval_data, response_data)):
        sample, gpt_response = pair
        extracted_candidates = parse_responce(gpt_response, search_pattern)

        gold_ids = get_gold(sample)
        
        if extracted_candidates:
            true_positives = set(extracted_candidates) & set(gold_ids)
    
            precision = len(true_positives) / len(extracted_candidates) if extracted_candidates else 0.0
            if precision == 0:
                false_positive_generations.append([idx, gpt_response, gold_ids])
    
            # Recall: Proportion of gold entities that are correctly predicted
            recall = len(true_positives) / len(gold_ids) if gold_ids else 0.0
            if recall == 0:
                error_generations.append([idx, gpt_response, gold_ids])
            if recall != 1:
                incomplete_generation.append([idx, gpt_response, gold_ids])
    
            # F1-Score: Harmonic mean of Precision and Recall
            f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0.0
    
            overall_precision += precision
            overall_recall += recall
            overall_f1 += f1
        else:
            failed_examples.append(gpt_response)
        
    overall_precision /= len(response_data)
    overall_recall /= len(response_data)
    overall_f1 /= len(response_data)
    
    print('Precision: ', overall_precision)
    print('Recall: ', overall_recall)
    print('F1: ', overall_f1)


def get_result(data, retriver_output, response_arr, search_pattern):
    result = {}
    for idx, pair in enumerate(zip(data, response_arr)):
        sample, gpt_response = pair
        extracted_candidates = parse_responce(gpt_response, search_pattern)
        uid = str(sample['id'])
        result[uid] = {
            'question_eng': sample['en_question'],
            'query': sample['query'],
            'candidates': {cand: retriver_output[uid][cand]['label']
                           for cand in extracted_candidates if cand in retriver_output[uid]} if extracted_candidates else {}
        }
    return result

## Data

In [195]:
task_name = 'entity' # entity / property
data_name = 'rubq'
retriver_out_path = 'data/retriver_out/rubq/rubq_test_entities_retrieval.json'
data_path = 'data/preprocessed/rubq/rubq_test.json'
topk = 100

retriver_out = json.load(open(retriver_out_path))
retriver_out = {k: dict(list(v.items())[:topk]) for k, v in retriver_out.items()}

data = json.load(open(data_path))['dataset']
eval_data = [sample for sample in data if str(sample['id']) in retriver_out]

if task_name == 'entity':
    SYSTEM_PROMPT = SYSTEM_PROMPT_ENTS
    USER_PROMPT = USER_PROMPT_ENTS
    retr_field = 'entities'
elif task_name == 'property':
    SYSTEM_PROMPT = SYSTEM_PROMPT_PROP
    USER_PROMPT = USER_PROMPT_PROP
    retr_field = 'relations'
else:
    raise Exception('Choose correct task_name')

print(
    f'len(data) = {len(data)}', 
    f'len(eval) = {len(eval_data)}',
    f'retr {topk} = {np.mean([len(
        set(x[retr_field]['query'].keys()) & 
        set(retriver_out[str(x['id'])])
    ) > 0 for x in eval_data])}',
    sep='\n'
)

len(data) = 480
len(eval) = 479
retr 100 = 0.7369519832985386


## Post to OpenAI

In [79]:
price_per_1m = {'prompt': 5, 'completion': 15}

In [43]:
batch_file = create_batch_file(eval_data, retriver_out, SYSTEM_PROMPT, USER_PROMPT)

In [79]:
idx = 69
print(eval_data[idx]['entities']['query'].keys())
print(eval_data[idx]['relations']['query'].keys())
print('======')
print(batch_file[idx]['body']['messages'][1]['content'])

dict_keys(['Q12994', 'Q157070'])
dict_keys(['P582', 'P131'])
QUESTION: When did Bruges die in Flanders County?
CANDIDATE ENTITIES:
Q663749: judicial arrondissement of Bruges -- former judicial arrondissement of Belgium
Q111814592: why some whales die on land: first whale did so -- narrative motif documented in Thompson's Motif-Index of Folk-Literature
Q104472038: The funeral of Charles I, Count of Flanders, celebrated in Bruges in the Church of St. Christopher on 22 April 1127 -- painting by Jan Van Beers
Q17101798: Exposition des primitifs flamands à Bruges -- exhibition in 1902 about Early Netherlandish Painting, Bruges,
Q47578200: Did Jesus die of a 'broken heart'? -- scientific article published in August 2009
Q157070: County of Flanders -- A county and historic territory in the Low Countries
Q35088344: Characteristics of Chinese rural young suicides: who did not have a strong intent to die. -- scientific article
Q40798683: 'In an important way, I did die': uncertainty and revival 

In [44]:
batch_file_path = os.path.join(os.path.dirname(retriver_out_path), "gpt-4-1106-preview",
                               f'{data_name}_{task_name}_{topk}.jsonl')

if not os.path.exists(os.path.dirname(batch_file_path)):
    os.makedirs(os.path.dirname(batch_file_path))

with open(batch_file_path, 'w') as jsonl_file:
    for entry in batch_file:
        jsonl_file.write(json.dumps(entry) + '\n')

In [45]:
batch_response = ask_openai_batch(batch_file_path)

In [46]:
batch_response

Batch(id='batch_67d40bb8a3088190a1efb5ea07bd70ac', completion_window='24h', created_at=1741949880, endpoint='/v1/chat/completions', input_file_id='file-6GfyrKnkWnutYNhUQayAhF', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1742036280, failed_at=None, finalizing_at=None, in_progress_at=None, metadata=None, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))

## gpt-4-1106-preview

### Rubq Entity 100

In [196]:
id_ = 'batch_67c8cd0d2808819088783fa5f60e8285'
check_batch(id_).status, check_batch(id_).request_counts

('completed', BatchRequestCounts(completed=479, failed=0, total=479))

In [197]:
out_id_ = check_batch(id_).output_file_id
response_list = get_response_batch(out_id_)
result = get_result(eval_data, retriver_out, response_list, search_pattern=task_name)
get_metrics(eval_data, response_list, search_pattern=task_name, gold_key=retr_field)

Precision:  0.5997448387845049
Recall:  0.6256089074460681
F1:  0.6015243397289326


In [198]:
result_file_path = os.path.join(os.path.dirname(retriver_out_path), "gpt-4-1106-preview",
                               f'{data_name}_result_{task_name}_{topk}.jsonl')

json.dump(result, open(result_file_path, 'w'))

In [199]:
response_file_path = os.path.join(os.path.dirname(retriver_out_path), "gpt-4-1106-preview",
                               f'response_{task_name}_{topk}.jsonl')

json.dump(response_list, open(response_file_path, 'w'))

In [200]:
usage = get_usage_info(id_)
print(
    f"usage = {usage}",
    f"price = {round(price_per_1m['prompt'] * usage['prompt_tokens']/1e6 + price_per_1m['completion'] * usage['completion_tokens']/1e6, 2)}$",
    sep='\n'
)

usage = {'prompt_tokens': 1789258, 'completion_tokens': 48322}
price = 9.67$


### Rubq Entity 10

In [190]:
id_ = 'batch_67c8d69d880481908a7a56d38a85445c'
check_batch(id_).status, check_batch(id_).request_counts

('completed', BatchRequestCounts(completed=479, failed=0, total=479))

In [191]:
out_id_ = check_batch(id_).output_file_id
response_list = get_response_batch(out_id_)
result = get_result(eval_data, retriver_out, response_list, search_pattern=task_name)
get_metrics(eval_data, response_list, search_pattern=task_name, gold_key=retr_field)

Precision:  0.5207111376213673
Recall:  0.5511482254697286
F1:  0.5258342446233887


In [192]:
result_file_path = os.path.join(os.path.dirname(retriver_out_path), "gpt-4-1106-preview",
                               f'{data_name}_result_{task_name}_{topk}.jsonl')

json.dump(result, open(result_file_path, 'w'))

In [193]:
response_file_path = os.path.join(os.path.dirname(retriver_out_path), "gpt-4-1106-preview",
                               f'response_{task_name}_{topk}.jsonl')

json.dump(response_list, open(response_file_path, 'w'))

In [194]:
usage = get_usage_info(id_)
print(
    f"usage = {usage}",
    f"price = {round(price_per_1m['prompt'] * usage['prompt_tokens']/1e6 + price_per_1m['completion'] * usage['completion_tokens']/1e6, 2)}$",
    sep='\n'
)

usage = {'prompt_tokens': 651320, 'completion_tokens': 47531}
price = 3.97$


### Rubq Property 100

In [182]:
id_ = 'batch_67c8dc8911a48190853a7856a5936e84'
check_batch(id_).status, check_batch(id_).request_counts

('completed', BatchRequestCounts(completed=474, failed=0, total=474))

In [184]:
out_id_ = check_batch(id_).output_file_id
response_list = get_response_batch(out_id_)
result = get_result(eval_data, retriver_out, response_list, search_pattern=task_name)
get_metrics(eval_data, response_list, search_pattern=task_name, gold_key=retr_field)

Precision:  0.7297669278681936
Recall:  0.7626582278481012
F1:  0.7283453887884268


In [185]:
result_file_path = os.path.join(os.path.dirname(retriver_out_path), "gpt-4-1106-preview",
                               f'{data_name}_result_{task_name}_{topk}.jsonl')

json.dump(result, open(result_file_path, 'w'))

In [186]:
response_file_path = os.path.join(os.path.dirname(retriver_out_path), "gpt-4-1106-preview",
                               f'response_{task_name}_{topk}.jsonl')

json.dump(response_list, open(response_file_path, 'w'))

In [187]:
usage = get_usage_info(id_)
print(
    f"usage = {usage}",
    f"price = {round(price_per_1m['prompt'] * usage['prompt_tokens']/1e6 + price_per_1m['completion'] * usage['completion_tokens']/1e6, 2)}$",
    sep='\n'
)

usage = {'prompt_tokens': 2341091, 'completion_tokens': 59164}
price = 12.59$


### Rubq Property 10

In [177]:
id_ = 'batch_67c8e18605988190af52f49d7747c0fc'
check_batch(id_).status, check_batch(id_).request_counts

('completed', BatchRequestCounts(completed=474, failed=0, total=474))

In [178]:
out_id_ = check_batch(id_).output_file_id
response_list = get_response_batch(out_id_)
result = get_result(eval_data, retriver_out, response_list, search_pattern=task_name)
get_metrics(eval_data, response_list, search_pattern=task_name, gold_key=retr_field)

Precision:  0.7324191279887483
Recall:  0.7563291139240507
F1:  0.7312939521800285


In [179]:
result_file_path = os.path.join(os.path.dirname(retriver_out_path), "gpt-4-1106-preview",
                               f'{data_name}_result_{task_name}_{topk}.jsonl')

json.dump(result, open(result_file_path, 'w'))

In [180]:
response_file_path = os.path.join(os.path.dirname(retriver_out_path), "gpt-4-1106-preview",
                               f'response_{task_name}_{topk}.jsonl')

json.dump(response_list, open(response_file_path, 'w'))

In [181]:
usage = get_usage_info(id_)
print(
    f"usage = {usage}",
    f"price = {round(price_per_1m['prompt'] * usage['prompt_tokens']/1e6 + price_per_1m['completion'] * usage['completion_tokens']/1e6, 2)}$",
    sep='\n'
)

usage = {'prompt_tokens': 1122868, 'completion_tokens': 55037}
price = 6.44$


### Lcquad Entity 100

In [169]:
id_ = 'batch_67c8e547658481908cb0ab9d2126a95b'
check_batch(id_).status, check_batch(id_).request_counts

('completed', BatchRequestCounts(completed=4540, failed=0, total=4540))

In [171]:
out_id_ = check_batch(id_).output_file_id
response_list = get_response_batch(out_id_)
result = get_result(eval_data, retriver_out, response_list, search_pattern=task_name)
get_metrics(eval_data, response_list, search_pattern=task_name, gold_key=retr_field)

Precision:  0.5937634780815261
Recall:  0.5361600587371511
F1:  0.5413891375925214


In [172]:
result_file_path = os.path.join(os.path.dirname(retriver_out_path), "gpt-4-1106-preview",
                               f'{data_name}_result_{task_name}_{topk}.jsonl')

json.dump(result, open(result_file_path, 'w'))

In [173]:
response_file_path = os.path.join(os.path.dirname(retriver_out_path), "gpt-4-1106-preview",
                               f'response_{task_name}_{topk}.jsonl')

json.dump(response_list, open(response_file_path, 'w'))

In [174]:
usage = get_usage_info(id_)
print(
    f"usage = {usage}",
    f"price = {round(price_per_1m['prompt'] * usage['prompt_tokens']/1e6 + price_per_1m['completion'] * usage['completion_tokens']/1e6, 2)}$",
    sep='\n'
)

usage = {'prompt_tokens': 16954379, 'completion_tokens': 570771}
price = 93.33$


### Lcquad Entity 10

In [164]:
id_ = 'batch_67c8ec2b8dcc8190b64f039cc0fdf2a5'
check_batch(id_).status, check_batch(id_).request_counts

('completed', BatchRequestCounts(completed=4540, failed=0, total=4540))

In [165]:
out_id_ = check_batch(id_).output_file_id
response_list = get_response_batch(out_id_)
result = get_result(eval_data, retriver_out, response_list, search_pattern=task_name)
get_metrics(eval_data, response_list, search_pattern=task_name, gold_key=retr_field)

Precision:  0.5292405251381022
Recall:  0.4805066079295155
F1:  0.4865486997094956


In [166]:
result_file_path = os.path.join(os.path.dirname(retriver_out_path), "gpt-4-1106-preview",
                               f'{data_name}_result_{task_name}_{topk}.jsonl')

json.dump(result, open(result_file_path, 'w'))

In [167]:
response_file_path = os.path.join(os.path.dirname(retriver_out_path), "gpt-4-1106-preview",
                               f'response_{task_name}_{topk}.jsonl')

json.dump(response_list, open(response_file_path, 'w'))

In [168]:
usage = get_usage_info(id_)
print(
    f"usage = {usage}",
    f"price = {round(price_per_1m['prompt'] * usage['prompt_tokens']/1e6 + price_per_1m['completion'] * usage['completion_tokens']/1e6, 2)}$",
    sep='\n'
)

usage = {'prompt_tokens': 6182021, 'completion_tokens': 509800}
price = 38.56$


### Lcquad Property 100

In [154]:
id_ = 'batch_67c9879c57088190a4e013e99451a318'
check_batch(id_).status, check_batch(id_).request_counts

('completed', BatchRequestCounts(completed=4541, failed=0, total=4541))

In [155]:
out_id_ = check_batch(id_).output_file_id
response_list = get_response_batch(out_id_)
result = get_result(eval_data, retriver_out, response_list, search_pattern=task_name)
get_metrics(eval_data, response_list, search_pattern=task_name, gold_key=retr_field)

Precision:  0.7227689167129817
Recall:  0.608199368714674
F1:  0.6294626352873383


In [156]:
result_file_path = os.path.join(os.path.dirname(retriver_out_path), "gpt-4-1106-preview",
                               f'{data_name}_result_{task_name}_{topk}.jsonl')

json.dump(result, open(result_file_path, 'w'))

In [157]:
response_file_path = os.path.join(os.path.dirname(retriver_out_path), "gpt-4-1106-preview",
                               f'response_{task_name}_{topk}.jsonl')

json.dump(response_list, open(response_file_path, 'w'))

In [158]:
usage = get_usage_info(id_)
print(
    f"usage = {usage}",
    f"price = {round(price_per_1m['prompt'] * usage['prompt_tokens']/1e6 + price_per_1m['completion'] * usage['completion_tokens']/1e6, 2)}$",
    sep='\n'
)

usage = {'prompt_tokens': 22346997, 'completion_tokens': 678813}
price = 121.92$


### Lcquad Property 10 

In [147]:
id_ = 'batch_67c9925f48d4819095e49188abb1eccd'
check_batch(id_).status, check_batch(id_).request_counts

('completed', BatchRequestCounts(completed=4541, failed=0, total=4541))

In [149]:
out_id_ = check_batch(id_).output_file_id
response_list = get_response_batch(out_id_)
result = get_result(eval_data, retriver_out, response_list, search_pattern=task_name)
get_metrics(eval_data, response_list, search_pattern=task_name, gold_key=retr_field)

Precision:  0.7320597518901855
Recall:  0.5993540336196135
F1:  0.6343499963297295


In [150]:
result_file_path = os.path.join(os.path.dirname(retriver_out_path), "gpt-4-1106-preview",
                               f'{data_name}_result_{task_name}_{topk}.jsonl')

json.dump(result, open(result_file_path, 'w'))

In [151]:
response_file_path = os.path.join(os.path.dirname(retriver_out_path), "gpt-4-1106-preview",
                               f'response_{task_name}_{topk}.jsonl')

json.dump(response_list, open(response_file_path, 'w'))

In [152]:
usage = get_usage_info(id_)
print(
    f"usage = {usage}",
    f"price = {round(price_per_1m['prompt'] * usage['prompt_tokens']/1e6 + price_per_1m['completion'] * usage['completion_tokens']/1e6, 2)}$",
    sep='\n'
)

usage = {'prompt_tokens': 10758067, 'completion_tokens': 651753}
price = 63.57$


### Pat Entity 100 

In [141]:
id_ = 'batch_67c9a4b588ec8190b5482f5e69e83e39'
check_batch(id_).status, check_batch(id_).request_counts

('completed', BatchRequestCounts(completed=1199, failed=0, total=1199))

In [143]:
out_id_ = check_batch(id_).output_file_id
response_list = get_response_batch(out_id_)
result = get_result(eval_data, retriver_out, response_list, search_pattern=task_name)
get_metrics(eval_data, response_list, search_pattern=task_name, gold_key=retr_field)

Precision:  0.44856825132054484
Recall:  0.45287739783152625
F1:  0.4499582985821518


In [144]:
result_file_path = os.path.join(os.path.dirname(retriver_out_path), "gpt-4-1106-preview",
                               f'{data_name}_result_{task_name}_{topk}.jsonl')

json.dump(result, open(result_file_path, 'w'))

In [145]:
response_file_path = os.path.join(os.path.dirname(retriver_out_path), "gpt-4-1106-preview",
                               f'response_{task_name}_{topk}.jsonl')

json.dump(response_list, open(response_file_path, 'w'))

In [146]:
usage = get_usage_info(id_)
print(
    f"usage = {usage}",
    f"price = {round(price_per_1m['prompt'] * usage['prompt_tokens']/1e6 + price_per_1m['completion'] * usage['completion_tokens']/1e6, 2)}$",
    sep='\n'
)

usage = {'prompt_tokens': 3993032, 'completion_tokens': 139411}
price = 22.06$


### Pat Entity 10 

In [135]:
id_ = 'batch_67cac8421b148190af92580078ea6f20'
check_batch(id_).status, check_batch(id_).request_counts

('completed', BatchRequestCounts(completed=1199, failed=0, total=1199))

In [136]:
out_id_ = check_batch(id_).output_file_id
response_list = get_response_batch(out_id_)
result = get_result(eval_data, retriver_out, response_list, search_pattern=task_name)
get_metrics(eval_data, response_list, search_pattern=task_name, gold_key=retr_field)

Precision:  0.4359883236030025
Recall:  0.44286905754795663
F1:  0.43819849874895755


In [137]:
result_file_path = os.path.join(os.path.dirname(retriver_out_path), "gpt-4-1106-preview",
                               f'{data_name}_result_{task_name}_{topk}.jsonl')

json.dump(result, open(result_file_path, 'w'))

In [138]:
response_file_path = os.path.join(os.path.dirname(retriver_out_path), "gpt-4-1106-preview",
                               f'response_{task_name}_{topk}.jsonl')

json.dump(response_list, open(response_file_path, 'w'))

In [139]:
usage = get_usage_info(id_)
print(
    f"usage = {usage}",
    f"price = {round(price_per_1m['prompt'] * usage['prompt_tokens']/1e6 + price_per_1m['completion'] * usage['completion_tokens']/1e6, 2)}$",
    sep='\n'
)

usage = {'prompt_tokens': 1605520, 'completion_tokens': 128690}
price = 9.96$


### Pat Property 100 

In [127]:
id_ = 'batch_67d2d7dfea508190b5fc8e2c74c1513f'
check_batch(id_).status, check_batch(id_).request_counts

('completed', BatchRequestCounts(completed=1199, failed=0, total=1199))

In [128]:
out_id_ = check_batch(id_).output_file_id
response_list = get_response_batch(out_id_)
result = get_result(eval_data, retriver_out, response_list, search_pattern=task_name)
get_metrics(eval_data, response_list, search_pattern=task_name, gold_key=retr_field)

Precision:  0.6513622463163748
Recall:  0.25604670558799014
F1:  0.35431642771092237


In [129]:
result_file_path = os.path.join(os.path.dirname(retriver_out_path), "gpt-4-1106-preview",
                               f'{data_name}_result_{task_name}_{topk}.jsonl')

json.dump(result, open(result_file_path, 'w'))

In [130]:
response_file_path = os.path.join(os.path.dirname(retriver_out_path), "gpt-4-1106-preview",
                               f'response_{task_name}_{topk}.jsonl')

json.dump(response_list, open(response_file_path, 'w'))

In [131]:
usage = get_usage_info(id_)
print(
    f"usage = {usage}",
    f"price = {round(price_per_1m['prompt'] * usage['prompt_tokens']/1e6 + price_per_1m['completion'] * usage['completion_tokens']/1e6, 2)}$",
    sep='\n'
)

usage = {'prompt_tokens': 5934881, 'completion_tokens': 174872}
price = 32.3$


### Pat Property 10 

In [118]:
id_ = 'batch_67d2f1cba2dc819095224c4246eb1436'
check_batch(id_).status, check_batch(id_).request_counts

('completed', BatchRequestCounts(completed=1199, failed=0, total=1199))

In [121]:
out_id_ = check_batch(id_).output_file_id
response_list = get_response_batch(out_id_)
result = get_result(eval_data, retriver_out, response_list, search_pattern=task_name)
get_metrics(eval_data, response_list, search_pattern=task_name, gold_key=retr_field)

Precision:  0.7322073950514317
Recall:  0.2970530998053915
F1:  0.41236347750109015


In [122]:
result_file_path = os.path.join(os.path.dirname(retriver_out_path), "gpt-4-1106-preview",
                               f'{data_name}_result_{task_name}_{topk}.jsonl')

json.dump(result, open(result_file_path, 'w'))

In [123]:
response_file_path = os.path.join(os.path.dirname(retriver_out_path), "gpt-4-1106-preview",
                               f'response_{task_name}_{topk}.jsonl')

json.dump(response_list, open(response_file_path, 'w'))

In [124]:
usage = get_usage_info(id_)
print(
    f"usage = {usage}",
    f"price = {round(price_per_1m['prompt'] * usage['prompt_tokens']/1e6 + price_per_1m['completion'] * usage['completion_tokens']/1e6, 2)}$",
    sep='\n'
)

usage = {'prompt_tokens': 2787812, 'completion_tokens': 149742}
price = 16.19$


### Qald Entity 100 

In [105]:
id_ = 'batch_67d3ffc087488190bd4a5b1b2b9e774c'
check_batch(id_).status, check_batch(id_).request_counts

('completed', BatchRequestCounts(completed=386, failed=0, total=386))

In [17]:
out_id_ = check_batch(id_).output_file_id
response_list = get_response_batch(out_id_)
result = get_result(eval_data, retriver_out, response_list, search_pattern=task_name)
get_metrics(eval_data, response_list, search_pattern=task_name, gold_key=retr_field)

Precision:  0.6102208240809276
Recall:  0.5256908462867014
F1:  0.5385635126243932


In [19]:
result_file_path = os.path.join(os.path.dirname(retriver_out_path), "gpt-4-1106-preview",
                               f'{data_name}_result_{task_name}_{topk}.jsonl')

json.dump(result, open(result_file_path, 'w'))

In [20]:
response_file_path = os.path.join(os.path.dirname(retriver_out_path), "gpt-4-1106-preview",
                               f'response_{task_name}_{topk}.jsonl')

json.dump(response_list, open(response_file_path, 'w'))

In [106]:
usage = get_usage_info(id_)
print(
    f"usage = {usage}",
    f"price = {round(price_per_1m['prompt'] * usage['prompt_tokens']/1e6 + price_per_1m['completion'] * usage['completion_tokens']/1e6, 2)}$",
    sep='\n'
)

usage = {'prompt_tokens': 1434749, 'completion_tokens': 44114}
price = 7.84$


### Qald Entity 10 

In [103]:
id_ = 'batch_67d4060f338c8190958d3167191e71d7'
check_batch(id_).status, check_batch(id_).request_counts

('completed', BatchRequestCounts(completed=386, failed=0, total=386))

In [28]:
out_id_ = check_batch(id_).output_file_id
response_list = get_response_batch(out_id_)
result = get_result(eval_data, retriver_out, response_list, search_pattern=task_name)
get_metrics(eval_data, response_list, search_pattern=task_name, gold_key=retr_field)

Precision:  0.5639464594127805
Recall:  0.4844559585492228
F1:  0.5057101361505504


In [29]:
result_file_path = os.path.join(os.path.dirname(retriver_out_path), "gpt-4-1106-preview",
                               f'{data_name}_result_{task_name}_{topk}.jsonl')

json.dump(result, open(result_file_path, 'w'))

In [30]:
response_file_path = os.path.join(os.path.dirname(retriver_out_path), "gpt-4-1106-preview",
                               f'response_{task_name}_{topk}.jsonl')

json.dump(response_list, open(response_file_path, 'w'))

In [104]:
usage = get_usage_info(id_)
print(
    f"usage = {usage}",
    f"price = {round(price_per_1m['prompt'] * usage['prompt_tokens']/1e6 + price_per_1m['completion'] * usage['completion_tokens']/1e6, 2)}$",
    sep='\n'
)

usage = {'prompt_tokens': 522706, 'completion_tokens': 39543}
price = 3.21$


### Qald Property 100 

In [101]:
id_ = 'batch_67d409fd2fb08190864a022ee64f685d'
check_batch(id_).status, check_batch(id_).request_counts

('completed', BatchRequestCounts(completed=384, failed=0, total=384))

In [39]:
out_id_ = check_batch(id_).output_file_id
response_list = get_response_batch(out_id_)
result = get_result(eval_data, retriver_out, response_list, search_pattern=task_name)
get_metrics(eval_data, response_list, search_pattern=task_name, gold_key=retr_field)

Precision:  0.7109809027777777
Recall:  0.6230468749999999
F1:  0.6288917824074071


In [40]:
result_file_path = os.path.join(os.path.dirname(retriver_out_path), "gpt-4-1106-preview",
                               f'{data_name}_result_{task_name}_{topk}.jsonl')

json.dump(result, open(result_file_path, 'w'))

In [41]:
response_file_path = os.path.join(os.path.dirname(retriver_out_path), "gpt-4-1106-preview",
                               f'response_{task_name}_{topk}.jsonl')

json.dump(response_list, open(response_file_path, 'w'))

In [102]:
usage = get_usage_info(id_)
print(
    f"usage = {usage}",
    f"price = {round(price_per_1m['prompt'] * usage['prompt_tokens']/1e6 + price_per_1m['completion'] * usage['completion_tokens']/1e6, 2)}$",
    sep='\n'
)

usage = {'prompt_tokens': 1905308, 'completion_tokens': 55194}
price = 10.35$


### Qald Property 10 

In [99]:
id_ = 'batch_67d40bb8a3088190a1efb5ea07bd70ac'
check_batch(id_).status, check_batch(id_).request_counts

('completed', BatchRequestCounts(completed=384, failed=0, total=384))

In [50]:
out_id_ = check_batch(id_).output_file_id
response_list = get_response_batch(out_id_)
result = get_result(eval_data, retriver_out, response_list, search_pattern=task_name)
get_metrics(eval_data, response_list, search_pattern=task_name, gold_key=retr_field)

Precision:  0.683376736111111
Recall:  0.5577256944444444
F1:  0.5875124007936506


In [51]:
result_file_path = os.path.join(os.path.dirname(retriver_out_path), "gpt-4-1106-preview",
                               f'{data_name}_result_{task_name}_{topk}.jsonl')

json.dump(result, open(result_file_path, 'w'))

In [52]:
response_file_path = os.path.join(os.path.dirname(retriver_out_path), "gpt-4-1106-preview",
                               f'response_{task_name}_{topk}.jsonl')

json.dump(response_list, open(response_file_path, 'w'))

In [100]:
usage = get_usage_info(id_)
print(
    f"usage = {usage}",
    f"price = {round(price_per_1m['prompt'] * usage['prompt_tokens']/1e6 + price_per_1m['completion'] * usage['completion_tokens']/1e6, 2)}$",
    sep='\n'
)

usage = {'prompt_tokens': 910185, 'completion_tokens': 53658}
price = 5.36$


## get error file

In [91]:
client = openai.OpenAI(
    api_key=os.environ.get("OPENAI_KEY"),
)
file_response = client.files.content(check_batch('batch_67cacfcc55cc81909fae43bfc1f4fe2c').error_file_id)

In [92]:
file_response.text

'{"id": "batch_req_67cad4c2ed648190a7c2db6ae406883a", "custom_id": "request_32", "response": {"status_code": 429, "request_id": "49e94329ca3590edcb57443f136b0c00", "body": {"error": {"message": "You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.", "type": "insufficient_quota", "param": null, "code": "insufficient_quota"}}}, "error": null}\n{"id": "batch_req_67cad4cbc730819097373e4d33c1b755", "custom_id": "request_147", "response": {"status_code": 429, "request_id": "d1c7c1ac87ff143e5cca484c9f76779e", "body": {"error": {"message": "You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.", "type": "insufficient_quota", "param": null, "code": "insufficient_quota"}}}, "error": null}\n{"id": "batch_req_67cad4d880188190

In [89]:
client = openai.OpenAI(
    api_key=os.environ.get("OPENAI_KEY"),
)

In [90]:
client.batches.cancel("batch_67cacfcc55cc81909fae43bfc1f4fe2c")

ConflictError: Error code: 409 - {'error': {'message': "Cannot cancel a batch with status 'completed'.", 'type': 'invalid_request_error', 'param': None, 'code': None}}