In [None]:
import os
import re
import json
import subprocess
from tqdm import tqdm
import pandas as pd
import numpy as np
import openai
import random

## Prompts

In [None]:
FEW_SHOT_EXAMPLES_ENTS = """
                  EXAMPLE#1
                   QUESTION: What does DiCaprio's full name sound like?
                   CANDIDATE ENTITIES: 
                   Q116673393: Sibon irmelindicaprioae -- species of snake
                    Q38111: Leonardo DiCaprio -- American actor and film producer (born 1974)
                    Q11461: sound -- vibration that propagates as an acoustic wave
                    Q36860035: DiCaprio -- family name
                    Q25349951: Martin Scorsese and Leonardo DiCaprio -- collaborations
                    Q56653813: DiCaprio 2 -- album by J.I.D
                   RELEVANT ENTITIES: Q38111.
                   
                   
                   EXAMPLE#2
                   QUESTION: In which city near Moscow is the new Jerusalem monastery located?
                    CANDIDATE ENTITIES: 
                    Q55502: Kingdom of Jerusalem -- medieval Christian kingdom in the Middle East
                    Q773979: New Jerusalem Monastery -- monastery in Moscow Oblast, Russia
                    Q1218: Jerusalem -- city in the Middle East, holy to the three Abrahamic religions
                    Q649: Moscow -- capital and most populous city of Russia
                    Q515: city -- large human settlement
                    Q13164: Moscow State University -- university in Moscow, Russia
                    Q6760: UTC+03:00 -- identifier for a time offset from UTC of +3
                    Q10540001: Jerusalem -- family name                    
                    RELEVANT ENTITIES: Q773979, Q649.

                    
                    EXAMPLE#3
                    QUESTION: What capital stands on the banks of Potalaka?
                    CANDIDATE ENTITIES: 
                    Q11626848: Mount Potalaka -- the mythical dwelling of the Buddhist bodhisattva Avalokiteśvara, said to exist in India
                    Q22687: bank -- financial institution that accepts deposits
                    Q60756888: Potalaka Guanyin -- sculpture by unknown artist (1965.556)
                    Q179444: Potomac River -- river in the mid-Atlantic United States
                    Q193893: capital -- upper part of a column (architecture)
                    Q60: New York City -- most populous city in the United States
                    Q5119: capital city -- primary governing city of a top-level (country) or first-level and second-level subdivision (country, state, province, regency, etc) political entity
                    **Reasoning:**  
                    - "Capital" refers to a **governing city**, not architectural elements (Q193893).  
                    - "Potalaka" (Q11626848) is a **mythical** location, so no real-world capital is directly linked.  
                    - Since no entity directly matches the question, **the most general applicable concept** is "capital city" (Q5119).
                    RELEVANT ENTITIES: Q179444, Q5119.
                    
                    
                    EXAMPLE#4 
                    QUESTION: What is the official language of Brazil?  
                    CANDIDATE ENTITIES:  
                    Q750553: Spanish language -- Romance language originating in Spain  
                    Q5146: Portuguese language -- Romance language, official in Portugal and Brazil  
                    Q155: Brazil -- country in South America  
                    Q483110: Brasília -- capital city of Brazil
                    **Reasoning**:  
                    - "Official language" refers to the main language used for governance.  
                    - The description of Q5146 explicitly states it is official in Brazil.  
                    - Q750553 (Spanish) is incorrect because it is **not** the official language.  
                    - Q155 and Q483110 are related but not **languages**. 
                    RELEVANT ENTITIES: Q5146.  

                    EXAMPLE#5
                    QUESTION: What city was Karl Marx born in?
                    CANDIDATE ENTITIES:
                    Q61412: Eleanor Marx -- English-born activist and daughter of Karl Marx (1855–1898)
                    Q105367341: Karl Marx -- Holocaust victim (born 1890)
                    Q9061: Karl Marx -- German-born philosopher (1818–1883)
                    Q26828710: Karl Marx City -- 2016 film directed by Michael Tucker and Petra Epperlein
                    Q3138: Trier -- city in Rhineland-Palatinate, Germany
                    **Reasoning**:
                    - "born in" refers to a some person, not other objects like films or cities
                    - there is no information which can help us with a choice of people with similar names
                    - Q9061 is much more popular that Q105367341 - so we will choose it
                    - Q3138 seems to be a ANSWER to the QUESTION, but not an ENTITY mentioned in QUESTION - we reject it
                    RELEVANT ENTITIES: Q9061.  
"""

SYSTEM_PROMPT_ENTS = f"""You are an experienced WikiData user. 
                   You receive a QUESTION and a SET of extracted Wikidata CANDIDATE ENTITIES, retrieved from an index based on similarity between the QUESTION and the ENTITY. 
                   Each entity has a wikidata ID, label and description in a form of Wikidata ID: label -- description. 
                   Your task is to select only the most semantically relevant entities based on their labels and descriptions.
                   
                   Instruction:
                    -- Do not answer the question - you don't need find an entity representing an answer, you need to find an entities mentioned in that qustion.
                    -- Ignore entities that are loosely related, homonyms, or have indirect contextual links to the QUESTION.
                    -- If no entity perfectly matches the question, select the best available entity based on contextual closeness.
                    -- If multiple entities seem relevant, prefer those with a stronger direct connection to the query subject.
                    -- If there are ambiguous entities that relate to the different countries, ALWAYS choose the entity that is related to Russia, based on description.
                    -- Some questions may have multiple correct entities. Return all entities that are directly relevant. Do NOT include entities that are only indirectly related (e.g., broad categories like 'city' instead of a specific location).
                    -- DO NOT reject entity, if it fits to the entity mentioned in query, but DOES NOT DIRECTLY answer the question.
                    -- Always include reasoning in the beginning of the responce.
                    -- Always place "RELEVANT ENTITIES: ..." at the end of your response.

                   Here are examples:
                   {FEW_SHOT_EXAMPLES_ENTS}
                   """

USER_PROMPT_ENTS = """QUESTION: {question}\nCANDIDATE ENTITIES:\n{candidate_list_str}"""

In [None]:
FEW_SHOT_EXAMPLES_PROP = """
                    EXAMPLE#1
                    QUESTION: How much does Nurmagomedov weigh?
                    CANDIDATE PROPERTIES:
                    P166: award received -- award or recognition received by a person, organization or creative work
                    P54: member of sports team -- sports teams or clubs that the subject represents or represented
                    P1082: population -- number of people inhabiting the place; number of people of subject
                    P1351: number of points/goals/set scored -- goals / points scored in a match or an event used as qualifier to the participant. Use P1358 for league points.
                    P1350: number of matches played/races/starts -- matches or games a player or a team played during an event. Also a total number of matches a player officially appeared in during the whole career.
                    P2121: prize money -- amount in a specific currency
                    P585: point in time -- date something took place, existed or a statement was true; for providing time use the "refine date" property (P4241)
                    P2067: mass -- mass (in colloquial usage also known as weight) of the item
                    P1412: languages spoken, written or signed -- language(s) that a person or a people speaks, writes or signs, including the native language(s)
                    P2046: area -- area occupied by an object
                    RELEVANT PROPERTIES: P2067


                    EXAMPLE#2
                    QUESTION: Where is the Academy of Sciences of Armenia located
                    CANDIDATE PROPERTIES:
                    P463: member of -- organization, club or musical group to which the subject belongs. Do not use for membership in ethnic or social groups, nor for holding a political position, such as a member of parliament (use P39 for that)
                    P31: instance of -- that class of which this subject is a particular example and member; different from P279 (subclass of); for example: K2 is an instance of mountain; volcano is a subclass of mountain (and an instance of volcanic landform)
                    P19: place of birth -- most specific known birth location of a person, animal or fictional character
                    P580: start time -- time an entity begins to exist or a statement starts being valid
                    P582: end time -- moment when an entity ceases to exist or a statement stops being valid
                    P69: educated at -- educational institution attended by subject
                    P159: headquarters location -- city or town where an organization's headquarters is or has been situated. Use P276 qualifier for specific building
                    P585: point in time -- date something took place, existed or a statement was true; for providing time use the "refine date" property (P4241)
                    RELEVANT PROPERTIES: P159


                    EXAMPLE#3
                    QUESTION: What is the singing voice of Dmitri Hvorostovsky?
                    CANDIDATE PROPERTIES:
                    P412: voice type -- person's voice type. expected values: soprano, mezzo-soprano, contralto, countertenor, tenor, baritone, bass (and derivatives)
                    P725: voice actor -- performer of a spoken role in a creative work such as animation, video game, radio drama, or dubbing over [use "character role" (P453) as qualifier] [use "cast member" (P161) for live acting]
                    P175: performer -- actor, musician, band or other performer associated with this role or musical work
                    P453: character role -- specific role played or filled by subject -- use only as qualifier of "cast member" (P161), "voice actor" (P725)
                    P179: part of the series -- series which contains the subject
                    P674: characters -- characters which appear in this item (like plays, operas, operettas, books, comics, films, TV series, video games)
                    P1441: present in work -- this (fictional or fictionalized) entity, place, or person appears in that work as part of the narration (use P2860 for works citing other works, P361/P1433 for works being part of other works, P1343 for entities described in non-fictional accounts)
                    **Reasoning:** 
                    - the question asks about voice's characteristics of some person
                    - P412 is relevant, because it's a property described a voice type, which is voice's characteristic
                    - P725, P175, P674 represent properties described a person role in something, not related to voice
                    - other properties also represent something not similar to voice 
                    RELEVANT PROPERTIES: P412
                   
                   
                    EXAMPLE#4
                    QUESTION: Who was S. V. Mikhalkov's co-author in writing the text of the anthem of the Soviet Union?
                    CANDIDATE PROPERTIES:
                    P31: instance of -- that class of which this subject is a particular example and member; different from P279 (subclass of); for example: K2 is an instance of mountain; volcano is a subclass of mountain (and an instance of volcanic landform)
                    P86: composer -- person(s) who wrote the music [for lyricist, use "lyrics by" (P676)]
                    P1412: languages spoken, written or signed -- language(s) that a person or a people speaks, writes or signs, including the native language(s)
                    P92: main regulatory text -- text setting the main rules by which the subject is regulated
                    P50: author -- main creator(s) of a written work (use on works, not humans); use P2093 (author name string) when Wikidata item is unknown or does not exist
                    P155: follows -- immediately prior item in a series of which the subject is a part, preferably use as qualifier of P179 [if the subject has replaced the preceding item, e.g. political offices, use "replaces" (P1365)]
                    P676: lyricist -- author of song lyrics
                    **Reasoning:** 
                    - the question asks about an co-author of a some national anthem's text, not the music itself!
                    - description of P676 makes it clear that it represent "author of song lyrics" - that's what we needed
                    - P31's description is too general and not correspond to songs's author
                    - P86, P50 are close, but they describe a composer/author of written work, not a lyrics writer
                    - other properties also describe something another than lyric's writer
                    RELEVANT PROPERTIES: P676


                    EXAMPLE#5
                    QUESTION: In which country did the great Russian chess player Alexander Alekhine end his life?
                    CANDIDATE PROPERTIES:
                    P20: place of death -- most specific known (e.g. city instead of country, or hospital instead of city) death location of a person, animal or fictional character
                    P27: country of citizenship -- the object is a country that recognizes the subject as its citizen
                    P582: end time -- moment when an entity ceases to exist or a statement stops being valid
                    P31: instance of -- that class of which this subject is a particular example and member; different from P279 (subclass of); for example: K2 is an instance of mountain; volcano is a subclass of mountain (and an instance of volcanic landform)
                    P276: location -- location of the object, structure or event. In the case of an administrative entity as containing item use P131. For statistical entities use P8138. In the case of a geographic entity use P706. Use P7153 for locations associated with the object
                    P26: spouse -- the subject has the object as their spouse (husband, wife, partner, etc.). Use "unmarried partner" (P451) for non-married companions
                    P17: country -- sovereign state that this item is in (not to be used for human beings)
                    P39: position held -- subject currently or formerly holds the object position or public office
                    P580: start time -- time an entity begins to exist or a statement starts being valid
                    P19: place of birth -- most specific known birth location of a person, animal or fictional character
                    **Reasoning:**
                    - the question asks about location of somebody's death and it's country where it happened
                    - in the candidates we can clearly see P20 and P17, which respresent **place of death** and **contry**
                    - P276 and P582 are close, but too common for this QUESTION
                    - other has no relation to QUESTION's PROPERTIES
                    RELEVANT PROPERTIES: P20, P17
"""

SYSTEM_PROMPT_PROP = f"""You are an experienced WikiData user. 
                   You receive a QUESTION and a SET of extracted Wikidata CANDIDATE PROPERTIES, retrieved from an index based on similarity between the QUESTION and the PROPERTY. 
                   Each property has a wikidata ID, label and description in a form of Wikidata ID: label -- description. 
                   Your task is to select only the most semantically relevant properties based on their labels and descriptions.
                   
                   Instruction:
                    -- Ignore properties that are loosely related, homonyms, or have indirect contextual links to the QUESTION.
                    -- If no property perfectly matches the question, select the best available property based on contextual closeness.
                    -- If multiple properties seem relevant, prefer those with a stronger direct connection to the query subject.
                    -- Some questions may have multiple correct properties. Return all properties that are directly relevant. 
                    -- Always include reasoning in the beginning of the responce.
                    -- Always place "RELEVANT PROPERTIES: ..." at the end of your response.

                   Here are examples:
                   {FEW_SHOT_EXAMPLES_PROP}
                   """

USER_PROMPT_PROP = """QUESTION: {question}\nCANDIDATE PROPERTIES:\n{candidate_list_str}"""

## Functions

In [None]:
'''
Prompt and Preprocessing Functions
'''

def get_default(dic, field, default=''):
    """Look up ``field`` in ``dic``, falling back to ``default`` when the key is absent."""
    if field not in dic:
        return default
    return dic[field]

    
def prepare_candidates(sample, retriever_output, shuffle=True):
    """Render the retrieved candidates of one sample as prompt lines.

    Args:
        sample: Dataset record; only ``sample['id']`` is read.
        retriever_output: Mapping ``str(sample id) -> {candidate id -> info}``
            where ``info`` is a dict that may hold ``'label'`` and
            ``'description'`` (missing fields render as empty strings).
        shuffle: When true (default), the candidates are listed in random
            order so the model cannot exploit the retriever's ranking.
            Seed ``random`` beforehand for reproducible prompts. Pass
            ``False`` to keep the retriever's order.

    Returns:
        str: One ``"<id>: <label> -- <description>"`` line per candidate,
        joined with newlines.
    """
    retrieved = retriever_output[str(sample['id'])]
    candidate_ids = [key for key in retrieved if key is not None]

    # Shuffle the very list that is rendered below; previously a separate copy
    # was shuffled and then discarded, so the prompt order never changed.
    if shuffle:
        random.shuffle(candidate_ids)

    lines = [
        f"{key}: {retrieved[key].get('label', '')} -- {retrieved[key].get('description', '')}"
        for key in candidate_ids
    ]
    return "\n".join(lines)
        

def construct_user_prompt(sample, retriever_output, user_prompt):
    """Fill the ``user_prompt`` template with the sample's English question and its candidate list."""
    return user_prompt.format(
        question=sample['en_question'],
        candidate_list_str=prepare_candidates(sample, retriever_output),
    )

In [None]:
'''
Run OpenAI Functions
'''
def run_sample_with_openai(sample, retriever_output, system_prompt, user_prompt):
    """Send one sample to the chat-completions endpoint and return the reply text.

    Uses the notebook-level ``client`` and ``vllm_model_name`` globals.
    """
    chat_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": construct_user_prompt(sample, retriever_output, user_prompt)},
    ]
    # temperature=0 and top_p=1 keep the selection as repeatable as possible;
    # raise max_tokens if replies get cut off.
    decoding_options = dict(
        temperature=0,
        max_tokens=4000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    completion = client.chat.completions.create(
        model=vllm_model_name,
        messages=chat_messages,
        **decoding_options,
    )
    return completion.choices[0].message.content

In [None]:
'''
Eval Functions
'''
def extract_wikidata_ids(text, search_pattern):
    """Extract the ID list that follows the first ``search_pattern`` in ``text``.

    Markdown asterisks are stripped first, so ``**RELEVANT ENTITIES:** Q1``
    is handled like ``RELEVANT ENTITIES: Q1``. ``search_pattern`` is used as
    a regular-expression fragment.

    Args:
        text: Raw model response.
        search_pattern: Marker preceding the IDs, e.g. ``'RELEVANT ENTITIES:'``.

    Returns:
        list[str]: The IDs in order of appearance, without surrounding
        whitespace and without duplicates; ``[]`` when the marker is not
        followed by any ID.
    """
    cleaned = text.replace('*', '')
    match = re.search(search_pattern + r"\s*([\w, ]+)", cleaned)
    if match is None:
        return []

    # Split on any run of commas/whitespace: a plain split(', ') leaves
    # trailing spaces on IDs and fails on "Q1,Q2", both of which break
    # the exact-match comparison with gold IDs.
    tokens = [tok for tok in re.split(r"[,\s]+", match.group(1)) if tok]
    # dict.fromkeys drops repeats while keeping first-seen order.
    return list(dict.fromkeys(tokens))
        
# RELEVANT ENTITIES: / RELEVANT PROPERTIES:
def parse_responce(responce_str, search_pattern):
    """Return the IDs listed after the task's marker line in a model response.

    ``search_pattern`` is ``'entity'`` (marker ``RELEVANT ENTITIES:``) or
    ``'property'`` (marker ``RELEVANT PROPERTIES:``); any other value raises
    ``Exception``. Returns ``None`` when the marker is absent from the response.
    """
    known_markers = (
        ('entity', 'RELEVANT ENTITIES:'),
        ('property', 'RELEVANT PROPERTIES:'),
    )
    marker = next((text for kind, text in known_markers if search_pattern == kind), None)
    if marker is None:
        raise Exception('Choose correct seacrh pattern')

    # A response without the marker is reported as None rather than parsed.
    if marker not in responce_str:
        return None
    return extract_wikidata_ids(responce_str, marker)


def get_metrics(eval_data, response_data, search_pattern, gold_key):
    """Print and return averaged precision, recall and F1 of the parsed responses.

    Args:
        eval_data: Samples; gold IDs are the keys of ``sample[gold_key]['query']``.
        response_data: Raw model responses, positionally aligned with ``eval_data``.
        search_pattern: ``'entity'`` or ``'property'``, forwarded to ``parse_responce``.
        gold_key: Sample field that holds the gold annotation.

    Returns:
        dict: ``{'precision': float, 'recall': float, 'f1': float}``. Each value
        is the sum of per-sample scores divided by the number of responses, so
        a response with no parsable IDs counts as zero. All three are ``0.0``
        for empty input.

    Raises:
        AssertionError: If the two inputs differ in length.
    """
    assert len(eval_data) == len(response_data)

    totals = {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}
    for sample, gpt_response in zip(eval_data, response_data):
        # Deduplicate predictions: a repeated ID must not lower precision.
        predicted = set(parse_responce(gpt_response, search_pattern) or ())
        gold_ids = set(sample[gold_key]['query'].keys())
        if not predicted:
            # Unparsable or empty prediction: adds nothing to the sums.
            continue

        true_positives = predicted & gold_ids
        precision = len(true_positives) / len(predicted)
        # Recall: proportion of gold IDs that were predicted.
        recall = len(true_positives) / len(gold_ids) if gold_ids else 0.0
        # F1: harmonic mean of precision and recall.
        f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0.0

        totals['precision'] += precision
        totals['recall'] += recall
        totals['f1'] += f1

    n_responses = len(response_data)
    # Guard the averages so an empty evaluation set does not divide by zero.
    metrics = {name: (total / n_responses if n_responses else 0.0) for name, total in totals.items()}

    print('Precision: ', metrics['precision'])
    print('Recall: ', metrics['recall'])
    print('F1: ', metrics['f1'])
    return metrics


def get_result(data, retriver_output, response_arr, search_pattern):
    """Build the per-sample result record keyed by ``str(sample['id'])``.

    Each record holds the English question, the query, and a
    ``candidate id -> label`` dict restricted to the IDs that were both parsed
    from the response and present in the retriever output for that sample.
    """
    result = {}
    for sample, gpt_response in zip(data, response_arr):
        chosen_ids = parse_responce(gpt_response, search_pattern)
        uid = str(sample['id'])
        record = {
            'question_eng': sample['en_question'],
            'query': sample['query'],
        }

        labels = {}
        if chosen_ids:
            retrieved = retriver_output[uid]
            for cand in chosen_ids:
                # IDs the model produced that were never retrieved are dropped.
                if cand in retrieved:
                    labels[cand] = retrieved[cand]['label']
        record['candidates'] = labels

        result[uid] = record
    return result


'''
Test Functions
'''
def test_prompt(data, retriever_output, user_prompt, idx):
    """Print the query of ``data[idx]``, a blank line, and the user prompt built for that sample."""
    sample = data[idx]
    print('SPARQL: ', sample['query'], end='\n\n')
    print(construct_user_prompt(sample, retriever_output, user_prompt))


def test_openai(data, retriver_output, system_prompt, user_prompt, idx):
    """Print the question and query of ``data[idx]``, then the model's reply for that sample."""
    sample = data[idx]
    for tag, field in (('QUESTION: ', 'en_question'), ('SPARQL: ', 'query')):
        print(tag, sample[field])

    reply = run_sample_with_openai(sample, retriver_output, system_prompt, user_prompt)
    # Blank separator line, then the raw reply.
    print('', reply, sep='\n')

## Run vLLM Server

__Запуск VLLm-сервера__
- `export VLLM_USE_V1=0`
- `vllm serve <model_name> --task generate --port 8000 --tensor-parallel-size <gpu_amount> --trust-remote-code`

Example: `vllm serve deepseek-ai/DeepSeek-R1-Distill-Llama-70B --task generate --port 8000 --tensor-parallel-size 8 --trust-remote-code`

Сейчас запуск на:
- microsoft/Phi-3-mini-128k-instruct - 8GPU - 4B+128k
- deepseek-ai/DeepSeek-R1-Distill-Llama-70B - при VLLM_USE_V1=1 - 8GPU - 70B+128k

In [None]:
# Start the vLLM server first (see the commands above), then run this cell.

openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

# Ask the server which model it is serving. check=True raises if curl itself
# fails (e.g. the server is not up) instead of passing empty output on.
result_vllm = subprocess.run(['curl', f'{openai_api_base}/models'], stdout=subprocess.PIPE, check=True)
# The endpoint answers with JSON: parse it as data rather than eval()-ing
# text received over HTTP.
vllm_server_info = json.loads(result_vllm.stdout.decode("utf-8"))
vllm_model_name = vllm_server_info['data'][0]['id']

vllm_model_name, vllm_server_info

In [None]:
# OpenAI-compatible client pointed at the endpoint configured in `openai_api_base`.
client = openai.OpenAI(base_url=openai_api_base, api_key=openai_api_key)

## Data

In [None]:
task_name = 'property' # entity / property
data_name = 'pat'
retriver_out_path = 'data/retriver_out/pat/pat_test_predicates_retrieval_new.json'
data_path = 'data/preprocessed/pat/pat_test_new.json'
topk = 10

# Load the retriever output and keep only the first `topk` candidates per sample (file order).
with open(retriver_out_path) as retriver_file:
    retriver_out = json.load(retriver_file)
retriver_out = {k: dict(list(v.items())[:topk]) for k, v in retriver_out.items()}

with open(data_path) as data_file:
    data = json.load(data_file)['dataset']
# Evaluate only the samples the retriever produced candidates for.
eval_data = [sample for sample in data if str(sample['id']) in retriver_out]

if task_name == 'entity':
    SYSTEM_PROMPT = SYSTEM_PROMPT_ENTS
    USER_PROMPT = USER_PROMPT_ENTS
    retr_field = 'entities'
elif task_name == 'property':
    SYSTEM_PROMPT = SYSTEM_PROMPT_PROP
    USER_PROMPT = USER_PROMPT_PROP
    retr_field = 'relations'
else:
    raise Exception('Choose correct task_name')

# Share of evaluated samples with at least one gold ID among the kept candidates.
# Computed outside the f-string: a multi-line replacement field inside a
# single-quoted f-string only parses on Python 3.12+.
retrieval_hit_rate = np.mean([
    len(set(x[retr_field]['query'].keys()) & set(retriver_out[str(x['id'])])) > 0
    for x in eval_data
])

print(
    f'len(data) = {len(data)}',
    f'len(eval) = {len(eval_data)}',
    f'retr {topk} = {retrieval_hit_rate}',
    sep='\n'
)

## Inference VLLm

In [None]:
test_prompt(eval_data, retriver_out, USER_PROMPT, 102)

In [None]:
idx = 44 #100
test_openai(eval_data, retriver_out, SYSTEM_PROMPT, USER_PROMPT, idx)

In [None]:
def get_gold(x):
    """Return the gold IDs of a sample: the keys of ``x[retr_field]['query']``."""
    return x[retr_field]['query'].keys()

In [None]:
get_gold(eval_data[idx])

In [None]:
parse_responce(run_sample_with_openai(eval_data[idx], retriver_out, SYSTEM_PROMPT, USER_PROMPT), task_name)

## microsoft/Phi-3-mini-128k-instruct

### Rubq Entity 100

In [None]:
# Query the model once per evaluation sample; replies stay aligned with eval_data by position.
# The reply is appended directly instead of being bound to `result`, the name
# the save cell uses for the results dict it writes to disk.
responce_list = []
for sample in tqdm(eval_data):
    responce_list.append(run_sample_with_openai(sample, retriver_out, SYSTEM_PROMPT, USER_PROMPT))

In [None]:
# Collect the labels of the selected candidates (saved by the next cell), then print precision/recall/F1.
result = get_result(eval_data, retriver_out, responce_list, search_pattern=task_name)
get_metrics(eval_data, responce_list, search_pattern=task_name, gold_key=retr_field)

In [None]:
# Write the selected candidates next to the retriever output, in a sub-directory named after the model.
result_file_path = os.path.join(os.path.dirname(retriver_out_path), vllm_model_name.split('/')[-1],
                                f'{data_name}_result_{task_name}_{topk}.json')

# exist_ok replaces the separate exists-check, which could race with directory creation.
os.makedirs(os.path.dirname(result_file_path), exist_ok=True)

# The context manager flushes and closes the file before the cell finishes.
with open(result_file_path, 'w') as result_file:
    json.dump(result, result_file)

### Rubq Entity 10

In [None]:
# Query the model once per evaluation sample; replies stay aligned with eval_data by position.
# The reply is appended directly instead of being bound to `result`, the name
# the save cell uses for the results dict it writes to disk.
responce_list = []
for sample in tqdm(eval_data):
    responce_list.append(run_sample_with_openai(sample, retriver_out, SYSTEM_PROMPT, USER_PROMPT))

In [None]:
# Collect the labels of the selected candidates (saved by the next cell), then print precision/recall/F1.
result = get_result(eval_data, retriver_out, responce_list, search_pattern=task_name)
get_metrics(eval_data, responce_list, search_pattern=task_name, gold_key=retr_field)

In [None]:
# Write the selected candidates next to the retriever output, in a sub-directory named after the model.
result_file_path = os.path.join(os.path.dirname(retriver_out_path), vllm_model_name.split('/')[-1],
                                f'{data_name}_result_{task_name}_{topk}.json')

# exist_ok replaces the separate exists-check, which could race with directory creation.
os.makedirs(os.path.dirname(result_file_path), exist_ok=True)

# The context manager flushes and closes the file before the cell finishes.
with open(result_file_path, 'w') as result_file:
    json.dump(result, result_file)

### Rubq Property 100

In [None]:
# Query the model once per evaluation sample; replies stay aligned with eval_data by position.
# The reply is appended directly instead of being bound to `result`, the name
# the save cell uses for the results dict it writes to disk.
responce_list = []
for sample in tqdm(eval_data):
    responce_list.append(run_sample_with_openai(sample, retriver_out, SYSTEM_PROMPT, USER_PROMPT))

In [None]:
# Collect the labels of the selected candidates (saved by the next cell), then print precision/recall/F1.
result = get_result(eval_data, retriver_out, responce_list, search_pattern=task_name)
get_metrics(eval_data, responce_list, search_pattern=task_name, gold_key=retr_field)

In [None]:
# Write the selected candidates next to the retriever output, in a sub-directory named after the model.
result_file_path = os.path.join(os.path.dirname(retriver_out_path), vllm_model_name.split('/')[-1],
                                f'{data_name}_result_{task_name}_{topk}.json')

# exist_ok replaces the separate exists-check, which could race with directory creation.
os.makedirs(os.path.dirname(result_file_path), exist_ok=True)

# The context manager flushes and closes the file before the cell finishes.
with open(result_file_path, 'w') as result_file:
    json.dump(result, result_file)

### Rubq Property 10

In [None]:
# Query the model once per evaluation sample; replies stay aligned with eval_data by position.
# The reply is appended directly instead of being bound to `result`, the name
# the save cell uses for the results dict it writes to disk.
responce_list = []
for sample in tqdm(eval_data):
    responce_list.append(run_sample_with_openai(sample, retriver_out, SYSTEM_PROMPT, USER_PROMPT))

In [None]:
# Collect the labels of the selected candidates (saved by the next cell), then print precision/recall/F1.
result = get_result(eval_data, retriver_out, responce_list, search_pattern=task_name)
get_metrics(eval_data, responce_list, search_pattern=task_name, gold_key=retr_field)

In [None]:
# Write the selected candidates next to the retriever output, in a sub-directory named after the model.
result_file_path = os.path.join(os.path.dirname(retriver_out_path), vllm_model_name.split('/')[-1],
                                f'{data_name}_result_{task_name}_{topk}.json')

# exist_ok replaces the separate exists-check, which could race with directory creation.
os.makedirs(os.path.dirname(result_file_path), exist_ok=True)

# The context manager flushes and closes the file before the cell finishes.
with open(result_file_path, 'w') as result_file:
    json.dump(result, result_file)

## deepseek-ai/DeepSeek-R1-Distill-Llama-70B

### Rubq Entity 100

In [None]:
# Query the model once per evaluation sample; replies stay aligned with eval_data by position.
# The reply is appended directly instead of being bound to `result`, the name
# the save cell uses for the results dict it writes to disk.
responce_list = []
for sample in tqdm(eval_data):
    responce_list.append(run_sample_with_openai(sample, retriver_out, SYSTEM_PROMPT, USER_PROMPT))

In [None]:
# Collect the labels of the selected candidates (saved by the next cell), then print precision/recall/F1.
result = get_result(eval_data, retriver_out, responce_list, search_pattern=task_name)
get_metrics(eval_data, responce_list, search_pattern=task_name, gold_key=retr_field)

In [None]:
# Write the selected candidates next to the retriever output, in a sub-directory named after the model.
result_file_path = os.path.join(os.path.dirname(retriver_out_path), vllm_model_name.split('/')[-1],
                                f'{data_name}_result_{task_name}_{topk}.json')

# exist_ok replaces the separate exists-check, which could race with directory creation.
os.makedirs(os.path.dirname(result_file_path), exist_ok=True)

# The context manager flushes and closes the file before the cell finishes.
with open(result_file_path, 'w') as result_file:
    json.dump(result, result_file)

### Rubq Entity 10

In [None]:
# Query the model once per evaluation sample; replies stay aligned with eval_data by position.
# The reply is appended directly instead of being bound to `result`, the name
# the save cell uses for the results dict it writes to disk.
responce_list = []
for sample in tqdm(eval_data):
    responce_list.append(run_sample_with_openai(sample, retriver_out, SYSTEM_PROMPT, USER_PROMPT))

In [None]:
# Collect the labels of the selected candidates (saved by the next cell), then print precision/recall/F1.
result = get_result(eval_data, retriver_out, responce_list, search_pattern=task_name)
get_metrics(eval_data, responce_list, search_pattern=task_name, gold_key=retr_field)

In [None]:
# Write the selected candidates next to the retriever output, in a sub-directory named after the model.
result_file_path = os.path.join(os.path.dirname(retriver_out_path), vllm_model_name.split('/')[-1],
                                f'{data_name}_result_{task_name}_{topk}.json')

# exist_ok replaces the separate exists-check, which could race with directory creation.
os.makedirs(os.path.dirname(result_file_path), exist_ok=True)

# The context manager flushes and closes the file before the cell finishes.
with open(result_file_path, 'w') as result_file:
    json.dump(result, result_file)

### Rubq Property 100

In [None]:
# Query the model once per evaluation sample; replies stay aligned with eval_data by position.
# The reply is appended directly instead of being bound to `result`, the name
# the save cell uses for the results dict it writes to disk.
responce_list = []
for sample in tqdm(eval_data):
    responce_list.append(run_sample_with_openai(sample, retriver_out, SYSTEM_PROMPT, USER_PROMPT))

In [None]:
# Collect the labels of the selected candidates (saved by the next cell), then print precision/recall/F1.
result = get_result(eval_data, retriver_out, responce_list, search_pattern=task_name)
get_metrics(eval_data, responce_list, search_pattern=task_name, gold_key=retr_field)

In [None]:
# Write the selected candidates next to the retriever output, in a sub-directory named after the model.
result_file_path = os.path.join(os.path.dirname(retriver_out_path), vllm_model_name.split('/')[-1],
                                f'{data_name}_result_{task_name}_{topk}.json')

# exist_ok replaces the separate exists-check, which could race with directory creation.
os.makedirs(os.path.dirname(result_file_path), exist_ok=True)

# The context manager flushes and closes the file before the cell finishes.
with open(result_file_path, 'w') as result_file:
    json.dump(result, result_file)

### Rubq Property 10

In [None]:
# Query the model once per evaluation sample; replies stay aligned with eval_data by position.
# The reply is appended directly instead of being bound to `result`, the name
# the save cell uses for the results dict it writes to disk.
responce_list = []
for sample in tqdm(eval_data):
    responce_list.append(run_sample_with_openai(sample, retriver_out, SYSTEM_PROMPT, USER_PROMPT))

In [None]:
# Collect the labels of the selected candidates (saved by the next cell), then print precision/recall/F1.
result = get_result(eval_data, retriver_out, responce_list, search_pattern=task_name)
get_metrics(eval_data, responce_list, search_pattern=task_name, gold_key=retr_field)

In [None]:
# Store the result as JSON next to the retriever output, inside a sub-directory
# named after the last path component of vllm_model_name.
result_file_path = os.path.join(os.path.dirname(retriver_out_path), vllm_model_name.split('/')[-1],
                                f'{data_name}_result_{task_name}_{topk}.json')

# exist_ok=True replaces the racy "check, then create" sequence.
os.makedirs(os.path.dirname(result_file_path), exist_ok=True)

# A context manager guarantees the file is flushed and closed; the bare
# open() call previously leaked the handle.
with open(result_file_path, 'w') as result_file:
    json.dump(result, result_file)

### Lcquad Entity 100

In [None]:
# Query the model once per evaluation sample and collect the raw responses in
# dataset order; the following cells pass this list to get_result / get_metrics.
responce_list = []
for sample in tqdm(eval_data):
    # Use a dedicated name: assigning to `result` here clobbered the
    # notebook-level `result` that the save cell dumps to JSON.
    response = run_sample_with_openai(sample, retriver_out, SYSTEM_PROMPT, USER_PROMPT)
    responce_list.append(response)

In [None]:
# Build the result object that the following cell writes to disk.
result = get_result(
    eval_data,
    retriver_out,
    responce_list,
    search_pattern=task_name,
)

# Kept as the last expression so its return value is shown as the cell output.
get_metrics(
    eval_data,
    responce_list,
    search_pattern=task_name,
    gold_key=retr_field,
)

In [None]:
# Store the result as JSON next to the retriever output, inside a sub-directory
# named after the last path component of vllm_model_name.
result_file_path = os.path.join(os.path.dirname(retriver_out_path), vllm_model_name.split('/')[-1],
                                f'{data_name}_result_{task_name}_{topk}.json')

# exist_ok=True replaces the racy "check, then create" sequence.
os.makedirs(os.path.dirname(result_file_path), exist_ok=True)

# A context manager guarantees the file is flushed and closed; the bare
# open() call previously leaked the handle.
with open(result_file_path, 'w') as result_file:
    json.dump(result, result_file)

### Lcquad Entity 10

In [None]:
# Query the model once per evaluation sample and collect the raw responses in
# dataset order; the following cells pass this list to get_result / get_metrics.
responce_list = []
for sample in tqdm(eval_data):
    # Use a dedicated name: assigning to `result` here clobbered the
    # notebook-level `result` that the save cell dumps to JSON.
    response = run_sample_with_openai(sample, retriver_out, SYSTEM_PROMPT, USER_PROMPT)
    responce_list.append(response)

In [None]:
# Build the result object that the following cell writes to disk.
result = get_result(
    eval_data,
    retriver_out,
    responce_list,
    search_pattern=task_name,
)

# Kept as the last expression so its return value is shown as the cell output.
get_metrics(
    eval_data,
    responce_list,
    search_pattern=task_name,
    gold_key=retr_field,
)

In [None]:
# Store the result as JSON next to the retriever output, inside a sub-directory
# named after the last path component of vllm_model_name.
result_file_path = os.path.join(os.path.dirname(retriver_out_path), vllm_model_name.split('/')[-1],
                                f'{data_name}_result_{task_name}_{topk}.json')

# exist_ok=True replaces the racy "check, then create" sequence.
os.makedirs(os.path.dirname(result_file_path), exist_ok=True)

# A context manager guarantees the file is flushed and closed; the bare
# open() call previously leaked the handle.
with open(result_file_path, 'w') as result_file:
    json.dump(result, result_file)

### Lcquad Property 100

In [None]:
# lcquad property 100
# Query the model once per evaluation sample and collect the raw responses in
# dataset order; the following cells pass this list to get_result / get_metrics.
responce_list = []
for sample in tqdm(eval_data):
    # Use a dedicated name: assigning to `result` here clobbered the
    # notebook-level `result` that the save cell dumps to JSON.
    response = run_sample_with_openai(sample, retriver_out, SYSTEM_PROMPT, USER_PROMPT)
    responce_list.append(response)

In [None]:
# Build the result object that the following cell writes to disk.
result = get_result(
    eval_data,
    retriver_out,
    responce_list,
    search_pattern=task_name,
)

# Kept as the last expression so its return value is shown as the cell output.
get_metrics(
    eval_data,
    responce_list,
    search_pattern=task_name,
    gold_key=retr_field,
)

In [None]:
# Store the result as JSON next to the retriever output, inside a sub-directory
# named after the last path component of vllm_model_name.
result_file_path = os.path.join(os.path.dirname(retriver_out_path), vllm_model_name.split('/')[-1],
                                f'{data_name}_result_{task_name}_{topk}.json')

# exist_ok=True replaces the racy "check, then create" sequence.
os.makedirs(os.path.dirname(result_file_path), exist_ok=True)

# A context manager guarantees the file is flushed and closed; the bare
# open() call previously leaked the handle.
with open(result_file_path, 'w') as result_file:
    json.dump(result, result_file)

### Lcquad Property 10

In [None]:
# Query the model once per evaluation sample and collect the raw responses in
# dataset order; the following cells pass this list to get_result / get_metrics.
responce_list = []
for sample in tqdm(eval_data):
    # Use a dedicated name: assigning to `result` here clobbered the
    # notebook-level `result` that the save cell dumps to JSON.
    response = run_sample_with_openai(sample, retriver_out, SYSTEM_PROMPT, USER_PROMPT)
    responce_list.append(response)

In [None]:
# Build the result object that the following cell writes to disk.
result = get_result(
    eval_data,
    retriver_out,
    responce_list,
    search_pattern=task_name,
)

# Kept as the last expression so its return value is shown as the cell output.
get_metrics(
    eval_data,
    responce_list,
    search_pattern=task_name,
    gold_key=retr_field,
)

In [None]:
# Store the result as JSON next to the retriever output, inside a sub-directory
# named after the last path component of vllm_model_name.
result_file_path = os.path.join(os.path.dirname(retriver_out_path), vllm_model_name.split('/')[-1],
                                f'{data_name}_result_{task_name}_{topk}.json')

# exist_ok=True replaces the racy "check, then create" sequence.
os.makedirs(os.path.dirname(result_file_path), exist_ok=True)

# A context manager guarantees the file is flushed and closed; the bare
# open() call previously leaked the handle.
with open(result_file_path, 'w') as result_file:
    json.dump(result, result_file)

### Pat Entity 100

In [None]:
# Query the model once per evaluation sample and collect the raw responses in
# dataset order; the following cells pass this list to get_result / get_metrics.
responce_list = []
for sample in tqdm(eval_data):
    # Use a dedicated name: assigning to `result` here clobbered the
    # notebook-level `result` that the save cell dumps to JSON.
    response = run_sample_with_openai(sample, retriver_out, SYSTEM_PROMPT, USER_PROMPT)
    responce_list.append(response)

In [None]:
# Build the result object that the following cell writes to disk.
result = get_result(
    eval_data,
    retriver_out,
    responce_list,
    search_pattern=task_name,
)

# Kept as the last expression so its return value is shown as the cell output.
get_metrics(
    eval_data,
    responce_list,
    search_pattern=task_name,
    gold_key=retr_field,
)

In [None]:
# Store the result as JSON next to the retriever output, inside a sub-directory
# named after the last path component of vllm_model_name.
result_file_path = os.path.join(os.path.dirname(retriver_out_path), vllm_model_name.split('/')[-1],
                                f'{data_name}_result_{task_name}_{topk}.json')

# exist_ok=True replaces the racy "check, then create" sequence.
os.makedirs(os.path.dirname(result_file_path), exist_ok=True)

# A context manager guarantees the file is flushed and closed; the bare
# open() call previously leaked the handle.
with open(result_file_path, 'w') as result_file:
    json.dump(result, result_file)

### Pat Entity 10

In [None]:
# Query the model once per evaluation sample and collect the raw responses in
# dataset order; the following cells pass this list to get_result / get_metrics.
responce_list = []
for sample in tqdm(eval_data):
    # Use a dedicated name: assigning to `result` here clobbered the
    # notebook-level `result` that the save cell dumps to JSON.
    response = run_sample_with_openai(sample, retriver_out, SYSTEM_PROMPT, USER_PROMPT)
    responce_list.append(response)

In [None]:
# Build the result object that the following cell writes to disk.
result = get_result(
    eval_data,
    retriver_out,
    responce_list,
    search_pattern=task_name,
)

# Kept as the last expression so its return value is shown as the cell output.
get_metrics(
    eval_data,
    responce_list,
    search_pattern=task_name,
    gold_key=retr_field,
)

In [None]:
# Store the result as JSON next to the retriever output, inside a sub-directory
# named after the last path component of vllm_model_name.
result_file_path = os.path.join(os.path.dirname(retriver_out_path), vllm_model_name.split('/')[-1],
                                f'{data_name}_result_{task_name}_{topk}.json')

# exist_ok=True replaces the racy "check, then create" sequence.
os.makedirs(os.path.dirname(result_file_path), exist_ok=True)

# A context manager guarantees the file is flushed and closed; the bare
# open() call previously leaked the handle.
with open(result_file_path, 'w') as result_file:
    json.dump(result, result_file)

### Pat Entity 10 NEW

In [None]:
# Query the model once per evaluation sample and collect the raw responses in
# dataset order; the following cells pass this list to get_result / get_metrics.
responce_list = []
for sample in tqdm(eval_data):
    # Use a dedicated name: assigning to `result` here clobbered the
    # notebook-level `result` that the save cell dumps to JSON.
    response = run_sample_with_openai(sample, retriver_out, SYSTEM_PROMPT, USER_PROMPT)
    responce_list.append(response)

In [None]:
# Build the result object that the following cell writes to disk.
result = get_result(
    eval_data,
    retriver_out,
    responce_list,
    search_pattern=task_name,
)

# Kept as the last expression so its return value is shown as the cell output.
get_metrics(
    eval_data,
    responce_list,
    search_pattern=task_name,
    gold_key=retr_field,
)

In [None]:
# Store the result as JSON next to the retriever output, inside a sub-directory
# named after the last path component of vllm_model_name. The "_NEW" suffix
# keeps this run in a separate file from the un-suffixed result name.
result_file_path = os.path.join(os.path.dirname(retriver_out_path), vllm_model_name.split('/')[-1],
                                f'{data_name}_result_{task_name}_{topk}_NEW.json')

# exist_ok=True replaces the racy "check, then create" sequence.
os.makedirs(os.path.dirname(result_file_path), exist_ok=True)

# A context manager guarantees the file is flushed and closed; the bare
# open() call previously leaked the handle.
with open(result_file_path, 'w') as result_file:
    json.dump(result, result_file)

### Pat Property 100

In [None]:
# Query the model once per evaluation sample and collect the raw responses in
# dataset order; the following cells pass this list to get_result / get_metrics.
responce_list = []
for sample in tqdm(eval_data):
    # Use a dedicated name: assigning to `result` here clobbered the
    # notebook-level `result` that the save cell dumps to JSON.
    response = run_sample_with_openai(sample, retriver_out, SYSTEM_PROMPT, USER_PROMPT)
    responce_list.append(response)

In [None]:
# Build the result object that the following cell writes to disk.
result = get_result(
    eval_data,
    retriver_out,
    responce_list,
    search_pattern=task_name,
)

# Kept as the last expression so its return value is shown as the cell output.
get_metrics(
    eval_data,
    responce_list,
    search_pattern=task_name,
    gold_key=retr_field,
)

In [None]:
# Store the result as JSON next to the retriever output, inside a sub-directory
# named after the last path component of vllm_model_name.
result_file_path = os.path.join(os.path.dirname(retriver_out_path), vllm_model_name.split('/')[-1],
                                f'{data_name}_result_{task_name}_{topk}.json')

# exist_ok=True replaces the racy "check, then create" sequence.
os.makedirs(os.path.dirname(result_file_path), exist_ok=True)

# A context manager guarantees the file is flushed and closed; the bare
# open() call previously leaked the handle.
with open(result_file_path, 'w') as result_file:
    json.dump(result, result_file)

### Pat Property 10

In [None]:
# Query the model once per evaluation sample and collect the raw responses in
# dataset order; the following cells pass this list to get_result / get_metrics.
responce_list = []
for sample in tqdm(eval_data):
    # Use a dedicated name: assigning to `result` here clobbered the
    # notebook-level `result` that the save cell dumps to JSON.
    response = run_sample_with_openai(sample, retriver_out, SYSTEM_PROMPT, USER_PROMPT)
    responce_list.append(response)

In [None]:
# Build the result object that the following cell writes to disk.
result = get_result(
    eval_data,
    retriver_out,
    responce_list,
    search_pattern=task_name,
)

# Kept as the last expression so its return value is shown as the cell output.
get_metrics(
    eval_data,
    responce_list,
    search_pattern=task_name,
    gold_key=retr_field,
)

In [None]:
# Store the result as JSON next to the retriever output, inside a sub-directory
# named after the last path component of vllm_model_name.
result_file_path = os.path.join(os.path.dirname(retriver_out_path), vllm_model_name.split('/')[-1],
                                f'{data_name}_result_{task_name}_{topk}.json')

# exist_ok=True replaces the racy "check, then create" sequence.
os.makedirs(os.path.dirname(result_file_path), exist_ok=True)

# A context manager guarantees the file is flushed and closed; the bare
# open() call previously leaked the handle.
with open(result_file_path, 'w') as result_file:
    json.dump(result, result_file)

### Pat Property 10 NEW

In [None]:
# Query the model once per evaluation sample and collect the raw responses in
# dataset order; the following cells pass this list to get_result / get_metrics.
responce_list = []
for sample in tqdm(eval_data):
    # Use a dedicated name: assigning to `result` here clobbered the
    # notebook-level `result` that the save cell dumps to JSON.
    response = run_sample_with_openai(sample, retriver_out, SYSTEM_PROMPT, USER_PROMPT)
    responce_list.append(response)

In [None]:
# Build the result object that the following cell writes to disk.
result = get_result(
    eval_data,
    retriver_out,
    responce_list,
    search_pattern=task_name,
)

# Kept as the last expression so its return value is shown as the cell output.
get_metrics(
    eval_data,
    responce_list,
    search_pattern=task_name,
    gold_key=retr_field,
)

In [None]:
# Store the result as JSON next to the retriever output, inside a sub-directory
# named after the last path component of vllm_model_name. The "_NEW" suffix
# keeps this run in a separate file from the un-suffixed result name.
result_file_path = os.path.join(os.path.dirname(retriver_out_path), vllm_model_name.split('/')[-1],
                                f'{data_name}_result_{task_name}_{topk}_NEW.json')

# exist_ok=True replaces the racy "check, then create" sequence.
os.makedirs(os.path.dirname(result_file_path), exist_ok=True)

# A context manager guarantees the file is flushed and closed; the bare
# open() call previously leaked the handle.
with open(result_file_path, 'w') as result_file:
    json.dump(result, result_file)

### Qald Entity 100

In [None]:
# Query the model once per evaluation sample and collect the raw responses in
# dataset order; the following cells pass this list to get_result / get_metrics.
responce_list = []
for sample in tqdm(eval_data):
    # Use a dedicated name: assigning to `result` here clobbered the
    # notebook-level `result` that the save cell dumps to JSON.
    response = run_sample_with_openai(sample, retriver_out, SYSTEM_PROMPT, USER_PROMPT)
    responce_list.append(response)

In [None]:
# Build the result object that the following cell writes to disk.
result = get_result(
    eval_data,
    retriver_out,
    responce_list,
    search_pattern=task_name,
)

# Kept as the last expression so its return value is shown as the cell output.
get_metrics(
    eval_data,
    responce_list,
    search_pattern=task_name,
    gold_key=retr_field,
)

In [None]:
# Store the result as JSON next to the retriever output, inside a sub-directory
# named after the last path component of vllm_model_name.
result_file_path = os.path.join(os.path.dirname(retriver_out_path), vllm_model_name.split('/')[-1],
                                f'{data_name}_result_{task_name}_{topk}.json')

# exist_ok=True replaces the racy "check, then create" sequence.
os.makedirs(os.path.dirname(result_file_path), exist_ok=True)

# A context manager guarantees the file is flushed and closed; the bare
# open() call previously leaked the handle.
with open(result_file_path, 'w') as result_file:
    json.dump(result, result_file)

### Qald Entity 10

In [None]:
# Query the model once per evaluation sample and collect the raw responses in
# dataset order; the following cells pass this list to get_result / get_metrics.
responce_list = []
for sample in tqdm(eval_data):
    # Use a dedicated name: assigning to `result` here clobbered the
    # notebook-level `result` that the save cell dumps to JSON.
    response = run_sample_with_openai(sample, retriver_out, SYSTEM_PROMPT, USER_PROMPT)
    responce_list.append(response)

In [None]:
# Build the result object that the following cell writes to disk.
result = get_result(
    eval_data,
    retriver_out,
    responce_list,
    search_pattern=task_name,
)

# Kept as the last expression so its return value is shown as the cell output.
get_metrics(
    eval_data,
    responce_list,
    search_pattern=task_name,
    gold_key=retr_field,
)

In [None]:
# Store the result as JSON next to the retriever output, inside a sub-directory
# named after the last path component of vllm_model_name.
result_file_path = os.path.join(os.path.dirname(retriver_out_path), vllm_model_name.split('/')[-1],
                                f'{data_name}_result_{task_name}_{topk}.json')

# exist_ok=True replaces the racy "check, then create" sequence.
os.makedirs(os.path.dirname(result_file_path), exist_ok=True)

# A context manager guarantees the file is flushed and closed; the bare
# open() call previously leaked the handle.
with open(result_file_path, 'w') as result_file:
    json.dump(result, result_file)

### Qald Property 100

In [None]:
# Query the model once per evaluation sample and collect the raw responses in
# dataset order; the following cells pass this list to get_result / get_metrics.
responce_list = []
for sample in tqdm(eval_data):
    # Use a dedicated name: assigning to `result` here clobbered the
    # notebook-level `result` that the save cell dumps to JSON.
    response = run_sample_with_openai(sample, retriver_out, SYSTEM_PROMPT, USER_PROMPT)
    responce_list.append(response)

In [None]:
# Build the result object that the following cell writes to disk.
result = get_result(
    eval_data,
    retriver_out,
    responce_list,
    search_pattern=task_name,
)

# Kept as the last expression so its return value is shown as the cell output.
get_metrics(
    eval_data,
    responce_list,
    search_pattern=task_name,
    gold_key=retr_field,
)

In [None]:
# Store the result as JSON next to the retriever output, inside a sub-directory
# named after the last path component of vllm_model_name.
result_file_path = os.path.join(os.path.dirname(retriver_out_path), vllm_model_name.split('/')[-1],
                                f'{data_name}_result_{task_name}_{topk}.json')

# exist_ok=True replaces the racy "check, then create" sequence.
os.makedirs(os.path.dirname(result_file_path), exist_ok=True)

# A context manager guarantees the file is flushed and closed; the bare
# open() call previously leaked the handle.
with open(result_file_path, 'w') as result_file:
    json.dump(result, result_file)

### Qald Property 10

In [None]:
# Query the model once per evaluation sample and collect the raw responses in
# dataset order; the following cells pass this list to get_result / get_metrics.
responce_list = []
for sample in tqdm(eval_data):
    # Use a dedicated name: assigning to `result` here clobbered the
    # notebook-level `result` that the save cell dumps to JSON.
    response = run_sample_with_openai(sample, retriver_out, SYSTEM_PROMPT, USER_PROMPT)
    responce_list.append(response)

In [None]:
# Build the result object that the following cell writes to disk.
result = get_result(
    eval_data,
    retriver_out,
    responce_list,
    search_pattern=task_name,
)

# Kept as the last expression so its return value is shown as the cell output.
get_metrics(
    eval_data,
    responce_list,
    search_pattern=task_name,
    gold_key=retr_field,
)

In [None]:
# Store the result as JSON next to the retriever output, inside a sub-directory
# named after the last path component of vllm_model_name.
result_file_path = os.path.join(os.path.dirname(retriver_out_path), vllm_model_name.split('/')[-1],
                                f'{data_name}_result_{task_name}_{topk}.json')

# exist_ok=True replaces the racy "check, then create" sequence.
os.makedirs(os.path.dirname(result_file_path), exist_ok=True)

# A context manager guarantees the file is flushed and closed; the bare
# open() call previously leaked the handle.
with open(result_file_path, 'w') as result_file:
    json.dump(result, result_file)

## Create output

In [None]:
import tarfile
def create_archive(source_dir, archive_path):
    """Pack every file found under ``source_dir`` into an uncompressed tar archive.

    The directory tree is walked recursively and each file is stored under its
    bare file name, i.e. the archive layout is flat. Files that share a name in
    different sub-directories therefore end up as duplicate members.

    Args:
        source_dir: Directory whose files are archived.
        archive_path: Path of the tar file to create. It is opened in mode
            ``'w'``, so an existing file is overwritten.
    """
    # Resolve once so the archive can be recognised if it lives inside source_dir.
    archive_abspath = os.path.abspath(archive_path)

    with tarfile.open(archive_path, 'w') as tar:
        for root, dirs, files in os.walk(source_dir):
            # Sort in place so the traversal (and member order) is reproducible.
            dirs.sort()
            for file in sorted(files):
                # Full path to the file on disk.
                full_path = os.path.join(root, file)

                # The archive file already exists at this point; never pack the
                # half-written archive into itself.
                if os.path.abspath(full_path) == archive_abspath:
                    continue

                # Add the file to the archive, keeping only its name
                # (os.walk already yields bare file names).
                tar.add(full_path, arcname=file)

In [None]:
# Directory holding the saved result files to be archived.
result_path = '/home/jovyan/msbutko/open_kgqa_utils/data/retriver_out/rubq/DeepSeek-R1-Distill-Llama-70B/'

# dirname() drops the trailing slash, so the last two path components are
# joined with "_" to form the archive name.
path_tail = os.path.dirname(result_path).split('/')[-2:]
tar_name = f"out_{'_'.join(path_tail)}.tar"

create_archive(result_path, tar_name)

In [None]:
# Directory holding the saved result files to be archived.
result_path = '/home/jovyan/msbutko/open_kgqa_utils/data/retriver_out/lcquad/DeepSeek-R1-Distill-Llama-70B/'

# dirname() drops the trailing slash, so the last two path components are
# joined with "_" to form the archive name.
path_tail = os.path.dirname(result_path).split('/')[-2:]
tar_name = f"out_{'_'.join(path_tail)}.tar"

create_archive(result_path, tar_name)

In [None]:
# Directory holding the saved result files to be archived.
result_path = '/home/jovyan/msbutko/open_kgqa_utils/data/retriver_out/pat/DeepSeek-R1-Distill-Llama-70B/'

# dirname() drops the trailing slash, so the last two path components are
# joined with "_" to form the archive name.
path_tail = os.path.dirname(result_path).split('/')[-2:]
tar_name = f"out_{'_'.join(path_tail)}.tar"

create_archive(result_path, tar_name)

In [None]:
# Directory holding the saved result files to be archived.
result_path = '/home/jovyan/msbutko/open_kgqa_utils/data/retriver_out/qald/DeepSeek-R1-Distill-Llama-70B/'

# dirname() drops the trailing slash, so the last two path components are
# joined with "_" to form the archive name.
path_tail = os.path.dirname(result_path).split('/')[-2:]
tar_name = f"out_{'_'.join(path_tail)}.tar"

create_archive(result_path, tar_name)