In [None]:
import requests
import pandas as pd
import os
import json

In [None]:
# Stage 1 output consumed by this notebook. Change to llama_stage1_inference.jsonl
# when running for llama2. To build adapter-training data instead, run this
# notebook once on train.jsonl and once on val.jsonl.
STAGE1_OUTPUT_PATH = 'beluga_stage1_inference.jsonl'

# Read with an explicit encoding: without it open() falls back to the platform
# default, which can fail on (or silently corrupt) non-ASCII text on systems
# whose default is not UTF-8.
with open(STAGE1_OUTPUT_PATH, 'r', encoding='utf-8') as json_file:
    raw_text = json_file.read()

try:
    # Original behaviour: the whole file is one JSON document.
    json_data = json.loads(raw_text)
except json.JSONDecodeError:
    # Fallback for genuine JSON Lines input (one JSON value per line), which a
    # whole-file parse rejects with "Extra data". Blank lines are skipped.
    json_data = [json.loads(line) for line in raw_text.splitlines() if line.strip()]

df = pd.DataFrame(json_data)

In [None]:
def search_wikidata_entities(search_string, timeout=30):
    """Search Wikidata for entities matching ``search_string``.

    Calls the ``wbsearchentities`` action of the Wikidata API with ``en`` as
    the search language and turns every hit into a display string plus its id.

    Args:
        search_string: Text to look up.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without a timeout a stalled connection would block the notebook
            indefinitely; a timeout surfaces as a ``RequestException`` and is
            handled like any other request failure.

    Returns:
        A ``(formatted_results, WikiTitleIndex)`` tuple of two parallel lists.
        ``formatted_results[i]`` is the string ``"<label> - <description>"``
        built from the hit's ``display`` entry (missing parts become ``""``),
        and ``WikiTitleIndex[i]`` is the ``id`` of the same hit. Both lists are
        empty when nothing matches or when the request fails.
    """
    url = "https://www.wikidata.org/w/api.php"
    params = {
        "action": "wbsearchentities",
        "format": "json",
        "language": "en",
        "search": search_string
    }
    try:
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # response.json() does not raise a RequestException subclass.
        # Best-effort by design: report the failure and return no candidates.
        print(f"Error: Request failed - {e}")
        return [], []

    formatted_results = []
    WikiTitleIndex = []

    for result in data.get("search", []):
        display = result.get("display", {})
        label = display.get("label", {}).get("value", "")
        description = display.get("description", {}).get("value", "")
        formatted_results.append(f"{label} - {description}")
        WikiTitleIndex.append(result["id"])

    return formatted_results, WikiTitleIndex

In [None]:
def apply_search(row):
    """Look up Wikidata candidates for every entry of ``row["ObjectEntities"]``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) whose
            ``"ObjectEntities"`` value is an iterable of search strings.

    Returns:
        A ``(formatted_results, WikiTitleIndexes)`` tuple of two lists that are
        aligned with the input: position ``i`` of each list holds the
        candidates returned by ``search_wikidata_entities`` for the ``i``-th
        search string. A falsy search string (``""``, ``None``) is not looked
        up and yields the placeholder ``[None]`` in both lists.
    """
    formatted_results = []
    WikiTitleIndexes = []

    for search in row["ObjectEntities"]:
        if not search:
            # Nothing to look up. Add the placeholder to BOTH lists: adding it
            # to only one would shift every later id relative to its label.
            formatted_results.append([None])
            WikiTitleIndexes.append([None])
            continue
        formatted_result, WikiTitleIndex = search_wikidata_entities(search)
        formatted_results.append(formatted_result)
        WikiTitleIndexes.append(WikiTitleIndex)

    return formatted_results, WikiTitleIndexes

In [None]:
# Create the output columns once, before iterating. The check does not depend
# on the current row, and doing it up front also guarantees the columns exist
# when the frame has no rows. Existing columns (e.g. from a re-run) are kept.
for column in ("WikiTitles", "WikiTitleIndexes"):
    if column not in df.columns:
        df[column] = None

# One Wikidata lookup batch per row; the row index is printed as progress.
for index, row in df.iterrows():
    print(index)

    formatted_results, WikiTitleIndexes = apply_search(row)

    # .at stores the whole list in a single cell of the object-dtype column.
    df.at[index, "WikiTitles"] = formatted_results
    df.at[index, "WikiTitleIndexes"] = WikiTitleIndexes

print("Processing completed.")

In [None]:
# Stage 2 inference expects to find its input at this path.
# Use "llama_with_wikidata_info.csv" when running for llama2. When generating
# training files, use "train_df_with_candidates.csv" and
# "val_df_with_candidates.csv" instead.
output_csv_path = "beluga_with_wikidata_info.csv"
df.to_csv(output_csv_path)