In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os, time, json
import pandas as pd
from openai import OpenAI
from tqdm.auto import tqdm

client = OpenAI(
    api_key=os.getenv("OPENAI_KEY"),
)

MODEL_NAME = "gpt-4"

In [75]:
country_title = pd.read_csv("../../data/raw/country_title.csv")
country_title_unique = country_title.drop_duplicates(subset=["country"])

print(f"filtered from {country_title.shape[0]} to {country_title_unique.shape[0]}")

progress = tqdm(country_title_unique.iterrows(), desc = "Generating", total = country_title_unique.shape[0]) 

print("Generating ...")

for index, row in progress:
    country_csv = row["country"]
    role = row["title"]
    progress.set_description(f"{country_csv} - {role}")

    prompt = f'''Give me the names of {role}s of {country_csv} in csv till the year 2022. The format is below, please add the names of the column as well.
    name, start year, end year
    '''

    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
            # {"role": "user", "content": "Give me 10 words in english with their german translations."},
        ],
        temperature=0,
        max_tokens=7500,
    )

    with open(f"../../data/raw/country_leaders/{country_csv}.csv", "w") as f:
        f.write(response.choices[0].message.content)

    time.sleep(2)

filtered from 234 to 148


Generating:   0%|          | 0/148 [00:00<?, ?it/s]

Generating ...


In [3]:
# country = "Bangladesh"
# role = "Prime Minister"

# prompt = f'''Give me the names of {role}s of {country} in csv till the year 2022. The format is below, please add the names of the column as well.
# name, start year, end year
# '''

# response = client.chat.completions.create(
#     model=MODEL_NAME,
#     messages=[
#         {"role": "system", "content": "You are a helpful assistant."},
#         {"role": "user", "content": prompt},
#         # {"role": "user", "content": "Give me 10 words in english with their german translations."},
#     ],
#     temperature=0,
#     max_tokens=3500,
# )

In [4]:
# print(response.choices[0].message.content)

In [18]:
def remove_quotes(x):
    if isinstance(x, str) == False:
        return x
    if x.startswith('"') == False:
        return x
    return x.replace('"', '')

role_name_df = pd.read_csv("../../data/raw/country_leaders/0_country_title.csv")
role_name_dict = {}
for index, row in role_name_df.iterrows():
    role_name_dict[row["country"]] = row["title"]

all_countries = list(os.listdir("../../data/raw/country_leaders"))
samples = []

for country_csv in all_countries:
    if country_csv.startswith("0_"):
        continue

    leaders_df = pd.read_csv(f"../../data/raw/country_leaders/{country_csv}")

    country_name = country_csv.split(".")[0]
    role = role_name_dict[country_name]

    print(f"{country_name} - {role}")
    

    for index, row in leaders_df.iterrows():
        for year in range(
            row[" start year"] + 1, # this is usually the election year. 2 people may share the role, thus skip it 
            row[" end year"]        # don't include the last year, again election year
        ):
            samples.append(
                {
                    "subject": remove_quotes(country_name),
                    "object": remove_quotes(row["name"]),
                    "<role>": remove_quotes(role),
                    "<var>": str(remove_quotes(year)),
                }
            )

Somalia - President
New Zealand - Prime Minister
Sierra Leone - President
Gambia - President
Singapore - Prime Minister
Bosnia and Herzegovina - Chairman of the Presidency
Mozambique - President
South Africa - President
South Sudan - President
Madagascar - President
Yemen - President
Nepal - Prime Minister
Canada - Prime Minister
Czech Republic - Prime Minister
Turkmenistan - President
Kosovo - Prime Minister
Egypt - President
Indonesia - President
Russia - President
Mauritania - President
Central African Republic - President
Venezuela - President
Rwanda - President
Ethiopia - Prime Minister
Colombia - President
Sudan - Prime Minister
Comoros - President
Tunisia - President
Cyprus - President
Thailand - Prime Minister
Kuwait - Emir
Namibia - President
Turkey - President
Peru - President
Australia - Prime Minister
Portugal - Prime Minister
Kazakhstan - President
Slovakia - Prime Minister
Cambodia - Prime Minister
Zimbabwe - President
Oman - Sultan
Djibouti - President
Serbia - President

In [19]:
# len(samples)
json.dump(samples, open("data.json", "w"), indent=4)

In [22]:
relations = list(os.listdir("../../data"))
relations.remove("raw")

relations

['head_of_government.json']