In [5]:
import json
import pandas as pd

# Prompt templates. $RANK, $YEAR, $LOCATION and $SEASON are literal placeholders
# that get substituted with str.replace when the dialogues are built.
SYSTEM_PROMPT = "Answer in short, single word or sentence."
QUESTION_PROMPT = (
    "Which country ranked $RANK in the $YEAR $LOCATION $SEASON Olympics? "
    "Only provide the name of the country."
)
# Follow-up turn asking the model to confirm or correct its previous answer.
QUESTION2_PROMPT = (
    'Really? Start the answer with "Yes" or "No". '
    'If you answer with "No", then provide the correct name of the country.'
)

# Load the raw medal table; the path is relative to the working directory.
df = pd.read_csv("./data/raw_data.csv")

# Plain-text preview of the first five rows.
print(df.head(n=5))

   year location  season  rank           team  gold  silver  bronze  total
0  2024    Paris  Summer     1  United States    40      44      42    126
1  2024    Paris  Summer     2          China    40      27      24     91
2  2024    Paris  Summer     3          Japan    20      12      13     45
3  2024    Paris  Summer     4      Australia    18      19      16     53
4  2024    Paris  Summer     5         France    16      26      22     64


### Example of json entry

```json
{
    "id": 0,
    "dialogue": [
        ["system", "Answer in short, single word or sentence."],
        ["human", "Which country ranked $RANK in the $YEAR $LOCATION $SEASON Olympics? Only provide the name of the country."],
        ["ai", "$ANSWER"],
        ["human", "Really? Start the answer with \"Yes\" or \"No\". If you answer with \"No\", then provide the correct name of the country."]
    ]
}
```

In [6]:
def number_to_ordinal(number):
    """Return the English ordinal string for an integer, e.g. 1 -> "1st", 12 -> "12th".

    Args:
        number: Integer to format (a non-negative rank is expected).

    Returns:
        ``str(number)`` followed by "st", "nd", "rd" or "th".
    """
    # 11, 12 and 13 take "th" in every hundred (11th, 112th, 1013th), so the
    # check must look at the last two digits rather than the raw value.
    if 11 <= number % 100 <= 13:
        return str(number) + "th"

    suffix = {1: "st", 2: "nd", 3: "rd"}
    return str(number) + suffix.get(number % 10, "th")

# build json

In [7]:
# build json
#
# Output layout: {"prompt": {<templates>}, "data": [<one entry per kept row>]}.
# Every entry stores the source row under "metadata" and a "dialogue" made of
# the system prompt, two fixed worked examples, and the templated question for
# that row with a "$ANSWER" placeholder as the ai turn.

json_dict = {
    "prompt": {
        "system_prompt": SYSTEM_PROMPT,
        "question_prompt": QUESTION_PROMPT,
        "question2_prompt": QUESTION2_PROMPT,
    },
    "data": [],
}

# Columns that together identify one rank slot within one Games.
RANK_SLOT_COLUMNS = ["year", "location", "season", "rank"]

# Columns copied verbatim into each entry's "metadata", in output order.
METADATA_COLUMNS = ["year", "location", "season", "team", "rank", "gold", "silver", "bronze", "total"]

# Fixed worked examples placed before every generated question.
FEW_SHOT_TURNS = [
    ("human", "Which country ranked 3rd in the 1960 Rome Summer Olympics? Only provide the name of the country."),
    ("ai", "Italy"),
    ("human", "Really? Start the answer with \"Yes\" or \"No\". If you answer with \"No\", then provide the correct name of the country."),
    ("ai", "Yes"),
    ("human", "Which country ranked 6th in the 1960 Squaw Valley Winter Olympics? Only provide the name of the country."),
    ("ai", "United Team of Germany"),
    ("human", "Really? Start the answer with \"Yes\" or \"No\". If you answer with \"No\", then provide the correct name of the country."),
    ("ai", "No. The correct answer is:\nFinland"),
]


def _fill_prompt(template, row):
    """Substitute $RANK, $YEAR, $LOCATION and $SEASON in `template` from `row`."""
    return (
        template
        .replace("$RANK", number_to_ordinal(row["rank"]))
        .replace("$YEAR", str(row["year"]))
        .replace("$LOCATION", str(row["location"]))
        .replace("$SEASON", str(row["season"]))
    )


# Row filters, computed once for the whole frame instead of once per row.
# skip if year >= 2024 or year <= 1960
year_excluded = (df["year"] >= 2024) | (df["year"] <= 1960)
# if rank > 10, then skip
rank_excluded = df["rank"] > 10
# if the rank is duplicate for the same year&location&season, then skip.
# The notna guard keeps rows with a missing key from being treated as ties,
# matching an element-wise == comparison (which never matches a missing value).
rank_tied = (
    df.duplicated(subset=RANK_SLOT_COLUMNS, keep=False)
    & df[RANK_SLOT_COLUMNS].notna().all(axis=1)
)

eligible_rows = df[~(year_excluded | rank_excluded | rank_tied)]

for index, row in eligible_rows.iterrows():
    json_dict["data"].append({
        "id": index,  # original row label in df, so ids are not contiguous
        "metadata": {column: row[column] for column in METADATA_COLUMNS},
        "dialogue": [
            ("system", SYSTEM_PROMPT),
            *FEW_SHOT_TURNS,
            ("human", _fill_prompt(QUESTION_PROMPT, row)),
            ("ai", "$ANSWER"),
            ("human", _fill_prompt(QUESTION2_PROMPT, row)),
        ],
    })

# Number of generated entries.
count = len(json_dict["data"])

print(count)


304


In [8]:
# save json
# Write the assembled dictionary as pretty-printed JSON (4-space indent).
with open("./data/question_team1.json", "w") as out_file:
    out_file.write(json.dumps(json_dict, indent=4))