In [1]:
import pandas as pd

# Read in the datasets
noc_regions_data = pd.read_csv("datasets/noc_regions-parsed.csv")
nocs_list = pd.read_csv("datasets/nocs_list.csv")

iso_data = pd.read_csv("datasets/iso_codes.csv")

In [2]:
# get the number of rows for each dataset
print("Number of rows in noc_regions.csv: ", noc_regions_data.shape[0])
print("Number of rows in regions_iso.csv: ", iso_data.shape[0])
print("Number of rows in nocs_list.csv: ", nocs_list.shape[0])

Number of rows in noc_regions.csv:  230
Number of rows in regions_iso.csv:  252
Number of rows in nocs_list.csv:  209


In [3]:
# Merge the dataframes using an outer join
merged_data = pd.merge(noc_regions_data, nocs_list, on="NOC", how="outer")
merged_data.drop(columns=["notes"], inplace=True)
merged_data

Unnamed: 0,NOC,region,continent,name
0,AFG,Afghanistan,Asia,Afghanistan
1,AHO,Curaçao,,
2,ALB,Albania,Europe,Albania
3,ALG,Algeria,Africa,Algeria
4,AND,Andorra,Europe,Andorra
...,...,...,...,...
228,YEM,Yemen,Asia,Yemen
229,YMD,Yemen,,
230,YUG,Serbia,,
231,ZAM,Zambia,Africa,Zambia


In [4]:
# set 'name' the same value as 'region' if 'name' is NaN
for i in range(merged_data.shape[0]):
    if pd.isnull(merged_data.loc[i, "name"]):
        merged_data.loc[i, "name"] = merged_data.loc[i, "region"]


In [5]:
# get all entries with nan values in name
nan_entries = merged_data[merged_data["name"].isna()]
nan_entries


Unnamed: 0,NOC,region,continent,name
171,ROT,,,
216,UNK,,,


In [6]:
# set 'name' as 'N/A' for all entries with NaN values in name
for i in range(merged_data.shape[0]):
    if pd.isnull(merged_data.loc[i, "name"]):
        merged_data.loc[i, "name"] = "N/A"


In [7]:
# Merge the dataframes using an outer join
merged_data_iso = pd.merge(
    merged_data, iso_data, left_on="region", right_on="name", how="outer"
)
# Unify 'name_x' and 'name_y' into a single 'name' column
merged_data_iso["name"] = merged_data_iso["name_x"].combine_first(
    merged_data_iso["name_y"]
)

# Drop the original 'name_x' and 'name_y' columns
merged_data_iso = merged_data_iso.drop(["name_x", "name_y"], axis=1)

merged_data_iso

Unnamed: 0,NOC,region,continent,ISO,name
0,AFG,Afghanistan,Asia,AFG,Afghanistan
1,ALB,Albania,Europe,ALB,Albania
2,ALG,Algeria,Africa,DZA,Algeria
3,ASA,American Samoa,Oceania,ASM,American Samoa
4,AND,Andorra,Europe,AND,Andorra
...,...,...,...,...,...
273,EOR,,,,IOC Refugee Olympic Team
274,IOP,,,,Independent Olympic Participants
275,LBN,,Asia,,Lebanon
276,ROT,,,,


In [8]:
import numpy as np

# Fill missing values in 'name' column with a placeholder
merged_data_iso["name"] = merged_data_iso["name"].fillna("missing")

# Fill missing values for the same country with the existing values (except for NOC)
merged_data_iso = merged_data_iso.groupby("name").apply(
    lambda group: group.ffill().bfill()
)

# Replace the placeholder with np.nan
merged_data_iso["name"] = merged_data_iso["name"].replace("missing", np.nan)

# Reset the index
merged_data_iso = merged_data_iso.reset_index(drop=True)

  lambda group: group.ffill().bfill()
  merged_data_iso = merged_data_iso.groupby("name").apply(


In [9]:
merged_data_iso.drop(columns=["region"], inplace=True)

In [10]:
merged_data_iso

Unnamed: 0,NOC,continent,ISO,name
0,AFG,Asia,AFG,Afghanistan
1,ALB,Europe,ALB,Albania
2,ALG,Africa,DZA,Algeria
3,ASA,Oceania,ASM,American Samoa
4,AND,Europe,AND,Andorra
...,...,...,...,...
273,YMD,Asia,YEM,Yemen
274,ZAM,Africa,ZMB,Zambia
275,RHO,Africa,ZWE,Zimbabwe
276,ZIM,Africa,ZWE,Zimbabwe


In [11]:
continentes = {
    "Anguilla": "Americas",
    "Antarctica": "Antarctica",
    "Bonaire, Sint Eustatius and Saba": "Americas",
    "Bouvet Island": "Antarctica",
    "British Indian Ocean Territory": "Asia",
    "Christmas Island": "Oceania",
    "Cocos (Keeling) Islands": "Asia",
    "Curaçao": "Americas",
    "Czechia": "Europe",
    "Falkland Islands (Malvinas)": "Americas",
    "Faroe Islands": "Europe",
    "French Guiana": "Americas",
    "French Polynesia": "Oceania",
    "French Southern Territories": "Antarctica",
    "Gibraltar": "Europe",
    "Greenland": "Americas",
    "Guadeloupe": "Americas",
    "Guernsey": "Europe",
    "Heard Island and McDonald Islands": "Antarctica",
    "Holy See": "Europe",
    "IOC Refugee Olympic Team": "No continent",
    "Independent Olympic Athletes": "No continent",
    "Independent Olympic Participants": "No continent",
    "Isle of Man": "Europe",
    "Jersey": "Europe",
    "Macao": "Asia",
    "Martinique": "Americas",
    "Mayotte": "Africa",
    "Montserrat": "Americas",
    "New Caledonia": "Oceania",
    "Niue": "Oceania",
    "Norfolk Island": "Oceania",
    "Northern Mariana Islands": "Oceania",
    "Pitcairn": "Oceania",
    "Réunion": "Africa",
    "Saint Barthélemy": "Americas",
    "Saint Helena, Ascension and Tristan da Cunha": "Africa",
    "Saint Martin (French part)": "Americas",
    "Saint Pierre and Miquelon": "Americas",
    "Sint Maarten (Dutch part)": "Americas",
    "South Georgia and the South Sandwich Islands": "Antarctica",
    "Svalbard and Jan Mayen": "Europe",
    "Tokelau": "Oceania",
    "Turks and Caicos Islands": "Americas",
    "United States Minor Outlying Islands": "Oceania",
    "Viet Nam": "Asia",
    "Wallis and Futuna": "Oceania",
    "Western Sahara": "Africa",
    "Åland Islands": "Europe",
}
# segun el diccionario continents, se asigna el continente correspondiente a cada pais
merged_data_iso["continent"] = merged_data_iso["continent"].fillna(
    merged_data_iso["name"].map(continentes)
)
merged_data_iso


Unnamed: 0,NOC,continent,ISO,name
0,AFG,Asia,AFG,Afghanistan
1,ALB,Europe,ALB,Albania
2,ALG,Africa,DZA,Algeria
3,ASA,Oceania,ASM,American Samoa
4,AND,Europe,AND,Andorra
...,...,...,...,...
273,YMD,Asia,YEM,Yemen
274,ZAM,Africa,ZMB,Zambia
275,RHO,Africa,ZWE,Zimbabwe
276,ZIM,Africa,ZWE,Zimbabwe


In [12]:
# get all entries with missing iso
missing_iso = merged_data_iso[merged_data_iso["ISO"].isna()]
missing_iso


Unnamed: 0,NOC,continent,ISO,name
114,EOR,No continent,,IOC Refugee Olympic Team
116,IOA,No continent,,Independent Olympic Athletes
117,IOP,No continent,,Independent Olympic Participants
168,ROT,,,
169,UNK,,,


In [13]:
# set 'ISO' as 'N/A' for all entries with NaN values in ISO
for i in range(merged_data_iso.shape[0]):
    if pd.isnull(merged_data_iso.loc[i, "ISO"]):
        merged_data_iso.loc[i, "ISO"] = "N/A"


In [14]:
merged_data_iso.to_csv("datasets/iso_noc-merged.csv", index=False)