In [1]:
import pandas as pd
from functools import reduce

# Read in the datasets
noc_regions_data = pd.read_csv("datasets/noc_regions-parsed.csv")
iso_data = pd.read_csv("datasets/iso_codes.csv")
nocs_list = pd.read_csv("datasets/nocs_list.csv")

In [2]:
# get the number of rows for each dataset
print("Number of rows in noc_regions.csv: ", noc_regions_data.shape[0])
print("Number of rows in regions_iso.csv: ", iso_data.shape[0])
print("Number of rows in nocs_list.csv: ", nocs_list.shape[0])


Number of rows in noc_regions.csv:  230
Number of rows in regions_iso.csv:  250
Number of rows in nocs_list.csv:  209


In [3]:
# Merge the dataframes using an outer join
merged_data = pd.merge(noc_regions_data, nocs_list, on="NOC", how="outer")
merged_data.drop(columns=["notes"], inplace=True)
merged_data


Unnamed: 0,NOC,region,continent,name
0,AFG,Afghanistan,Asia,Afghanistan
1,AHO,Curaçao,,
2,ALB,Albania,Europe,Albania
3,ALG,Algeria,Africa,Algeria
4,AND,Andorra,Europe,Andorra
...,...,...,...,...
228,YEM,Yemen,Asia,Yemen
229,YMD,Yemen,,
230,YUG,Serbia,,
231,ZAM,Zambia,Africa,Zambia


In [4]:
# Merge the dataframes using an outer join
merged_data_iso = pd.merge(
    merged_data, iso_data, left_on="region", right_on="name", how="outer"
)
# Unify 'name_x' and 'name_y' into a single 'name' column
merged_data_iso["name"] = merged_data_iso["name_x"].combine_first(
    merged_data_iso["name_y"]
)

# Drop the original 'name_x' and 'name_y' columns
merged_data_iso = merged_data_iso.drop(["name_x", "name_y"], axis=1)

merged_data_iso


Unnamed: 0,NOC,region,continent,ISO,name
0,AFG,Afghanistan,Asia,AFG,Afghanistan
1,ALB,Albania,Europe,ALB,Albania
2,ALG,Algeria,Africa,DZA,Algeria
3,ASA,American Samoa,Oceania,ASM,American Samoa
4,AND,Andorra,Europe,AND,Andorra
...,...,...,...,...,...
271,EOR,,,,IOC Refugee Olympic Team
272,IOP,,,,Independent Olympic Participants
273,LBN,,Asia,,Lebanon
274,ROT,,,,


In [5]:
# get all countries where the name is Anguilla
merged_data_iso[merged_data_iso["name"] == "Yemen"]


Unnamed: 0,NOC,region,continent,ISO,name
264,YAR,Yemen,,YEM,Yemen
265,YEM,Yemen,Asia,YEM,Yemen
266,YMD,Yemen,,YEM,Yemen


In [6]:
# Fill missing values for the same country with the existing values (except for NOC)
merged_data_iso = (
    merged_data_iso.groupby("name")
    .apply(lambda group: group.ffill().bfill())
    .reset_index(drop=True)
)


  .apply(lambda group: group.ffill().bfill())
  .apply(lambda group: group.ffill().bfill())


In [7]:
merged_data_iso[merged_data_iso["name"] == "Yemen"]

Unnamed: 0,NOC,region,continent,ISO,name
267,YAR,Yemen,Asia,YEM,Yemen
268,YEM,Yemen,Asia,YEM,Yemen
269,YMD,Yemen,Asia,YEM,Yemen


In [8]:
merged_data_iso

Unnamed: 0,NOC,region,continent,ISO,name
0,AFG,Afghanistan,Asia,AFG,Afghanistan
1,ALB,Albania,Europe,ALB,Albania
2,ALG,Algeria,Africa,DZA,Algeria
3,ASA,American Samoa,Oceania,ASM,American Samoa
4,AND,Andorra,Europe,AND,Andorra
...,...,...,...,...,...
269,YMD,Yemen,Asia,YEM,Yemen
270,ZAM,Zambia,Africa,ZMB,Zambia
271,RHO,Zimbabwe,Africa,ZWE,Zimbabwe
272,ZIM,Zimbabwe,Africa,ZWE,Zimbabwe


In [9]:
merged_data_iso.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 274 entries, 0 to 273
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   NOC        231 non-null    object
 1   region     229 non-null    object
 2   continent  224 non-null    object
 3   ISO        271 non-null    object
 4   name       274 non-null    object
dtypes: object(5)
memory usage: 10.8+ KB


In [10]:
merged_data_iso.to_csv("datasets/merged_data.csv", index=False)