# ETL process by using Python:

## • 1. Extract
### 1(1). Extract the csv files that I'm going to use, and transform them into dataframes:

In [1]:
import pandas as pd
df_olympic_medals = pd.read_csv("olympic_medals.csv")
df_continent = pd.read_csv("list-of-countries_areas-by-continent-2024.csv")
df_olympic_hosts = pd.read_csv("olympic_hosts.csv")
df_mental_illness = pd.read_csv("mental-illness.csv")

### 1(2). Check the columns that contain null values for the above four dataframes:

In [2]:
# check the columns that contain null value for df_olympic_medals. If the columns with null values are needed later on, then these null values would be processed accordingly."
null_counts_olympic_medals = df_olympic_medals[["discipline_title", "slug_game", "event_title", "event_gender", "medal_type", "participant_type", "participant_title", 
                                                "athlete_url", "athlete_full_name", "country_name", "country_code", "country_3_letter_code"]].isnull().sum()
null_columns_olympic_medals = null_counts_olympic_medals[null_counts_olympic_medals > 0]
print(null_columns_olympic_medals)

participant_title    15113
athlete_url           4670
athlete_full_name     3624
country_code          1502
dtype: int64


In [3]:
# check the columns that contain null value for df_continent. If the columns with null values are needed later on, then these null values would be processed accordingly."
null_counts_continent = df_continent[["country", "region"]].isnull().sum()
null_columns_continent = null_counts_continent[null_counts_continent > 0]
print(null_columns_continent)

Series([], dtype: int64)


In [4]:
# check the columns that contain null value for df_olympic_medals. If the columns with null values are needed later on, then these null values would be processed accordingly."
null_counts_olympic_hosts = df_olympic_hosts[["game_slug", "game_end_date", "game_start_date", "game_location", "game_name", "game_season", "game_year"]].isnull().sum()
null_columns_olympic_hosts = null_counts_olympic_hosts[null_counts_olympic_hosts > 0]
print(null_columns_olympic_hosts)

Series([], dtype: int64)


In [5]:
# check the columns that contain null value for df_mental_illness. If the columns with null values are needed later on, then these null values would be processed accordingly."
null_counts_mental_illness = df_mental_illness[["Entity", "Code", "Year", "DALYs from depressive disorders per 100,000 people in, both sexes aged age-standardized", 
                                                "DALYs from schizophrenia per 100,000 people in, both sexes aged age-standardized", 
                                                "DALYs from bipolar disorder per 100,000 people in, both sexes aged age-standardized", 
                                                "DALYs from eating disorders per 100,000 people in, both sexes aged age-standardized", 
                                                "DALYs from anxiety disorders per 100,000 people in, both sexes aged age-standardized"]].isnull().sum()
null_columns_mental_illness = null_counts_mental_illness[null_counts_mental_illness > 0]
print(null_columns_mental_illness)

Code    690
dtype: int64


## • 2. Tranform

### 2(1). First Dataframe:
#### 2(1-1). For the first dataframe that I need, I obtained it by merging df_olympic_medals, df_olympic_hosts, and df_continent, and preprocessing the data during this process.

In [6]:
# Fistly, Merging the df_olympic_medals and df_olympic_hosts dataframes and storing the result in a new dataframe:
merged_medals_hosts = pd.merge(df_olympic_medals, df_olympic_hosts, how="inner", left_on="slug_game", right_on="game_slug")

''' Since there are some missing values in the athlete_full_name column, so I filled them with "Unknown"
(I didn't drop the rows because I think it's important to keep the information of other columns):'''
merged_medals_hosts["athlete_full_name"] = merged_medals_hosts["athlete_full_name"].fillna("Unknown")

''' There are #NAME? in the athlete_full_name column, so I dropped them.
(I drop these rows because I checked that the rows with #NAME? are actually have the same information with other rows, 
so these kind of rows are actually duplicates):'''
merged_medals_hosts = merged_medals_hosts[merged_medals_hosts["athlete_full_name"] != "#NAME?"]

# Reorder the columns and keep only the columns I need:
df_merged_medals_hosts = merged_medals_hosts[["country_name", "slug_game", "discipline_title","event_title", "event_gender", 
                                              "medal_type", "participant_type", "athlete_full_name", "game_year", "game_season"]]

# Print df_merged_data_medals_hosts:
print(df_merged_medals_hosts)

        country_name     slug_game discipline_title  \
0              Italy  beijing-2022          Curling   
1              Italy  beijing-2022          Curling   
2             Norway  beijing-2022          Curling   
3             Norway  beijing-2022          Curling   
4             Sweden  beijing-2022          Curling   
...              ...           ...              ...   
21692        Denmark   athens-1896    Weightlifting   
21693         Greece   athens-1896    Weightlifting   
21694        Denmark   athens-1896    Weightlifting   
21695  Great Britain   athens-1896    Weightlifting   
21696         Greece   athens-1896    Weightlifting   

                           event_title event_gender medal_type  \
0                        Mixed Doubles        Mixed       GOLD   
1                        Mixed Doubles        Mixed       GOLD   
2                        Mixed Doubles        Mixed     SILVER   
3                        Mixed Doubles        Mixed     SILVER   
4        

In [7]:
''' check the columns that contain null value for df_merged_medals_hosts. 
If the columns with null values are needed later on, then these null values would be processed accordingly.'''
null_counts_merged_medals_hosts = df_merged_medals_hosts[["country_name", "slug_game", "discipline_title","event_title", "event_gender", "medal_type", 
                                                          "participant_type", "athlete_full_name", "game_year", "game_season"]].isnull().sum()
null_columns_merged_medals_hosts = null_counts_merged_medals_hosts[null_counts_merged_medals_hosts > 0]
print(null_columns_merged_medals_hosts)

Series([], dtype: int64)


In [8]:
# After merging the df_olympic_medals and df_olympic_hosts dataframes, I then merged the result with the df_continent dataframe.

''' Before merging, I checked the country names in the df_merged_data_medals_hosts dataframe and the country names in the df_continent dataframe, 
    and found that there are some inconsistent country names, so I created a dictionary to map the old country names to the new country names:'''
country_name_mapping = {
    "People's Republic of China": "China",
    "Great Britain": "United Kingdom",
    "United States": "United States of America",
    "Soviet Union": "Russia",
    "Republic of Korea": "South Korea",
    "Democratic People's Republic of Korea": "North Korea",
    "Islamic Republic of": "Iran",
    "Viet Nam": "Vietnam",
    "Czechoslovakia": "Czech Republic",
    # Later can add more mappings here
}
df_merged_medals_hosts["country_name"] = df_merged_medals_hosts["country_name"].map(country_name_mapping).fillna(df_merged_medals_hosts["country_name"])

# Merging the df_merged_data_medals_hosts and df_continent dataframes:
df_merged_medals_hosts_continent = pd.merge(df_merged_medals_hosts, df_continent, how="left", left_on="country_name", right_on="country")

# Keep only the columns I need and rename the columns:
df_merged_medals_hosts_continent = df_merged_medals_hosts_continent[["country_name", "slug_game", "discipline_title", "event_title", "event_gender", 
                                                                     "participant_type", "athlete_full_name", "medal_type", "game_year", "game_season", "region"]]
df_merged_medals_hosts_continent.columns = ["Country Name", "Slug Game", "Discipline Title", "Event Title", "Event Gender", "Participant Type", 
                                            "Athlete Full Name", "Medal Type", "Game Year", "Game Season", "Continent"]

# Reorder the columns(I reordered the columns to make every dimension colse to each other):
new_order = ["Country Name", "Continent", "Game Season", "Slug Game", "Discipline Title","Participant Type", "Event Gender", "Event Title", 
             "Game Year", "Athlete Full Name", "Medal Type"]
df_merged_medals_hosts_continent = df_merged_medals_hosts_continent[new_order]

print(df_merged_medals_hosts_continent)

         Country Name Continent Game Season     Slug Game Discipline Title  \
0               Italy    Europe      Winter  beijing-2022          Curling   
1               Italy    Europe      Winter  beijing-2022          Curling   
2              Norway    Europe      Winter  beijing-2022          Curling   
3              Norway    Europe      Winter  beijing-2022          Curling   
4              Sweden    Europe      Winter  beijing-2022          Curling   
...               ...       ...         ...           ...              ...   
21679         Denmark    Europe      Summer   athens-1896    Weightlifting   
21680          Greece    Europe      Summer   athens-1896    Weightlifting   
21681         Denmark    Europe      Summer   athens-1896    Weightlifting   
21682  United Kingdom    Europe      Summer   athens-1896    Weightlifting   
21683          Greece    Europe      Summer   athens-1896    Weightlifting   

      Participant Type Event Gender                      Event 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merged_medals_hosts["country_name"] = df_merged_medals_hosts["country_name"].map(country_name_mapping).fillna(df_merged_medals_hosts["country_name"])


In [9]:
'''check the columns that contain null value for df_merged_medals_hosts_continet. 
If the columns with null values are needed later on, then these null values would be processed accordingly.'''
null_counts_merged_medals_hosts_continent = df_merged_medals_hosts_continent[["Country Name", "Continent", "Game Season", "Slug Game", "Discipline Title","Participant Type", 
                                                                              "Event Gender", "Event Title", "Game Year", "Athlete Full Name", "Medal Type"]].isnull().sum()
null_columns_merged_medals_hosts_continent = null_counts_merged_medals_hosts_continent[null_counts_merged_medals_hosts_continent > 0]
print(null_columns_merged_medals_hosts_continent)

Continent    5104
dtype: int64


In [10]:
# Fill the missing values in the Continet column with "Unknown"(I didn't drop the rows here is because I think the other columns' information is important to keep):
df_merged_medals_hosts_continent["Continent"] = df_merged_medals_hosts_continent["Continent"].fillna("Unknown")

# Drop the duplicates:
df_merged_medals_hosts_continent.drop_duplicates(inplace=True)

# Print the final result of my firt dataframe:
print(df_merged_medals_hosts_continent)

         Country Name Continent Game Season     Slug Game Discipline Title  \
0               Italy    Europe      Winter  beijing-2022          Curling   
1               Italy    Europe      Winter  beijing-2022          Curling   
2              Norway    Europe      Winter  beijing-2022          Curling   
3              Norway    Europe      Winter  beijing-2022          Curling   
4              Sweden    Europe      Winter  beijing-2022          Curling   
...               ...       ...         ...           ...              ...   
21679         Denmark    Europe      Summer   athens-1896    Weightlifting   
21680          Greece    Europe      Summer   athens-1896    Weightlifting   
21681         Denmark    Europe      Summer   athens-1896    Weightlifting   
21682  United Kingdom    Europe      Summer   athens-1896    Weightlifting   
21683          Greece    Europe      Summer   athens-1896    Weightlifting   

      Participant Type Event Gender                      Event 

In [28]:
df_merged_medals_hosts_continent.to_csv("merged_medals_hosts_continent.csv", index = False)

#### 2(1-2). Then, I'm going to create six dimension tables and one fact table based on the data/columns in df_merged_medals_hosts_continent dataframe:

##### - Dimension Tables(six)
##### (a). Create Athlete Dimension Table:

In [11]:
# Copy the athlete names from the df_merged_medals_hosts_continent dataframe(.copy method is to make sure that the original dataframe is not changed):
athlete_names = df_merged_medals_hosts_continent["Athlete Full Name"].copy()

# Get unique athlete names(drop duplicates):
unique_athletes = athlete_names.unique()

# Generate unique athlete IDs starting from 1 for each unique athlete name:
athlete_ids = range(1, len(unique_athletes) + 1)

# Create a DataFrame containing every unique athlete name and its corresponding athlete ID:
df_athlete_dimension = pd.DataFrame({"Athlete ID": athlete_ids, "Athlete Full Name": unique_athletes})

print(df_athlete_dimension)

       Athlete ID        Athlete Full Name
0               1     Stefania CONSTANTINI
1               2             Amos MOSANER
2               3         Kristin SKASLIEN
3               4       Magnus NEDREGOTTEN
4               5            Almida DE VAL
...           ...                      ...
12883       12884  George Stuart ROBERTSON
12884       12885          Georgios TSITAS
12885       12886   Stefanos Khristopoulos
12886       12887        Launceston ELLIOT
12887       12888  Alexandros Nikolopoulos

[12888 rows x 2 columns]


In [6]:
# Transform the df_athlete_dimension dataframe to a csv file:
df_athlete_dimension.to_csv("athlete_dimension.csv", index = False)

##### (b). Create Medal Type Dimension Table:

In [12]:
# copy the country name column frme the df_merged_medals_hosts_continent dataframe(.copy method is to make sure that the original dataframe is not changed):
medal_types = df_merged_medals_hosts_continent["Medal Type"]

# get the unique medal types(drop duplicates):
unique_medal_types = medal_types.unique()

# Generate unique medal type IDs starting from 1 for each unique medal type:
medal_type_ids = range(1, len(unique_medal_types) + 1)

# create a new DataFrame containing the every unique medal type and the corresponding unique ID:
df_medal_type_dimension = pd.DataFrame({"Medal Type ID": medal_type_ids, "Medal Type": unique_medal_types})

print(df_medal_type_dimension)

   Medal Type ID Medal Type
0              1       GOLD
1              2     SILVER
2              3     BRONZE


In [13]:
# Transform the df_medal_type_dimension dataframe to a csv file:
df_medal_type_dimension.to_csv("medal_type_dimension.csv", index=False)

##### (c). Create Time Dimension Table:

In [13]:
# copy the game year column from the df_merged_medals_hosts_continent dataframe:
game_years = df_merged_medals_hosts_continent["Game Year"].copy()

# get the unique game years:
unique_game_years = game_years.unique()

# generate unique game year IDs starting from 1 for each unique game year:
game_year_ids = range(1, len(unique_game_years) + 1)

# create a DataFrame containing the game year and the corresponding unique ID
df_time_dimension = pd.DataFrame({"Game Year ID": game_year_ids, "Game Year": unique_game_years})

print(df_time_dimension)

    Game Year ID  Game Year
0              1       2022
1              2       2020
2              3       2018
3              4       2016
4              5       2014
5              6       2012
6              7       2010
7              8       2008
8              9       2006
9             10       2004
10            11       2002
11            12       2000
12            13       1998
13            14       1996
14            15       1994
15            16       1992
16            17       1988
17            18       1984
18            19       1980
19            20       1976
20            21       1972
21            22       1968
22            23       1964
23            24       1960
24            25       1956
25            26       1952
26            27       1948
27            28       1936
28            29       1932
29            30       1928
30            31       1924
31            32       1920
32            33       1912
33            34       1908
34            35    

In [10]:
# Transform the df_time_dimension dataframe to a csv file:
df_time_dimension.to_csv("time_dimension.csv", index=False)

##### (d). Create Discipline Dimension Table:

In [14]:
# Copy the discipline title, participant type, Event Gender, and Event Title columns from the df_merged_medals_hosts_continent dataframe:
df_discipline_dimension = df_merged_medals_hosts_continent[["Discipline Title", "Participant Type", "Event Gender",  "Event Title"]].copy()

# drop duplicates to get unique discipline and event combinations:
df_discipline_dimension = df_discipline_dimension.drop_duplicates()

# Generate unique IDs starting from 1 for each unique discipline and event combination:
df_discipline_dimension["Discipline ID"] = range(1, len(df_discipline_dimension) + 1)

# set Disciplin ID as the index of the DataFrame:
df_discipline_dimension.set_index("Discipline ID", inplace=True)

print(df_discipline_dimension)

               Discipline Title Participant Type Event Gender  \
Discipline ID                                                   
1                       Curling         GameTeam        Mixed   
2                       Curling         GameTeam        Women   
3                       Curling         GameTeam          Men   
4              Freestyle Skiing          Athlete          Men   
5              Freestyle Skiing          Athlete          Men   
...                         ...              ...          ...   
1585                   Shooting          Athlete          Men   
1586                   Shooting          Athlete          Men   
1587                   Shooting          Athlete          Men   
1588                  Wrestling          Athlete          Men   
1589              Weightlifting          Athlete          Men   

                                    Event Title  
Discipline ID                                    
1                                 Mixed Doubles  
2   

In [12]:
# Transform the df_discipline_dimension dataframe to a csv file:
df_discipline_dimension.to_csv("discipline_dimension.csv")

##### (e). Create Olympic Game Dimension table:

In [15]:
# Copy Game Season and Slug Game columns from the df_merged_medals_hosts_continent dataframe:
df_olympic_game_dimension = df_merged_medals_hosts_continent[["Game Season", "Slug Game"]].copy()

# drop duplicates to get unique olympic games and seasons combinations:
df_olympic_game_dimension = df_olympic_game_dimension.drop_duplicates()

# Generate unique IDs starting from 1 for each unique olympic game and season combination:
df_olympic_game_dimension["Olympic Game ID"] = range(1, len(df_olympic_game_dimension) + 1)

# set Olympic Game ID as the index of df_olympic_game_dimension DataFrame:
df_olympic_game_dimension.set_index("Olympic Game ID", inplace=True)

print(df_olympic_game_dimension)

                Game Season                    Slug Game
Olympic Game ID                                         
1                    Winter                 beijing-2022
2                    Summer                   tokyo-2020
3                    Winter             pyeongchang-2018
4                    Summer                     rio-2016
5                    Winter                   sochi-2014
6                    Summer                  london-2012
7                    Winter               vancouver-2010
8                    Summer                 beijing-2008
9                    Winter                   turin-2006
10                   Summer                  athens-2004
11                   Winter          salt-lake-city-2002
12                   Summer                  sydney-2000
13                   Winter                  nagano-1998
14                   Summer                 atlanta-1996
15                   Winter             lillehammer-1994
16                   Summer    

In [14]:
# Transform the df_olympic_game_dimension dataframe to a csv file:
df_olympic_game_dimension.to_csv("olympic_game_dimension.csv")

##### (f). Create Location Dimension Table:

In [16]:
# Copy the Country Name and Continent columns from the df_merged_medals_hosts_continent dataframe:
df_location_dimension = df_merged_medals_hosts_continent[["Continent", "Country Name"]].copy()

# drop duplicates to get unique countries and its continents
df_location_dimension = df_location_dimension.drop_duplicates()

# assign a unique ID to each country and continent combination:
df_location_dimension["Location ID"] = range(1, len(df_location_dimension) + 1)

# set Location ID as the index of the DataFrame:
df_location_dimension.set_index("Location ID", inplace=True)

print(df_location_dimension)

                 Continent    Country Name
Location ID                               
1                   Europe           Italy
2                   Europe          Norway
3                   Europe          Sweden
4                   Europe  United Kingdom
5                     Asia           Japan
...                    ...             ...
149                   Asia            Iraq
150          North America           Haiti
151                Unknown     Australasia
152                Unknown         Bohemia
153                Unknown             MIX

[153 rows x 2 columns]


In [16]:
# Transform the df_location_dimension dataframe to a csv file:
df_location_dimension.to_csv("location_dimension.csv")

##### - Fact Table(one)
##### (a). Medals Fact Table:

In [17]:
''' Here, firstly I'm going to add the unique ID columns for "Game Year", "Athlete Full Name", and "Medal Type", 
and then also for the combination of ["Country Name" and "Continent"], ["Game Season" and "Slug Game"], 
and ["Discipline Title", "Participant Type", "Event Gender", "Event Title"]. '''

# Copy the df_merged_medals_hosts_continent dataframe to a new dataframe called df_merged_medals_hosts_continent_add_id_columns:
df_merged_medals_hosts_continent_add_id_columns = df_merged_medals_hosts_continent.copy()

# Generate unique IDs for "Game Year" column starting from 1 for each unique game year:
df_merged_medals_hosts_continent_add_id_columns["Game Year ID"] = pd.factorize(df_merged_medals_hosts_continent_add_id_columns["Game Year"])[0] + 1

# Generate unique IDs for "Athlete Full Name" column starting from 1 for each unique athlete name:
df_merged_medals_hosts_continent_add_id_columns["Athlete ID"] = pd.factorize(df_merged_medals_hosts_continent_add_id_columns["Athlete Full Name"])[0] + 1

# Generate unique IDs for "Medal Type" column starting from 1 for each unique medal type:
df_merged_medals_hosts_continent_add_id_columns["Medal Type ID"] = pd.factorize(df_merged_medals_hosts_continent_add_id_columns["Medal Type"])[0] + 1

# Generate unique IDs for the combination of "Country Name" and "Continent" columns starting from 1 for each unique country and continent combination:
df_merged_medals_hosts_continent_add_id_columns["Location ID"] = pd.factorize(df_merged_medals_hosts_continent_add_id_columns["Country Name"].astype(str) + "_" + 
                                                                              df_merged_medals_hosts_continent_add_id_columns["Continent"].astype(str))[0] + 1

# Generate unique IDs for the combination of "Game Season" and "Slug Game" columns starting from 1 for each unique slug game and game season combination:
df_merged_medals_hosts_continent_add_id_columns["Olympic Game ID"] = pd.factorize(df_merged_medals_hosts_continent_add_id_columns["Game Season"].astype(str) + "_" + 
                                                                                  df_merged_medals_hosts_continent_add_id_columns["Slug Game"].astype(str))[0] + 1

''' Generate unique IDs for the combination of "Discipline Title", "Participant Type", 
"Event Gender", and "Event Title" columns starting from 1 for each unique discipline and event combination:'''
df_merged_medals_hosts_continent_add_id_columns["Discipline ID"] = pd.factorize(df_merged_medals_hosts_continent_add_id_columns["Discipline Title"].astype(str) + "_" + 
                                                                                df_merged_medals_hosts_continent_add_id_columns["Participant Type"].astype(str) + "_" + 
                                                                                df_merged_medals_hosts_continent_add_id_columns["Event Gender"].astype(str) + "_" + 
                                                                                df_merged_medals_hosts_continent_add_id_columns["Event Title"].astype(str))[0] + 1


''' After above steps, now I get a dataframe "df_merged_medals_hosts_continent_add_id_columns" which contains 
    the whole data from the original dataframe "df_merged_medals_hosts_continent" 
    and also the unique ID columns for "Game Year", "Athlete Full Name", "Medal Type", 
    ["Country Name", "Continent"], ["Game Season", "Slug Game"], and ["Discipline Title", "Participant Type", "Event Gender", "Event Title"]. '''

# Drop the columns that are not needed in the final fact table(only keep the unique ID columns):
df_medals_fact = df_merged_medals_hosts_continent_add_id_columns.drop(["Country Name", "Continent", "Game Season", "Slug Game", "Discipline Title", "Participant Type", 
                                                                       "Event Gender", "Event Title", "Game Year", "Athlete Full Name", "Medal Type"], axis=1)

print(df_medals_fact)

# Transform the df_medals_fact dataframe to a csv file, which is the final fact table:
df_medals_fact.to_csv("medals_fact.csv", index=False)

       Game Year ID  Athlete ID  Medal Type ID  Location ID  Olympic Game ID  \
0                 1           1              1            1                1   
1                 1           2              1            1                1   
2                 1           3              2            2                1   
3                 1           4              2            2                1   
4                 1           5              3            3                1   
...             ...         ...            ...          ...              ...   
21679            37       12877              2           36               53   
21680            37       12888              3           69               53   
21681            37       12877              1           36               53   
21682            37       12887              2            4               53   
21683            37       12825              3           69               53   

       Discipline ID  
0               

### 2(2). Second Dataframe:
#### 2(2-1). For the second dataframe that I need to use, I obtained it by merging df_olympic_hosts, df_mental_illness, and df_continent, and cleaning and preprocessing the data during this process.

In [18]:
''' firstly, I clean the data in the df_olympic_hosts dataframe by selecting the rows with game_year between 1990 and 2019, 
    because the data I need to analyze is only from 1990 to 2019(df_mental_illness data is only from 1990 to 2019): '''
df_olympic_hosts_1990_2019 = df_olympic_hosts[df_olympic_hosts["game_year"].between(1990, 2019)]

In [19]:
# After I filtered the df_olympic_hosts dataframe, I then merged the df_olympic_hosts_1990_2019 and df_continent dataframes by this part of the code:

# mapping the inconsistent country names to keep the country names consistent between the two dataframes:
country_mapping = {
    "South Korea": "Republic of Korea",
    "United Kingdom": "Great Britain",
    "Russia": "Russian Federation",
    # Add more mappings as needed
}
df_mental_illness["Entity"] = df_mental_illness["Entity"].map(country_mapping).fillna(df_mental_illness["Entity"])

# Merge the df_mental_illness and df_olympic_hosts_1990_2019 dataframes:
df_host_country_mental_illness_DALYs = pd.merge(df_mental_illness, df_olympic_hosts_1990_2019, how="inner", left_on="Entity", right_on="game_location")

# Filter out entities not present in game_location, because the data I need to analyse is only for the host countries(game location) of the Olympic Games:
df_host_country_mental_illness_DALYs = df_host_country_mental_illness_DALYs[df_host_country_mental_illness_DALYs["Entity"].isin(df_olympic_hosts["game_location"])]

# Reorder columns and keep only the columns I need:
df_host_country_mental_illness_DALYs = df_host_country_mental_illness_DALYs[["Year", "Entity", 
                           "DALYs from depressive disorders per 100,000 people in, both sexes aged age-standardized",
                           "DALYs from schizophrenia per 100,000 people in, both sexes aged age-standardized",
                           "DALYs from bipolar disorder per 100,000 people in, both sexes aged age-standardized",
                           "DALYs from eating disorders per 100,000 people in, both sexes aged age-standardized",
                           "DALYs from anxiety disorders per 100,000 people in, both sexes aged age-standardized",
                           "game_year"]]

# Define a function to map the game_year column, based on the Year and game_year columns to determine if the country is the host country of the Olympic Games for this year:
def map_game_year(row):
    if row["Year"] == row["game_year"]:
        return "Host Country of the Olympic Games for this year"
    else:
        return "Non-host Country of the Olympic Games for this year"

# Apply the above function to create the game_year column:
df_host_country_mental_illness_DALYs["game_year"] = df_host_country_mental_illness_DALYs.apply(map_game_year, axis=1)

# Rename columns, change "Entity" to "Country", and "game_year" to "Host Type":
df_host_country_mental_illness_DALYs.columns = ["Year", "Country", "DALYs from depressive disorders", 
                       "DALYs from schizophrenia", "DALYs from bipolar disorder", 
                       "DALYs from eating disorders", "DALYs from anxiety disorders", "Host Type"]

# Delete duplicates rows based on the condition(for those contries that have been host more than one once):
df_host_country_mental_illness_DALYs.drop_duplicates(inplace=True)
condition = ((df_host_country_mental_illness_DALYs["Host Type"] == "Non-host Country of the Olympic Games for this year") &
             (df_host_country_mental_illness_DALYs.duplicated(subset=["Year", "DALYs from depressive disorders", "DALYs from schizophrenia", 
                                       "DALYs from bipolar disorder", "DALYs from eating disorders", 
                                       "DALYs from anxiety disorders"], keep=False)))
df_host_country_mental_illness_DALYs = df_host_country_mental_illness_DALYs[~condition]

print(df_host_country_mental_illness_DALYs)

     Year        Country  DALYs from depressive disorders  \
0    1990      Australia                        799.47360   
1    1991      Australia                        805.67530   
2    1992      Australia                        811.40607   
3    1993      Australia                        816.30450   
4    1994      Australia                        820.75977   
..    ...            ...                              ...   
440  2015  United States                        779.26184   
442  2016  United States                        776.84106   
444  2017  United States                        775.51465   
446  2018  United States                        774.79320   
448  2019  United States                        774.97437   

     DALYs from schizophrenia  DALYs from bipolar disorder  \
0                   247.54398                    239.77686   
1                   247.39116                    239.85439   
2                   247.53279                    239.99638   
3                  

In [20]:
''' I got the dataframe that merges the df_mental_illness and df_olympic_hosts_1990_2019 dataframes, 
then I need to add the continent information to the df_host_country_mental_illness_DALYs dataframe: '''

# mapping the inconsistent country names to keep the country names consistent between the two dataframes:
country_mapping = {
    "Republic of Korea": "South Korea",
    "Russian Federation": "Russia",
    "Great Britain": "United Kingdom",
    # Add more mappings as needed
}
df_host_country_mental_illness_DALYs["Country"] = df_host_country_mental_illness_DALYs["Country"].replace(country_mapping)

# Add the continent information to the df_host_country_mental_illness_DALYs dataframe:
df_host_country_mental_illness_DALYs["Continent"] = df_host_country_mental_illness_DALYs["Country"].map(df_continent.set_index("country")["region"])
df_host_country_mental_illness_DALYs.insert(2, "Continent", df_host_country_mental_illness_DALYs.pop("Continent")) # Moving the 'Continent" column to the 3rd column

# 
print(df_host_country_mental_illness_DALYs)

     Year        Country      Continent  DALYs from depressive disorders  \
0    1990      Australia        Oceania                        799.47360   
1    1991      Australia        Oceania                        805.67530   
2    1992      Australia        Oceania                        811.40607   
3    1993      Australia        Oceania                        816.30450   
4    1994      Australia        Oceania                        820.75977   
..    ...            ...            ...                              ...   
440  2015  United States  North America                        779.26184   
442  2016  United States  North America                        776.84106   
444  2017  United States  North America                        775.51465   
446  2018  United States  North America                        774.79320   
448  2019  United States  North America                        774.97437   

     DALYs from schizophrenia  DALYs from bipolar disorder  \
0                   247.5

In [21]:
''' check the columns that contain null value for df_host_country_mental_illness_DALYs. 
If the columns with null values are needed later on, then these null values would be processed accordingly.'''
null_counts_host_country_mental_illness_DALYs = df_host_country_mental_illness_DALYs[["Year","Country", "Continent", "DALYs from depressive disorders", 
                                                                                      "DALYs from schizophrenia", "DALYs from bipolar disorder", "DALYs from eating disorders", 
                                                                                      "DALYs from anxiety disorders"]].isnull().sum()
null_columns_host_country_mental_illness_DALYs = null_counts_host_country_mental_illness_DALYs[null_counts_host_country_mental_illness_DALYs > 0]
print(null_columns_host_country_mental_illness_DALYs)

Series([], dtype: int64)


#### 2(2-2). Then, I'm going to create dimension tables and fact table based on df_host_country_mental_illness_DALYs

##### - Dimension Tables(three)
##### (a). Year Dimension Table:

In [22]:
# copy the "Year" column from the df_host_country_mental_illness_DALYs dataframe:
year = df_host_country_mental_illness_DALYs["Year"].copy()

# get the unique years(drop duplicates):
unique_years = year.unique()

# Generate unique year IDs starting from 1 for each unique year:
year_ids = range(1, len(unique_years) + 1)

# create a new DataFrame containing the year and the corresponding unique ID:
df_year_dimension = pd.DataFrame({"Year ID": year_ids, "Year": unique_years})

# print the result
print(df_year_dimension)

    Year ID  Year
0         1  1990
1         2  1991
2         3  1992
3         4  1993
4         5  1994
5         6  1995
6         7  1996
7         8  1997
8         9  1998
9        10  1999
10       11  2000
11       12  2001
12       13  2002
13       14  2003
14       15  2004
15       16  2005
16       17  2006
17       18  2007
18       19  2008
19       20  2009
20       21  2010
21       22  2011
22       23  2012
23       24  2013
24       25  2014
25       26  2015
26       27  2016
27       28  2017
28       29  2018
29       30  2019


In [23]:
# Transform the df_year_dimension dataframe to a csv file:
df_year_dimension.to_csv("year_dimension.csv", index=False)

(b). Mental Illness Location Dimension Table:

In [23]:
# copy the "Continent" and "Country" columns from the df_host_country_mental_illness_DALYs dataframe:
df_mental_illness_location_dimension = df_host_country_mental_illness_DALYs[["Continent", "Country"]].copy()

# drop duplicates to get unique countries and its continents:
df_mental_illness_location_dimension = df_mental_illness_location_dimension.drop_duplicates()

# Generate unique IDs starting from 1 for each unique country and continent combination:
df_mental_illness_location_dimension["Mental Illness Location ID"] = range(1, len(df_mental_illness_location_dimension) + 1)

# set the "Mental Illness Location ID" as the index of df_mental_illness_location_dimension DataFrame:
df_mental_illness_location_dimension.set_index("Mental Illness Location ID", inplace=True)

print(df_mental_illness_location_dimension)

                                Continent         Country
Mental Illness Location ID                               
1                                 Oceania       Australia
2                           South America          Brazil
3                           North America          Canada
4                                    Asia           China
5                                  Europe          France
6                                  Europe          Greece
7                                  Europe           Italy
8                                    Asia           Japan
9                                  Europe          Norway
10                                 Europe          Russia
11                                   Asia     South Korea
12                                 Europe           Spain
13                                 Europe  United Kingdom
14                          North America   United States


In [29]:
# Transform the df_mental_illness_location_dimension dataframe to a csv file:
df_mental_illness_location_dimension.to_csv("mental_illness_location_dimension.csv")

##### (c). Host Type Dimension Table:

In [24]:
# Copy the "Host Type" column from the df_host_country_mental_illness_DALYs dataframe:
host_type = df_host_country_mental_illness_DALYs["Host Type"].copy()

# get the unique host types(drop duplicates):
unique_host_type = host_type.unique()

# Generate unique host type IDs starting from 1 for each unique host type:
host_type_ids = range(1, len(unique_host_type) + 1)

# create a new DataFrame containing the host type and the corresponding unique ID:
df_host_type_dimension = pd.DataFrame({"Host Type ID": host_type_ids, "Host Type": unique_host_type})

print(df_host_type_dimension)

   Host Type ID                                          Host Type
0             1  Non-host Country of the Olympic Games for this...
1             2    Host Country of the Olympic Games for this year


In [68]:
# Transform the df_host_type_dimension dataframe to a csv file:
df_host_type_dimension.to_csv("host_type_dimension.csv", index=False)

##### - Fact Table(one)
##### (a). Mental Illness DALYs Fact Table:

In [25]:
''' Here, firstly I'm going to add the unique ID columns for "Year" and "Host Type", 
and also for the combination of ["Country" and "Continent"] to the df_host_country_mental_illness_DALYs dataframe. '''

# Copy the df_host_country_mental_illness_DALYs dataframe to a new dataframe called df_host_country_mental_illness_DALYs_add_id_columns:
df_host_country_mental_illness_DALYs_add_id_columns = df_host_country_mental_illness_DALYs.copy()

# Generate unique IDs for "Year" column starting from 1 for each unique year:
df_host_country_mental_illness_DALYs_add_id_columns["Year ID"] = pd.factorize(df_host_country_mental_illness_DALYs_add_id_columns["Year"])[0] + 1

# Generate unique IDs for the combination of "Country" and "Continent" columns starting from 1 for each unique country and continent combination:
df_host_country_mental_illness_DALYs_add_id_columns["Mental Illness Location ID"] = pd.factorize(df_host_country_mental_illness_DALYs_add_id_columns["Country"].astype(str) 
                                                                                                 + "_" + df_host_country_mental_illness_DALYs_add_id_columns["Continent"].astype(str))[0] + 1

# Generate unique IDs for "Host Type" column starting from 1 for each unique host type:
df_host_country_mental_illness_DALYs_add_id_columns["Host Type ID"] = pd.factorize(df_host_country_mental_illness_DALYs_add_id_columns["Host Type"])[0] + 1

''' After above steps, now I get a dataframe "df_host_country_mental_illness_DALYs_add_id_columns" which contains
    the whole data from the original dataframe "df_host_country_mental_illness_DALYs"
    and also the unique ID columns for "Year", ["Country", "Continent"], and "Host Type". '''

# Drop the columns that are not needed in the final fact table(only keep the unique ID columns):
df_mental_illness_DALYs_fact = df_host_country_mental_illness_DALYs_add_id_columns.drop(["Year", "Country", "Continent", "Host Type"], axis=1)

# Reorder the columns:
df_mental_illness_DALYs_fact = df_mental_illness_DALYs_fact[["Year ID", "Mental Illness Location ID", "Host Type ID", "DALYs from depressive disorders", 
                                                             "DALYs from schizophrenia", "DALYs from bipolar disorder", "DALYs from eating disorders", 
                                                             "DALYs from anxiety disorders"]]

print(df_mental_illness_DALYs_fact)

     Year ID  Mental Illness Location ID  Host Type ID  \
0          1                           1             1   
1          2                           1             1   
2          3                           1             1   
3          4                           1             1   
4          5                           1             1   
..       ...                         ...           ...   
440       26                          14             1   
442       27                          14             1   
444       28                          14             1   
446       29                          14             1   
448       30                          14             1   

     DALYs from depressive disorders  DALYs from schizophrenia  \
0                          799.47360                 247.54398   
1                          805.67530                 247.39116   
2                          811.40607                 247.53279   
3                          816.30450   

In [97]:
# Transform the df_mental_illness_DALYs_fact dataframe to a csv file, which is the final fact table:
df_mental_illness_DALYs_fact.to_csv("mental_illness_DALYs_fact.csv", index=False)

## • 3. Load

### 3(1). Dump data into tables in PostgreSQL database:

#### 3(1-1). "Olympic_Medals_Count" PostgreSQL database:

In [26]:
# Create a database connection engine to connect the "Olympic_Medals_Count" PostgreSQL database:
from sqlalchemy import create_engine
engine = create_engine("postgresql://postgres:postgres@pgdb:5432/Olympic_Medals_Count")

In [99]:
# dump the data in df_athlete_dimension to "Olympic_Medals_Count" PostgreSQL database:
df_athlete_dimension.to_sql("dim_athlete", con = engine, if_exists = "append", index = False)

888

In [100]:
# dump the data in df_medal_type_dimension to "Olympic_Medals_Count" PostgreSQL database:
df_medal_type_dimension.to_sql("dim_medal_types", con = engine, if_exists = "append", index = False)

3

In [101]:
# dump the data in df_time_dimension to "Olympic_Medals_Count" PostgreSQL database:
df_time_dimension.to_sql("dim_time", con = engine, if_exists = "append", index = False)

37

In [102]:
# dump the data in df_discipline_dimension to "Olympic_Medals_Count" PostgreSQL database:
df_discipline_dimension.to_sql("dim_discipline", con = engine, if_exists = "append", index = True)

589

In [103]:
# dump the data in df_olympic_game_dimension to "Olympic_Medals_Count" PostgreSQL database:
df_olympic_game_dimension.to_sql("dim_olympic_game", con = engine, if_exists = "append", index = True)

53

In [104]:
# dump the data in df_location_dimension to "Olympic_Medals_Count" PostgreSQL database:
df_location_dimension.to_sql("dim_location", con = engine, if_exists = "append", index = True)

153

In [105]:
# dump the data in df_medals_fact to "Olympic_Medals_Count" PostgreSQL database:
df_medals_fact.to_sql("fact_medals_count", con = engine, if_exists = "append", index = False)

680

#### 3(2-1). "Mental_Illness_DALYs" PostgreSQL database:

In [106]:
# Create a database connection engine to connect the "Mental_Illness_DALYs" PostgreSQL database:
engine = create_engine("postgresql://postgres:postgres@pgdb:5432/Mental_Illness_DALYs")

In [107]:
# dump the data in df_year_dimension to "Mental_Illness_DALYs" PostgreSQL database:
df_year_dimension.to_sql("dim_year", con = engine, if_exists = "append", index = False)

30

In [108]:
# dump the data in df_mental_illness_location_dimension to "Mental_Illness_DALYs" PostgreSQL database:
df_mental_illness_location_dimension.to_sql("dim_mental_illness_location", con = engine, if_exists = "append", index = True)

14

In [109]:
# dump the data in df_host_type_dimension to "Mental_Illness_DALYs" PostgreSQL database:
df_host_type_dimension.to_sql("dim_host_types", con = engine, if_exists = "append", index = False)

2

In [110]:
# dump the data df_mental_illness_DALYs_fact to "Mental_Illness_DALYs" PostgreSQL database:
df_mental_illness_DALYs_fact.to_sql("fact_mental_illness_dalys", con = engine, if_exists = "append", index = False)

420