In [None]:
# Connect to Database

In [152]:
# psycopg2 is the python package to connect with postgresql server
import psycopg2
from psycopg2 import OperationalError
import pandas as pd
 
# Connect to Olympics Database
import os
import psycopg2
from psycopg2 import OperationalError
from dotenv import load_dotenv
import psycopg2
from psycopg2 import OperationalError
 
# Load environment variables
load_dotenv()

def create_connection():
    connection = None
    try:
        connection = psycopg2.connect(
            database=os.getenv('DB_NAME'),
            user=os.getenv('DB_USER'),
            password=os.getenv('DB_PASSWORD'),
            host=os.getenv('DB_HOST'),
            port=os.getenv('DB_PORT'),
        )
        print("Connection to PostgreSQL DB successful")
    except OperationalError as e:
        print(f"The error '{e}' occurred")
    return connection

# Create the connection
connection = create_connection()

Connection to PostgreSQL DB successful


# Import pre-processed Data from Excel

In [153]:
# load data
economic_data = pd.read_csv("Olympics/Economic data.csv")
medals_data = pd.read_csv("Olympics/olympic_medals.csv")
lifeExpectancy_data = pd.read_csv("Olympics/life-expectancy.csv")
countriesByContdata = pd.read_csv("Olympics/list-of-countries_areas-by-continent-2024.csv")
mental_illness_data = pd.read_csv("Olympics/mental-illness.csv")
hostsData = pd.read_csv("Olympics/olympic_hosts.csv")


In [154]:
economic_data.head()
medals_data.head()
#globalPop_data.head()
lifeExpectancy_data.head()
#countriesByContdata.head()
#mental_illness_data.head()
#hostsData.head()

Unnamed: 0,Entity,Code,Year,Period life expectancy at birth - Sex: all - Age: 0
0,Afghanistan,AFG,1950,27.7275
1,Afghanistan,AFG,1951,27.9634
2,Afghanistan,AFG,1952,28.4456
3,Afghanistan,AFG,1953,28.9304
4,Afghanistan,AFG,1954,29.2258


#### Clean Country Codes Table

In [155]:
# add country codes column to countries by continent data using life-expectancy data as it has more countries. 
countriesByContdata_with_codes = countriesByContdata.merge(lifeExpectancy_data[['Entity', 'Code']].drop_duplicates(), left_on='country', right_on='Entity', how='left')

countriesByContdata_with_codes

Unnamed: 0,country,region,Entity,Code
0,Afghanistan,Asia,Afghanistan,AFG
1,Albania,Europe,Albania,ALB
2,Algeria,Africa,Algeria,DZA
3,American Samoa,Oceania,American Samoa,ASM
4,Andorra,Europe,Andorra,AND
...,...,...,...,...
229,Wallis and Futuna,Oceania,Wallis and Futuna,WLF
230,Western Sahara,Africa,Western Sahara,ESH
231,Yemen,Asia,Yemen,YEM
232,Zambia,Africa,Zambia,ZMB


In [156]:
# display rows with NANs and correct them
countriesByContdata_with_codes[countriesByContdata_with_codes.isna().any(axis=1)]
# manually update rows with NANs with correcy Country Code and Country Name, drop any rows with duplicates
countriesByContdata_with_codes.loc[50,['Entity','Code']] = ['Czech Republic'	,'CZE']
countriesByContdata_with_codes.loc[100,['Entity','Code']] = ['Ivory Coast'	,'CIV']
countriesByContdata_with_codes.loc[119,['Entity','Code']] = ['Macau'	,'MFM']
countriesByContdata_with_codes.loc[132,['Entity','Code']] = ['Micronesia'	,'FSM']
countriesByContdata_with_codes.loc[168,['Entity','Code']] = ['Democratic Republic of the Congo'	,'COD'] # 55 IS A DUPLICATE
countriesByContdata_with_codes.loc[176,['Entity','Code']] = ['Saint Martin'	,'MAF'] # 188 IS A DUPLICATE
countriesByContdata_with_codes.loc[207,['Entity','Code']] = ['Timor-Leste	'	,'TLS']
countriesByContdata_with_codes.loc[226,['Entity','Code']] = ['Vatican City	'	,'VAT']
countriesByContdata_with_codes[countriesByContdata_with_codes.isna().any(axis=1)] # drop the duplicated rows

update_countriesCodes = countriesByContdata_with_codes.drop([55,188])
update_countriesCodes[update_countriesCodes.isna().any(axis=1)]

cleaned_countryCodes = update_countriesCodes.drop(['Entity'], axis=1) # drop extra column - Entity from final cleaned country codes tables
cleaned_countryCodes.head()


Unnamed: 0,country,region,Code
0,Afghanistan,Asia,AFG
1,Albania,Europe,ALB
2,Algeria,Africa,DZA
3,American Samoa,Oceania,ASM
4,Andorra,Europe,AND


### Clean Athete Data
- clean Athlete Data
- separate data for athlete dimension

In [157]:
medals_data.head()
medals_data[medals_data.isna().any(axis=1)] # check for NANs or empty values 

Unnamed: 0,discipline_title,slug_game,event_title,event_gender,medal_type,participant_type,athlete_full_name,country_name,country_code,country_3_letter_code
6,Curling,beijing-2022,Women,Women,GOLD,GameTeam,,Great Britain,GB,GBR
7,Curling,beijing-2022,Women,Women,SILVER,GameTeam,,Japan,JP,JPN
8,Curling,beijing-2022,Women,Women,BRONZE,GameTeam,,Sweden,SE,SWE
9,Curling,beijing-2022,Men,Men,GOLD,GameTeam,,Sweden,SE,SWE
10,Curling,beijing-2022,Men,Men,SILVER,GameTeam,,Great Britain,GB,GBR
...,...,...,...,...,...,...,...,...,...,...
21634,Gymnastics Artistic,athens-1896,team horizontal bar men,Men,GOLD,GameTeam,,Germany,DE,GER
21682,Tennis,athens-1896,doubles men,Men,GOLD,GameTeam,John BOLAND,MIX,,MIX
21683,Tennis,athens-1896,doubles men,Men,GOLD,GameTeam,Fritz Traun,MIX,,MIX
21686,Tennis,athens-1896,doubles men,Men,BRONZE,GameTeam,Edwin FLACK,MIX,,MIX


In [158]:
# create unique name for athletes who's name is missing by joining medal type, event title, country code

def create_unique_name(row):
    if pd.isna(row['athlete_full_name']) or (row['athlete_full_name'] == '#NAME?') or (row['athlete_full_name'] == '- -'):
        return f"Athlete_{row['medal_type']}_{row['event_title']}_{row['country_code']}"
    else:
        return row['athlete_full_name']

# Apply the function to each row
medals_data['athlete_full_name'] = medals_data.apply(create_unique_name, axis=1)

cleaned_medalData = medals_data.drop(['country_code', 'participant_type'], axis=1) # remove un-necessary columns 
cleaned_medalData[cleaned_medalData.isna().any(axis=1)] # check for remaining NANs 

Unnamed: 0,discipline_title,slug_game,event_title,event_gender,medal_type,athlete_full_name,country_name,country_3_letter_code


### Clean Economic Data

In [159]:
economic_data.head()

Unnamed: 0,Time,Country Name,Country Code,Poverty headcount ratio at $2.15 a day (2017 PPP) (% of population) [SI.POV.DDAY],GDP per capita (current US$) [NY.GDP.PCAP.CD],GDP per capita growth (annual %) [NY.GDP.PCAP.KD.ZG],Secure Internet servers (per 1 million people) [IT.NET.SECR.P6],"Mortality rate, infant (per 1,000 live births) [SP.DYN.IMRT.IN]",Current health expenditure (% of GDP) [SH.XPD.CHEX.GD.ZS],Domestic general government health expenditure per capita (current US$) [SH.XPD.GHED.PC.CD],Domestic private health expenditure per capita (current US$) [SH.XPD.PVTD.PC.CD],External health expenditure per capita (current US$) [SH.XPD.EHEX.PC.CD]
0,2020,Afghanistan,AFG,..,516.8667974,-5.364665931,34.94796166,44.8,15.53361392,6.1311352,61.20481468,12.95210116
1,2020,Albania,ALB,0,5343.037704,-2.745238678,884.8250911,8.4,..,154.8976524,..,..
2,2020,Algeria,DZA,..,3354.157303,-6.729941651,48.46764679,19.6,6.32117987,134.4669044,80.29914715,0.08572689
3,2020,Andorra,AND,..,37207.222,-12.73507756,9665.379665,2.7,9.05175877,2441.683051,895.234785,..
4,2020,Angola,AGO,..,1502.950754,-8.672432129,19.7436402,48.7,2.91183472,21.34275969,27.28717169,2.11269093


In [160]:
# Subset the economic_data dataframe
economic_subset = economic_data[[
    "Country Name", 
    "Country Code",
    "Current health expenditure (% of GDP) [SH.XPD.CHEX.GD.ZS]", 
    "GDP per capita (current US$) [NY.GDP.PCAP.CD]"
]]
# Renaming columns
economic_subset = economic_subset.rename(columns={
    "Current health expenditure (% of GDP) [SH.XPD.CHEX.GD.ZS]": "Health_Expenditure_GDP_Percent",
    "GDP per capita (current US$) [NY.GDP.PCAP.CD]": "GDP_Per_Capita_USD"
})

economic_subset[economic_subset.isna().any(axis=1)] # check for any NANs  - none present

Unnamed: 0,Country Name,Country Code,Health_Expenditure_GDP_Percent,GDP_Per_Capita_USD


### More cleaning
- Add in ID columns to medal Data
- Check and clean athlete, country Code ,and economy data for consistent country codes and names

In [161]:
cleaned_countryCodes.head()
cleaned_medalData.head()

Unnamed: 0,discipline_title,slug_game,event_title,event_gender,medal_type,athlete_full_name,country_name,country_3_letter_code
0,Curling,beijing-2022,Mixed Doubles,Mixed,GOLD,Stefania CONSTANTINI,Italy,ITA
1,Curling,beijing-2022,Mixed Doubles,Mixed,GOLD,Amos MOSANER,Italy,ITA
2,Curling,beijing-2022,Mixed Doubles,Mixed,SILVER,Kristin SKASLIEN,Norway,NOR
3,Curling,beijing-2022,Mixed Doubles,Mixed,SILVER,Magnus NEDREGOTTEN,Norway,NOR
4,Curling,beijing-2022,Mixed Doubles,Mixed,BRONZE,Almida DE VAL,Sweden,SWE


In [162]:

# Rename columns for consistency
cleaned_medalData.rename(columns={'country_3_letter_code': 'country_code', 'country_name': 'country'}, inplace=True)
cleaned_countryCodes.rename(columns={'Code': 'country_code'}, inplace=True)

# Make sure country codes are in the same format
cleaned_countryCodes['country_code'] = cleaned_countryCodes['country_code'].str.upper()
cleaned_medalData['country_code'] = cleaned_medalData['country_code'].str.upper()


# add id columns in medal_data - which will be later on seprated as a fact table as it has most columns and most data
cleaned_medalData['Athlete_ID'] = ''
cleaned_medalData['Event_ID'] = ''
cleaned_medalData['Game_ID'] = ''
cleaned_medalData['Medal_ID'] = ''

#cleaned_countryCodes.head()
cleaned_medalData.head()

Unnamed: 0,discipline_title,slug_game,event_title,event_gender,medal_type,athlete_full_name,country,country_code,Athlete_ID,Event_ID,Game_ID,Medal_ID
0,Curling,beijing-2022,Mixed Doubles,Mixed,GOLD,Stefania CONSTANTINI,Italy,ITA,,,,
1,Curling,beijing-2022,Mixed Doubles,Mixed,GOLD,Amos MOSANER,Italy,ITA,,,,
2,Curling,beijing-2022,Mixed Doubles,Mixed,SILVER,Kristin SKASLIEN,Norway,NOR,,,,
3,Curling,beijing-2022,Mixed Doubles,Mixed,SILVER,Magnus NEDREGOTTEN,Norway,NOR,,,,
4,Curling,beijing-2022,Mixed Doubles,Mixed,BRONZE,Almida DE VAL,Sweden,SWE,,,,


#### Medal Dimnesion:
- create medal dimnesion 
- map medal id from medal dimnesion to medal data / full / fact data

In [163]:
medal_dim = pd.DataFrame({
    'Medal_ID': [1, 2, 3],
    'Medal_Type': ['GOLD', 'SILVER', 'BRONZE']
})
# Map medal types to IDs
medal_id = {
    'GOLD': 1,
    'SILVER': 2,
    'BRONZE': 3
}

# Apply mapping to the medal_type column to create a new Medal_ID column
cleaned_medalData['Medal_ID'] = cleaned_medalData['medal_type'].map(medal_id)
cleaned_medalData.head()


Unnamed: 0,discipline_title,slug_game,event_title,event_gender,medal_type,athlete_full_name,country,country_code,Athlete_ID,Event_ID,Game_ID,Medal_ID
0,Curling,beijing-2022,Mixed Doubles,Mixed,GOLD,Stefania CONSTANTINI,Italy,ITA,,,,1
1,Curling,beijing-2022,Mixed Doubles,Mixed,GOLD,Amos MOSANER,Italy,ITA,,,,1
2,Curling,beijing-2022,Mixed Doubles,Mixed,SILVER,Kristin SKASLIEN,Norway,NOR,,,,2
3,Curling,beijing-2022,Mixed Doubles,Mixed,SILVER,Magnus NEDREGOTTEN,Norway,NOR,,,,2
4,Curling,beijing-2022,Mixed Doubles,Mixed,BRONZE,Almida DE VAL,Sweden,SWE,,,,3


In [164]:
# correct country codes for cleaned_countryCodes:
country_code_corrections = {
    'SUI': 'CHE',  # Switzerland
    'GER': 'DEU',  # Germany
    'NED': 'NLD',  # Netherlands
    'SLO': 'SVN',  # Slovenia
    'LAT': 'LVA',  # Latvia
    'BUL': 'BGR',  # Bulgaria
    'KUW': 'KWT',  # Kuwait
    'DEN': 'DNK',  # Denmark
    'POR': 'PRT',  # Portugal
    'PHI': 'PHL',  # Philippines
    'FIJ': 'FJI',  # Fiji
    'BER': 'BMU',  # Bermuda
    'RSA': 'ZAF',  # South Africa
    'CRO': 'HRV',  # Croatia
    'MAS': 'MYS',  # Malaysia
    'INA': 'IDN',  # Indonesia
    'GRE': 'GRC',  # Greece
    'MGL': 'MNG',  # Mongolia
    'NGR': 'NGA',  # Nigeria
    'BUR': 'BFA',  # Burkina Faso
    'BAH': 'BHS',  # Bahamas
    'PUR': 'PRI',  # Puerto Rico
    'BOT': 'BWA',  # Botswana
    'GRN': 'GRD',  # Grenada
    'KSA': 'SAU',  # Saudi Arabia
    'ALG': 'DZA',  # Algeria
    'UAE': 'ARE',  # United Arab Emirates
    'NIG': 'NER',  # Niger
    'GUA': 'GTM',  # Guatemala
    'MRI': 'MUS',  # Mauritius
    'SUD': 'SDN',  # Sudan
    'SAM': 'WSM',  # Samoa
    'CHI': 'CHL',  # Chile
    'ZIM': 'ZWE',  # Zimbabwe
    'TOG': 'TGO',  # Togo
    'PAR': 'PRY',  # Paraguay
    'URU': 'URY',  # Uruguay
    'SRI': 'LKA',  # Sri Lanka
    'BAR': 'BRB',  # Barbados
    'CRC': 'CRI',  # Costa Rica
    'ZAM': 'ZMB',  # Zambia
    'TGA': 'TON',  # Tonga
    'HAI': 'HTI',  # Haiti,
    "OWID_KOS": 'KOS' # Kosovo
}
# Replace the country codes in the medals Data
cleaned_countryCodes['country_code'].replace(country_code_corrections, inplace=True)
cleaned_medalData['country_code'].replace(country_code_corrections, inplace=True)
cleaned_countryCodes.drop_duplicates()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cleaned_countryCodes['country_code'].replace(country_code_corrections, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cleaned_medalData['country_code'].replace(country_code_corrections, inplace=True)


Unnamed: 0,country,region,country_code
0,Afghanistan,Asia,AFG
1,Albania,Europe,ALB
2,Algeria,Africa,DZA
3,American Samoa,Oceania,ASM
4,Andorra,Europe,AND
...,...,...,...
229,Wallis and Futuna,Oceania,WLF
230,Western Sahara,Africa,ESH
231,Yemen,Asia,YEM
232,Zambia,Africa,ZMB


Correct Country Names and Country codes for consistent country name and codes.

In [165]:
## in medals Data:
    # Great Britain GBR - Unites Kingdom
    # replace USA -- Unisted States of America with US
    # ROC - ROC with Russia
    # CHN People's Republic of China with China  CHN
    # Republic of Korea (KOR) with South Korea (KOR)
    # Democratic People's Republic of Korea (PRK) with North Korea (PRK)
    # ISLAMIC REPUBLIC OF IRAN IRI with Iran IRN
    # Replace Republic of Molddova MDA with Moldova MDA 
    # Replace Chinese Taipei TPE with TWN Taiwan
    # replace HongKong, China witth Hongkong HKG
    # Replace Côte d'Ivoire	 (CIV) with Cote d'Ivoire (CIV)
    # Replace Syrian Arab Republic SYR with Syria SYR
    # Replace 'Olympic Athletes from Russia' with Russia 'RUS' 
    # Russian Federation with Russia 'RUS' 
    # Federal Republic of Germany	 FRG with 'Germany' (DEU)
    # 'Soviet Union' (URS) REPLACE CODE WITH USSR
    # German Democratic Republic (Germany)	with Germany (  DEU)
    # Yugoslavia change to 'Serbia and Montegro'
    # Virgin Islands, US	to VIR Virgin Islands (U.S.)
    # United Republic of Tanzania	to 'Tanzania' TZA
    # replace United Arab Republic UAR WITH 'EGYPT' EGY
    # REPLACE Bohemia	 with czech republic CZE
# keep as it is :
    #Independent Olympic Athletes
    # Serbia and Montenegro
    # Unified Team
    # Netherlands Antilles
# country by cont data :
    # replace Kosovo OWID_KOS country code with KOS
    # add Czechoslovakia TCH, region = "Europe"

replacements = {
    "Great Britain": "United Kingdom",
    "USA": "United States",
    "United States of America": "United States",
    "ROC": "Russia",
    "People's Republic of China": "China",
    "Republic of Korea": "South Korea",
    "Democratic People's Republic of Korea": "North Korea",
    "Islamic Republic of Iran": "Iran",
    "Republic of Moldova": "Moldova",
    "Chinese Taipei": "Taiwan",
    "HongKong, China": "Hong Kong",
    "HongKong": "Hong Kong",
    "Côte d'Ivoire": "Cote d'Ivoire",
    "Syrian Arab Republic": "Syria",
    "Olympic Athletes from Russia": "Russia",
    "Russian Federation": "Russia",
    "Federal Republic of Germany": "Germany",
    "Soviet Union": "USSR",
    "German Democratic Republic (Germany)": "Germany",
    "Yugoslavia": "Serbia and Montenegro",
    "Virgin Islands, US": "Virgin Islands (U.S.)",
    "United Republic of Tanzania": "Tanzania",
    "United Arab Republic": "Egypt",
    "Bohemia": "Czech Republic"
}
# Update country codes where necessary
country_code_replacements = {
    'US': 'USA',
    'UK': 'GBR',
    'ROC': 'RUS',
    'IRI': 'IRN',
    'TPE': 'TWN',
    'CIV': 'CIV',
    'SYR': 'SYR',
    'RUS': 'RUS', 
    'FRG': 'DEU',
    'URS': 'USSR',
    'GDR': 'DEU',
    'YUG': 'SCG', 
    'ISV': 'VIR',
    'TAN': 'TZA',
    'UAR': 'EGY',
    'BOH': 'CZE',
    'OAR': 'RUS',
    'XKX': 'KOS',
    'VIE': 'VNM',
    
}

# Apply the replacements to the 'country_name' column
cleaned_medalData['country'].replace(replacements, inplace=True)


# Replace the country codes in the medals Data
cleaned_medalData['country_code'].replace(country_code_replacements, inplace=True)

# Replace Kosovo code in country by cont data
#cleaned_countryCodes['country_code'].replace({'OWID_KOS': 'KOS'}, inplace=True)

# Add Czechoslovakia to country by cont data
czechoslovakia = pd.DataFrame({'country': ['Czechoslovakia'], 'region': ['Europe'], 'country_code': ['TCH']})
cleaned_countryCodes = pd.concat([cleaned_countryCodes, czechoslovakia], ignore_index = True) 

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cleaned_medalData['country'].replace(replacements, inplace=True)


#### Athlete Country Code and Athlete Dimension
  - separate data from medals data for Athlete Country Dimension - create a data frame of only athletes country name, code and region
  - create region mapping for countries that do not have regions in countrybyContient Data
  - rename columns , checks for NANs, remove duplicates 
  - separate data from medals data for Athlete Dimension

Athlete Country Dimension

In [166]:
# create a df of only athletes country name, code and region
athlete_countries_df = cleaned_medalData[['Athlete_ID','athlete_full_name','country_code', 'country']].drop_duplicates().merge(cleaned_countryCodes, left_on='country_code', right_on='country_code', how='left')

# check for NANs or countries that do not have regions
athlete_countries_df[athlete_countries_df.isna().any(axis=1)] 

# Manually create a dictionary for country code to region mappings
region_mappings = {
    'HKG': 'Asia',
    'CIV': 'Africa',
    'KOS': 'Europe',
    'IOA': 'International',  # IOA typically represents independent athletes
    'SCG': 'Europe',
    'EUN': 'International',  # EUN was used for the Unified Team in 1992, composed of athletes from former Soviet Republics
    'TCH': 'Europe',         # Czechoslovakia
    'USSR': 'Europe/Asia',   # USSR spanned Europe and Asia
    'AHO': 'North America',  # Netherlands Antilles
    'VIR': 'North America',  # Virgin Islands (U.S.)
    'WIF': 'North America/Caribbean',  # West Indies Federation
    'ANZ': 'Oceania',        # Australasia
    'MIX': 'International'   # MIX can be used for mixed teams from different nations
}

# Apply the mapping to the 'region' column where it is NaN
for code, region in region_mappings.items():
    # This assumes 'country_code_x' is the column with the country codes
    athlete_countries_df.loc[athlete_countries_df['country_code'] == code, 'region'] = region


# rename athlete_countries df columns and keep necessary columns :
athlete_countries_df = athlete_countries_df[['Athlete_ID','athlete_full_name','country_code', 'country_x', 'region']]
athlete_countries_df
#athlete_countries_df[athlete_countries_df.isna().any(axis=1)]


Unnamed: 0,Athlete_ID,athlete_full_name,country_code,country_x,region
0,,Stefania CONSTANTINI,ITA,Italy,Europe
1,,Amos MOSANER,ITA,Italy,Europe
2,,Kristin SKASLIEN,NOR,Norway,Europe
3,,Magnus NEDREGOTTEN,NOR,Norway,Europe
4,,Almida DE VAL,SWE,Sweden,Europe
...,...,...,...,...,...
15163,,George Stuart ROBERTSON,MIX,MIX,International
15164,,Georgios TSITAS,GRC,Greece,Europe
15165,,Stefanos Khristopoulos,GRC,Greece,Europe
15166,,Launceston ELLIOT,GBR,United Kingdom,Europe


Athlete Dimension:
- create a df of Athlete_ID , Athlete Name and  Athlete Gender 
- Create Athlete ID
- use event IDs to populate Athlete Gender column by priorisiting rows where Athletes event_gender is 'Men'or 'Women' 

In [167]:
# create a df of Athlete_ID , Athlete Name and  Athlete Gender 
athlete = cleaned_medalData[['Athlete_ID', 'athlete_full_name', 'event_gender', 'medal_type', 'Medal_ID']]
# Assign Athlete_ID if it's not already there. For the example, I'll just use the index.
athlete['Athlete_ID'] = athlete.index

# Create a dictionary to prioritize gender sorting
gender_priority = {'Men': 1, 'Women': 2, 'Mixed': 3, 'Open': 4}

# Add a new column for sorting by priority
athlete['gender_priority'] = athlete['event_gender'].map(gender_priority)

# Sort by 'athlete_full_name' and 'gender_priority' in ascending order
# This will put Male or Female rows before Mixed for the same athlete
athlete.sort_values(by=['athlete_full_name', 'gender_priority'], ascending=True, inplace=True)

# Drop duplicates, keeping the first entry (which should be Male/Female if it exists)
athlete.drop_duplicates(subset=['athlete_full_name'], keep='first', inplace=True)

# Create the new DataFrame with selected columns
athlete_df = athlete[['Athlete_ID', 'athlete_full_name', 'event_gender']].copy()

# Rename the columns
athlete_df.rename(columns={'athlete_full_name': 'Athlete Name', 'event_gender': 'Gender'}, inplace=True)
athlete_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  athlete['Athlete_ID'] = athlete.index
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  athlete['gender_priority'] = athlete['event_gender'].map(gender_priority)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  athlete.sort_values(by=['athlete_full_name', 'gender_priority'], ascending=True, inplace=True)
A value is trying to be set on a copy of a slice from a

Unnamed: 0,Athlete_ID,Athlete Name,Gender
7583,7583,- Zé Marco,Men
17745,17745,Aage Ernst LARSEN,Men
17869,17869,Aage Ingvar ERIKSEN,Men
14895,14895,Aagje Ada KOK,Women
18375,18375,Aarne Eemeli REINI,Men
...,...,...,...
15557,15557,Ārons Boguļubovs,Men
6458,6458,İradə Aşumova,Women
16336,16336,İsmet Atlı,Men
2485,2485,Şərif Şərifov,Men


In [168]:
athlete_countries_df

Unnamed: 0,Athlete_ID,athlete_full_name,country_code,country_x,region
0,,Stefania CONSTANTINI,ITA,Italy,Europe
1,,Amos MOSANER,ITA,Italy,Europe
2,,Kristin SKASLIEN,NOR,Norway,Europe
3,,Magnus NEDREGOTTEN,NOR,Norway,Europe
4,,Almida DE VAL,SWE,Sweden,Europe
...,...,...,...,...,...
15163,,George Stuart ROBERTSON,MIX,MIX,International
15164,,Georgios TSITAS,GRC,Greece,Europe
15165,,Stefanos Khristopoulos,GRC,Greece,Europe
15166,,Launceston ELLIOT,GBR,United Kingdom,Europe


#### Athlete Country Dimension

Clean Athlete Country Name:
- match athlete ID from athlete dimnesion to athlete country dimnesion
- check if one athlete represents two countries
- if one athlete represents two countries , example one their own country and one 'International' Region, then keep the original Athlete country only by sorting columns in such a way that 'International' region comes last

In [169]:
# match athlete name from athlete_df with athlete country df and populate athleteID:
athlete_countrydf = athlete_df.merge(athlete_countries_df, 
                             left_on='Athlete Name', 
                             right_on='athlete_full_name', 
                             how='left')
athlete_countrydf.drop_duplicates() # drop duplicates 
# check if any athlete ID appear twice due to one athlete representing multiple countries.
athlete_countrydf.groupby('Athlete_ID_x').filter(lambda x: len(x) > 1)

# Remove exact duplicates across all columns
athlete_country_df = athlete_countrydf.drop_duplicates()

# Sort the DataFrame first by 'Athlete_ID_x', then by 'region' in such a way that 'International' comes last
athlete_country_df = athlete_country_df.sort_values(by=['Athlete_ID_x', 'region'], ascending=[True, False])

# If 'International' is a less preferred option, we ensure it appears later in the sort, so it can be dropped if duplicated
athlete_country_df['sort_helper'] = athlete_country_df['region'].apply(lambda x: 0 if x != 'International' else 1)

# Sort again to ensure 'International' rows come after non-'International', for the same athlete
athlete_country_df = athlete_country_df.sort_values(by=['Athlete_ID_x', 'sort_helper'])

# Drop duplicates based on 'Athlete_ID_x' only, keeping the first occurrence (which is now the non-'International' one if available)
athlete_country_df = athlete_country_df.drop_duplicates(subset=['Athlete_ID_x'], keep='first')

# Drop the 'sort_helper' column as it's no longer needed
athlete_country_df.drop('sort_helper', axis=1, inplace=True)

athlete_country_df.groupby('Athlete_ID_x').filter(lambda x: len(x) > 1)
athlete_country_df['Athlete_ID_x'].value_counts()[athlete_country_df['Athlete_ID_x'].value_counts() > 1]

athlete_countrydf = athlete_country_df[['Athlete_ID_x', 'country_code', 'country_x', 'region']] 
athlete_countrydf
athlete_countrydf.rename(columns={"Athlete_ID_x": "Athlete_ID", "country_x": "country"}) # rename columns

# keep only necessary columns and remove duplicates
athlete_countrydf = athlete_countrydf[['country_code', 'country_x' ,'region']]
athlete_countrydf.drop_duplicates()


Unnamed: 0,country_code,country_x,region
13154,ITA,Italy,Europe
9075,NOR,Norway,Europe
552,SWE,Sweden,Europe
2319,GBR,United Kingdom,Europe
3017,JPN,Japan,Asia
...,...,...,...
13276,TZA,Tanzania,Africa
10428,GUY,Guyana,South America
26,IRQ,Iraq,Asia
12995,HTI,Haiti,North America


- Merge Athlete Dimnesion to Full Data/ Medal Data so that Athlete ID from full data can be populated
- Keep only neccessary columns

In [170]:

# First, ensure the 'Athlete Name' in athlete_df is of string type for a proper merge
athlete_df['Athlete Name'] = athlete_df['Athlete Name'].astype(str)

# Merge cleaned_medalData with athlete_df
cleaned_medalData = cleaned_medalData.merge(athlete_df, left_on='athlete_full_name', right_on='Athlete Name', how='left')
cleaned_medalData.drop(columns=['Athlete_ID_x','Athlete Name', 'Gender'], axis=1)

# Display the first few rows to verify
cleaned_medalData = cleaned_medalData[['discipline_title', 'slug_game', 'event_title', 'event_gender', 'medal_type', 'athlete_full_name', 'country', 'country_code', 'Athlete_ID_y', 'Event_ID', "Game_ID"]]
# Rename columns
cleaned_medalData.rename(columns={"Athlete_ID_y": "Athlete_ID"})

cleaned_medalData.head()

Unnamed: 0,discipline_title,slug_game,event_title,event_gender,medal_type,athlete_full_name,country,country_code,Athlete_ID_y,Event_ID,Game_ID
0,Curling,beijing-2022,Mixed Doubles,Mixed,GOLD,Stefania CONSTANTINI,Italy,ITA,0,,
1,Curling,beijing-2022,Mixed Doubles,Mixed,GOLD,Amos MOSANER,Italy,ITA,1,,
2,Curling,beijing-2022,Mixed Doubles,Mixed,SILVER,Kristin SKASLIEN,Norway,NOR,2,,
3,Curling,beijing-2022,Mixed Doubles,Mixed,SILVER,Magnus NEDREGOTTEN,Norway,NOR,3,,
4,Curling,beijing-2022,Mixed Doubles,Mixed,BRONZE,Almida DE VAL,Sweden,SWE,4,,


Remove duplicates from athlete country dimnesion

In [171]:
athlete_countrydf = athlete_countrydf[['country_code', 'country_x' ,'region']]
athlete_countrydf.drop_duplicates()

Unnamed: 0,country_code,country_x,region
13154,ITA,Italy,Europe
9075,NOR,Norway,Europe
552,SWE,Sweden,Europe
2319,GBR,United Kingdom,Europe
3017,JPN,Japan,Asia
...,...,...,...
13276,TZA,Tanzania,Africa
10428,GUY,Guyana,South America
26,IRQ,Iraq,Asia
12995,HTI,Haiti,North America


#### Event Dimension
- create event id
- create event dimension 

In [172]:
# Create event ID and Event Dim
# Ensure all relevant columns are of type str
cleaned_medalData['discipline_title'] = cleaned_medalData['discipline_title'].astype(str)
cleaned_medalData['slug_game'] = cleaned_medalData['slug_game'].astype(str)
cleaned_medalData['event_title'] = cleaned_medalData['event_title'].astype(str)
cleaned_medalData['event_gender'] = cleaned_medalData['event_gender'].astype(str)

# Create a unique identifier for each event by concatenating the relevant columns
cleaned_medalData['event_identifier'] = (cleaned_medalData['discipline_title'] + '_' + 
                                         cleaned_medalData['slug_game'] + '_' + 
                                         cleaned_medalData['event_title'] + '_' + 
                                         cleaned_medalData['event_gender'])

# Generate Event_IDs
cleaned_medalData['Event_ID'], _ = pd.factorize(cleaned_medalData['event_identifier'])

# drop the 'event_identifier' column as it's no longer needed
cleaned_medalData.drop('event_identifier', axis=1, inplace=True)

# keep only relevant columns in final event dimension
event_dim = cleaned_medalData[['Event_ID', 'event_gender','event_title', 'discipline_title' ]]
# remove duplicates
event_dim.drop_duplicates()

Unnamed: 0,Event_ID,event_gender,event_title,discipline_title
0,0,Mixed,Mixed Doubles,Curling
6,1,Women,Women,Curling
9,2,Men,Men,Curling
12,3,Men,Men's Moguls,Freestyle Skiing
15,4,Men,Men's Freeski Halfpipe,Freestyle Skiing
...,...,...,...,...
21678,6568,Men,Singles men,Tennis
21682,6569,Men,doubles men,Tennis
21688,6570,Men,"Unlimited Class, Greco-Roman Men",Wrestling
21691,6571,Men,heavyweight - one hand lift men,Weightlifting


#### Economic Dimension
- standardise any remaining country codes or names
- replace missing GDP and health expenditure values with 0 which can be filterred out when querying
- remove duplicates
-create economic dimension IDS
- join economic subset data with medals / full/ fact data
- keep only rlevant columns in final economic dimnesion and full / fact /medals data

In [173]:

# Step 1: Standardize country codes or names if necessary (example shown for "Viet Nam")
economic_subset.replace({'Country Code': {'XKX': 'KOS'}}, inplace=True)
economic_subset.replace({'Country Code': {'VIE': 'VNM'}}, inplace=True) # Replace based on known inconsistencies

# Step 2: Perform the left join
joined_df = athlete_countrydf.merge(economic_subset, left_on='country_code', right_on='Country Code', how='left')
econ_df = joined_df[['country_code', 'country_x', 'region', 'Health_Expenditure_GDP_Percent', 'GDP_Per_Capita_USD']]
econ_df['Health_Expenditure_GDP_Percent'] = econ_df['Health_Expenditure_GDP_Percent'].fillna(0)
econ_df['GDP_Per_Capita_USD'] = econ_df['GDP_Per_Capita_USD'].fillna(0)
econ_dim = econ_df.drop_duplicates()

# create unique econ id:
# Initialize a dictionary to hold unique economic keys

economic_keys = {}
# Initialize a counter
counter = 1

for index, row in econ_dim.iterrows():
    # Generate a basic key combining country_code with an incremented value
    basic_key = f"{row['country_code'][:3]}{str(counter).zfill(3)}"
    economic_keys[index] = basic_key[:6]
    counter += 1

# Assign generated keys to a new column
econ_dim['Economic_Key'] = economic_keys.values()


joined_data = cleaned_medalData.merge(econ_dim, on='country_code', how='left')
joined_data # separate fact data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  econ_df['Health_Expenditure_GDP_Percent'] = econ_df['Health_Expenditure_GDP_Percent'].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  econ_df['GDP_Per_Capita_USD'] = econ_df['GDP_Per_Capita_USD'].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  econ_dim['Economic_Key'] = economic_k

Unnamed: 0,discipline_title,slug_game,event_title,event_gender,medal_type,athlete_full_name,country,country_code,Athlete_ID_y,Event_ID,Game_ID,country_x,region,Health_Expenditure_GDP_Percent,GDP_Per_Capita_USD,Economic_Key
0,Curling,beijing-2022,Mixed Doubles,Mixed,GOLD,Stefania CONSTANTINI,Italy,ITA,0,0,,Italy,Europe,9.63378334,31918.69349,ITA001
1,Curling,beijing-2022,Mixed Doubles,Mixed,GOLD,Amos MOSANER,Italy,ITA,1,0,,Italy,Europe,9.63378334,31918.69349,ITA001
2,Curling,beijing-2022,Mixed Doubles,Mixed,SILVER,Kristin SKASLIEN,Norway,NOR,2,0,,Norway,Europe,11.41755009,68340.0181,NOR002
3,Curling,beijing-2022,Mixed Doubles,Mixed,SILVER,Magnus NEDREGOTTEN,Norway,NOR,3,0,,Norway,Europe,11.41755009,68340.0181,NOR002
4,Curling,beijing-2022,Mixed Doubles,Mixed,BRONZE,Almida DE VAL,Sweden,SWE,4,0,,Sweden,Europe,11.37992764,52837.90398,SWE003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21692,Weightlifting,athens-1896,heavyweight - one hand lift men,Men,SILVER,Viggo JENSEN,Denmark,DNK,21671,6571,,Denmark,Europe,10.52817535,60915.4244,DNK036
21693,Weightlifting,athens-1896,heavyweight - one hand lift men,Men,BRONZE,Alexandros Nikolopoulos,Greece,GRC,21693,6571,,Greece,Europe,9.50820732,17658.9473,GRC069
21694,Weightlifting,athens-1896,heavyweight - two hand lift men,Men,GOLD,Viggo JENSEN,Denmark,DNK,21671,6572,,Denmark,Europe,10.52817535,60915.4244,DNK036
21695,Weightlifting,athens-1896,heavyweight - two hand lift men,Men,SILVER,Launceston ELLIOT,United Kingdom,GBR,21691,6572,,United Kingdom,Europe,11.97759342,40318.41692,GBR004


#### Game Dimension
- separate game Dimension by slicing host data

In [174]:
hostsData.head()
# Game Dimension
game_dim = hostsData[['game_slug', 'game_name', 'game_location']]
# change dtypes:
game_dim['game_slug'] = game_dim['game_slug'].astype(str)
game_dim['game_name'] = game_dim['game_name'].astype(str)
game_dim['game_location'] = game_dim['game_location'].astype(str)

# remove duplicates to keep only unqiue rows 
game_dim.drop_duplicates()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  game_dim['game_slug'] = game_dim['game_slug'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  game_dim['game_name'] = game_dim['game_name'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  game_dim['game_location'] = game_dim['game_location'].astype(str)


Unnamed: 0,game_slug,game_name,game_location
0,beijing-2022,Beijing 2022,China
1,tokyo-2020,Tokyo 2020,Japan
2,pyeongchang-2018,PyeongChang 2018,Republic of Korea
3,rio-2016,Rio 2016,Brazil
4,sochi-2014,Sochi 2014,Russian Federation
5,london-2012,London 2012,Great Britain
6,vancouver-2010,Vancouver 2010,Canada
7,beijing-2008,Beijing 2008,China
8,turin-2006,Turin 2006,Italy
9,athens-2004,Athens 2004,Greece


####  Year Dimension

In [175]:

# Date Dimension
date_dim = hostsData[['game_slug', 'game_season', 'game_year']]
# remove duplicates to keep only unqiue rows 
date_dim.drop_duplicates()
date_dim['game_slug'] = date_dim['game_slug'].astype(str)
date_dim['game_season'] = date_dim['game_season'].astype(str)
date_dim['game_year'] = date_dim['game_year'].astype(int)
# create unique date  / year id using row index 
date_dim.reset_index(inplace=True)
# Rename the 'index' column to 'year_id' and start from 1 instead of 0
date_dim.rename(columns={'index': 'year_id'}, inplace=True)
date_dim['year_id'] += 1

date_dim.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  date_dim['game_slug'] = date_dim['game_slug'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  date_dim['game_season'] = date_dim['game_season'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  date_dim['game_year'] = date_dim['game_year'].astype(int)
A value is trying to be set

Unnamed: 0,year_id,game_slug,game_season,game_year
0,1,beijing-2022,Winter,2022
1,2,tokyo-2020,Summer,2020
2,3,pyeongchang-2018,Winter,2018
3,4,rio-2016,Summer,2016
4,5,sochi-2014,Winter,2014


### Final Cleaned Tables

In [176]:
# 1. Athlete Country df
athlete_countrydf
# 2. Athlete df:
athlete_df
# 3. Medal Dimension:
medal_dim
# 4. event_dim:
event_dim
# 5. game_dim:
game_dim
# 6. date_dim:
date_dim
# 7. Econ_dim
econ_dim

Unnamed: 0,country_code,country_x,region,Health_Expenditure_GDP_Percent,GDP_Per_Capita_USD,Economic_Key
0,ITA,Italy,Europe,9.63378334,31918.69349,ITA001
2,NOR,Norway,Europe,11.41755009,68340.0181,NOR002
4,SWE,Sweden,Europe,11.37992764,52837.90398,SWE003
6,GBR,United Kingdom,Europe,11.97759342,40318.41692,GBR004
7,JPN,Japan,Asia,10.9042511,39986.92863,JPN005
...,...,...,...,...,...,...
9008,TZA,Tanzania,Africa,3.74659801,1104.164429,TZA143
9170,GUY,Guyana,South America,5.50864172,6863.074346,GUY144
11457,IRQ,Iraq,Asia,5.08489752,4251.337253,IRQ145
13240,HTI,Haiti,North America,3.22171094,1283.141228,HTI146


# Fact Table
- add id columns that have not been added
- drop columns not needed and only keep ID columns

In [177]:
joined_data = joined_data.merge(medal_dim, left_on='medal_type', right_on='Medal_Type', how='left')
joined_data = joined_data.merge(date_dim, left_on = "slug_game", right_on = 'game_slug', how = 'left')
fact_table = joined_data[
  ['Athlete_ID_y','country_code', 'Event_ID','Medal_ID','slug_game', 'Economic_Key', 'year_id']]
fact_table

Unnamed: 0,Athlete_ID_y,country_code,Event_ID,Medal_ID,slug_game,Economic_Key,year_id
0,0,ITA,0,1,beijing-2022,ITA001,1
1,1,ITA,0,1,beijing-2022,ITA001,1
2,2,NOR,0,2,beijing-2022,NOR002,1
3,3,NOR,0,2,beijing-2022,NOR002,1
4,4,SWE,0,3,beijing-2022,SWE003,1
...,...,...,...,...,...,...,...
21692,21671,DNK,6571,2,athens-1896,DNK036,53
21693,21693,GRC,6571,3,athens-1896,GRC069,53
21694,21671,DNK,6572,1,athens-1896,DNK036,53
21695,21691,GBR,6572,2,athens-1896,GBR004,53


# Dimension Tables

In [180]:

# replace '..' values with 0
econ_dim = econ_dim[['Economic_Key','country_code', 'Health_Expenditure_GDP_Percent', "GDP_Per_Capita_USD"]]
econ_dim[econ_dim["Health_Expenditure_GDP_Percent"]=='..']
econ_dim.replace("..", 0, inplace=True)
# keep relevant columns
econ_dim = econ_dim[['Economic_Key', 'Health_Expenditure_GDP_Percent', 'GDP_Per_Capita_USD']]
year_dim = date_dim[['year_id', 'game_season', 'game_year']]
# Rename DataFrame columns to match with PostgreSQL table column names

# Athlete Country Dimnesion:
athlete_countrydf.rename(columns={
    'country_code': 'athlete_countrycode',
    'country_x': 'country_name',
    'region': 'continent'
}, inplace=True)



# Athlete Dimension
athlete_df.rename(columns={
    'Athlete_ID': 'athlete_id',
    'Athlete Name': 'athlete_name',
    'Gender': 'gender'
}, inplace=True)

# Medal Dimnesion
medal_dim.rename(columns={
    'Medal_ID': 'medal_id',
    'Medal_Type': 'medal_type'
}, inplace=True)
# Event Dimension
event_dim.rename(columns={
    'Event_ID': 'event_id',
    'gender': 'event_gender',
    'event_title': 'title',
    'discipline_title': 'discipline'
}, inplace=True)

# Game Dimension
game_dim.rename(columns={
    'game_slug': 'game_id',
    'game_name': 'game_name',
    'game_location': 'gamelocation'
}, inplace=True)

# Year Dimnesion
year_dim.rename(columns={
    'year_id': 'year_id',
    'game_season': 'season',
    'game_year': 'year'
}, inplace=True)

year_dim['year_id'] = year_dim['year_id'].astype(int)
year_dim['season'] = year_dim['season'].astype(str)
year_dim['year'] = year_dim['year'].astype(int)


# Economic Dimension
econ_dim.rename(columns={
    'Economic_Key': 'economy_id',
    'Health_Expenditure_GDP_Percent': 'health_expenditure',
    'GDP_Per_Capita_USD': 'gdp_percapita'
}, inplace=True)



# Fact Table

fact_table.rename(columns={
    'Athlete_ID_y': 'athlete_id',
    'country_code': 'athlete_countrycode',
    'Event_ID': 'event_id',
    'Medal_ID': 'medal_id',
    'slug_game': 'game_id',
    'Economic_Key': 'economy_id',
    'year_id': 'year_id',

}, inplace=True)



# Remove any duplicates and only keep unique rows
# 1. Athlete Country df
athlete_countrydf.drop_duplicates()
# 2. Athlete df:
athlete_df.drop_duplicates()
# 3. Medal Dimension:
medal_dim.drop_duplicates()
# 4. event_dim:
event_dim.drop_duplicates()
# 5. game_dim:
game_dim.drop_duplicates()

# 6. date_dim:
year_dim.drop_duplicates()
# 7. Econ_dim
econ_dim.drop_duplicates()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  event_dim.rename(columns={
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  game_dim.rename(columns={
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_dim.rename(columns={
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_dim['year_id'] = year_dim['

Unnamed: 0,economy_id,health_expenditure,gdp_percapita
0,ITA001,9.63378334,31918.69349
2,NOR002,11.41755009,68340.0181
4,SWE003,11.37992764,52837.90398
6,GBR004,11.97759342,40318.41692
7,JPN005,10.9042511,39986.92863
...,...,...,...
9008,TZA143,3.74659801,1104.164429
9170,GUY144,5.50864172,6863.074346
11457,IRQ145,5.08489752,4251.337253
13240,HTI146,3.22171094,1283.141228


# Load Data
- create schema in Postgresql
- connect to database in Postgresql
- load data in PostgreSQL

In [None]:
# connect to database
def df_to_postgres(df, table_name, conn):
    """
    Inserts the DataFrame `df` into the PostgreSQL table `table_name`.
    """
    tuples = [tuple(x) for x in df.to_numpy()]
    cols = ','.join(list(df.columns))
    query = f"INSERT INTO {table_name} ({cols}) VALUES %s"
    
    cursor = conn.cursor()
    try:
        extras.execute_values(cursor, query, tuples, template=None, page_size=100)
        conn.commit()
    except (Exception, psycopg2.DatabaseError) as error:
        print(f"Error: {error}")
        conn.rollback()
        cursor.close()
        return 1
    print(f"{table_name} - execute_values() done")
    cursor.close()

# Load Dimnesion Tables
df_to_postgres(athlete_df, 'dimathlete', conn)
df_to_postgres(athlete_countrydf, 'dimathlete_country', conn)
df_to_postgres(event_dim, 'dimevent', conn)
df_to_postgres(game_dim, 'dimgame', conn)
df_to_postgres(year_dim, 'dimyear', conn)  
df_to_postgres(econ_dim, 'dimeconomy', conn)
df_to_postgres(medal_dim, 'dimmedal', conn)




dimathlete - execute_values() done
dimathlete_country - execute_values() done
dimevent - execute_values() done
dimgame - execute_values() done
dimyear - execute_values() done
dimeconomy - execute_values() done
dimmedal - execute_values() done


In [135]:
# Load fact tabe separately to avoid conflicts
df_to_postgres(fact_table, 'fact_olympicathletes', conn)

fact_olympicathletes - execute_values() done
