# country_territory_data

In [29]:
import pandas as pd

# Function to format data from a given sheet
def format_data(sheet_name):
    data = pd.read_excel(file_path, sheet_name=sheet_name, header = None)

    # Extract years from the first row
    years = data.iloc[0, 1::3].values

    # Prepare a list to collect the new data
    rows = []

    # Iterate through the data starting from the second row
    for i in range(2, len(data)):
        country = data.iloc[i, 0]
        for j, year in enumerate(years):
            PR = data.iloc[i, j*3+1]
            CL = data.iloc[i, j*3+2]
            Status = data.iloc[i, j*3+3]
            rows.append([country, year, PR, CL, Status])

    # Create a new DataFrame
    formatted_data = pd.DataFrame(rows, columns=['Country', 'Year', 'PR', 'CL', 'Status'])
    return formatted_data

# Load the Excel file
file_path = 'data/Country_and_Territory_Ratings_and_Statuses_FIW_1973-2024.xlsx'

# Format data from both sheets and combine
formatted_country_data = format_data('Country Ratings, Statuses ')
formatted_territory_data = format_data('Territory Ratings, Statuses')

# Combine the formatted data
combined_data = pd.concat([formatted_country_data, formatted_territory_data])

# Display the combined DataFrame
print(combined_data.head())

# Save the combined DataFrame to a CSV file
combined_data.to_csv('cleandata/combined_country_territory_data.csv', index=False)

       Country  Year PR CL Status
0  Afghanistan  1972  4  5     PF
1  Afghanistan  1973  7  6     NF
2  Afghanistan  1974  7  6     NF
3  Afghanistan  1975  7  6     NF
4  Afghanistan  1976  7  6     NF


# gdp_imf

In [35]:
import pandas as pd

# Load the GDP data
gdp_file_path = 'data/GDP_imf.xlsx'
gdp_data = pd.read_excel(gdp_file_path)

# Display the first few rows of the GDP data to understand its structure
gdp_data.head()

Unnamed: 0,Real GDP growth (Annual percent change),1980,1981,1982,1983,1984,1985,1986,1987,1988,...,2020,2021,2022,2023,2024,2025,2026,2027,2028,2029
0,Afghanistan,no data,no data,no data,no data,no data,no data,no data,no data,no data,...,-2.4,-14.5,-6.2,no data,no data,no data,no data,no data,no data,no data
1,Albania,2.7,5.7,2.9,1.1,2,-1.5,5.6,-0.8,-1.4,...,-3.3,8.9,4.8,3.3,3.1,3.4,3.5,3.5,3.5,3.5
2,Algeria,-5.4,3,6.4,5.4,5.6,5.6,-0.2,-0.7,-1.9,...,-5.0,3.8,3.6,4.2,3.8,3.1,2.5,2.1,2.1,2.1
3,Andorra,no data,no data,no data,no data,no data,no data,no data,no data,no data,...,-11.2,8.3,9.6,2.3,1.8,1.5,1.5,1.5,1.5,1.5
4,Angola,2.4,-4.4,0,4.2,6,3.5,2.9,4.1,6.1,...,-5.6,1.2,3.0,0.5,2.6,3.1,3.4,3.5,3.6,3.6


In [37]:
gdp_long = pd.melt(gdp_data, id_vars=['Real GDP growth (Annual percent change)'], var_name='Year', value_name='GDP')

# Rename the country column
gdp_long.rename(columns={'Real GDP growth (Annual percent change)': 'Country'}, inplace=True)

# Replace 'no data' with null values
gdp_long['GDP'].replace('no data', pd.NA, inplace=True)
gdp_long

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  gdp_long['GDP'].replace('no data', pd.NA, inplace=True)


Unnamed: 0,Country,Year,GDP
0,Afghanistan,1980,
1,Albania,1980,2.7
2,Algeria,1980,-5.4
3,Andorra,1980,
4,Angola,1980,2.4
...,...,...,...
11395,Major advanced economies (G7),2029,1.6
11396,Middle East and Central Asia,2029,3.7
11397,Other advanced economies,2029,2.1
11398,Sub-Saharan Africa,2029,4.3


In [39]:
gdp_long.to_csv('cleandata/cleaned_gdp_imf.csv', index=False)

# sport_spending_data

In [41]:
import pandas as pd

# Load the Excel file
file_path = 'data/gov_sport_spend.xlsx'

# Function to process each sheet
def process_sheet(sheet_name):
    data = pd.read_excel(file_path, sheet_name=sheet_name)

    # Extract years and notes
    years = data.columns[1::2]  # Take every second column starting from the second
    notes_columns = data.columns[2::2]  # Take every second column starting from the third

    # Create a list to collect the processed data
    rows = []

    for i in range(len(data)):
        country = data.iloc[i, 0]
        for j, year in enumerate(years):
            amount = data.iloc[i, j*2 + 1]
            note = data.iloc[i, j*2 + 2]
            rows.append([country, year, amount, note, sheet_name])

    # Create a DataFrame from the collected data
    processed_data = pd.DataFrame(rows, columns=['Country', 'Year', 'Amount', 'Note', 'Source'])
    return processed_data

# Get the sheet names
xls = pd.ExcelFile(file_path)
sheet_names = xls.sheet_names

# Process each sheet and combine the data
combined_data = pd.concat([process_sheet(sheet) for sheet in sheet_names])

# Convert Amount to numeric, coercing errors to NaN
combined_data['Amount'] = pd.to_numeric(combined_data['Amount'], errors='coerce')

# Pivot the data to get each sheet as a separate column
pivoted_data = combined_data.pivot_table(index=['Country', 'Year'], columns='Source', values='Amount', aggfunc='mean').reset_index()

pivoted_data

Source,Country,Year,Central government,General government,Local government,Recreational and sporting,State government
0,Austria,1995,,674.8,,,
1,Austria,1996,,668.1,,,
2,Austria,1997,,733.6,,,
3,Austria,1998,,689.8,,,
4,Austria,1999,,689.9,,,
...,...,...,...,...,...,...,...
834,Switzerland,2018,185.4,2177.9,1660.0,0.0,338.2
835,Switzerland,2019,203.3,2337.5,1800.8,0.0,344.6
836,Switzerland,2020,322.4,2451.9,1812.4,0.0,335.8
837,Switzerland,2021,437.1,2621.0,1844.0,0.0,368.2


In [42]:
# Save the combined data to a CSV file
pivoted_data.to_csv('cleandata/combined_sport_spending_data.csv', index=False)


# Mortality data

In [46]:
# Load the mortality rates data
mortality_file_path = 'data/mortality_rates.xlsx'
mortality_data = pd.read_excel(mortality_file_path)
# Convert the data to a long format
mortality_long = pd.melt(mortality_data, id_vars=['Country Name', 'Country Code', 'Series Name', 'Series Code'],
                         var_name='Year', value_name='Value')

# Extract the year from the 'Year' column
mortality_long['Year'] = mortality_long['Year'].str.extract('(\d{4})')

# Replace '..' with NaN
mortality_long['Value'].replace('..', pd.NA, inplace=True)

# Convert 'Value' to numeric, coercing errors to NaN
mortality_long['Value'] = pd.to_numeric(mortality_long['Value'], errors='coerce')

# Display the processed mortality data
mortality_long.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  mortality_long['Value'].replace('..', pd.NA, inplace=True)


Unnamed: 0,Country Name,Country Code,Series Name,Series Code,Year,Value
0,Afghanistan,AFG,"Mortality from CVD, cancer, diabetes or CRD be...",SH.DYN.NCOM.ZS,1960,
1,Afghanistan,AFG,Mortality rate attributed to household and amb...,SH.STA.AIRP.P5,1960,
2,Afghanistan,AFG,"Mortality rate attributed to unsafe water, uns...",SH.STA.WASH.P5,1960,
3,Afghanistan,AFG,"Mortality rate, adult, male (per 1,000 male ad...",SP.DYN.AMRT.MA,1960,601.887
4,Afghanistan,AFG,"Mortality rate, adult, female (per 1,000 femal...",SP.DYN.AMRT.FE,1960,550.189


In [48]:
# Save the processed mortality data to a CSV file
mortality_long.to_csv('cleandata/processed_mortality_data.csv', index=False)

# OECD_health_govspend

In [75]:
import pandas as pd
import numpy as np

# Load the OECD health government spending data
oecd_file_path = 'data/OECD_health_govspend.xlsx'
oecd_data = pd.read_excel(oecd_file_path, header=[0, 1])


In [76]:
# Flatten the multi-level columns
oecd_data.columns = [' '.join(col).strip() for col in oecd_data.columns.values]

# Extract relevant columns and rows
oecd_data = oecd_data.rename(columns={'Combined unit of measure Reference area': 'Country'})

# Convert the data to a long format
id_vars = [col for col in oecd_data.columns if 'Time period' in col or 'Country' in col]
oecd_long = pd.melt(oecd_data, id_vars=id_vars, var_name='Year', value_name='Value')

# Clean the 'Value' column to handle letters in front
oecd_long['Value'] = oecd_long['Value'].astype(str).str.extract('([0-9,.]+)').replace({',': ''}, regex=True).astype(float)

# Extract the year from the 'Year' column if necessary
oecd_long['Year'] = oecd_long['Year'].str.extract('(\d{4})')

# Display the processed OECD data
oecd_long

Unnamed: 0,Time period Combined unit of measure,Time period Reference area,Year,Value
0,"National currency, Current prices, Millions, A...",Australia,1995,
1,"National currency, Current prices, Millions, Euro",Austria,1995,646.46
2,"National currency, Current prices, Millions, Euro",Belgium,1995,
3,"National currency, Current prices, Millions, C...",Colombia,1995,
4,"National currency, Current prices, Millions, C...",Costa Rica,1995,
...,...,...,...,...
1031,"National currency, Current prices, Millions, Euro",European Union (27 countries from 01/02/2020),2022,60254.72
1032,"National currency, Current prices, Millions, B...",Non-OECD economies,2022,
1033,"National currency, Current prices, Millions, B...",Bulgaria,2022,196.79
1034,"National currency, Current prices, Millions, Euro",Croatia,2022,257.90


In [77]:
# Save the processed OECD data to a CSV file
oecd_long.to_csv('cleandata/processed_oecd_health_govspend_data.csv', index=False)