In [1071]:
import pandas as pd
import os, re
from pathlib import Path
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from sqlalchemy import create_engine, types
from sqlalchemy import text 
from openpyxl import load_workbook

In [1072]:
from dotenv import dotenv_values

config = dotenv_values()

# define variables for the login
pg_user = config['POSTGRES_USER']  # align the key label with your .env file !
pg_host = config['POSTGRES_HOST']
pg_port = config['POSTGRES_PORT']
pg_db = config['POSTGRES_DB']
pg_schema = config['POSTGRES_SCHEMA']
pg_pass = config['POSTGRES_PASS']

In [1073]:
url = f'postgresql://{pg_user}:{pg_pass}@{pg_host}:{pg_port}/{pg_db}'
engine = create_engine(url, echo=False)

In [1074]:
my_schema = 'team_5' # update it to your schema

with engine.begin() as conn: 
    result = conn.execute(text(f'SET search_path TO {my_schema};'))

In [1075]:
df_1= pd.read_csv('../data/Raw/World_Marriage_Dataset.csv')

In [1076]:
df_1.drop(columns=["Sr.No."], inplace=True)

In [1077]:
df_1.columns = df_1.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [1078]:
df_1.rename(columns={
    "agegroup": "age_group",
    "maritalstatus": "marital_status",
    "dataprocess": "data_process",
    "Data Collection (Start Year)": "data_collection_start_year",
    "Data Collection (End Year)": "data_collection_end_year",
    "Data Source": "data_source",
    "Country": "country",
    "Sex": "sex"
}, inplace=True)

In [1079]:
df_1.drop_duplicates(inplace=True)

df_1['data_collection_start_year'] = df_1['data_collection_start_year'].astype(str).str.replace(',', '').astype(int)
df_1['data_collection_end_year'] = df_1['data_collection_end_year'].astype(str).str.replace(',', '').astype(int)

In [1080]:
df_1.isnull().sum()

country                       0
age_group                     0
sex                           0
marital_status                0
data_process                  0
data_collection_start_year    0
data_collection_end_year      0
data_source                   0
dtype: int64

In [1081]:
#df_1.to_csv("cleaned_world_marriage.csv", index=False)

In [1082]:
#df_1.to_sql('world_marriage', engine, if_exists='replace', index=False)

In [1083]:
df_2 = pd.read_csv('../data/Raw/age-at-marriage-women.csv')

In [1084]:
df_2.columns = df_2.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [1085]:
df_2 = df_2.drop(columns=['1005564annotations'])

df_2.rename(columns={
    "entity": "country",   
}, inplace=True)

In [1086]:
df_2.drop_duplicates(inplace=True)


In [1087]:
df_2['year'] = df_2['year'].astype(str).str.replace(',', '').astype(int)

In [1088]:
df_2.isnull().sum()

country                                0
code                                   0
year                                   0
mean_age_of_women_at_first_marriage    0
dtype: int64

In [1089]:
#df_2.to_csv("cleaned_age_at_marriage_women.csv", index=False)

In [1090]:
#df_2.to_sql('age_at_marriage_women', engine, if_exists='replace', index=False)

In [1091]:
df_3= pd.read_csv('../data/Raw/marriage-rate-per-1000-inhabitants.csv')

In [1092]:
df_3.columns = df_3.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [1093]:
df_3.rename(columns={
    "entity": "country",   
}, inplace=True)

In [1094]:
df_3['year'] = df_3['year'].astype(str).str.replace(',', '').astype(int)

In [1095]:
df_3.drop_duplicates(inplace=True)


In [1096]:
df_3.isnull().sum()

country                                          0
code                                             0
year                                             0
crude_marriage_rate_marriages_per_1000_people    0
dtype: int64

In [1097]:
#df_3.to_csv("cleaned_marriage-rate-per-1000-inhabitants.csv", index=False)

In [1098]:
#df_3.to_sql('married_rate_per_1000', engine, if_exists='replace', index=False)

In [1099]:
df_4= pd.read_csv('../data/Raw/marriage-rates-in-1990-vs-2020.csv')

In [1100]:
df_4.columns = df_4.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [1101]:
df_4 = df_4.drop(columns=['worldregionsaccordingtoowid'])

df_4.rename(columns={
    "crudemarriageratemarriagesper1000people": "crude_marriage_rate",
    "crudemarriageratemarriagesper1000people1": "crude_marriage_rate_people1",
    "year1": "year_1",
    "entity": "country"
}, inplace=True)

In [1102]:
df_4.drop_duplicates(inplace=True)
df_4.dropna(inplace=True)

In [1103]:
df_4['year_1'] = pd.to_numeric(df_4['year_1'], errors='coerce').astype('Int64')

In [1104]:
df_4.isnull().sum()

country                        0
code                           0
year                           0
crude_marriage_rate            0
crude_marriage_rate_people1    0
year_1                         0
dtype: int64

In [1105]:
#df_4.to_csv("cleaned_marriage-rates-in-1990-vs-2020.csv", index=False)

In [1106]:
#df_4.to_sql('marriage_rates_in_1990_vs_2020', engine, if_exists='replace', index=False)

In [1107]:
df_5 = pd.read_csv('../data/Raw/share-of-births-outside-marriage.csv')

In [1108]:
df_5.columns = df_5.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [1109]:

df_5.rename(columns={
    "shareofbirthsoutsideofmarriageofallbirths": "share_of_births_outside_of_marriage",
    "entity": "country"
}, inplace=True)

df_5.drop_duplicates(inplace=True)

In [1110]:
df_5.isnull().sum()

country                                0
code                                   0
year                                   0
share_of_births_outside_of_marriage    0
dtype: int64

In [1111]:
#df_5.to_csv("cleaned_share-of-births-outside-marriage.csv", index=False)

In [1112]:
#df_5.to_sql('share_of_births_outside_marriage', engine, if_exists='replace', index=False)

In [1113]:
df_6 = pd.read_csv('../data/Raw/share-of-men-in-england-and-wales-who-have-ever-married-by-age.csv')

In [1114]:
df_6.columns = df_6.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

df_6.drop_duplicates(inplace=True)
df_6.sample(5)

Unnamed: 0,entity,code,year,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort
61,Women,,44,84.0,91.3,95.4,86.5,74.1,,,
2,Men,,19,0.8,0.6,2.0,2.5,0.7,0.3,0.1,0.0
49,Women,,32,75.5,85.8,92.4,79.9,61.0,45.8,34.2,
36,Women,,19,2.1,5.3,13.4,12.1,3.8,1.4,0.4,0.2
18,Men,,35,84.8,86.7,89.1,73.7,56.3,44.2,,


In [1115]:
df_6 = df_6.drop(columns=['code','proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort'])

df_6.rename(columns={
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort": "1900_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort": "1920_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort": "1940_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort": "1960_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort": "1970_birthcohort",
    "entity": "sex"
}, inplace=True)

In [1116]:
df_6.isnull().sum()

sex                 0
year                0
1900_birthcohort    0
1920_birthcohort    0
1940_birthcohort    0
1960_birthcohort    0
1970_birthcohort    0
dtype: int64

In [1117]:
#df_6.to_csv("cleaned_share-of-men-in-england-and-wales-who-have-ever-married-by-age.csv", index=False)

In [1118]:
#df_6.to_sql('men_in_england_and_wales_married_by_age', engine, if_exists='replace', index=False)

In [1119]:
df_7 = pd.read_csv('../data/Raw/share-of-births-outside-marriage.csv')

In [1120]:
df_7.columns = df_7.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [1121]:
df_7.rename(columns={
    "shareofsingleparenthouseholds": "share_of_single_parent_households",
    "entity": "country"
}, inplace=True)

df_7.drop_duplicates(inplace=True)
df_7.sample(5)

Unnamed: 0,country,code,year,shareofbirthsoutsideofmarriageofallbirths
1424,New Zealand,NZL,1971,13.9
437,Cyprus,CYP,2013,20.2
1080,Japan,JPN,2012,2.2
425,Cyprus,CYP,2001,2.5
1290,Malta,MLT,1988,1.7


In [1122]:
df_7.isnull().sum()

country                                      0
code                                         0
year                                         0
shareofbirthsoutsideofmarriageofallbirths    0
dtype: int64

In [1123]:
#df_7.to_csv("cleaned_share-of-single-parent-households.csv", index=False)

In [1124]:
#df_7.to_sql('single_parent_households', engine, if_exists='replace', index=False)

In [1125]:
df_8 = pd.read_csv('../data/Raw/share-of-women-in-england-and-wales-who-have-ever-married-by-age.csv')

In [1126]:
df_8.columns = df_8.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [1127]:
df_8['code'] = df_8['code'].fillna('GBR')
df_8.sample(5)

Unnamed: 0,entity,code,year,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort
20,Men,GBR,37,87.1,88.4,90.1,75.8,59.8,49.0,,
3,Men,GBR,20,2.4,2.2,6.0,6.2,1.9,0.7,0.3,0.1
54,Women,GBR,37,80.6,89.4,94.4,84.1,69.4,57.8,,
34,Women,GBR,17,0.1,0.3,1.0,1.3,0.4,0.1,0.0,0.0
18,Men,GBR,35,84.8,86.7,89.1,73.7,56.3,44.2,,


In [1128]:
df_8 = df_8.drop(columns=['code','proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort'])

df_8.rename(columns={
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort": "1900_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort": "1920_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort": "1940_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort": "1960_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort": "1970_birthcohort",
    "entity": "sex"
}, inplace=True)

df_8.drop_duplicates(inplace=True)
df_8.sample(5)

Unnamed: 0,sex,year,1900_birthcohort,1920_birthcohort,1940_birthcohort,1960_birthcohort,1970_birthcohort
61,Women,44,84.0,91.3,95.4,86.5,74.1
52,Women,35,79.0,88.3,93.8,82.8,66.7
65,Women,48,85.0,91.8,95.6,87.2,75.7
25,Men,42,90.7,90.6,91.6,79.1,65.8
7,Men,24,30.2,34.4,49.8,33.7,14.8


In [1129]:
df_8.isnull().sum()

sex                 0
year                0
1900_birthcohort    0
1920_birthcohort    0
1940_birthcohort    0
1960_birthcohort    0
1970_birthcohort    0
dtype: int64

In [1130]:
#df_8.to_csv("cleaned_share-of-women-in-england-and-wales-who-have-ever-married-by-age.csv", index=False)

In [1131]:
#df_8.to_sql('women_in_england_and_wales_married_by_age', engine, if_exists='replace', index=False)

In [1132]:
#pip install openpyxl pywin32

In [1133]:
df_excel_1 = pd.read_excel('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx')

In [1134]:
#all_sheets = pd.read_excel('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx', sheet_name=None)

In [1135]:
xls_1 = pd.ExcelFile('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx')
print(xls_1.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'MARITAL_STATUS_BY_AGE', 'CURRENTLY MARRIED', 'EVER_MARRIED', 'SMAM']


In [1136]:
excel_1 = '../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx'

# Output directory (make sure it exists)
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)

# List of sheets you want to extract
sheets_to_extract = ['MARITAL_STATUS_BY_AGE', 'CURRENTLY MARRIED', 'EVER_MARRIED', 'SMAM']

In [1137]:
"""for sheet in sheets_to_extract:
    # Read just this sheet into a DataFrame
    df_excel_1 = pd.read_excel(excel_1, sheet_name=sheet)
    
    # Optional: Clean the filename (replace spaces with underscores, etc.)
    csv_name = sheet.replace(' ', '_').lower() + '.csv'
    csv_path = os.path.join(output_dir, csv_name)
    
    # Save the DataFrame as CSV
    df_excel_1.to_csv(csv_path, index=False)
    print(f"Saved: {csv_path}")
"""

'for sheet in sheets_to_extract:\n    # Read just this sheet into a DataFrame\n    df_excel_1 = pd.read_excel(excel_1, sheet_name=sheet)\n    \n    # Optional: Clean the filename (replace spaces with underscores, etc.)\n    csv_name = sheet.replace(\' \', \'_\').lower() + \'.csv\'\n    csv_path = os.path.join(output_dir, csv_name)\n    \n    # Save the DataFrame as CSV\n    df_excel_1.to_csv(csv_path, index=False)\n    print(f"Saved: {csv_path}")\n'

In [1138]:
xls_2 = pd.ExcelFile('../data/Raw/undesa_pd_2019_world_fertility_dataset.xlsx')
print(xls_2.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'FERTILITY INDICATORS']


In [1139]:
excel_2 = '../data/Raw/undesa_pd_2019_world_fertility_dataset.xlsx'
sheet_name = 'FERTILITY INDICATORS'
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)

df_excel_2 = pd.read_excel(excel_2, sheet_name=sheet_name)


In [1140]:
"""csv_name = sheet_name.replace(' ', '_').lower() + '.csv'
csv_path = os.path.join(output_dir, csv_name)
df_excel_2.to_csv(csv_path, index=False)
print(f"Saved: {csv_path}")
"""

'csv_name = sheet_name.replace(\' \', \'_\').lower() + \'.csv\'\ncsv_path = os.path.join(output_dir, csv_name)\ndf_excel_2.to_csv(csv_path, index=False)\nprint(f"Saved: {csv_path}")\n'

In [1141]:
xls_3 = pd.ExcelFile('../data/Raw/undesa_pd_ds_1970-2030_fp_rev-2024_rev.xlsx')
print(xls_3.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'Countries', 'Regions']


In [1142]:
excel_3 = '../data/Raw/undesa_pd_ds_1970-2030_fp_rev-2024_rev.xlsx'
sheets_to_extract = ['Countries', 'Regions']
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)


In [1143]:
"""
for sheet in sheets_to_extract:
    df = pd.read_excel(excel_3, sheet_name=sheet)
    csv_name = sheet.replace(' ', '_').lower() + '.csv'
    csv_path = os.path.join(output_dir, csv_name)
    df.to_csv(csv_path, index=False)
    print(f"Saved: {csv_path}")

"""

'\nfor sheet in sheets_to_extract:\n    df = pd.read_excel(excel_3, sheet_name=sheet)\n    csv_name = sheet.replace(\' \', \'_\').lower() + \'.csv\'\n    csv_path = os.path.join(output_dir, csv_name)\n    df.to_csv(csv_path, index=False)\n    print(f"Saved: {csv_path}")\n\n'

In [1144]:
df_9 = pd.read_csv('../data/Raw/unpopulation_dataportal_20250728095844.csv')
df_9.sample(5)

Unnamed: 0,IndicatorId,IndicatorName,IndicatorShortName,Source,SourceYear,Author,LocationId,Location,Iso2,Iso3,...,AgeStart,AgeEnd,Age,CategoryId,Category,EstimateTypeId,EstimateType,EstimateMethodId,EstimateMethod,Value
12939,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,440,Lithuania,LT,LTU,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,60.42
6171,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,214,Dominican Republic,DO,DOM,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,58.18
15864,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,540,New Caledonia,NC,NCL,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,41.57
4866,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,175,Mayotte,YT,MYT,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,55.39
23099,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,796,Turks and Caicos Islands,TC,TCA,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,53.58


In [1145]:
df_9.columns = df_9.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_9.sample(5)

Unnamed: 0,indicatorid,indicatorname,indicatorshortname,source,sourceyear,author,locationid,location,iso2,iso3,...,agestart,ageend,age,categoryid,category,estimatetypeid,estimatetype,estimatemethodid,estimatemethod,value
13725,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,466,Mali,ML,MLI,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,77.86
15340,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,520,Nauru,NR,NRU,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,3,Projection,55.33
16990,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,585,Palau,PW,PLW,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,52.27
24586,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,862,Venezuela (Bolivarian Republic of),VE,VEN,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,53.8
21191,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,729,Sudan,SD,SDN,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,70.99


In [1146]:
df_9 = df_9.drop(columns=['indicatorid','indicatorshortname',
    'source',
    'author', 'locationid', 'iso2','estimatetypeid','category','categoryid','agestart','ageend','author','ageid', 'estimatetype','variantid','sexid','timeid'])

df_9.rename(columns={
    "sourceyear": "year",
    "location": "country",
    "estimatemethodid": "estimate_method",
    "iso3": "code",
}, inplace=True)



In [1147]:
df_9.drop_duplicates(inplace=True)

In [1148]:
df_9

Unnamed: 0,indicatorname,year,country,code,time,variant,sex,age,estimate_method,estimatemethod,value
0,Currently married (Percent),2024,Afghanistan,AFG,1970,Median,Female,15-49,2,Interpolation,80.94
2,Currently married (Percent),2024,Afghanistan,AFG,1971,Median,Female,15-49,2,Interpolation,80.90
4,Currently married (Percent),2024,Afghanistan,AFG,1972,Median,Female,15-49,2,Interpolation,80.87
6,Currently married (Percent),2024,Afghanistan,AFG,1973,Median,Female,15-49,2,Interpolation,80.84
8,Currently married (Percent),2024,Afghanistan,AFG,1974,Median,Female,15-49,2,Interpolation,80.53
...,...,...,...,...,...,...,...,...,...,...,...
25078,Currently married (Percent),2024,Zambia,ZMB,2021,Median,Female,15-49,3,Projection,54.31
25080,Currently married (Percent),2024,Zambia,ZMB,2022,Median,Female,15-49,3,Projection,53.82
25082,Currently married (Percent),2024,Zambia,ZMB,2023,Median,Female,15-49,3,Projection,53.35
25084,Currently married (Percent),2024,Zambia,ZMB,2024,Median,Female,15-49,3,Projection,52.91


In [1149]:
df_9.isnull().sum()

indicatorname      0
year               0
country            0
code               0
time               0
variant            0
sex                0
age                0
estimate_method    0
estimatemethod     0
value              0
dtype: int64

In [1150]:
#df_9.to_csv("cleaned_unpopulation_dataportal.csv", index=False)

In [1151]:
#df_9.to_sql('unpopulation_dataportal', engine, if_exists='replace', index=False)

In [1152]:
df_10 = pd.read_csv('../data/processed/countries_un.csv',  header=5, low_memory=False)

In [1153]:
df_10.columns = (
    df_10.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
)
df_10.sample(10)

Unnamed: 0,countryorarea,isocode,indicator,year,agegroup,percentage,number,dataprocess
23734,Sri Lanka,144,Married or in-union women,2020,45-49,84.62279,597.604871,Projection
54966,Guinea,324,Married or in-union women,2036,45-49,83.459179,354.996538,Projection
19754,Cambodia,116,Married or in-union women,2009,25-29,78.99375,520.48034,Estimate
44626,French Polynesia,258,Married or in-union women,2040,25-29,65.703688,6.026342,Projection
64719,Jamaica,388,Married or in-union women,2040,15-49,41.049274,269.185745,Projection
138965,United Republic of Tanzania,834,Married or in-union women,2006,40-44,79.505455,598.436761,Estimate
23824,Sri Lanka,144,Married or in-union women,2032,15-19,5.153481,43.320267,Projection
60427,Iran (Islamic Republic of),364,Married or in-union women,1990,30-34,90.941333,1751.079466,Estimate
38394,Ethiopia,231,Married or in-union women,1990,25-29,87.303055,1573.914746,Estimate
53927,Guatemala,320,Married or in-union women,1987,15-49,63.4471,1172.122354,Estimate


In [1154]:
df_10.rename(columns={
    "dataprocess": "data_process",
}, inplace=True)

df_10.drop_duplicates(inplace=True)
df_10.sample(5)

Unnamed: 0,countryorarea,isocode,indicator,year,agegroup,percentage,number,data_process
76520,"China, Macao SAR",446,Married or in-union women,1977,15-19,1.844375,0.285648,Estimate
74052,Libya,434,Married or in-union women,1992,35-39,83.420667,83.506173,Estimate
1874,Algeria,12,Married or in-union women,2042,25-29,49.350543,1163.677419,Projection
131759,United Arab Emirates,784,Married or in-union women,1996,15-49,61.0576,286.020051,Estimate
98971,Palau,585,Married or in-union women,2029,30-34,53.982579,0.230236,Projection


In [1155]:
for col in ['percentage', 'number']:
    if col in df_10.columns:
        df_10[col] = (
            df_10[col]
            .astype(str)
            .str.replace(',', '.', regex=False)
            .str.extract(r'([-+]?[0-9]*\.?[0-9]+)', expand=False)
            .astype(float)
            .round(2)
        )

In [1156]:
unnamed_cols = [col for col in df_10.columns if 'unnamed' in col.lower()]
df_10.drop(columns=unnamed_cols, inplace=True)

In [1157]:
df_10.dropna(inplace=True)

In [1158]:
df_10.isnull().sum()

countryorarea    0
isocode          0
indicator        0
year             0
agegroup         0
percentage       0
number           0
data_process     0
dtype: int64

In [1159]:
df_10.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145800 entries, 0 to 145799
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   countryorarea  145800 non-null  object 
 1   isocode        145800 non-null  int64  
 2   indicator      145800 non-null  object 
 3   year           145800 non-null  int64  
 4   agegroup       145800 non-null  object 
 5   percentage     145800 non-null  float64
 6   number         145800 non-null  float64
 7   data_process   145800 non-null  object 
dtypes: float64(2), int64(2), object(4)
memory usage: 8.9+ MB


In [1160]:
#df_10.to_csv("cleaned_countries_1970_2025_un.csv", index=False)

In [1161]:
#df_10.to_sql('countries_1970_2025_un', engine, if_exists='replace', index=False)

In [1162]:
df_11 = pd.read_csv('../data/processed/currently_married_un.csv',  header=2, low_memory=False)

In [1163]:
df_11.sample(8)

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,AgeGroup,AgeStart,AgeEnd,DataValue,DataProcess,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Data,Note on Country and Population
31288,Mexico,484,1970,1970,Men,[25-29],25,29,71.48,Census,1970 Census,1015,Mexico 1970 Census,UNSD,1.0,Data have not been adjusted for underenumeration.,
20055,Hungary,348,1980,1980,Women,[55-59],55,59,69.21,Census,1980 Census,1092,Hungary 1980 Census,UNSD,,,
27449,Latvia,428,2014,2014,Women,[35-39],35,39,53.66,Estimate,2014 Estimate,2142,Latvia 2014 Estimate,UNSD,,,
32607,Mozambique,508,2011,2011,Men,[50-54],50,54,93.1,Survey,2011 DHS,5064,Mozambique 2011 Demographic and Health Survey,DHS_STATcompiler,1.0,,
19240,Guyana,328,2000,2000,Men,[35-39],35,39,84.5,Survey,2000 MICS_HH,1939,Guyana 2000 Multiple Indicator Cluster Survey,MICS_HH,1.0,,
4026,Bolivia (Plurinational State of),68,1998,1998,Men,[25-29],25,29,64.2,Survey,1998 DHS,1766,Bolivia 1998 Demographic and Health Survey,DHS_STATcompiler,1.0,,
42160,Sao Tome and Principe,678,2000,2000,Women,[15-19],15,19,22.87,Survey,2000 MICS_HH,1899,Sao Tome and Principe 2000 Multiple Indicator ...,MICS_HH,1.0,,
34665,New Zealand,554,2006,2006,Men,[25-29],25,29,23.27,Census,2006 Census,1306,New Zealand 2006 Census,UNSD,,Data randomly rounded to protect confidentiali...,


In [1164]:
df_11.columns = (
    df_11.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
    )
df_11.sample(8)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,agegroup,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteondata,noteoncountryandpopulation
28666,Liechtenstein,438,2006,2006,Women,[40-44],40,44,71.04,Estimate,2006 Estimate,2145,Liechtenstein 2006 Estimate,UNSD,,,
38897,Puerto Rico,630,1990,1990,Men,[50-54],50,54,80.38,Census,1990 Census,391,Puerto Rico 1990 Census,UNSD,1.0,,Including military personnel.
24403,Israel,376,2007,2007,Women,[70-74],70,74,50.34,Estimate,2007 Estimate,2127,Israel 2007 Estimate,UNSD,,,Including data for East Jerusalem and Israeli ...
464,Algeria,12,2012,2013,Women,[35-39],35,39,69.41,Survey,2012 MICS_HH,4984,Algeria 2012 Multiple Indicator Cluster Survey,MICS_HH,,,
19256,Guyana,328,2000,2000,Women,[50-54],50,54,63.6,Survey,2000 MICS_HH,1939,Guyana 2000 Multiple Indicator Cluster Survey,MICS_HH,1.0,,
38712,Portugal,620,1973,1973,Women,[20-24],20,24,44.35,Estimate,1973 Estimate,2193,Portugal 1973 Estimate,UNSD,,,
33951,Netherlands,528,2001,2001,Men,[75+],75,999,67.28,Census,2001 Census,1347,Netherlands 2001 Census,UNSD,,,
50491,Tuvalu,798,2007,2007,Women,[45-49],45,49,82.6,Survey,2007 DHS,3329,Tuvalu 2007 Demographic Health Survey (national),National statistics,,,


In [1165]:
df_11 = df_11.drop(columns = ['datacataloglongname', 'datacatalogid', 'yearstart' , 'yearend', 'noteondata', 'noteoncountryandpopulation', 'including_consensual_unions'])

df_11.rename(columns={
    "agestart": "age_start",
    "countryorarea": "country",
    "datasource": "data_source",
    "datavalue" : "data_value"
}, inplace=True)

df_11.sample(10)

Unnamed: 0,country,isocode,sex,agegroup,age_start,ageend,data_value,dataprocess,datacatalogshortname,data_source
19088,Guinea-Bissau,624,Women,[15-19],15,19,41.53,Census,1991 Census,INED
17884,Greece,300,Men,[35-39],35,39,83.71,Census,1991 Census,UNSD
19287,Guyana,328,Men,[30-34],30,34,72.7,Survey,2005 AIS,DHS_STATcompiler
11482,Denmark,208,Women,[25-29],25,29,69.02,Estimate,1994 Estimate,UNSD
5231,Burkina Faso,854,Men,[55-59],55,59,94.7,Survey,1993 DHS,DHS_HH
35865,Norway,578,Men,[60-64],60,64,79.17,Estimate,1984 Estimate,UNSD
43262,Sierra Leone,694,Men,[75+],75,999,80.01,Survey,2000 MICS_HH,MICS_HH
45577,Sri Lanka,144,Men,[25-29],25,29,49.53,Census,2001 Census,UNSD
25587,Jordan,400,Men,[70-74],70,74,89.5,Survey,1976 WFS,National statistics
21626,Iceland,352,Men,[10-14],10,14,0.0,Estimate,1990 Estimate,UNSD


In [1166]:
df_11.drop_duplicates(inplace=True)

In [1167]:
df_11.isnull().sum()

country                 0
isocode                 0
sex                     0
agegroup                0
age_start               0
ageend                  0
data_value              0
dataprocess             0
datacatalogshortname    0
data_source             0
dtype: int64

In [1168]:
#df_11.to_csv("cleaned_currently_married_un.csv", index=False)

In [1169]:
#df_11.to_sql('currently_married_un', engine, if_exists='replace', index=False)

In [1170]:
df_12 = pd.read_csv('../data/processed/ever_married_un.csv', header= 2, low_memory = False)
df_12.head()

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,AgeGroup,AgeStart,AgeEnd,DataValue,DataProcess,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Data,Note on Country and Population
0,Afghanistan,4,1972,1974,Men,[15-19],15,19,7.7,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
1,Afghanistan,4,1972,1974,Men,[20-24],20,24,32.6,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
2,Afghanistan,4,1972,1974,Men,[25-29],25,29,61.4,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
3,Afghanistan,4,1972,1974,Men,[30-34],30,34,83.0,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
4,Afghanistan,4,1972,1974,Men,[35-39],35,39,91.2,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,


In [1171]:
df_12.columns = (
    df_12.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
    )
df_12.sample(8)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,agegroup,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteondata,noteoncountryandpopulation
37593,Norway,578,1974,1974,Men,[70-74],70,74,87.41,Estimate,1974 Estimate,2180,Norway 1974 Estimate,UNSD,,,
17995,Germany,276,1993,1993,Men,[35-39],35,39,79.93,Estimate,1993 Estimate,2102,Germany 1993 Estimate,UNSD,,,
9136,Colombia,170,2000,2000,Men,[15-19],15,19,3.4,Survey,2000 DHS,1861,Colombia 2000 Demographic and Health Survey,DHS_HH,1.0,,
23367,Iceland,352,2011,2011,Women,[40-44],40,44,84.16,Census,2011 Census,4826,Iceland 2011 Census,Eurostat,1.0,Estimates computed based on data on marital st...,
25660,Israel,376,1993,1993,Men,[20-24],20,24,14.48,Estimate,1993 Estimate,2127,Israel 1993 Estimate,UNSD,,,Including data for East Jerusalem and Israeli ...
33017,Mexico,484,1995,1995,Men,[20-24],20,24,35.45,Census,1995 Sample Census,4466,Mexico 1995 Sample Census,IPUMS,1.0,Data are based on a 0.4 per cent sample.,
18021,Germany,276,1994,1994,Men,[25-29],25,29,32.37,Estimate,1994 Estimate,2102,Germany 1994 Estimate,UNSD,,,
39064,Pakistan,586,2005,2005,Women,[35-39],35,39,97.83,Estimate,2005 Estimate,2185,Pakistan 2005 Estimate,UNSD,,,Excluding data for disputed territory.


In [1172]:
df_12 = df_12.drop(columns = ['yearstart', 'yearend', 'datacatalogshortname', 'datacatalogid', 'datacataloglongname', 'including_consensual_unions', 'noteondata', 'noteoncountryandpopulation'])

df_12.rename(columns={
    "agestart": "age_start",
    "ageend": "age_end",
    "countryorarea": "country"
}, inplace=True)
df_12.sample(8)

Unnamed: 0,country,isocode,sex,agegroup,age_start,age_end,datavalue,dataprocess,datasource
28796,Latvia,428,Women,[15-19],15,19,1.14,Estimate,UNSD
3997,Bolivia (Plurinational State of),68,Women,[35-39],35,39,90.28,Census,UNSD
54963,Western Sahara,732,Women,[55-59],55,59,90.0,Census,UNSD
36601,New Zealand,554,Men,[70-74],70,74,94.09,Census,UNSD
24130,Indonesia,360,Women,[75+],75,999,98.99,Survey,MICS_HH
9498,Congo,178,Men,[50-54],50,54,93.92,Census,UNSD
55504,Zimbabwe,716,Women,[30-34],30,34,97.4,Survey,DHS_STATcompiler
22056,Hungary,348,Women,[35-39],35,39,88.46,Estimate,UNSD


In [1173]:
df_12.dropna(inplace=True)

In [1174]:
df_12.isnull().sum()

country        0
isocode        0
sex            0
agegroup       0
age_start      0
age_end        0
datavalue      0
dataprocess    0
datasource     0
dtype: int64

In [1175]:
#df_12.to_csv("cleaned_ever_married_un.csv", index=False)

In [1176]:
#df_12.to_sql('ever_married_un', engine, if_exists= 'replace', index= False)

In [1177]:
df_13 = pd.read_csv('../data/processed/fertility_indicators_un.csv', header=6, low_memory=False)
df_13.head()

Unnamed: 0,Country or Area,Country or Area Code,Age Group,Indicator,Date,Value,Series,DataType,Data Source Type,Survey Programme,Data Source Inventory ID,Data Source Name,Data Source Name (short),Data Source Start Year,Data Source End Year,Reference,Reference Year
0,Afghanistan,4,[Total],TFR,1964.977051,7.966653,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
1,Afghanistan,4,[Total],TFR,1965.977051,8.212275,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
2,Afghanistan,4,[Total],TFR,1966.977051,8.317603,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
3,Afghanistan,4,[Total],TFR,1967.977051,8.225812,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
4,Afghanistan,4,[Total],TFR,1968.977051,8.068459,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012


In [1178]:
df_13.columns = (df_13.columns
        .str.lower()
        .str.strip()
        .str.replace(' ', '')
        .str.replace('(', '')
        .str.replace(')', '')
        .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
        )

df_13.sample(6)

Unnamed: 0,countryorarea,countryorareacode,agegroup,indicator,date,value,series,datatype,datasourcetype,surveyprogramme,datasourceinventoryid,datasourcename,datasourcenameshort,datasourcestartyear,datasourceendyear,reference,referenceyear
57303,Puerto Rico,630,[Total],TFR,1961.5,4.358041,"Register,Computed rate from DYB,DYB,541-135-19",Computed rate from DYB,Register,VR,541,Vital Registration,Register,1961,1961.0,Demographic Yearbook,1966.0
34195,India,356,[25-29],ASFR2529,2007.5,158.3,"SRS,Computed rate from reported ASFR,Report",Computed rate from reported ASFR,SRS,SRS,1865,India Sample Registration System,SRS,1968,,India Sample Registration System,
28281,Germany,276,[30-34],ASFR3034,1966.5,89.512,"Estimates,Fertility data (Adjusted),HFC-ODE,21...",Fertility data (adjusted),Estimate,Estimate,2102,All sources of estimates,Estimates,1966,1966.0,European Demographic Observatory (ODE). Data c...,2011.0
9022,Bosnia and Herzegovina,70,[15-19],ASFR1519,1968.5,51.638,"Estimates,Fertility data (Adjusted),HFC-ODE,20...",Fertility data (adjusted),Estimate,Estimate,2048,All sources of estimates,Estimates,1968,1968.0,European Demographic Observatory (ODE). Data c...,2011.0
50440,Nepal,524,[15-19],ASFR1519,1993.0,149.0,"Estimates,Official estimates,Article",Official estimates,Estimate,Estimate,2169,All sources of estimates,Estimates,1993,1993.0,"Retherford, R. D. (1999). ""The trend of fertil...",1999.0
14186,China,156,[25-29],ASFR2529,1985.337782,161.2,"2000 Census,Reconstructed Birth Histories data...",Reconstructed Birth Histories data,Census,Census,272,China 2000 Census,2000 Census,2000,2000.0,"Fertility Estimates for Provinces of China, 19...",2007.0


In [1179]:
df_13 = df_13.drop(columns=['countryorareacode','indicator','datasourceinventoryid','surveyprogramme','series','datasourcename','reference','referenceyear'])

df_13.replace({
    "agegroup": "age_group",
    "countryorarea": "country",
    "datatype": "data_type",
},inplace=True)

In [1180]:
df_13['date'] = df_13['date'].astype(int)
df_13['value'] = df_13['value'].round(2)
df_13.sample(12)

Unnamed: 0,countryorarea,agegroup,date,value,datatype,datasourcetype,datasourcenameshort,datasourcestartyear,datasourceendyear
52957,Nigeria,[25-29],1965,275.24,Birth histories,Survey,1981-1982 WFS,1981,1982
56002,Philippines,[15-19],1985,48.0,Birth histories,Survey,1986 CPS,1986,1986
11928,Cameroon,[25-29],1971,282.16,Birth histories,Survey,1978 WFS,1978,1978
69301,TFYR Macedonia,[35-39],1955,149.09,Fertility data (adjusted),Estimate,Estimates,1955,1955
30109,Grenada,[Total],2014,28.47,Computed rate from DYB,Register,Register,2014,2014
17191,Costa Rica,[15-19],1970,102.0,Fertility data (adjusted),Estimate,Estimates,1970,1970
66171,Sri Lanka,[35-39],1970,134.0,Direct,Register,Register,1970,1970
65307,South Sudan,[20-24],2002,307.19,Birth histories,Survey,2010 HHS-II,2010,2010
16571,Colombia,[20-24],2007,125.0,Direct,Survey,2010 DHS,2009,2010
42335,Lesotho,[20-24],1997,206.43,Birth histories,Survey,2009 DHS,2009,2010


In [1181]:
#df_13.to_csv("cleaned_fertility_indicators.csv", index=False)

In [1182]:
#df_13.to_sql('fertility_indicators_un',engine, if_exists='replace', index=False)

In [1183]:
df_14 = pd.read_csv('../data/processed/marital_status_by_age_un.csv', header= 2, low_memory=False)
df_14.head()

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,MaritalStatus,Non-standard_AgeGroups,Series_contains_Non-standard_AgeGroups,AgeGroup,AgeStart,...,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Age groups,Note on Marital Status,Note on Data,Note on Country and Population,Note Other
0,Afghanistan,4,1972,1974,Men,Divorced,,,[15-19],15,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
1,Afghanistan,4,1972,1974,Men,Divorced,,,[20-24],20,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
2,Afghanistan,4,1972,1974,Men,Divorced,,,[25-29],25,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
3,Afghanistan,4,1972,1974,Men,Divorced,,,[30-34],30,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
4,Afghanistan,4,1972,1974,Men,Divorced,,,[35-39],35,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,


In [1184]:
df_14.columns= (df_14.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '' , regex=True)  
    )
df_14.sample(5)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,maritalstatus,nonstandard_agegroups,series_contains_nonstandard_agegroups,agegroup,agestart,...,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteonagegroups,noteonmaritalstatus,noteondata,noteoncountryandpopulation,noteother
96775,Grenada,308,1981,1981,Men,Widowed,,,[35-39],35,...,1981 Census,398,Grenada 1981 Census,UNSD,,,,,,
219256,Sierra Leone,694,2008,2008,Men,Not living together,,,[15-19],15,...,2008 DHS,4550,Sierra Leone 2008 Demographic and Health Survey,DHS_STATcompiler,,,,,,
166652,Namibia,516,2001,2001,Men,Widowed,,,[45-49],45,...,2001 Census,1283,Namibia 2001 Census,UNSD,,,,,,
102858,Honduras,340,1988,1988,Men,Married,,,[30-34],30,...,1988 Census,1342,Honduras 1988 Census,US Census Bureau,,,,,,
164055,Mozambique,508,1970,1970,Men,Widowed,,,[30-34],30,...,1970 Census,847,Mozambique 1970 Census,US Census Bureau,,,,,,


In [1185]:
df_14 = df_14.drop(columns=['datacataloglongname', 'noteondata', 'noteoncountryandpopulation','noteonagegroups', 'noteother',
                             'including_consensual_unions','isocode', 'datacatalogid', 'noteonmaritalstatus', 'series_contains_nonstandard_agegroups','nonstandard_agegroups'])

df_14.rename(columns={
    "countryorarea": "country",
    "agegroup": "age_group",
    "maritalstatus": "marital_status",
    "yearstart": "year_start",
    "yearend": "year_end",
    }, inplace =True
    )

df_14.sample(10)

Unnamed: 0,country,year_start,year_end,sex,marital_status,age_group,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datasource
49893,Costa Rica,2003,2003,Women,Separated,[40-44],40,44,10.86,Estimate,2003 Estimate,UNSD
127938,Italy,2007,2007,Women,Never married,[35-39],35,39,17.4,Survey,2007 GGS,GGS
225584,Solomon Islands,2009,2009,Women,Married,[30-34],30,34,77.3,Census,2009 Census,National statistics
48084,Cook Islands,1976,1976,Women,Divorced,[20-24],20,24,0.0,Census,1976 Census,UNSD
252877,Uganda,2011,2011,Women,Widowed,[40-44],40,44,10.9,Survey,2011 DHS,DHS_STATcompiler
98448,Guatemala,2018,2018,Men,Consensual union,[15-19],15,19,5.32,Census,2018 Census,National statistics
118067,Indonesia,2005,2005,Women,Single,[45-49],45,49,1.97,Survey,2005 SUPAS,UNSD
178048,Niger,2001,2001,Men,Divorced,[60-64],60,64,11.44,Census,2001 Census,UNSD
60727,Denmark,1981,1981,Women,Widowed,[15-19],15,19,0.0,Census,1981 Census,UNSD
175409,New Zealand,1986,1986,Men,Divorced,[20-24],20,24,0.21,Census,1986 Census,UNSD


In [1186]:
df_14.drop_duplicates(inplace=True)
df_14.isnull().sum()

country                 0
year_start              0
year_end                0
sex                     0
marital_status          0
age_group               0
agestart                0
ageend                  0
datavalue               0
dataprocess             0
datacatalogshortname    0
datasource              0
dtype: int64

In [1187]:
#df_14.to_csv("cleaned_marital_status_by_age_un.csv", index=False)

In [1188]:
#df_14.to_sql('marital_status_by_age_un', engine, if_exists='replace', index=False)

In [1189]:
df_15 = pd.read_csv('../data/processed/regions_un.csv', header=5, low_memory= False)
df_15.head(10)

Unnamed: 0,Region and subregion,ISO code,Regional Classification,Indicator,Year,AgeGroup,Percentage,Number,DataProcess
0,World,900,M49,Married or in-union women,1970,15-19,22.576683,71867.82,Estimate
1,World,900,M49,Married or in-union women,1970,20-24,63.802057,162860.4,Estimate
2,World,900,M49,Married or in-union women,1970,25-29,87.174827,182681.1,Estimate
3,World,900,M49,Married or in-union women,1970,30-34,90.825027,179121.4,Estimate
4,World,900,M49,Married or in-union women,1970,35-39,90.284386,161526.3,Estimate
5,World,900,M49,Married or in-union women,1970,40-44,86.483531,139334.4,Estimate
6,World,900,M49,Married or in-union women,1970,45-49,82.680237,116088.4,Estimate
7,World,900,M49,Married or in-union women,1970,15-49,69.379111,1013480.0,Estimate
8,World,900,M49,Married or in-union women,1971,15-19,22.630416,74127.62,Estimate
9,World,900,M49,Married or in-union women,1971,20-24,63.613178,170087.3,Estimate


In [1190]:
df_15.columns = (df_15.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(','')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
    )
df_15.sample(6)

Unnamed: 0,regionandsubregion,isocode,regionalclassification,indicator,year,agegroup,percentage,number,dataprocess
6126,Middle Africa,911,M49,Married or in-union women,2006,45-49,73.249282,3098.582,Estimate
5532,Eastern Africa,910,M49,Married or in-union women,2013,35-39,80.805705,15796.35,Estimate
22104,Polynesia,957,SDG-M49,Married or in-union women,1979,15-19,10.447188,5.807592,Estimate
20639,Australia and New Zealand,1834,SDG-M49,Married or in-union women,2044,15-49,47.389856,3693.792,Projection
8349,Western Africa,914,M49,Married or in-union women,2041,40-44,82.132222,26586.65,Projection
23911,Developing countries,902,Development group,Married or in-union women,2042,15-49,62.227151,1180004.0,Projection


In [1191]:
df_15 = df_15.drop(columns=['regionalclassification'])

df_15.rename(columns={
    "regionandsubregion": "region",
    "isocode": "iso_code",
    "agegroup": "age_group",
    "dataprocess": "process"
}, inplace=True)

df_15.sample(10)

Unnamed: 0,region,iso_code,indicator,year,age_group,percentage,number,process
28447,No income group available,1518,Married or in-union women,2042,15-49,47.377388,7101.470184,Projection
19781,Australia and New Zealand,1834,Married or in-union women,1991,25-29,66.368365,560.514357,Estimate
21839,Micronesia,954,Married or in-union women,2026,15-49,49.357312,123.560107,Projection
8510,Asia,935,Married or in-union women,1980,45-49,85.53913,92171.853566,Estimate
14290,Southern Europe,925,Married or in-union women,1974,25-29,73.58381,3397.377013,Estimate
26509,Lower-middle-income countries,1501,Married or in-union women,2043,40-44,86.530532,243151.641533,Projection
26261,Lower-middle-income countries,1501,Married or in-union women,2012,40-44,87.540949,148054.076038,Estimate
7568,Southern Africa,913,Married or in-union women,2025,15-19,1.156617,73.119968,Projection
6367,Middle Africa,911,Married or in-union women,2036,15-49,51.470091,74217.772396,Projection
3641,Oceania excluding Australia and New Zealand,543,Married or in-union women,2020,20-24,52.990194,611.220689,Estimate


In [1192]:
df_15.dropna(inplace=True)
df_15.isnull().sum()

region        0
iso_code      0
indicator     0
year          0
age_group     0
percentage    0
number        0
process       0
dtype: int64

In [1193]:
print(df_15['number'] % 1 != 0)

0        True
1        True
2        True
3        True
4        True
         ... 
28507    True
28508    True
28509    True
28510    True
28511    True
Name: number, Length: 28512, dtype: bool


In [1194]:
df_15['percentage'] = df_15['percentage'].round(2)
df_15['number'] = df_15['number'].astype(int)
df_15.head(10)

Unnamed: 0,region,iso_code,indicator,year,age_group,percentage,number,process
0,World,900,Married or in-union women,1970,15-19,22.58,71867,Estimate
1,World,900,Married or in-union women,1970,20-24,63.8,162860,Estimate
2,World,900,Married or in-union women,1970,25-29,87.17,182681,Estimate
3,World,900,Married or in-union women,1970,30-34,90.83,179121,Estimate
4,World,900,Married or in-union women,1970,35-39,90.28,161526,Estimate
5,World,900,Married or in-union women,1970,40-44,86.48,139334,Estimate
6,World,900,Married or in-union women,1970,45-49,82.68,116088,Estimate
7,World,900,Married or in-union women,1970,15-49,69.38,1013479,Estimate
8,World,900,Married or in-union women,1971,15-19,22.63,74127,Estimate
9,World,900,Married or in-union women,1971,20-24,63.61,170087,Estimate


In [1195]:
#df_15.to_csv('cleaned_regions_un.csv', index=False)



In [1196]:
#df_15.to_sql('regions_un', engine, if_exists='replace',index=False)

In [1197]:
df_16_1 = pd.read_csv('../data/Raw/OECD/SF_1_1_Family_size_and_composition - Sayfa1.csv')
df_16_1
#Data for Chart SF1.1.A. Average size of households by household type, 2024a
# avg_size_all	avg_size_couple_with_children	avg_size_single_parent_with_children		

Unnamed: 0,Country,All households,Couple households with children,Single parent households with children
0,Mexico,356,408.0,276.0
1,Costa Rica,346,437.0,344.0
2,Türkiye,320,410.0,280.0
3,Israel,319,465.0,286.0
4,Columbia,310,,
5,Slovak Republic,310,380.0,250.0
6,Chile,280,,
7,Iceland,270,412.0,261.0
8,New Zealand,261,388.0,267.0
9,Greece,260,380.0,250.0


In [1198]:
df_16_1.columns = df_16_1.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [1199]:
df_16_1.rename(columns={
        "All households": "avg_size_all",
        "Couple with children": "avg_size_couple_with_children",
        "Single parent with children": "avg_size_single_parent_with_children"
}, inplace=True)

In [1200]:
df_16_1.drop_duplicates(inplace=True)
df_16_1.dropna(inplace=True)

In [1201]:
for col in df_16_1.columns:
    if col != 'country':
        # Replace commas with dots if necessary, remove non-numeric chars, convert to float
        df_16_1[col] = (
            df_16_1[col]
            .astype(str)  # ensure string for replace
            .str.replace(',', '.', regex=False)  # decimal commas to dots
            .str.replace(r'[^\d\.\-]', '', regex=True)  # remove non-numeric chars except dot and minus
            .replace('', None)  # empty to NaN
            .astype(float)  # convert to float
        )

# Check updated dtypes
print(df_16_1.dtypes)

country                                    object
all_households                            float64
couple_households_with_children           float64
single_parent_households_with_children    float64
dtype: object


In [1202]:
info_16_1 = pd.DataFrame({
    'dtype': df_16_1.dtypes,
    'null_count': df_16_1.isnull().sum(),
    'unique_count': df_16_1.nunique()
})
print(info_16_1)

                                          dtype  null_count  unique_count
country                                  object           0            39
all_households                          float64           0            19
couple_households_with_children         float64           0            16
single_parent_households_with_children  float64           0            15


In [1203]:
df_16_1

Unnamed: 0,country,all_households,couple_households_with_children,single_parent_households_with_children
0,Mexico,3.56,4.08,2.76
1,Costa Rica,3.46,4.37,3.44
2,Türkiye,3.2,4.1,2.8
3,Israel,3.19,4.65,2.86
5,Slovak Republic,3.1,3.8,2.5
7,Iceland,2.7,4.12,2.61
8,New Zealand,2.61,3.88,2.67
9,Greece,2.6,3.8,2.5
10,Croatia,2.6,3.9,2.6
11,Australia,2.53,3.93,2.78


In [1204]:
df_16_2 = pd.read_csv('../data/Raw/OECD/SF_1_1_Family_size_and_composition - Sayfa2.csv', header=1)
df_16_2
#Table SF1.1.A. Types of household, 2021a
# share_couple_total	share_couple_with_children	share_couple_without_children	share_single_parent_total	share_single_mother	share_single_father	share_single_person	share_other_types						

Unnamed: 0,Country,Total,With children,Without children,Total.1,Single mother households,Single father households,Single person households,Other households types
0,Australia,5593,2990,2602,1037,,,2512,858
1,Austria,4893,2113,2780,563,478,085,3834,711
2,Belgium,5222,2398,2824,742,608,135,3550,486
3,Canada,5092,2530,2562,872,,,2935,1102
4,Chile,..,..,..,..,..,..,..,..
5,Columbia,..,..,..,..,..,..,..,..
6,Costa Rica,5244,3815,1429,1055,949,106,1127,2574
7,Czechia,4703,2170,2532,715,611,104,3915,667
8,Denmark,4860,2041,2819,631,511,119,3757,752
9,Estonia,4620,2546,2073,683,609,074,3699,998


In [1205]:
df_16_2.rename(columns={
    "Total": "couple_total(%)",
    "Couple with children": "couple_with_children(%)",
    "Couple without children": "couple_without_children(%)",
    "Total.1": "single_parent_total(%)",
    "Single mother households": "single_mother(%)",
    "Single father households": "single_father(%)",
    "Single person households": "single_person(%)",
    "Other types of households": "other_household_types(%)"
}, inplace=True)

In [1206]:
# Normalize column names (lowercase, underscores, remove () and non-ascii)
df_16_2.columns = (
    df_16_2.columns.str.strip().str.lower()
    .str.replace(' ', '_')
    .str.replace('[()%]', '', regex=True)
    .str.replace('[^0-9a-z_]', '', regex=True)
)

In [1207]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_16_2.columns if c != "country"]

df_16_2[num_cols] = (
    df_16_2[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [None]:
df_16_2.drop_duplicates(inplace=True)
df_16_2.dropna(inplace=True)
df_16_2.dropna(how="all", subset=num_cols, inplace=True)

In [1229]:
info_16_2 = pd.DataFrame({
    "dtype": df_16_2.dtypes,
    "null_count": df_16_2.isna().sum(),
    "unique_count": df_16_2.nunique()
})
print(info_16_2)
print(df_16_2.dtypes)

                          dtype  null_count  unique_count
country                  object           0            36
couple_total            float64           0            36
with_children           float64           0            35
without_children        float64           0            36
single_parent_total     float64           0            34
single_mother           float64           0            32
single_father           float64           0            31
single_person           float64           0            35
other_households_types  float64           0            36
country                    object
couple_total              float64
with_children             float64
without_children          float64
single_parent_total       float64
single_mother             float64
single_father             float64
single_person             float64
other_households_types    float64
dtype: object


In [1230]:
df_16_2

Unnamed: 0,country,couple_total,with_children,without_children,single_parent_total,single_mother,single_father,single_person,other_households_types
1,Austria,48.93,21.13,27.8,5.63,4.78,0.85,38.34,7.11
2,Belgium,52.22,23.98,28.24,7.42,6.08,1.35,35.5,4.86
6,Costa Rica,52.44,38.15,14.29,10.55,9.49,1.06,11.27,25.74
7,Czechia,47.03,21.7,25.32,7.15,6.11,1.04,39.15,6.67
8,Denmark,48.6,20.41,28.19,6.31,5.11,1.19,37.57,7.52
9,Estonia,46.2,25.46,20.73,6.83,6.09,0.74,36.99,9.98
10,Finland,45.64,17.06,28.58,5.43,4.5,0.93,45.34,3.6
11,France,49.73,22.19,27.54,7.68,6.23,1.45,37.78,4.81
12,Germany,45.78,17.89,27.89,5.41,4.44,0.98,43.14,5.67
13,Greece,52.14,24.03,28.11,4.66,3.82,0.84,32.35,10.85


In [1211]:
df_16_3 = pd.read_csv('../data/Raw/OECD/SF_1_1_Family_size_and_composition - Sayfa3.csv', header=1)
df_16_3
#Table SF1.1.B. Households by number of children, 2024a
# share_hh_0_children	share_hh_1_child	share_hh_2_children	share_hh_3plus_children		

Unnamed: 0.1,Unnamed: 0,0 children,1 child,2 children,3 or more children,Children under 6
0,Australia,..,..,..,..,..
1,Austria,7778,1052,857,312,944
2,Belgium,7397,1176,1015,411,1040
3,Canada,..,..,..,..,..
4,Chile,..,..,..,..,..
5,Columbia,..,..,..,..,..
6,Costa Rica,3029,2308,2461,2202,2630
7,Czechia,7195,1385,1156,264,1229
8,Denmark,7778,1054,894,274,815
9,Estonia,7576,1253,873,298,985


In [1213]:
df_17 = pd.read_csv('../data/Raw/OECD/OECD_df_famliy_selected.csv')
df_17

Unnamed: 0,STRUCTURE,STRUCTURE_ID,STRUCTURE_NAME,ACTION,COU,Country,SEX,Sex,IND,Indicator,...,OBS_VALUE,Observation Value,OBS_STATUS,Observation Status,UNIT_MEASURE,Unit of Measures,UNIT_MULT,Multiplier,BASE_PER,Base reference period
0,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,LVA,Latvia,TOTAL,Total,FAM3,Share of births outside of marriage (% of all ...,...,39.5,,A,,PC,Percentage,0,Units,,
1,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,GRC,Greece,TOTAL,Total,FAM3,Share of births outside of marriage (% of all ...,...,11.1,,A,,PC,Percentage,0,Units,,
2,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,CHL,Chile,TOTAL,Total,FAM3,Share of births outside of marriage (% of all ...,...,74.8,,A,,PC,Percentage,0,Units,,
3,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,NLD,Netherlands,TOTAL,Total,FAM3,Share of births outside of marriage (% of all ...,...,51.9,,A,,PC,Percentage,0,Units,,
4,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,LTU,Lithuania,TOTAL,Total,FAM3,Share of births outside of marriage (% of all ...,...,26.4,,A,,PC,Percentage,0,Units,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
500,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,COL,Colombia,TOTAL,Total,FAM2,Mean age of women at childbirth,...,30.4,,A,,YR,Years,0,Units,,
501,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,OAVG,OECD - Average,TOTAL,Total,FAM2,Mean age of women at childbirth,...,30.5,,A,,YR,Years,0,Units,,
502,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,OAVG,OECD - Average,TOTAL,Total,FAM2,Mean age of women at childbirth,...,30.6,,A,,YR,Years,0,Units,,
503,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,OAVG,OECD - Average,TOTAL,Total,FAM2,Mean age of women at childbirth,...,30.7,,A,,YR,Years,0,Units,,


In [1214]:
df_18 = pd.read_csv('../data/Raw/OECD/sf1_2_wide_from_df18.csv')
df_18

Unnamed: 0,country,year,Living with two parents,Living with a single parent,Other
0,Australia,2003,80.1,19.5,0.5
1,Australia,2006,81.5,18.0,0.5
2,Australia,2009,82.0,17.6,0.4
3,Australia,2012,81.3,18.0,0.6
4,Austria,2003,81.2,16.8,2.0
...,...,...,...,...,...
470,United States,2014,68.7,27.5,3.8
471,United States,2015,69.2,26.8,3.9
472,United States,2016,68.7,27.4,3.8
473,United States,2017,68.9,27.1,4.0


In [1215]:
for col in df_18.select_dtypes(include=['object']).columns:
    df_18[col] = df_18[col].astype(str).str.strip()

# 2) Define placeholders representing missing data in OECD exports
placeholders = ['..', '...', '.', ' .', '…', 'Na', 'nan', 'None']

# 3) Replace placeholders with NaN directly in df_18
df_18.replace(placeholders, pd.NA, inplace=True)

In [1216]:
# 1) Ensure 'year' is integer
df_18["year"] = pd.to_numeric(df_18["year"], errors="coerce").astype("Int64")

# 2) Convert all non-key columns to numeric and round(2)
for col in df_18.columns:
    if col not in ["country", "year"]:
        df_18[col] = pd.to_numeric(df_18[col], errors="coerce").round(2)

In [1217]:
# 1) Drop rows with missing key fields
df_18.dropna(subset=["country", "year"], inplace=True)

# 2) Drop duplicate country-year rows, keep the first
df_18.drop_duplicates(subset=["country", "year"], keep="first", inplace=True)

# 3) Drop rows where all value columns are NaN
value_cols = [c for c in df_18.columns if c not in ["country", "year"]]
df_18.dropna(subset=value_cols, how="all", inplace=True)

# 4) Sort and reset index
df_18.sort_values(["country", "year"], inplace=True)
df_18.reset_index(drop=True, inplace=True)


In [1218]:
df_18

Unnamed: 0,country,year,Living with two parents,Living with a single parent,Other
0,Australia,2003,80.1,19.5,0.5
1,Australia,2006,81.5,18.0,0.5
2,Australia,2009,82.0,17.6,0.4
3,Australia,2012,81.3,18.0,0.6
4,Austria,2003,81.2,16.8,2.0
...,...,...,...,...,...
470,United States,2014,68.7,27.5,3.8
471,United States,2015,69.2,26.8,3.9
472,United States,2016,68.7,27.4,3.8
473,United States,2017,68.9,27.1,4.0


In [1219]:
df_18['Other'] = pd.to_numeric(df_18['Other'], errors='coerce')

In [1220]:
df_info = pd.DataFrame({
    'dtype': df_18.dtypes,
    'null_count': df_18.isnull().sum(),
    'unique_count': df_18.nunique()
})
print(df_info)

                               dtype  null_count  unique_count
country                       object           0            39
year                           Int64           0            18
Living with two parents      float64           0           211
Living with a single parent  float64           0           203
Other                        float64           1            50


In [1221]:
print(repr(df_18.loc[df_18['Other'].notnull(), 'Other'].unique()))

array([0.5, 0.4, 0.6, 2. , 1. , 1.9, 0.3, 0.1, 0.8, 0.7, 8.7, 3.5, 2.5,
       2.1, 2.4, 2.6, 6.7, 5.1, 1.4, 1.2, 1.7, 1.5, 3.4, 2.9, 2.3, 3. ,
       4.2, 2.8, 1.3, 9. , 0.2, 0.9, 1.1, 4.5, 4.7, 1.6, 3.8, 3.6, 3.3,
       2.2, 0. , 1.8, 2.7, 3.2, 3.9, 4.1, 4.4, 3.7, 4. , 4.3])


In [1222]:
df_18['Other'] = pd.to_numeric(df_18['Other'], errors='coerce')

df_18.dropna(inplace=True, subset=['Other'])

df_18.isnull().sum()

country                        0
year                           0
Living with two parents        0
Living with a single parent    0
Other                          0
dtype: int64

In [1223]:
#df_18.to_csv('../data/Cleaned/cleaned_household_children.csv', index=False)

In [1224]:
#df_18.to_sql('household_children_oecd', engine, if_exists= 'replace', index= False)

In [1225]:
df_888= pd.read_csv('../data/Raw/OECD/Households-by-type,-presence-of-children-and-country,-2015-2024.csv')
df_888

Unnamed: 0,Category,Single adult with children,Single adult without children,Couple with children,Couple without children,Other type of household with children,Other type of household without children
0,2015,6147.3,64181.3,31679.8,46641.6,11698.9,30771.6
1,2016,6148.5,63891.1,31907.3,47308.2,11766.3,30559.5
2,2017,6108.5,65353.9,32091.5,47426.1,11530.2,30297.5
3,2018,6163.6,66165.5,31720.2,48194.8,11342.5,30224.0
4,2019,6246.4,67417.9,31710.1,48503.6,11285.7,30134.8
5,2020,6136.4,67412.9,31622.2,48831.2,11212.9,30445.2
6,2021,5691.9,70200.4,30558.3,47447.4,11611.8,30700.7
7,2022,5984.9,72134.3,30469.3,47995.5,11513.6,30412.1
8,2023,5924.8,73396.2,30313.0,48477.5,11443.5,30608.8
9,2024,6077.7,75049.7,30286.5,49058.4,11311.9,30487.3


In [1226]:
df_999 = pd.read_csv('../data/Raw/OECD/Households-with-children-by-number-of-children,-2024.csv')
df_999

Unnamed: 0,Category,1 child,2 children,3 children or more
0,European Union,11.7,8.9,3.0
1,,,,
2,Slovakia,17.1,14.5,4.0
3,Ireland,12.4,12.2,6.4
4,Cyprus,13.9,11.7,3.1
5,Czechia,13.9,11.6,2.6
6,Romania,14.3,9.2,4.0
7,Luxembourg,12.5,12.1,2.4
8,Belgium,11.8,10.2,4.1
9,Croatia,12.0,10.1,3.8


In [1227]:
import pandas as pd

df = pd.read_csv('../data/Raw/OECD/OECD,DF_FAMILY,+all.csv')

df_wide = df.pivot_table(
    index=['Country', 'TIME_PERIOD', 'COU'],
    columns='Indicator',
    values='OBS_VALUE'
).reset_index()

df_wide.columns.name = None

df_wide.to_csv("WIDE_FORMAT.csv", index=False)

print(df_wide)

# df_wide = df.pivot(index='id', columns='variable', values='value')
# df = df.drop(columns='indicator')

            Country  TIME_PERIOD  COU  Child poverty rate  \
0         Argentina         2001  ARG                 NaN   
1         Argentina         2002  ARG                 NaN   
2         Argentina         2003  ARG                 NaN   
3         Argentina         2004  ARG                 NaN   
4         Argentina         2005  ARG                 NaN   
...             ...          ...  ...                 ...   
1170  United States         2018  USA                 NaN   
1171  United States         2019  USA                 NaN   
1172  United States         2020  USA                 NaN   
1173  United States         2021  USA                 NaN   
1174  United States         2022  USA                 NaN   

      Country mean average score in mathematics, by sex  \
0                                                   NaN   
1                                                   NaN   
2                                                   NaN   
3                              