In [744]:
import pandas as pd
import os, re
from pathlib import Path
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from sqlalchemy import create_engine, types
from sqlalchemy import text 
from openpyxl import load_workbook

In [745]:
from dotenv import dotenv_values

config = dotenv_values()

# define variables for the login
pg_user = config['POSTGRES_USER']  # align the key label with your .env file !
pg_host = config['POSTGRES_HOST']
pg_port = config['POSTGRES_PORT']
pg_db = config['POSTGRES_DB']
pg_schema = config['POSTGRES_SCHEMA']
pg_pass = config['POSTGRES_PASS']

In [746]:
url = f'postgresql://{pg_user}:{pg_pass}@{pg_host}:{pg_port}/{pg_db}'
engine = create_engine(url, echo=False)

In [747]:
my_schema = 'team_5' # update it to your schema

with engine.begin() as conn: 
    result = conn.execute(text(f'SET search_path TO {my_schema};'))

In [748]:
df_1= pd.read_csv('../data/Raw/World_Marriage_Dataset.csv')

In [749]:
df_1.drop(columns=["Sr.No."], inplace=True)

In [750]:
df_1.columns = df_1.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [751]:
df_1.rename(columns={
    "agegroup": "age_group",
    "maritalstatus": "marital_status",
    "dataprocess": "data_process",
    "Data Collection (Start Year)": "data_collection_start_year",
    "Data Collection (End Year)": "data_collection_end_year",
    "Data Source": "data_source",
    "Country": "country",
    "Sex": "sex"
}, inplace=True)

In [752]:
df_1.drop_duplicates(inplace=True)

df_1['data_collection_start_year'] = df_1['data_collection_start_year'].astype(str).str.replace(',', '').astype(int)
df_1['data_collection_end_year'] = df_1['data_collection_end_year'].astype(str).str.replace(',', '').astype(int)

In [753]:
df_1.isnull().sum()

country                       0
age_group                     0
sex                           0
marital_status                0
data_process                  0
data_collection_start_year    0
data_collection_end_year      0
data_source                   0
dtype: int64

In [754]:
#df_1.to_csv("cleaned_world_marriage.csv", index=False)

In [755]:
#df_1.to_sql('world_marriage', engine, if_exists='replace', index=False)

In [756]:
df_2 = pd.read_csv('../data/Raw/age-at-marriage-women.csv')

In [757]:
df_2.columns = df_2.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [758]:
df_2 = df_2.drop(columns=['1005564annotations'])

df_2.rename(columns={
    "entity": "country",   
}, inplace=True)

In [759]:
df_2.drop_duplicates(inplace=True)


In [760]:
df_2['year'] = df_2['year'].astype(str).str.replace(',', '').astype(int)

In [761]:
df_2.isnull().sum()

country                                0
code                                   0
year                                   0
mean_age_of_women_at_first_marriage    0
dtype: int64

In [762]:
#df_2.to_csv("cleaned_age_at_marriage_women.csv", index=False)

In [763]:
#df_2.to_sql('age_at_marriage_women', engine, if_exists='replace', index=False)

In [764]:
df_3= pd.read_csv('../data/Raw/marriage-rate-per-1000-inhabitants.csv')

In [765]:
df_3.columns = df_3.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [766]:
df_3.rename(columns={
    "entity": "country",   
}, inplace=True)

In [767]:
df_3['year'] = df_3['year'].astype(str).str.replace(',', '').astype(int)

In [768]:
df_3.drop_duplicates(inplace=True)


In [769]:
df_3.isnull().sum()

country                                          0
code                                             0
year                                             0
crude_marriage_rate_marriages_per_1000_people    0
dtype: int64

In [770]:
#df_3.to_csv("cleaned_marriage-rate-per-1000-inhabitants.csv", index=False)

In [771]:
#df_3.to_sql('married_rate_per_1000', engine, if_exists='replace', index=False)

In [772]:
df_4= pd.read_csv('../data/Raw/marriage-rates-in-1990-vs-2020.csv')

In [773]:
df_4.columns = df_4.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [774]:
df_4 = df_4.drop(columns=['worldregionsaccordingtoowid'])

df_4.rename(columns={
    "crudemarriageratemarriagesper1000people": "crude_marriage_rate",
    "crudemarriageratemarriagesper1000people1": "crude_marriage_rate_people1",
    "year1": "year_1",
    "entity": "country"
}, inplace=True)

In [775]:
df_4.drop_duplicates(inplace=True)
df_4.dropna(inplace=True)

In [776]:
df_4['year_1'] = pd.to_numeric(df_4['year_1'], errors='coerce').astype('Int64')

In [777]:
df_4.isnull().sum()

country                        0
code                           0
year                           0
crude_marriage_rate            0
crude_marriage_rate_people1    0
year_1                         0
dtype: int64

In [778]:
#df_4.to_csv("cleaned_marriage-rates-in-1990-vs-2020.csv", index=False)

In [779]:
#df_4.to_sql('marriage_rates_in_1990_vs_2020', engine, if_exists='replace', index=False)

In [780]:
df_5 = pd.read_csv('../data/Raw/share-of-births-outside-marriage.csv')

In [781]:
df_5.columns = df_5.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [782]:

df_5.rename(columns={
    "shareofbirthsoutsideofmarriageofallbirths": "share_of_births_outside_of_marriage",
    "entity": "country"
}, inplace=True)

df_5.drop_duplicates(inplace=True)

In [783]:
df_5.isnull().sum()

country                                0
code                                   0
year                                   0
share_of_births_outside_of_marriage    0
dtype: int64

In [784]:
#df_5.to_csv("cleaned_share-of-births-outside-marriage.csv", index=False)

In [785]:
#df_5.to_sql('share_of_births_outside_marriage', engine, if_exists='replace', index=False)

In [786]:
df_6 = pd.read_csv('../data/Raw/share-of-men-in-england-and-wales-who-have-ever-married-by-age.csv')

In [787]:
df_6.columns = df_6.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

df_6.drop_duplicates(inplace=True)
df_6.sample(5)

Unnamed: 0,entity,code,year,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort
62,Women,,45,84.2,91.4,95.4,86.7,74.5,,,
38,Women,,21,14.6,26.1,42.2,31.5,12.7,4.8,1.7,0.6
18,Men,,35,84.8,86.7,89.1,73.7,56.3,44.2,,
23,Men,,40,89.6,89.9,91.1,78.0,64.0,53.9,,
5,Men,,22,13.5,16.8,25.8,18.8,6.8,2.5,1.1,0.4


In [788]:
df_6 = df_6.drop(columns=['code','proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort'])

df_6.rename(columns={
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort": "1900_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort": "1920_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort": "1940_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort": "1960_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort": "1970_birthcohort",
    "entity": "sex"
}, inplace=True)

In [789]:
df_6.isnull().sum()

sex                 0
year                0
1900_birthcohort    0
1920_birthcohort    0
1940_birthcohort    0
1960_birthcohort    0
1970_birthcohort    0
dtype: int64

In [790]:
#df_6.to_csv("cleaned_share-of-men-in-england-and-wales-who-have-ever-married-by-age.csv", index=False)

In [791]:
#df_6.to_sql('men_in_england_and_wales_married_by_age', engine, if_exists='replace', index=False)

In [792]:
df_7 = pd.read_csv('../data/Raw/share-of-births-outside-marriage.csv')

In [793]:
df_7.columns = df_7.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [794]:
df_7.rename(columns={
    "shareofsingleparenthouseholds": "share_of_single_parent_households",
    "entity": "country"
}, inplace=True)

df_7.drop_duplicates(inplace=True)
df_7.sample(5)

Unnamed: 0,country,code,year,shareofbirthsoutsideofmarriageofallbirths
1107,Latvia,LVA,1978,12.0
65,Austria,AUT,1979,16.5
1224,Luxembourg,LUX,1983,8.1
707,Germany,DEU,1986,16.1
2005,Switzerland,CHE,2020,27.7


In [795]:
df_7.isnull().sum()

country                                      0
code                                         0
year                                         0
shareofbirthsoutsideofmarriageofallbirths    0
dtype: int64

In [796]:
#df_7.to_csv("cleaned_share-of-single-parent-households.csv", index=False)

In [797]:
#df_7.to_sql('single_parent_households', engine, if_exists='replace', index=False)

In [798]:
df_8 = pd.read_csv('../data/Raw/share-of-women-in-england-and-wales-who-have-ever-married-by-age.csv')

In [799]:
df_8.columns = df_8.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [800]:
df_8['code'] = df_8['code'].fillna('GBR')
df_8.sample(5)

Unnamed: 0,entity,code,year,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort
33,Men,GBR,50,92.9,91.9,92.4,81.7,70.8,,,
40,Women,GBR,23,32.7,49.5,68.2,48.4,24.0,10.0,4.7,1.6
17,Men,GBR,34,83.2,85.5,88.5,72.4,53.7,40.9,,
6,Men,GBR,23,21.4,26.8,38.1,26.2,10.5,3.9,2.1,0.8
47,Women,GBR,30,71.6,82.7,90.9,76.7,55.8,37.8,28.9,


In [801]:
df_8 = df_8.drop(columns=['code','proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort'])

df_8.rename(columns={
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort": "1900_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort": "1920_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort": "1940_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort": "1960_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort": "1970_birthcohort",
    "entity": "sex"
}, inplace=True)

df_8.drop_duplicates(inplace=True)
df_8.sample(5)

Unnamed: 0,sex,year,1900_birthcohort,1920_birthcohort,1940_birthcohort,1960_birthcohort,1970_birthcohort
6,Men,23,21.4,26.8,38.1,26.2,10.5
54,Women,37,80.6,89.4,94.4,84.1,69.4
19,Men,36,86.1,87.6,89.7,74.8,58.3
14,Men,31,76.5,79.7,85.1,66.6,45.1
52,Women,35,79.0,88.3,93.8,82.8,66.7


In [802]:
df_8.isnull().sum()

sex                 0
year                0
1900_birthcohort    0
1920_birthcohort    0
1940_birthcohort    0
1960_birthcohort    0
1970_birthcohort    0
dtype: int64

In [803]:
#df_8.to_csv("cleaned_share-of-women-in-england-and-wales-who-have-ever-married-by-age.csv", index=False)

In [804]:
#df_8.to_sql('women_in_england_and_wales_married_by_age', engine, if_exists='replace', index=False)

In [805]:
#pip install openpyxl pywin32

In [806]:
df_excel_1 = pd.read_excel('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx')

In [807]:
#all_sheets = pd.read_excel('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx', sheet_name=None)

In [808]:
xls_1 = pd.ExcelFile('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx')
print(xls_1.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'MARITAL_STATUS_BY_AGE', 'CURRENTLY MARRIED', 'EVER_MARRIED', 'SMAM']


In [809]:
excel_1 = '../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx'

# Output directory (make sure it exists)
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)

# List of sheets you want to extract
sheets_to_extract = ['MARITAL_STATUS_BY_AGE', 'CURRENTLY MARRIED', 'EVER_MARRIED', 'SMAM']

In [810]:
"""for sheet in sheets_to_extract:
    # Read just this sheet into a DataFrame
    df_excel_1 = pd.read_excel(excel_1, sheet_name=sheet)
    
    # Optional: Clean the filename (replace spaces with underscores, etc.)
    csv_name = sheet.replace(' ', '_').lower() + '.csv'
    csv_path = os.path.join(output_dir, csv_name)
    
    # Save the DataFrame as CSV
    df_excel_1.to_csv(csv_path, index=False)
    print(f"Saved: {csv_path}")
"""

'for sheet in sheets_to_extract:\n    # Read just this sheet into a DataFrame\n    df_excel_1 = pd.read_excel(excel_1, sheet_name=sheet)\n    \n    # Optional: Clean the filename (replace spaces with underscores, etc.)\n    csv_name = sheet.replace(\' \', \'_\').lower() + \'.csv\'\n    csv_path = os.path.join(output_dir, csv_name)\n    \n    # Save the DataFrame as CSV\n    df_excel_1.to_csv(csv_path, index=False)\n    print(f"Saved: {csv_path}")\n'

In [811]:
xls_2 = pd.ExcelFile('../data/Raw/undesa_pd_2019_world_fertility_dataset.xlsx')
print(xls_2.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'FERTILITY INDICATORS']


In [812]:
excel_2 = '../data/Raw/undesa_pd_2019_world_fertility_dataset.xlsx'
sheet_name = 'FERTILITY INDICATORS'
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)

df_excel_2 = pd.read_excel(excel_2, sheet_name=sheet_name)


In [813]:
"""csv_name = sheet_name.replace(' ', '_').lower() + '.csv'
csv_path = os.path.join(output_dir, csv_name)
df_excel_2.to_csv(csv_path, index=False)
print(f"Saved: {csv_path}")
"""

'csv_name = sheet_name.replace(\' \', \'_\').lower() + \'.csv\'\ncsv_path = os.path.join(output_dir, csv_name)\ndf_excel_2.to_csv(csv_path, index=False)\nprint(f"Saved: {csv_path}")\n'

In [814]:
xls_3 = pd.ExcelFile('../data/Raw/undesa_pd_ds_1970-2030_fp_rev-2024_rev.xlsx')
print(xls_3.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'Countries', 'Regions']


In [815]:
excel_3 = '../data/Raw/undesa_pd_ds_1970-2030_fp_rev-2024_rev.xlsx'
sheets_to_extract = ['Countries', 'Regions']
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)


In [816]:
"""
for sheet in sheets_to_extract:
    df = pd.read_excel(excel_3, sheet_name=sheet)
    csv_name = sheet.replace(' ', '_').lower() + '.csv'
    csv_path = os.path.join(output_dir, csv_name)
    df.to_csv(csv_path, index=False)
    print(f"Saved: {csv_path}")

"""

'\nfor sheet in sheets_to_extract:\n    df = pd.read_excel(excel_3, sheet_name=sheet)\n    csv_name = sheet.replace(\' \', \'_\').lower() + \'.csv\'\n    csv_path = os.path.join(output_dir, csv_name)\n    df.to_csv(csv_path, index=False)\n    print(f"Saved: {csv_path}")\n\n'

In [817]:
df_9 = pd.read_csv('../data/Raw/unpopulation_dataportal_20250728095844.csv')
df_9.sample(5)

Unnamed: 0,IndicatorId,IndicatorName,IndicatorShortName,Source,SourceYear,Author,LocationId,Location,Iso2,Iso3,...,AgeStart,AgeEnd,Age,CategoryId,Category,EstimateTypeId,EstimateType,EstimateMethodId,EstimateMethod,Value
1405,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,50,Bangladesh,BD,BGD,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,76.54
20128,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,694,Sierra Leone,SL,SLE,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,68.94
14930,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,508,Mozambique,MZ,MOZ,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,74.47
14881,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,504,Morocco,MA,MAR,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,57.99
10977,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,388,Jamaica,JM,JAM,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,56.24


In [818]:
df_9.columns = df_9.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_9.sample(5)

Unnamed: 0,indicatorid,indicatorname,indicatorshortname,source,sourceyear,author,locationid,location,iso2,iso3,...,agestart,ageend,age,categoryid,category,estimatetypeid,estimatetype,estimatemethodid,estimatemethod,value
19809,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,686,Senegal,SN,SEN,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,65.56
658,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,28,Antigua and Barbuda,AG,ATG,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,3,Projection,30.62
13127,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,446,"China, Macao SAR",MO,MAC,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,46.03
20037,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,690,Seychelles,SC,SYC,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,45.1
11757,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,410,Republic of Korea,KR,KOR,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,3,Projection,43.16


In [819]:
df_9 = df_9.drop(columns=['indicatorid','indicatorshortname',
    'source',
    'author', 'locationid', 'iso2','estimatetypeid','category','categoryid','agestart','ageend','author','ageid', 'estimatetype','variantid','sexid','timeid'])

df_9.rename(columns={
    "sourceyear": "year",
    "location": "country",
    "estimatemethodid": "estimate_method",
    "iso3": "code",
}, inplace=True)



In [820]:
df_9.drop_duplicates(inplace=True)

In [821]:
df_9

Unnamed: 0,indicatorname,year,country,code,time,variant,sex,age,estimate_method,estimatemethod,value
0,Currently married (Percent),2024,Afghanistan,AFG,1970,Median,Female,15-49,2,Interpolation,80.94
2,Currently married (Percent),2024,Afghanistan,AFG,1971,Median,Female,15-49,2,Interpolation,80.90
4,Currently married (Percent),2024,Afghanistan,AFG,1972,Median,Female,15-49,2,Interpolation,80.87
6,Currently married (Percent),2024,Afghanistan,AFG,1973,Median,Female,15-49,2,Interpolation,80.84
8,Currently married (Percent),2024,Afghanistan,AFG,1974,Median,Female,15-49,2,Interpolation,80.53
...,...,...,...,...,...,...,...,...,...,...,...
25078,Currently married (Percent),2024,Zambia,ZMB,2021,Median,Female,15-49,3,Projection,54.31
25080,Currently married (Percent),2024,Zambia,ZMB,2022,Median,Female,15-49,3,Projection,53.82
25082,Currently married (Percent),2024,Zambia,ZMB,2023,Median,Female,15-49,3,Projection,53.35
25084,Currently married (Percent),2024,Zambia,ZMB,2024,Median,Female,15-49,3,Projection,52.91


In [822]:
df_9.isnull().sum()

indicatorname      0
year               0
country            0
code               0
time               0
variant            0
sex                0
age                0
estimate_method    0
estimatemethod     0
value              0
dtype: int64

In [823]:
#df_9.to_csv("cleaned_unpopulation_dataportal.csv", index=False)

In [824]:
#df_9.to_sql('unpopulation_dataportal', engine, if_exists='replace', index=False)

In [825]:
df_10 = pd.read_csv('../data/processed/countries_un.csv',  header=5, low_memory=False)

In [826]:
df_10.columns = (
    df_10.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
)
df_10.sample(10)

Unnamed: 0,countryorarea,isocode,indicator,year,agegroup,percentage,number,dataprocess
42285,Finland,246,Married or in-union women,1990,40-44,77.55,165.839512,Estimate
80955,Malta,470,Married or in-union women,2045,30-34,64.62186,9.056754,Projection
117622,Singapore,702,Married or in-union women,2011,45-49,78.83913,143.599168,Estimate
22054,Cayman Islands,136,Married or in-union women,1972,45-49,80.77,0.225348,Estimate
83668,Mongolia,496,Married or in-union women,1979,35-39,91.71,34.840629,Estimate
127849,Tajikistan,762,Married or in-union women,1994,20-24,71.916098,191.327384,Estimate
12219,Bolivia (Plurinational State of),68,Married or in-union women,2039,30-34,58.512295,332.087702,Projection
26628,Colombia,170,Married or in-union women,1977,35-39,74.35375,430.426795,Estimate
11908,Bolivia (Plurinational State of),68,Married or in-union women,2000,35-39,83.09,198.399394,Estimate
99807,Panama,591,Married or in-union women,1971,15-49,59.439294,204.164466,Estimate


In [827]:
df_10.rename(columns={
    "dataprocess": "data_process",
}, inplace=True)

df_10.drop_duplicates(inplace=True)
df_10.sample(5)

Unnamed: 0,countryorarea,isocode,indicator,year,agegroup,percentage,number,data_process
28211,Mayotte,175,Married or in-union women,2013,30-34,70.616654,6.789085,Estimate
108162,Romania,642,Married or in-union women,2044,25-29,46.803192,218.746185,Projection
32560,Cyprus,196,Married or in-union women,1990,15-19,2.755064,0.817855,Estimate
39095,Eritrea,232,Married or in-union women,1996,15-49,66.332869,336.013458,Estimate
85369,Montenegro,499,Married or in-union women,2030,20-24,14.455283,2.646762,Projection


In [828]:
for col in ['percentage', 'number']:
    if col in df_10.columns:
        df_10[col] = (
            df_10[col]
            .astype(str)
            .str.replace(',', '.', regex=False)
            .str.extract(r'([-+]?[0-9]*\.?[0-9]+)', expand=False)
            .astype(float)
            .round(2)
        )

In [829]:
unnamed_cols = [col for col in df_10.columns if 'unnamed' in col.lower()]
df_10.drop(columns=unnamed_cols, inplace=True)

In [830]:
df_10.dropna(inplace=True)

In [831]:
df_10.isnull().sum()

countryorarea    0
isocode          0
indicator        0
year             0
agegroup         0
percentage       0
number           0
data_process     0
dtype: int64

In [832]:
df_10.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145800 entries, 0 to 145799
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   countryorarea  145800 non-null  object 
 1   isocode        145800 non-null  int64  
 2   indicator      145800 non-null  object 
 3   year           145800 non-null  int64  
 4   agegroup       145800 non-null  object 
 5   percentage     145800 non-null  float64
 6   number         145800 non-null  float64
 7   data_process   145800 non-null  object 
dtypes: float64(2), int64(2), object(4)
memory usage: 8.9+ MB


In [833]:
#df_10.to_csv("cleaned_countries_1970_2025_un.csv", index=False)

In [834]:
#df_10.to_sql('countries_1970_2025_un', engine, if_exists='replace', index=False)

In [835]:
df_11 = pd.read_csv('../data/processed/currently_married_un.csv',  header=2, low_memory=False)

In [836]:
df_11.sample(8)

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,AgeGroup,AgeStart,AgeEnd,DataValue,DataProcess,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Data,Note on Country and Population
7776,Chile,152,1992,1992,Men,[40-44],40,44,82.88,Census,1992 Census,861,Chile 1992 Census,UNSD,1.0,,
40416,Rwanda,646,2005,2005,Women,[75+],75,999,18.0,Survey,2005 DHS,1717,Rwanda 2005 Demographic and Health Survey,DHS_HH,1.0,,
17293,Germany,276,2014,2014,Men,[65-69],65,69,79.11,Estimate,2014 Estimate,2102,Germany 2014 Estimate,UNSD,,,
43237,Sierra Leone,694,1992,1992,Women,[35-39],35,39,86.7,Survey,1992 DSMS,4549,Sierra Leone 1992 Demographic and Social Situa...,National statistics,,,
22241,Iceland,352,2018,2018,Men,[25-29],25,29,12.11,Estimate,2018 Estimate,2121,Iceland 2018 Estimate,UNSD,,,
10891,Czechia,203,2012,2012,Men,[70-74],70,74,78.2,Estimate,2012 Estimate,2079,Czech Republic 2012 Estimate,UNSD,,,
32861,Myanmar,104,2015,2016,Men,[25-29],25,29,60.94,Survey,2015-16 MDHS,6084,Myanmar 2015-2016 Demographic and Health Survey,DHS_HH,,,
2787,Bangladesh,50,2004,2004,Women,[30-34],30,34,92.7,Survey,2004 DHS,1813,Bangladesh 2004 Demographic and Health Survey,DHS_STATcompiler,,,


In [837]:
df_11.columns = (
    df_11.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
    )
df_11.sample(8)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,agegroup,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteondata,noteoncountryandpopulation
31431,Mexico,484,2009,2009,Men,[25-29],25,29,54.64,Survey,2009 ENADID,5550,Mexico 2009 Encuesta Nacional de la Dinámica D...,UNSD,1.0,,
19910,Hungary,348,1975,1975,Women,[30-34],30,34,86.64,Estimate,1975 Estimate,2120,Hungary 1975 Estimate,UNSD,,,
34090,Netherlands,528,2007,2007,Women,[45-49],45,49,69.99,Estimate,2007 Estimate,2170,Netherlands 2007 Estimate,UNSD,,,
15826,France,250,1982,1982,Men,[55-59],55,59,84.06,Census,1982 Census,363,France 1982 Census,UNSD,,Based on a sample of census returns.,Excluding diplomatic personnel outside the cou...
2576,Bahrain,48,1995,1995,Women,[30-34],30,34,75.7,Survey,1995 FHS,181,Bahrain 1995 Family Health Survey,GFHS,,,
19820,Hungary,348,1972,1972,Men,[70-74],70,74,77.31,Estimate,1972 Estimate,2120,Hungary 1972 Estimate,UNSD,,,
47226,Sweden,752,2006,2006,Women,[70-74],70,74,52.36,Estimate,2006 Estimate,2227,Sweden 2006 Estimate,UNSD,,,
51947,Uruguay,858,1985,1985,Men,[15-19],15,19,1.83,Census,1985 Census,241,Uruguay 1985 Census,UNSD,1.0,Data have not been adjusted for underenumeration.,


In [838]:
df_11 = df_11.drop(columns = ['datacataloglongname', 'datacatalogid', 'yearstart' , 'yearend', 'noteondata', 'noteoncountryandpopulation', 'including_consensual_unions'])

df_11.rename(columns={
    "agestart": "age_start",
    "countryorarea": "country",
    "datasource": "data_source",
    "datavalue" : "data_value"
}, inplace=True)

df_11.sample(10)

Unnamed: 0,country,isocode,sex,agegroup,age_start,ageend,data_value,dataprocess,datacatalogshortname,data_source
50279,Turkey,792,Women,[10-14],10,14,0.0,Estimate,2013 Estimate,UNSD
13396,Estonia,233,Men,[50-54],50,54,73.0,Census,2000 Census,UNSD
9337,Costa Rica,188,Women,[20-24],20,24,49.43,Census,1984 Census,UNSD
13155,El Salvador,222,Women,[45-49],45,49,68.5,Survey,1985 DHS,DHS_STATcompiler
34364,Netherlands,528,Women,[35-39],35,39,52.15,Estimate,2018 Estimate,UNSD
48932,Timor-Leste,626,Men,[40-44],40,44,93.9,Survey,2009-2010 DHS,DHS_STATcompiler
32135,Montserrat,500,Women,[15-19],15,19,0.33,Census,1980 Census,UNSD
4220,Bosnia and Herzegovina,70,Women,[45-49],45,49,86.12,Survey,2011-2012 MICS,MICS
33243,Nepal,524,Women,[25-29],25,29,93.2,Survey,2001 DHS,DHS_STATcompiler
19428,Haiti,332,Women,[60-64],60,64,51.99,Census,1982 Census,UNSD


In [839]:
df_11.drop_duplicates(inplace=True)

In [840]:
df_11.isnull().sum()

country                 0
isocode                 0
sex                     0
agegroup                0
age_start               0
ageend                  0
data_value              0
dataprocess             0
datacatalogshortname    0
data_source             0
dtype: int64

In [841]:
#df_11.to_csv("cleaned_currently_married_un.csv", index=False)

In [842]:
#df_11.to_sql('currently_married_un', engine, if_exists='replace', index=False)

In [843]:
df_12 = pd.read_csv('../data/processed/ever_married_un.csv', header= 2, low_memory = False)
df_12.head()

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,AgeGroup,AgeStart,AgeEnd,DataValue,DataProcess,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Data,Note on Country and Population
0,Afghanistan,4,1972,1974,Men,[15-19],15,19,7.7,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
1,Afghanistan,4,1972,1974,Men,[20-24],20,24,32.6,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
2,Afghanistan,4,1972,1974,Men,[25-29],25,29,61.4,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
3,Afghanistan,4,1972,1974,Men,[30-34],30,34,83.0,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
4,Afghanistan,4,1972,1974,Men,[35-39],35,39,91.2,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,


In [844]:
df_12.columns = (
    df_12.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
    )
df_12.sample(8)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,agegroup,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteondata,noteoncountryandpopulation
30920,Lithuania,440,2015,2015,Women,[10-14],10,14,0.0,Estimate,2015 Estimate,2146,Lithuania 2015 Estimate,UNSD,,,
53502,United Kingdom,826,2010,2010,Women,[20-24],20,24,6.27,Estimate,2010 Estimate,2246,United Kingdom 2010 Estimate,UNSD,,,Data pertain to England and Wales only. | Excl...
20833,Haiti,332,2005,2006,Women,[55-59],55,59,99.5,Survey,2005-2006 DHS,1785,Haiti 2005-2006 Demographic and Health Survey,DHS_HH,,,
17897,Germany,276,1987,1987,Women,[35-39],35,39,90.86,Census,1987 Census,5308,Germany 1987 Census,IPUMS,,Data are based on a 5 per cent sample of censu...,Data pertain to West Germany.
51206,Togo,768,2010,2010,Men,[35-39],35,39,89.85,Census,2010 Census,5267,Togo 2010 Census,INED,1.0,,
31353,Madagascar,450,2000,2000,Women,[20-24],20,24,75.38,Survey,2000 MICS_HH,1891,Madagascar 2000 Multiple Indicator Cluster Survey,MICS_HH,,,
13873,Egypt,818,2000,2000,Men,[40-44],40,44,97.9,Survey,2000 DHS,1830,Egypt 2000 Demographic and Health Survey,DHS_HH,,,
28968,Latvia,428,2011,2011,Women,[45-49],45,49,88.52,Estimate,2011 Estimate,2142,Latvia 2011 Estimate,UNSD,,,


In [845]:
df_12 = df_12.drop(columns = ['yearstart', 'yearend', 'datacatalogshortname', 'datacatalogid', 'datacataloglongname', 'including_consensual_unions', 'noteondata', 'noteoncountryandpopulation'])

df_12.rename(columns={
    "agestart": "age_start",
    "ageend": "age_end",
    "countryorarea": "country"
}, inplace=True)
df_12.sample(8)

Unnamed: 0,country,isocode,sex,agegroup,age_start,age_end,datavalue,dataprocess,datasource
40083,Philippines,608,Men,[65-69],65,69,97.19,Census,UNSD
9045,Colombia,170,Men,[50-54],50,54,90.7,Census,National statistics
14235,Equatorial Guinea,226,Women,[60-64],60,64,93.57,Survey,MICS_HH
31721,Malaysia,458,Women,[55-59],55,59,97.72,Census,UNSD
37578,Norway,578,Women,[65-69],65,69,83.87,Estimate,UNSD
27534,Kazakhstan,398,Women,[60-64],60,64,98.69,Census,National statistics
41580,Réunion,638,Women,[75+],75,999,82.34,Census,UNSD
39576,Paraguay,600,Women,[50-54],50,54,84.49,Census,UNSD


In [846]:
df_12.dropna(inplace=True)

In [847]:
df_12.isnull().sum()

country        0
isocode        0
sex            0
agegroup       0
age_start      0
age_end        0
datavalue      0
dataprocess    0
datasource     0
dtype: int64

In [848]:
#df_12.to_csv("cleaned_ever_married_un.csv", index=False)

In [849]:
#df_12.to_sql('ever_married_un', engine, if_exists= 'replace', index= False)

In [850]:
df_13 = pd.read_csv('../data/processed/fertility_indicators_un.csv', header=6, low_memory=False)
df_13.head()

Unnamed: 0,Country or Area,Country or Area Code,Age Group,Indicator,Date,Value,Series,DataType,Data Source Type,Survey Programme,Data Source Inventory ID,Data Source Name,Data Source Name (short),Data Source Start Year,Data Source End Year,Reference,Reference Year
0,Afghanistan,4,[Total],TFR,1964.977051,7.966653,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
1,Afghanistan,4,[Total],TFR,1965.977051,8.212275,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
2,Afghanistan,4,[Total],TFR,1966.977051,8.317603,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
3,Afghanistan,4,[Total],TFR,1967.977051,8.225812,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
4,Afghanistan,4,[Total],TFR,1968.977051,8.068459,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012


In [851]:
df_13.columns = (df_13.columns
        .str.lower()
        .str.strip()
        .str.replace(' ', '')
        .str.replace('(', '')
        .str.replace(')', '')
        .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
        )

df_13.sample(6)

Unnamed: 0,countryorarea,countryorareacode,agegroup,indicator,date,value,series,datatype,datasourcetype,surveyprogramme,datasourceinventoryid,datasourcename,datasourcenameshort,datasourcestartyear,datasourceendyear,reference,referenceyear
26401,French Guiana,254,[35-39],ASFR3539,2004.5,89.66,"Register,Direct,CAG-BilanDemog2005,450-16-39269",Direct,Register,VR,450,Vital Registration,Register,2004,2004,CAG-BilanDemog2005,2005
60055,Russian Federation,643,[Total],MAC,2003.5,26.25,NSO.20170421,Direct,Register,VR,545,Vital Registration,Register,2003,2003,Central Statistics Database,2017
23601,Estonia,233,[30-34],ASFR3034,1979.5,55.16,Eurostat.20190531,Official estimates,Estimate,Estimate,2089,All sources of estimates,Estimates,1979,1979,"Eurostat Statistics, Fertility rates by age [d...",2019
71097,Trinidad and Tobago,780,[Total],TFR,1967.5,3.850256,"Register,Computed rate from DYB,DYB,567-135-22",Computed rate from DYB,Register,VR,567,Vital Registration,Register,1967,1967,Demographic Yearbook,1969
58314,Republic of Moldova,498,[20-24],ASFR2024,1988.5,193.88,NSO.201510,Direct,Register,VR,614,Vital Registration,Register,1988,1988,NSO website,2015
31896,Haiti,332,[20-24],ASFR2024,2004.620465,159.2419,"2016 DHS,Birth Histories,FBH analysis 2018,582...",Birth histories,Survey,DHS,5825,Haiti 2016 Demographic and Health Survey (DHS),2016 DHS,2016,2017,Fertility rates from full birth histories anal...,2018


In [852]:
df_13 = df_13.drop(columns=['countryorareacode','indicator','datasourceinventoryid','surveyprogramme','series','datasourcename','reference','referenceyear'])

df_13.replace({
    "agegroup": "age_group",
    "countryorarea": "country",
    "datatype": "data_type",
},inplace=True)

In [853]:
df_13['date'] = df_13['date'].astype(int)
df_13['value'] = df_13['value'].round(2)
df_13.sample(12)

Unnamed: 0,countryorarea,agegroup,date,value,datatype,datasourcetype,datasourcenameshort,datasourcestartyear,datasourceendyear
74647,United Republic of Tanzania,[30-34],2007,217.0,Direct,Survey,2010 DHS,2009,2010
66121,Sri Lanka,[35-39],1967,145.5,Computed rate from reported ASFR,Register,Register,1967,1967
57453,Puerto Rico,[40-44],1979,11.51,Computed rate from DYB,Register,Register,1979,1979
69293,TFYR Macedonia,[40-44],1954,73.15,Fertility data (adjusted),Estimate,Estimates,1954,1954
57215,Portugal,[45-49],2014,0.56,Official estimates,Estimate,Estimates,2014,2014
50516,Nepal,[15-19],1995,155.0,Official estimates,Estimate,Estimates,1995,1995
6671,Belarus,[25-29],1977,124.4,Fertility data (adjusted),Estimate,Estimates,1977,1977
398,Albania,[Total],1964,5.66,Fertility data (adjusted),Estimate,Estimates,1964,1964
35747,Iran (Islamic Republic of),[35-39],1996,56.6,Own-children method,Census,1996 Census,1996,1996
68235,Switzerland,[30-34],1997,96.34,Official estimates,Estimate,Estimates,1997,1997


In [854]:
#df_13.to_csv("cleaned_fertility_indicators.csv", index=False)

In [855]:
#df_13.to_sql('fertility_indicators_un',engine, if_exists='replace', index=False)

In [856]:
df_14 = pd.read_csv('../data/processed/marital_status_by_age_un.csv', header= 2, low_memory=False)
df_14.head()

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,MaritalStatus,Non-standard_AgeGroups,Series_contains_Non-standard_AgeGroups,AgeGroup,AgeStart,...,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Age groups,Note on Marital Status,Note on Data,Note on Country and Population,Note Other
0,Afghanistan,4,1972,1974,Men,Divorced,,,[15-19],15,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
1,Afghanistan,4,1972,1974,Men,Divorced,,,[20-24],20,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
2,Afghanistan,4,1972,1974,Men,Divorced,,,[25-29],25,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
3,Afghanistan,4,1972,1974,Men,Divorced,,,[30-34],30,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
4,Afghanistan,4,1972,1974,Men,Divorced,,,[35-39],35,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,


In [857]:
df_14.columns= (df_14.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '' , regex=True)  
    )
df_14.sample(5)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,maritalstatus,nonstandard_agegroups,series_contains_nonstandard_agegroups,agegroup,agestart,...,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteonagegroups,noteonmaritalstatus,noteondata,noteoncountryandpopulation,noteother
226297,South Africa,710,1991,1991,Women,Single,,,[70-74],70,...,1991 Census,1360,South Africa 1991 Census,UNSD,,,,,,
217419,Senegal,686,2017,2017,Men,Widowed,,,[35-39],35,...,2017 DHS,7630,Senegal 2017 Demographic and Health Survey,DHS_STATcompiler,,,,,,
83040,France,250,1985,1985,Men,Married,,,[40-44],40,...,1985 Estimate,2094,France 1985 Estimate,UNSD,,,,,Excluding diplomatic personnel outside the cou...,
125476,Israel,376,2009,2009,Men,Married,,,[65-69],65,...,2009 Estimate,2127,Israel 2009 Estimate,UNSD,,Totals may not add up to the sum of the respec...,,,Including data for East Jerusalem and Israeli ...,
268850,Zambia,894,1996,1997,Women,Never married,,,[25-29],25,...,1996 DHS,1699,Zambia 1996 Demographic and Health Survey,DHS_STATcompiler,,,,,,


In [858]:
df_14 = df_14.drop(columns=['datacataloglongname', 'noteondata', 'noteoncountryandpopulation','noteonagegroups', 'noteother',
                             'including_consensual_unions','isocode', 'datacatalogid', 'noteonmaritalstatus', 'series_contains_nonstandard_agegroups','nonstandard_agegroups'])

df_14.rename(columns={
    "countryorarea": "country",
    "agegroup": "age_group",
    "maritalstatus": "marital_status",
    "yearstart": "year_start",
    "yearend": "year_end",
    }, inplace =True
    )

df_14.sample(10)

Unnamed: 0,country,year_start,year_end,sex,marital_status,age_group,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datasource
22580,Brazil,1980,1980,Men,Married,[20-24],20,24,23.69,Census,1980 Census,UNSD
239449,Switzerland,1996,1996,Women,Divorced,[45-49],45,49,13.15,Estimate,1996 Estimate,UNSD
51232,Costa Rica,2015,2015,Women,Married,[25-29],25,29,23.15,Estimate,2015 Estimate,UNSD
239601,Switzerland,1998,1998,Women,Widowed,[35-39],35,39,0.77,Estimate,1998 Estimate,UNSD
67432,Ecuador,1974,1974,Women,Single,[20-24],20,24,40.71,Census,1974 Census,UNSD
144524,Liechtenstein,2000,2000,Men,Divorced,[60-64],60,64,5.42,Estimate,2000 Estimate,UNSD
74448,Faeroe Islands,2011,2011,Women,Married,[65-69],65,69,69.37,Census,2011 Census,National statistics
64013,Denmark,2013,2013,Women,Single,[45-49],45,49,19.07,Estimate,2013 Estimate,UNSD
245012,Togo,1970,1970,Women,Married or in consensual union,[30-34],30,34,93.04,Census,1970 Census,UNSD
109311,Hungary,2018,2018,Men,Divorced,[25-29],25,29,0.59,Estimate,2018 Estimate,UNSD


In [859]:
df_14.drop_duplicates(inplace=True)
df_14.isnull().sum()

country                 0
year_start              0
year_end                0
sex                     0
marital_status          0
age_group               0
agestart                0
ageend                  0
datavalue               0
dataprocess             0
datacatalogshortname    0
datasource              0
dtype: int64

In [860]:
#df_14.to_csv("cleaned_marital_status_by_age_un.csv", index=False)

In [861]:
#df_14.to_sql('marital_status_by_age_un', engine, if_exists='replace', index=False)

In [862]:
df_15 = pd.read_csv('../data/processed/regions_un.csv', header=5, low_memory= False)
df_15.head(10)

Unnamed: 0,Region and subregion,ISO code,Regional Classification,Indicator,Year,AgeGroup,Percentage,Number,DataProcess
0,World,900,M49,Married or in-union women,1970,15-19,22.576683,71867.82,Estimate
1,World,900,M49,Married or in-union women,1970,20-24,63.802057,162860.4,Estimate
2,World,900,M49,Married or in-union women,1970,25-29,87.174827,182681.1,Estimate
3,World,900,M49,Married or in-union women,1970,30-34,90.825027,179121.4,Estimate
4,World,900,M49,Married or in-union women,1970,35-39,90.284386,161526.3,Estimate
5,World,900,M49,Married or in-union women,1970,40-44,86.483531,139334.4,Estimate
6,World,900,M49,Married or in-union women,1970,45-49,82.680237,116088.4,Estimate
7,World,900,M49,Married or in-union women,1970,15-49,69.379111,1013480.0,Estimate
8,World,900,M49,Married or in-union women,1971,15-19,22.630416,74127.62,Estimate
9,World,900,M49,Married or in-union women,1971,20-24,63.613178,170087.3,Estimate


In [863]:
df_15.columns = (df_15.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(','')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
    )
df_15.sample(6)

Unnamed: 0,regionandsubregion,isocode,regionalclassification,indicator,year,agegroup,percentage,number,dataprocess
26268,Lower-middle-income countries,1501,Income group,Married or in-union women,2013,35-39,89.421682,176789.388846,Estimate
9281,Central Asia,5500,SDG-M49,Married or in-union women,1996,20-24,60.025997,2874.747027,Estimate
10709,Southern Asia,5501,SDG-M49,Married or in-union women,2012,40-44,89.536677,90170.624488,Estimate
24117,Other developing countries,934,Development group,Married or in-union women,1987,40-44,88.235734,67267.379949,Estimate
4056,Europe and Northern America,513,SDG,Married or in-union women,1991,15-19,7.685652,2642.798423,Estimate
25913,Low-income countries,1500,Income group,Married or in-union women,2050,20-24,46.658067,58837.237221,Projection


In [864]:
df_15 = df_15.drop(columns=['regionalclassification'])

df_15.rename(columns={
    "regionandsubregion": "region",
    "isocode": "iso_code",
    "agegroup": "age_group",
    "dataprocess": "process"
}, inplace=True)

df_15.sample(10)

Unnamed: 0,region,iso_code,indicator,year,age_group,percentage,number,process
24009,Other developing countries,934,Married or in-union women,1974,20-24,63.243094,73034.429237,Estimate
17502,South America,931,Married or in-union women,1970,45-49,74.367035,5850.725361,Estimate
24629,Least developed countries,941,Married or in-union women,1970,40-44,81.870404,5894.647772,Estimate
8861,Asia,935,Married or in-union women,2024,40-44,89.292037,283836.509608,Estimate
7486,Southern Africa,913,Married or in-union women,2014,45-49,57.225779,1921.072824,Estimate
4719,Africa,903,Married or in-union women,1992,15-49,66.074066,205592.479669,Estimate
5804,Eastern Africa,910,Married or in-union women,2047,35-39,78.31285,43849.363835,Projection
7389,Southern Africa,913,Married or in-union women,2002,40-44,63.151416,2069.003309,Estimate
15924,Latin America and the Caribbean,904,Married or in-union women,2016,35-39,71.795045,33043.500098,Estimate
17105,Central America,916,Married or in-union women,2002,20-24,49.111576,6577.512833,Estimate


In [865]:
df_15.dropna(inplace=True)
df_15.isnull().sum()

region        0
iso_code      0
indicator     0
year          0
age_group     0
percentage    0
number        0
process       0
dtype: int64

In [866]:
print(df_15['number'] % 1 != 0)

0        True
1        True
2        True
3        True
4        True
         ... 
28507    True
28508    True
28509    True
28510    True
28511    True
Name: number, Length: 28512, dtype: bool


In [867]:
df_15['percentage'] = df_15['percentage'].round(2)
df_15['number'] = df_15['number'].astype(int)
df_15.head(10)

Unnamed: 0,region,iso_code,indicator,year,age_group,percentage,number,process
0,World,900,Married or in-union women,1970,15-19,22.58,71867,Estimate
1,World,900,Married or in-union women,1970,20-24,63.8,162860,Estimate
2,World,900,Married or in-union women,1970,25-29,87.17,182681,Estimate
3,World,900,Married or in-union women,1970,30-34,90.83,179121,Estimate
4,World,900,Married or in-union women,1970,35-39,90.28,161526,Estimate
5,World,900,Married or in-union women,1970,40-44,86.48,139334,Estimate
6,World,900,Married or in-union women,1970,45-49,82.68,116088,Estimate
7,World,900,Married or in-union women,1970,15-49,69.38,1013479,Estimate
8,World,900,Married or in-union women,1971,15-19,22.63,74127,Estimate
9,World,900,Married or in-union women,1971,20-24,63.61,170087,Estimate


In [868]:
#df_15.to_csv('cleaned_regions_un.csv', index=False)



In [869]:
#df_15.to_sql('regions_un', engine, if_exists='replace',index=False)

In [870]:
df_16_1 = pd.read_csv('../data/Raw/OECD/SF_1_1_Family_size_and_composition - Sayfa1.csv')
df_16_1
#Data for Chart SF1.1.A. Average size of households by household type, 2024a
# avg_size_all	avg_size_couple_with_children	avg_size_single_parent_with_children		

Unnamed: 0,Country,All households,Couple households with children,Single parent households with children
0,Mexico,356,408.0,276.0
1,Costa Rica,346,437.0,344.0
2,Türkiye,320,410.0,280.0
3,Israel,319,465.0,286.0
4,Columbia,310,,
5,Slovak Republic,310,380.0,250.0
6,Chile,280,,
7,Iceland,270,412.0,261.0
8,New Zealand,261,388.0,267.0
9,Greece,260,380.0,250.0


In [871]:
df_16_1.columns = df_16_1.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [872]:
df_16_1.rename(columns={
        "All households": "avg_size_all",
        "Couple with children": "avg_size_couple_with_children",
        "Single parent with children": "avg_size_single_parent_with_children"
}, inplace=True)

In [873]:
df_16_1.drop_duplicates(inplace=True)
df_16_1.dropna(inplace=True)

In [874]:
for col in df_16_1.columns:
    if col != 'country':
        # Replace commas with dots if necessary, remove non-numeric chars, convert to float
        df_16_1[col] = (
            df_16_1[col]
            .astype(str)  # ensure string for replace
            .str.replace(',', '.', regex=False)  # decimal commas to dots
            .str.replace(r'[^\d\.\-]', '', regex=True)  # remove non-numeric chars except dot and minus
            .replace('', None)  # empty to NaN
            .astype(float)  # convert to float
        )

# Check updated dtypes
print(df_16_1.dtypes)

country                                    object
all_households                            float64
couple_households_with_children           float64
single_parent_households_with_children    float64
dtype: object


In [875]:
info_16_1 = pd.DataFrame({
    'dtype': df_16_1.dtypes,
    'null_count': df_16_1.isnull().sum(),
    'unique_count': df_16_1.nunique()
})
print(info_16_1)

                                          dtype  null_count  unique_count
country                                  object           0            39
all_households                          float64           0            19
couple_households_with_children         float64           0            16
single_parent_households_with_children  float64           0            15


In [876]:
df_16_1.sample(10)

Unnamed: 0,country,all_households,couple_households_with_children,single_parent_households_with_children
42,Finland,1.9,4.0,2.6
28,Switzerland,2.21,4.02,2.58
26,United Kingdom,2.3,3.9,2.8
30,Japan,2.21,3.85,2.73
13,Spain,2.5,3.7,2.4
9,Greece,2.6,3.8,2.5
8,New Zealand,2.61,3.88,2.67
29,Korea,2.21,3.55,2.34
40,Norway,2.0,3.9,2.4
31,Austria,2.2,3.8,2.5


In [877]:
#df_16_1.to_csv('../data/Cleaned/cleaned_average_size_of_households_type_2024_oecd.csv', index=False)

In [878]:
#df_16_1.to_sql('average_size_of_households_type_2024_oecd', engine, if_exists = 'replace', index= False)

In [879]:
df_16_2 = pd.read_csv('../data/Raw/OECD/SF_1_1_Family_size_and_composition - Sayfa2.csv', header=1)
df_16_2
#Table SF1.1.A. Types of household, 2021a
# share_couple_total	share_couple_with_children	share_couple_without_children	share_single_parent_total	share_single_mother	share_single_father	share_single_person	share_other_types						

Unnamed: 0,Country,Total,With children,Without children,Total.1,Single mother households,Single father households,Single person households,Other households types
0,Australia,5593,2990,2602,1037,,,2512,858
1,Austria,4893,2113,2780,563,478,085,3834,711
2,Belgium,5222,2398,2824,742,608,135,3550,486
3,Canada,5092,2530,2562,872,,,2935,1102
4,Chile,..,..,..,..,..,..,..,..
5,Columbia,..,..,..,..,..,..,..,..
6,Costa Rica,5244,3815,1429,1055,949,106,1127,2574
7,Czechia,4703,2170,2532,715,611,104,3915,667
8,Denmark,4860,2041,2819,631,511,119,3757,752
9,Estonia,4620,2546,2073,683,609,074,3699,998


In [880]:
df_16_2.rename(columns={
    "Total": "couple_total(%)",
    "Couple with children": "couple_with_children(%)",
    "Couple without children": "couple_without_children(%)",
    "Total.1": "single_parent_total(%)",
    "Single mother households": "single_mother(%)",
    "Single father households": "single_father(%)",
    "Single person households": "single_person(%)",
    "Other types of households": "other_household_types(%)"
}, inplace=True)

In [881]:
# Normalize column names (lowercase, underscores, remove () and non-ascii)
df_16_2.columns = (
    df_16_2.columns.str.strip().str.lower()
    .str.replace(' ', '_')
    .str.replace('[()%]', '', regex=True)
    .str.replace('[^0-9a-z_]', '', regex=True)
)

In [882]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_16_2.columns if c != "country"]

df_16_2[num_cols] = (
    df_16_2[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [883]:
df_16_2.drop_duplicates(inplace=True)
df_16_2.dropna(inplace=True)
df_16_2.dropna(how="all", subset=num_cols, inplace=True)

In [884]:
df_16_2.rename(columns={
   "couple_total" : "couple_total(%)",
   "with_children" : "with_children(%)",
   "without_children" : "without_children(%)",
    "single_parent_total" : "single_parent_total(%)",
    "single_mother" : "single_mother(%)",
    "single_father" : "single_father(%)",
    "single_person" : "single_person(%)",
    "other_household_types" : "other_household_types(%)"
}, inplace=True)

In [885]:
info_16_2 = pd.DataFrame({
    "dtype": df_16_2.dtypes,
    "null_count": df_16_2.isna().sum(),
    "unique_count": df_16_2.nunique()
})
print(info_16_2)
print(df_16_2.dtypes)

                          dtype  null_count  unique_count
country                  object           0            36
couple_total(%)         float64           0            36
with_children(%)        float64           0            35
without_children(%)     float64           0            36
single_parent_total(%)  float64           0            34
single_mother(%)        float64           0            32
single_father(%)        float64           0            31
single_person(%)        float64           0            35
other_households_types  float64           0            36
country                    object
couple_total(%)           float64
with_children(%)          float64
without_children(%)       float64
single_parent_total(%)    float64
single_mother(%)          float64
single_father(%)          float64
single_person(%)          float64
other_households_types    float64
dtype: object


In [886]:
df_16_2.sample(10)

Unnamed: 0,country,couple_total(%),with_children(%),without_children(%),single_parent_total(%),single_mother(%),single_father(%),single_person(%),other_households_types
19,Japan,45.07,15.67,29.4,2.5,2.26,0.25,37.97,14.45
18,Italy,46.7,20.91,25.8,7.27,5.65,1.63,36.64,9.38
39,Bulgaria,40.3,16.35,23.95,4.6,3.88,0.73,35.81,19.28
25,Netherlands,53.6,23.01,30.59,6.1,5.0,1.09,38.5,1.8
40,Croatia,51.51,24.78,26.73,5.42,4.39,1.04,27.8,15.27
41,Cyprus,56.92,27.42,29.5,6.17,4.94,1.23,24.49,12.43
33,Sweden,49.27,22.49,26.78,6.67,4.91,1.76,39.24,4.82
42,Malta,46.92,21.11,25.81,5.68,4.56,1.12,32.51,14.89
30,Slovak Republic,37.15,16.99,20.16,6.23,5.39,0.84,31.4,25.21
10,Finland,45.64,17.06,28.58,5.43,4.5,0.93,45.34,3.6


In [887]:
#df_16_2.to_csv('../data/Cleaned/cleaned_types_of_household_2021_oecd.csv', index = False)

In [888]:
#df_16_2.to_sql('types_of_household_2021_oecd', engine, if_exists = 'replace', index= False)

In [889]:
df_16_3 = pd.read_csv('../data/Raw/OECD/SF_1_1_Family_size_and_composition - Sayfa3.csv', header=1)
df_16_3
#Table SF1.1.B. Households by number of children, 2024
# share_hh_0_children	share_hh_1_child	share_hh_2_children	share_hh_3plus_children		

Unnamed: 0,country,0 children,1 child,2 children,3 or more children,Children under 6
0,Australia,..,..,..,..,..
1,Austria,7778,1052,857,312,944
2,Belgium,7397,1176,1015,411,1040
3,Canada,..,..,..,..,..
4,Chile,..,..,..,..,..
5,Columbia,..,..,..,..,..
6,Costa Rica,3029,2308,2461,2202,2630
7,Czechia,7195,1385,1156,264,1229
8,Denmark,7778,1054,894,274,815
9,Estonia,7576,1253,873,298,985


In [890]:
# Normalize column names (lowercase, underscores, remove () and non-ascii)
df_16_3.columns = (
    df_16_3.columns.str.strip().str.lower()
    .str.replace(' ', '_')
    .str.replace('[^0-9a-z_]', '', regex=True)
)

In [891]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_16_3.columns if c != "country"]

df_16_3[num_cols] = (
    df_16_3[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [892]:
df_16_3.drop_duplicates(inplace=True)
df_16_3.dropna(inplace=True)

In [893]:
df_16_3.sample(10)

Unnamed: 0,country,0_children,1_child,2_children,3_or_more_children,children_under_6
9,Estonia,75.76,12.53,8.73,2.98,9.85
21,Latvia,74.8,14.05,8.32,2.83,10.07
16,Ireland,69.02,12.42,12.18,6.38,11.81
30,Slovak Republic,64.41,17.09,14.49,4.02,15.56
14,Hungary,74.98,13.16,8.71,3.16,10.41
13,Greece,74.31,11.83,9.97,3.89,9.37
8,Denmark,77.78,10.54,8.94,2.74,8.15
25,Netherlands,78.65,8.78,9.27,3.3,8.79
22,Lithuania,80.44,11.06,7.0,1.51,8.12
1,Austria,77.78,10.52,8.57,3.12,9.44


In [894]:
df_16_3.rename(columns={
    "0_children": "households_0_children(%)",
    "1_child": "households_1_child(%)",
    "2_children": "households_2_children(%)",
    "3_or_more_children": "households_3_or_more_children(%)"
}, inplace=True)

In [895]:
info_16_3 = pd.DataFrame({
    "dtype": df_16_3.dtypes,
    "null_count": df_16_3.isna().sum(),
    "unique_count": df_16_3.nunique()
})
print(info_16_3)
print(df_16_3.dtypes)

                                    dtype  null_count  unique_count
country                            object           0            33
households_0_children(%)          float64           0            32
households_1_child(%)             float64           0            32
households_2_children(%)          float64           0            33
households_3_or_more_children(%)  float64           0            31
children_under_6                  float64           0            31
country                              object
households_0_children(%)            float64
households_1_child(%)               float64
households_2_children(%)            float64
households_3_or_more_children(%)    float64
children_under_6                    float64
dtype: object


In [896]:
df_16_3.sample(10)

Unnamed: 0,country,households_0_children(%),households_1_child(%),households_2_children(%),households_3_or_more_children(%),children_under_6
22,Lithuania,80.44,11.06,7.0,1.51,8.12
8,Denmark,77.78,10.54,8.94,2.74,8.15
10,Finland,81.98,7.89,6.99,3.14,7.14
1,Austria,77.78,10.52,8.57,3.12,9.44
41,Cyprus,71.36,13.88,11.67,3.1,12.71
28,Poland,74.39,12.91,9.84,2.86,9.97
19,Japan,81.94,8.78,7.17,2.11,7.58
32,Spain,74.61,13.54,8.95,2.9,8.79
14,Hungary,74.98,13.16,8.71,3.16,10.41
33,Sweden,74.84,10.77,9.83,4.56,9.95


In [897]:
#df_16_3.to_csv('../data/Cleaned/cleaned_households_by_number_of_children_2024_oecd.csv', index=False)

In [898]:
#df_16_3.to_sql('households_by_number_of_children_2024_oecd', engine, index= False)

In [899]:
df_17_1 = pd.read_csv('../data/Raw/OECD/SF_2_1_Total_Fertility_rates_S1.csv')
#total_fertility_rates_from_1960_oecd
df_17_1.head()

Unnamed: 0,Country,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Australia,345,355,343,334,315,297,289,285,289,...,179,179,179,174,174,167,159,170,163,150
1,Austria,269,278,280,282,279,270,266,262,258,...,146,149,153,152,148,146,144,148,141,132
2,Belgium,254,263,259,268,271,261,252,241,231,...,174,170,168,165,162,160,155,160,153,147
3,Canada,390,384,376,367,350,315,281,260,245,...,161,160,159,155,151,147,141,144,133,126
4,Chile,470,466,460,454,446,436,426,414,403,...,177,174,169,156,154,143,131,118,126,117


In [900]:
df_info = pd.DataFrame({
    'dtype': df_17_1.dtypes,
    'null_count': df_17_1.isnull().sum(),
    'unique_count': df_17_1.nunique()
})
print(df_info)

          dtype  null_count  unique_count
Country  object           0            49
1960     object           0            47
1961     object           0            47
1962     object           0            47
1963     object           0            46
...         ...         ...           ...
2019     object           0            37
2020     object           0            39
2021     object           0            40
2022     object           0            34
2023     object           0            35

[65 rows x 3 columns]


In [901]:
df_17_1.columns = df_17_1.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [902]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_17_1.columns if c != "country"]

df_17_1[num_cols] = (
    df_17_1[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [903]:
df_17_1.drop_duplicates(inplace=True)
df_17_1.dropna(inplace=True)

In [904]:
df_info = pd.DataFrame({
    'dtype': df_17_1.dtypes,
    'null_count': df_17_1.isnull().sum(),
    'unique_count': df_17_1.nunique()
})
print(df_info)

           dtype  null_count  unique_count
country   object           0            49
1960     float64           0            47
1961     float64           0            47
1962     float64           0            47
1963     float64           0            46
...          ...         ...           ...
2019     float64           0            37
2020     float64           0            39
2021     float64           0            40
2022     float64           0            34
2023     float64           0            35

[65 rows x 3 columns]


In [905]:
df_17_1.sample(10)

Unnamed: 0,country,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
1,Austria,2.69,2.78,2.8,2.82,2.79,2.7,2.66,2.62,2.58,...,1.46,1.49,1.53,1.52,1.48,1.46,1.44,1.48,1.41,1.32
20,Latvia,1.94,1.94,1.91,1.85,1.79,1.74,1.76,1.8,1.83,...,1.65,1.7,1.74,1.69,1.6,1.61,1.55,1.57,1.47,1.36
4,Chile,4.7,4.66,4.6,4.54,4.46,4.36,4.26,4.14,4.03,...,1.77,1.74,1.69,1.56,1.54,1.43,1.31,1.18,1.26,1.17
27,Poland,2.98,2.83,2.72,2.7,2.57,2.52,2.34,2.33,2.24,...,1.29,1.29,1.36,1.45,1.44,1.42,1.39,1.33,1.26,1.16
25,New Zealand,4.24,4.31,4.19,4.05,3.8,3.54,3.41,3.35,3.34,...,1.92,1.99,1.87,1.81,1.71,1.72,1.61,1.64,1.66,1.56
33,Switzerland,2.44,2.53,2.6,2.67,2.68,2.61,2.52,2.41,2.3,...,1.54,1.54,1.54,1.52,1.52,1.48,1.46,1.52,1.39,1.33
2,Belgium,2.54,2.63,2.59,2.68,2.71,2.61,2.52,2.41,2.31,...,1.74,1.7,1.68,1.65,1.62,1.6,1.55,1.6,1.53,1.47
11,France,2.74,2.82,2.8,2.9,2.91,2.85,2.8,2.67,2.59,...,2.0,1.96,1.92,1.89,1.87,1.86,1.82,1.83,1.78,1.66
10,Finland,2.71,2.65,2.66,2.66,2.58,2.46,2.41,2.32,2.15,...,1.71,1.65,1.57,1.49,1.41,1.35,1.37,1.46,1.32,1.26
3,Canada,3.9,3.84,3.76,3.67,3.5,3.15,2.81,2.6,2.45,...,1.61,1.6,1.59,1.55,1.51,1.47,1.41,1.44,1.33,1.26


In [906]:
#df_17_1.to_csv('../data/Cleaned/cleaned_total_fertility_rates_oecd.csv', index=False)

In [907]:
#df_17_1.to_sql('total_fertility_rates_oecd', engine, if_exists='replace', index=False)

In [908]:
df_17_2 = pd.read_csv('../data/Raw/OECD/SF_2_1_Fertility_rates_Births_by_birth_order_S2.csv')
df_17_2

Unnamed: 0,Country,Birth order,1987,1988,1989,1990,1991,1992,1993,1994,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Austria,First birth,476,478,467,462,465,461,452,445,...,480,483,473,475,471,472,477,476,484,481
1,Austria,Second birth,337,337,343,349,345,348,358,364,...,355,353,356,353,353,351,353,355,349,351
2,Austria,Third birth or higher,188,185,190,189,190,191,189,191,...,165,164,171,172,176,177,170,169,167,168
3,Belgium,First birth,468,469,473,473,481,472,469,472,...,423,435,441,436,429,426,450,440,447,455
4,Belgium,Second birth,330,329,327,328,323,328,335,330,...,351,348,345,346,345,347,342,351,343,341
5,Belgium,Third birth or higher,202,202,199,199,196,200,196,198,...,226,218,214,219,226,226,208,209,209,204
6,Czechia,First birth,467,466,474,478,501,498,485,477,...,474,481,487,487,480,478,476,464,463,463
7,Czechia,Second birth,377,376,374,372,355,358,368,369,...,375,373,367,366,372,376,376,390,386,391
8,Czechia,Third birth or higher,156,158,152,150,144,144,148,154,...,151,147,146,147,147,146,148,146,15,146
9,Estonia,First birth,435,435,440,462,495,503,496,496,...,419,423,408,402,367,388,380,372,398,397


In [909]:
df_info = pd.DataFrame({
    'dtype': df_17_2.dtypes,
    'null_count': df_17_2.isnull().sum(),
    'unique_count': df_17_2.nunique()
})
print(df_info)

              dtype  null_count  unique_count
Country      object           0            17
Birth order  object           0             3
1987         object           0            48
1988         object           0            49
1989         object           0            48
1990         object           0            44
1991         object           0            48
1992         object           0            46
1993         object           0            47
1994         object           0            47
1995         object           0            48
1996         object           0            47
1997         object           0            49
1998         object           0            50
1999         object           0            49
2000         object           0            48
2001         object           0            50
2002         object           0            47
2003         object           0            50
2004         object           0            49
2005         object           0   

In [910]:
df_17_2.columns = df_17_2.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_17_2.head()

Unnamed: 0,country,birth_order,1987,1988,1989,1990,1991,1992,1993,1994,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Austria,First birth,476,478,467,462,465,461,452,445,...,480,483,473,475,471,472,477,476,484,481
1,Austria,Second birth,337,337,343,349,345,348,358,364,...,355,353,356,353,353,351,353,355,349,351
2,Austria,Third birth or higher,188,185,190,189,190,191,189,191,...,165,164,171,172,176,177,170,169,167,168
3,Belgium,First birth,468,469,473,473,481,472,469,472,...,423,435,441,436,429,426,450,440,447,455
4,Belgium,Second birth,330,329,327,328,323,328,335,330,...,351,348,345,346,345,347,342,351,343,341


In [911]:
# --- Ensure "country" and "birth order" are strings (tidy casing/spacing) ---
df_17_2["country"] = df_17_2["country"].astype(str).str.strip().str.title()
df_17_2["birth_order"] = df_17_2["birth_order"].astype(str).str.strip()

# --- Identify numeric columns: everything except country and birth order ---
num_cols = [c for c in df_17_2.columns if c not in ["country", "birth_order"]]
# --- Robust cleaning -> convert to float ---
df_17_2[num_cols] = (
    df_17_2[num_cols].astype(str)
    .replace({"\xa0": "", "\u202f": "", ",": "."}, regex=True)
    .replace(r"[^\d\.\-]", "", regex=True)
    .replace(r"\.\.+", ".", regex=True)
    .replace(r"^\.$|^\s*$", np.nan, regex=True)
    .apply(pd.to_numeric, errors="coerce")
)
df_17_2[num_cols] = df_17_2[num_cols].round(2)


In [912]:
df_17_2.drop_duplicates(inplace=True)
df_17_2.dropna(inplace=True)

In [913]:
df_17_2["birth_order"] = df_17_2["birth_order"].astype(str) + "(%)"

In [914]:
df_17_2.head(10)

Unnamed: 0,country,birth_order,1987,1988,1989,1990,1991,1992,1993,1994,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Austria,First birth(%),47.6,47.8,46.7,46.2,46.5,46.1,45.2,44.5,...,48.0,48.3,47.3,47.5,47.1,47.2,47.7,47.6,48.4,48.1
1,Austria,Second birth(%),33.7,33.7,34.3,34.9,34.5,34.8,35.8,36.4,...,35.5,35.3,35.6,35.3,35.3,35.1,35.3,35.5,34.9,35.1
2,Austria,Third birth or higher(%),18.8,18.5,19.0,18.9,19.0,19.1,18.9,19.1,...,16.5,16.4,17.1,17.2,17.6,17.7,17.0,16.9,16.7,16.8
3,Belgium,First birth(%),46.8,46.9,47.3,47.3,48.1,47.2,46.9,47.2,...,42.3,43.5,44.1,43.6,42.9,42.6,45.0,44.0,44.7,45.5
4,Belgium,Second birth(%),33.0,32.9,32.7,32.8,32.3,32.8,33.5,33.0,...,35.1,34.8,34.5,34.6,34.5,34.7,34.2,35.1,34.3,34.1
5,Belgium,Third birth or higher(%),20.2,20.2,19.9,19.9,19.6,20.0,19.6,19.8,...,22.6,21.8,21.4,21.9,22.6,22.6,20.8,20.9,20.9,20.4
6,Czechia,First birth(%),46.7,46.6,47.4,47.8,50.1,49.8,48.5,47.7,...,47.4,48.1,48.7,48.7,48.0,47.8,47.6,46.4,46.3,46.3
7,Czechia,Second birth(%),37.7,37.6,37.4,37.2,35.5,35.8,36.8,36.9,...,37.5,37.3,36.7,36.6,37.2,37.6,37.6,39.0,38.6,39.1
8,Czechia,Third birth or higher(%),15.6,15.8,15.2,15.0,14.4,14.4,14.8,15.4,...,15.1,14.7,14.6,14.7,14.7,14.6,14.8,14.6,15.0,14.6
9,Estonia,First birth(%),43.5,43.5,44.0,46.2,49.5,50.3,49.6,49.6,...,41.9,42.3,40.8,40.2,36.7,38.8,38.0,37.2,39.8,39.7


In [915]:
df_info = pd.DataFrame({
    'dtype': df_17_2.dtypes,
    'null_count': df_17_2.isnull().sum(),
    'unique_count': df_17_2.nunique()
})
print(df_info)

               dtype  null_count  unique_count
country       object           0            17
birth_order   object           0             3
1987         float64           0            48
1988         float64           0            49
1989         float64           0            48
1990         float64           0            44
1991         float64           0            48
1992         float64           0            46
1993         float64           0            47
1994         float64           0            47
1995         float64           0            48
1996         float64           0            47
1997         float64           0            49
1998         float64           0            50
1999         float64           0            49
2000         float64           0            48
2001         float64           0            50
2002         float64           0            47
2003         float64           0            50
2004         float64           0            49
2005         

In [916]:
#df_17_2.to_csv('../data/Cleaned/cleaned_births_by_birth_order_oecd.csv', index=False)

In [917]:
#df_17_2.to_sql('births_by_birth_order_oecd', engine, if_exists='replace', index=False)

In [918]:
df_18 = pd.read_csv('../data/Raw/OECD/sf1_2_wide_from_df18.csv')
df_18

Unnamed: 0,country,year,Living with two parents,Living with a single parent,Other
0,Australia,2003,80.1,19.5,0.5
1,Australia,2006,81.5,18.0,0.5
2,Australia,2009,82.0,17.6,0.4
3,Australia,2012,81.3,18.0,0.6
4,Austria,2003,81.2,16.8,2.0
...,...,...,...,...,...
470,United States,2014,68.7,27.5,3.8
471,United States,2015,69.2,26.8,3.9
472,United States,2016,68.7,27.4,3.8
473,United States,2017,68.9,27.1,4.0


In [919]:
for col in df_18.select_dtypes(include=['object']).columns:
    df_18[col] = df_18[col].astype(str).str.strip()

# 2) Define placeholders representing missing data in OECD exports
placeholders = ['..', '...', '.', ' .', '…', 'Na', 'nan', 'None']

# 3) Replace placeholders with NaN directly in df_18
df_18.replace(placeholders, pd.NA, inplace=True)

In [920]:
# 1) Ensure 'year' is integer
df_18["year"] = pd.to_numeric(df_18["year"], errors="coerce").astype("Int64")

# 2) Convert all non-key columns to numeric and round(2)
for col in df_18.columns:
    if col not in ["country", "year"]:
        df_18[col] = pd.to_numeric(df_18[col], errors="coerce").round(2)

In [921]:
# 1) Drop rows with missing key fields
df_18.dropna(subset=["country", "year"], inplace=True)

# 2) Drop duplicate country-year rows, keep the first
df_18.drop_duplicates(subset=["country", "year"], keep="first", inplace=True)

# 3) Drop rows where all value columns are NaN
value_cols = [c for c in df_18.columns if c not in ["country", "year"]]
df_18.dropna(subset=value_cols, how="all", inplace=True)

# 4) Sort and reset index
df_18.sort_values(["country", "year"], inplace=True)
df_18.reset_index(drop=True, inplace=True)


In [922]:
df_18

Unnamed: 0,country,year,Living with two parents,Living with a single parent,Other
0,Australia,2003,80.1,19.5,0.5
1,Australia,2006,81.5,18.0,0.5
2,Australia,2009,82.0,17.6,0.4
3,Australia,2012,81.3,18.0,0.6
4,Austria,2003,81.2,16.8,2.0
...,...,...,...,...,...
470,United States,2014,68.7,27.5,3.8
471,United States,2015,69.2,26.8,3.9
472,United States,2016,68.7,27.4,3.8
473,United States,2017,68.9,27.1,4.0


In [923]:
df_18['Other'] = pd.to_numeric(df_18['Other'], errors='coerce')

In [924]:
df_info = pd.DataFrame({
    'dtype': df_18.dtypes,
    'null_count': df_18.isnull().sum(),
    'unique_count': df_18.nunique()
})
print(df_info)

                               dtype  null_count  unique_count
country                       object           0            39
year                           Int64           0            18
Living with two parents      float64           0           211
Living with a single parent  float64           0           203
Other                        float64           1            50


In [925]:
print(repr(df_18.loc[df_18['Other'].notnull(), 'Other'].unique()))

array([0.5, 0.4, 0.6, 2. , 1. , 1.9, 0.3, 0.1, 0.8, 0.7, 8.7, 3.5, 2.5,
       2.1, 2.4, 2.6, 6.7, 5.1, 1.4, 1.2, 1.7, 1.5, 3.4, 2.9, 2.3, 3. ,
       4.2, 2.8, 1.3, 9. , 0.2, 0.9, 1.1, 4.5, 4.7, 1.6, 3.8, 3.6, 3.3,
       2.2, 0. , 1.8, 2.7, 3.2, 3.9, 4.1, 4.4, 3.7, 4. , 4.3])


In [926]:
df_18['Other'] = pd.to_numeric(df_18['Other'], errors='coerce')

df_18.dropna(inplace=True, subset=['Other'])

df_18.isnull().sum()

country                        0
year                           0
Living with two parents        0
Living with a single parent    0
Other                          0
dtype: int64

In [927]:
#df_18.to_csv('../data/Cleaned/cleaned_household_children.csv', index=False)

In [928]:
#df_18.to_sql('household_children_oecd', engine, if_exists= 'replace', index= False)

In [929]:
df_19_1 =pd.read_csv('../data/Raw/OECD/SF_2_3_Age_mothers_childbirth_mean_age_birth_S1.csv')
#age_of_mothers_at_childbirth
df_19_1

Unnamed: 0,Country,1963,1964,1965,1966,1967,1968,1969,1970,1971,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Australia,275,275,274,273,273,272,272,271,269,...,301,301,302,303,305,306,307,308,309,311
1,Austria,274,274,273,271,270,268,268,267,267,...,302,303,304,306,306,307,309,310,310,312
2,Belgium,278,277,276,275,274,273,272,272,270,...,300,302,303,304,305,306,307,308,308,310
3,Canada,278,279,278,277,275,273,273,272,270,...,303,304,305,306,307,309,310,312,313,314
4,Chile,292,291,291,290,288,287,286,284,282,...,281,283,285,288,291,294,296,299,301,..
5,Czech Republic,257,258,255,252,250,249,248,248,249,...,298,299,299,300,300,300,301,302,302,304
6,Costa Rica,293,293,293,293,292,291,289,287,285,...,265,267,268,271,272,274,276,279,284,287
7,Denmark,273,268,268,266,265,265,266,267,267,...,307,308,309,310,310,311,312,313,314,316
8,Estonia,276,274,273,273,271,269,269,267,267,...,296,295,296,299,302,304,305,306,307,310
9,Finland,281,280,280,278,277,275,274,271,269,...,304,305,305,306,308,309,310,311,312,314


In [930]:
df_info = pd.DataFrame({
    'dtype': df_19_1.dtypes,
    'null_count': df_19_1.isnull().sum(),
    'unique_count': df_19_1.nunique()
})
print(df_info)

          dtype  null_count  unique_count
Country  object           0            26
1963     object           0            19
1964     object           0            22
1965     object           0            22
1966     object           0            22
1967     object           0            22
1968     object           0            20
1969     object           0            21
1970     object           0            19
1971     object           0            19
1972     object           0            20
1973     object           0            20
1974     object           0            24
1975     object           0            21
1976     object           0            22
1977     object           0            20
1978     object           0            22
1979     object           0            23
1980     object           0            22
1981     object           0            20
1982     object           0            18
1983     object           0            20
1984     object           0       

In [931]:
df_19_1.columns = df_19_1.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [932]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_19_1.columns if c != "country"]

df_19_1[num_cols] = (
    df_19_1[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [933]:
df_19_1.drop_duplicates(inplace=True)
df_19_1.dropna(inplace=True)

In [934]:
df_info = pd.DataFrame({
    'dtype': df_19_1.dtypes,
    'null_count': df_19_1.isnull().sum(),
    'unique_count': df_19_1.nunique()
})
print(df_info)

           dtype  null_count  unique_count
country   object           0            22
1963     float64           0            16
1964     float64           0            18
1965     float64           0            18
1966     float64           0            18
1967     float64           0            18
1968     float64           0            17
1969     float64           0            17
1970     float64           0            15
1971     float64           0            17
1972     float64           0            18
1973     float64           0            18
1974     float64           0            20
1975     float64           0            18
1976     float64           0            18
1977     float64           0            16
1978     float64           0            18
1979     float64           0            21
1980     float64           0            20
1981     float64           0            17
1982     float64           0            17
1983     float64           0            18
1984     fl

In [935]:
df_19_1.sample(10)

Unnamed: 0,country,1963,1964,1965,1966,1967,1968,1969,1970,1971,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
21,Slovak Republic,26.8,26.9,26.8,26.6,26.5,26.4,26.3,26.2,26.2,...,28.7,28.8,28.8,28.8,28.8,28.8,28.8,28.8,28.9,28.9
2,Belgium,27.8,27.7,27.6,27.5,27.4,27.3,27.2,27.2,27.0,...,30.0,30.2,30.3,30.4,30.5,30.6,30.7,30.8,30.8,31.0
8,Estonia,27.6,27.4,27.3,27.3,27.1,26.9,26.9,26.7,26.7,...,29.6,29.5,29.6,29.9,30.2,30.4,30.5,30.6,30.7,31.0
0,Australia,27.5,27.5,27.4,27.3,27.3,27.2,27.2,27.1,26.9,...,30.1,30.1,30.2,30.3,30.5,30.6,30.7,30.8,30.9,31.1
5,Czech Republic,25.7,25.8,25.5,25.2,25.0,24.9,24.8,24.8,24.9,...,29.8,29.9,29.9,30.0,30.0,30.0,30.1,30.2,30.2,30.4
14,Israel,28.1,28.1,28.2,28.2,28.2,28.5,28.4,28.2,28.5,...,30.1,30.2,30.3,30.3,30.4,30.4,30.4,30.5,30.5,30.6
1,Austria,27.4,27.4,27.3,27.1,27.0,26.8,26.8,26.7,26.7,...,30.2,30.3,30.4,30.6,30.6,30.7,30.9,31.0,31.0,31.2
11,Hungary,25.8,25.7,25.6,25.6,25.6,25.5,25.5,25.4,25.4,...,29.4,29.5,29.5,29.6,29.6,29.8,29.8,29.9,29.9,30.0
7,Denmark,27.3,26.8,26.8,26.6,26.5,26.5,26.6,26.7,26.7,...,30.7,30.8,30.9,31.0,31.0,31.1,31.2,31.3,31.4,31.6
19,Norway,27.8,27.8,27.7,27.6,27.5,27.3,27.2,27.0,26.8,...,30.3,30.5,30.6,30.7,30.8,31.0,31.1,31.3,31.4,31.5


In [936]:
#df_19_1.to_csv('../data/Cleaned/age_of_mothers_at_childbirth_oecd.csv', index=False)

In [937]:
#df_19_1.to_sql('age_of_mothers_at_childbirth_oecd', engine, if_exists='replace', index=False)

In [938]:
df_19_2 = pd.read_csv('../data/Raw/OECD/SF_2_3_Age_mothers_childbirth_fertility_by_age_1960_S2.csv')
#fertility_per_1000_from 1960
df_19_2.head()

Unnamed: 0,Country,Age group,1960,1961,1962,1963,1964,1965,1966,1967,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Australia,15-19,443,474,447,459,470,475,489,484,...,161,146,129,120,105,103,95,88,79,71
1,Australia,20-24,2201,2258,2160,2082,1905,1793,1731,1708,...,532,513,474,473,447,431,428,401,377,388
2,Australia,25-29,2163,2212,2167,2112,1981,1885,1839,1850,...,1026,991,948,934,922,897,893,843,803,867
3,Australia,30-34,1275,1311,1277,1239,1191,1101,1051,1028,...,1269,1248,1204,1217,1236,1191,1201,1156,1114,1206
4,Australia,35-39,623,634,614,597,584,530,506,478,...,715,709,692,698,720,713,716,693,663,709


In [939]:
df_info = pd.DataFrame({
    'dtype': df_19_2.dtypes,
    'null_count': df_19_2.isnull().sum(),
    'unique_count': df_19_2.nunique()
})
print(df_info)

            dtype  null_count  unique_count
Country    object           0            21
Age group  object           0             7
1960       object           0           136
1961       object           0           140
1962       object           0           140
...           ...         ...           ...
2017       object           0           124
2018       object           0           128
2019       object           0           126
2020       object           0           121
2021       object           7           119

[64 rows x 3 columns]


In [940]:
df_19_2.columns = df_19_2.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_19_2.head()

Unnamed: 0,country,age_group,1960,1961,1962,1963,1964,1965,1966,1967,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Australia,15-19,443,474,447,459,470,475,489,484,...,161,146,129,120,105,103,95,88,79,71
1,Australia,20-24,2201,2258,2160,2082,1905,1793,1731,1708,...,532,513,474,473,447,431,428,401,377,388
2,Australia,25-29,2163,2212,2167,2112,1981,1885,1839,1850,...,1026,991,948,934,922,897,893,843,803,867
3,Australia,30-34,1275,1311,1277,1239,1191,1101,1051,1028,...,1269,1248,1204,1217,1236,1191,1201,1156,1114,1206
4,Australia,35-39,623,634,614,597,584,530,506,478,...,715,709,692,698,720,713,716,693,663,709


In [941]:
# --- Ensure "country" and "age_group" are strings
df_19_2["country"] = df_19_2["country"].astype(str).str.strip().str.title()
df_19_2["age_group"] = df_19_2["age_group"].astype(str).str.strip()

# --- Identify numeric columns: everything except country and birth order ---
num_cols = [c for c in df_19_2.columns if c not in ["country", "age_group"]]
# --- Robust cleaning -> convert to float ---
df_19_2[num_cols] = (
    df_19_2[num_cols].astype(str)
    .replace({"\xa0": "", "\u202f": "", ",": "."}, regex=True)
    .replace(r"[^\d\.\-]", "", regex=True)
    .replace(r"\.\.+", ".", regex=True)
    .replace(r"^\.$|^\s*$", np.nan, regex=True)
    .apply(pd.to_numeric, errors="coerce")
)
df_19_2[num_cols] = df_19_2[num_cols].round(2)

In [942]:
df_19_2.drop_duplicates(inplace=True)
df_19_2.dropna(inplace = True)

In [943]:
df_info = pd.DataFrame({
    'dtype': df_19_2.dtypes,
    'null_count': df_19_2.isnull().sum(),
    'unique_count': df_19_2.nunique()
})
print(df_info)

             dtype  null_count  unique_count
country     object           0            19
age_group   object           0             7
1960       float64           0           124
1961       float64           0           126
1962       float64           0           126
...            ...         ...           ...
2017       float64           0           118
2018       float64           0           121
2019       float64           0           120
2020       float64           0           115
2021       float64           0           118

[64 rows x 3 columns]


In [944]:
#df_19_2.to_csv('../data/Cleaned/fertility_per_1000_by_age_from 1960_oecd.csv', index=False)

In [945]:
#df_19_2.to_sql('fertility_per_1000_from_1960_oecd', engine, if_exists='replace', index=False)

In [946]:
df_19_3 = pd.read_csv('../data/Raw/OECD/SF_2_3_Age_mothers_childbirth_fertility_by_age_2000_S3.csv')
#fertility_per_1000_from_2000
df_19_3

Unnamed: 0,Country,Age group,2000,2001,2002,2003,2004,2005,2006,2007,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,OECD-Average,15-19,226,220,211,205,203,201,200,205,...,179,168,162,152,144,135,126,117,102,95
1,OECD-Average,20-24,717,693,668,655,647,632,629,630,...,564,538,533,519,504,488,470,450,420,405
2,OECD-Average,25-29,1079,1050,1031,1035,1034,1023,1026,1034,...,994,965,969,961,949,928,907,884,855,869
3,OECD-Average,30-34,881,872,886,911,934,946,976,1000,...,1036,1019,1040,1049,1053,1041,1033,1017,996,1036
4,OECD-Average,35-39,381,386,395,406,422,435,456,477,...,531,534,551,563,571,570,574,575,559,587
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296,Romania,25-29,782,770,786,820,848,908,923,930,...,918,883,944,989,1001,1090,1083,1091,1094,1109
297,Romania,30-34,388,381,388,388,416,475,511,542,...,666,648,715,754,785,866,859,864,871,875
298,Romania,35-39,134,138,152,194,232,251,257,249,...,273,274,299,321,330,368,367,383,406,411
299,Romania,40-44,31,31,30,29,31,31,28,31,...,49,48,56,61,68,73,78,80,85,82


In [947]:
df_info = pd.DataFrame({
    'dtype': df_19_3.dtypes,
    'null_count': df_19_3.isnull().sum(),
    'unique_count': df_19_3.nunique()
})
print(df_info)

            dtype  null_count  unique_count
Country    object           0            43
Age group  object           0             7
2000       object           0           233
2001       object           0           248
2002       object           0           240
2003       object           0           239
2004       object           0           245
2005       object           0           240
2006       object           0           239
2007       object           0           242
2008       object           0           252
2009       object           0           251
2010       object           0           239
2011       object           0           235
2012       object           0           242
2013       object           0           234
2014       object           0           238
2015       object           0           237
2016       object           0           248
2017       object           0           236
2018       object           0           245
2019       object           0   

In [948]:
df_19_3.columns = df_19_3.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_19_3.head()

Unnamed: 0,country,age_group,2000,2001,2002,2003,2004,2005,2006,2007,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,OECD-Average,15-19,226,220,211,205,203,201,200,205,...,179,168,162,152,144,135,126,117,102,95
1,OECD-Average,20-24,717,693,668,655,647,632,629,630,...,564,538,533,519,504,488,470,450,420,405
2,OECD-Average,25-29,1079,1050,1031,1035,1034,1023,1026,1034,...,994,965,969,961,949,928,907,884,855,869
3,OECD-Average,30-34,881,872,886,911,934,946,976,1000,...,1036,1019,1040,1049,1053,1041,1033,1017,996,1036
4,OECD-Average,35-39,381,386,395,406,422,435,456,477,...,531,534,551,563,571,570,574,575,559,587


In [949]:
# --- Ensure "country" and "age_group" are strings
df_19_3["country"] = df_19_3["country"].astype(str).str.strip().str.title()
df_19_3["age_group"] = df_19_3["age_group"].astype(str).str.strip()

# --- Identify numeric columns: everything except country and birth order ---
num_cols = [c for c in df_19_3.columns if c not in ["country", "age_group"]]
# --- Robust cleaning -> convert to float ---
df_19_3[num_cols] = (
    df_19_3[num_cols].astype(str)
    .replace({"\xa0": "", "\u202f": "", ",": "."}, regex=True)
    .replace(r"[^\d\.\-]", "", regex=True)
    .replace(r"\.\.+", ".", regex=True)
    .replace(r"^\.$|^\s*$", np.nan, regex=True)
    .apply(pd.to_numeric, errors="coerce")
)
df_19_3[num_cols] = df_19_3[num_cols].round(2)

In [950]:
df_19_3.drop_duplicates(inplace=True)
df_19_3.dropna(inplace=True)

In [951]:
#Check again
df_info = pd.DataFrame({
    'dtype': df_19_3.dtypes,
    'null_count': df_19_3.isnull().sum(),
    'unique_count': df_19_3.nunique()
})
print(df_info)

             dtype  null_count  unique_count
country     object           0            41
age_group   object           0             7
2000       float64           0           225
2001       float64           0           237
2002       float64           0           232
2003       float64           0           229
2004       float64           0           233
2005       float64           0           229
2006       float64           0           229
2007       float64           0           230
2008       float64           0           238
2009       float64           0           238
2010       float64           0           230
2011       float64           0           227
2012       float64           0           231
2013       float64           0           225
2014       float64           0           226
2015       float64           0           225
2016       float64           0           237
2017       float64           0           227
2018       float64           0           233
2019      

In [952]:
df_19_3.sample(10)

Unnamed: 0,country,age_group,2000,2001,2002,2003,2004,2005,2006,2007,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
193,New Zealand,35-39,52.5,53.7,54.7,58.7,60.2,62.3,63.7,70.3,...,69.5,69.8,66.4,70.8,67.8,65.3,63.7,64.8,60.7,62.6
76,Estonia,45-49,0.2,0.2,0.1,0.3,0.2,0.2,0.2,0.3,...,0.5,0.4,0.6,0.6,0.5,0.9,0.8,1.1,1.1,1.2
155,Latvia,20-24,81.7,78.5,76.6,78.5,74.8,74.1,76.8,77.2,...,58.7,60.2,63.5,64.4,64.3,62.6,56.0,54.1,53.8,46.6
239,Sweden,20-24,47.3,46.7,47.7,47.1,46.9,46.7,47.6,49.6,...,47.3,45.7,44.5,43.6,43.1,41.0,40.4,38.2,34.9,32.4
202,Norway,45-49,0.2,0.3,0.2,0.3,0.3,0.4,0.4,0.4,...,0.6,0.5,0.7,0.8,0.7,0.7,0.8,0.9,0.7,0.8
159,Latvia,40-44,3.8,4.2,4.3,4.7,5.1,4.9,5.7,6.4,...,7.6,9.4,10.0,11.3,12.2,13.5,12.8,13.6,13.2,13.5
278,Bulgaria,40-44,1.9,1.8,1.7,1.8,2.2,2.2,2.3,2.9,...,4.4,4.7,5.4,5.9,6.6,7.1,7.2,7.7,7.8,7.8
1,Oecd-Average,20-24,71.7,69.3,66.8,65.5,64.7,63.2,62.9,63.0,...,56.4,53.8,53.3,51.9,50.4,48.8,47.0,45.0,42.0,40.5
131,Israel,40-44,21.5,22.2,22.2,23.5,23.8,23.6,24.1,24.5,...,29.5,29.1,29.9,31.5,31.5,31.2,31.44,31.37,29.7,30.9
205,Poland,25-29,94.6,92.3,88.8,88.1,89.1,90.3,91.1,91.7,...,92.0,88.3,90.2,89.3,93.6,100.2,98.8,97.6,95.8,91.9


In [953]:
#df_19_3.to_csv('../data/Cleaned/cleaned_fertility_per_1000_from_2000_oecd.csv',index=False)

In [954]:
#df_19_3.to_sql('fertility_per_1000_from_2000_oecd',engine, if_exists='replace', index=False)

In [955]:
df_20= pd.read_csv('../data/Raw/OECD/SF_2_4_Share_births_outside_marriage_1960.csv')
#(%)share_of_births_outside_of_marriage
df_20

Unnamed: 0,Country,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,Austria,130,126,120,116,113,112,114,115,120,...,404,415,414,417,421,422,420,413,406,412
1,Belgium,21,20,21,22,23,24,25,25,27,...,470,477,495,494,480,490,528,524,..,..
2,Czech Republic,49,46,45,47,48,50,53,53,54,...,418,434,450,467,478,486,490,485,482,485
3,Denmark,78,80,83,89,93,95,102,111,111,...,490,506,515,525,538,540,542,542,541,542
4,Finland,40,41,40,42,44,46,48,51,53,...,409,415,421,428,443,449,448,446,454,461
5,Germany,76,71,66,61,59,58,57,58,61,...,339,345,348,350,350,355,347,339,333,331
6,Greece,12,12,12,12,11,11,10,10,11,...,74,76,70,82,88,94,103,111,124,138
7,Hungary,55,55,54,53,52,52,51,50,50,...,423,445,456,473,479,467,447,439,387,304
8,Iceland,253,253,245,251,267,269,284,300,305,...,650,669,..,..,..,696,712,705,694,..
9,Ireland,16,16,18,18,20,22,23,25,26,...,339,351,353,363,366,367,376,379,384,..


In [956]:
df_info = pd.DataFrame({
    'dtype': df_20.dtypes,
    'null_count': df_20.isnull().sum(),
    'unique_count': df_20.nunique()
})
print(df_info)

          dtype  null_count  unique_count
Country  object           0            26
1960     object           0            26
1961     object           0            24
1962     object           0            24
1963     object           0            24
...         ...         ...           ...
2016     object           0            24
2017     object           0            26
2018     object           0            25
2019     object           0            25
2020     object           0            24

[62 rows x 3 columns]


In [957]:
df_20.columns = df_20.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [958]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_20.columns if c != "country"]

df_20[num_cols] = (
    df_20[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [959]:
df_20.drop_duplicates(inplace=True)
df_20.dropna(inplace=True)

df_info = pd.DataFrame({
    'dtype': df_20.dtypes,
    'null_count': df_20.isnull().sum(),
    'unique_count': df_20.nunique()
})
print(df_info)

           dtype  null_count  unique_count
country   object           0            22
1960     float64           0            22
1961     float64           0            20
1962     float64           0            21
1963     float64           0            21
...          ...         ...           ...
2016     float64           0            20
2017     float64           0            22
2018     float64           0            21
2019     float64           0            22
2020     float64           0            22

[62 rows x 3 columns]


In [960]:
df_20.sample(10)

Unnamed: 0,country,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
7,Hungary,5.5,5.5,5.4,5.3,5.2,5.2,5.1,5.0,5.0,...,42.3,44.5,45.6,47.3,47.9,46.7,44.7,43.9,38.7,30.4
6,Greece,1.2,1.2,1.2,1.2,1.1,1.1,1.0,1.0,1.1,...,7.4,7.6,7.0,8.2,8.8,9.4,10.3,11.1,12.4,13.8
20,Sweden,11.3,11.7,12.4,12.6,13.1,13.8,14.5,15.1,16.0,...,54.3,54.5,54.4,54.6,54.7,54.9,54.5,54.5,54.5,55.2
4,Finland,4.0,4.1,4.0,4.2,4.4,4.6,4.8,5.1,5.3,...,40.9,41.5,42.1,42.8,44.3,44.9,44.8,44.6,45.4,46.1
17,Slovak Republic,4.7,4.4,4.6,4.7,5.0,5.3,5.3,5.7,5.9,...,34.0,35.4,37.0,38.9,39.2,40.2,40.1,40.0,40.1,40.7
3,Denmark,7.8,8.0,8.3,8.9,9.3,9.5,10.2,11.1,11.1,...,49.0,50.6,51.5,52.5,53.8,54.0,54.2,54.2,54.1,54.2
25,Croatia,7.4,7.1,6.5,6.9,6.8,6.0,5.7,5.4,5.5,...,14.0,15.4,16.1,17.4,18.1,18.9,19.9,20.7,21.5,22.8
14,New Zealand,4.6,5.1,8.0,8.8,9.9,10.9,11.6,12.7,13.0,...,47.6,47.7,47.3,46.7,46.7,45.9,46.9,48.2,47.6,48.3
13,Netherlands,1.4,1.4,1.5,1.6,1.7,1.8,2.0,2.1,2.0,...,45.3,46.6,47.4,48.7,49.8,50.4,51.0,51.9,52.4,53.5
12,Luxembourg,3.2,3.4,3.1,3.1,3.2,3.7,3.2,3.5,3.2,...,34.1,37.1,37.8,39.1,38.8,40.7,40.8,39.5,40.4,41.6


In [961]:
#df_20.to_csv('../data/Cleaned/cleaned_share_of_births_outside_of_marriage_oecd.csv', index=False)

In [962]:
#df_20.to_sql('share_of_births_outside_of_marriage_oecd',engine, if_exists='replace', index=False)

In [963]:
df_21_1= pd.read_csv('../data/Raw/OECD/SF_3_1_Marriage_divorce_rate_mean_age_first_marriage_S1.csv')
#mean_age_first_marriage
df_21_1

Unnamed: 0,Country,Gender,1990,1991,1992,1993,1994,1995,1996,1997,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,Australia,Male,265,267,269,270,272,273,276,278,...,297,298,299,300,301,303,304,307,307,306
1,Australia,Female,243,245,247,248,251,253,257,259,...,280,281,283,284,285,287,288,292,293,292
2,Czechia,Male,243,243,245,247,251,255,259,265,...,310,312,313,314,316,317,318,319,320,324
3,Czechia,Female,216,216,219,221,224,228,231,236,...,281,283,285,287,288,290,291,292,294,297
4,Denmark,Male,305,306,310,314,318,319,325,322,...,338,343,344,344,343,347,348,349,351,353
5,Denmark,Female,278,280,283,288,292,292,299,301,...,314,318,319,319,319,322,324,325,328,330
6,Greece,Male,290,293,296,297,299,301,302,306,...,327,328,329,330,332,332,333,334,337,338
7,Greece,Female,249,252,255,255,258,260,263,266,...,294,295,297,299,301,301,303,303,307,307
8,Japan,Male,284,284,284,284,285,285,285,285,...,307,308,309,311,311,311,311,311,312,310
9,Japan,Female,259,259,260,261,262,263,264,266,...,290,292,293,294,294,294,294,294,296,294


In [964]:
df_info = pd.DataFrame({
    'datatypes': df_21_1.dtypes,
    'null_count': df_21_1.isnull().sum(),
    'unique_count': df_21_1.nunique()
})
print(df_info)

        datatypes  null_count  unique_count
Country    object           0            10
Gender     object           0             2
1990       object           0            17
1991       object           0            18
1992       object           0            18
1993       object           0            19
1994       object           0            16
1995       object           0            18
1996       object           0            19
1997       object           0            17
1998       object           0            14
1999       object           0            19
2000       object           0            18
2001       object           0            18
2002       object           0            19
2003       object           0            19
2004       object           0            16
2005       object           0            18
2006       object           0            18
2007       object           0            19
2008       object           0            18
2009       object           0   

In [965]:
df_21_1.columns = df_21_1.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [966]:
# --- Ensure "country" and "gender" are strings
df_21_1["country"] = df_21_1["country"].astype(str).str.strip().str.title()
df_21_1["gender"] = df_21_1["gender"].astype(str).str.strip()

# --- Identify numeric columns: everything except country and birth order ---
num_cols = [c for c in df_21_1.columns if c not in ["country", "gender"]]
# --- Robust cleaning -> convert to float ---
df_21_1[num_cols] = (
    df_21_1[num_cols].astype(str)
    .replace({"\xa0": "", "\u202f": "", ",": "."}, regex=True)
    .replace(r"[^\d\.\-]", "", regex=True)
    .replace(r"\.\.+", ".", regex=True)
    .replace(r"^\.$|^\s*$", np.nan, regex=True)
    .apply(pd.to_numeric, errors="coerce")
)
df_21_1[num_cols] = df_21_1[num_cols].round(2)

In [967]:
df_21_1.drop_duplicates(inplace=True)
df_21_1.dropna(inplace=True)

df_info = pd.DataFrame({
    'datatypes': df_21_1.dtypes,
    'null_count': df_21_1.isnull().sum(),
    'unique_count': df_21_1.nunique()
})
print(df_info)

        datatypes  null_count  unique_count
country    object           0             9
gender     object           0             2
1990      float64           0            15
1991      float64           0            16
1992      float64           0            16
1993      float64           0            17
1994      float64           0            15
1995      float64           0            16
1996      float64           0            17
1997      float64           0            15
1998      float64           0            13
1999      float64           0            17
2000      float64           0            16
2001      float64           0            16
2002      float64           0            17
2003      float64           0            17
2004      float64           0            15
2005      float64           0            17
2006      float64           0            17
2007      float64           0            17
2008      float64           0            16
2009      float64           0   

In [968]:
#df_21_1.to_csv('../data/Cleaned/cleaned_mean_age_first_marriage_oecd.csv',index=False)

In [969]:
#df_21_1.to_sql('mean_age_first_marriage_oecd', engine, if_exists='replace', index= False)

In [970]:
df_21_2 = pd.read_csv('../data/Raw/OECD/SF_3_1_Marriage_divorce_rates_S2.csv')
#divorce_rates_per_1000_oecd
df_21_2

Unnamed: 0,Country,1970,1971,1972,1973,1974,1975,1976,1977,1978,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,Austria,14,13,13,13,14,14,15,15,16,...,19,19,19,18,18,18.0,18.0,17,16,15
1,Belgium,07,7,8,9,10,11,13,13,14,...,22,22,22,21,20,20.0,20.0,18,19,17
2,Czechia,22,24,23,25,25,26,25,25,26,...,27,25,25,24,24,23.0,23.0,20,20,19
3,Denmark,19,27,26,25,26,26,26,26,26,...,34,34,29,30,26,26.0,18.0,27,22,21
4,Estonia,32,32,33,32,33,34,36,39,38,...,25,24,26,25,25,24.0,21.0,19,,19
5,Finland,13,16,18,19,21,20,21,21,22,...,25,25,25,25,24,24.0,24.0,24,22,20
6,Germany,13,14,15,16,18,19,20,15,10,...,21,21,20,20,19,18.0,18.0,17,17,16
7,Greece,04,4,4,5,4,4,4,5,5,...,15,13,14,10,18,,,,,
8,Hungary,22,23,23,24,23,25,26,26,27,...,20,20,21,20,19,17.0,18.0,15,19,18
9,Italy,..,3,6,3,3,2,2,2,2,...,9,9,14,16,15,15.0,14.0,11,14,14


In [971]:
df_info = pd.DataFrame({
    'datatypes': df_21_2.dtypes,
    'null_count': df_21_2.isnull().sum(),
    'unique_count': df_21_2.nunique()
})
print(df_info)

        datatypes  null_count  unique_count
Country    object           0            28
1970       object           0            18
1971       object           0            19
1972       object           0            19
1973       object           0            18
1974       object           0            18
1975       object           0            19
1976       object           0            18
1977       object           0            18
1978       object           0            18
1979       object           0            15
1980       object           0            18
1981       object           0            20
1982       object           0            22
1983       object           0            24
1984       object           0            20
1985       object           0            19
1986       object           0            20
1987       object           0            20
1988       object           0            20
1989       object           0            19
1990       object           0   

In [972]:
df_21_2.columns = df_21_2.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [973]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_21_2.columns if c != "country"]

df_21_2[num_cols] = (
    df_21_2[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [974]:
df_21_2.drop_duplicates(inplace=True)
df_21_2.dropna(inplace=True)

df_info = pd.DataFrame({
    'datatypes': df_21_2.dtypes,
    'null_count': df_21_2.isnull().sum(),
    'unique_count': df_21_2.nunique()
})
print(df_info)

        datatypes  null_count  unique_count
country    object           0            23
1970      float64           0            15
1971      float64           0            17
1972      float64           0            15
1973      float64           0            14
1974      float64           0            15
1975      float64           0            16
1976      float64           0            14
1977      float64           0            13
1978      float64           0            15
1979      float64           0            12
1980      float64           0            14
1981      float64           0            17
1982      float64           0            17
1983      float64           0            19
1984      float64           0            16
1985      float64           0            15
1986      float64           0            16
1987      float64           0            16
1988      float64           0            15
1989      float64           0            15
1990      float64           0   

In [975]:
df_21_2.head(8)

Unnamed: 0,country,1970,1971,1972,1973,1974,1975,1976,1977,1978,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,Austria,1.4,1.3,1.3,1.3,1.4,1.4,1.5,1.5,1.6,...,1.9,1.9,1.9,1.8,1.8,1.8,1.8,1.7,1.6,1.5
1,Belgium,0.7,0.7,0.8,0.9,1.0,1.1,1.3,1.3,1.4,...,2.2,2.2,2.2,2.1,2.0,2.0,2.0,1.8,1.9,1.7
2,Czechia,2.2,2.4,2.3,2.5,2.5,2.6,2.5,2.5,2.6,...,2.7,2.5,2.5,2.4,2.4,2.3,2.3,2.0,2.0,1.9
3,Denmark,1.9,2.7,2.6,2.5,2.6,2.6,2.6,2.6,2.6,...,3.4,3.4,2.9,3.0,2.6,2.6,1.8,2.7,2.2,2.1
5,Finland,1.3,1.6,1.8,1.9,2.1,2.0,2.1,2.1,2.2,...,2.5,2.5,2.5,2.5,2.4,2.4,2.4,2.4,2.2,2.0
6,Germany,1.3,1.4,1.5,1.6,1.8,1.9,2.0,1.5,1.0,...,2.1,2.1,2.0,2.0,1.9,1.8,1.8,1.7,1.7,1.6
8,Hungary,2.2,2.3,2.3,2.4,2.3,2.5,2.6,2.6,2.7,...,2.0,2.0,2.1,2.0,1.9,1.7,1.8,1.5,1.9,1.8
10,Japan,0.9,1.0,1.0,1.0,1.0,1.1,1.1,1.1,1.2,...,1.8,1.8,1.8,1.7,1.7,1.7,1.7,1.6,1.47,1.52


In [976]:
#df_21_2.to_csv('../data/Cleaned/cleaned_divorce_rates_per_1000_oecd.csv', index=False)

In [977]:
#df_21_2.to_sql('divorce_rates_per_1000_oecd',engine, if_exists= 'replace' , index=False)

In [978]:
df_21_3= pd.read_csv('../data/Raw/OECD/SF_3_1_Marriage_divorce_rates_prev_marital_status_S3.csv')
#share_of_previous_marital_status
df_21_3

Unnamed: 0,Country,Previous marital status,2000,2001,2002,2003,2004,2005,2006,2008,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Australia,Single never married,759,761,755,756,762,769,773,782,...,796,797,800,805,805,801,803,801,803,807
1,Australia,Divorced,220,218,224,223,218,213,209,202,...,190,188,186,182,181,185,183,185,183,180
2,Australia,Widowed,21,21,21,21,19,18,18,17,...,15,15,14,13,14,14,14,14,13,13
3,Austria,Single never married,766,747,741,737,729,731,739,748,...,755,757,767,771,775,777,781,781,782,780
4,Austria,Divorced,222,242,247,252,259,257,249,242,...,235,234,223,220,215,215,209,210,210,216
5,Austria,Widowed,12,11,12,11,12,12,11,10,...,10,9,10,9,10,8,9,9,8,4
6,Czechia,Single never married,749,745,743,740,739,742,745,726,...,740,740,752,756,766,767,764,764,761,759
7,Czechia,Divorced,237,242,244,247,247,245,244,261,...,249,249,238,234,224,223,226,226,229,230
8,Czechia,Widowed,14,13,13,13,14,12,11,13,...,12,11,10,10,10,10,10,10,10,11
9,Denmark,Single never married,759,760,762,764,760,756,756,763,...,772,760,750,762,761,769,764,771,776,783


In [979]:
df_info = pd.DataFrame({
    'datatypes': df_21_3.dtypes,
    'null_count': df_21_3.isnull().sum(),
    'unique_count': df_21_3.nunique()
})
print(df_info)

                        datatypes  null_count  unique_count
Country                    object           0            20
Previous marital status    object           0             3
2000                       object           0            47
2001                       object           0            51
2002                       object           0            56
2003                       object           0            50
2004                       object           0            50
2005                       object           0            52
2006                       object           0            49
2008                       object           0            47
2009                       object           0            50
2010                       object           0            49
2011                       object           0            49
2012                       object           0            53
2013                       object           0            49
2014                       object       

In [980]:
df_21_3.columns = df_21_3.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_21_3.head()

Unnamed: 0,country,previous_marital_status,2000,2001,2002,2003,2004,2005,2006,2008,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Australia,Single never married,759,761,755,756,762,769,773,782,...,796,797,800,805,805,801,803,801,803,807
1,Australia,Divorced,220,218,224,223,218,213,209,202,...,190,188,186,182,181,185,183,185,183,180
2,Australia,Widowed,21,21,21,21,19,18,18,17,...,15,15,14,13,14,14,14,14,13,13
3,Austria,Single never married,766,747,741,737,729,731,739,748,...,755,757,767,771,775,777,781,781,782,780
4,Austria,Divorced,222,242,247,252,259,257,249,242,...,235,234,223,220,215,215,209,210,210,216


In [981]:
# --- Ensure "country" and "previous_marital_status" are strings
df_21_3["country"] = df_21_3["country"].astype(str).str.strip().str.title()
df_21_3["previous_marital_status"] = df_21_3["previous_marital_status"].astype(str).str.strip()

# --- Identify numeric columns: everything except country and birth order ---
num_cols = [c for c in df_21_3.columns if c not in ["country", "previous_marital_status"]]
# --- Robust cleaning -> convert to float ---
df_21_3[num_cols] = (
    df_21_3[num_cols].astype(str)
    .replace({"\xa0": "", "\u202f": "", ",": "."}, regex=True)
    .replace(r"[^\d\.\-]", "", regex=True)
    .replace(r"\.\.+", ".", regex=True)
    .replace(r"^\.$|^\s*$", np.nan, regex=True)
    .apply(pd.to_numeric, errors="coerce")
)
df_21_3[num_cols] = df_21_3[num_cols].round(2)

In [982]:
df_21_3.drop_duplicates(inplace=True)
df_21_3.dropna(inplace=True)

df_info = pd.DataFrame({
    'datatypes': df_21_3.dtypes,
    'null_count': df_21_3.isnull().sum(),
    'unique_count': df_21_3.nunique()
})
print(df_info)

                        datatypes  null_count  unique_count
country                    object           0            20
previous_marital_status    object           0             3
2000                      float64           0            47
2001                      float64           0            51
2002                      float64           0            56
2003                      float64           0            50
2004                      float64           0            50
2005                      float64           0            52
2006                      float64           0            49
2008                      float64           0            47
2009                      float64           0            50
2010                      float64           0            49
2011                      float64           0            49
2012                      float64           0            53
2013                      float64           0            49
2014                      float64       

In [None]:
#df_21_3.to_csv('../data/Cleaned/cleaned_share_of_previous_marital_status_oecd.csv', index=False)

In [984]:
#df_21_3.to_sql('share_of_previous_marital_status_oecd', engine, if_exists= 'replace', index =  False)

In [1002]:
df_22_1 = pd.read_csv('../data/Raw/OECD/SF3_3_A_in_private_households_by_partnership_status_S1.csv')
df_22_1

Unnamed: 0,Country,20+_All_Total_Living_with_a_partner(%),20+_All_Married or in a civil or registered partnership_living_with_a_partner(%),20+_All_Cohabiting_living_with_a_partner(%),20+_All_Not living with a partner(%),20/34_Total_living_with_a_partner(%),20/34_Married or in a civil or registered partnership_living_with_a_partner(%),20/34_Cohabiting_living_with_a_partner(%),Not living with a partner_Total(%),Living with at least one parent(%)
0,Australia (c),6379,5359,1020,3621,4706,2941,1765,5294,..
1,Austria,5880,4910,970,4120,3911,2215,1697,6089,3382
2,Belgium,6215,5351,864,3785,4528,2933,1594,5472,3134
3,Canada (d),6689,5446,1243,3311,5534,3355,2179,4466,..
4,Czech Republic,5117,4539,579,4883,3078,2132,946,6922,3620
5,Denmark,6415,5002,1412,3585,5054,2186,2868,4946,1067
6,Estonia,5393,3730,1664,4607,4531,1781,2750,5469,2646
7,France,6414,4941,1472,3586,5042,2189,2853,4958,2208
8,Germany,6261,5391,869,3739,3953,2215,1739,5974,2754
9,Greece,6023,5852,171,3977,3313,2924,390,6687,4543


In [995]:
df_info = pd.DataFrame({
    'datatypes': df_22_1.dtypes,
    'null_count': df_22_1.isnull().sum(),
    'unique_count': df_22_1.nunique()
})
print(df_info)

                                                   datatypes  null_count  \
Country                                               object           0   
20+_All_Total_Living_with_a_partner(%)                object           0   
20+_All_Married or in a civil or registered par...    object           0   
20+_All_Cohabiting_living_with_a_partner(%)           object           0   
20+_All_Not living with a partner(%)                  object           0   
20/34_Total_living_with_a_partner(%)                  object           0   
20/34_Married or in a civil or registered partn...    object           0   
20/34_Cohabiting_living_with_a_partner(%)             object           0   
Not living with a partner_Total(%)                    object           0   
Living with at least one parent(%)                    object           0   

                                                    unique_count  
Country                                                       37  
20+_All_Total_Living_with_a_p

In [1003]:
df_22_1.columns = df_22_1.columns.str.lower() \
                .str.replace(' ', '_') \


df_22_1.head()

Unnamed: 0,country,20+_all_total_living_with_a_partner(%),20+_all_married_or_in_a_civil_or_registered_partnership_living_with_a_partner(%),20+_all_cohabiting_living_with_a_partner(%),20+_all_not_living_with_a_partner(%),20/34_total_living_with_a_partner(%),20/34_married_or_in_a_civil_or_registered_partnership_living_with_a_partner(%),20/34_cohabiting_living_with_a_partner(%),not_living_with_a_partner_total(%),living_with_at_least_one_parent(%)
0,Australia (c),6379,5359,1020,3621,4706,2941,1765,5294,..
1,Austria,5880,4910,970,4120,3911,2215,1697,6089,3382
2,Belgium,6215,5351,864,3785,4528,2933,1594,5472,3134
3,Canada (d),6689,5446,1243,3311,5534,3355,2179,4466,..
4,Czech Republic,5117,4539,579,4883,3078,2132,946,6922,3620


In [None]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_22_1.columns if c != "country"]

df_22_1[num_cols] = (
    df_22_1[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")

SyntaxError: incomplete input (2079695211.py, line 10)

In [None]:
df_22_1

In [986]:
df_22_2 = pd.read_csv('../data/Raw/OECD/SF3_3_B_ by level of educational attainment_S2.csv')
df_22_2

Unnamed: 0,Country,Low_Education_Total_living_with_a_partner(%),Low_educationMarried or in a civil or registered partnership_living_with_a_partner(%),Low_education_Cohabiting_living_with_a_partner(%),Not living with a partner_Low_education(%),Medium education_Total_Living with a partner(%),Medium education_Married or in a civil or registered partnership_Living with a partner(%),Medium education_Cohabiting_Living with a partner(%),Not living with a partner_Medium education(%),High education_Total_Living with a partner(%),High education_Married or in a civil or registered partnership_Living with a partner(%),High education_Cohabiting_Living with a partner(%),Not living with a partner_High education(%)
0,Austria,5681,5049,632,4319,5927,4873,1054,,6003,4838,1165,3997
1,Belgium,6228,5611,617,3772,6079,4980,1099,,6709,5658,1051,3291
2,Czech Republic,4081,3655,426,5919,5399,4787,612,4601.0,5729,5026,703,4271
3,Estonia,4217,2639,1578,5783,5441,3661,1779,4559.0,6014,4445,1569,3986
4,France,6112,5193,918,3888,6568,4917,1651,3432.0,6558,4660,1898,3442
5,Germany,5446,4879,567,4554,6238,5313,925,3762.0,6889,5916,974,3111
6,Greece,6381,6288,93,3619,5700,5488,212,4300.0,5833,5570,263,4167
7,Hungary,5033,4038,995,4967,5794,4678,1115,4206.0,5956,5102,855,4044
8,Iceland,5186,4102,1084,4814,5831,4657,1174,4169.0,6972,5453,1519,3028
9,Latvia,3627,2592,1035,6373,4932,3954,978,5068.0,5291,4539,752,4709


In [987]:
df_6666 = pd.read_csv('../data/Raw/OECD/OECD_df_famliy_selected.csv')
df_6666

Unnamed: 0,STRUCTURE,STRUCTURE_ID,STRUCTURE_NAME,ACTION,COU,Country,SEX,Sex,IND,Indicator,...,OBS_VALUE,Observation Value,OBS_STATUS,Observation Status,UNIT_MEASURE,Unit of Measures,UNIT_MULT,Multiplier,BASE_PER,Base reference period
0,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,LVA,Latvia,TOTAL,Total,FAM3,Share of births outside of marriage (% of all ...,...,39.5,,A,,PC,Percentage,0,Units,,
1,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,GRC,Greece,TOTAL,Total,FAM3,Share of births outside of marriage (% of all ...,...,11.1,,A,,PC,Percentage,0,Units,,
2,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,CHL,Chile,TOTAL,Total,FAM3,Share of births outside of marriage (% of all ...,...,74.8,,A,,PC,Percentage,0,Units,,
3,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,NLD,Netherlands,TOTAL,Total,FAM3,Share of births outside of marriage (% of all ...,...,51.9,,A,,PC,Percentage,0,Units,,
4,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,LTU,Lithuania,TOTAL,Total,FAM3,Share of births outside of marriage (% of all ...,...,26.4,,A,,PC,Percentage,0,Units,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
500,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,COL,Colombia,TOTAL,Total,FAM2,Mean age of women at childbirth,...,30.4,,A,,YR,Years,0,Units,,
501,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,OAVG,OECD - Average,TOTAL,Total,FAM2,Mean age of women at childbirth,...,30.5,,A,,YR,Years,0,Units,,
502,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,OAVG,OECD - Average,TOTAL,Total,FAM2,Mean age of women at childbirth,...,30.6,,A,,YR,Years,0,Units,,
503,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,OAVG,OECD - Average,TOTAL,Total,FAM2,Mean age of women at childbirth,...,30.7,,A,,YR,Years,0,Units,,


In [988]:
df_888= pd.read_csv('../data/Raw/OECD/Households-by-type,-presence-of-children-and-country,-2015-2024.csv')
df_888

Unnamed: 0,Category,Single adult with children,Single adult without children,Couple with children,Couple without children,Other type of household with children,Other type of household without children
0,2015,6147.3,64181.3,31679.8,46641.6,11698.9,30771.6
1,2016,6148.5,63891.1,31907.3,47308.2,11766.3,30559.5
2,2017,6108.5,65353.9,32091.5,47426.1,11530.2,30297.5
3,2018,6163.6,66165.5,31720.2,48194.8,11342.5,30224.0
4,2019,6246.4,67417.9,31710.1,48503.6,11285.7,30134.8
5,2020,6136.4,67412.9,31622.2,48831.2,11212.9,30445.2
6,2021,5691.9,70200.4,30558.3,47447.4,11611.8,30700.7
7,2022,5984.9,72134.3,30469.3,47995.5,11513.6,30412.1
8,2023,5924.8,73396.2,30313.0,48477.5,11443.5,30608.8
9,2024,6077.7,75049.7,30286.5,49058.4,11311.9,30487.3


In [989]:
df_999 = pd.read_csv('../data/Raw/OECD/Households-with-children-by-number-of-children,-2024.csv')
df_999

Unnamed: 0,Category,1 child,2 children,3 children or more
0,European Union,11.7,8.9,3.0
1,,,,
2,Slovakia,17.1,14.5,4.0
3,Ireland,12.4,12.2,6.4
4,Cyprus,13.9,11.7,3.1
5,Czechia,13.9,11.6,2.6
6,Romania,14.3,9.2,4.0
7,Luxembourg,12.5,12.1,2.4
8,Belgium,11.8,10.2,4.1
9,Croatia,12.0,10.1,3.8
