In [502]:
import pandas as pd
import os, re
from pathlib import Path
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from sqlalchemy import create_engine, types
from sqlalchemy import text 
from openpyxl import load_workbook

In [503]:
from dotenv import dotenv_values

config = dotenv_values()

# define variables for the login
pg_user = config['POSTGRES_USER']  # align the key label with your .env file !
pg_host = config['POSTGRES_HOST']
pg_port = config['POSTGRES_PORT']
pg_db = config['POSTGRES_DB']
pg_schema = config['POSTGRES_SCHEMA']
pg_pass = config['POSTGRES_PASS']

In [504]:
url = f'postgresql://{pg_user}:{pg_pass}@{pg_host}:{pg_port}/{pg_db}'
engine = create_engine(url, echo=False)

In [505]:
my_schema = 'team_5' # update it to your schema

with engine.begin() as conn: 
    result = conn.execute(text(f'SET search_path TO {my_schema};'))

In [506]:
df_1= pd.read_csv('../data/Raw/World_Marriage_Dataset.csv')

In [507]:
df_1.drop(columns=["Sr.No."], inplace=True)

In [508]:
df_1.columns = df_1.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [509]:
df_1.rename(columns={
    "agegroup": "age_group",
    "maritalstatus": "marital_status",
    "dataprocess": "data_process",
    "Data Collection (Start Year)": "data_collection_start_year",
    "Data Collection (End Year)": "data_collection_end_year",
    "Data Source": "data_source",
    "Country": "country",
    "Sex": "sex"
}, inplace=True)

In [510]:
df_1.drop_duplicates(inplace=True)

df_1['data_collection_start_year'] = df_1['data_collection_start_year'].astype(str).str.replace(',', '').astype(int)
df_1['data_collection_end_year'] = df_1['data_collection_end_year'].astype(str).str.replace(',', '').astype(int)

In [511]:
df_1.isnull().sum()

country                       0
age_group                     0
sex                           0
marital_status                0
data_process                  0
data_collection_start_year    0
data_collection_end_year      0
data_source                   0
dtype: int64

In [512]:
#df_1.to_csv("cleaned_world_marriage.csv", index=False)

In [513]:
#df_1.to_sql('world_marriage', engine, if_exists='replace', index=False)

In [514]:
df_2 = pd.read_csv('../data/Raw/age-at-marriage-women.csv')

In [515]:
df_2.columns = df_2.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [516]:
df_2 = df_2.drop(columns=['1005564annotations'])

df_2.rename(columns={
    "entity": "country",   
}, inplace=True)

In [517]:
df_2.drop_duplicates(inplace=True)


In [518]:
df_2['year'] = df_2['year'].astype(str).str.replace(',', '').astype(int)

In [519]:
df_2.isnull().sum()

country                                0
code                                   0
year                                   0
mean_age_of_women_at_first_marriage    0
dtype: int64

In [520]:
#df_2.to_csv("cleaned_age_at_marriage_women.csv", index=False)

In [521]:
#df_2.to_sql('age_at_marriage_women', engine, if_exists='replace', index=False)

In [522]:
df_3= pd.read_csv('../data/Raw/marriage-rate-per-1000-inhabitants.csv')

In [523]:
df_3.columns = df_3.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [524]:
df_3.rename(columns={
    "entity": "country",   
}, inplace=True)

In [525]:
df_3['year'] = df_3['year'].astype(str).str.replace(',', '').astype(int)

In [526]:
df_3.drop_duplicates(inplace=True)


In [527]:
df_3.isnull().sum()

country                                          0
code                                             0
year                                             0
crude_marriage_rate_marriages_per_1000_people    0
dtype: int64

In [528]:
#df_3.to_csv("cleaned_marriage-rate-per-1000-inhabitants.csv", index=False)

In [529]:
#df_3.to_sql('married_rate_per_1000', engine, if_exists='replace', index=False)

In [530]:
df_4= pd.read_csv('../data/Raw/marriage-rates-in-1990-vs-2020.csv')

In [531]:
df_4.columns = df_4.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [532]:
df_4 = df_4.drop(columns=['worldregionsaccordingtoowid'])

df_4.rename(columns={
    "crudemarriageratemarriagesper1000people": "crude_marriage_rate",
    "crudemarriageratemarriagesper1000people1": "crude_marriage_rate_people1",
    "year1": "year_1",
    "entity": "country"
}, inplace=True)

In [533]:
df_4.drop_duplicates(inplace=True)
df_4.dropna(inplace=True)

In [534]:
df_4['year_1'] = pd.to_numeric(df_4['year_1'], errors='coerce').astype('Int64')

In [535]:
df_4.isnull().sum()

country                        0
code                           0
year                           0
crude_marriage_rate            0
crude_marriage_rate_people1    0
year_1                         0
dtype: int64

In [536]:
#df_4.to_csv("cleaned_marriage-rates-in-1990-vs-2020.csv", index=False)

In [537]:
#df_4.to_sql('marriage_rates_in_1990_vs_2020', engine, if_exists='replace', index=False)

In [538]:
df_5 = pd.read_csv('../data/Raw/share-of-births-outside-marriage.csv')

In [539]:
df_5.columns = df_5.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [540]:

df_5.rename(columns={
    "shareofbirthsoutsideofmarriageofallbirths": "share_of_births_outside_of_marriage",
    "entity": "country"
}, inplace=True)

df_5.drop_duplicates(inplace=True)

In [541]:
df_5.isnull().sum()

country                                0
code                                   0
year                                   0
share_of_births_outside_of_marriage    0
dtype: int64

In [542]:
#df_5.to_csv("cleaned_share-of-births-outside-marriage.csv", index=False)

In [543]:
#df_5.to_sql('share_of_births_outside_marriage', engine, if_exists='replace', index=False)

In [544]:
df_6 = pd.read_csv('../data/Raw/share-of-men-in-england-and-wales-who-have-ever-married-by-age.csv')

In [545]:
df_6.columns = df_6.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

df_6.drop_duplicates(inplace=True)
df_6.sample(5)

Unnamed: 0,entity,code,year,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort
1,Men,,18,0.1,0.1,0.4,0.6,0.1,0.0,0.0,0.0
43,Women,,26,55.1,68.3,84.2,65.2,40.1,21.2,12.9,
27,Men,,44,91.5,91.1,91.8,80.0,67.5,,,
0,Men,,17,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0
56,Women,,39,81.8,90.2,94.8,85.0,71.3,60.4,,


In [546]:
df_6 = df_6.drop(columns=['code','proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort'])

df_6.rename(columns={
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort": "1900_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort": "1920_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort": "1940_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort": "1960_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort": "1970_birthcohort",
    "entity": "sex"
}, inplace=True)

In [547]:
df_6.isnull().sum()

sex                 0
year                0
1900_birthcohort    0
1920_birthcohort    0
1940_birthcohort    0
1960_birthcohort    0
1970_birthcohort    0
dtype: int64

In [548]:
#df_6.to_csv("cleaned_share-of-men-in-england-and-wales-who-have-ever-married-by-age.csv", index=False)

In [549]:
#df_6.to_sql('men_in_england_and_wales_married_by_age', engine, if_exists='replace', index=False)

In [550]:
df_7 = pd.read_csv('../data/Raw/share-of-births-outside-marriage.csv')

In [551]:
df_7.columns = df_7.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [552]:
df_7.rename(columns={
    "shareofsingleparenthouseholds": "share_of_single_parent_households",
    "entity": "country"
}, inplace=True)

df_7.drop_duplicates(inplace=True)
df_7.sample(5)

Unnamed: 0,country,code,year,shareofbirthsoutsideofmarriageofallbirths
1674,Slovakia,SVK,1974,5.3
1380,Netherlands,NLD,1988,10.2
410,Cyprus,CYP,1986,0.5
1333,Mexico,MEX,2002,45.7
677,France,FRA,2017,59.9


In [553]:
df_7.isnull().sum()

country                                      0
code                                         0
year                                         0
shareofbirthsoutsideofmarriageofallbirths    0
dtype: int64

In [554]:
#df_7.to_csv("cleaned_share-of-single-parent-households.csv", index=False)

In [555]:
#df_7.to_sql('single_parent_households', engine, if_exists='replace', index=False)

In [556]:
df_8 = pd.read_csv('../data/Raw/share-of-women-in-england-and-wales-who-have-ever-married-by-age.csv')

In [557]:
df_8.columns = df_8.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [558]:
df_8['code'] = df_8['code'].fillna('GBR')
df_8.sample(5)

Unnamed: 0,entity,code,year,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort
66,Women,GBR,49,85.2,91.9,95.7,87.3,76.0,,,
29,Men,GBR,46,92.1,91.4,92.1,80.8,68.8,,,
61,Women,GBR,44,84.0,91.3,95.4,86.5,74.1,,,
24,Men,GBR,41,90.2,90.3,91.4,78.6,65.0,54.4,,
10,Men,GBR,27,55.8,59.2,73.5,52.2,28.7,14.4,9.7,


In [559]:
df_8 = df_8.drop(columns=['code','proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort'])

df_8.rename(columns={
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort": "1900_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort": "1920_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort": "1940_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort": "1960_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort": "1970_birthcohort",
    "entity": "sex"
}, inplace=True)

df_8.drop_duplicates(inplace=True)
df_8.sample(5)

Unnamed: 0,sex,year,1900_birthcohort,1920_birthcohort,1940_birthcohort,1960_birthcohort,1970_birthcohort
39,Women,22,24.3,39.3,57.5,40.6,18.2
33,Men,50,92.9,91.9,92.4,81.7,70.8
2,Men,19,0.8,0.6,2.0,2.5,0.7
21,Men,38,88.1,89.0,90.5,76.6,61.3
4,Men,21,6.1,7.4,13.6,11.9,3.9


In [560]:
df_8.isnull().sum()

sex                 0
year                0
1900_birthcohort    0
1920_birthcohort    0
1940_birthcohort    0
1960_birthcohort    0
1970_birthcohort    0
dtype: int64

In [561]:
#df_8.to_csv("cleaned_share-of-women-in-england-and-wales-who-have-ever-married-by-age.csv", index=False)

In [562]:
#df_8.to_sql('women_in_england_and_wales_married_by_age', engine, if_exists='replace', index=False)

In [563]:
#pip install openpyxl pywin32

In [564]:
df_excel_1 = pd.read_excel('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx')

In [565]:
#all_sheets = pd.read_excel('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx', sheet_name=None)

In [566]:
xls_1 = pd.ExcelFile('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx')
print(xls_1.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'MARITAL_STATUS_BY_AGE', 'CURRENTLY MARRIED', 'EVER_MARRIED', 'SMAM']


In [567]:
excel_1 = '../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx'

# Output directory (make sure it exists)
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)

# List of sheets you want to extract
sheets_to_extract = ['MARITAL_STATUS_BY_AGE', 'CURRENTLY MARRIED', 'EVER_MARRIED', 'SMAM']

In [568]:
"""for sheet in sheets_to_extract:
    # Read just this sheet into a DataFrame
    df_excel_1 = pd.read_excel(excel_1, sheet_name=sheet)
    
    # Optional: Clean the filename (replace spaces with underscores, etc.)
    csv_name = sheet.replace(' ', '_').lower() + '.csv'
    csv_path = os.path.join(output_dir, csv_name)
    
    # Save the DataFrame as CSV
    df_excel_1.to_csv(csv_path, index=False)
    print(f"Saved: {csv_path}")
"""

'for sheet in sheets_to_extract:\n    # Read just this sheet into a DataFrame\n    df_excel_1 = pd.read_excel(excel_1, sheet_name=sheet)\n    \n    # Optional: Clean the filename (replace spaces with underscores, etc.)\n    csv_name = sheet.replace(\' \', \'_\').lower() + \'.csv\'\n    csv_path = os.path.join(output_dir, csv_name)\n    \n    # Save the DataFrame as CSV\n    df_excel_1.to_csv(csv_path, index=False)\n    print(f"Saved: {csv_path}")\n'

In [569]:
xls_2 = pd.ExcelFile('../data/Raw/undesa_pd_2019_world_fertility_dataset.xlsx')
print(xls_2.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'FERTILITY INDICATORS']


In [570]:
excel_2 = '../data/Raw/undesa_pd_2019_world_fertility_dataset.xlsx'
sheet_name = 'FERTILITY INDICATORS'
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)

df_excel_2 = pd.read_excel(excel_2, sheet_name=sheet_name)


In [571]:
"""csv_name = sheet_name.replace(' ', '_').lower() + '.csv'
csv_path = os.path.join(output_dir, csv_name)
df_excel_2.to_csv(csv_path, index=False)
print(f"Saved: {csv_path}")
"""

'csv_name = sheet_name.replace(\' \', \'_\').lower() + \'.csv\'\ncsv_path = os.path.join(output_dir, csv_name)\ndf_excel_2.to_csv(csv_path, index=False)\nprint(f"Saved: {csv_path}")\n'

In [572]:
xls_3 = pd.ExcelFile('../data/Raw/undesa_pd_ds_1970-2030_fp_rev-2024_rev.xlsx')
print(xls_3.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'Countries', 'Regions']


In [573]:
excel_3 = '../data/Raw/undesa_pd_ds_1970-2030_fp_rev-2024_rev.xlsx'
sheets_to_extract = ['Countries', 'Regions']
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)


In [574]:
"""
for sheet in sheets_to_extract:
    df = pd.read_excel(excel_3, sheet_name=sheet)
    csv_name = sheet.replace(' ', '_').lower() + '.csv'
    csv_path = os.path.join(output_dir, csv_name)
    df.to_csv(csv_path, index=False)
    print(f"Saved: {csv_path}")

"""

'\nfor sheet in sheets_to_extract:\n    df = pd.read_excel(excel_3, sheet_name=sheet)\n    csv_name = sheet.replace(\' \', \'_\').lower() + \'.csv\'\n    csv_path = os.path.join(output_dir, csv_name)\n    df.to_csv(csv_path, index=False)\n    print(f"Saved: {csv_path}")\n\n'

In [575]:
df_9 = pd.read_csv('../data/Raw/unpopulation_dataportal_20250728095844.csv')
df_9.sample(5)

Unnamed: 0,IndicatorId,IndicatorName,IndicatorShortName,Source,SourceYear,Author,LocationId,Location,Iso2,Iso3,...,AgeStart,AgeEnd,Age,CategoryId,Category,EstimateTypeId,EstimateType,EstimateMethodId,EstimateMethod,Value
13520,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,458,Malaysia,MY,MYS,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,57.08
4229,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,148,Chad,TD,TCD,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,74.47
13695,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,466,Mali,ML,MLI,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,79.27
23842,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,826,United Kingdom,GB,GBR,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,50.99
2372,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,76,Brazil,BR,BRA,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,56.7


In [576]:
df_9.columns = df_9.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_9.sample(5)

Unnamed: 0,indicatorid,indicatorname,indicatorshortname,source,sourceyear,author,locationid,location,iso2,iso3,...,agestart,ageend,age,categoryid,category,estimatetypeid,estimatetype,estimatemethodid,estimatemethod,value
13264,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,450,Madagascar,MG,MDG,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,67.41
14076,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,478,Mauritania,MR,MRT,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,64.8
21106,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,728,South Sudan,SS,SSD,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,75.07
24087,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,850,United States Virgin Islands,VI,VIR,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,37.23
3401,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,116,Cambodia,KH,KHM,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,64.27


In [577]:
df_9 = df_9.drop(columns=['indicatorid','indicatorshortname',
    'source',
    'author', 'locationid', 'iso2','estimatetypeid','category','categoryid','agestart','ageend','author','ageid', 'estimatetype','variantid','sexid','timeid'])

df_9.rename(columns={
    "sourceyear": "year",
    "location": "country",
    "estimatemethodid": "estimate_method",
    "iso3": "code",
}, inplace=True)



In [578]:
df_9.drop_duplicates(inplace=True)

In [579]:
df_9

Unnamed: 0,indicatorname,year,country,code,time,variant,sex,age,estimate_method,estimatemethod,value
0,Currently married (Percent),2024,Afghanistan,AFG,1970,Median,Female,15-49,2,Interpolation,80.94
2,Currently married (Percent),2024,Afghanistan,AFG,1971,Median,Female,15-49,2,Interpolation,80.90
4,Currently married (Percent),2024,Afghanistan,AFG,1972,Median,Female,15-49,2,Interpolation,80.87
6,Currently married (Percent),2024,Afghanistan,AFG,1973,Median,Female,15-49,2,Interpolation,80.84
8,Currently married (Percent),2024,Afghanistan,AFG,1974,Median,Female,15-49,2,Interpolation,80.53
...,...,...,...,...,...,...,...,...,...,...,...
25078,Currently married (Percent),2024,Zambia,ZMB,2021,Median,Female,15-49,3,Projection,54.31
25080,Currently married (Percent),2024,Zambia,ZMB,2022,Median,Female,15-49,3,Projection,53.82
25082,Currently married (Percent),2024,Zambia,ZMB,2023,Median,Female,15-49,3,Projection,53.35
25084,Currently married (Percent),2024,Zambia,ZMB,2024,Median,Female,15-49,3,Projection,52.91


In [580]:
df_9.isnull().sum()

indicatorname      0
year               0
country            0
code               0
time               0
variant            0
sex                0
age                0
estimate_method    0
estimatemethod     0
value              0
dtype: int64

In [581]:
#df_9.to_csv("cleaned_unpopulation_dataportal.csv", index=False)

In [582]:
#df_9.to_sql('unpopulation_dataportal', engine, if_exists='replace', index=False)

In [583]:
df_10 = pd.read_csv('../data/processed/countries_un.csv',  header=5, low_memory=False)

In [584]:
df_10.columns = (
    df_10.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
)
df_10.sample(10)

Unnamed: 0,countryorarea,isocode,indicator,year,agegroup,percentage,number,dataprocess
111652,Saint-Pierre-et-Miquelon,666,Married or in-union women,1994,35-39,80.267368,0.19826,Estimate
84666,Republic of Moldova,498,Married or in-union women,2023,25-29,81.130559,71.242772,Projection
133551,Turkmenistan,795,Married or in-union women,1977,15-49,62.513004,380.4457,Estimate
100654,Papua New Guinea,598,Married or in-union women,1996,45-49,86.546154,70.456791,Estimate
21361,Canada,124,Married or in-union women,2048,20-24,18.089865,220.63141,Projection
142185,Uzbekistan,860,Married or in-union women,2004,20-24,73.038462,890.601419,Estimate
74302,Libya,434,Married or in-union women,2023,45-49,65.264171,166.713083,Projection
125825,Sweden,752,Married or in-union women,1984,20-24,37.272927,102.202179,Estimate
77337,Madagascar,450,Married or in-union women,1998,20-24,70.424242,498.681455,Estimate
6520,Bahamas,44,Married or in-union women,1975,15-19,8.723434,0.869901,Estimate


In [585]:
df_10.rename(columns={
    "dataprocess": "data_process",
}, inplace=True)

df_10.drop_duplicates(inplace=True)
df_10.sample(5)

Unnamed: 0,countryorarea,isocode,indicator,year,agegroup,percentage,number,data_process
82587,Mauritius,480,Married or in-union women,2006,30-34,79.356364,38.010111,Estimate
99621,Pakistan,586,Married or in-union women,2029,40-44,88.764875,6484.431241,Projection
11993,Bolivia (Plurinational State of),68,Married or in-union women,2011,20-24,42.598882,207.604801,Estimate
29994,Cook Islands,184,Married or in-union women,1993,25-29,60.859713,0.424496,Estimate
37134,El Salvador,222,Married or in-union women,1994,45-49,67.098653,75.943262,Estimate


In [586]:
for col in ['percentage', 'number']:
    if col in df_10.columns:
        df_10[col] = (
            df_10[col]
            .astype(str)
            .str.replace(',', '.', regex=False)
            .str.extract(r'([-+]?[0-9]*\.?[0-9]+)', expand=False)
            .astype(float)
            .round(2)
        )

In [587]:
unnamed_cols = [col for col in df_10.columns if 'unnamed' in col.lower()]
df_10.drop(columns=unnamed_cols, inplace=True)

In [588]:
df_10.dropna(inplace=True)

In [589]:
df_10.isnull().sum()

countryorarea    0
isocode          0
indicator        0
year             0
agegroup         0
percentage       0
number           0
data_process     0
dtype: int64

In [590]:
df_10.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145800 entries, 0 to 145799
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   countryorarea  145800 non-null  object 
 1   isocode        145800 non-null  int64  
 2   indicator      145800 non-null  object 
 3   year           145800 non-null  int64  
 4   agegroup       145800 non-null  object 
 5   percentage     145800 non-null  float64
 6   number         145800 non-null  float64
 7   data_process   145800 non-null  object 
dtypes: float64(2), int64(2), object(4)
memory usage: 8.9+ MB


In [591]:
#df_10.to_csv("cleaned_countries_1970_2025_un.csv", index=False)

In [592]:
#df_10.to_sql('countries_1970_2025_un', engine, if_exists='replace', index=False)

In [593]:
df_11 = pd.read_csv('../data/processed/currently_married_un.csv',  header=2, low_memory=False)

In [594]:
df_11.sample(8)

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,AgeGroup,AgeStart,AgeEnd,DataValue,DataProcess,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Data,Note on Country and Population
22362,India,356,1992,1993,Women,[15-19],15,19,38.4,Survey,1992-1993 NFHS,1789,India 1992-1993 Demographic and Health Survey,DHS_STATcompiler,,,
37655,Peru,604,1986,1986,Women,[45-49],45,49,83.1,Survey,1986 DHS,1771,Peru 1986 Demographic and Health Survey,DHS_STATcompiler,1.0,,
34375,Netherlands Antilles,530,1981,1981,Men,[25-29],25,29,37.19,Census,1981 Census,1348,Netherlands Antilles 1981 Census,US Census Bureau,,,
50828,United Arab Emirates,784,1975,1975,Men,[70-74],70,74,78.04,Census,1975 Census,1621,United Arab Emirates 1975 Census,UNSD,,,
30015,Malawi,454,2013,2014,Men,[45-49],45,49,95.47,Survey,2013-2014 MICS,5582,Malawi 2013-2014 Multiple Indicator Cluster Su...,MICS,1.0,,
47898,Switzerland,756,1995,1995,Women,[75+],75,999,23.18,Estimate,1995 Estimate,2228,Switzerland 1995 Estimate,UNSD,,,
4009,Bolivia (Plurinational State of),68,1992,1992,Women,[35-39],35,39,81.74,Census,1992 Census,944,Bolivia 1992 Census,UNSD,1.0,,
10522,Czechia,203,1994,1994,Men,[75+],75,999,61.51,Estimate,1994 Estimate,2079,Czech Republic 1994 Estimate,UNSD,,,


In [595]:
df_11.columns = (
    df_11.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
    )
df_11.sample(8)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,agegroup,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteondata,noteoncountryandpopulation
43924,Slovakia,703,2012,2012,Women,[50-54],50,54,69.44,Estimate,2012 Estimate,2216,Slovakia 2012 Estimate,UNSD,,,
15523,France,250,1972,1972,Women,[10-14],10,14,0.0,Estimate,1972 Estimate,2094,France 1972 Estimate,UNSD,,,Excluding diplomatic personnel outside the cou...
19317,Guyana,328,2009,2009,Men,[20-24],20,24,28.1,Survey,2009 DHS,4701,Guyana 2009 Demographic and Health Survey,DHS_STATcompiler,1.0,,
39392,Republic of Moldova,498,2004,2004,Women,[15-19],15,19,6.08,Census,2004 Census,845,Moldova 2004 Census,UNSD,1.0,,
13929,Faeroe Islands,234,2011,2011,Women,[20-24],20,24,8.1,Census,2011 Census,4820,Faroe Islands 2011 Census,National statistics,,,
621,Angola,24,2015,2016,Men,[45-49],45,49,90.7,Survey,2015-2016 DHS,5805,Angola 2015-2016 Demographic and Health Survey,DHS_STATcompiler,1.0,,
38685,Portugal,620,1972,1972,Women,[25-29],25,29,75.95,Estimate,1972 Estimate,2193,Portugal 1972 Estimate,UNSD,,,
15243,Finland,246,2011,2011,Women,[10-14],10,14,0.0,Estimate,2011 Estimate,2093,Finland 2011 Estimate,UNSD,1.0,,Excluding Åland Islands.


In [596]:
df_11 = df_11.drop(columns = ['datacataloglongname', 'datacatalogid', 'yearstart' , 'yearend', 'noteondata', 'noteoncountryandpopulation', 'including_consensual_unions'])

df_11.rename(columns={
    "agestart": "age_start",
    "countryorarea": "country",
    "datasource": "data_source",
    "datavalue" : "data_value"
}, inplace=True)

df_11.sample(10)

Unnamed: 0,country,isocode,sex,agegroup,age_start,ageend,data_value,dataprocess,datacatalogshortname,data_source
16916,Georgia,268,Men,[30-34],30,34,69.89,Census,2014 Census,UNSD
46944,Sweden,752,Women,[30-34],30,34,53.06,Estimate,1992 Estimate,UNSD
35195,Nigeria,566,Men,[25-29],25,29,38.92,Census,2006 Census,UNSD
31722,Monaco,492,Women,[25-29],25,29,59.7,Census,1975 Census,UNSD
29170,Lithuania,440,Women,[65-69],65,69,48.07,Estimate,2009 Estimate,UNSD
28082,Liberia,430,Women,[60-64],60,64,47.05,Census,1974 Census,UNSD
38451,Poland,616,Men,[25-29],25,29,63.9,Estimate,1990 Estimate,UNSD
43254,Sierra Leone,694,Men,[35-39],35,39,86.47,Survey,2000 MICS_HH,MICS_HH
43497,Singapore,702,Women,[55-59],55,59,65.64,Census,1980 Census,UNSD
19689,Honduras,340,Men,[30-34],30,34,78.19,Census,2001 Census,UNSD


In [597]:
df_11.drop_duplicates(inplace=True)

In [598]:
df_11.isnull().sum()

country                 0
isocode                 0
sex                     0
agegroup                0
age_start               0
ageend                  0
data_value              0
dataprocess             0
datacatalogshortname    0
data_source             0
dtype: int64

In [599]:
#df_11.to_csv("cleaned_currently_married_un.csv", index=False)

In [600]:
#df_11.to_sql('currently_married_un', engine, if_exists='replace', index=False)

In [601]:
df_12 = pd.read_csv('../data/processed/ever_married_un.csv', header= 2, low_memory = False)
df_12.head()

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,AgeGroup,AgeStart,AgeEnd,DataValue,DataProcess,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Data,Note on Country and Population
0,Afghanistan,4,1972,1974,Men,[15-19],15,19,7.7,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
1,Afghanistan,4,1972,1974,Men,[20-24],20,24,32.6,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
2,Afghanistan,4,1972,1974,Men,[25-29],25,29,61.4,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
3,Afghanistan,4,1972,1974,Men,[30-34],30,34,83.0,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
4,Afghanistan,4,1972,1974,Men,[35-39],35,39,91.2,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,


In [602]:
df_12.columns = (
    df_12.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
    )
df_12.sample(8)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,agegroup,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteondata,noteoncountryandpopulation
36644,New Zealand,554,2001,2001,Men,[20-24],20,24,5.64,Census,2001 Census,1093,New Zealand 2001 Census,UNSD,,Data randomly rounded to protect confidentiali...,
23069,Iceland,352,1995,1995,Women,[70-74],70,74,89.33,Estimate,1995 Estimate,2121,Iceland 1995 Estimate,UNSD,,,
36168,Netherlands,528,2010,2010,Men,[65-69],65,69,94.1,Estimate,2010 Estimate,2170,Netherlands 2010 Estimate,UNSD,,,
25982,Israel,376,2010,2010,Men,[30-34],30,34,70.6,Estimate,2010 Estimate,2127,Israel 2010 Estimate,UNSD,,,Including data for East Jerusalem and Israeli ...
27102,Japan,392,2010,2010,Men,[40-44],40,44,71.39,Census,2010 Census,4788,Japan 2010 Census,UNSD,,,Excluding diplomatic personnel outside the cou...
26010,Israel,376,2011,2011,Men,[30-34],30,34,69.81,Estimate,2011 Estimate,2127,Israel 2011 Estimate,UNSD,,,Including data for East Jerusalem and Israeli ...
717,Antigua and Barbuda,28,2001,2001,Women,[45-49],45,49,60.87,Census,2001 Census,2277,Antigua and Barbuda 2001 Census,UNSD,1.0,,
29284,Lesotho,426,1977,1977,Women,[20-24],20,24,83.5,Survey,1977 WFS,639,Lesotho 1977 World Fertility Survey,INED,,,


In [603]:
df_12 = df_12.drop(columns = ['yearstart', 'yearend', 'datacatalogshortname', 'datacatalogid', 'datacataloglongname', 'including_consensual_unions', 'noteondata', 'noteoncountryandpopulation'])

df_12.rename(columns={
    "agestart": "age_start",
    "ageend": "age_end",
    "countryorarea": "country"
}, inplace=True)
df_12.sample(8)

Unnamed: 0,country,isocode,sex,agegroup,age_start,age_end,datavalue,dataprocess,datasource
19946,Guatemala,320,Men,[15-19],15,19,6.83,Census,UNSD
5433,Burkina Faso,854,Women,[45-49],45,49,99.7,Survey,DHS_HH
6905,Canada,124,Men,[65-69],65,69,93.4,Estimate,UNSD
17866,Georgia,268,Men,[75+],75,999,98.56,Census,UNSD
33754,Montserrat,500,Men,[25-29],25,29,31.0,Census,US Census Bureau
14816,Faeroe Islands,234,Men,[75+],75,999,85.2,Estimate,UNSD
13592,Ecuador,218,Women,[60-64],60,64,88.11,Census,UNSD
27868,Kenya,404,Women,[60-64],60,64,96.13,Census,IPUMS


In [604]:
df_12.dropna(inplace=True)

In [605]:
df_12.isnull().sum()

country        0
isocode        0
sex            0
agegroup       0
age_start      0
age_end        0
datavalue      0
dataprocess    0
datasource     0
dtype: int64

In [606]:
#df_12.to_csv("cleaned_ever_married_un.csv", index=False)

In [607]:
#df_12.to_sql('ever_married_un', engine, if_exists= 'replace', index= False)

In [608]:
df_13 = pd.read_csv('../data/processed/fertility_indicators_un.csv', header=6, low_memory=False)
df_13.head()

Unnamed: 0,Country or Area,Country or Area Code,Age Group,Indicator,Date,Value,Series,DataType,Data Source Type,Survey Programme,Data Source Inventory ID,Data Source Name,Data Source Name (short),Data Source Start Year,Data Source End Year,Reference,Reference Year
0,Afghanistan,4,[Total],TFR,1964.977051,7.966653,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
1,Afghanistan,4,[Total],TFR,1965.977051,8.212275,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
2,Afghanistan,4,[Total],TFR,1966.977051,8.317603,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
3,Afghanistan,4,[Total],TFR,1967.977051,8.225812,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
4,Afghanistan,4,[Total],TFR,1968.977051,8.068459,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012


In [609]:
df_13.columns = (df_13.columns
        .str.lower()
        .str.strip()
        .str.replace(' ', '')
        .str.replace('(', '')
        .str.replace(')', '')
        .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
        )

df_13.sample(6)

Unnamed: 0,countryorarea,countryorareacode,agegroup,indicator,date,value,series,datatype,datasourcetype,surveyprogramme,datasourceinventoryid,datasourcename,datasourcenameshort,datasourcestartyear,datasourceendyear,reference,referenceyear
48106,Montenegro,499,[30-34],ASFR3034,1993.5,76.576,"Estimates,Fertility data (Adjusted),HFC-ODE,21...",Fertility data (adjusted),Estimate,Estimate,2162,All sources of estimates,Estimates,1993,1993,European Demographic Observatory (ODE). Data c...,2011
32833,Hungary,348,[40-44],ASFR4044,2002.5,4.08,Eurostat.20190531,Official estimates,Estimate,Estimate,2120,All sources of estimates,Estimates,2002,2002,"Eurostat Statistics, Fertility rates by age [d...",2019
68304,Switzerland,756,[15-19],ASFR1519,2006.5,4.54,Eurostat.20190531,Official estimates,Estimate,Estimate,2228,All sources of estimates,Estimates,2006,2006,"Eurostat Statistics, Fertility rates by age [d...",2019
42363,Lesotho,426,[30-34],ASFR3034,2002.0,135.0,"2002 RHS,Recent births,2001 Demographic Survey...",Recent births,Survey,Survey,2923,Lesotho 2002 Reproductive Health Survey,2002 RHS,2002,2002,cited in: Lesotho Demographic Survey 2001,2002
12245,Cameroon,120,[40-44],ASFR4044,2007.0,100.6914,"2014 MICS,Birth Histories,FBH analysis 2018,56...",Birth histories,Survey,MICS,5641,Cameroon 2014 Multiple Indicator Cluster Survey,2014 MICS,2014,2014,Fertility rates from full birth histories anal...,2018
6651,Belarus,112,[15-19],ASFR1519,1976.00274,27.5,"Register, NSO 2019, Direct",Direct,Register,VR,426,Vital Registration,Register,1976,1976,"2018 Statistical Yearbook, Table 4.9., page 275",2018


In [610]:
df_13 = df_13.drop(columns=['countryorareacode','indicator','datasourceinventoryid','surveyprogramme','series','datasourcename','reference','referenceyear'])

df_13.replace({
    "agegroup": "age_group",
    "countryorarea": "country",
    "datatype": "data_type",
},inplace=True)

In [611]:
df_13['date'] = df_13['date'].astype(int)
df_13['value'] = df_13['value'].round(2)
df_13.sample(12)

Unnamed: 0,countryorarea,agegroup,date,value,datatype,datasourcetype,datasourcenameshort,datasourcestartyear,datasourceendyear
6891,Belarus,[45-49],1996,0.1,Direct,Register,Register,1996,1996
2762,Armenia,[40-44],2010,3.15,Direct,Register,Register,2010,2010
57784,Qatar,[Total],2008,30.16,Fertility data (adjusted),Register,Register,2008,2008
77929,Viet Nam,[30-34],2008,79.5,Own-children method,Census,2009 Census,2009,2009
75792,Uruguay,[Total],2003,2.14,Fertility data (adjusted),Estimate,Estimates,2003,2003
35222,Iran (Islamic Republic of),[15-19],1985,147.0,Own-children method,Census,1996 Census,1996,1996
68078,Switzerland,[45-49],1977,0.31,Official estimates,Estimate,Estimates,1977,1977
69967,Thailand,[35-39],1964,222.4,Recent births,Survey,1964-1967 SPC,1964,1967
70921,Tonga,[Total],2004,3.8,Direct,Register,Register,2004,2004
58480,Republic of Moldova,[45-49],2001,0.15,Direct,Register,Register,2001,2001


In [612]:
#df_13.to_csv("cleaned_fertility_indicators.csv", index=False)

In [613]:
#df_13.to_sql('fertility_indicators_un',engine, if_exists='replace', index=False)

In [614]:
df_14 = pd.read_csv('../data/processed/marital_status_by_age_un.csv', header= 2, low_memory=False)
df_14.head()

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,MaritalStatus,Non-standard_AgeGroups,Series_contains_Non-standard_AgeGroups,AgeGroup,AgeStart,...,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Age groups,Note on Marital Status,Note on Data,Note on Country and Population,Note Other
0,Afghanistan,4,1972,1974,Men,Divorced,,,[15-19],15,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
1,Afghanistan,4,1972,1974,Men,Divorced,,,[20-24],20,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
2,Afghanistan,4,1972,1974,Men,Divorced,,,[25-29],25,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
3,Afghanistan,4,1972,1974,Men,Divorced,,,[30-34],30,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
4,Afghanistan,4,1972,1974,Men,Divorced,,,[35-39],35,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,


In [615]:
df_14.columns= (df_14.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '' , regex=True)  
    )
df_14.sample(5)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,maritalstatus,nonstandard_agegroups,series_contains_nonstandard_agegroups,agegroup,agestart,...,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteonagegroups,noteonmaritalstatus,noteondata,noteoncountryandpopulation,noteother
225510,Solomon Islands,90,2009,2009,Men,Married,,,[50-54],50,...,2009 Census,2300,Solomon Islands 2009 Census,National statistics,,,,,,
114429,Iceland,352,2013,2013,Women,Separated,,,[70-74],70,...,2013 Estimate,2121,Iceland 2013 Estimate,UNSD,,,,,,
151829,Malawi,454,2015,2016,Men,Married,,,[25-29],25,...,2015-2016 DHS,5952,Malawi 2015-2016 Demographic and Health Survey,DHS_HH,,,,,,
2964,Angola,24,2014,2014,Women,Single,,,[45-49],45,...,2014 Census,4704,Angola 2014 Census,UNSD,,,,,,
186619,Norway,578,2018,2018,Men,Consensual union,,,[65-69],65,...,2018 Estimate,2180,Norway 2018 Estimate,UNSD,,,,,,


In [616]:
df_14 = df_14.drop(columns=['datacataloglongname', 'noteondata', 'noteoncountryandpopulation','noteonagegroups', 'noteother',
                             'including_consensual_unions','isocode', 'datacatalogid', 'noteonmaritalstatus', 'series_contains_nonstandard_agegroups','nonstandard_agegroups'])

df_14.rename(columns={
    "countryorarea": "country",
    "agegroup": "age_group",
    "maritalstatus": "marital_status",
    "yearstart": "year_start",
    "yearend": "year_end",
    }, inplace =True
    )

df_14.sample(10)

Unnamed: 0,country,year_start,year_end,sex,marital_status,age_group,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datasource
201006,Republic of Moldova,2005,2005,Women,Never married,[15-19],15,19,89.5,Survey,2005 DHS,DHS_STATcompiler
13155,Bangladesh,1974,1974,Men,Divorced,[20-24],20,24,0.2,Census,1974 Census,UNSD
47084,Congo,1974,1974,Women,Consensual union,[55-59],55,59,0.42,Census,1974 Census,INED
10662,Azerbaijan,2000,2000,Men,Widowed,[50-54],50,54,2.0,Survey,2000 MICS_HH,MICS_HH
22022,Botswana,2001,2001,Women,Single,[25-29],25,29,52.98,Census,2001 Census,UNSD
222922,Slovenia,2003,2003,Men,Widowed,[35-39],35,39,0.15,Estimate,2003 Estimate,UNSD
80,Afghanistan,1972,1974,Women,Widowed,[30-34],30,34,4.1,Survey,1972-1974 NDFGS,National statistics
217046,Senegal,2015,2015,Women,Divorced,[15-19],15,19,0.5,Survey,2015 DHS,DHS_STATcompiler
39222,Channel Islands,2001,2001,Women,Widowed,[50-54],50,54,4.22,Census,2001 Census,US Census Bureau
101844,Haiti,2003,2003,Men,Separated from marriage,[65-69],65,69,1.02,Census,2003 Census,IPUMS


In [617]:
df_14.drop_duplicates(inplace=True)
df_14.isnull().sum()

country                 0
year_start              0
year_end                0
sex                     0
marital_status          0
age_group               0
agestart                0
ageend                  0
datavalue               0
dataprocess             0
datacatalogshortname    0
datasource              0
dtype: int64

In [618]:
#df_14.to_csv("cleaned_marital_status_by_age_un.csv", index=False)

In [619]:
#df_14.to_sql('marital_status_by_age_un', engine, if_exists='replace', index=False)

In [620]:
df_15 = pd.read_csv('../data/processed/regions_un.csv', header=5, low_memory= False)
df_15.head(10)

Unnamed: 0,Region and subregion,ISO code,Regional Classification,Indicator,Year,AgeGroup,Percentage,Number,DataProcess
0,World,900,M49,Married or in-union women,1970,15-19,22.576683,71867.82,Estimate
1,World,900,M49,Married or in-union women,1970,20-24,63.802057,162860.4,Estimate
2,World,900,M49,Married or in-union women,1970,25-29,87.174827,182681.1,Estimate
3,World,900,M49,Married or in-union women,1970,30-34,90.825027,179121.4,Estimate
4,World,900,M49,Married or in-union women,1970,35-39,90.284386,161526.3,Estimate
5,World,900,M49,Married or in-union women,1970,40-44,86.483531,139334.4,Estimate
6,World,900,M49,Married or in-union women,1970,45-49,82.680237,116088.4,Estimate
7,World,900,M49,Married or in-union women,1970,15-49,69.379111,1013480.0,Estimate
8,World,900,M49,Married or in-union women,1971,15-19,22.630416,74127.62,Estimate
9,World,900,M49,Married or in-union women,1971,20-24,63.613178,170087.3,Estimate


In [621]:
df_15.columns = (df_15.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(','')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
    )
df_15.sample(6)

Unnamed: 0,regionandsubregion,isocode,regionalclassification,indicator,year,agegroup,percentage,number,dataprocess
10180,Eastern Asia,906,SDG-M49,Married or in-union women,2027,35-39,89.22404,109431.418861,Projection
24889,Least developed countries,941,Development group,Married or in-union women,2003,20-24,68.850759,22853.354668,Estimate
2885,Eastern and South-Eastern Asia,753,SDG,Married or in-union women,2006,40-44,92.568024,150998.194635,Estimate
25476,Low-income countries,1500,Income group,Married or in-union women,1995,35-39,86.527022,15815.918738,Estimate
11288,Western Asia,922,M49,Married or in-union women,2004,15-19,12.086433,2442.970736,Estimate
27573,High-income countries,1503,Income group,Married or in-union women,2014,40-44,74.221606,35798.415256,Estimate


In [622]:
df_15 = df_15.drop(columns=['regionalclassification'])

df_15.rename(columns={
    "regionandsubregion": "region",
    "isocode": "iso_code",
    "agegroup": "age_group",
    "dataprocess": "process"
}, inplace=True)

df_15.sample(10)

Unnamed: 0,region,iso_code,indicator,year,age_group,percentage,number,process
16542,Caribbean,915,Married or in-union women,2012,45-49,67.123719,1871.863042,Estimate
23553,Developing countries,902,Married or in-union women,1998,20-24,56.546563,118818.545328,Estimate
9802,Eastern Asia,906,Married or in-union women,1980,25-29,92.158454,91445.269154,Estimate
19348,Oceania,909,Married or in-union women,2039,35-39,74.127908,1635.850221,Projection
21983,Micronesia,954,Married or in-union women,2044,15-49,47.415962,132.162511,Projection
20040,Australia and New Zealand,927,Married or in-union women,2007,35-39,74.470694,707.94076,Estimate
8827,Asia,935,Married or in-union women,2020,30-34,87.900477,318485.791285,Estimate
588,World,900,Married or in-union women,2043,35-39,81.455861,466274.154669,Projection
6848,Northern Africa,912,Married or in-union women,2016,15-19,9.740335,2027.972946,Estimate
18988,Oceania,909,Married or in-union women,1994,35-39,81.316562,1006.222934,Estimate


In [623]:
df_15.dropna(inplace=True)
df_15.isnull().sum()

region        0
iso_code      0
indicator     0
year          0
age_group     0
percentage    0
number        0
process       0
dtype: int64

In [624]:
print(df_15['number'] % 1 != 0)

0        True
1        True
2        True
3        True
4        True
         ... 
28507    True
28508    True
28509    True
28510    True
28511    True
Name: number, Length: 28512, dtype: bool


In [625]:
df_15['percentage'] = df_15['percentage'].round(2)
df_15['number'] = df_15['number'].astype(int)
df_15.head(10)

Unnamed: 0,region,iso_code,indicator,year,age_group,percentage,number,process
0,World,900,Married or in-union women,1970,15-19,22.58,71867,Estimate
1,World,900,Married or in-union women,1970,20-24,63.8,162860,Estimate
2,World,900,Married or in-union women,1970,25-29,87.17,182681,Estimate
3,World,900,Married or in-union women,1970,30-34,90.83,179121,Estimate
4,World,900,Married or in-union women,1970,35-39,90.28,161526,Estimate
5,World,900,Married or in-union women,1970,40-44,86.48,139334,Estimate
6,World,900,Married or in-union women,1970,45-49,82.68,116088,Estimate
7,World,900,Married or in-union women,1970,15-49,69.38,1013479,Estimate
8,World,900,Married or in-union women,1971,15-19,22.63,74127,Estimate
9,World,900,Married or in-union women,1971,20-24,63.61,170087,Estimate


In [626]:
#df_15.to_csv('cleaned_regions_un.csv', index=False)



In [627]:
#df_15.to_sql('regions_un', engine, if_exists='replace',index=False)

In [628]:
df_16_1 = pd.read_csv('../data/Raw/OECD/SF_1_1_Family_size_and_composition - Sayfa1.csv')
df_16_1
#Data for Chart SF1.1.A. Average size of households by household type, 2024a
# avg_size_all	avg_size_couple_with_children	avg_size_single_parent_with_children		

Unnamed: 0,Country,All households,Couple households with children,Single parent households with children
0,Mexico,356,408.0,276.0
1,Costa Rica,346,437.0,344.0
2,Türkiye,320,410.0,280.0
3,Israel,319,465.0,286.0
4,Columbia,310,,
5,Slovak Republic,310,380.0,250.0
6,Chile,280,,
7,Iceland,270,412.0,261.0
8,New Zealand,261,388.0,267.0
9,Greece,260,380.0,250.0


In [629]:
df_16_1.columns = df_16_1.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [630]:
df_16_1.rename(columns={
        "All households": "avg_size_all",
        "Couple with children": "avg_size_couple_with_children",
        "Single parent with children": "avg_size_single_parent_with_children"
}, inplace=True)

In [631]:
df_16_1.drop_duplicates(inplace=True)
df_16_1.dropna(inplace=True)

In [632]:
for col in df_16_1.columns:
    if col != 'country':
        # Replace commas with dots if necessary, remove non-numeric chars, convert to float
        df_16_1[col] = (
            df_16_1[col]
            .astype(str)  # ensure string for replace
            .str.replace(',', '.', regex=False)  # decimal commas to dots
            .str.replace(r'[^\d\.\-]', '', regex=True)  # remove non-numeric chars except dot and minus
            .replace('', None)  # empty to NaN
            .astype(float)  # convert to float
        )

# Check updated dtypes
print(df_16_1.dtypes)

country                                    object
all_households                            float64
couple_households_with_children           float64
single_parent_households_with_children    float64
dtype: object


In [633]:
info_16_1 = pd.DataFrame({
    'dtype': df_16_1.dtypes,
    'null_count': df_16_1.isnull().sum(),
    'unique_count': df_16_1.nunique()
})
print(info_16_1)

                                          dtype  null_count  unique_count
country                                  object           0            39
all_households                          float64           0            19
couple_households_with_children         float64           0            16
single_parent_households_with_children  float64           0            15


In [634]:
df_16_1.sample(10)

Unnamed: 0,country,all_households,couple_households_with_children,single_parent_households_with_children
28,Switzerland,2.21,4.02,2.58
26,United Kingdom,2.3,3.9,2.8
10,Croatia,2.6,3.9,2.6
13,Spain,2.5,3.7,2.4
19,Portugal,2.4,3.5,2.4
3,Israel,3.19,4.65,2.86
5,Slovak Republic,3.1,3.8,2.5
25,Poland,2.3,3.7,2.5
31,Austria,2.2,3.8,2.5
42,Finland,1.9,4.0,2.6


In [635]:
#df_16_1.to_csv('../data/Cleaned/cleaned_average_size_of_households_type_2024_oecd.csv', index=False)

In [636]:
#df_16_1.to_sql('average_size_of_households_type_2024_oecd', engine, if_exists = 'replace', index= False)

In [637]:
df_16_2 = pd.read_csv('../data/Raw/OECD/SF_1_1_Family_size_and_composition - Sayfa2.csv', header=1)
df_16_2
#Table SF1.1.A. Types of household, 2021a
# share_couple_total	share_couple_with_children	share_couple_without_children	share_single_parent_total	share_single_mother	share_single_father	share_single_person	share_other_types						

Unnamed: 0,Country,Total,With children,Without children,Total.1,Single mother households,Single father households,Single person households,Other households types
0,Australia,5593,2990,2602,1037,,,2512,858
1,Austria,4893,2113,2780,563,478,085,3834,711
2,Belgium,5222,2398,2824,742,608,135,3550,486
3,Canada,5092,2530,2562,872,,,2935,1102
4,Chile,..,..,..,..,..,..,..,..
5,Columbia,..,..,..,..,..,..,..,..
6,Costa Rica,5244,3815,1429,1055,949,106,1127,2574
7,Czechia,4703,2170,2532,715,611,104,3915,667
8,Denmark,4860,2041,2819,631,511,119,3757,752
9,Estonia,4620,2546,2073,683,609,074,3699,998


In [638]:
df_16_2.rename(columns={
    "Total": "couple_total(%)",
    "Couple with children": "couple_with_children(%)",
    "Couple without children": "couple_without_children(%)",
    "Total.1": "single_parent_total(%)",
    "Single mother households": "single_mother(%)",
    "Single father households": "single_father(%)",
    "Single person households": "single_person(%)",
    "Other types of households": "other_household_types(%)"
}, inplace=True)

In [639]:
# Normalize column names (lowercase, underscores, remove () and non-ascii)
df_16_2.columns = (
    df_16_2.columns.str.strip().str.lower()
    .str.replace(' ', '_')
    .str.replace('[()%]', '', regex=True)
    .str.replace('[^0-9a-z_]', '', regex=True)
)

In [640]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_16_2.columns if c != "country"]

df_16_2[num_cols] = (
    df_16_2[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [641]:
df_16_2.drop_duplicates(inplace=True)
df_16_2.dropna(inplace=True)
df_16_2.dropna(how="all", subset=num_cols, inplace=True)

In [642]:
df_16_2.rename(columns={
   "couple_total" : "couple_total(%)",
   "with_children" : "with_children(%)",
   "without_children" : "without_children(%)",
    "single_parent_total" : "single_parent_total(%)",
    "single_mother" : "single_mother(%)",
    "single_father" : "single_father(%)",
    "single_person" : "single_person(%)",
    "other_household_types" : "other_household_types(%)"
}, inplace=True)

In [643]:
info_16_2 = pd.DataFrame({
    "dtype": df_16_2.dtypes,
    "null_count": df_16_2.isna().sum(),
    "unique_count": df_16_2.nunique()
})
print(info_16_2)
print(df_16_2.dtypes)

                          dtype  null_count  unique_count
country                  object           0            36
couple_total(%)         float64           0            36
with_children(%)        float64           0            35
without_children(%)     float64           0            36
single_parent_total(%)  float64           0            34
single_mother(%)        float64           0            32
single_father(%)        float64           0            31
single_person(%)        float64           0            35
other_households_types  float64           0            36
country                    object
couple_total(%)           float64
with_children(%)          float64
without_children(%)       float64
single_parent_total(%)    float64
single_mother(%)          float64
single_father(%)          float64
single_person(%)          float64
other_households_types    float64
dtype: object


In [644]:
df_16_2.sample(10)

Unnamed: 0,country,couple_total(%),with_children(%),without_children(%),single_parent_total(%),single_mother(%),single_father(%),single_person(%),other_households_types
21,Latvia,27.8,12.21,15.6,13.44,11.21,2.23,41.08,17.68
10,Finland,45.64,17.06,28.58,5.43,4.5,0.93,45.34,3.6
31,Slovenia,45.41,20.97,24.44,6.93,5.57,1.36,34.0,13.66
34,Switzerland,53.77,24.05,29.72,4.7,3.88,0.82,36.88,4.65
23,Luxembourg,53.06,26.86,26.2,6.63,5.39,1.24,28.87,11.44
25,Netherlands,53.6,23.01,30.59,6.1,5.0,1.09,38.5,1.8
43,Romania,45.66,20.73,24.93,6.5,4.56,1.94,33.63,14.21
12,Germany,45.78,17.89,27.89,5.41,4.44,0.98,43.14,5.67
39,Bulgaria,40.3,16.35,23.95,4.6,3.88,0.73,35.81,19.28
1,Austria,48.93,21.13,27.8,5.63,4.78,0.85,38.34,7.11


In [645]:
#df_16_2.to_csv('../data/Cleaned/cleaned_types_of_household_2021_oecd.csv', index = False)

In [646]:
#df_16_2.to_sql('types_of_household_2021_oecd', engine, if_exists = 'replace', index= False)

In [647]:
df_16_3 = pd.read_csv('../data/Raw/OECD/SF_1_1_Family_size_and_composition - Sayfa3.csv', header=1)
df_16_3
#Table SF1.1.B. Households by number of children, 2024
# share_hh_0_children	share_hh_1_child	share_hh_2_children	share_hh_3plus_children		

Unnamed: 0,country,0 children,1 child,2 children,3 or more children,Children under 6
0,Australia,..,..,..,..,..
1,Austria,7778,1052,857,312,944
2,Belgium,7397,1176,1015,411,1040
3,Canada,..,..,..,..,..
4,Chile,..,..,..,..,..
5,Columbia,..,..,..,..,..
6,Costa Rica,3029,2308,2461,2202,2630
7,Czechia,7195,1385,1156,264,1229
8,Denmark,7778,1054,894,274,815
9,Estonia,7576,1253,873,298,985


In [648]:
# Normalize column names (lowercase, underscores, remove () and non-ascii)
df_16_3.columns = (
    df_16_3.columns.str.strip().str.lower()
    .str.replace(' ', '_')
    .str.replace('[^0-9a-z_]', '', regex=True)
)

In [649]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_16_3.columns if c != "country"]

df_16_3[num_cols] = (
    df_16_3[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [650]:
df_16_3.drop_duplicates(inplace=True)
df_16_3.dropna(inplace=True)

In [651]:
df_16_3.sample(10)

Unnamed: 0,country,0_children,1_child,2_children,3_or_more_children,children_under_6
23,Luxembourg,73.0,12.49,12.07,2.41,11.54
18,Italy,77.79,12.26,8.28,1.66,8.05
12,Germany,79.86,9.91,7.72,2.51,8.57
40,Croatia,74.18,11.96,10.1,3.76,10.44
43,Romania,72.46,14.29,9.24,4.02,9.64
10,Finland,81.98,7.89,6.99,3.14,7.14
9,Estonia,75.76,12.53,8.73,2.98,9.85
19,Japan,81.94,8.78,7.17,2.11,7.58
33,Sweden,74.84,10.77,9.83,4.56,9.95
21,Latvia,74.8,14.05,8.32,2.83,10.07


In [652]:
df_16_3.rename(columns={
    "0_children": "households_0_children(%)",
    "1_child": "households_1_child(%)",
    "2_children": "households_2_children(%)",
    "3_or_more_children": "households_3_or_more_children(%)"
}, inplace=True)

In [653]:
info_16_3 = pd.DataFrame({
    "dtype": df_16_3.dtypes,
    "null_count": df_16_3.isna().sum(),
    "unique_count": df_16_3.nunique()
})
print(info_16_3)
print(df_16_3.dtypes)

                                    dtype  null_count  unique_count
country                            object           0            33
households_0_children(%)          float64           0            32
households_1_child(%)             float64           0            32
households_2_children(%)          float64           0            33
households_3_or_more_children(%)  float64           0            31
children_under_6                  float64           0            31
country                              object
households_0_children(%)            float64
households_1_child(%)               float64
households_2_children(%)            float64
households_3_or_more_children(%)    float64
children_under_6                    float64
dtype: object


In [654]:
df_16_3.sample(10)

Unnamed: 0,country,households_0_children(%),households_1_child(%),households_2_children(%),households_3_or_more_children(%),children_under_6
23,Luxembourg,73.0,12.49,12.07,2.41,11.54
30,Slovak Republic,64.41,17.09,14.49,4.02,15.56
8,Denmark,77.78,10.54,8.94,2.74,8.15
42,Malta,76.49,12.68,7.81,2.98,9.61
35,Türkiye,57.62,17.42,14.5,10.45,19.65
9,Estonia,75.76,12.53,8.73,2.98,9.85
7,Czechia,71.95,13.85,11.56,2.64,12.29
44,EU average,75.1,12.28,9.46,3.15,9.9
43,Romania,72.46,14.29,9.24,4.02,9.64
40,Croatia,74.18,11.96,10.1,3.76,10.44


In [655]:
#df_16_3.to_csv('../data/Cleaned/cleaned_households_by_number_of_children_2024_oecd.csv', index=False)

In [656]:
#df_16_3.to_sql('households_by_number_of_children_2024_oecd', engine, index= False)

In [657]:
df_17_1 = pd.read_csv('../data/Raw/OECD/SF_2_1_Total_Fertility_rates_S1.csv')
#total_fertility_rates_from_1960_oecd
df_17_1.head()

Unnamed: 0,Country,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Australia,345,355,343,334,315,297,289,285,289,...,179,179,179,174,174,167,159,170,163,150
1,Austria,269,278,280,282,279,270,266,262,258,...,146,149,153,152,148,146,144,148,141,132
2,Belgium,254,263,259,268,271,261,252,241,231,...,174,170,168,165,162,160,155,160,153,147
3,Canada,390,384,376,367,350,315,281,260,245,...,161,160,159,155,151,147,141,144,133,126
4,Chile,470,466,460,454,446,436,426,414,403,...,177,174,169,156,154,143,131,118,126,117


In [658]:
df_info = pd.DataFrame({
    'dtype': df_17_1.dtypes,
    'null_count': df_17_1.isnull().sum(),
    'unique_count': df_17_1.nunique()
})
print(df_info)

          dtype  null_count  unique_count
Country  object           0            49
1960     object           0            47
1961     object           0            47
1962     object           0            47
1963     object           0            46
...         ...         ...           ...
2019     object           0            37
2020     object           0            39
2021     object           0            40
2022     object           0            34
2023     object           0            35

[65 rows x 3 columns]


In [659]:
df_17_1.columns = df_17_1.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [660]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_17_1.columns if c != "country"]

df_17_1[num_cols] = (
    df_17_1[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [661]:
df_17_1.drop_duplicates(inplace=True)
df_17_1.dropna(inplace=True)

In [662]:
df_info = pd.DataFrame({
    'dtype': df_17_1.dtypes,
    'null_count': df_17_1.isnull().sum(),
    'unique_count': df_17_1.nunique()
})
print(df_info)

           dtype  null_count  unique_count
country   object           0            49
1960     float64           0            47
1961     float64           0            47
1962     float64           0            47
1963     float64           0            46
...          ...         ...           ...
2019     float64           0            37
2020     float64           0            39
2021     float64           0            40
2022     float64           0            34
2023     float64           0            35

[65 rows x 3 columns]


In [663]:
df_17_1.sample(10)

Unnamed: 0,country,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
45,Saudi Arabia,7.63,7.63,7.64,7.65,7.67,7.66,7.66,7.66,7.63,...,2.68,2.63,2.65,2.67,2.67,2.49,2.27,2.17,2.14,2.28
44,Argentina,3.14,3.12,3.13,3.11,3.08,3.01,2.97,2.98,3.01,...,2.39,2.35,2.24,2.17,2.07,1.88,1.6,1.59,1.48,1.5
37,Brazil,6.05,6.02,5.98,5.93,5.82,5.68,5.52,5.34,5.17,...,1.78,1.78,1.73,1.74,1.74,1.71,1.65,1.64,1.63,1.62
18,Japan,2.0,1.96,1.98,2.0,2.05,2.14,1.58,2.23,2.13,...,1.42,1.45,1.44,1.43,1.42,1.36,1.33,1.3,1.26,1.2
11,France,2.74,2.82,2.8,2.9,2.91,2.85,2.8,2.67,2.59,...,2.0,1.96,1.92,1.89,1.87,1.86,1.82,1.83,1.78,1.66
16,Ireland,3.76,3.79,3.92,4.01,4.06,4.03,3.95,3.84,3.78,...,1.89,1.85,1.82,1.78,1.75,1.7,1.63,1.72,1.7,1.5
40,Indonesia,5.51,5.52,5.53,5.55,5.57,5.59,5.59,5.58,5.57,...,2.39,2.35,2.31,2.25,2.22,2.21,2.19,2.17,2.15,2.13
20,Latvia,1.94,1.94,1.91,1.85,1.79,1.74,1.76,1.8,1.83,...,1.65,1.7,1.74,1.69,1.6,1.61,1.55,1.57,1.47,1.36
48,EU-27 average,2.62,2.62,2.61,2.65,2.67,2.62,2.58,2.53,2.45,...,1.54,1.54,1.57,1.55,1.54,1.52,1.51,1.54,1.46,1.39
8,Denmark,2.54,2.55,2.54,2.64,2.6,2.61,2.62,2.35,2.12,...,1.69,1.71,1.79,1.75,1.73,1.7,1.67,1.72,1.55,1.5


In [664]:
#df_17_1.to_csv('../data/Cleaned/cleaned_total_fertility_rates_oecd.csv', index=False)

In [665]:
#df_17_1.to_sql('total_fertility_rates_oecd', engine, if_exists='replace', index=False)

In [666]:
df_17_2 = pd.read_csv('../data/Raw/OECD/SF_2_1_Fertility_rates_Births_by_birth_order_S2.csv')
df_17_2

Unnamed: 0,Country,Birth order,1987,1988,1989,1990,1991,1992,1993,1994,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Austria,First birth,476,478,467,462,465,461,452,445,...,480,483,473,475,471,472,477,476,484,481
1,Austria,Second birth,337,337,343,349,345,348,358,364,...,355,353,356,353,353,351,353,355,349,351
2,Austria,Third birth or higher,188,185,190,189,190,191,189,191,...,165,164,171,172,176,177,170,169,167,168
3,Belgium,First birth,468,469,473,473,481,472,469,472,...,423,435,441,436,429,426,450,440,447,455
4,Belgium,Second birth,330,329,327,328,323,328,335,330,...,351,348,345,346,345,347,342,351,343,341
5,Belgium,Third birth or higher,202,202,199,199,196,200,196,198,...,226,218,214,219,226,226,208,209,209,204
6,Czechia,First birth,467,466,474,478,501,498,485,477,...,474,481,487,487,480,478,476,464,463,463
7,Czechia,Second birth,377,376,374,372,355,358,368,369,...,375,373,367,366,372,376,376,390,386,391
8,Czechia,Third birth or higher,156,158,152,150,144,144,148,154,...,151,147,146,147,147,146,148,146,15,146
9,Estonia,First birth,435,435,440,462,495,503,496,496,...,419,423,408,402,367,388,380,372,398,397


In [667]:
df_info = pd.DataFrame({
    'dtype': df_17_2.dtypes,
    'null_count': df_17_2.isnull().sum(),
    'unique_count': df_17_2.nunique()
})
print(df_info)

              dtype  null_count  unique_count
Country      object           0            17
Birth order  object           0             3
1987         object           0            48
1988         object           0            49
1989         object           0            48
1990         object           0            44
1991         object           0            48
1992         object           0            46
1993         object           0            47
1994         object           0            47
1995         object           0            48
1996         object           0            47
1997         object           0            49
1998         object           0            50
1999         object           0            49
2000         object           0            48
2001         object           0            50
2002         object           0            47
2003         object           0            50
2004         object           0            49
2005         object           0   

In [668]:
df_17_2.columns = df_17_2.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_17_2.head()

Unnamed: 0,country,birth_order,1987,1988,1989,1990,1991,1992,1993,1994,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Austria,First birth,476,478,467,462,465,461,452,445,...,480,483,473,475,471,472,477,476,484,481
1,Austria,Second birth,337,337,343,349,345,348,358,364,...,355,353,356,353,353,351,353,355,349,351
2,Austria,Third birth or higher,188,185,190,189,190,191,189,191,...,165,164,171,172,176,177,170,169,167,168
3,Belgium,First birth,468,469,473,473,481,472,469,472,...,423,435,441,436,429,426,450,440,447,455
4,Belgium,Second birth,330,329,327,328,323,328,335,330,...,351,348,345,346,345,347,342,351,343,341


In [669]:
# --- Ensure "country" and "birth order" are strings (tidy casing/spacing) ---
df_17_2["country"] = df_17_2["country"].astype(str).str.strip().str.title()
df_17_2["birth_order"] = df_17_2["birth_order"].astype(str).str.strip()

# --- Identify numeric columns: everything except country and birth order ---
num_cols = [c for c in df_17_2.columns if c not in ["country", "birth_order"]]
# --- Robust cleaning -> convert to float ---
df_17_2[num_cols] = (
    df_17_2[num_cols].astype(str)
    .replace({"\xa0": "", "\u202f": "", ",": "."}, regex=True)
    .replace(r"[^\d\.\-]", "", regex=True)
    .replace(r"\.\.+", ".", regex=True)
    .replace(r"^\.$|^\s*$", np.nan, regex=True)
    .apply(pd.to_numeric, errors="coerce")
)
df_17_2[num_cols] = df_17_2[num_cols].round(2)


In [670]:
df_17_2.drop_duplicates(inplace=True)
df_17_2.dropna(inplace=True)

In [671]:
df_17_2["birth_order"] = df_17_2["birth_order"].astype(str) + "(%)"

In [672]:
df_17_2.head(10)

Unnamed: 0,country,birth_order,1987,1988,1989,1990,1991,1992,1993,1994,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Austria,First birth(%),47.6,47.8,46.7,46.2,46.5,46.1,45.2,44.5,...,48.0,48.3,47.3,47.5,47.1,47.2,47.7,47.6,48.4,48.1
1,Austria,Second birth(%),33.7,33.7,34.3,34.9,34.5,34.8,35.8,36.4,...,35.5,35.3,35.6,35.3,35.3,35.1,35.3,35.5,34.9,35.1
2,Austria,Third birth or higher(%),18.8,18.5,19.0,18.9,19.0,19.1,18.9,19.1,...,16.5,16.4,17.1,17.2,17.6,17.7,17.0,16.9,16.7,16.8
3,Belgium,First birth(%),46.8,46.9,47.3,47.3,48.1,47.2,46.9,47.2,...,42.3,43.5,44.1,43.6,42.9,42.6,45.0,44.0,44.7,45.5
4,Belgium,Second birth(%),33.0,32.9,32.7,32.8,32.3,32.8,33.5,33.0,...,35.1,34.8,34.5,34.6,34.5,34.7,34.2,35.1,34.3,34.1
5,Belgium,Third birth or higher(%),20.2,20.2,19.9,19.9,19.6,20.0,19.6,19.8,...,22.6,21.8,21.4,21.9,22.6,22.6,20.8,20.9,20.9,20.4
6,Czechia,First birth(%),46.7,46.6,47.4,47.8,50.1,49.8,48.5,47.7,...,47.4,48.1,48.7,48.7,48.0,47.8,47.6,46.4,46.3,46.3
7,Czechia,Second birth(%),37.7,37.6,37.4,37.2,35.5,35.8,36.8,36.9,...,37.5,37.3,36.7,36.6,37.2,37.6,37.6,39.0,38.6,39.1
8,Czechia,Third birth or higher(%),15.6,15.8,15.2,15.0,14.4,14.4,14.8,15.4,...,15.1,14.7,14.6,14.7,14.7,14.6,14.8,14.6,15.0,14.6
9,Estonia,First birth(%),43.5,43.5,44.0,46.2,49.5,50.3,49.6,49.6,...,41.9,42.3,40.8,40.2,36.7,38.8,38.0,37.2,39.8,39.7


In [673]:
df_info = pd.DataFrame({
    'dtype': df_17_2.dtypes,
    'null_count': df_17_2.isnull().sum(),
    'unique_count': df_17_2.nunique()
})
print(df_info)

               dtype  null_count  unique_count
country       object           0            17
birth_order   object           0             3
1987         float64           0            48
1988         float64           0            49
1989         float64           0            48
1990         float64           0            44
1991         float64           0            48
1992         float64           0            46
1993         float64           0            47
1994         float64           0            47
1995         float64           0            48
1996         float64           0            47
1997         float64           0            49
1998         float64           0            50
1999         float64           0            49
2000         float64           0            48
2001         float64           0            50
2002         float64           0            47
2003         float64           0            50
2004         float64           0            49
2005         

In [674]:
#df_17_2.to_csv('../data/Cleaned/cleaned_births_by_birth_order_oecd.csv', index=False)

In [675]:
#df_17_2.to_sql('births_by_birth_order_oecd', engine, if_exists='replace', index=False)

In [676]:
df_18 = pd.read_csv('../data/Raw/OECD/sf1_2_wide_from_df18.csv')
df_18

Unnamed: 0,country,year,Living with two parents,Living with a single parent,Other
0,Australia,2003,80.1,19.5,0.5
1,Australia,2006,81.5,18.0,0.5
2,Australia,2009,82.0,17.6,0.4
3,Australia,2012,81.3,18.0,0.6
4,Austria,2003,81.2,16.8,2.0
...,...,...,...,...,...
470,United States,2014,68.7,27.5,3.8
471,United States,2015,69.2,26.8,3.9
472,United States,2016,68.7,27.4,3.8
473,United States,2017,68.9,27.1,4.0


In [677]:
for col in df_18.select_dtypes(include=['object']).columns:
    df_18[col] = df_18[col].astype(str).str.strip()

# 2) Define placeholders representing missing data in OECD exports
placeholders = ['..', '...', '.', ' .', '…', 'Na', 'nan', 'None']

# 3) Replace placeholders with NaN directly in df_18
df_18.replace(placeholders, pd.NA, inplace=True)

In [678]:
# 1) Ensure 'year' is integer
df_18["year"] = pd.to_numeric(df_18["year"], errors="coerce").astype("Int64")

# 2) Convert all non-key columns to numeric and round(2)
for col in df_18.columns:
    if col not in ["country", "year"]:
        df_18[col] = pd.to_numeric(df_18[col], errors="coerce").round(2)

In [679]:
# 1) Drop rows with missing key fields
df_18.dropna(subset=["country", "year"], inplace=True)

# 2) Drop duplicate country-year rows, keep the first
df_18.drop_duplicates(subset=["country", "year"], keep="first", inplace=True)

# 3) Drop rows where all value columns are NaN
value_cols = [c for c in df_18.columns if c not in ["country", "year"]]
df_18.dropna(subset=value_cols, how="all", inplace=True)

# 4) Sort and reset index
df_18.sort_values(["country", "year"], inplace=True)
df_18.reset_index(drop=True, inplace=True)


In [680]:
df_18

Unnamed: 0,country,year,Living with two parents,Living with a single parent,Other
0,Australia,2003,80.1,19.5,0.5
1,Australia,2006,81.5,18.0,0.5
2,Australia,2009,82.0,17.6,0.4
3,Australia,2012,81.3,18.0,0.6
4,Austria,2003,81.2,16.8,2.0
...,...,...,...,...,...
470,United States,2014,68.7,27.5,3.8
471,United States,2015,69.2,26.8,3.9
472,United States,2016,68.7,27.4,3.8
473,United States,2017,68.9,27.1,4.0


In [681]:
df_18['Other'] = pd.to_numeric(df_18['Other'], errors='coerce')

In [682]:
df_info = pd.DataFrame({
    'dtype': df_18.dtypes,
    'null_count': df_18.isnull().sum(),
    'unique_count': df_18.nunique()
})
print(df_info)

                               dtype  null_count  unique_count
country                       object           0            39
year                           Int64           0            18
Living with two parents      float64           0           211
Living with a single parent  float64           0           203
Other                        float64           1            50


In [683]:
print(repr(df_18.loc[df_18['Other'].notnull(), 'Other'].unique()))

array([0.5, 0.4, 0.6, 2. , 1. , 1.9, 0.3, 0.1, 0.8, 0.7, 8.7, 3.5, 2.5,
       2.1, 2.4, 2.6, 6.7, 5.1, 1.4, 1.2, 1.7, 1.5, 3.4, 2.9, 2.3, 3. ,
       4.2, 2.8, 1.3, 9. , 0.2, 0.9, 1.1, 4.5, 4.7, 1.6, 3.8, 3.6, 3.3,
       2.2, 0. , 1.8, 2.7, 3.2, 3.9, 4.1, 4.4, 3.7, 4. , 4.3])


In [684]:
df_18['Other'] = pd.to_numeric(df_18['Other'], errors='coerce')

df_18.dropna(inplace=True, subset=['Other'])

df_18.isnull().sum()

country                        0
year                           0
Living with two parents        0
Living with a single parent    0
Other                          0
dtype: int64

In [685]:
#df_18.to_csv('../data/Cleaned/cleaned_household_children.csv', index=False)

In [686]:
#df_18.to_sql('household_children_oecd', engine, if_exists= 'replace', index= False)

In [687]:
df_19_1 =pd.read_csv('../data/Raw/OECD/SF_2_3_Age_mothers_childbirth_mean_age_birth_S1.csv')
#age_of_mothers_at_childbirth
df_19_1

Unnamed: 0,Country,1963,1964,1965,1966,1967,1968,1969,1970,1971,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Australia,275,275,274,273,273,272,272,271,269,...,301,301,302,303,305,306,307,308,309,311
1,Austria,274,274,273,271,270,268,268,267,267,...,302,303,304,306,306,307,309,310,310,312
2,Belgium,278,277,276,275,274,273,272,272,270,...,300,302,303,304,305,306,307,308,308,310
3,Canada,278,279,278,277,275,273,273,272,270,...,303,304,305,306,307,309,310,312,313,314
4,Chile,292,291,291,290,288,287,286,284,282,...,281,283,285,288,291,294,296,299,301,..
5,Czech Republic,257,258,255,252,250,249,248,248,249,...,298,299,299,300,300,300,301,302,302,304
6,Costa Rica,293,293,293,293,292,291,289,287,285,...,265,267,268,271,272,274,276,279,284,287
7,Denmark,273,268,268,266,265,265,266,267,267,...,307,308,309,310,310,311,312,313,314,316
8,Estonia,276,274,273,273,271,269,269,267,267,...,296,295,296,299,302,304,305,306,307,310
9,Finland,281,280,280,278,277,275,274,271,269,...,304,305,305,306,308,309,310,311,312,314


In [688]:
df_info = pd.DataFrame({
    'dtype': df_19_1.dtypes,
    'null_count': df_19_1.isnull().sum(),
    'unique_count': df_19_1.nunique()
})
print(df_info)

          dtype  null_count  unique_count
Country  object           0            26
1963     object           0            19
1964     object           0            22
1965     object           0            22
1966     object           0            22
1967     object           0            22
1968     object           0            20
1969     object           0            21
1970     object           0            19
1971     object           0            19
1972     object           0            20
1973     object           0            20
1974     object           0            24
1975     object           0            21
1976     object           0            22
1977     object           0            20
1978     object           0            22
1979     object           0            23
1980     object           0            22
1981     object           0            20
1982     object           0            18
1983     object           0            20
1984     object           0       

In [689]:
df_19_1.columns = df_19_1.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [690]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_19_1.columns if c != "country"]

df_19_1[num_cols] = (
    df_19_1[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [691]:
df_19_1.drop_duplicates(inplace=True)
df_19_1.dropna(inplace=True)

In [692]:
df_info = pd.DataFrame({
    'dtype': df_19_1.dtypes,
    'null_count': df_19_1.isnull().sum(),
    'unique_count': df_19_1.nunique()
})
print(df_info)

           dtype  null_count  unique_count
country   object           0            22
1963     float64           0            16
1964     float64           0            18
1965     float64           0            18
1966     float64           0            18
1967     float64           0            18
1968     float64           0            17
1969     float64           0            17
1970     float64           0            15
1971     float64           0            17
1972     float64           0            18
1973     float64           0            18
1974     float64           0            20
1975     float64           0            18
1976     float64           0            18
1977     float64           0            16
1978     float64           0            18
1979     float64           0            21
1980     float64           0            20
1981     float64           0            17
1982     float64           0            17
1983     float64           0            18
1984     fl

In [693]:
df_19_1.sample(10)

Unnamed: 0,country,1963,1964,1965,1966,1967,1968,1969,1970,1971,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
17,Netherlands,29.3,29.2,29.0,28.8,28.5,28.4,28.3,28.2,28.0,...,30.9,31.0,31.1,31.2,31.3,31.4,31.5,31.6,31.7,31.8
9,Finland,28.1,28.0,28.0,27.8,27.7,27.5,27.4,27.1,26.9,...,30.4,30.5,30.5,30.6,30.8,30.9,31.0,31.1,31.2,31.4
1,Austria,27.4,27.4,27.3,27.1,27.0,26.8,26.8,26.7,26.7,...,30.2,30.3,30.4,30.6,30.6,30.7,30.9,31.0,31.0,31.2
0,Australia,27.5,27.5,27.4,27.3,27.3,27.2,27.2,27.1,26.9,...,30.1,30.1,30.2,30.3,30.5,30.6,30.7,30.8,30.9,31.1
12,Iceland,27.6,27.7,27.7,27.5,27.5,27.4,27.3,27.2,27.0,...,30.1,30.4,30.2,30.3,30.6,30.6,30.6,30.9,30.7,30.9
8,Estonia,27.6,27.4,27.3,27.3,27.1,26.9,26.9,26.7,26.7,...,29.6,29.5,29.6,29.9,30.2,30.4,30.5,30.6,30.7,31.0
7,Denmark,27.3,26.8,26.8,26.6,26.5,26.5,26.6,26.7,26.7,...,30.7,30.8,30.9,31.0,31.0,31.1,31.2,31.3,31.4,31.6
11,Hungary,25.8,25.7,25.6,25.6,25.6,25.5,25.5,25.4,25.4,...,29.4,29.5,29.5,29.6,29.6,29.8,29.8,29.9,29.9,30.0
22,Switzerland,27.8,27.8,27.7,27.7,27.5,27.5,27.9,27.8,27.7,...,31.5,31.6,31.8,31.8,31.9,31.9,32.0,32.1,32.2,32.3
5,Czech Republic,25.7,25.8,25.5,25.2,25.0,24.9,24.8,24.8,24.9,...,29.8,29.9,29.9,30.0,30.0,30.0,30.1,30.2,30.2,30.4


In [694]:
#df_19_1.to_csv('../data/Cleaned/age_of_mothers_at_childbirth_oecd.csv', index=False)

In [695]:
#df_19_1.to_sql('age_of_mothers_at_childbirth_oecd', engine, if_exists='replace', index=False)

In [696]:
df_19_2 = pd.read_csv('../data/Raw/OECD/SF_2_3_Age_mothers_childbirth_fertility_by_age_1960_S2.csv')
#fertility_per_1000_from 1960
df_19_2.head()

Unnamed: 0,Country,Age group,1960,1961,1962,1963,1964,1965,1966,1967,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Australia,15-19,443,474,447,459,470,475,489,484,...,161,146,129,120,105,103,95,88,79,71
1,Australia,20-24,2201,2258,2160,2082,1905,1793,1731,1708,...,532,513,474,473,447,431,428,401,377,388
2,Australia,25-29,2163,2212,2167,2112,1981,1885,1839,1850,...,1026,991,948,934,922,897,893,843,803,867
3,Australia,30-34,1275,1311,1277,1239,1191,1101,1051,1028,...,1269,1248,1204,1217,1236,1191,1201,1156,1114,1206
4,Australia,35-39,623,634,614,597,584,530,506,478,...,715,709,692,698,720,713,716,693,663,709


In [697]:
df_info = pd.DataFrame({
    'dtype': df_19_2.dtypes,
    'null_count': df_19_2.isnull().sum(),
    'unique_count': df_19_2.nunique()
})
print(df_info)

            dtype  null_count  unique_count
Country    object           0            21
Age group  object           0             7
1960       object           0           136
1961       object           0           140
1962       object           0           140
...           ...         ...           ...
2017       object           0           124
2018       object           0           128
2019       object           0           126
2020       object           0           121
2021       object           7           119

[64 rows x 3 columns]


In [698]:
df_19_2.columns = df_19_2.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_19_2.head()

Unnamed: 0,country,age_group,1960,1961,1962,1963,1964,1965,1966,1967,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Australia,15-19,443,474,447,459,470,475,489,484,...,161,146,129,120,105,103,95,88,79,71
1,Australia,20-24,2201,2258,2160,2082,1905,1793,1731,1708,...,532,513,474,473,447,431,428,401,377,388
2,Australia,25-29,2163,2212,2167,2112,1981,1885,1839,1850,...,1026,991,948,934,922,897,893,843,803,867
3,Australia,30-34,1275,1311,1277,1239,1191,1101,1051,1028,...,1269,1248,1204,1217,1236,1191,1201,1156,1114,1206
4,Australia,35-39,623,634,614,597,584,530,506,478,...,715,709,692,698,720,713,716,693,663,709


In [699]:
# --- Ensure "country" and "age_group" are strings
df_19_2["country"] = df_19_2["country"].astype(str).str.strip().str.title()
df_19_2["age_group"] = df_19_2["age_group"].astype(str).str.strip()

# --- Identify numeric columns: everything except country and birth order ---
num_cols = [c for c in df_19_2.columns if c not in ["country", "age_group"]]
# --- Robust cleaning -> convert to float ---
df_19_2[num_cols] = (
    df_19_2[num_cols].astype(str)
    .replace({"\xa0": "", "\u202f": "", ",": "."}, regex=True)
    .replace(r"[^\d\.\-]", "", regex=True)
    .replace(r"\.\.+", ".", regex=True)
    .replace(r"^\.$|^\s*$", np.nan, regex=True)
    .apply(pd.to_numeric, errors="coerce")
)
df_19_2[num_cols] = df_19_2[num_cols].round(2)

In [700]:
df_19_2.drop_duplicates(inplace=True)
df_19_2.dropna(inplace = True)

In [701]:
df_info = pd.DataFrame({
    'dtype': df_19_2.dtypes,
    'null_count': df_19_2.isnull().sum(),
    'unique_count': df_19_2.nunique()
})
print(df_info)

             dtype  null_count  unique_count
country     object           0            19
age_group   object           0             7
1960       float64           0           124
1961       float64           0           126
1962       float64           0           126
...            ...         ...           ...
2017       float64           0           118
2018       float64           0           121
2019       float64           0           120
2020       float64           0           115
2021       float64           0           118

[64 rows x 3 columns]


In [702]:
#df_19_2.to_csv('../data/Cleaned/fertility_per_1000_by_age_from 1960_oecd.csv', index=False)

In [703]:
#df_19_2.to_sql('fertility_per_1000_from_1960_oecd', engine, if_exists='replace', index=False)

In [704]:
df_19_3 = pd.read_csv('../data/Raw/OECD/SF_2_3_Age_mothers_childbirth_fertility_by_age_2000_S3.csv')
#fertility_per_1000_from_2000
df_19_3

Unnamed: 0,Country,Age group,2000,2001,2002,2003,2004,2005,2006,2007,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,OECD-Average,15-19,226,220,211,205,203,201,200,205,...,179,168,162,152,144,135,126,117,102,95
1,OECD-Average,20-24,717,693,668,655,647,632,629,630,...,564,538,533,519,504,488,470,450,420,405
2,OECD-Average,25-29,1079,1050,1031,1035,1034,1023,1026,1034,...,994,965,969,961,949,928,907,884,855,869
3,OECD-Average,30-34,881,872,886,911,934,946,976,1000,...,1036,1019,1040,1049,1053,1041,1033,1017,996,1036
4,OECD-Average,35-39,381,386,395,406,422,435,456,477,...,531,534,551,563,571,570,574,575,559,587
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296,Romania,25-29,782,770,786,820,848,908,923,930,...,918,883,944,989,1001,1090,1083,1091,1094,1109
297,Romania,30-34,388,381,388,388,416,475,511,542,...,666,648,715,754,785,866,859,864,871,875
298,Romania,35-39,134,138,152,194,232,251,257,249,...,273,274,299,321,330,368,367,383,406,411
299,Romania,40-44,31,31,30,29,31,31,28,31,...,49,48,56,61,68,73,78,80,85,82


In [705]:
df_info = pd.DataFrame({
    'dtype': df_19_3.dtypes,
    'null_count': df_19_3.isnull().sum(),
    'unique_count': df_19_3.nunique()
})
print(df_info)

            dtype  null_count  unique_count
Country    object           0            43
Age group  object           0             7
2000       object           0           233
2001       object           0           248
2002       object           0           240
2003       object           0           239
2004       object           0           245
2005       object           0           240
2006       object           0           239
2007       object           0           242
2008       object           0           252
2009       object           0           251
2010       object           0           239
2011       object           0           235
2012       object           0           242
2013       object           0           234
2014       object           0           238
2015       object           0           237
2016       object           0           248
2017       object           0           236
2018       object           0           245
2019       object           0   

In [706]:
df_19_3.columns = df_19_3.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_19_3.head()

Unnamed: 0,country,age_group,2000,2001,2002,2003,2004,2005,2006,2007,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,OECD-Average,15-19,226,220,211,205,203,201,200,205,...,179,168,162,152,144,135,126,117,102,95
1,OECD-Average,20-24,717,693,668,655,647,632,629,630,...,564,538,533,519,504,488,470,450,420,405
2,OECD-Average,25-29,1079,1050,1031,1035,1034,1023,1026,1034,...,994,965,969,961,949,928,907,884,855,869
3,OECD-Average,30-34,881,872,886,911,934,946,976,1000,...,1036,1019,1040,1049,1053,1041,1033,1017,996,1036
4,OECD-Average,35-39,381,386,395,406,422,435,456,477,...,531,534,551,563,571,570,574,575,559,587


In [707]:
# --- Ensure "country" and "age_group" are strings
df_19_3["country"] = df_19_3["country"].astype(str).str.strip().str.title()
df_19_3["age_group"] = df_19_3["age_group"].astype(str).str.strip()

# --- Identify numeric columns: everything except country and birth order ---
num_cols = [c for c in df_19_3.columns if c not in ["country", "age_group"]]
# --- Robust cleaning -> convert to float ---
df_19_3[num_cols] = (
    df_19_3[num_cols].astype(str)
    .replace({"\xa0": "", "\u202f": "", ",": "."}, regex=True)
    .replace(r"[^\d\.\-]", "", regex=True)
    .replace(r"\.\.+", ".", regex=True)
    .replace(r"^\.$|^\s*$", np.nan, regex=True)
    .apply(pd.to_numeric, errors="coerce")
)
df_19_3[num_cols] = df_19_3[num_cols].round(2)

In [708]:
df_19_3.drop_duplicates(inplace=True)
df_19_3.dropna(inplace=True)

In [709]:
#Check again
df_info = pd.DataFrame({
    'dtype': df_19_3.dtypes,
    'null_count': df_19_3.isnull().sum(),
    'unique_count': df_19_3.nunique()
})
print(df_info)

             dtype  null_count  unique_count
country     object           0            41
age_group   object           0             7
2000       float64           0           225
2001       float64           0           237
2002       float64           0           232
2003       float64           0           229
2004       float64           0           233
2005       float64           0           229
2006       float64           0           229
2007       float64           0           230
2008       float64           0           238
2009       float64           0           238
2010       float64           0           230
2011       float64           0           227
2012       float64           0           231
2013       float64           0           225
2014       float64           0           226
2015       float64           0           225
2016       float64           0           237
2017       float64           0           227
2018       float64           0           233
2019      

In [710]:
df_19_3.sample(10)

Unnamed: 0,country,age_group,2000,2001,2002,2003,2004,2005,2006,2007,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
134,Italy,20-24,34.2,33.7,33.1,32.3,34.4,34.1,34.6,35.3,...,33.4,31.5,30.5,28.8,27.8,26.5,25.0,23.6,22.4,20.9
137,Italy,35-39,41.4,43.4,45.2,47.8,50.6,51.5,54.2,56.4,...,61.1,59.6,59.2,59.4,59.9,60.5,59.7,60.1,59.5,62.8
106,Hungary,20-24,70.2,65.0,60.7,56.4,51.8,50.0,48.3,45.3,...,41.9,41.7,43.7,44.3,46.8,47.0,47.3,48.4,49.6,49.3
76,Estonia,45-49,0.2,0.2,0.1,0.3,0.2,0.2,0.2,0.3,...,0.5,0.4,0.6,0.6,0.5,0.9,0.8,1.1,1.1,1.2
197,Norway,20-24,67.3,62.7,59.5,58.9,59.6,58.6,60.3,60.5,...,52.6,48.4,44.8,42.3,39.6,34.7,32.6,28.8,25.5,24.3
170,Luxembourg,25-29,117.6,110.9,101.9,102.0,105.5,101.8,98.9,97.9,...,82.4,85.2,76.5,76.4,71.0,67.1,63.2,59.2,61.3,57.8
209,Poland,45-49,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,...,0.3,0.3,0.3,0.3,0.3,0.3,0.4,0.4,0.4,0.4
283,Cyprus,30-34,86.2,81.2,82.6,83.8,83.9,88.1,93.3,92.0,...,93.5,92.0,95.4,97.8,102.0,98.2,94.7,93.2,94.7,100.7
174,Luxembourg,45-49,0.3,0.3,0.3,0.4,0.3,0.3,0.2,0.4,...,0.7,0.7,1.0,1.0,1.1,1.0,1.1,1.6,1.3,1.6
20,Austria,45-49,0.1,0.3,0.3,0.3,0.2,0.3,0.3,0.3,...,0.5,0.5,0.5,0.6,0.6,0.6,0.8,0.8,0.8,0.6


In [711]:
#df_19_3.to_csv('../data/Cleaned/cleaned_fertility_per_1000_from_2000_oecd.csv',index=False)

In [712]:
#df_19_3.to_sql('fertility_per_1000_from_2000_oecd',engine, if_exists='replace', index=False)

In [713]:
df_20= pd.read_csv('../data/Raw/OECD/SF_2_4_Share_births_outside_marriage_1960.csv')
#(%)share_of_births_outside_of_marriage
df_20

Unnamed: 0,Country,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,Austria,130,126,120,116,113,112,114,115,120,...,404,415,414,417,421,422,420,413,406,412
1,Belgium,21,20,21,22,23,24,25,25,27,...,470,477,495,494,480,490,528,524,..,..
2,Czech Republic,49,46,45,47,48,50,53,53,54,...,418,434,450,467,478,486,490,485,482,485
3,Denmark,78,80,83,89,93,95,102,111,111,...,490,506,515,525,538,540,542,542,541,542
4,Finland,40,41,40,42,44,46,48,51,53,...,409,415,421,428,443,449,448,446,454,461
5,Germany,76,71,66,61,59,58,57,58,61,...,339,345,348,350,350,355,347,339,333,331
6,Greece,12,12,12,12,11,11,10,10,11,...,74,76,70,82,88,94,103,111,124,138
7,Hungary,55,55,54,53,52,52,51,50,50,...,423,445,456,473,479,467,447,439,387,304
8,Iceland,253,253,245,251,267,269,284,300,305,...,650,669,..,..,..,696,712,705,694,..
9,Ireland,16,16,18,18,20,22,23,25,26,...,339,351,353,363,366,367,376,379,384,..


In [714]:
df_info = pd.DataFrame({
    'dtype': df_20.dtypes,
    'null_count': df_20.isnull().sum(),
    'unique_count': df_20.nunique()
})
print(df_info)

          dtype  null_count  unique_count
Country  object           0            26
1960     object           0            26
1961     object           0            24
1962     object           0            24
1963     object           0            24
...         ...         ...           ...
2016     object           0            24
2017     object           0            26
2018     object           0            25
2019     object           0            25
2020     object           0            24

[62 rows x 3 columns]


In [715]:
df_20.columns = df_20.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [716]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_20.columns if c != "country"]

df_20[num_cols] = (
    df_20[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [717]:
df_20.drop_duplicates(inplace=True)
df_20.dropna(inplace=True)

df_info = pd.DataFrame({
    'dtype': df_20.dtypes,
    'null_count': df_20.isnull().sum(),
    'unique_count': df_20.nunique()
})
print(df_info)

           dtype  null_count  unique_count
country   object           0            22
1960     float64           0            22
1961     float64           0            20
1962     float64           0            21
1963     float64           0            21
...          ...         ...           ...
2016     float64           0            20
2017     float64           0            22
2018     float64           0            21
2019     float64           0            22
2020     float64           0            22

[62 rows x 3 columns]


In [718]:
df_20.sample(10)

Unnamed: 0,country,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
4,Finland,4.0,4.1,4.0,4.2,4.4,4.6,4.8,5.1,5.3,...,40.9,41.5,42.1,42.8,44.3,44.9,44.8,44.6,45.4,46.1
20,Sweden,11.3,11.7,12.4,12.6,13.1,13.8,14.5,15.1,16.0,...,54.3,54.5,54.4,54.6,54.7,54.9,54.5,54.5,54.5,55.2
16,Portugal,9.5,8.8,8.5,8.2,8.0,7.8,7.5,7.5,7.4,...,42.8,45.6,47.6,49.3,50.7,52.8,54.9,55.9,56.8,57.9
10,Italy,2.4,2.4,2.2,2.2,2.0,2.0,2.0,2.0,2.0,...,23.7,25.0,26.9,28.8,30.0,31.5,32.8,34.0,35.4,33.8
7,Hungary,5.5,5.5,5.4,5.3,5.2,5.2,5.1,5.0,5.0,...,42.3,44.5,45.6,47.3,47.9,46.7,44.7,43.9,38.7,30.4
13,Netherlands,1.4,1.4,1.5,1.6,1.7,1.8,2.0,2.1,2.0,...,45.3,46.6,47.4,48.7,49.8,50.4,51.0,51.9,52.4,53.5
23,United States,5.3,5.6,5.9,6.3,6.9,7.7,8.4,9.0,9.7,...,40.7,40.7,40.6,40.2,40.3,39.8,39.8,39.6,40.0,40.5
15,Norway,3.7,3.7,3.8,3.9,4.2,4.6,4.9,5.1,5.6,...,55.0,54.9,55.2,55.5,55.9,56.2,55.7,56.4,57.6,58.5
19,Spain,2.3,2.2,2.1,1.9,1.8,1.7,1.6,1.5,1.4,...,37.4,39.0,40.9,42.5,44.5,45.9,46.8,47.3,48.4,47.6
11,Latvia,11.9,12.3,12.4,12.3,12.8,13.3,12.6,12.3,12.1,...,44.6,45.0,44.6,44.0,41.5,40.9,40.4,39.5,38.4,39.5


In [719]:
#df_20.to_csv('../data/Cleaned/cleaned_share_of_births_outside_of_marriage_oecd.csv', index=False)

In [None]:
#df_20.to_sql('share_of_births_outside_of_marriage_oecd',engine, if_exists='replace', index=False)

22

In [721]:
df_21_1= pd.read_csv('../data/Raw/OECD/SF_3_1_Marriage_divorce_rate_mean_age_first_marriage_S1.csv')
#mean_age_first_marriage
df_21_1

Unnamed: 0,Country,Gender,1990,1991,1992,1993,1994,1995,1996,1997,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,Australia,Male,265,267,269,270,272,273,276,278,...,297,298,299,300,301,303,304,307,307,306
1,Australia,Female,243,245,247,248,251,253,257,259,...,280,281,283,284,285,287,288,292,293,292
2,Czechia,Male,243,243,245,247,251,255,259,265,...,310,312,313,314,316,317,318,319,320,324
3,Czechia,Female,216,216,219,221,224,228,231,236,...,281,283,285,287,288,290,291,292,294,297
4,Denmark,Male,305,306,310,314,318,319,325,322,...,338,343,344,344,343,347,348,349,351,353
5,Denmark,Female,278,280,283,288,292,292,299,301,...,314,318,319,319,319,322,324,325,328,330
6,Greece,Male,290,293,296,297,299,301,302,306,...,327,328,329,330,332,332,333,334,337,338
7,Greece,Female,249,252,255,255,258,260,263,266,...,294,295,297,299,301,301,303,303,307,307
8,Japan,Male,284,284,284,284,285,285,285,285,...,307,308,309,311,311,311,311,311,312,310
9,Japan,Female,259,259,260,261,262,263,264,266,...,290,292,293,294,294,294,294,294,296,294


In [722]:
df_info = pd.DataFrame({
    'datatypes': df_21_1.dtypes,
    'null_count': df_21_1.isnull().sum(),
    'unique_count': df_21_1.nunique()
})
print(df_info)

        datatypes  null_count  unique_count
Country    object           0            10
Gender     object           0             2
1990       object           0            17
1991       object           0            18
1992       object           0            18
1993       object           0            19
1994       object           0            16
1995       object           0            18
1996       object           0            19
1997       object           0            17
1998       object           0            14
1999       object           0            19
2000       object           0            18
2001       object           0            18
2002       object           0            19
2003       object           0            19
2004       object           0            16
2005       object           0            18
2006       object           0            18
2007       object           0            19
2008       object           0            18
2009       object           0   

In [723]:
df_21_1.columns = df_21_1.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [724]:
# --- Ensure "country" and "gender" are strings
df_21_1["country"] = df_21_1["country"].astype(str).str.strip().str.title()
df_21_1["gender"] = df_21_1["gender"].astype(str).str.strip()

# --- Identify numeric columns: everything except country and birth order ---
num_cols = [c for c in df_21_1.columns if c not in ["country", "gender"]]
# --- Robust cleaning -> convert to float ---
df_21_1[num_cols] = (
    df_21_1[num_cols].astype(str)
    .replace({"\xa0": "", "\u202f": "", ",": "."}, regex=True)
    .replace(r"[^\d\.\-]", "", regex=True)
    .replace(r"\.\.+", ".", regex=True)
    .replace(r"^\.$|^\s*$", np.nan, regex=True)
    .apply(pd.to_numeric, errors="coerce")
)
df_21_1[num_cols] = df_21_1[num_cols].round(2)

In [725]:
df_21_1.drop_duplicates(inplace=True)
df_21_1.dropna(inplace=True)

df_info = pd.DataFrame({
    'datatypes': df_21_1.dtypes,
    'null_count': df_21_1.isnull().sum(),
    'unique_count': df_21_1.nunique()
})
print(df_info)

        datatypes  null_count  unique_count
country    object           0             9
gender     object           0             2
1990      float64           0            15
1991      float64           0            16
1992      float64           0            16
1993      float64           0            17
1994      float64           0            15
1995      float64           0            16
1996      float64           0            17
1997      float64           0            15
1998      float64           0            13
1999      float64           0            17
2000      float64           0            16
2001      float64           0            16
2002      float64           0            17
2003      float64           0            17
2004      float64           0            15
2005      float64           0            17
2006      float64           0            17
2007      float64           0            17
2008      float64           0            16
2009      float64           0   

In [None]:
#df_21_1.to_csv('../data/Cleaned/cleaned_mean_age_first_marriage_oecd.csv',index=False)

In [None]:
#df_21_1.to_sql('mean_age_first_marriage_oecd', engine, if_exists='replace', index= False)

18

In [729]:
df_21_2 = pd.read_csv('../data/Raw/OECD/SF_3_1_Marriage_divorce_rates_S2.csv')
#divorce_rates_per_1000_oecd
df_21_2

Unnamed: 0,Country,1970,1971,1972,1973,1974,1975,1976,1977,1978,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,Austria,14,13,13,13,14,14,15,15,16,...,19,19,19,18,18,18.0,18.0,17,16,15
1,Belgium,07,7,8,9,10,11,13,13,14,...,22,22,22,21,20,20.0,20.0,18,19,17
2,Czechia,22,24,23,25,25,26,25,25,26,...,27,25,25,24,24,23.0,23.0,20,20,19
3,Denmark,19,27,26,25,26,26,26,26,26,...,34,34,29,30,26,26.0,18.0,27,22,21
4,Estonia,32,32,33,32,33,34,36,39,38,...,25,24,26,25,25,24.0,21.0,19,,19
5,Finland,13,16,18,19,21,20,21,21,22,...,25,25,25,25,24,24.0,24.0,24,22,20
6,Germany,13,14,15,16,18,19,20,15,10,...,21,21,20,20,19,18.0,18.0,17,17,16
7,Greece,04,4,4,5,4,4,4,5,5,...,15,13,14,10,18,,,,,
8,Hungary,22,23,23,24,23,25,26,26,27,...,20,20,21,20,19,17.0,18.0,15,19,18
9,Italy,..,3,6,3,3,2,2,2,2,...,9,9,14,16,15,15.0,14.0,11,14,14


In [730]:
df_info = pd.DataFrame({
    'datatypes': df_21_2.dtypes,
    'null_count': df_21_2.isnull().sum(),
    'unique_count': df_21_2.nunique()
})
print(df_info)

        datatypes  null_count  unique_count
Country    object           0            28
1970       object           0            18
1971       object           0            19
1972       object           0            19
1973       object           0            18
1974       object           0            18
1975       object           0            19
1976       object           0            18
1977       object           0            18
1978       object           0            18
1979       object           0            15
1980       object           0            18
1981       object           0            20
1982       object           0            22
1983       object           0            24
1984       object           0            20
1985       object           0            19
1986       object           0            20
1987       object           0            20
1988       object           0            20
1989       object           0            19
1990       object           0   

In [731]:
df_21_2.columns = df_21_2.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [732]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_21_2.columns if c != "country"]

df_21_2[num_cols] = (
    df_21_2[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [733]:
df_21_2.drop_duplicates(inplace=True)
df_21_2.dropna(inplace=True)

df_info = pd.DataFrame({
    'datatypes': df_21_2.dtypes,
    'null_count': df_21_2.isnull().sum(),
    'unique_count': df_21_2.nunique()
})
print(df_info)

        datatypes  null_count  unique_count
country    object           0            23
1970      float64           0            15
1971      float64           0            17
1972      float64           0            15
1973      float64           0            14
1974      float64           0            15
1975      float64           0            16
1976      float64           0            14
1977      float64           0            13
1978      float64           0            15
1979      float64           0            12
1980      float64           0            14
1981      float64           0            17
1982      float64           0            17
1983      float64           0            19
1984      float64           0            16
1985      float64           0            15
1986      float64           0            16
1987      float64           0            16
1988      float64           0            15
1989      float64           0            15
1990      float64           0   

In [734]:
df_21_2.head(8)

Unnamed: 0,country,1970,1971,1972,1973,1974,1975,1976,1977,1978,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,Austria,1.4,1.3,1.3,1.3,1.4,1.4,1.5,1.5,1.6,...,1.9,1.9,1.9,1.8,1.8,1.8,1.8,1.7,1.6,1.5
1,Belgium,0.7,0.7,0.8,0.9,1.0,1.1,1.3,1.3,1.4,...,2.2,2.2,2.2,2.1,2.0,2.0,2.0,1.8,1.9,1.7
2,Czechia,2.2,2.4,2.3,2.5,2.5,2.6,2.5,2.5,2.6,...,2.7,2.5,2.5,2.4,2.4,2.3,2.3,2.0,2.0,1.9
3,Denmark,1.9,2.7,2.6,2.5,2.6,2.6,2.6,2.6,2.6,...,3.4,3.4,2.9,3.0,2.6,2.6,1.8,2.7,2.2,2.1
5,Finland,1.3,1.6,1.8,1.9,2.1,2.0,2.1,2.1,2.2,...,2.5,2.5,2.5,2.5,2.4,2.4,2.4,2.4,2.2,2.0
6,Germany,1.3,1.4,1.5,1.6,1.8,1.9,2.0,1.5,1.0,...,2.1,2.1,2.0,2.0,1.9,1.8,1.8,1.7,1.7,1.6
8,Hungary,2.2,2.3,2.3,2.4,2.3,2.5,2.6,2.6,2.7,...,2.0,2.0,2.1,2.0,1.9,1.7,1.8,1.5,1.9,1.8
10,Japan,0.9,1.0,1.0,1.0,1.0,1.1,1.1,1.1,1.2,...,1.8,1.8,1.8,1.7,1.7,1.7,1.7,1.6,1.47,1.52


In [735]:
#df_21_2.to_csv('../data/Cleaned/cleaned_divorce_rates_per_1000_oecd.csv', index=False)

In [None]:
#df_21_2.to_sql('divorce_rates_per_1000_oecd',engine, if_exists= 'replace' , index=False)

23

In [737]:
df_21_3= pd.read_csv('../data/Raw/OECD/SF_3_1_Marriage_divorce_rates_prev_marital_status_S3.csv')
#share_of_previous_marital_status
df_21_3

Unnamed: 0,Country,Previous marital status,2000,2001,2002,2003,2004,2005,2006,2008,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Australia,Single never married,759,761,755,756,762,769,773,782,...,796,797,800,805,805,801,803,801,803,807
1,Australia,Divorced,220,218,224,223,218,213,209,202,...,190,188,186,182,181,185,183,185,183,180
2,Australia,Widowed,21,21,21,21,19,18,18,17,...,15,15,14,13,14,14,14,14,13,13
3,Austria,Single never married,766,747,741,737,729,731,739,748,...,755,757,767,771,775,777,781,781,782,780
4,Austria,Divorced,222,242,247,252,259,257,249,242,...,235,234,223,220,215,215,209,210,210,216
5,Austria,Widowed,12,11,12,11,12,12,11,10,...,10,9,10,9,10,8,9,9,8,4
6,Czechia,Single never married,749,745,743,740,739,742,745,726,...,740,740,752,756,766,767,764,764,761,759
7,Czechia,Divorced,237,242,244,247,247,245,244,261,...,249,249,238,234,224,223,226,226,229,230
8,Czechia,Widowed,14,13,13,13,14,12,11,13,...,12,11,10,10,10,10,10,10,10,11
9,Denmark,Single never married,759,760,762,764,760,756,756,763,...,772,760,750,762,761,769,764,771,776,783


In [738]:
df_info = pd.DataFrame({
    'datatypes': df_21_3.dtypes,
    'null_count': df_21_3.isnull().sum(),
    'unique_count': df_21_3.nunique()
})
print(df_info)

                        datatypes  null_count  unique_count
Country                    object           0            20
Previous marital status    object           0             3
2000                       object           0            47
2001                       object           0            51
2002                       object           0            56
2003                       object           0            50
2004                       object           0            50
2005                       object           0            52
2006                       object           0            49
2008                       object           0            47
2009                       object           0            50
2010                       object           0            49
2011                       object           0            49
2012                       object           0            53
2013                       object           0            49
2014                       object       

In [739]:
df_21_3.columns = df_21_3.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_21_3.head()

Unnamed: 0,country,previous_marital_status,2000,2001,2002,2003,2004,2005,2006,2008,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Australia,Single never married,759,761,755,756,762,769,773,782,...,796,797,800,805,805,801,803,801,803,807
1,Australia,Divorced,220,218,224,223,218,213,209,202,...,190,188,186,182,181,185,183,185,183,180
2,Australia,Widowed,21,21,21,21,19,18,18,17,...,15,15,14,13,14,14,14,14,13,13
3,Austria,Single never married,766,747,741,737,729,731,739,748,...,755,757,767,771,775,777,781,781,782,780
4,Austria,Divorced,222,242,247,252,259,257,249,242,...,235,234,223,220,215,215,209,210,210,216


In [740]:
# --- Ensure "country" and "previous_marital_status" are strings
df_21_3["country"] = df_21_3["country"].astype(str).str.strip().str.title()
df_21_3["previous_marital_status"] = df_21_3["previous_marital_status"].astype(str).str.strip()

# --- Identify numeric columns: everything except country and birth order ---
num_cols = [c for c in df_21_3.columns if c not in ["country", "previous_marital_status"]]
# --- Robust cleaning -> convert to float ---
df_21_3[num_cols] = (
    df_21_3[num_cols].astype(str)
    .replace({"\xa0": "", "\u202f": "", ",": "."}, regex=True)
    .replace(r"[^\d\.\-]", "", regex=True)
    .replace(r"\.\.+", ".", regex=True)
    .replace(r"^\.$|^\s*$", np.nan, regex=True)
    .apply(pd.to_numeric, errors="coerce")
)
df_21_3[num_cols] = df_21_3[num_cols].round(2)

In [741]:
df_21_3.drop_duplicates(inplace=True)
df_21_3.dropna(inplace=True)

df_info = pd.DataFrame({
    'datatypes': df_21_3.dtypes,
    'null_count': df_21_3.isnull().sum(),
    'unique_count': df_21_3.nunique()
})
print(df_info)

                        datatypes  null_count  unique_count
country                    object           0            20
previous_marital_status    object           0             3
2000                      float64           0            47
2001                      float64           0            51
2002                      float64           0            56
2003                      float64           0            50
2004                      float64           0            50
2005                      float64           0            52
2006                      float64           0            49
2008                      float64           0            47
2009                      float64           0            50
2010                      float64           0            49
2011                      float64           0            49
2012                      float64           0            53
2013                      float64           0            49
2014                      float64       

In [742]:
#df_21_3.to_csv('../data/Cleaned/cleaned_share_of_previous_marital_status_oecd', index=False)

In [None]:
#df_21_3.to_sql('share_of_previous_marital_status_oecd', engine, if_exists= 'replace', index =  False)

60

In [None]:
df_22_1 = pd.read_csv('../data/Raw/OECD/SF3_3_A_in_private_households_by_partnership_status_S1.csv')
df_22_1

In [None]:
df_22_2 = pd.read_csv('../data/Raw/OECD/SF3_3_B_ by level of educational attainment_S2.csv')
df_22_2

In [None]:
df_6666 = pd.read_csv('../data/Raw/OECD/OECD_df_famliy_selected.csv')
df_6666

In [None]:
df_888= pd.read_csv('../data/Raw/OECD/Households-by-type,-presence-of-children-and-country,-2015-2024.csv')
df_888

In [None]:
df_999 = pd.read_csv('../data/Raw/OECD/Households-with-children-by-number-of-children,-2024.csv')
df_999