In [2547]:
import pandas as pd
import os, re
from pathlib import Path
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from sqlalchemy import create_engine, types
from sqlalchemy import text 
from openpyxl import load_workbook

In [2548]:
from dotenv import dotenv_values

config = dotenv_values()

# define variables for the login
pg_user = config['POSTGRES_USER']  # align the key label with your .env file !
pg_host = config['POSTGRES_HOST']
pg_port = config['POSTGRES_PORT']
pg_db = config['POSTGRES_DB']
pg_schema = config['POSTGRES_SCHEMA']
pg_pass = config['POSTGRES_PASS']

In [2549]:
url = f'postgresql://{pg_user}:{pg_pass}@{pg_host}:{pg_port}/{pg_db}'
engine = create_engine(url, echo=False)

In [2550]:
my_schema = 'team_5' # update it to your schema

with engine.begin() as conn: 
    result = conn.execute(text(f'SET search_path TO {my_schema};'))

In [2551]:
df_1= pd.read_csv('../data/Raw/World_Marriage_Dataset.csv')

In [2552]:
df_1.drop(columns=["Sr.No."], inplace=True)

In [2553]:
df_1.columns = df_1.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [2554]:
df_1.rename(columns={
    "agegroup": "age_group",
    "maritalstatus": "marital_status",
    "dataprocess": "data_process",
    "Data Collection (Start Year)": "data_collection_start_year",
    "Data Collection (End Year)": "data_collection_end_year",
    "Data Source": "data_source",
    "Country": "country",
    "Sex": "sex"
}, inplace=True)

In [2555]:
df_1.drop_duplicates(inplace=True)

df_1['data_collection_start_year'] = df_1['data_collection_start_year'].astype(str).str.replace(',', '').astype(int)
df_1['data_collection_end_year'] = df_1['data_collection_end_year'].astype(str).str.replace(',', '').astype(int)

In [2556]:
df_1.isnull().sum()

country                       0
age_group                     0
sex                           0
marital_status                0
data_process                  0
data_collection_start_year    0
data_collection_end_year      0
data_source                   0
dtype: int64

In [2557]:
#df_1.to_csv("cleaned_world_marriage.csv", index=False)

In [2558]:
#df_1.to_sql('world_marriage', engine, if_exists='replace', index=False)

In [2559]:
df_2 = pd.read_csv('../data/Raw/age-at-marriage-women.csv')

In [2560]:
df_2.columns = df_2.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [2561]:
df_2 = df_2.drop(columns=['1005564annotations'])

df_2.rename(columns={
    "entity": "country",   
}, inplace=True)

In [2562]:
df_2.drop_duplicates(inplace=True)


In [2563]:
df_2['year'] = df_2['year'].astype(str).str.replace(',', '').astype(int)

In [2564]:
df_2.isnull().sum()

country                                0
code                                   0
year                                   0
mean_age_of_women_at_first_marriage    0
dtype: int64

In [2565]:
#df_2.to_csv("cleaned_age_at_marriage_women.csv", index=False)

In [2566]:
#df_2.to_sql('age_at_marriage_women', engine, if_exists='replace', index=False)

In [2567]:
df_3= pd.read_csv('../data/Raw/marriage-rate-per-1000-inhabitants.csv')

In [2568]:
df_3.columns = df_3.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [2569]:
df_3.rename(columns={
    "entity": "country",   
}, inplace=True)

In [2570]:
df_3['year'] = df_3['year'].astype(str).str.replace(',', '').astype(int)

In [2571]:
df_3.drop_duplicates(inplace=True)


In [2572]:
df_3.isnull().sum()

country                                          0
code                                             0
year                                             0
crude_marriage_rate_marriages_per_1000_people    0
dtype: int64

In [2573]:
#df_3.to_csv("cleaned_marriage-rate-per-1000-inhabitants.csv", index=False)

In [2574]:
#df_3.to_sql('married_rate_per_1000', engine, if_exists='replace', index=False)

In [2575]:
df_4= pd.read_csv('../data/Raw/marriage-rates-in-1990-vs-2020.csv')

In [2576]:
df_4.columns = df_4.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [2577]:
df_4 = df_4.drop(columns=['worldregionsaccordingtoowid'])

df_4.rename(columns={
    "crudemarriageratemarriagesper1000people": "crude_marriage_rate",
    "crudemarriageratemarriagesper1000people1": "crude_marriage_rate_people1",
    "year1": "year_1",
    "entity": "country"
}, inplace=True)

In [2578]:
df_4.drop_duplicates(inplace=True)
df_4.dropna(inplace=True)

In [2579]:
df_4['year_1'] = pd.to_numeric(df_4['year_1'], errors='coerce').astype('Int64')

In [2580]:
df_4.isnull().sum()

country                        0
code                           0
year                           0
crude_marriage_rate            0
crude_marriage_rate_people1    0
year_1                         0
dtype: int64

In [2581]:
#df_4.to_csv("cleaned_marriage-rates-in-1990-vs-2020.csv", index=False)

In [2582]:
#df_4.to_sql('marriage_rates_in_1990_vs_2020', engine, if_exists='replace', index=False)

In [2583]:
df_5 = pd.read_csv('../data/Raw/share-of-births-outside-marriage.csv')

In [2584]:
df_5.columns = df_5.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [2585]:

df_5.rename(columns={
    "shareofbirthsoutsideofmarriageofallbirths": "share_of_births_outside_of_marriage",
    "entity": "country"
}, inplace=True)

df_5.drop_duplicates(inplace=True)

In [2586]:
df_5.isnull().sum()

country                                0
code                                   0
year                                   0
share_of_births_outside_of_marriage    0
dtype: int64

In [2587]:
#df_5.to_csv("cleaned_share-of-births-outside-marriage.csv", index=False)

In [2588]:
#df_5.to_sql('share_of_births_outside_marriage', engine, if_exists='replace', index=False)

In [2589]:
df_6 = pd.read_csv('../data/Raw/share-of-men-in-england-and-wales-who-have-ever-married-by-age.csv')

In [2590]:
df_6.columns = df_6.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

df_6.drop_duplicates(inplace=True)
df_6.sample(5)

Unnamed: 0,entity,code,year,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort
66,Women,,49,85.2,91.9,95.7,87.3,76.0,,,
8,Men,,25,39.6,41.6,59.7,40.6,19.4,8.5,5.0,
36,Women,,19,2.1,5.3,13.4,12.1,3.8,1.4,0.4,0.2
18,Men,,35,84.8,86.7,89.1,73.7,56.3,44.2,,
39,Women,,22,24.3,39.3,57.5,40.6,18.2,7.1,2.9,0.9


In [2591]:
df_6 = df_6.drop(columns=['code','proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort'])

df_6.rename(columns={
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort": "1900_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort": "1920_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort": "1940_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort": "1960_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort": "1970_birthcohort",
    "entity": "sex"
}, inplace=True)

In [2592]:
df_6.isnull().sum()

sex                 0
year                0
1900_birthcohort    0
1920_birthcohort    0
1940_birthcohort    0
1960_birthcohort    0
1970_birthcohort    0
dtype: int64

In [2593]:
#df_6.to_csv("cleaned_share-of-men-in-england-and-wales-who-have-ever-married-by-age.csv", index=False)

In [2594]:
#df_6.to_sql('men_in_england_and_wales_married_by_age', engine, if_exists='replace', index=False)

In [2595]:
df_7 = pd.read_csv('../data/Raw/share-of-births-outside-marriage.csv')

In [2596]:
df_7.columns = df_7.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [2597]:
df_7.rename(columns={
    "shareofsingleparenthouseholds": "share_of_single_parent_households",
    "entity": "country"
}, inplace=True)

df_7.drop_duplicates(inplace=True)
df_7.sample(5)

Unnamed: 0,country,code,year,shareofbirthsoutsideofmarriageofallbirths
1537,Poland,POL,1975,4.7
857,Hungary,HUN,2014,47.3
2002,Switzerland,CHE,2017,25.2
225,Bulgaria,BGR,2019,58.4
1906,Sweden,SWE,1982,42.0


In [2598]:
df_7.isnull().sum()

country                                      0
code                                         0
year                                         0
shareofbirthsoutsideofmarriageofallbirths    0
dtype: int64

In [2599]:
#df_7.to_csv("cleaned_share-of-single-parent-households.csv", index=False)

In [2600]:
#df_7.to_sql('single_parent_households', engine, if_exists='replace', index=False)

In [2601]:
df_8 = pd.read_csv('../data/Raw/share-of-women-in-england-and-wales-who-have-ever-married-by-age.csv')

In [2602]:
df_8.columns = df_8.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [2603]:
df_8['code'] = df_8['code'].fillna('GBR')
df_8.sample(5)

Unnamed: 0,entity,code,year,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort
19,Men,GBR,36,86.1,87.6,89.7,74.8,58.3,46.7,,
40,Women,GBR,23,32.7,49.5,68.2,48.4,24.0,10.0,4.7,1.6
21,Men,GBR,38,88.1,89.0,90.5,76.6,61.3,50.9,,
22,Men,GBR,39,88.8,89.5,90.8,77.4,62.6,52.5,,
30,Men,GBR,47,92.3,91.6,92.2,81.0,69.4,,,


In [2604]:
df_8 = df_8.drop(columns=['code','proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort'])

df_8.rename(columns={
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort": "1900_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort": "1920_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort": "1940_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort": "1960_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort": "1970_birthcohort",
    "entity": "sex"
}, inplace=True)

df_8.drop_duplicates(inplace=True)
df_8.sample(5)

Unnamed: 0,sex,year,1900_birthcohort,1920_birthcohort,1940_birthcohort,1960_birthcohort,1970_birthcohort
13,Men,30,72.9,76.4,83.3,63.9,41.4
17,Men,34,83.2,85.5,88.5,72.4,53.7
61,Women,44,84.0,91.3,95.4,86.5,74.1
22,Men,39,88.8,89.5,90.8,77.4,62.6
53,Women,36,79.9,88.9,94.2,83.5,68.2


In [2605]:
df_8.isnull().sum()

sex                 0
year                0
1900_birthcohort    0
1920_birthcohort    0
1940_birthcohort    0
1960_birthcohort    0
1970_birthcohort    0
dtype: int64

In [2606]:
#df_8.to_csv("cleaned_share-of-women-in-england-and-wales-who-have-ever-married-by-age.csv", index=False)

In [2607]:
#df_8.to_sql('women_in_england_and_wales_married_by_age', engine, if_exists='replace', index=False)

In [2608]:
#pip install openpyxl pywin32

In [2609]:
df_excel_1 = pd.read_excel('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx')

In [2610]:
#all_sheets = pd.read_excel('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx', sheet_name=None)

In [2611]:
xls_1 = pd.ExcelFile('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx')
print(xls_1.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'MARITAL_STATUS_BY_AGE', 'CURRENTLY MARRIED', 'EVER_MARRIED', 'SMAM']


In [2612]:
excel_1 = '../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx'

# Output directory (make sure it exists)
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)

# List of sheets you want to extract
sheets_to_extract = ['MARITAL_STATUS_BY_AGE', 'CURRENTLY MARRIED', 'EVER_MARRIED', 'SMAM']

In [2613]:
"""for sheet in sheets_to_extract:
    # Read just this sheet into a DataFrame
    df_excel_1 = pd.read_excel(excel_1, sheet_name=sheet)
    
    # Optional: Clean the filename (replace spaces with underscores, etc.)
    csv_name = sheet.replace(' ', '_').lower() + '.csv'
    csv_path = os.path.join(output_dir, csv_name)
    
    # Save the DataFrame as CSV
    df_excel_1.to_csv(csv_path, index=False)
    print(f"Saved: {csv_path}")
"""

'for sheet in sheets_to_extract:\n    # Read just this sheet into a DataFrame\n    df_excel_1 = pd.read_excel(excel_1, sheet_name=sheet)\n    \n    # Optional: Clean the filename (replace spaces with underscores, etc.)\n    csv_name = sheet.replace(\' \', \'_\').lower() + \'.csv\'\n    csv_path = os.path.join(output_dir, csv_name)\n    \n    # Save the DataFrame as CSV\n    df_excel_1.to_csv(csv_path, index=False)\n    print(f"Saved: {csv_path}")\n'

In [2614]:
xls_2 = pd.ExcelFile('../data/Raw/undesa_pd_2019_world_fertility_dataset.xlsx')
print(xls_2.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'FERTILITY INDICATORS']


In [2615]:
excel_2 = '../data/Raw/undesa_pd_2019_world_fertility_dataset.xlsx'
sheet_name = 'FERTILITY INDICATORS'
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)

df_excel_2 = pd.read_excel(excel_2, sheet_name=sheet_name)


In [2616]:
"""csv_name = sheet_name.replace(' ', '_').lower() + '.csv'
csv_path = os.path.join(output_dir, csv_name)
df_excel_2.to_csv(csv_path, index=False)
print(f"Saved: {csv_path}")
"""

'csv_name = sheet_name.replace(\' \', \'_\').lower() + \'.csv\'\ncsv_path = os.path.join(output_dir, csv_name)\ndf_excel_2.to_csv(csv_path, index=False)\nprint(f"Saved: {csv_path}")\n'

In [2617]:
xls_3 = pd.ExcelFile('../data/Raw/undesa_pd_ds_1970-2030_fp_rev-2024_rev.xlsx')
print(xls_3.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'Countries', 'Regions']


In [2618]:
excel_3 = '../data/Raw/undesa_pd_ds_1970-2030_fp_rev-2024_rev.xlsx'
sheets_to_extract = ['Countries', 'Regions']
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)


In [2619]:
"""
for sheet in sheets_to_extract:
    df = pd.read_excel(excel_3, sheet_name=sheet)
    csv_name = sheet.replace(' ', '_').lower() + '.csv'
    csv_path = os.path.join(output_dir, csv_name)
    df.to_csv(csv_path, index=False)
    print(f"Saved: {csv_path}")

"""

'\nfor sheet in sheets_to_extract:\n    df = pd.read_excel(excel_3, sheet_name=sheet)\n    csv_name = sheet.replace(\' \', \'_\').lower() + \'.csv\'\n    csv_path = os.path.join(output_dir, csv_name)\n    df.to_csv(csv_path, index=False)\n    print(f"Saved: {csv_path}")\n\n'

In [2620]:
df_9 = pd.read_csv('../data/Raw/unpopulation_dataportal_20250728095844.csv')
df_9.sample(5)

Unnamed: 0,IndicatorId,IndicatorName,IndicatorShortName,Source,SourceYear,Author,LocationId,Location,Iso2,Iso3,...,AgeStart,AgeEnd,Age,CategoryId,Category,EstimateTypeId,EstimateType,EstimateMethodId,EstimateMethod,Value
23183,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,796,Turks and Caicos Islands,TC,TCA,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,3,Projection,58.98
1968,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,64,Bhutan,BT,BTN,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,63.28
5662,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,196,Cyprus,CY,CYP,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,54.39
7237,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,242,Fiji,FJ,FJI,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,59.86
19141,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,662,Saint Lucia,LC,LCA,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,3,Projection,53.2


In [2621]:
df_9.columns = df_9.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_9.sample(5)

Unnamed: 0,indicatorid,indicatorname,indicatorshortname,source,sourceyear,author,locationid,location,iso2,iso3,...,agestart,ageend,age,categoryid,category,estimatetypeid,estimatetype,estimatemethodid,estimatemethod,value
5993,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,208,Denmark,DK,DNK,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,64.56
14508,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,498,Republic of Moldova,MD,MDA,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,68.07
21729,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,756,Switzerland,CH,CHE,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,64.3
1868,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,60,Bermuda,BM,BMU,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,53.19
22150,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,764,Thailand,TH,THA,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,57.03


In [2622]:
df_9 = df_9.drop(columns=['indicatorid','indicatorshortname',
    'source',
    'author', 'locationid', 'iso2','estimatetypeid','category','categoryid','agestart','ageend','author','ageid', 'estimatetype','variantid','sexid','timeid'])

df_9.rename(columns={
    "sourceyear": "year",
    "location": "country",
    "estimatemethodid": "estimate_method",
    "iso3": "code",
}, inplace=True)



In [2623]:
df_9.drop_duplicates(inplace=True)

In [2624]:
df_9

Unnamed: 0,indicatorname,year,country,code,time,variant,sex,age,estimate_method,estimatemethod,value
0,Currently married (Percent),2024,Afghanistan,AFG,1970,Median,Female,15-49,2,Interpolation,80.94
2,Currently married (Percent),2024,Afghanistan,AFG,1971,Median,Female,15-49,2,Interpolation,80.90
4,Currently married (Percent),2024,Afghanistan,AFG,1972,Median,Female,15-49,2,Interpolation,80.87
6,Currently married (Percent),2024,Afghanistan,AFG,1973,Median,Female,15-49,2,Interpolation,80.84
8,Currently married (Percent),2024,Afghanistan,AFG,1974,Median,Female,15-49,2,Interpolation,80.53
...,...,...,...,...,...,...,...,...,...,...,...
25078,Currently married (Percent),2024,Zambia,ZMB,2021,Median,Female,15-49,3,Projection,54.31
25080,Currently married (Percent),2024,Zambia,ZMB,2022,Median,Female,15-49,3,Projection,53.82
25082,Currently married (Percent),2024,Zambia,ZMB,2023,Median,Female,15-49,3,Projection,53.35
25084,Currently married (Percent),2024,Zambia,ZMB,2024,Median,Female,15-49,3,Projection,52.91


In [2625]:
df_9.isnull().sum()

indicatorname      0
year               0
country            0
code               0
time               0
variant            0
sex                0
age                0
estimate_method    0
estimatemethod     0
value              0
dtype: int64

In [2626]:
#df_9.to_csv("cleaned_unpopulation_dataportal.csv", index=False)

In [2627]:
#df_9.to_sql('unpopulation_dataportal', engine, if_exists='replace', index=False)

In [2628]:
df_10 = pd.read_csv('../data/processed/countries_un.csv',  header=5, low_memory=False)

In [2629]:
df_10.columns = (
    df_10.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
)
df_10.sample(10)

Unnamed: 0,countryorarea,isocode,indicator,year,agegroup,percentage,number,dataprocess
96537,Norway,578,Married or in-union women,2049,20-24,21.322194,32.743214,Projection
85492,Montenegro,499,Married or in-union women,2045,35-39,76.708728,12.095432,Projection
30426,Cook Islands,184,Married or in-union women,2047,25-29,55.504606,0.186773,Projection
35255,Dominica,212,Married or in-union women,2002,15-49,48.085905,8.146714,Estimate
144885,Yemen,887,Married or in-union women,2017,40-44,86.61,598.55175,Estimate
26348,"China, Taiwan Province of China",158,Married or in-union women,2023,35-39,66.374773,547.507914,Projection
85099,Montenegro,499,Married or in-union women,1996,30-34,81.705864,19.537915,Estimate
73648,Liberia,430,Married or in-union women,2023,15-19,10.997371,33.880094,Projection
82742,Mauritius,480,Married or in-union women,2025,45-49,78.609482,37.629573,Projection
85272,Montenegro,499,Married or in-union women,2018,15-19,1.670223,0.323171,Estimate


In [2630]:
df_10.rename(columns={
    "dataprocess": "data_process",
}, inplace=True)

df_10.drop_duplicates(inplace=True)
df_10.sample(5)

Unnamed: 0,countryorarea,isocode,indicator,year,agegroup,percentage,number,data_process
118281,Slovakia,703,Married or in-union women,2013,20-24,26.370004,49.591166,Estimate
44768,Djibouti,262,Married or in-union women,1977,15-19,7.5,0.95295,Estimate
86702,Morocco,504,Married or in-union women,2034,45-49,74.797458,1027.60375,Projection
30152,Cook Islands,184,Married or in-union women,2013,15-19,5.756108,0.040667,Estimate
128069,Tajikistan,762,Married or in-union women,2021,40-44,85.333482,219.839531,Projection


In [2631]:
for col in ['percentage', 'number']:
    if col in df_10.columns:
        df_10[col] = (
            df_10[col]
            .astype(str)
            .str.replace(',', '.', regex=False)
            .str.extract(r'([-+]?[0-9]*\.?[0-9]+)', expand=False)
            .astype(float)
            .round(2)
        )

In [2632]:
unnamed_cols = [col for col in df_10.columns if 'unnamed' in col.lower()]
df_10.drop(columns=unnamed_cols, inplace=True)

In [2633]:
df_10.dropna(inplace=True)

In [2634]:
df_10.isnull().sum()

countryorarea    0
isocode          0
indicator        0
year             0
agegroup         0
percentage       0
number           0
data_process     0
dtype: int64

In [2635]:
df_10.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145800 entries, 0 to 145799
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   countryorarea  145800 non-null  object 
 1   isocode        145800 non-null  int64  
 2   indicator      145800 non-null  object 
 3   year           145800 non-null  int64  
 4   agegroup       145800 non-null  object 
 5   percentage     145800 non-null  float64
 6   number         145800 non-null  float64
 7   data_process   145800 non-null  object 
dtypes: float64(2), int64(2), object(4)
memory usage: 8.9+ MB


In [2636]:
#df_10.to_csv("cleaned_countries_1970_2025_un.csv", index=False)

In [2637]:
#df_10.to_sql('countries_1970_2025_un', engine, if_exists='replace', index=False)

In [2638]:
df_11 = pd.read_csv('../data/processed/currently_married_un.csv',  header=2, low_memory=False)

In [2639]:
df_11.sample(8)

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,AgeGroup,AgeStart,AgeEnd,DataValue,DataProcess,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Data,Note on Country and Population
2830,Bangladesh,50,2011,2011,Men,[45-49],45,49,98.23,Census,2011 Census,4776,Bangladesh 2011 Census,IPUMS,1.0,Data are based on a 5 per cent sample.,
22851,Indonesia,360,2002,2003,Women,[25-29],25,29,83.7,Survey,2002-2003 DHS,1689,Indonesia 2002-2003 Demographic and Health Survey,DHS_HH,,,
12427,Dominican Republic,214,2002,2002,Men,[70-74],70,74,75.79,Census,2002 Census,910,Dominican Republic 2002 Census,UNSD,1.0,,
20559,Hungary,348,1997,1997,Women,[55-59],55,59,64.05,Estimate,1997 Estimate,2120,Hungary 1997 Estimate,UNSD,,,
50968,United Kingdom,826,1983,1983,Men,[55-59],55,59,83.52,Estimate,1983 Estimate,2246,United Kingdom 1983 Estimate,UNSD,,,Excluding Channel Islands (Guernsey and Jersey...
24431,Israel,376,2008,2008,Women,[70-74],70,74,50.75,Estimate,2008 Estimate,2127,Israel 2008 Estimate,UNSD,,,Including data for East Jerusalem and Israeli ...
40718,Saint Pierre and Miquelon,666,1982,1982,Men,[20-24],20,24,15.98,Census,1982 Census,2550,Saint Pierre and Miquelon 1982 Census,UNSD,,,
6656,Canada,124,2001,2001,Women,[40-44],40,44,74.65,Census,2001 Census,1256,Canada 2001 Census,UNSD,1.0,,


In [2640]:
df_11.columns = (
    df_11.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
    )
df_11.sample(8)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,agegroup,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteondata,noteoncountryandpopulation
11325,Denmark,208,1971,1971,Men,[10-14],10,14,0.0,Estimate,1971 Estimate,2081,Denmark 1971 Estimate,UNSD,1.0,,Excluding Faeroe Islands and Greenland shown s...
15662,France,250,1976,1976,Men,[75+],75,999,61.5,Estimate,1976 Estimate,2094,France 1976 Estimate,UNSD,,,Excluding diplomatic personnel outside the cou...
18884,Guinea,324,1992,1992,Women,[15-19],15,19,48.5,Survey,1992 DHS,1738,Guinea 1992 Demographic and Health Survey,INED,1.0,,
2659,Bangladesh,50,1975,1976,Women,[30-34],30,34,90.74,Survey,1975-1976 WFS,749,Bangladesh 1975-1976 World Fertility Survey,National statistics,,,
13455,Estonia,233,2014,2014,Men,[15-19],15,19,0.08,Estimate,2014 Estimate,2089,Estonia 2014 Estimate,UNSD,,,
14689,Finland,246,1986,1986,Men,[70-74],70,74,75.04,Estimate,1986 Estimate,2093,Finland 1986 Estimate,UNSD,,,
53153,Zambia,894,1996,1997,Women,[50-54],50,54,64.7,Survey,1996 DHS,1699,Zambia 1996 Demographic and Health Survey,DHS_HH,1.0,,
17023,Germany,276,1992,1992,Men,[20-24],20,24,8.19,Estimate,1992 Estimate,2102,Germany 1992 Estimate,UNSD,,,


In [2641]:
df_11 = df_11.drop(columns = ['datacataloglongname', 'datacatalogid', 'yearstart' , 'yearend', 'noteondata', 'noteoncountryandpopulation', 'including_consensual_unions'])

df_11.rename(columns={
    "agestart": "age_start",
    "countryorarea": "country",
    "datasource": "data_source",
    "datavalue" : "data_value"
}, inplace=True)

df_11.sample(10)

Unnamed: 0,country,isocode,sex,agegroup,age_start,ageend,data_value,dataprocess,datacatalogshortname,data_source
26189,Kenya,404,Women,[40-44],40,44,84.24,Census,1999 Census,National statistics
16220,France,250,Men,[10-14],10,14,0.0,Estimate,2001 Estimate,UNSD
1756,Austria,40,Women,[35-39],35,39,82.76,Estimate,1976 Estimate,UNSD
30881,Martinique,474,Men,[40-44],40,44,64.6,Census,1982 Census,UNSD
29185,Lithuania,440,Men,[70-74],70,74,77.79,Estimate,2010 Estimate,UNSD
12941,Egypt,818,Women,[15-19],15,19,10.9,Survey,2000 DHS,DHS_HH
4639,British Virgin Islands,92,Women,[15-19],15,19,1.61,Census,1980 Census,UNSD
21291,Iceland,352,Men,[15-19],15,19,0.28,Estimate,1978 Estimate,UNSD
19322,Guyana,328,Men,[30-34],30,34,67.4,Survey,2009 DHS,DHS_HH
40092,Romania,642,Women,[15-19],15,19,6.7,Census,2011 Census,UNSD


In [2642]:
df_11.drop_duplicates(inplace=True)

In [2643]:
df_11.isnull().sum()

country                 0
isocode                 0
sex                     0
agegroup                0
age_start               0
ageend                  0
data_value              0
dataprocess             0
datacatalogshortname    0
data_source             0
dtype: int64

In [2644]:
#df_11.to_csv("cleaned_currently_married_un.csv", index=False)

In [2645]:
#df_11.to_sql('currently_married_un', engine, if_exists='replace', index=False)

In [2646]:
df_12 = pd.read_csv('../data/processed/ever_married_un.csv', header= 2, low_memory = False)
df_12.head()

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,AgeGroup,AgeStart,AgeEnd,DataValue,DataProcess,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Data,Note on Country and Population
0,Afghanistan,4,1972,1974,Men,[15-19],15,19,7.7,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
1,Afghanistan,4,1972,1974,Men,[20-24],20,24,32.6,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
2,Afghanistan,4,1972,1974,Men,[25-29],25,29,61.4,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
3,Afghanistan,4,1972,1974,Men,[30-34],30,34,83.0,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
4,Afghanistan,4,1972,1974,Men,[35-39],35,39,91.2,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,


In [2647]:
df_12.columns = (
    df_12.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
    )
df_12.sample(8)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,agegroup,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteondata,noteoncountryandpopulation
24974,Ireland,372,1994,1994,Men,[20-24],20,24,3.86,Estimate,1994 Estimate,2126,Ireland 1994 Estimate,UNSD,,,
7142,Canada,124,2001,2001,Men,[25-29],25,29,41.89,Estimate,2001 Estimate,2061,Canada 2001 Estimate,UNSD,1.0,,
20555,Guyana,328,2000,2000,Women,[20-24],20,24,60.78,Survey,2000 MICS_HH,1939,Guyana 2000 Multiple Indicator Cluster Survey,MICS_HH,1.0,,
29577,Liberia,430,1969,1970,Women,[45-49],45,49,97.9,Survey,1969-1970 PGS,714,Liberia 1969-1970 Population Growth Survey,INED,,,
4601,Brazil,76,2010,2010,Men,[15-19],15,19,4.34,Census,2010 Census,4765,Brazil 2010 Census,National statistics,1.0,,
45987,Slovakia,703,2013,2013,Women,[55-59],55,59,93.39,Estimate,2013 Estimate,2216,Slovakia 2013 Estimate,UNSD,,,
29290,Lesotho,426,1986,1986,Men,[15-19],15,19,1.59,Census,1986 Census,926,Lesotho 1986 Census,US Census Bureau,,,
12179,Denmark,208,1983,1983,Women,[70-74],70,74,90.51,Estimate,1983 Estimate,2081,Denmark 1983 Estimate,UNSD,,,Excluding Faeroe Islands and Greenland shown s...


In [2648]:
df_12 = df_12.drop(columns = ['yearstart', 'yearend', 'datacatalogshortname', 'datacatalogid', 'datacataloglongname', 'including_consensual_unions', 'noteondata', 'noteoncountryandpopulation'])

df_12.rename(columns={
    "agestart": "age_start",
    "ageend": "age_end",
    "countryorarea": "country"
}, inplace=True)
df_12.sample(8)

Unnamed: 0,country,isocode,sex,agegroup,age_start,age_end,datavalue,dataprocess,datasource
8293,Chile,152,Women,[45-49],45,49,84.31,Census,UNSD
45879,Slovakia,703,Women,[65-69],65,69,95.64,Estimate,UNSD
11879,Denmark,208,Men,[40-44],40,44,89.77,Estimate,UNSD
20612,Guyana,328,Women,[45-49],45,49,93.2,Survey,DHS_STATcompiler
8908,"China, Macao SAR",446,Women,[30-34],30,34,75.79,Census,UNSD
42235,Rwanda,646,Women,[60-64],60,64,99.88,Survey,UNSD
28738,Latvia,428,Men,[70-74],70,74,94.8,Estimate,UNSD
47539,Spain,724,Women,[50-54],50,54,90.92,Census,Eurostat


In [2649]:
df_12.dropna(inplace=True)

In [2650]:
df_12.isnull().sum()

country        0
isocode        0
sex            0
agegroup       0
age_start      0
age_end        0
datavalue      0
dataprocess    0
datasource     0
dtype: int64

In [2651]:
#df_12.to_csv("cleaned_ever_married_un.csv", index=False)

In [2652]:
#df_12.to_sql('ever_married_un', engine, if_exists= 'replace', index= False)

In [2653]:
df_13 = pd.read_csv('../data/processed/fertility_indicators_un.csv', header=6, low_memory=False)
df_13.head()

Unnamed: 0,Country or Area,Country or Area Code,Age Group,Indicator,Date,Value,Series,DataType,Data Source Type,Survey Programme,Data Source Inventory ID,Data Source Name,Data Source Name (short),Data Source Start Year,Data Source End Year,Reference,Reference Year
0,Afghanistan,4,[Total],TFR,1964.977051,7.966653,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
1,Afghanistan,4,[Total],TFR,1965.977051,8.212275,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
2,Afghanistan,4,[Total],TFR,1966.977051,8.317603,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
3,Afghanistan,4,[Total],TFR,1967.977051,8.225812,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
4,Afghanistan,4,[Total],TFR,1968.977051,8.068459,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012


In [2654]:
df_13.columns = (df_13.columns
        .str.lower()
        .str.strip()
        .str.replace(' ', '')
        .str.replace('(', '')
        .str.replace(')', '')
        .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
        )

df_13.sample(6)

Unnamed: 0,countryorarea,countryorareacode,agegroup,indicator,date,value,series,datatype,datasourcetype,surveyprogramme,datasourceinventoryid,datasourcename,datasourcenameshort,datasourcestartyear,datasourceendyear,reference,referenceyear
79435,Zimbabwe,716,[25-29],ASFR2529,1992.22998,199.0,"1994 DHS,Direct,DHS,1701-16-39167",Direct,Survey,DHS,1701,Zimbabwe 1994 Demographic and Health Survey,1994 DHS,1994,1994,DHS Statcompiler,2012
4002,Austria,40,[25-29],ASFR2529,2011.5,86.81,Eurostat.20190531,Official estimates,Estimate,Estimate,2038,All sources of estimates,Estimates,2011,2011,"Eurostat Statistics, Fertility rates by age [d...",2019
14523,China,156,[30-34],ASFR3034,1994.249268,40.17555,"Annual APC,Recent births,China's Fertility Puz...",Recent births,Survey,Annual HH survey,625,China 1994 Annual Population Change Survey (1‰...,Annual APC,1994,1994,Reexamining-China's-Fertility-Puzzle PDR,2006
62345,Serbia,688,[35-39],ASFR3539,1989.5,22.838,"Estimates,Fertility data (Adjusted),HFC-ODE,22...",Fertility data (adjusted),Estimate,Estimate,2212,All sources of estimates,Estimates,1989,1989,European Demographic Observatory (ODE). Data c...,2011
38623,Jordan,400,[40-44],ASFR4044,1978.374023,172.7455,"1990 DHS,Birth Histories (Extrapolated),DHS,17...",Extrapolated from Truncated Birth Histories,Survey,DHS,1727,Jordan 1990 Demographic and Health Survey,1990 DHS,1990,1990,DHS Statcompiler,2012
63948,Slovakia,703,[15-19],ASFR1519,1966.5,39.79,Eurostat.20190531,Official estimates,Estimate,Estimate,2216,All sources of estimates,Estimates,1966,1966,"Eurostat Statistics, Fertility rates by age [d...",2019


In [2655]:
df_13 = df_13.drop(columns=['countryorareacode','indicator','datasourceinventoryid','surveyprogramme','series','datasourcename','reference','referenceyear'])

df_13.replace({
    "agegroup": "age_group",
    "countryorarea": "country",
    "datatype": "data_type",
},inplace=True)

In [2656]:
df_13['date'] = df_13['date'].astype(int)
df_13['value'] = df_13['value'].round(2)
df_13.sample(12)

Unnamed: 0,countryorarea,agegroup,date,value,datatype,datasourcetype,datasourcenameshort,datasourcestartyear,datasourceendyear
68451,Syrian Arab Republic,[25-29],1970,371.0,Arriaga-Mortara CEB method,Census,1970 Census,1970,1970
46539,Mauritania,[25-29],2002,247.25,Birth histories,Survey,2015 MICS,2015,2015
68572,Syrian Arab Republic,[20-24],1985,283.12,Birth histories,Survey,2001 FHS,2001,2001
18530,Croatia,[40-44],2005,4.72,Official estimates,Estimate,Estimates,2005,2005
20954,Denmark,[30-34],2000,115.25,Official estimates,Estimate,Estimates,2000,2000
31782,Haiti,[Total],1992,5.74,Extrapolated from Truncated Birth Histories,Survey,2000 DHS,2000,2000
56531,Poland,[20-24],2013,48.84,Official estimates,Estimate,Estimates,2013,2013
28820,Ghana,[15-19],1962,185.67,Computed rate from reported ASFR,PES,1960 PES,1960,1960
52514,Nicaragua,[35-39],1999,67.0,Direct,Survey,2001 DHS,2001,2001
4518,Azerbaijan,[45-49],2005,0.9,Direct,Register,Register,2005,2005


In [2657]:
#df_13.to_csv("cleaned_fertility_indicators.csv", index=False)

In [2658]:
#df_13.to_sql('fertility_indicators_un',engine, if_exists='replace', index=False)

In [2659]:
df_14 = pd.read_csv('../data/processed/marital_status_by_age_un.csv', header= 2, low_memory=False)
df_14.head()

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,MaritalStatus,Non-standard_AgeGroups,Series_contains_Non-standard_AgeGroups,AgeGroup,AgeStart,...,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Age groups,Note on Marital Status,Note on Data,Note on Country and Population,Note Other
0,Afghanistan,4,1972,1974,Men,Divorced,,,[15-19],15,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
1,Afghanistan,4,1972,1974,Men,Divorced,,,[20-24],20,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
2,Afghanistan,4,1972,1974,Men,Divorced,,,[25-29],25,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
3,Afghanistan,4,1972,1974,Men,Divorced,,,[30-34],30,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
4,Afghanistan,4,1972,1974,Men,Divorced,,,[35-39],35,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,


In [2660]:
df_14.columns= (df_14.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '' , regex=True)  
    )
df_14.sample(5)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,maritalstatus,nonstandard_agegroups,series_contains_nonstandard_agegroups,agegroup,agestart,...,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteonagegroups,noteonmaritalstatus,noteondata,noteoncountryandpopulation,noteother
250752,Turkey,792,2013,2014,Women,Widowed,,,[35-39],35,...,2013 NDHS,5557,Turkey 2013 Demographic and Health Survey,DHS_HH,,,,,,
170162,Netherlands,528,1980,1980,Men,Divorced,,,[25-29],25,...,1980 Estimate,2170,Netherlands 1980 Estimate,UNSD,,,,,,
11927,Bahamas,44,1970,1970,Women,Consensual union,,,[40-44],40,...,1970 Census,1500,Bahamas 1970 Census,UNSD,,,,,,
266547,Viet Nam,704,2011,2011,Men,Divorced or Separated,,,[40-44],40,...,2011 Annual APC,5576,Viet Nam 2011 Annual Population Change Survey,National statistics,,,,,,
168526,Nepal,524,2011,2011,Women,Widowed,,,[25-29],25,...,2011 Census,4794,Nepal 2011 Census,UNSD,,,,,,


In [2661]:
df_14 = df_14.drop(columns=['datacataloglongname', 'noteondata', 'noteoncountryandpopulation','noteonagegroups', 'noteother',
                             'including_consensual_unions','isocode', 'datacatalogid', 'noteonmaritalstatus', 'series_contains_nonstandard_agegroups','nonstandard_agegroups'])

df_14.rename(columns={
    "countryorarea": "country",
    "agegroup": "age_group",
    "maritalstatus": "marital_status",
    "yearstart": "year_start",
    "yearend": "year_end",
    }, inplace =True
    )

df_14.sample(10)

Unnamed: 0,country,year_start,year_end,sex,marital_status,age_group,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datasource
87493,Georgia,2005,2005,Women,Married,[20-24],20,24,42.63,Survey,2005 MICS,MICS
243880,Thailand,2010,2010,Women,Widowed,[15-19],15,19,0.16,Census,2010 Census,UNSD
61112,Denmark,1985,1985,Men,Single,[50-54],50,54,8.72,Estimate,1985 Estimate,UNSD
36019,Canada,2011,2011,Men,Separated,[60-64],60,64,2.76,Census,2011 Census,UNSD
159863,Mexico,2015,2015,Women,Consensual union,[60-64],60,64,5.83,Survey,2015 Intercensal Survey,National statistics
41577,Chile,2012,2012,Women,Single,[30-44],30,44,38.86,Census,2012 Census,National statistics
126428,Israel,2018,2018,Women,Separated,[50-54],50,54,0.98,Census,2018 Census,National statistics
40128,Chile,1981,1981,Men,Consensual union,[60-64],60,64,2.6,Estimate,1981 Estimate,UNSD
142063,Lesotho,2014,2014,Men,Living together,[20-24],20,24,0.1,Survey,2014 DHS,DHS_STATcompiler
92930,Greenland,1970,1970,Men,Married,[50-54],50,54,77.42,Census,1970 Census,UNSD


In [2662]:
df_14.drop_duplicates(inplace=True)
df_14.isnull().sum()

country                 0
year_start              0
year_end                0
sex                     0
marital_status          0
age_group               0
agestart                0
ageend                  0
datavalue               0
dataprocess             0
datacatalogshortname    0
datasource              0
dtype: int64

In [2663]:
#df_14.to_csv("cleaned_marital_status_by_age_un.csv", index=False)

In [2664]:
#df_14.to_sql('marital_status_by_age_un', engine, if_exists='replace', index=False)

In [2665]:
df_15 = pd.read_csv('../data/processed/regions_un.csv', header=5, low_memory= False)
df_15.head(10)

Unnamed: 0,Region and subregion,ISO code,Regional Classification,Indicator,Year,AgeGroup,Percentage,Number,DataProcess
0,World,900,M49,Married or in-union women,1970,15-19,22.576683,71867.82,Estimate
1,World,900,M49,Married or in-union women,1970,20-24,63.802057,162860.4,Estimate
2,World,900,M49,Married or in-union women,1970,25-29,87.174827,182681.1,Estimate
3,World,900,M49,Married or in-union women,1970,30-34,90.825027,179121.4,Estimate
4,World,900,M49,Married or in-union women,1970,35-39,90.284386,161526.3,Estimate
5,World,900,M49,Married or in-union women,1970,40-44,86.483531,139334.4,Estimate
6,World,900,M49,Married or in-union women,1970,45-49,82.680237,116088.4,Estimate
7,World,900,M49,Married or in-union women,1970,15-49,69.379111,1013480.0,Estimate
8,World,900,M49,Married or in-union women,1971,15-19,22.630416,74127.62,Estimate
9,World,900,M49,Married or in-union women,1971,20-24,63.613178,170087.3,Estimate


In [2666]:
df_15.columns = (df_15.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(','')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
    )
df_15.sample(6)

Unnamed: 0,regionandsubregion,isocode,regionalclassification,indicator,year,agegroup,percentage,number,dataprocess
16489,Caribbean,915,SDG-M49,Married or in-union women,2006,20-24,45.931585,1533.027753,Estimate
12969,Eastern Europe,923,SDG-M49,Married or in-union women,1971,20-24,63.505485,7265.331633,Estimate
9839,Eastern Asia,906,SDG-M49,Married or in-union women,1984,15-49,66.056495,407013.040782,Estimate
25492,Low-income countries,1500,Income group,Married or in-union women,1997,35-39,86.378157,16746.752592,Estimate
16623,Caribbean,915,SDG-M49,Married or in-union women,2022,15-49,50.360002,11018.851989,Estimate
11556,Western Asia,922,M49,Married or in-union women,2037,35-39,76.902328,20026.181282,Projection


In [2667]:
df_15 = df_15.drop(columns=['regionalclassification'])

df_15.rename(columns={
    "regionandsubregion": "region",
    "isocode": "iso_code",
    "agegroup": "age_group",
    "dataprocess": "process"
}, inplace=True)

df_15.sample(10)

Unnamed: 0,region,iso_code,indicator,year,age_group,percentage,number,process
19483,Australia and New Zealand,1834,Married or in-union women,1972,40-44,80.936126,372.894988,Estimate
751,Sub-Saharan Africa,202,Married or in-union women,1982,15-49,70.010505,124980.522447,Estimate
14906,Western Europe,926,Married or in-union women,1970,25-29,72.048947,3832.089325,Estimate
1929,Northern Africa and Western Asia,747,Married or in-union women,2049,20-24,36.462166,20285.981674,Projection
26516,Lower-middle-income countries,1501,Married or in-union women,2044,35-39,87.073546,260634.714899,Projection
9594,Central Asia,5500,Married or in-union women,2035,25-29,82.332253,5313.71456,Projection
20092,Australia and New Zealand,927,Married or in-union women,2010,45-49,72.311054,690.586834,Estimate
34,World,900,Married or in-union women,1974,25-29,86.657009,206446.38766,Estimate
1777,Northern Africa and Western Asia,747,Married or in-union women,2030,20-24,38.03508,19110.149382,Projection
15422,Western Europe,926,Married or in-union women,2034,45-49,68.195152,4288.031735,Projection


In [2668]:
df_15.dropna(inplace=True)
df_15.isnull().sum()

region        0
iso_code      0
indicator     0
year          0
age_group     0
percentage    0
number        0
process       0
dtype: int64

In [2669]:
print(df_15['number'] % 1 != 0)

0        True
1        True
2        True
3        True
4        True
         ... 
28507    True
28508    True
28509    True
28510    True
28511    True
Name: number, Length: 28512, dtype: bool


In [2670]:
df_15['percentage'] = df_15['percentage'].round(2)
df_15['number'] = df_15['number'].astype(int)
df_15.head(10)

Unnamed: 0,region,iso_code,indicator,year,age_group,percentage,number,process
0,World,900,Married or in-union women,1970,15-19,22.58,71867,Estimate
1,World,900,Married or in-union women,1970,20-24,63.8,162860,Estimate
2,World,900,Married or in-union women,1970,25-29,87.17,182681,Estimate
3,World,900,Married or in-union women,1970,30-34,90.83,179121,Estimate
4,World,900,Married or in-union women,1970,35-39,90.28,161526,Estimate
5,World,900,Married or in-union women,1970,40-44,86.48,139334,Estimate
6,World,900,Married or in-union women,1970,45-49,82.68,116088,Estimate
7,World,900,Married or in-union women,1970,15-49,69.38,1013479,Estimate
8,World,900,Married or in-union women,1971,15-19,22.63,74127,Estimate
9,World,900,Married or in-union women,1971,20-24,63.61,170087,Estimate


In [2671]:
#df_15.to_csv('cleaned_regions_un.csv', index=False)



In [2672]:
#df_15.to_sql('regions_un', engine, if_exists='replace',index=False)

In [2673]:
df_16_1 = pd.read_csv('../data/Raw/OECD/SF_1_1_Family_size_and_composition - Sayfa1.csv')
df_16_1
#Data for Chart SF1.1.A. Average size of households by household type, 2024a
# avg_size_all	avg_size_couple_with_children	avg_size_single_parent_with_children		

Unnamed: 0,Country,All households,Couple households with children,Single parent households with children
0,Mexico,356,408.0,276.0
1,Costa Rica,346,437.0,344.0
2,Türkiye,320,410.0,280.0
3,Israel,319,465.0,286.0
4,Columbia,310,,
5,Slovak Republic,310,380.0,250.0
6,Chile,280,,
7,Iceland,270,412.0,261.0
8,New Zealand,261,388.0,267.0
9,Greece,260,380.0,250.0


In [2674]:
df_16_1.columns = df_16_1.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [2675]:
df_16_1.rename(columns={
        "All households": "avg_size_all",
        "Couple with children": "avg_size_couple_with_children",
        "Single parent with children": "avg_size_single_parent_with_children"
}, inplace=True)

In [2676]:
df_16_1.drop_duplicates(inplace=True)
df_16_1.dropna(inplace=True)

In [2677]:
for col in df_16_1.columns:
    if col != 'country':
        # Replace commas with dots if necessary, remove non-numeric chars, convert to float
        df_16_1[col] = (
            df_16_1[col]
            .astype(str)  # ensure string for replace
            .str.replace(',', '.', regex=False)  # decimal commas to dots
            .str.replace(r'[^\d\.\-]', '', regex=True)  # remove non-numeric chars except dot and minus
            .replace('', None)  # empty to NaN
            .astype(float)  # convert to float
        )

# Check updated dtypes
print(df_16_1.dtypes)

country                                    object
all_households                            float64
couple_households_with_children           float64
single_parent_households_with_children    float64
dtype: object


In [2678]:
info_16_1 = pd.DataFrame({
    'dtype': df_16_1.dtypes,
    'null_count': df_16_1.isnull().sum(),
    'unique_count': df_16_1.nunique()
})
print(info_16_1)

                                          dtype  null_count  unique_count
country                                  object           0            39
all_households                          float64           0            19
couple_households_with_children         float64           0            16
single_parent_households_with_children  float64           0            15


In [2679]:
df_16_1

Unnamed: 0,country,all_households,couple_households_with_children,single_parent_households_with_children
0,Mexico,3.56,4.08,2.76
1,Costa Rica,3.46,4.37,3.44
2,Türkiye,3.2,4.1,2.8
3,Israel,3.19,4.65,2.86
5,Slovak Republic,3.1,3.8,2.5
7,Iceland,2.7,4.12,2.61
8,New Zealand,2.61,3.88,2.67
9,Greece,2.6,3.8,2.5
10,Croatia,2.6,3.9,2.6
11,Australia,2.53,3.93,2.78


In [2680]:
df_16_2 = pd.read_csv('../data/Raw/OECD/SF_1_1_Family_size_and_composition - Sayfa2.csv', header=1)
df_16_2
#Table SF1.1.A. Types of household, 2021a
# share_couple_total	share_couple_with_children	share_couple_without_children	share_single_parent_total	share_single_mother	share_single_father	share_single_person	share_other_types						

Unnamed: 0,Country,Total,With children,Without children,Total.1,Single mother households,Single father households,Single person households,Other households types
0,Australia,5593,2990,2602,1037,,,2512,858
1,Austria,4893,2113,2780,563,478,085,3834,711
2,Belgium,5222,2398,2824,742,608,135,3550,486
3,Canada,5092,2530,2562,872,,,2935,1102
4,Chile,..,..,..,..,..,..,..,..
5,Columbia,..,..,..,..,..,..,..,..
6,Costa Rica,5244,3815,1429,1055,949,106,1127,2574
7,Czechia,4703,2170,2532,715,611,104,3915,667
8,Denmark,4860,2041,2819,631,511,119,3757,752
9,Estonia,4620,2546,2073,683,609,074,3699,998


In [2681]:
df_16_2.rename(columns={
    "Total": "couple_total(%)",
    "Couple with children": "couple_with_children(%)",
    "Couple without children": "couple_without_children(%)",
    "Total.1": "single_parent_total(%)",
    "Single mother households": "single_mother(%)",
    "Single father households": "single_father(%)",
    "Single person households": "single_person(%)",
    "Other types of households": "other_household_types(%)"
}, inplace=True)

In [2682]:
# Normalize column names (lowercase, underscores, remove () and non-ascii)
df_16_2.columns = (
    df_16_2.columns.str.strip().str.lower()
    .str.replace(' ', '_')
    .str.replace('[()%]', '', regex=True)
    .str.replace('[^0-9a-z_]', '', regex=True)
)

In [2683]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_16_2.columns if c != "country"]

df_16_2[num_cols] = (
    df_16_2[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [2684]:
df_16_2.drop_duplicates(inplace=True)
df_16_2.dropna(inplace=True)
df_16_2.dropna(how="all", subset=num_cols, inplace=True)

In [2685]:
info_16_2 = pd.DataFrame({
    "dtype": df_16_2.dtypes,
    "null_count": df_16_2.isna().sum(),
    "unique_count": df_16_2.nunique()
})
print(info_16_2)
print(df_16_2.dtypes)

                          dtype  null_count  unique_count
country                  object           0            36
couple_total            float64           0            36
with_children           float64           0            35
without_children        float64           0            36
single_parent_total     float64           0            34
single_mother           float64           0            32
single_father           float64           0            31
single_person           float64           0            35
other_households_types  float64           0            36
country                    object
couple_total              float64
with_children             float64
without_children          float64
single_parent_total       float64
single_mother             float64
single_father             float64
single_person             float64
other_households_types    float64
dtype: object


In [2686]:
df_16_2

Unnamed: 0,country,couple_total,with_children,without_children,single_parent_total,single_mother,single_father,single_person,other_households_types
1,Austria,48.93,21.13,27.8,5.63,4.78,0.85,38.34,7.11
2,Belgium,52.22,23.98,28.24,7.42,6.08,1.35,35.5,4.86
6,Costa Rica,52.44,38.15,14.29,10.55,9.49,1.06,11.27,25.74
7,Czechia,47.03,21.7,25.32,7.15,6.11,1.04,39.15,6.67
8,Denmark,48.6,20.41,28.19,6.31,5.11,1.19,37.57,7.52
9,Estonia,46.2,25.46,20.73,6.83,6.09,0.74,36.99,9.98
10,Finland,45.64,17.06,28.58,5.43,4.5,0.93,45.34,3.6
11,France,49.73,22.19,27.54,7.68,6.23,1.45,37.78,4.81
12,Germany,45.78,17.89,27.89,5.41,4.44,0.98,43.14,5.67
13,Greece,52.14,24.03,28.11,4.66,3.82,0.84,32.35,10.85


In [2687]:
df_16_3 = pd.read_csv('../data/Raw/OECD/SF_1_1_Family_size_and_composition - Sayfa3.csv', header=1)
df_16_3
#Table SF1.1.B. Households by number of children, 2024a
# share_hh_0_children	share_hh_1_child	share_hh_2_children	share_hh_3plus_children		

Unnamed: 0,country,0 children,1 child,2 children,3 or more children,Children under 6
0,Australia,..,..,..,..,..
1,Austria,7778,1052,857,312,944
2,Belgium,7397,1176,1015,411,1040
3,Canada,..,..,..,..,..
4,Chile,..,..,..,..,..
5,Columbia,..,..,..,..,..
6,Costa Rica,3029,2308,2461,2202,2630
7,Czechia,7195,1385,1156,264,1229
8,Denmark,7778,1054,894,274,815
9,Estonia,7576,1253,873,298,985


In [2688]:
df_16_3.rename(columns={
    "0 children": "households_0_children(%)",
    "1 child": "households_1_child(%)",
    "2 children": "households_2_children(%)"
}, inplace=True)

In [2689]:
# Normalize column names (lowercase, underscores, remove () and non-ascii)
df_16_3.columns = (
    df_16_3.columns.str.strip().str.lower()
    .str.replace(' ', '_')
    .str.replace('[()%]', '', regex=True)
    .str.replace('[^0-9a-z_]', '', regex=True)
)

In [2690]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_16_3.columns if c != "country"]

df_16_3[num_cols] = (
    df_16_3[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [2691]:
df_16_3.drop_duplicates(inplace=True)
df_16_3.dropna(inplace=True)

In [2692]:
info_16_3 = pd.DataFrame({
    "dtype": df_16_3.dtypes,
    "null_count": df_16_3.isna().sum(),
    "unique_count": df_16_3.nunique()
})
print(info_16_3)
print(df_16_3.dtypes)

                         dtype  null_count  unique_count
country                 object           0            33
households_0_children  float64           0            32
households_1_child     float64           0            32
households_2_children  float64           0            33
3_or_more_children     float64           0            31
children_under_6       float64           0            31
country                   object
households_0_children    float64
households_1_child       float64
households_2_children    float64
3_or_more_children       float64
children_under_6         float64
dtype: object


In [2693]:
df_16_3

Unnamed: 0,country,households_0_children,households_1_child,households_2_children,3_or_more_children,children_under_6
1,Austria,77.78,10.52,8.57,3.12,9.44
2,Belgium,73.97,11.76,10.15,4.11,10.4
6,Costa Rica,30.29,23.08,24.61,22.02,26.3
7,Czechia,71.95,13.85,11.56,2.64,12.29
8,Denmark,77.78,10.54,8.94,2.74,8.15
9,Estonia,75.76,12.53,8.73,2.98,9.85
10,Finland,81.98,7.89,6.99,3.14,7.14
11,France,75.36,11.43,9.23,3.99,9.86
12,Germany,79.86,9.91,7.72,2.51,8.57
13,Greece,74.31,11.83,9.97,3.89,9.37


In [2694]:
df_17_1 = pd.read_csv('../data/Raw/OECD/SF_2_1_Total_Fertility_rates_S1.csv')
df_17_1.head()

Unnamed: 0,Country,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Australia,345,355,343,334,315,297,289,285,289,...,179,179,179,174,174,167,159,170,163,150
1,Austria,269,278,280,282,279,270,266,262,258,...,146,149,153,152,148,146,144,148,141,132
2,Belgium,254,263,259,268,271,261,252,241,231,...,174,170,168,165,162,160,155,160,153,147
3,Canada,390,384,376,367,350,315,281,260,245,...,161,160,159,155,151,147,141,144,133,126
4,Chile,470,466,460,454,446,436,426,414,403,...,177,174,169,156,154,143,131,118,126,117


In [2695]:
df_info = pd.DataFrame({
    'dtype': df_17_1.dtypes,
    'null_count': df_17_1.isnull().sum(),
    'unique_count': df_17_1.nunique()
})
print(df_info)

          dtype  null_count  unique_count
Country  object           0            49
1960     object           0            47
1961     object           0            47
1962     object           0            47
1963     object           0            46
...         ...         ...           ...
2019     object           0            37
2020     object           0            39
2021     object           0            40
2022     object           0            34
2023     object           0            35

[65 rows x 3 columns]


In [2696]:
df_17_1.columns = df_17_1.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [2697]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_17_1.columns if c != "country"]

df_17_1[num_cols] = (
    df_17_1[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [2698]:
df_17_1.drop_duplicates(inplace=True)
df_17_1.dropna(inplace=True)

In [2699]:
df_info = pd.DataFrame({
    'dtype': df_17_1.dtypes,
    'null_count': df_17_1.isnull().sum(),
    'unique_count': df_17_1.nunique()
})
print(df_info)

           dtype  null_count  unique_count
country   object           0            49
1960     float64           0            47
1961     float64           0            47
1962     float64           0            47
1963     float64           0            46
...          ...         ...           ...
2019     float64           0            37
2020     float64           0            39
2021     float64           0            40
2022     float64           0            34
2023     float64           0            35

[65 rows x 3 columns]


In [2700]:
df_17_1.sample(10)

Unnamed: 0,country,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
35,United States,3.65,3.62,3.46,3.32,3.19,2.91,2.72,2.56,2.46,...,1.86,1.84,1.82,1.77,1.73,1.71,1.64,1.66,1.66,1.62
5,Colombia,6.74,6.71,6.66,6.58,6.48,6.33,6.16,5.96,5.74,...,1.82,1.77,1.72,1.72,1.72,1.71,1.69,1.68,1.66,1.65
25,New Zealand,4.24,4.31,4.19,4.05,3.8,3.54,3.41,3.35,3.34,...,1.92,1.99,1.87,1.81,1.71,1.72,1.61,1.64,1.66,1.56
48,EU-27 average,2.62,2.62,2.61,2.65,2.67,2.62,2.58,2.53,2.45,...,1.54,1.54,1.57,1.55,1.54,1.52,1.51,1.54,1.46,1.39
26,Norway,2.91,2.94,2.91,2.93,2.98,2.94,2.9,2.81,2.75,...,1.76,1.73,1.71,1.62,1.56,1.53,1.48,1.55,1.41,1.4
15,Iceland,4.27,3.88,3.98,3.98,3.86,3.71,3.58,3.28,3.07,...,1.99,1.86,1.8,1.76,1.76,1.81,1.79,1.9,1.67,1.59
21,Lithuania,2.4,2.4,2.4,2.4,2.4,2.4,2.4,2.4,2.4,...,1.57,1.63,1.63,1.57,1.53,1.43,1.36,1.36,1.27,1.18
17,Italy,2.41,2.41,2.46,2.56,2.7,2.67,2.63,2.54,2.49,...,1.38,1.36,1.36,1.34,1.31,1.27,1.24,1.25,1.24,1.2
41,Peru,6.94,6.92,6.9,6.86,6.81,6.75,6.68,6.6,6.51,...,2.27,2.23,2.19,2.15,2.12,2.09,2.06,2.03,2.0,1.98
16,Ireland,3.76,3.79,3.92,4.01,4.06,4.03,3.95,3.84,3.78,...,1.89,1.85,1.82,1.78,1.75,1.7,1.63,1.72,1.7,1.5


In [2701]:
#df_17_1.to_csv('../data/Cleaned/cleaned_total_fertility_rates_oecd.csv', index=False)

In [2702]:
#df_17_1.to_sql('total_fertility_rates_oecd', engine, if_exists='replace', index=False)

In [2703]:
df_17_2 = pd.read_csv('../data/Raw/OECD/SF_2_1_Fertility_rates_Births_by_birth_order_S2.csv')
df_17_2

Unnamed: 0,Country,Birth order,1987,1988,1989,1990,1991,1992,1993,1994,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Austria,First birth,476,478,467,462,465,461,452,445,...,480,483,473,475,471,472,477,476,484,481
1,Austria,Second birth,337,337,343,349,345,348,358,364,...,355,353,356,353,353,351,353,355,349,351
2,Austria,Third birth or higher,188,185,190,189,190,191,189,191,...,165,164,171,172,176,177,170,169,167,168
3,Belgium,First birth,468,469,473,473,481,472,469,472,...,423,435,441,436,429,426,450,440,447,455
4,Belgium,Second birth,330,329,327,328,323,328,335,330,...,351,348,345,346,345,347,342,351,343,341
5,Belgium,Third birth or higher,202,202,199,199,196,200,196,198,...,226,218,214,219,226,226,208,209,209,204
6,Czechia,First birth,467,466,474,478,501,498,485,477,...,474,481,487,487,480,478,476,464,463,463
7,Czechia,Second birth,377,376,374,372,355,358,368,369,...,375,373,367,366,372,376,376,390,386,391
8,Czechia,Third birth or higher,156,158,152,150,144,144,148,154,...,151,147,146,147,147,146,148,146,15,146
9,Estonia,First birth,435,435,440,462,495,503,496,496,...,419,423,408,402,367,388,380,372,398,397


In [2704]:
df_info = pd.DataFrame({
    'dtype': df_17_2.dtypes,
    'null_count': df_17_2.isnull().sum(),
    'unique_count': df_17_2.nunique()
})
print(df_info)

              dtype  null_count  unique_count
Country      object           0            17
Birth order  object           0             3
1987         object           0            48
1988         object           0            49
1989         object           0            48
1990         object           0            44
1991         object           0            48
1992         object           0            46
1993         object           0            47
1994         object           0            47
1995         object           0            48
1996         object           0            47
1997         object           0            49
1998         object           0            50
1999         object           0            49
2000         object           0            48
2001         object           0            50
2002         object           0            47
2003         object           0            50
2004         object           0            49
2005         object           0   

In [2705]:
df_17_2.columns = df_17_2.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [2706]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_17_1.columns if c != "country"]

df_17_2[num_cols] = (
    df_17_2[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

KeyError: "['1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986'] not in index"

In [None]:
df_17_2.drop_duplicates(inplace=True)
df_17_2.dropna(inplace=True)

In [None]:
df_17_2.sample(10)

In [None]:
df_info = pd.DataFrame({
    'dtype': df_17_2.dtypes,
    'null_count': df_17_2.isnull().sum(),
    'unique_count': df_17_2.nunique()
})
print(df_info)

In [None]:
df_18 = pd.read_csv('../data/Raw/OECD/sf1_2_wide_from_df18.csv')
df_18

In [None]:
for col in df_18.select_dtypes(include=['object']).columns:
    df_18[col] = df_18[col].astype(str).str.strip()

# 2) Define placeholders representing missing data in OECD exports
placeholders = ['..', '...', '.', ' .', '…', 'Na', 'nan', 'None']

# 3) Replace placeholders with NaN directly in df_18
df_18.replace(placeholders, pd.NA, inplace=True)

In [None]:
# 1) Ensure 'year' is integer
df_18["year"] = pd.to_numeric(df_18["year"], errors="coerce").astype("Int64")

# 2) Convert all non-key columns to numeric and round(2)
for col in df_18.columns:
    if col not in ["country", "year"]:
        df_18[col] = pd.to_numeric(df_18[col], errors="coerce").round(2)

In [None]:
# 1) Drop rows with missing key fields
df_18.dropna(subset=["country", "year"], inplace=True)

# 2) Drop duplicate country-year rows, keep the first
df_18.drop_duplicates(subset=["country", "year"], keep="first", inplace=True)

# 3) Drop rows where all value columns are NaN
value_cols = [c for c in df_18.columns if c not in ["country", "year"]]
df_18.dropna(subset=value_cols, how="all", inplace=True)

# 4) Sort and reset index
df_18.sort_values(["country", "year"], inplace=True)
df_18.reset_index(drop=True, inplace=True)


In [None]:
df_18

In [None]:
df_18['Other'] = pd.to_numeric(df_18['Other'], errors='coerce')

In [None]:
df_info = pd.DataFrame({
    'dtype': df_18.dtypes,
    'null_count': df_18.isnull().sum(),
    'unique_count': df_18.nunique()
})
print(df_info)

In [None]:
print(repr(df_18.loc[df_18['Other'].notnull(), 'Other'].unique()))

In [None]:
df_18['Other'] = pd.to_numeric(df_18['Other'], errors='coerce')

df_18.dropna(inplace=True, subset=['Other'])

df_18.isnull().sum()

In [None]:
#df_18.to_csv('../data/Cleaned/cleaned_household_children.csv', index=False)

In [None]:
#df_18.to_sql('household_children_oecd', engine, if_exists= 'replace', index= False)

In [None]:
df_19_1 =pd.read_csv('../data/Raw/OECD/SF_2_3_Age_mothers_childbirth_mean_age_birth_S1.csv')
df_19_1

In [None]:
df_19_2 = pd.read_csv('../data/Raw/OECD/SF_2_3_Age_mothers_childbirth_fertility_by_age_1960_S2.csv')
df_19_2

In [None]:
df_19_3 = pd.read_csv('../data/Raw/OECD/SF_2_3_Age_mothers_childbirth_fertility_by_age_2000_S3.csv')
df_19_3

In [None]:
df_888= pd.read_csv('../data/Raw/OECD/Households-by-type,-presence-of-children-and-country,-2015-2024.csv')
df_888

In [None]:
df_20= pd.read_csv('../data/Raw/OECD/SF_2_4_Share_births_outside_marriage_1960.csv')
df_20

In [None]:
df_21_1= pd.read_csv('../data/Raw/OECD/SF_3_1_Marriage_divorce_rate_mean_age_first_marriage_S1.csv')
df_21_1

In [None]:
df_21_2 = pd.read_csv('../data/Raw/OECD/SF_3_1_Marriage_divorce_rates_S2.csv')
df_21_2

In [None]:
df_21_3= pd.read_csv('../data/Raw/OECD/SF_3_1_Marriage_divorce_rates_prev_marital_status_S3.csv')
df_21_3

In [None]:
df_22_1 = pd.read_csv('../data/Raw/OECD/SF3_3_A_in_private_households_by_partnership_status_S1.csv')
df_22_1

In [None]:
df_22_2 = pd.read_csv('../data/Raw/OECD/SF3_3_B_ by level of educational attainment_S2.csv')
df_22_2

In [None]:
df_6666 = pd.read_csv('../data/Raw/OECD/OECD_df_famliy_selected.csv')
df_6666

In [None]:
df_999 = pd.read_csv('../data/Raw/OECD/Households-with-children-by-number-of-children,-2024.csv')
df_999