In [185]:
import pandas as pd
import os, re
from pathlib import Path
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from sqlalchemy import create_engine, types
from sqlalchemy import text 
from openpyxl import load_workbook

In [186]:
from dotenv import dotenv_values

config = dotenv_values()

# define variables for the login
pg_user = config['POSTGRES_USER']  # align the key label with your .env file !
pg_host = config['POSTGRES_HOST']
pg_port = config['POSTGRES_PORT']
pg_db = config['POSTGRES_DB']
pg_schema = config['POSTGRES_SCHEMA']
pg_pass = config['POSTGRES_PASS']

In [187]:
url = f'postgresql://{pg_user}:{pg_pass}@{pg_host}:{pg_port}/{pg_db}'
engine = create_engine(url, echo=False)

In [188]:
my_schema = 'team_5' # update it to your schema

with engine.begin() as conn: 
    result = conn.execute(text(f'SET search_path TO {my_schema};'))

In [189]:
df_1= pd.read_csv('../data/Raw/World_Marriage_Dataset.csv')

In [190]:
df_1.drop(columns=["Sr.No."], inplace=True)

In [191]:
df_1.columns = df_1.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [192]:
df_1.rename(columns={
    "agegroup": "age_group",
    "maritalstatus": "marital_status",
    "dataprocess": "data_process",
    "Data Collection (Start Year)": "data_collection_start_year",
    "Data Collection (End Year)": "data_collection_end_year",
    "Data Source": "data_source",
    "Country": "country",
    "Sex": "sex"
}, inplace=True)

In [193]:
df_1.drop_duplicates(inplace=True)

df_1['data_collection_start_year'] = df_1['data_collection_start_year'].astype(str).str.replace(',', '').astype(int)
df_1['data_collection_end_year'] = df_1['data_collection_end_year'].astype(str).str.replace(',', '').astype(int)

In [194]:
df_1.isnull().sum()

country                       0
age_group                     0
sex                           0
marital_status                0
data_process                  0
data_collection_start_year    0
data_collection_end_year      0
data_source                   0
dtype: int64

In [195]:
#df_1.to_csv("cleaned_world_marriage.csv", index=False)

In [196]:
#df_1.to_sql('world_marriage', engine, if_exists='replace', index=False)

In [197]:
df_2 = pd.read_csv('../data/Raw/age-at-marriage-women.csv')

In [198]:
df_2.columns = df_2.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [199]:
df_2 = df_2.drop(columns=['1005564annotations'])

df_2.rename(columns={
    "entity": "country",   
}, inplace=True)

In [200]:
df_2.drop_duplicates(inplace=True)


In [201]:
df_2['year'] = df_2['year'].astype(str).str.replace(',', '').astype(int)

In [202]:
df_2.isnull().sum()

country                                0
code                                   0
year                                   0
mean_age_of_women_at_first_marriage    0
dtype: int64

In [203]:
#df_2.to_csv("cleaned_age_at_marriage_women.csv", index=False)

In [204]:
#df_2.to_sql('age_at_marriage_women', engine, if_exists='replace', index=False)

In [205]:
df_3= pd.read_csv('../data/Raw/marriage-rate-per-1000-inhabitants.csv')

In [206]:
df_3.columns = df_3.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [207]:
df_3.rename(columns={
    "entity": "country",   
}, inplace=True)

In [208]:
df_3['year'] = df_3['year'].astype(str).str.replace(',', '').astype(int)

In [209]:
df_3.drop_duplicates(inplace=True)


In [210]:
df_3.isnull().sum()

country                                          0
code                                             0
year                                             0
crude_marriage_rate_marriages_per_1000_people    0
dtype: int64

In [211]:
#df_3.to_csv("cleaned_marriage-rate-per-1000-inhabitants.csv", index=False)

In [212]:
#df_3.to_sql('married_rate_per_1000', engine, if_exists='replace', index=False)

In [213]:
df_4= pd.read_csv('../data/Raw/marriage-rates-in-1990-vs-2020.csv')

In [214]:
df_4.columns = df_4.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [215]:
df_4 = df_4.drop(columns=['worldregionsaccordingtoowid'])

df_4.rename(columns={
    "crudemarriageratemarriagesper1000people": "crude_marriage_rate",
    "crudemarriageratemarriagesper1000people1": "crude_marriage_rate_people1",
    "year1": "year_1",
    "entity": "country"
}, inplace=True)

In [216]:
df_4.drop_duplicates(inplace=True)
df_4.dropna(inplace=True)

In [217]:
df_4['year_1'] = pd.to_numeric(df_4['year_1'], errors='coerce').astype('Int64')

In [218]:
df_4.isnull().sum()

country                        0
code                           0
year                           0
crude_marriage_rate            0
crude_marriage_rate_people1    0
year_1                         0
dtype: int64

In [219]:
#df_4.to_csv("cleaned_marriage-rates-in-1990-vs-2020.csv", index=False)

In [220]:
#df_4.to_sql('marriage_rates_in_1990_vs_2020', engine, if_exists='replace', index=False)

In [221]:
df_5 = pd.read_csv('../data/Raw/share-of-births-outside-marriage.csv')

In [222]:
df_5.columns = df_5.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [223]:

df_5.rename(columns={
    "shareofbirthsoutsideofmarriageofallbirths": "share_of_births_outside_of_marriage",
    "entity": "country"
}, inplace=True)

df_5.drop_duplicates(inplace=True)

In [224]:
df_5.isnull().sum()

country                                0
code                                   0
year                                   0
share_of_births_outside_of_marriage    0
dtype: int64

In [225]:
#df_5.to_csv("cleaned_share-of-births-outside-marriage.csv", index=False)

In [226]:
#df_5.to_sql('share_of_births_outside_marriage', engine, if_exists='replace', index=False)

In [227]:
df_6 = pd.read_csv('../data/Raw/share-of-men-in-england-and-wales-who-have-ever-married-by-age.csv')

In [228]:
df_6.columns = df_6.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

df_6.drop_duplicates(inplace=True)
df_6.sample(5)

Unnamed: 0,entity,code,year,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort
65,Women,,48,85.0,91.8,95.6,87.2,75.7,,,
52,Women,,35,79.0,88.3,93.8,82.8,66.7,54.2,,
4,Men,,21,6.1,7.4,13.6,11.9,3.9,1.4,0.6,0.2
23,Men,,40,89.6,89.9,91.1,78.0,64.0,53.9,,
26,Men,,43,91.1,90.8,91.7,79.5,66.7,56.6,,


In [229]:
df_6 = df_6.drop(columns=['code','proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort'])

df_6.rename(columns={
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort": "1900_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort": "1920_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort": "1940_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort": "1960_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort": "1970_birthcohort",
    "entity": "sex"
}, inplace=True)

In [230]:
df_6.isnull().sum()

sex                 0
year                0
1900_birthcohort    0
1920_birthcohort    0
1940_birthcohort    0
1960_birthcohort    0
1970_birthcohort    0
dtype: int64

In [231]:
#df_6.to_csv("cleaned_share-of-men-in-england-and-wales-who-have-ever-married-by-age.csv", index=False)

In [232]:
#df_6.to_sql('men_in_england_and_wales_married_by_age', engine, if_exists='replace', index=False)

In [233]:
df_7 = pd.read_csv('../data/Raw/share-of-births-outside-marriage.csv')

In [234]:
df_7.columns = df_7.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [235]:
df_7.rename(columns={
    "shareofsingleparenthouseholds": "share_of_single_parent_households",
    "entity": "country"
}, inplace=True)

df_7.drop_duplicates(inplace=True)
df_7.sample(5)

Unnamed: 0,country,code,year,shareofbirthsoutsideofmarriageofallbirths
1854,Spain,ESP,1991,10.0
1496,Norway,NOR,1981,16.1
1799,South Korea,KOR,1998,1.0
1700,Slovakia,SVK,2000,18.3
387,Cyprus,CYP,1963,0.1


In [236]:
df_7.isnull().sum()

country                                      0
code                                         0
year                                         0
shareofbirthsoutsideofmarriageofallbirths    0
dtype: int64

In [237]:
#df_7.to_csv("cleaned_share-of-single-parent-households.csv", index=False)

In [238]:
#df_7.to_sql('single_parent_households', engine, if_exists='replace', index=False)

In [239]:
df_8 = pd.read_csv('../data/Raw/share-of-women-in-england-and-wales-who-have-ever-married-by-age.csv')

In [240]:
df_8.columns = df_8.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [241]:
df_8['code'] = df_8['code'].fillna('GBR')
df_8.sample(5)

Unnamed: 0,entity,code,year,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort
51,Women,GBR,34,78.0,87.7,93.4,82.0,64.9,51.7,,
57,Women,GBR,40,82.4,90.5,95.0,85.3,72.0,61.6,,
3,Men,GBR,20,2.4,2.2,6.0,6.2,1.9,0.7,0.3,0.1
30,Men,GBR,47,92.3,91.6,92.2,81.0,69.4,,,
38,Women,GBR,21,14.6,26.1,42.2,31.5,12.7,4.8,1.7,0.6


In [242]:
df_8 = df_8.drop(columns=['code','proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort'])

df_8.rename(columns={
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort": "1900_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort": "1920_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort": "1940_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort": "1960_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort": "1970_birthcohort",
    "entity": "sex"
}, inplace=True)

df_8.drop_duplicates(inplace=True)
df_8.sample(5)

Unnamed: 0,sex,year,1900_birthcohort,1920_birthcohort,1940_birthcohort,1960_birthcohort,1970_birthcohort
21,Men,38,88.1,89.0,90.5,76.6,61.3
13,Men,30,72.9,76.4,83.3,63.9,41.4
50,Women,33,76.8,86.8,93.0,81.1,63.1
24,Men,41,90.2,90.3,91.4,78.6,65.0
2,Men,19,0.8,0.6,2.0,2.5,0.7


In [243]:
df_8.isnull().sum()

sex                 0
year                0
1900_birthcohort    0
1920_birthcohort    0
1940_birthcohort    0
1960_birthcohort    0
1970_birthcohort    0
dtype: int64

In [244]:
#df_8.to_csv("cleaned_share-of-women-in-england-and-wales-who-have-ever-married-by-age.csv", index=False)

In [245]:
#df_8.to_sql('women_in_england_and_wales_married_by_age', engine, if_exists='replace', index=False)

In [246]:
#pip install openpyxl pywin32

In [247]:
df_excel_1 = pd.read_excel('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx')

In [248]:
#all_sheets = pd.read_excel('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx', sheet_name=None)

In [249]:
xls_1 = pd.ExcelFile('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx')
print(xls_1.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'MARITAL_STATUS_BY_AGE', 'CURRENTLY MARRIED', 'EVER_MARRIED', 'SMAM']


In [250]:
excel_1 = '../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx'

# Output directory (make sure it exists)
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)

# List of sheets you want to extract
sheets_to_extract = ['MARITAL_STATUS_BY_AGE', 'CURRENTLY MARRIED', 'EVER_MARRIED', 'SMAM']

In [251]:
"""for sheet in sheets_to_extract:
    # Read just this sheet into a DataFrame
    df_excel_1 = pd.read_excel(excel_1, sheet_name=sheet)
    
    # Optional: Clean the filename (replace spaces with underscores, etc.)
    csv_name = sheet.replace(' ', '_').lower() + '.csv'
    csv_path = os.path.join(output_dir, csv_name)
    
    # Save the DataFrame as CSV
    df_excel_1.to_csv(csv_path, index=False)
    print(f"Saved: {csv_path}")
"""

'for sheet in sheets_to_extract:\n    # Read just this sheet into a DataFrame\n    df_excel_1 = pd.read_excel(excel_1, sheet_name=sheet)\n    \n    # Optional: Clean the filename (replace spaces with underscores, etc.)\n    csv_name = sheet.replace(\' \', \'_\').lower() + \'.csv\'\n    csv_path = os.path.join(output_dir, csv_name)\n    \n    # Save the DataFrame as CSV\n    df_excel_1.to_csv(csv_path, index=False)\n    print(f"Saved: {csv_path}")\n'

In [252]:
xls_2 = pd.ExcelFile('../data/Raw/undesa_pd_2019_world_fertility_dataset.xlsx')
print(xls_2.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'FERTILITY INDICATORS']


In [253]:
excel_2 = '../data/Raw/undesa_pd_2019_world_fertility_dataset.xlsx'
sheet_name = 'FERTILITY INDICATORS'
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)

df_excel_2 = pd.read_excel(excel_2, sheet_name=sheet_name)


In [254]:
"""csv_name = sheet_name.replace(' ', '_').lower() + '.csv'
csv_path = os.path.join(output_dir, csv_name)
df_excel_2.to_csv(csv_path, index=False)
print(f"Saved: {csv_path}")
"""

'csv_name = sheet_name.replace(\' \', \'_\').lower() + \'.csv\'\ncsv_path = os.path.join(output_dir, csv_name)\ndf_excel_2.to_csv(csv_path, index=False)\nprint(f"Saved: {csv_path}")\n'

In [255]:
xls_3 = pd.ExcelFile('../data/Raw/undesa_pd_ds_1970-2030_fp_rev-2024_rev.xlsx')
print(xls_3.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'Countries', 'Regions']


In [256]:
excel_3 = '../data/Raw/undesa_pd_ds_1970-2030_fp_rev-2024_rev.xlsx'
sheets_to_extract = ['Countries', 'Regions']
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)


In [257]:
"""
for sheet in sheets_to_extract:
    df = pd.read_excel(excel_3, sheet_name=sheet)
    csv_name = sheet.replace(' ', '_').lower() + '.csv'
    csv_path = os.path.join(output_dir, csv_name)
    df.to_csv(csv_path, index=False)
    print(f"Saved: {csv_path}")

"""

'\nfor sheet in sheets_to_extract:\n    df = pd.read_excel(excel_3, sheet_name=sheet)\n    csv_name = sheet.replace(\' \', \'_\').lower() + \'.csv\'\n    csv_path = os.path.join(output_dir, csv_name)\n    df.to_csv(csv_path, index=False)\n    print(f"Saved: {csv_path}")\n\n'

In [258]:
df_9 = pd.read_csv('../data/Raw/unpopulation_dataportal_20250728095844.csv')
df_9.sample(5)

Unnamed: 0,IndicatorId,IndicatorName,IndicatorShortName,Source,SourceYear,Author,LocationId,Location,Iso2,Iso3,...,AgeStart,AgeEnd,Age,CategoryId,Category,EstimateTypeId,EstimateType,EstimateMethodId,EstimateMethod,Value
670,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,28,Antigua and Barbuda,AG,ATG,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,3,Projection,30.23
18810,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,646,Rwanda,RW,RWA,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,3,Projection,49.95
8215,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,275,State of Palestine,PS,PSE,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,66.3
8376,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,276,Germany,DE,DEU,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,55.83
8123,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,270,Gambia,GM,GMB,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,74.5


In [259]:
df_9.columns = df_9.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_9.sample(5)

Unnamed: 0,indicatorid,indicatorname,indicatorshortname,source,sourceyear,author,locationid,location,iso2,iso3,...,agestart,ageend,age,categoryid,category,estimatetypeid,estimatetype,estimatemethodid,estimatemethod,value
3698,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,132,Cabo Verde,CV,CPV,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,42.48
19086,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,662,Saint Lucia,LC,LCA,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,52.74
17795,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,616,Poland,PL,POL,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,58.55
3283,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,112,Belarus,BY,BLR,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,67.39
11033,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,388,Jamaica,JM,JAM,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,42.25


In [260]:
df_9 = df_9.drop(columns=['indicatorid','indicatorshortname',
    'source',
    'author', 'locationid', 'iso2','estimatetypeid','category','categoryid','agestart','ageend','author','ageid', 'estimatetype','variantid','sexid','timeid'])

df_9.rename(columns={
    "sourceyear": "year",
    "location": "country",
    "estimatemethodid": "estimate_method",
    "iso3": "code",
}, inplace=True)



In [261]:
df_9.drop_duplicates(inplace=True)

In [262]:
df_9

Unnamed: 0,indicatorname,year,country,code,time,variant,sex,age,estimate_method,estimatemethod,value
0,Currently married (Percent),2024,Afghanistan,AFG,1970,Median,Female,15-49,2,Interpolation,80.94
2,Currently married (Percent),2024,Afghanistan,AFG,1971,Median,Female,15-49,2,Interpolation,80.90
4,Currently married (Percent),2024,Afghanistan,AFG,1972,Median,Female,15-49,2,Interpolation,80.87
6,Currently married (Percent),2024,Afghanistan,AFG,1973,Median,Female,15-49,2,Interpolation,80.84
8,Currently married (Percent),2024,Afghanistan,AFG,1974,Median,Female,15-49,2,Interpolation,80.53
...,...,...,...,...,...,...,...,...,...,...,...
25078,Currently married (Percent),2024,Zambia,ZMB,2021,Median,Female,15-49,3,Projection,54.31
25080,Currently married (Percent),2024,Zambia,ZMB,2022,Median,Female,15-49,3,Projection,53.82
25082,Currently married (Percent),2024,Zambia,ZMB,2023,Median,Female,15-49,3,Projection,53.35
25084,Currently married (Percent),2024,Zambia,ZMB,2024,Median,Female,15-49,3,Projection,52.91


In [263]:
df_9.isnull().sum()

indicatorname      0
year               0
country            0
code               0
time               0
variant            0
sex                0
age                0
estimate_method    0
estimatemethod     0
value              0
dtype: int64

In [264]:
#df_9.to_csv("cleaned_unpopulation_dataportal.csv", index=False)

In [265]:
#df_9.to_sql('unpopulation_dataportal', engine, if_exists='replace', index=False)

In [266]:
df_10 = pd.read_csv('../data/processed/countries_un.csv',  header=5, low_memory=False)

In [267]:
df_10.columns = (
    df_10.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
)
df_10.sample(10)

Unnamed: 0,countryorarea,isocode,indicator,year,agegroup,percentage,number,dataprocess
48373,Germany,276,Married or in-union women,2022,40-44,72.75584,1894.041498,Estimate
7229,Bahrain,48,Married or in-union women,1982,40-44,84.2,5.249028,Estimate
93276,Vanuatu,548,Married or in-union women,2046,35-39,85.134438,13.671314,Projection
98596,Palau,585,Married or in-union women,1982,35-39,78.134751,0.235186,Estimate
144199,Samoa,882,Married or in-union women,2012,15-49,58.611854,26.485524,Estimate
57191,"China, Hong Kong SAR",344,Married or in-union women,1990,15-49,55.245682,859.365919,Estimate
39792,Estonia,233,Married or in-union women,2003,15-19,4.780909,2.375634,Estimate
117856,Singapore,702,Married or in-union women,2041,15-19,0.070627,0.086635,Projection
50783,Greece,300,Married or in-union women,1999,15-49,56.762136,1507.567697,Estimate
92461,New Caledonia,540,Married or in-union women,2025,40-44,69.820164,7.385577,Projection


In [268]:
df_10.rename(columns={
    "dataprocess": "data_process",
}, inplace=True)

df_10.drop_duplicates(inplace=True)
df_10.sample(5)

Unnamed: 0,countryorarea,isocode,indicator,year,agegroup,percentage,number,data_process
121394,Zimbabwe,716,Married or in-union women,1997,25-29,80.898571,362.127489,Estimate
75146,Liechtenstein,438,Married or in-union women,2048,25-29,60.378268,0.73903,Projection
123951,Western Sahara,732,Married or in-union women,1992,15-49,76.868074,35.74058,Projection
99842,Panama,591,Married or in-union women,1976,25-29,71.89,47.051286,Estimate
132096,United Arab Emirates,784,Married or in-union women,2039,15-19,4.980983,12.96383,Projection


In [269]:
for col in ['percentage', 'number']:
    if col in df_10.columns:
        df_10[col] = (
            df_10[col]
            .astype(str)
            .str.replace(',', '.', regex=False)
            .str.extract(r'([-+]?[0-9]*\.?[0-9]+)', expand=False)
            .astype(float)
            .round(2)
        )

In [270]:
unnamed_cols = [col for col in df_10.columns if 'unnamed' in col.lower()]
df_10.drop(columns=unnamed_cols, inplace=True)

In [271]:
df_10.dropna(inplace=True)

In [272]:
df_10.isnull().sum()

countryorarea    0
isocode          0
indicator        0
year             0
agegroup         0
percentage       0
number           0
data_process     0
dtype: int64

In [273]:
df_10.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145800 entries, 0 to 145799
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   countryorarea  145800 non-null  object 
 1   isocode        145800 non-null  int64  
 2   indicator      145800 non-null  object 
 3   year           145800 non-null  int64  
 4   agegroup       145800 non-null  object 
 5   percentage     145800 non-null  float64
 6   number         145800 non-null  float64
 7   data_process   145800 non-null  object 
dtypes: float64(2), int64(2), object(4)
memory usage: 8.9+ MB


In [274]:
#df_10.to_csv("cleaned_countries_1970_2025_un.csv", index=False)

In [275]:
#df_10.to_sql('countries_1970_2025_un', engine, if_exists='replace', index=False)

In [276]:
df_11 = pd.read_csv('../data/processed/currently_married_un.csv',  header=2, low_memory=False)

In [277]:
df_11.sample(8)

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,AgeGroup,AgeStart,AgeEnd,DataValue,DataProcess,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Data,Note on Country and Population
3153,Belgium,56,1991,1991,Women,[35-39],35,39,80.99,Estimate,1991 Estimate,2044,Belgium 1991 Estimate,UNSD,,,
33742,Netherlands,528,1991,1991,Women,[75+],75,999,21.89,Estimate,1991 Estimate,2170,Netherlands 1991 Estimate,UNSD,,,
2484,Bahamas,44,2000,2000,Women,[70-74],70,74,31.27,Census,2000 Census,415,Bahamas 2000 Census,UNSD,1.0,,
46325,Sweden,752,1972,1972,Women,[15-19],15,19,1.58,Estimate,1972 Estimate,2227,Sweden 1972 Estimate,UNSD,,,
50977,United Kingdom,826,1983,1983,Women,[30-34],30,34,81.23,Estimate,1983 Estimate,2246,United Kingdom 1983 Estimate,UNSD,,,Excluding Channel Islands (Guernsey and Jersey...
2251,Azerbaijan,31,2014,2014,Women,[15-19],15,19,8.95,Estimate,2014 Estimate,2035,Azerbaijan 2014 Estimate,UNSD,,,
17356,Germany,276,2016,2016,Women,[30-34],30,34,48.16,Estimate,2016 Estimate,2102,Germany 2016 Estimate,UNSD,1.0,,
13110,Egypt,818,2017,2017,Women,[25-29],25,29,80.74,Census,2017 Census,5743,Egypt 2017 Census,National statistics,,,


In [278]:
df_11.columns = (
    df_11.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
    )
df_11.sample(8)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,agegroup,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteondata,noteoncountryandpopulation
36953,Pakistan,586,1998,1998,Men,[40-44],40,44,92.25,Census,1998 Census,250,Pakistan 1998 Census,UNSD,,,
11387,Denmark,208,1973,1973,Men,[40-44],40,44,82.72,Estimate,1973 Estimate,2081,Denmark 1973 Estimate,UNSD,1.0,,Excluding Faeroe Islands and Greenland shown s...
39843,Romania,642,2003,2003,Men,[25-29],25,29,43.13,Estimate,2003 Estimate,2199,Romania 2003 Estimate,UNSD,,,
10669,Czechia,203,2004,2004,Women,[50-54],50,54,72.01,Estimate,2004 Estimate,2079,Czech Republic 2004 Estimate,UNSD,,,
16851,Georgia,268,1989,1989,Women,[65+],65,999,27.46,Census,1989 Census,1226,Georgia 1989 Census,US Census Bureau,,,
32294,Morocco,504,1997,1997,Women,[40-44],40,44,88.8,Survey,1997 ENSME,204,Morocco 1997 Enquête nationale sur la santé de...,INED,,,
36054,Norway,578,1991,1991,Men,[25-29],25,29,28.45,Estimate,1991 Estimate,2180,Norway 1991 Estimate,UNSD,,,
11439,Denmark,208,1991,1991,Men,[20-24],20,24,25.72,Census,1991 Census,1527,Denmark 1991 Census,UNSD,1.0,,Excluding Faeroe Islands and Greenland shown s...


In [279]:
df_11 = df_11.drop(columns = ['datacataloglongname', 'datacatalogid', 'yearstart' , 'yearend', 'noteondata', 'noteoncountryandpopulation', 'including_consensual_unions'])

df_11.rename(columns={
    "agestart": "age_start",
    "countryorarea": "country",
    "datasource": "data_source",
    "datavalue" : "data_value"
}, inplace=True)

df_11.sample(10)

Unnamed: 0,country,isocode,sex,agegroup,age_start,ageend,data_value,dataprocess,datacatalogshortname,data_source
53570,Zimbabwe,716,Women,[25-29],25,29,74.09,Survey,2009 MICS_HH,MICS_HH
15176,Finland,246,Men,[25-29],25,29,18.78,Census,2010 Census,UNSD
3879,Bermuda,60,Women,[35-39],35,39,55.43,Census,2016 Census,UNSD
38756,Portugal,620,Men,[30-34],30,34,80.48,Census,1991 Census,UNSD
24742,Italy,380,Men,[45-49],45,49,85.88,Estimate,1999 Estimate,UNSD
25750,Jordan,400,Men,[20-24],20,24,9.52,Census,2004 Census,UNSD
43150,Seychelles,690,Women,[30-34],30,34,69.39,Census,1987 Census,UNSD
1634,Austria,40,Men,[55-59],55,59,86.54,Estimate,1972 Estimate,UNSD
31228,Mauritius,480,Women,[55-59],55,59,63.71,Census,2011 Census,UNSD
46446,Sweden,752,Women,[60-64],60,64,66.22,Estimate,1975 Estimate,UNSD


In [280]:
df_11.drop_duplicates(inplace=True)

In [281]:
df_11.isnull().sum()

country                 0
isocode                 0
sex                     0
agegroup                0
age_start               0
ageend                  0
data_value              0
dataprocess             0
datacatalogshortname    0
data_source             0
dtype: int64

In [282]:
#df_11.to_csv("cleaned_currently_married_un.csv", index=False)

In [283]:
#df_11.to_sql('currently_married_un', engine, if_exists='replace', index=False)

In [284]:
df_12 = pd.read_csv('../data/processed/ever_married_un.csv', header= 2, low_memory = False)
df_12.head()

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,AgeGroup,AgeStart,AgeEnd,DataValue,DataProcess,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Data,Note on Country and Population
0,Afghanistan,4,1972,1974,Men,[15-19],15,19,7.7,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
1,Afghanistan,4,1972,1974,Men,[20-24],20,24,32.6,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
2,Afghanistan,4,1972,1974,Men,[25-29],25,29,61.4,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
3,Afghanistan,4,1972,1974,Men,[30-34],30,34,83.0,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
4,Afghanistan,4,1972,1974,Men,[35-39],35,39,91.2,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,


In [285]:
df_12.columns = (
    df_12.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
    )
df_12.sample(8)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,agegroup,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteondata,noteoncountryandpopulation
53337,United Kingdom,826,1992,1992,Women,[35-39],35,39,89.54,Estimate,1992 Estimate,2246,United Kingdom 1992 Estimate,UNSD,,,Excluding Channel Islands (Guernsey and Jersey...
42676,Saint Lucia,662,1980,1980,Men,[40-44],40,44,57.86,Census,1980 Census,1474,Saint Lucia 1980 Census,UNSD,,Data have not been adjusted for underenumeration.,
37814,Norway,578,1982,1982,Men,[55-59],55,59,88.59,Estimate,1982 Estimate,2180,Norway 1982 Estimate,UNSD,,,
32041,Mali,466,1987,1987,Men,[35-39],35,39,91.32,Census,1987 Census,1324,Mali 1987 Census,UNSD,1.0,,
53236,United Kingdom,826,1990,1990,Men,[20-24],20,24,13.9,Estimate,1990 Estimate,2246,United Kingdom 1990 Estimate,UNSD,,,Excluding Channel Islands (Guernsey and Jersey...
20699,Haiti,332,1971,1971,Women,[65-69],65,69,68.08,Census,1971 Census,1656,Haiti 1971 Census,UNSD,1.0,,
28719,Latvia,428,2000,2000,Women,[40-44],40,44,89.27,Estimate,2000 Estimate,2142,Latvia 2000 Estimate,UNSD,,,
17420,French Guiana,254,2006,2006,Women,[10-14],10,14,0.0,Census,2006 Census,2362,French Guiana 2006 Census,UNSD,,,


In [286]:
df_12 = df_12.drop(columns = ['yearstart', 'yearend', 'datacatalogshortname', 'datacatalogid', 'datacataloglongname', 'including_consensual_unions', 'noteondata', 'noteoncountryandpopulation'])

df_12.rename(columns={
    "agestart": "age_start",
    "ageend": "age_end",
    "countryorarea": "country"
}, inplace=True)
df_12.sample(8)

Unnamed: 0,country,isocode,sex,agegroup,age_start,age_end,datavalue,dataprocess,datasource
18545,Ghana,288,Women,[25-29],25,29,77.2,Survey,DHS_HH
11404,Czechia,203,Men,[35-39],35,39,65.89,Estimate,UNSD
9363,Comoros,174,Women,[15-19],15,19,16.51,Census,UNSD
53569,United Republic of Tanzania,834,Women,[50-54],50,54,98.43,Census,UNSD
25013,Ireland,372,Men,[75+],75,999,76.64,Census,UNSD
17610,Gabon,266,Women,[45-49],45,49,90.2,Survey,DHS_HH
52869,Ukraine,804,Women,[20-24],20,24,40.68,Survey,MICS
13231,Dominican Republic,214,Men,[30-34],30,34,75.98,Census,UNSD


In [287]:
df_12.dropna(inplace=True)

In [288]:
df_12.isnull().sum()

country        0
isocode        0
sex            0
agegroup       0
age_start      0
age_end        0
datavalue      0
dataprocess    0
datasource     0
dtype: int64

In [289]:
#df_12.to_csv("cleaned_ever_married_un.csv", index=False)

In [290]:
#df_12.to_sql('ever_married_un', engine, if_exists= 'replace', index= False)

In [291]:
df_13 = pd.read_csv('../data/processed/fertility_indicators_un.csv', header=6, low_memory=False)
df_13.head()

Unnamed: 0,Country or Area,Country or Area Code,Age Group,Indicator,Date,Value,Series,DataType,Data Source Type,Survey Programme,Data Source Inventory ID,Data Source Name,Data Source Name (short),Data Source Start Year,Data Source End Year,Reference,Reference Year
0,Afghanistan,4,[Total],TFR,1964.977051,7.966653,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
1,Afghanistan,4,[Total],TFR,1965.977051,8.212275,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
2,Afghanistan,4,[Total],TFR,1966.977051,8.317603,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
3,Afghanistan,4,[Total],TFR,1967.977051,8.225812,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
4,Afghanistan,4,[Total],TFR,1968.977051,8.068459,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012


In [292]:
df_13.columns = (df_13.columns
        .str.lower()
        .str.strip()
        .str.replace(' ', '')
        .str.replace('(', '')
        .str.replace(')', '')
        .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
        )

df_13.sample(6)

Unnamed: 0,countryorarea,countryorareacode,agegroup,indicator,date,value,series,datatype,datasourcetype,surveyprogramme,datasourceinventoryid,datasourcename,datasourcenameshort,datasourcestartyear,datasourceendyear,reference,referenceyear
485,Albania,8,[40-44],ASFR4044,1972.5,64.456,"Estimates,Fertility data (Adjusted),HFC-ODE,20...",Fertility data (adjusted),Estimate,Estimate,2029,All sources of estimates,Estimates,1972,1972.0,European Demographic Observatory (ODE). Data c...,2011
48503,Morocco,504,[Total],TFR,1979.665771,6.47678,"1992 DHS,Birth Histories (Extrapolated),DHS,17...",Extrapolated from Truncated Birth Histories,Survey,DHS,1797,Morocco 1992 Demographic and Health Survey,1992 DHS,1992,1992.0,DHS Statcompiler,2012
60657,Rwanda,646,[Total],TFR,2011.5,5.34,"Estimates,Direct,DYB,2201-16-39441",Direct,Estimate,Estimate,2201,All sources of estimates,Estimates,2011,2011.0,Demographic Yearbook,2013
64656,Slovenia,705,[15-19],ASFR1519,2007.5,5.12,Eurostat.20190531,Official estimates,Estimate,Estimate,2218,All sources of estimates,Estimates,2007,2007.0,"Eurostat Statistics, Fertility rates by age [d...",2019
6203,Bangladesh,50,[45-49],ASFR4549,2009.5,4.0,"SVRS,Direct,SRVS-2012,767-16-40609",Direct,SRS,SVRS,767,Bangladesh Sample Vital Registration System,SVRS,1980,,Sample Vital Registration Report 2012 of Bangl...,2014
59281,Romania,642,[30-34],ASFR3034,2012.5,66.63,Eurostat.20190531,Official estimates,Estimate,Estimate,2199,All sources of estimates,Estimates,2012,2012.0,"Eurostat Statistics, Fertility rates by age [d...",2019


In [293]:
df_13 = df_13.drop(columns=['countryorareacode','indicator','datasourceinventoryid','surveyprogramme','series','datasourcename','reference','referenceyear'])

df_13.replace({
    "agegroup": "age_group",
    "countryorarea": "country",
    "datatype": "data_type",
},inplace=True)

In [294]:
df_13['date'] = df_13['date'].astype(int)
df_13['value'] = df_13['value'].round(2)
df_13.sample(12)

Unnamed: 0,countryorarea,agegroup,date,value,datatype,datasourcetype,datasourcenameshort,datasourcestartyear,datasourceendyear
76739,Venezuela (Bolivarian Republic of),[Total],1967,6.47,Computed rate from DYB,Register,Register,1967,1967
16304,Colombia,[25-29],1987,153.0,Direct,Survey,2005 DHS,2004,2005
56206,Philippines,[35-39],2002,95.0,Direct,Survey,2003 DHS,2003,2003
67185,Suriname,[30-34],1980,159.97,Computed rate from DYB,Register,Register,1980,1980
18845,Curaçao,[Total],2002,2.26,Direct,Estimate,Estimates,2002,2002
42769,Liberia,[Total],2006,28.93,Direct,Survey,2009 MIS,2008,2009
51740,New Caledonia,[35-39],2003,59.0,Direct,Register,Register,2003,2003
62714,Seychelles,[35-39],1994,49.0,Direct,Register,Register,1994,1994
56868,Portugal,[20-24],1975,140.11,Birth histories,Survey,1979-1980 WFS,1979,1980
73053,Ukraine,[20-24],2010,98.0,Direct,Survey,2012 MICS,2012,2012


In [295]:
#df_13.to_csv("cleaned_fertility_indicators.csv", index=False)

In [296]:
#df_13.to_sql('fertility_indicators_un',engine, if_exists='replace', index=False)

In [297]:
df_14 = pd.read_csv('../data/processed/marital_status_by_age_un.csv', header= 2, low_memory=False)
df_14.head()

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,MaritalStatus,Non-standard_AgeGroups,Series_contains_Non-standard_AgeGroups,AgeGroup,AgeStart,...,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Age groups,Note on Marital Status,Note on Data,Note on Country and Population,Note Other
0,Afghanistan,4,1972,1974,Men,Divorced,,,[15-19],15,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
1,Afghanistan,4,1972,1974,Men,Divorced,,,[20-24],20,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
2,Afghanistan,4,1972,1974,Men,Divorced,,,[25-29],25,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
3,Afghanistan,4,1972,1974,Men,Divorced,,,[30-34],30,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
4,Afghanistan,4,1972,1974,Men,Divorced,,,[35-39],35,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,


In [298]:
df_14.columns= (df_14.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '' , regex=True)  
    )
df_14.sample(5)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,maritalstatus,nonstandard_agegroups,series_contains_nonstandard_agegroups,agegroup,agestart,...,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteonagegroups,noteonmaritalstatus,noteondata,noteoncountryandpopulation,noteother
99187,Guinea,324,2005,2005,Men,Not living together,,,[20-24],20,...,2005 DHS,1740,Guinea 2005 Demographic and Health Survey,DHS_STATcompiler,,,,,,
96892,Grenada,308,1991,1991,Women,Married,,,[15-19],15,...,1991 Census,1477,Grenada 1991 Census,US Census Bureau,,,,,,
232527,Sweden,752,1972,1972,Men,Married,,,[25-29],25,...,1972 Estimate,2227,Sweden 1972 Estimate,UNSD,,,,,,
251676,Tuvalu,798,1991,1991,Women,Single,,,[25-29],25,...,1991 Census,2595,Tuvalu 1991 Census,US Census Bureau,,,,,Data pertain to resident population only.,
222738,Slovenia,705,2000,2000,Women,Married,,,[40-44],40,...,2000 Estimate,2218,Slovenia 2000 Estimate,UNSD,,,,,,


In [299]:
df_14 = df_14.drop(columns=['datacataloglongname', 'noteondata', 'noteoncountryandpopulation','noteonagegroups', 'noteother',
                             'including_consensual_unions','isocode', 'datacatalogid', 'noteonmaritalstatus', 'series_contains_nonstandard_agegroups','nonstandard_agegroups'])

df_14.rename(columns={
    "countryorarea": "country",
    "agegroup": "age_group",
    "maritalstatus": "marital_status",
    "yearstart": "year_start",
    "yearend": "year_end",
    }, inplace =True
    )

df_14.sample(10)

Unnamed: 0,country,year_start,year_end,sex,marital_status,age_group,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datasource
220028,Singapore,1990,1990,Men,Married,[15-19],15,19,0.21,Census,1990 Census,UNSD
144683,Liechtenstein,2001,2001,Women,Married,[55-59],55,59,70.52,Estimate,2001 Estimate,UNSD
200008,Republic of Korea,2005,2005,Men,Divorced,[45-49],45,49,6.18,Census,2005 Census,UNSD
137040,Kyrgyzstan,2012,2012,Men,Divorced,[40-44],40,44,5.6,Survey,2012 DHS,DHS_HH
151826,Malawi,2015,2016,Men,Divorced,[75+],75,999,5.75,Survey,2015-2016 DHS,DHS_HH
266999,Wallis and Futuna Islands,1983,1983,Women,Single,[45-49],45,49,23.37,Census,1983 Census,US Census Bureau
216494,Senegal,2012,2014,Women,Divorced,[25-29],25,29,4.4,Survey,2012-2014 DHS,DHS_HH
199032,Qatar,1998,1998,Women,Divorced,[25-29],25,29,2.6,Survey,1998 FHS,GFHS
52894,Côte d'Ivoire,2011,2012,Men,Widowed,[50-54],50,54,3.5,Survey,2011-2012 DHS-MICS,DHS_STATcompiler
106339,Hungary,1995,1995,Men,Single,[35-39],35,39,16.15,Estimate,1995 Estimate,UNSD


In [300]:
df_14.drop_duplicates(inplace=True)
df_14.isnull().sum()

country                 0
year_start              0
year_end                0
sex                     0
marital_status          0
age_group               0
agestart                0
ageend                  0
datavalue               0
dataprocess             0
datacatalogshortname    0
datasource              0
dtype: int64

In [301]:
#df_14.to_csv("cleaned_marital_status_by_age_un.csv", index=False)

In [302]:
#df_14.to_sql('marital_status_by_age_un', engine, if_exists='replace', index=False)

In [303]:
df_15 = pd.read_csv('../data/processed/regions_un.csv', header=5, low_memory= False)
df_15.head(10)

Unnamed: 0,Region and subregion,ISO code,Regional Classification,Indicator,Year,AgeGroup,Percentage,Number,DataProcess
0,World,900,M49,Married or in-union women,1970,15-19,22.576683,71867.82,Estimate
1,World,900,M49,Married or in-union women,1970,20-24,63.802057,162860.4,Estimate
2,World,900,M49,Married or in-union women,1970,25-29,87.174827,182681.1,Estimate
3,World,900,M49,Married or in-union women,1970,30-34,90.825027,179121.4,Estimate
4,World,900,M49,Married or in-union women,1970,35-39,90.284386,161526.3,Estimate
5,World,900,M49,Married or in-union women,1970,40-44,86.483531,139334.4,Estimate
6,World,900,M49,Married or in-union women,1970,45-49,82.680237,116088.4,Estimate
7,World,900,M49,Married or in-union women,1970,15-49,69.379111,1013480.0,Estimate
8,World,900,M49,Married or in-union women,1971,15-19,22.630416,74127.62,Estimate
9,World,900,M49,Married or in-union women,1971,20-24,63.613178,170087.3,Estimate


In [304]:
df_15.columns = (df_15.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(','')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
    )
df_15.sample(6)

Unnamed: 0,regionandsubregion,isocode,regionalclassification,indicator,year,agegroup,percentage,number,dataprocess
18952,Oceania,909,SDG-M49,Married or in-union women,1990,15-19,9.241527,134.802361,Estimate
17049,Central America,916,SDG-M49,Married or in-union women,1995,20-24,51.178769,6326.526422,Estimate
25520,Low-income countries,1500,Income group,Married or in-union women,2001,15-19,27.316921,11896.160542,Estimate
4743,Africa,903,M49,Married or in-union women,1995,15-49,65.023372,221625.210216,Estimate
2579,Central and Southern Asia,62,SDG,Married or in-union women,2049,30-34,89.752787,159113.966454,Projection
4153,Europe and Northern America,513,SDG,Married or in-union women,2003,20-24,36.467681,13246.508678,Estimate


In [305]:
df_15 = df_15.drop(columns=['regionalclassification'])

df_15.rename(columns={
    "regionandsubregion": "region",
    "isocode": "iso_code",
    "agegroup": "age_group",
    "dataprocess": "process"
}, inplace=True)

df_15.sample(10)

Unnamed: 0,region,iso_code,indicator,year,age_group,percentage,number,process
13591,Eastern Europe,923,Married or in-union women,2048,15-49,46.917241,23770.202946,Projection
19491,Australia and New Zealand,1834,Married or in-union women,1973,20-24,41.981434,285.624882,Estimate
12102,South-Eastern Asia,35,Married or in-union women,2024,45-49,83.765849,37411.532875,Estimate
23205,Developed countries,901,Married or in-union women,2035,40-44,67.928541,26539.337064,Projection
4435,Europe and Northern America,513,Married or in-union women,2038,30-34,58.473834,20482.061859,Projection
18734,Northern America,905,Married or in-union women,2043,45-49,69.768897,8558.428492,Projection
5699,Eastern Africa,910,Married or in-union women,2034,30-34,80.179665,37601.068434,Projection
25721,Low-income countries,1500,Married or in-union women,2026,20-24,56.835171,44501.337622,Projection
24475,Other developing countries,934,Married or in-union women,2032,30-34,81.114339,166718.241235,Projection
26134,Lower-middle-income countries,1501,Married or in-union women,1996,45-49,82.608267,72690.468989,Estimate


In [306]:
df_15.dropna(inplace=True)
df_15.isnull().sum()

region        0
iso_code      0
indicator     0
year          0
age_group     0
percentage    0
number        0
process       0
dtype: int64

In [307]:
print(df_15['number'] % 1 != 0)

0        True
1        True
2        True
3        True
4        True
         ... 
28507    True
28508    True
28509    True
28510    True
28511    True
Name: number, Length: 28512, dtype: bool


In [308]:
df_15['percentage'] = df_15['percentage'].round(2)
df_15['number'] = df_15['number'].astype(int)
df_15.head(10)

Unnamed: 0,region,iso_code,indicator,year,age_group,percentage,number,process
0,World,900,Married or in-union women,1970,15-19,22.58,71867,Estimate
1,World,900,Married or in-union women,1970,20-24,63.8,162860,Estimate
2,World,900,Married or in-union women,1970,25-29,87.17,182681,Estimate
3,World,900,Married or in-union women,1970,30-34,90.83,179121,Estimate
4,World,900,Married or in-union women,1970,35-39,90.28,161526,Estimate
5,World,900,Married or in-union women,1970,40-44,86.48,139334,Estimate
6,World,900,Married or in-union women,1970,45-49,82.68,116088,Estimate
7,World,900,Married or in-union women,1970,15-49,69.38,1013479,Estimate
8,World,900,Married or in-union women,1971,15-19,22.63,74127,Estimate
9,World,900,Married or in-union women,1971,20-24,63.61,170087,Estimate


In [309]:
#df_15.to_csv('cleaned_regions_un.csv', index=False)



In [310]:
#df_15.to_sql('regions_un', engine, if_exists='replace',index=False)

In [311]:
#pip install "xlrd==1.2.0"


In [312]:
excel_1_1 = pd.read_excel('../data/Raw/OECD/SF_1_1_Family_size_and_composition.xlsx')
excel_1_1.head(10)

Unnamed: 0.1,Unnamed: 0,"Chart SF1.1.A. Average size of households by household type, 2024a",Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,"Data for Chart SF1.1.A. Average size of households by household type, 2024a",Unnamed: 13,Unnamed: 14,Unnamed: 15
0,,"Mean average number of people per household, b...",,,,,,,,,,,"Mean average number of people per household, b...",,,
1,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,All households,Couple households with children,Single parent households with children
3,,,,,,,,,,,,,Mexico,3.56,4.08,2.76
4,,,,,,,,,,,,,Costa Rica,3.462513,4.372663,3.443867
5,,,,,,,,,,,,,Türkiye,3.2,4.1,2.8
6,,,,,,,,,,,,,Israel,3.19,4.649476,2.863297
7,,,,,,,,,,,,,Columbia,3.100732,,
8,,,,,,,,,,,,,Slovak Republic,3.1,3.8,2.5
9,,,,,,,,,,,,,Chile,2.8,,


In [313]:
file_path  = '../data/Raw/OECD/SF_1_1_Family_size_and_composition.xlsx'
output_csv = '../data/processed/household_oecd_combined.csv'

xls = pd.ExcelFile(file_path)

# ----------------- yardımcılar -----------------
def find_row_with_value(df, pattern):
    """"Dataframe first row that matches the pattern."""
    pat = re.compile(pattern, re.IGNORECASE)
    for i in range(df.shape[0]):
        row = df.iloc[i].astype(str).tolist()
        if any(pat.search(str(x)) for x in row):
            return i
    return 0

def pick_country_column(df):
    """'Unnamed' kolonlar arasından ülke adlarının olduğu en dolu olanı seç."""
    candidates = [c for c in df.columns if str(c).lower().startswith("unnamed")]
    if not candidates:
        return df.columns[0]
    return max(candidates, key=lambda c: df[c].astype(str).ne("nan").sum())

# ----------------- Sheet 1: Chart SF1.1.A -----------------
# Automatical finding of header row
_chart_raw = pd.read_excel(xls, sheet_name='Chart SF1.1.A', header=None)
chart_header_row = find_row_with_value(_chart_raw, r"\bAll households\b")

chart_df = pd.read_excel(xls, sheet_name='Chart SF1.1.A', header=chart_header_row)
country_col_chart = pick_country_column(chart_df)

chart_df = chart_df.rename(columns={country_col_chart: "country"})
rename_chart = {}
for c in chart_df.columns:
    lc = str(c).lower()
    if "all households" in lc:
        rename_chart[c] = "avg_size_all"
    elif "couple households with children" in lc:
        rename_chart[c] = "avg_size_couple_with_children"
    elif "single parent households with children" in lc:
        rename_chart[c] = "avg_size_single_parent_with_children"

chart_df = chart_df[["country"] + list(rename_chart.keys())].rename(columns=rename_chart)
chart_df["country"] = chart_df["country"].astype(str).str.strip()
for col in rename_chart.values():
    chart_df[col] = pd.to_numeric(chart_df[col], errors="coerce")
chart_df = chart_df.dropna(subset=[c for c in chart_df.columns if c != "country"], how="all")

# ----------------- Sheet 2: Table SF1.1.A (çok satırlı başlık) -----------------
_table_a_raw = pd.read_excel(xls, sheet_name='Table SF1.1.A', header=None)
table_a_header_row_top = find_row_with_value(_table_a_raw, r"Couple households")
table_a = pd.read_excel(xls, sheet_name='Table SF1.1.A',
                        header=[table_a_header_row_top, table_a_header_row_top+1])

# Ülke kolonu: en dolu Unnamed çifti
mi_cols = table_a.columns
unnamed_pairs = [c for c in mi_cols
                 if str(c[0]).lower().startswith("unnamed")
                 and str(c[1]).lower().startswith("unnamed")]
country_multi_col = max(unnamed_pairs, key=lambda c: table_a[c].astype(str).ne("nan").sum())

def match_best(mi_cols, top_label, sub_keyword):
    """Find best matching column in multi-index columns."""
    for c in mi_cols:
        top = str(c[0]).strip()
        sub = str(c[1]).strip()
        if top == top_label:
            if not sub_keyword:
                return c  # Get first match (e.g., "Single person" / "Other" sub-headings)
            if sub_keyword.lower() in sub.lower():
                return c
    return None

specs = [
    ("Couple households:", "Total",             "share_couple_total"),
    ("Couple households:", "With children",     "share_couple_with_children"),
    ("Couple households:", "Without children",  "share_couple_without_children"),
    ("Single parent households:", "Total",      "share_single_parent_total"),
    ("Single parent households:", "Single mother households", "share_single_mother"),
    ("Single parent households:", "Single father households", "share_single_father"),
    ("Single person households", "",            "share_single_person"),
    ("Other household types", "",              "share_other_types"),
]

selected_cols = {country_multi_col: "country"}
for top, sub, name in specs:
    c = match_best(mi_cols, top, sub)
    if c is not None:
        selected_cols[c] = name

table_a_clean = table_a.loc[:, list(selected_cols.keys())].copy()
table_a_clean.columns = list(selected_cols.values())
table_a_clean["country"] = table_a_clean["country"].astype(str).str.strip()
for col in [c for c in table_a_clean.columns if c != "country"]:
    table_a_clean[col] = pd.to_numeric(table_a_clean[col], errors="coerce")
table_a_clean = table_a_clean.dropna(subset=[c for c in table_a_clean.columns if c != "country"], how="all")

# ----------------- Sheet 3: Table SF1.1.B -----------------
_table_b_raw = pd.read_excel(xls, sheet_name='Table SF1.1.B', header=None)
table_b_header_row = find_row_with_value(_table_b_raw, r"^0\s*children$")
table_b = pd.read_excel(xls, sheet_name='Table SF1.1.B', header=table_b_header_row)

country_col_b = pick_country_column(table_b)
table_b = table_b.rename(columns={country_col_b: "country"})

rename_b = {}
for c in table_b.columns:
    lc = str(c).lower()
    if lc == "country":
        continue
    if re.search(r"^0\s*children", lc):
        rename_b[c] = "share_hh_0_children"
    elif re.search(r"^1\s*child", lc):
        rename_b[c] = "share_hh_1_child"
    elif re.search(r"^2\s*children", lc):
        rename_b[c] = "share_hh_2_children"
    elif "3 or more children" in lc:
        rename_b[c] = "share_hh_3plus_children"
    elif ("under 6" in lc) or ("under six" in lc):
        rename_b[c] = "share_hh_with_child_under6"

table_b = table_b[["country"] + list(rename_b.keys())].rename(columns=rename_b)
table_b["country"] = table_b["country"].astype(str).str.strip()
for col in rename_b.values():
    table_b[col] = pd.to_numeric(table_b[col], errors="coerce")
table_b = table_b.dropna(subset=[c for c in table_b.columns if c != "country"], how="all")

# ----------------- Together+ Save -----------------
combined = (
    chart_df
    .merge(table_a_clean, on="country", how="outer")
    .merge(table_b,       on="country", how="outer")
    .sort_values("country")
    .reset_index(drop=True)
)

# Save the combined DataFrame to CSV
Path(output_csv).parent.mkdir(parents=True, exist_ok=True)
combined.to_csv(output_csv, index=False)
print(f"✅ Saved: {output_csv}")
display(combined.head(12))

✅ Saved: ../data/processed/household_oecd_combined.csv


Unnamed: 0,country,avg_size_all,avg_size_couple_with_children,avg_size_single_parent_with_children,share_couple_total,share_couple_with_children,share_couple_without_children,share_single_parent_total,share_single_mother,share_single_father,share_single_person,share_other_types,share_hh_0_children,share_hh_1_child,share_hh_2_children,share_hh_3plus_children
0,Australia,2.52758,3.932863,2.775636,55.926052,29.904701,26.021351,10.373549,,,25.124159,8.576123,,,,
1,Austria,2.2,3.8,2.5,48.927373,21.129493,27.79788,5.628256,4.77958,0.848677,38.337314,7.107057,77.781493,10.524053,8.574977,3.119476
2,Belgium,2.2,3.9,2.6,52.219429,23.979842,28.239587,7.423107,6.077464,1.345643,35.502304,4.855161,73.974236,11.757354,10.153817,4.112671
3,Bulgaria,2.2,3.5,2.3,40.303059,16.35041,23.95265,4.603051,3.875726,0.727325,35.80958,19.284309,78.210294,12.932273,7.480467,1.376966
4,Canada,2.425303,,,50.919441,25.300814,25.618627,8.715567,,,29.347961,11.017031,,,,
5,Chile,2.8,,,,,,,,,,,,,,
6,Columbia,3.100732,,,,,,,,,,,,,,
7,Costa Rica,3.462513,4.372663,3.443867,52.441873,38.147069,14.294803,10.548101,9.489556,1.058545,11.270909,25.739118,30.290198,23.077315,24.608947,22.02354
8,Croatia,2.6,3.9,2.6,51.508875,24.776445,26.732361,5.42231,4.386251,1.036059,27.80158,15.267514,74.183029,11.964128,10.095795,3.757049
9,Cyprus,2.5,3.7,2.4,56.920863,27.422456,29.498128,6.16526,4.936008,1.229252,24.490862,12.426368,71.356113,13.881474,11.665777,3.096636


In [314]:
df_16 = pd.read_csv('../data/processed/household_oecd_combined.csv')
df_16

Unnamed: 0,country,avg_size_all,avg_size_couple_with_children,avg_size_single_parent_with_children,share_couple_total,share_couple_with_children,share_couple_without_children,share_single_parent_total,share_single_mother,share_single_father,share_single_person,share_other_types,share_hh_0_children,share_hh_1_child,share_hh_2_children,share_hh_3plus_children
0,Australia,2.52758,3.932863,2.775636,55.926052,29.904701,26.021351,10.373549,,,25.124159,8.576123,,,,
1,Austria,2.2,3.8,2.5,48.927373,21.129493,27.79788,5.628256,4.77958,0.848677,38.337314,7.107057,77.781493,10.524053,8.574977,3.119476
2,Belgium,2.2,3.9,2.6,52.219429,23.979842,28.239587,7.423107,6.077464,1.345643,35.502304,4.855161,73.974236,11.757354,10.153817,4.112671
3,Bulgaria,2.2,3.5,2.3,40.303059,16.35041,23.95265,4.603051,3.875726,0.727325,35.80958,19.284309,78.210294,12.932273,7.480467,1.376966
4,Canada,2.425303,,,50.919441,25.300814,25.618627,8.715567,,,29.347961,11.017031,,,,
5,Chile,2.8,,,,,,,,,,,,,,
6,Columbia,3.100732,,,,,,,,,,,,,,
7,Costa Rica,3.462513,4.372663,3.443867,52.441873,38.147069,14.294803,10.548101,9.489556,1.058545,11.270909,25.739118,30.290198,23.077315,24.608947,22.02354
8,Croatia,2.6,3.9,2.6,51.508875,24.776445,26.732361,5.42231,4.386251,1.036059,27.80158,15.267514,74.183029,11.964128,10.095795,3.757049
9,Cyprus,2.5,3.7,2.4,56.920863,27.422456,29.498128,6.16526,4.936008,1.229252,24.490862,12.426368,71.356113,13.881474,11.665777,3.096636


In [315]:
# To numeric columns, except 'country'
metric_cols = [c for c in df_16.columns if c != 'country']
for c in metric_cols:
    df_16[c] = pd.to_numeric(df_16[c], errors='coerce')

# 3) Long/tidy verse format
tidy_16 = (df_16
           .melt(id_vars='country',
                 value_vars=metric_cols,
                 var_name='metric',
                 value_name='value')
           .dropna(subset=['value'])
           .sort_values(['country','metric'])
           .reset_index(drop=True))

# (opsiyonel) 
tidy_16['unit'] = tidy_16['metric'].apply(lambda m: 'persons' if m.startswith('avg_size') else 'percent')
def metric_group(m):
    if m.startswith('avg_size'): return 'average_size'
    if m.startswith('share_hh_'): return 'children_count_distribution'
    if m.startswith('share_couple'): return 'couple_households'
    if m.startswith('share_single_parent'): return 'single_parent_households'
    if m in ('share_single_mother','share_single_father'): return 'single_parent_gender'
    if m == 'share_single_person': return 'single_person_households'
    if m == 'share_other_types': return 'other_types'
    return 'other'
tidy_16['group'] = tidy_16['metric'].map(metric_group)

# 4) Gerekirse tidy üstünde oynadıktan sonra tekrar WIDE'a dön ve df_16'ya geri yaz
df_16 = (tidy_16.pivot_table(index='country', columns='metric', values='value', aggfunc='first')
         .reset_index()
         .reindex(columns=['country'] + metric_cols)   # orijinal kolon sırası
        )

In [316]:
df_16

metric,country,avg_size_all,avg_size_couple_with_children,avg_size_single_parent_with_children,share_couple_total,share_couple_with_children,share_couple_without_children,share_single_parent_total,share_single_mother,share_single_father,share_single_person,share_other_types,share_hh_0_children,share_hh_1_child,share_hh_2_children,share_hh_3plus_children
0,Australia,2.52758,3.932863,2.775636,55.926052,29.904701,26.021351,10.373549,,,25.124159,8.576123,,,,
1,Austria,2.2,3.8,2.5,48.927373,21.129493,27.79788,5.628256,4.77958,0.848677,38.337314,7.107057,77.781493,10.524053,8.574977,3.119476
2,Belgium,2.2,3.9,2.6,52.219429,23.979842,28.239587,7.423107,6.077464,1.345643,35.502304,4.855161,73.974236,11.757354,10.153817,4.112671
3,Bulgaria,2.2,3.5,2.3,40.303059,16.35041,23.95265,4.603051,3.875726,0.727325,35.80958,19.284309,78.210294,12.932273,7.480467,1.376966
4,Canada,2.425303,,,50.919441,25.300814,25.618627,8.715567,,,29.347961,11.017031,,,,
5,Chile,2.8,,,,,,,,,,,,,,
6,Columbia,3.100732,,,,,,,,,,,,,,
7,Costa Rica,3.462513,4.372663,3.443867,52.441873,38.147069,14.294803,10.548101,9.489556,1.058545,11.270909,25.739118,30.290198,23.077315,24.608947,22.02354
8,Croatia,2.6,3.9,2.6,51.508875,24.776445,26.732361,5.42231,4.386251,1.036059,27.80158,15.267514,74.183029,11.964128,10.095795,3.757049
9,Cyprus,2.5,3.7,2.4,56.920863,27.422456,29.498128,6.16526,4.936008,1.229252,24.490862,12.426368,71.356113,13.881474,11.665777,3.096636


In [317]:
# 1) Identify columns
metric_cols = [c for c in df_16.columns if c != 'country']
pct_cols    = [c for c in metric_cols if str(c).startswith('share_') or '(%)' in str(c)]
size_cols   = [c for c in metric_cols if str(c).startswith('avg_size')]

# 2) Normalize percentage columns to 0–100 NUMERIC (do NOT divide by 100)
for c in pct_cols:
    s = (df_16[c].astype(str)
                    .str.replace('%', '', regex=False)   # drop percent sign if present
                    .str.replace(',', '.', regex=False)  # handle decimal comma
                    .str.strip()
                    .replace({'': np.nan}))
    s = pd.to_numeric(s, errors='coerce')

    # If column appears to be 0–1 scale, scale UP to 0–100
    maxv = s.max(skipna=True)
    if pd.notna(maxv) and maxv <= 1.5:
        s = s * 100.0

    df_16[c] = s  # write back as float (e.g., 55.93)

# 3) Ensure avg_size columns stay numeric (unchanged values)
for c in size_cols:
    df_16[c] = pd.to_numeric(df_16[c], errors='coerce')

# 4) (Optional) add "(%)" to headers of percentage columns (header-only; no value changes)
df_16.rename(columns=lambda x: f'{x} (%)' if x in pct_cols and '(%)' not in str(x) else x, inplace=True)

# 5) Display with blanks instead of NaN (values remain numeric under the hood)
display(df_16.style.format(na_rep=''))


metric,country,avg_size_all,avg_size_couple_with_children,avg_size_single_parent_with_children,share_couple_total (%),share_couple_with_children (%),share_couple_without_children (%),share_single_parent_total (%),share_single_mother (%),share_single_father (%),share_single_person (%),share_other_types (%),share_hh_0_children (%),share_hh_1_child (%),share_hh_2_children (%),share_hh_3plus_children (%)
0,Australia,2.52758,3.932863,2.775636,55.926052,29.904701,26.021351,10.373549,,,25.124159,8.576123,,,,
1,Austria,2.2,3.8,2.5,48.927373,21.129493,27.79788,5.628256,4.77958,0.848677,38.337314,7.107057,77.781493,10.524053,8.574977,3.119476
2,Belgium,2.2,3.9,2.6,52.219429,23.979842,28.239587,7.423107,6.077464,1.345643,35.502304,4.855161,73.974236,11.757354,10.153817,4.112671
3,Bulgaria,2.2,3.5,2.3,40.303059,16.35041,23.95265,4.603051,3.875726,0.727325,35.80958,19.284309,78.210294,12.932273,7.480467,1.376966
4,Canada,2.425303,,,50.919441,25.300814,25.618627,8.715567,,,29.347961,11.017031,,,,
5,Chile,2.8,,,,,,,,,,,,,,
6,Columbia,3.100732,,,,,,,,,,,,,,
7,Costa Rica,3.462513,4.372663,3.443867,52.441873,38.147069,14.294803,10.548101,9.489556,1.058545,11.270909,25.739118,30.290198,23.077315,24.608947,22.02354
8,Croatia,2.6,3.9,2.6,51.508875,24.776445,26.732361,5.42231,4.386251,1.036059,27.80158,15.267514,74.183029,11.964128,10.095795,3.757049
9,Cyprus,2.5,3.7,2.4,56.920863,27.422456,29.498128,6.16526,4.936008,1.229252,24.490862,12.426368,71.356113,13.881474,11.665777,3.096636


In [318]:
df_16.sample(10)

metric,country,avg_size_all,avg_size_couple_with_children,avg_size_single_parent_with_children,share_couple_total (%),share_couple_with_children (%),share_couple_without_children (%),share_single_parent_total (%),share_single_mother (%),share_single_father (%),share_single_person (%),share_other_types (%),share_hh_0_children (%),share_hh_1_child (%),share_hh_2_children (%),share_hh_3plus_children (%)
10,Czechia,2.3,3.7,2.4,47.025112,21.703808,25.321304,7.154102,6.112638,1.041464,39.150397,6.670389,71.952265,13.852443,11.560185,2.635107
3,Bulgaria,2.2,3.5,2.3,40.303059,16.35041,23.95265,4.603051,3.875726,0.727325,35.80958,19.284309,78.210294,12.932273,7.480467,1.376966
41,Spain,2.5,3.7,2.4,50.872979,25.4583,25.41469,8.57236,6.886923,1.685437,26.976136,13.578514,74.607276,13.543846,8.948054,2.900824
26,Lithuania,1.7,3.7,2.4,50.089192,23.741196,26.347996,7.179025,6.676129,0.502896,35.155839,7.575945,80.437001,11.057086,6.99922,1.506693
17,Greece,2.6,3.8,2.5,52.138469,24.027235,28.111233,4.662701,3.820244,0.842457,32.347616,10.851376,74.311386,11.832867,9.969559,3.888643
33,OECD average,2.385529,,,,,,,,,,,,,,
40,Slovenia,2.4,3.9,2.5,45.413954,20.971828,24.442126,6.932339,5.570598,1.361741,33.997106,13.656601,74.99711,11.247255,10.195353,3.560282
8,Croatia,2.6,3.9,2.6,51.508875,24.776445,26.732361,5.42231,4.386251,1.036059,27.80158,15.267514,74.183029,11.964128,10.095795,3.757049
4,Canada,2.425303,,,50.919441,25.300814,25.618627,8.715567,,,29.347961,11.017031,,,,
31,New Zealand,2.607408,3.88397,2.668222,57.328125,29.253137,28.074988,10.387307,,,22.794397,9.490172,67.370046,13.689523,12.745727,6.194528


In [319]:
df_16.columns.name = None
df_16.index.name = None

In [320]:
df_16.sample(10)

Unnamed: 0,country,avg_size_all,avg_size_couple_with_children,avg_size_single_parent_with_children,share_couple_total (%),share_couple_with_children (%),share_couple_without_children (%),share_single_parent_total (%),share_single_mother (%),share_single_father (%),share_single_person (%),share_other_types (%),share_hh_0_children (%),share_hh_1_child (%),share_hh_2_children (%),share_hh_3plus_children (%)
28,Malta,2.5,3.7,2.5,46.920827,21.114001,25.806826,5.680812,4.557909,1.122903,32.512251,14.889819,76.48917,12.680505,7.806859,2.978339
25,Latvia,2.1,3.7,2.5,27.800585,12.205404,15.595181,13.436529,11.207919,2.228611,41.083861,17.679025,74.798867,14.050992,8.31728,2.832861
30,Netherlands,2.0,3.9,2.6,53.601238,23.007511,30.593727,6.098421,5.004163,1.094258,38.504867,1.795475,78.652492,8.776867,9.272513,3.296971
46,United States,2.51,,,53.185494,19.846267,33.339226,6.797399,5.212952,1.584447,27.611972,12.405135,,,,
5,Chile,2.8,,,,,,,,,,,,,,
36,Poland,2.3,3.7,2.5,48.321089,23.920366,24.400723,5.969036,5.147508,0.821529,23.435506,22.274353,74.390275,12.909212,9.842287,2.858225
1,Austria,2.2,3.8,2.5,48.927373,21.129493,27.79788,5.628256,4.77958,0.848677,38.337314,7.107057,77.781493,10.524053,8.574977,3.119476
39,Slovak Republic,3.1,3.8,2.5,37.153958,16.994268,20.15969,6.230566,5.389511,0.841055,31.404297,25.211179,64.409823,17.085602,14.485115,4.01946
31,New Zealand,2.607408,3.88397,2.668222,57.328125,29.253137,28.074988,10.387307,,,22.794397,9.490172,67.370046,13.689523,12.745727,6.194528
14,Finland,1.9,4.0,2.6,45.63784,17.056364,28.581477,5.427769,4.497123,0.930646,45.335962,3.598356,81.983051,7.888136,6.986441,3.142373


In [321]:
df_16.rename(columns=lambda c: re.sub(r'(?i)^share[_\s]*', '', c) if isinstance(c, str) else c, inplace=True)
df_16.drop_duplicates(inplace=True)
df_16.replace('', pd.NA, inplace=True)

metric_cols = [c for c in df_16.columns if c != 'country']

# A) Drop only rows where ALL metric columns are missing (safe):
df_16.dropna(subset=metric_cols, how='all', inplace=True)

# B) (optional) Also drop columns that are entirely missing:
df_16.dropna(axis=1, how='all', inplace=True)
df_16.isnull().sum()

country                                  0
avg_size_all                             2
avg_size_couple_with_children            8
avg_size_single_parent_with_children     8
couple_total (%)                         4
couple_with_children (%)                 5
couple_without_children (%)              5
single_parent_total (%)                  4
single_mother (%)                       11
single_father (%)                       11
single_person (%)                        4
other_types (%)                          5
hh_0_children (%)                       10
hh_1_child (%)                          10
hh_2_children (%)                       10
hh_3plus_children (%)                   10
dtype: int64

In [322]:
# Drop exactly these columns and create df_16_general
cols_to_drop = [
    "avg_size_couple_with_children",
    "avg_size_single_parent_with_children",
    "single_mother (%)",
    "single_father (%)",
    "hh_0_children (%)",
    "hh_1_child (%)",
    "hh_2_children (%)",
    "hh_3plus_children (%)",
]

df_16_general = df_16.drop(columns=cols_to_drop, errors="ignore").copy()

# quick check (optional)
print(df_16_general.columns.tolist())

['country', 'avg_size_all', 'couple_total (%)', 'couple_with_children (%)', 'couple_without_children (%)', 'single_parent_total (%)', 'single_person (%)', 'other_types (%)']


In [323]:
# Remove selected countries (exact matches)
df_16_general['country'] = df_16_general['country'].str.strip()  # trims spaces

countries_to_remove = ["Chile", "Columbia", "OECD average", "OECD-30 average", "OECD-36 average", "Israel"]
df_16_general = df_16_general[~df_16_general['country'].isin(countries_to_remove)].copy()


In [324]:
df_16_general = df_16_general.reset_index(drop=True)
df_16_general

Unnamed: 0,country,avg_size_all,couple_total (%),couple_with_children (%),couple_without_children (%),single_parent_total (%),single_person (%),other_types (%)
0,Australia,2.52758,55.926052,29.904701,26.021351,10.373549,25.124159,8.576123
1,Austria,2.2,48.927373,21.129493,27.79788,5.628256,38.337314,7.107057
2,Belgium,2.2,52.219429,23.979842,28.239587,7.423107,35.502304,4.855161
3,Bulgaria,2.2,40.303059,16.35041,23.95265,4.603051,35.80958,19.284309
4,Canada,2.425303,50.919441,25.300814,25.618627,8.715567,29.347961,11.017031
5,Costa Rica,3.462513,52.441873,38.147069,14.294803,10.548101,11.270909,25.739118
6,Croatia,2.6,51.508875,24.776445,26.732361,5.42231,27.80158,15.267514
7,Cyprus,2.5,56.920863,27.422456,29.498128,6.16526,24.490862,12.426368
8,Czechia,2.3,47.025112,21.703808,25.321304,7.154102,39.150397,6.670389
9,Denmark,1.9,48.59698,20.408734,28.188133,6.308205,37.574211,7.520455


In [325]:
df_16_general.isnull().sum()

country                        0
avg_size_all                   0
couple_total (%)               0
couple_with_children (%)       0
couple_without_children (%)    0
single_parent_total (%)        0
single_person (%)              0
other_types (%)                0
dtype: int64

In [326]:
drop_countries = ['OECD average', 'OECD-30 average', 'OECD-36 average', 'Canada', 'Chile', 'Columbia', 'United States','Iceland','Israel','EU average', 'New Zealand', 'Mexico', 'United Kingdom', 'Switzerland','Australia']

before = len(df_16)
df_16 = df_16[~df_16['country'].isin(drop_countries)].reset_index(drop=True)
print(f"Removed {before - len(df_16)} rows")

Removed 15 rows


In [327]:
df_16.isnull().sum()

country                                 0
avg_size_all                            0
avg_size_couple_with_children           0
avg_size_single_parent_with_children    0
couple_total (%)                        0
couple_with_children (%)                0
couple_without_children (%)             0
single_parent_total (%)                 0
single_mother (%)                       0
single_father (%)                       0
single_person (%)                       0
other_types (%)                         0
hh_0_children (%)                       0
hh_1_child (%)                          0
hh_2_children (%)                       0
hh_3plus_children (%)                   0
dtype: int64

In [328]:
df_16

Unnamed: 0,country,avg_size_all,avg_size_couple_with_children,avg_size_single_parent_with_children,couple_total (%),couple_with_children (%),couple_without_children (%),single_parent_total (%),single_mother (%),single_father (%),single_person (%),other_types (%),hh_0_children (%),hh_1_child (%),hh_2_children (%),hh_3plus_children (%)
0,Austria,2.2,3.8,2.5,48.927373,21.129493,27.79788,5.628256,4.77958,0.848677,38.337314,7.107057,77.781493,10.524053,8.574977,3.119476
1,Belgium,2.2,3.9,2.6,52.219429,23.979842,28.239587,7.423107,6.077464,1.345643,35.502304,4.855161,73.974236,11.757354,10.153817,4.112671
2,Bulgaria,2.2,3.5,2.3,40.303059,16.35041,23.95265,4.603051,3.875726,0.727325,35.80958,19.284309,78.210294,12.932273,7.480467,1.376966
3,Costa Rica,3.462513,4.372663,3.443867,52.441873,38.147069,14.294803,10.548101,9.489556,1.058545,11.270909,25.739118,30.290198,23.077315,24.608947,22.02354
4,Croatia,2.6,3.9,2.6,51.508875,24.776445,26.732361,5.42231,4.386251,1.036059,27.80158,15.267514,74.183029,11.964128,10.095795,3.757049
5,Cyprus,2.5,3.7,2.4,56.920863,27.422456,29.498128,6.16526,4.936008,1.229252,24.490862,12.426368,71.356113,13.881474,11.665777,3.096636
6,Czechia,2.3,3.7,2.4,47.025112,21.703808,25.321304,7.154102,6.112638,1.041464,39.150397,6.670389,71.952265,13.852443,11.560185,2.635107
7,Denmark,1.9,3.9,2.5,48.59698,20.408734,28.188133,6.308205,5.114202,1.194003,37.574211,7.520455,77.775278,10.541878,8.944527,2.738317
8,Estonia,1.8,3.8,2.6,46.199713,25.464743,20.734971,6.828569,6.09075,0.737819,36.99157,9.980148,75.755102,12.530612,8.734694,2.979592
9,Finland,1.9,4.0,2.6,45.63784,17.056364,28.581477,5.427769,4.497123,0.930646,45.335962,3.598356,81.983051,7.888136,6.986441,3.142373


In [329]:
#df_16_general.to_csv('../data/Cleaned/general/cleaned_household_general.csv', index=False)

In [330]:
df_16['country'] = df_16['country'].astype(str).str.strip()

# 2) Identify column groups
metric_cols = [c for c in df_16.columns if c != 'country']
pct_cols    = [c for c in metric_cols if '(%)' in str(c) or re.match(r'^share[_\s]', str(c) or '')]
size_cols   = [c for c in metric_cols if str(c).startswith('avg_size')]

# 3) Coerce to proper numeric types (DO NOT rescale percentages)
def to_float(s: pd.Series) -> pd.Series:
    return pd.to_numeric(
        s.astype(str)
         .str.replace('%', '', regex=False)   # strip a trailing % if any
         .str.replace(',', '.', regex=False)  # handle decimal comma
         .str.strip()
         .replace({'': np.nan}),
        errors='coerce'
    )

for c in pct_cols:
    df_16[c] = to_float(df_16[c])            # stays on 0–100 scale

for c in size_cols:
    df_16[c] = pd.to_numeric(df_16[c], errors='coerce')

# 4) Round to 2 decimals (all numeric metrics)
df_16[metric_cols] = df_16[metric_cols].round(2)

In [331]:
df_16

Unnamed: 0,country,avg_size_all,avg_size_couple_with_children,avg_size_single_parent_with_children,couple_total (%),couple_with_children (%),couple_without_children (%),single_parent_total (%),single_mother (%),single_father (%),single_person (%),other_types (%),hh_0_children (%),hh_1_child (%),hh_2_children (%),hh_3plus_children (%)
0,Austria,2.2,3.8,2.5,48.93,21.13,27.8,5.63,4.78,0.85,38.34,7.11,77.78,10.52,8.57,3.12
1,Belgium,2.2,3.9,2.6,52.22,23.98,28.24,7.42,6.08,1.35,35.5,4.86,73.97,11.76,10.15,4.11
2,Bulgaria,2.2,3.5,2.3,40.3,16.35,23.95,4.6,3.88,0.73,35.81,19.28,78.21,12.93,7.48,1.38
3,Costa Rica,3.46,4.37,3.44,52.44,38.15,14.29,10.55,9.49,1.06,11.27,25.74,30.29,23.08,24.61,22.02
4,Croatia,2.6,3.9,2.6,51.51,24.78,26.73,5.42,4.39,1.04,27.8,15.27,74.18,11.96,10.1,3.76
5,Cyprus,2.5,3.7,2.4,56.92,27.42,29.5,6.17,4.94,1.23,24.49,12.43,71.36,13.88,11.67,3.1
6,Czechia,2.3,3.7,2.4,47.03,21.7,25.32,7.15,6.11,1.04,39.15,6.67,71.95,13.85,11.56,2.64
7,Denmark,1.9,3.9,2.5,48.6,20.41,28.19,6.31,5.11,1.19,37.57,7.52,77.78,10.54,8.94,2.74
8,Estonia,1.8,3.8,2.6,46.2,25.46,20.73,6.83,6.09,0.74,36.99,9.98,75.76,12.53,8.73,2.98
9,Finland,1.9,4.0,2.6,45.64,17.06,28.58,5.43,4.5,0.93,45.34,3.6,81.98,7.89,6.99,3.14


In [332]:
#df_16.to_csv('../data/Cleaned/cleaned_household_oecd.csv', index=False)

In [333]:
#df_16.to_sql('household_oecd', engine, if_exists= 'replace', index= False)

In [334]:
df_17 = pd.read_csv('../data/Raw/OECD/OECD_df_famliy_selected.csv')
df_17

Unnamed: 0,STRUCTURE,STRUCTURE_ID,STRUCTURE_NAME,ACTION,COU,Country,SEX,Sex,IND,Indicator,...,OBS_VALUE,Observation Value,OBS_STATUS,Observation Status,UNIT_MEASURE,Unit of Measures,UNIT_MULT,Multiplier,BASE_PER,Base reference period
0,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,LVA,Latvia,TOTAL,Total,FAM3,Share of births outside of marriage (% of all ...,...,39.5,,A,,PC,Percentage,0,Units,,
1,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,GRC,Greece,TOTAL,Total,FAM3,Share of births outside of marriage (% of all ...,...,11.1,,A,,PC,Percentage,0,Units,,
2,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,CHL,Chile,TOTAL,Total,FAM3,Share of births outside of marriage (% of all ...,...,74.8,,A,,PC,Percentage,0,Units,,
3,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,NLD,Netherlands,TOTAL,Total,FAM3,Share of births outside of marriage (% of all ...,...,51.9,,A,,PC,Percentage,0,Units,,
4,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,LTU,Lithuania,TOTAL,Total,FAM3,Share of births outside of marriage (% of all ...,...,26.4,,A,,PC,Percentage,0,Units,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
500,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,COL,Colombia,TOTAL,Total,FAM2,Mean age of women at childbirth,...,30.4,,A,,YR,Years,0,Units,,
501,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,OAVG,OECD - Average,TOTAL,Total,FAM2,Mean age of women at childbirth,...,30.5,,A,,YR,Years,0,Units,,
502,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,OAVG,OECD - Average,TOTAL,Total,FAM2,Mean age of women at childbirth,...,30.6,,A,,YR,Years,0,Units,,
503,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,OAVG,OECD - Average,TOTAL,Total,FAM2,Mean age of women at childbirth,...,30.7,,A,,YR,Years,0,Units,,


In [335]:
df_18 = pd.read_csv('../data/Raw/OECD/sf1_2_wide_from_df18.csv')
df_18

Unnamed: 0,country,year,Living with two parents,Living with a single parent,Other
0,Australia,2003,80.1,19.5,0.5
1,Australia,2006,81.5,18.0,0.5
2,Australia,2009,82.0,17.6,0.4
3,Australia,2012,81.3,18.0,0.6
4,Austria,2003,81.2,16.8,2.0
...,...,...,...,...,...
470,United States,2014,68.7,27.5,3.8
471,United States,2015,69.2,26.8,3.9
472,United States,2016,68.7,27.4,3.8
473,United States,2017,68.9,27.1,4.0


In [336]:
for col in df_18.select_dtypes(include=['object']).columns:
    df_18[col] = df_18[col].astype(str).str.strip()

# 2) Define placeholders representing missing data in OECD exports
placeholders = ['..', '...', '.', ' .', '…', 'Na', 'nan', 'None']

# 3) Replace placeholders with NaN directly in df_18
df_18.replace(placeholders, pd.NA, inplace=True)

In [337]:
# 1) Ensure 'year' is integer
df_18["year"] = pd.to_numeric(df_18["year"], errors="coerce").astype("Int64")

# 2) Convert all non-key columns to numeric and round(2)
for col in df_18.columns:
    if col not in ["country", "year"]:
        df_18[col] = pd.to_numeric(df_18[col], errors="coerce").round(2)

In [338]:
# 1) Drop rows with missing key fields
df_18.dropna(subset=["country", "year"], inplace=True)

# 2) Drop duplicate country-year rows, keep the first
df_18.drop_duplicates(subset=["country", "year"], keep="first", inplace=True)

# 3) Drop rows where all value columns are NaN
value_cols = [c for c in df_18.columns if c not in ["country", "year"]]
df_18.dropna(subset=value_cols, how="all", inplace=True)

# 4) Sort and reset index
df_18.sort_values(["country", "year"], inplace=True)
df_18.reset_index(drop=True, inplace=True)


In [339]:
df_18

Unnamed: 0,country,year,Living with two parents,Living with a single parent,Other
0,Australia,2003,80.1,19.5,0.5
1,Australia,2006,81.5,18.0,0.5
2,Australia,2009,82.0,17.6,0.4
3,Australia,2012,81.3,18.0,0.6
4,Austria,2003,81.2,16.8,2.0
...,...,...,...,...,...
470,United States,2014,68.7,27.5,3.8
471,United States,2015,69.2,26.8,3.9
472,United States,2016,68.7,27.4,3.8
473,United States,2017,68.9,27.1,4.0


In [340]:
df_18['Other'] = pd.to_numeric(df_18['Other'], errors='coerce')

In [341]:
df_info = pd.DataFrame({
    'dtype': df_18.dtypes,
    'null_count': df_18.isnull().sum(),
    'unique_count': df_18.nunique()
})
print(df_info)

                               dtype  null_count  unique_count
country                       object           0            39
year                           Int64           0            18
Living with two parents      float64           0           211
Living with a single parent  float64           0           203
Other                        float64           1            50


In [342]:
print(repr(df_18.loc[df_18['Other'].notnull(), 'Other'].unique()))

array([0.5, 0.4, 0.6, 2. , 1. , 1.9, 0.3, 0.1, 0.8, 0.7, 8.7, 3.5, 2.5,
       2.1, 2.4, 2.6, 6.7, 5.1, 1.4, 1.2, 1.7, 1.5, 3.4, 2.9, 2.3, 3. ,
       4.2, 2.8, 1.3, 9. , 0.2, 0.9, 1.1, 4.5, 4.7, 1.6, 3.8, 3.6, 3.3,
       2.2, 0. , 1.8, 2.7, 3.2, 3.9, 4.1, 4.4, 3.7, 4. , 4.3])


In [353]:
df_18['Other'] = pd.to_numeric(df_18['Other'], errors='coerce')

df_18.dropna(inplace=True, subset=['Other'])

df_18.isnull().sum()

country                        0
year                           0
Living with two parents        0
Living with a single parent    0
Other                          0
dtype: int64

In [None]:
#df_18.to_csv('../data/Cleaned/cleaned_household_children.csv', index=False)

In [None]:
#df_18.to_sql('household_children_oecd', engine, if_exists= 'replace', index= False)

474

In [347]:
df_888= pd.read_csv('../data/Raw/OECD/Households-by-type,-presence-of-children-and-country,-2015-2024.csv')
df_888

Unnamed: 0,Category,Single adult with children,Single adult without children,Couple with children,Couple without children,Other type of household with children,Other type of household without children
0,2015,6147.3,64181.3,31679.8,46641.6,11698.9,30771.6
1,2016,6148.5,63891.1,31907.3,47308.2,11766.3,30559.5
2,2017,6108.5,65353.9,32091.5,47426.1,11530.2,30297.5
3,2018,6163.6,66165.5,31720.2,48194.8,11342.5,30224.0
4,2019,6246.4,67417.9,31710.1,48503.6,11285.7,30134.8
5,2020,6136.4,67412.9,31622.2,48831.2,11212.9,30445.2
6,2021,5691.9,70200.4,30558.3,47447.4,11611.8,30700.7
7,2022,5984.9,72134.3,30469.3,47995.5,11513.6,30412.1
8,2023,5924.8,73396.2,30313.0,48477.5,11443.5,30608.8
9,2024,6077.7,75049.7,30286.5,49058.4,11311.9,30487.3


In [348]:
df_999 = pd.read_csv('../data/Raw/OECD/Households-with-children-by-number-of-children,-2024.csv')
df_999

Unnamed: 0,Category,1 child,2 children,3 children or more
0,European Union,11.7,8.9,3.0
1,,,,
2,Slovakia,17.1,14.5,4.0
3,Ireland,12.4,12.2,6.4
4,Cyprus,13.9,11.7,3.1
5,Czechia,13.9,11.6,2.6
6,Romania,14.3,9.2,4.0
7,Luxembourg,12.5,12.1,2.4
8,Belgium,11.8,10.2,4.1
9,Croatia,12.0,10.1,3.8
