In [450]:
import pandas as pd
import os, re
from pathlib import Path
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from sqlalchemy import create_engine, types
from sqlalchemy import text 

In [451]:
from dotenv import dotenv_values

config = dotenv_values()

# define variables for the login
pg_user = config['POSTGRES_USER']  # align the key label with your .env file !
pg_host = config['POSTGRES_HOST']
pg_port = config['POSTGRES_PORT']
pg_db = config['POSTGRES_DB']
pg_schema = config['POSTGRES_SCHEMA']
pg_pass = config['POSTGRES_PASS']

In [452]:
url = f'postgresql://{pg_user}:{pg_pass}@{pg_host}:{pg_port}/{pg_db}'
engine = create_engine(url, echo=False)

In [453]:
my_schema = 'team_5' # update it to your schema

with engine.begin() as conn: 
    result = conn.execute(text(f'SET search_path TO {my_schema};'))

In [454]:
df_1= pd.read_csv('../data/Raw/World_Marriage_Dataset.csv')

In [455]:
df_1.drop(columns=["Sr.No."], inplace=True)

In [456]:
df_1.columns = df_1.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [457]:
df_1.rename(columns={
    "agegroup": "age_group",
    "maritalstatus": "marital_status",
    "dataprocess": "data_process",
    "Data Collection (Start Year)": "data_collection_start_year",
    "Data Collection (End Year)": "data_collection_end_year",
    "Data Source": "data_source",
    "Country": "country",
    "Sex": "sex"
}, inplace=True)

In [458]:
df_1.drop_duplicates(inplace=True)

df_1['data_collection_start_year'] = df_1['data_collection_start_year'].astype(str).str.replace(',', '').astype(int)
df_1['data_collection_end_year'] = df_1['data_collection_end_year'].astype(str).str.replace(',', '').astype(int)

In [459]:
df_1.isnull().sum()

country                       0
age_group                     0
sex                           0
marital_status                0
data_process                  0
data_collection_start_year    0
data_collection_end_year      0
data_source                   0
dtype: int64

In [460]:
#df_1.to_csv("cleaned_world_marriage.csv", index=False)

In [461]:
#df_1.to_sql('world_marriage', engine, if_exists='replace', index=False)

In [462]:
df_2 = pd.read_csv('../data/Raw/age-at-marriage-women.csv')

In [463]:
df_2.columns = df_2.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [464]:
df_2 = df_2.drop(columns=['1005564annotations'])

df_2.rename(columns={
    "entity": "country",   
}, inplace=True)

In [465]:
df_2.drop_duplicates(inplace=True)


In [466]:
df_2['year'] = df_2['year'].astype(str).str.replace(',', '').astype(int)

In [467]:
df_2.isnull().sum()

country                                0
code                                   0
year                                   0
mean_age_of_women_at_first_marriage    0
dtype: int64

In [468]:
#df_2.to_csv("cleaned_age_at_marriage_women.csv", index=False)

In [469]:
#df_2.to_sql('age_at_marriage_women', engine, if_exists='replace', index=False)

In [470]:
df_3= pd.read_csv('../data/Raw/marriage-rate-per-1000-inhabitants.csv')

In [471]:
df_3.columns = df_3.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [472]:
df_3.rename(columns={
    "entity": "country",   
}, inplace=True)

In [473]:
df_3['year'] = df_3['year'].astype(str).str.replace(',', '').astype(int)

In [474]:
df_3.drop_duplicates(inplace=True)


In [475]:
df_3.isnull().sum()

country                                          0
code                                             0
year                                             0
crude_marriage_rate_marriages_per_1000_people    0
dtype: int64

In [476]:
#df_3.to_csv("cleaned_marriage-rate-per-1000-inhabitants.csv", index=False)

In [477]:
#df_3.to_sql('married_rate_per_1000', engine, if_exists='replace', index=False)

In [478]:
df_4= pd.read_csv('../data/Raw/marriage-rates-in-1990-vs-2020.csv')

In [479]:
df_4.columns = df_4.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [480]:
df_4 = df_4.drop(columns=['worldregionsaccordingtoowid'])

df_4.rename(columns={
    "crudemarriageratemarriagesper1000people": "crude_marriage_rate",
    "crudemarriageratemarriagesper1000people1": "crude_marriage_rate_people1",
    "year1": "year_1",
    "entity": "country"
}, inplace=True)

In [481]:
df_4.drop_duplicates(inplace=True)
df_4.dropna(inplace=True)

In [482]:
df_4['year_1'] = pd.to_numeric(df_4['year_1'], errors='coerce').astype('Int64')

In [483]:
df_4.isnull().sum()

country                        0
code                           0
year                           0
crude_marriage_rate            0
crude_marriage_rate_people1    0
year_1                         0
dtype: int64

In [484]:
#df_4.to_csv("cleaned_marriage-rates-in-1990-vs-2020.csv", index=False)

In [485]:
#df_4.to_sql('marriage_rates_in_1990_vs_2020', engine, if_exists='replace', index=False)

In [486]:
df_5 = pd.read_csv('../data/Raw/share-of-births-outside-marriage.csv')

In [487]:
df_5.columns = df_5.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [488]:

df_5.rename(columns={
    "shareofbirthsoutsideofmarriageofallbirths": "share_of_births_outside_of_marriage",
    "entity": "country"
}, inplace=True)

df_5.drop_duplicates(inplace=True)

In [489]:
df_5.isnull().sum()

country                                0
code                                   0
year                                   0
share_of_births_outside_of_marriage    0
dtype: int64

In [490]:
#df_5.to_csv("cleaned_share-of-births-outside-marriage.csv", index=False)

In [491]:
#df_5.to_sql('share_of_births_outside_marriage', engine, if_exists='replace', index=False)

In [492]:
df_6 = pd.read_csv('../data/Raw/share-of-men-in-england-and-wales-who-have-ever-married-by-age.csv')

In [493]:
df_6.columns = df_6.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

df_6.drop_duplicates(inplace=True)
df_6.sample(5)

Unnamed: 0,entity,code,year,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort
26,Men,,43,91.1,90.8,91.7,79.5,66.7,56.6,,
49,Women,,32,75.5,85.8,92.4,79.9,61.0,45.8,34.2,
64,Women,,47,84.8,91.7,95.6,87.0,75.4,,,
6,Men,,23,21.4,26.8,38.1,26.2,10.5,3.9,2.1,0.8
46,Women,,29,68.7,80.4,89.8,74.5,52.4,33.6,24.8,


In [494]:
df_6 = df_6.drop(columns=['code','proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort'])

df_6.rename(columns={
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort": "1900_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort": "1920_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort": "1940_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort": "1960_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort": "1970_birthcohort",
    "entity": "sex"
}, inplace=True)

In [495]:
df_6.isnull().sum()

sex                 0
year                0
1900_birthcohort    0
1920_birthcohort    0
1940_birthcohort    0
1960_birthcohort    0
1970_birthcohort    0
dtype: int64

In [496]:
#df_6.to_csv("cleaned_share-of-men-in-england-and-wales-who-have-ever-married-by-age.csv", index=False)

In [497]:
#df_6.to_sql('men_in_england_and_wales_married_by_age', engine, if_exists='replace', index=False)

In [498]:
df_7 = pd.read_csv('../data/Raw/share-of-births-outside-marriage.csv')

In [499]:
df_7.columns = df_7.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [500]:
df_7.rename(columns={
    "shareofsingleparenthouseholds": "share_of_single_parent_households",
    "entity": "country"
}, inplace=True)

df_7.drop_duplicates(inplace=True)
df_7.sample(5)

Unnamed: 0,country,code,year,shareofbirthsoutsideofmarriageofallbirths
590,Estonia,EST,2013,58.9
594,Estonia,EST,2017,56.9
760,Greece,GRC,1978,1.4
663,France,FRA,2003,46.2
2011,Turkey,TUR,2015,2.8


In [501]:
df_7.isnull().sum()

country                                      0
code                                         0
year                                         0
shareofbirthsoutsideofmarriageofallbirths    0
dtype: int64

In [502]:
#df_7.to_csv("cleaned_share-of-single-parent-households.csv", index=False)

In [503]:
#df_7.to_sql('single_parent_households', engine, if_exists='replace', index=False)

In [504]:
df_8 = pd.read_csv('../data/Raw/share-of-women-in-england-and-wales-who-have-ever-married-by-age.csv')

In [505]:
df_8.columns = df_8.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [506]:
df_8['code'] = df_8['code'].fillna('GBR')
df_8.sample(5)

Unnamed: 0,entity,code,year,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort
3,Men,GBR,20,2.4,2.2,6.0,6.2,1.9,0.7,0.3,0.1
41,Women,GBR,24,40.8,56.1,75.5,55.1,29.7,13.4,6.8,
45,Women,GBR,28,65.0,77.4,88.4,72.0,48.8,29.4,20.7,
65,Women,GBR,48,85.0,91.8,95.6,87.2,75.7,,,
66,Women,GBR,49,85.2,91.9,95.7,87.3,76.0,,,


In [507]:
df_8 = df_8.drop(columns=['code','proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort'])

df_8.rename(columns={
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort": "1900_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort": "1920_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort": "1940_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort": "1960_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort": "1970_birthcohort",
    "entity": "sex"
}, inplace=True)

df_8.drop_duplicates(inplace=True)
df_8.sample(5)

Unnamed: 0,sex,year,1900_birthcohort,1920_birthcohort,1940_birthcohort,1960_birthcohort,1970_birthcohort
58,Women,41,82.9,90.7,95.1,85.7,72.7
52,Women,35,79.0,88.3,93.8,82.8,66.7
11,Men,28,62.7,66.3,77.7,56.8,33.1
65,Women,48,85.0,91.8,95.6,87.2,75.7
15,Men,32,79.3,82.2,86.5,68.8,48.2


In [508]:
df_8.isnull().sum()

sex                 0
year                0
1900_birthcohort    0
1920_birthcohort    0
1940_birthcohort    0
1960_birthcohort    0
1970_birthcohort    0
dtype: int64

In [509]:
#df_8.to_csv("cleaned_share-of-women-in-england-and-wales-who-have-ever-married-by-age.csv", index=False)

In [510]:
#df_8.to_sql('women_in_england_and_wales_married_by_age', engine, if_exists='replace', index=False)

In [511]:
#pip install openpyxl pywin32

In [512]:
df_excel_1 = pd.read_excel('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx')

In [513]:
#all_sheets = pd.read_excel('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx', sheet_name=None)

In [514]:
xls_1 = pd.ExcelFile('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx')
print(xls_1.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'MARITAL_STATUS_BY_AGE', 'CURRENTLY MARRIED', 'EVER_MARRIED', 'SMAM']


In [515]:
excel_1 = '../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx'

# Output directory (make sure it exists)
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)

# List of sheets you want to extract
sheets_to_extract = ['MARITAL_STATUS_BY_AGE', 'CURRENTLY MARRIED', 'EVER_MARRIED', 'SMAM']

In [516]:
"""for sheet in sheets_to_extract:
    # Read just this sheet into a DataFrame
    df_excel_1 = pd.read_excel(excel_1, sheet_name=sheet)
    
    # Optional: Clean the filename (replace spaces with underscores, etc.)
    csv_name = sheet.replace(' ', '_').lower() + '.csv'
    csv_path = os.path.join(output_dir, csv_name)
    
    # Save the DataFrame as CSV
    df_excel_1.to_csv(csv_path, index=False)
    print(f"Saved: {csv_path}")
"""

'for sheet in sheets_to_extract:\n    # Read just this sheet into a DataFrame\n    df_excel_1 = pd.read_excel(excel_1, sheet_name=sheet)\n    \n    # Optional: Clean the filename (replace spaces with underscores, etc.)\n    csv_name = sheet.replace(\' \', \'_\').lower() + \'.csv\'\n    csv_path = os.path.join(output_dir, csv_name)\n    \n    # Save the DataFrame as CSV\n    df_excel_1.to_csv(csv_path, index=False)\n    print(f"Saved: {csv_path}")\n'

In [517]:
xls_2 = pd.ExcelFile('../data/Raw/undesa_pd_2019_world_fertility_dataset.xlsx')
print(xls_2.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'FERTILITY INDICATORS']


In [518]:
excel_2 = '../data/Raw/undesa_pd_2019_world_fertility_dataset.xlsx'
sheet_name = 'FERTILITY INDICATORS'
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)

df_excel_2 = pd.read_excel(excel_2, sheet_name=sheet_name)


In [519]:
"""csv_name = sheet_name.replace(' ', '_').lower() + '.csv'
csv_path = os.path.join(output_dir, csv_name)
df_excel_2.to_csv(csv_path, index=False)
print(f"Saved: {csv_path}")
"""

'csv_name = sheet_name.replace(\' \', \'_\').lower() + \'.csv\'\ncsv_path = os.path.join(output_dir, csv_name)\ndf_excel_2.to_csv(csv_path, index=False)\nprint(f"Saved: {csv_path}")\n'

In [520]:
xls_3 = pd.ExcelFile('../data/Raw/undesa_pd_ds_1970-2030_fp_rev-2024_rev.xlsx')
print(xls_3.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'Countries', 'Regions']


In [521]:
excel_3 = '../data/Raw/undesa_pd_ds_1970-2030_fp_rev-2024_rev.xlsx'
sheets_to_extract = ['Countries', 'Regions']
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)


In [522]:
"""
for sheet in sheets_to_extract:
    df = pd.read_excel(excel_3, sheet_name=sheet)
    csv_name = sheet.replace(' ', '_').lower() + '.csv'
    csv_path = os.path.join(output_dir, csv_name)
    df.to_csv(csv_path, index=False)
    print(f"Saved: {csv_path}")

"""

'\nfor sheet in sheets_to_extract:\n    df = pd.read_excel(excel_3, sheet_name=sheet)\n    csv_name = sheet.replace(\' \', \'_\').lower() + \'.csv\'\n    csv_path = os.path.join(output_dir, csv_name)\n    df.to_csv(csv_path, index=False)\n    print(f"Saved: {csv_path}")\n\n'

In [523]:
df_9 = pd.read_csv('../data/Raw/unpopulation_dataportal_20250728095844.csv')
df_9.sample(5)

Unnamed: 0,IndicatorId,IndicatorName,IndicatorShortName,Source,SourceYear,Author,LocationId,Location,Iso2,Iso3,...,AgeStart,AgeEnd,Age,CategoryId,Category,EstimateTypeId,EstimateType,EstimateMethodId,EstimateMethod,Value
13819,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,470,Malta,MT,MLT,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,53.8
20009,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,690,Seychelles,SC,SYC,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,48.46
21439,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,740,Suriname,SR,SUR,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,56.05
12782,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,438,Liechtenstein,LI,LIE,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,55.89
4571,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,158,"China, Taiwan Province of China",TW,TWN,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,49.82


In [524]:
df_9.columns = df_9.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_9.sample(5)

Unnamed: 0,indicatorid,indicatorname,indicatorshortname,source,sourceyear,author,locationid,location,iso2,iso3,...,agestart,ageend,age,categoryid,category,estimatetypeid,estimatetype,estimatemethodid,estimatemethod,value
18388,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,638,Réunion,RE,REU,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,54.51
23189,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,798,Tuvalu,TV,TUV,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,62.71
7230,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,242,Fiji,FJ,FJI,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,60.1
11577,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,408,Dem. People's Rep. of Korea,KP,PRK,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,60.47
24275,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,854,Burkina Faso,BF,BFA,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,76.88


In [525]:
df_9 = df_9.drop(columns=['indicatorid','indicatorshortname',
    'source',
    'author', 'locationid', 'iso2','estimatetypeid','category','categoryid','agestart','ageend','author','ageid', 'estimatetype','variantid','sexid','timeid'])

df_9.rename(columns={
    "sourceyear": "year",
    "location": "country",
    "estimatemethodid": "estimate_method",
    "iso3": "code",
}, inplace=True)



In [526]:
df_9.drop_duplicates(inplace=True)

In [527]:
df_9

Unnamed: 0,indicatorname,year,country,code,time,variant,sex,age,estimate_method,estimatemethod,value
0,Currently married (Percent),2024,Afghanistan,AFG,1970,Median,Female,15-49,2,Interpolation,80.94
2,Currently married (Percent),2024,Afghanistan,AFG,1971,Median,Female,15-49,2,Interpolation,80.90
4,Currently married (Percent),2024,Afghanistan,AFG,1972,Median,Female,15-49,2,Interpolation,80.87
6,Currently married (Percent),2024,Afghanistan,AFG,1973,Median,Female,15-49,2,Interpolation,80.84
8,Currently married (Percent),2024,Afghanistan,AFG,1974,Median,Female,15-49,2,Interpolation,80.53
...,...,...,...,...,...,...,...,...,...,...,...
25078,Currently married (Percent),2024,Zambia,ZMB,2021,Median,Female,15-49,3,Projection,54.31
25080,Currently married (Percent),2024,Zambia,ZMB,2022,Median,Female,15-49,3,Projection,53.82
25082,Currently married (Percent),2024,Zambia,ZMB,2023,Median,Female,15-49,3,Projection,53.35
25084,Currently married (Percent),2024,Zambia,ZMB,2024,Median,Female,15-49,3,Projection,52.91


In [528]:
df_9.isnull().sum()

indicatorname      0
year               0
country            0
code               0
time               0
variant            0
sex                0
age                0
estimate_method    0
estimatemethod     0
value              0
dtype: int64

In [529]:
#df_9.to_csv("cleaned_unpopulation_dataportal.csv", index=False)

In [530]:
#df_9.to_sql('unpopulation_dataportal', engine, if_exists='replace', index=False)

In [531]:
df_10 = pd.read_csv('../data/processed/countries_un.csv',  header=5, low_memory=False)

In [532]:
df_10.columns = (
    df_10.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
)
df_10.sample(10)

Unnamed: 0,countryorarea,isocode,indicator,year,agegroup,percentage,number,dataprocess
66275,Jordan,400,Married or in-union women,1992,30-34,82.232353,107.569785,Estimate
85413,Montenegro,499,Married or in-union women,2035,40-44,81.860006,14.328775,Projection
2387,American Samoa,16,Married or in-union women,2025,30-34,64.683712,0.876464,Projection
59568,India,356,Married or in-union women,2045,15-19,5.903988,3152.454879,Projection
120861,South Africa,710,Married or in-union women,2011,40-44,59.711765,983.968077,Estimate
109208,Rwanda,646,Married or in-union women,2013,15-19,3.77332,22.38241,Estimate
127757,Tajikistan,762,Married or in-union women,1982,40-44,87.615122,82.660487,Estimate
77963,Malawi,454,Married or in-union women,1995,30-34,84.543478,286.705534,Estimate
1040,Albania,8,Married or in-union women,2019,15-19,6.625204,6.560442,Projection
75668,Lithuania,440,Married or in-union women,2032,35-39,73.830154,66.179135,Projection


In [533]:
df_10.rename(columns={
    "dataprocess": "data_process",
}, inplace=True)

df_10.drop_duplicates(inplace=True)
df_10.sample(5)

Unnamed: 0,countryorarea,isocode,indicator,year,agegroup,percentage,number,data_process
92366,New Caledonia,540,Married or in-union women,2013,45-49,69.48472,6.443665,Estimate
99325,Pakistan,586,Married or in-union women,1992,40-44,91.275,2189.05654,Estimate
133006,Türkiye,792,Married or in-union women,1990,45-49,90.965294,1048.385931,Estimate
87631,Oman,512,Married or in-union women,1988,15-49,68.767924,204.403026,Estimate
144071,Samoa,882,Married or in-union women,1996,15-49,58.195037,23.57481,Estimate


In [534]:
for col in ['percentage', 'number']:
    if col in df_10.columns:
        df_10[col] = (
            df_10[col]
            .astype(str)
            .str.replace(',', '.', regex=False)
            .str.extract(r'([-+]?[0-9]*\.?[0-9]+)', expand=False)
            .astype(float)
            .round(2)
        )

In [535]:
unnamed_cols = [col for col in df_10.columns if 'unnamed' in col.lower()]
df_10.drop(columns=unnamed_cols, inplace=True)

In [536]:
df_10.dropna(inplace=True)

In [537]:
df_10.isnull().sum()

countryorarea    0
isocode          0
indicator        0
year             0
agegroup         0
percentage       0
number           0
data_process     0
dtype: int64

In [538]:
df_10.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145800 entries, 0 to 145799
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   countryorarea  145800 non-null  object 
 1   isocode        145800 non-null  int64  
 2   indicator      145800 non-null  object 
 3   year           145800 non-null  int64  
 4   agegroup       145800 non-null  object 
 5   percentage     145800 non-null  float64
 6   number         145800 non-null  float64
 7   data_process   145800 non-null  object 
dtypes: float64(2), int64(2), object(4)
memory usage: 8.9+ MB


In [539]:
#df_10.to_csv("cleaned_countries_1970_2025_un.csv", index=False)

In [540]:
#df_10.to_sql('countries_1970_2025_un', engine, if_exists='replace', index=False)

In [541]:
df_11 = pd.read_csv('../data/processed/currently_married_un.csv',  header=2, low_memory=False)

In [542]:
df_11.sample(8)

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,AgeGroup,AgeStart,AgeEnd,DataValue,DataProcess,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Data,Note on Country and Population
27615,Lebanon,422,2004,2004,Women,[65+],65,999,42.39,Survey,2004 HLCS,2423,Lebanon 2004 National Survey of Household Livi...,National statistics,,,
8208,"China, Hong Kong SAR",344,2006,2006,Men,[40-44],40,44,77.77,Census,2006 Census,1603,"China, Hong Kong (SAR) 2006 Census",UNSD,,Based on the results of a sample survey.,Data pertain to resident population only.
40608,Saint Kitts and Nevis,659,1980,1980,Women,[45-49],45,49,51.54,Census,1980 Census,2535,Saint Kitts and Nevis 1980 Census,UNSD,,,Including Anguilla.
41268,San Marino,674,1972,1972,Women,[25-29],25,29,88.22,Estimate,1972 Estimate,2208,San Marino 1972 Estimate,UNSD,,,
27614,Lebanon,422,2004,2004,Women,[60-64],60,64,65.86,Survey,2004 HLCS,2423,Lebanon 2004 National Survey of Household Livi...,National statistics,,,
12659,Ecuador,218,1982,1982,Men,[40-44],40,44,88.9,Survey,1982 ESMIVD,38,Ecuador 1982 Encuesta Nacional de Salud Matern...,US Census Bureau,1.0,,
17701,Ghana,288,2014,2014,Women,[65-69],65,69,35.8,Survey,2014 DHS,5778,Ghana 2014 Demographic and Health Survey,DHS_HH,,,
33746,Netherlands,528,1992,1992,Men,[25-29],25,29,36.06,Estimate,1992 Estimate,2170,Netherlands 1992 Estimate,UNSD,,,


In [543]:
df_11.columns = (
    df_11.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
    )
df_11.sample(8)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,agegroup,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteondata,noteoncountryandpopulation
28043,Liberia,430,1970,1971,Men,[55-59],55,59,83.39,Dual record,1971 PGS,4620,Liberia 1971 Population Growth Survey,UNSD,1.0,,
13783,Faeroe Islands,234,1970,1970,Men,[25-29],25,29,60.23,Census,1970 Census,2352,Faroe Islands 1970 Census,UNSD,1.0,,
43122,Seychelles,690,1982,1982,Women,[20-24],20,24,44.43,Estimate,1982 Estimate,2213,Seychelles 1982 Estimate,UNSD,,,
20378,Hungary,348,1991,1991,Men,[60-64],60,64,83.57,Estimate,1991 Estimate,2120,Hungary 1991 Estimate,UNSD,,,
39835,Romania,642,2002,2002,Women,[55-59],55,59,68.87,Census,2002 Census,327,Romania 2002 Census,UNSD,,,
49253,Togo,768,2017,2017,Men,[55-59],55,59,89.82,Survey,2017 MIS,7503,Togo 2017 Malaria Indicator Survey,National statistics,,,
29340,Lithuania,440,2016,2016,Men,[15-19],15,19,0.0,Estimate,2016 Estimate,2146,Lithuania 2016 Estimate,UNSD,,,
42710,Senegal,686,2010,2011,Women,[15-19],15,19,24.3,Survey,2010-2011 DHS-MICS,5274,Senegal 2010-2011 Enquête Démographique et de...,DHS_STATcompiler,1.0,,


In [544]:
df_11 = df_11.drop(columns = ['datacataloglongname', 'datacatalogid', 'yearstart' , 'yearend', 'noteondata', 'noteoncountryandpopulation', 'including_consensual_unions'])

df_11.rename(columns={
    "agestart": "age_start",
    "countryorarea": "country",
    "datasource": "data_source",
    "datavalue" : "data_value"
}, inplace=True)

df_11.sample(10)

Unnamed: 0,country,isocode,sex,agegroup,age_start,ageend,data_value,dataprocess,datacatalogshortname,data_source
47307,Sweden,752,Women,[55-59],55,59,58.66,Estimate,2009 Estimate,UNSD
23213,Iraq,368,Women,[10-14],10,14,3.87,Census,1977 Census,UNSD
43403,Sierra Leone,694,Women,[75+],75,999,12.2,Survey,2013 DHS,DHS_HH
52969,Yemen,887,Women,[35-39],35,39,92.5,Census,1994 Census,UNSD
42294,Saudi Arabia,682,Women,[45-49],45,49,84.85,Estimate,1999 Estimate,UNSD
18985,Guinea,324,Women,[45-49],45,49,92.5,Survey,2012 DHS,DHS_HH
3638,Benin,204,Women,[20-24],20,24,69.3,Survey,2006 DHS,DHS_HH
51917,United States Virgin Islands,850,Women,[65+],65,999,35.67,Census,2010 Census,US Census Bureau
16428,French Guiana,254,Women,[35-39],35,39,49.44,Census,1982 Census,UNSD
48136,Switzerland,756,Men,[75+],75,999,71.02,Estimate,2008 Estimate,UNSD


In [545]:
df_11.drop_duplicates(inplace=True)

In [546]:
df_11.isnull().sum()

country                 0
isocode                 0
sex                     0
agegroup                0
age_start               0
ageend                  0
data_value              0
dataprocess             0
datacatalogshortname    0
data_source             0
dtype: int64

In [547]:
#df_11.to_csv("cleaned_currently_married_un.csv", index=False)

In [548]:
#df_11.to_sql('currently_married_un', engine, if_exists='replace', index=False)

In [549]:
df_12 = pd.read_csv('../data/processed/ever_married_un.csv', header= 2, low_memory = False)
df_12.head()

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,AgeGroup,AgeStart,AgeEnd,DataValue,DataProcess,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Data,Note on Country and Population
0,Afghanistan,4,1972,1974,Men,[15-19],15,19,7.7,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
1,Afghanistan,4,1972,1974,Men,[20-24],20,24,32.6,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
2,Afghanistan,4,1972,1974,Men,[25-29],25,29,61.4,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
3,Afghanistan,4,1972,1974,Men,[30-34],30,34,83.0,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
4,Afghanistan,4,1972,1974,Men,[35-39],35,39,91.2,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,


In [550]:
df_12.columns = (
    df_12.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
    )
df_12.sample(8)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,agegroup,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteondata,noteoncountryandpopulation
42606,Saint Kitts and Nevis,659,1970,1970,Men,[30-34],30,34,35.0,Census,1970 Census,2534,Saint Kitts and Nevis 1970 Census,US Census Bureau,,,Data pertain to St Kitts only.
40527,Poland,616,1996,1996,Women,[75+],75,999,92.73,Estimate,1996 Estimate,2192,Poland 1996 Estimate,UNSD,,,Data pertain to nationals only.
22433,Hungary,348,2017,2017,Men,[40-44],40,44,63.71,Estimate,2017 Estimate,2120,Hungary 2017 Estimate,UNSD,1.0,,
10138,Costa Rica,188,2016,2016,Men,[75+],75,999,92.41,Estimate,2016 Estimate,2075,Costa Rica 2016 Estimate,UNSD,1.0,,
46131,Slovakia,703,2018,2018,Women,[75+],75,999,96.34,Estimate,2018 Estimate,2216,Slovakia 2018 Estimate,UNSD,,,
44750,Senegal,686,2010,2011,Women,[30-34],30,34,88.3,Survey,2010-2011 DHS-MICS,5274,Senegal 2010-2011 Enquête Démographique et de...,DHS_HH,,,
40415,Poland,616,1980,1980,Women,[65-69],65,69,92.29,Estimate,1980 Estimate,2192,Poland 1980 Estimate,UNSD,,,Data pertain to nationals only.
48064,Suriname,740,1999,2000,Men,[55-59],55,59,96.72,Survey,2000 MICS_HH,1945,Suriname 2000 Multiple Indicator Cluster Survey,MICS_HH,1.0,,


In [551]:
df_12 = df_12.drop(columns = ['yearstart', 'yearend', 'datacatalogshortname', 'datacatalogid', 'datacataloglongname', 'including_consensual_unions', 'noteondata', 'noteoncountryandpopulation'])

df_12.rename(columns={
    "agestart": "age_start",
    "ageend": "age_end",
    "countryorarea": "country"
}, inplace=True)
df_12.sample(8)

Unnamed: 0,country,isocode,sex,agegroup,age_start,age_end,datavalue,dataprocess,datasource
44024,San Marino,674,Women,[20-24],20,24,4.46,Estimate,UNSD
25602,Israel,376,Men,[25-29],25,29,61.02,Estimate,UNSD
52211,Turkey,792,Women,[60-64],60,64,98.6,Survey,DHS_HH
19283,Greenland,304,Women,[15-19],15,19,0.47,Estimate,UNSD
41245,Republic of Korea,410,Men,[60-64],60,64,99.54,Census,UNSD
47769,State of Palestine,275,Women,[30-34],30,34,86.2,Survey,National statistics
23070,Iceland,352,Women,[75+],75,999,84.17,Estimate,UNSD
11135,Czechia,203,Women,[60-64],60,64,97.41,Estimate,UNSD


In [552]:
df_12.dropna(inplace=True)

In [553]:
df_12.isnull().sum()

country        0
isocode        0
sex            0
agegroup       0
age_start      0
age_end        0
datavalue      0
dataprocess    0
datasource     0
dtype: int64

In [554]:
#df_12.to_csv("cleaned_ever_married_un.csv", index=False)

In [555]:
#df_12.to_sql('ever_married_un', engine, if_exists= 'replace', index= False)

In [556]:
df_13 = pd.read_csv('../data/processed/fertility_indicators_un.csv', header=6, low_memory=False)
df_13.head()

Unnamed: 0,Country or Area,Country or Area Code,Age Group,Indicator,Date,Value,Series,DataType,Data Source Type,Survey Programme,Data Source Inventory ID,Data Source Name,Data Source Name (short),Data Source Start Year,Data Source End Year,Reference,Reference Year
0,Afghanistan,4,[Total],TFR,1964.977051,7.966653,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
1,Afghanistan,4,[Total],TFR,1965.977051,8.212275,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
2,Afghanistan,4,[Total],TFR,1966.977051,8.317603,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
3,Afghanistan,4,[Total],TFR,1967.977051,8.225812,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
4,Afghanistan,4,[Total],TFR,1968.977051,8.068459,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012


In [557]:
df_13.columns = (df_13.columns
        .str.lower()
        .str.strip()
        .str.replace(' ', '')
        .str.replace('(', '')
        .str.replace(')', '')
        .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
        )

df_13.sample(6)

Unnamed: 0,countryorarea,countryorareacode,agegroup,indicator,date,value,series,datatype,datasourcetype,surveyprogramme,datasourceinventoryid,datasourcename,datasourcenameshort,datasourcestartyear,datasourceendyear,reference,referenceyear
75686,Uruguay,858,[30-34],ASFR3034,1998.5,95.61861,"Register,Computed rate from DYB,DYB,554-135-51",Computed rate from DYB,Register,VR,554,Vital Registration,Register,1998,1998,Demographic Yearbook,1998
55896,Philippines,608,[Total],TFR,1968.5,6.335,"1978 WFS,Birth Histories,1978 WFS report,821-6...",Birth histories,Survey,WFS,821,Philippines 1978 World Fertility Survey,1978 WFS,1978,1978,Philippines 1978 World Fertility Survey report,1979
71195,Trinidad and Tobago,780,[Total],MAC,1980.5,27.24595,"Register,Computed rate from DYB,DYB,567-135-38",Computed rate from DYB,Register,VR,567,Vital Registration,Register,1980,1980,Demographic Yearbook,1985
39585,Kenya,404,[30-34],ASFR3034,1985.999023,272.0,"2003 DHS,Direct,DHS,1692-16-39167",Direct,Survey,DHS,1692,Kenya 2003 Demographic and Health Survey,2003 DHS,2003,2003,DHS Statcompiler,2012
25365,France,250,[Total],TFR,1958.5,2.6716,"Estimates,Fertility data (Adjusted),HFC-ODE,20...",Fertility data (adjusted),Estimate,Estimate,2094,All sources of estimates,Estimates,1958,1958,European Demographic Observatory (ODE). Data c...,2011
20350,Dem. People's Rep. of Korea,408,[Total],TFR,2004.248634,1.957114,"2008 Census,Reverse survival,Spoorenberg (2014...",Reverse survival method,Census,Census,2410,Democratic People's Republic of Korea 2008 Census,2008 Census,2008,2008,"Spoorenberg, T. ""Fertility levels and trends i...",2014


In [558]:
df_13 = df_13.drop(columns=['countryorareacode','indicator','datasourceinventoryid','surveyprogramme','series','datasourcename','reference','referenceyear'])

df_13.replace({
    "agegroup": "age_group",
    "countryorarea": "country",
    "datatype": "data_type",
},inplace=True)

In [559]:
df_13['date'] = df_13['date'].astype(int)
df_13['value'] = df_13['value'].round(2)
df_13.sample(12)

Unnamed: 0,countryorarea,agegroup,date,value,datatype,datasourcetype,datasourcenameshort,datasourcestartyear,datasourceendyear
58118,Republic of Korea,[30-34],2013,111.4,Direct,Register,Register,2013,2013
35389,Iran (Islamic Republic of),[25-29],1989,240.8,Own-children method,Census,1996 Census,1996,1996
3707,Austria,[30-34],1974,54.47,Official estimates,Estimate,Estimates,1974,1974
18863,Curaçao,[35-39],2011,48.53,Recent births,Census,2011 Census,2011,2011
59989,Russian Federation,[35-39],1999,11.1,Direct,Register,Register,1999,1999
12821,Canada,[25-29],2012,95.7,Direct,Register,Register,2012,2012
10125,Brazil,[45-49],2015,0.7,Fertility data (adjusted),Register,Register,2015,2015
79745,Zimbabwe,[35-39],2012,108.65,Birth histories,Survey,2014 MICS,2014,2014
44527,Malawi,[20-24],2014,230.39,Birth histories,Survey,2017 MIS,2017,2017
23403,Eritrea,[Total],1999,5.24,Direct,Survey,2002 DHS,2002,2002


In [560]:
#df_13.to_csv("cleaned_fertility_indicators.csv", index=False)

In [561]:
#df_13.to_sql('fertility_indicators_un',engine, if_exists='replace', index=False)

In [562]:
df_14 = pd.read_csv('../data/processed/marital_status_by_age_un.csv', header= 2, low_memory=False)
df_14.head()

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,MaritalStatus,Non-standard_AgeGroups,Series_contains_Non-standard_AgeGroups,AgeGroup,AgeStart,...,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Age groups,Note on Marital Status,Note on Data,Note on Country and Population,Note Other
0,Afghanistan,4,1972,1974,Men,Divorced,,,[15-19],15,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
1,Afghanistan,4,1972,1974,Men,Divorced,,,[20-24],20,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
2,Afghanistan,4,1972,1974,Men,Divorced,,,[25-29],25,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
3,Afghanistan,4,1972,1974,Men,Divorced,,,[30-34],30,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
4,Afghanistan,4,1972,1974,Men,Divorced,,,[35-39],35,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,


In [563]:
df_14.columns= (df_14.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '' , regex=True)  
    )
df_14.sample(5)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,maritalstatus,nonstandard_agegroups,series_contains_nonstandard_agegroups,agegroup,agestart,...,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteonagegroups,noteonmaritalstatus,noteondata,noteoncountryandpopulation,noteother
66534,Dominican Republic,214,2007,2007,Men,Divorced,,,[30-34],30,...,2007 DHS,49,Dominican Republic 2007 Demographic and Health...,DHS_HH,,,,,,
98287,Guatemala,320,2014,2015,Men,Never married,,,[25-29],25,...,2014-2015 DHS,5790,Guatemala 2014-2015 Demographic and Health Survey,DHS_HH,,,,,,
134351,Kenya,404,2009,2009,Men,Single,,,[70-74],70,...,2009 Census,2409,Kenya 2009 Census,IPUMS,,,Data pertain to monogamous and polygamous marr...,Data are based on a 10 per cent sample of cens...,,
227292,South Sudan,728,1992,1993,Men,Widowed,,,[10-14],10,...,1992-1993 MCH,5237,Sudan 1992-1993 Maternal and Child Health Survey,INED,,,,,,
247456,Trinidad and Tobago,780,1980,1980,Men,Widowed,,,[30-34],30,...,1980 Census,1238,Trinidad and Tobago 1980 Census,UNSD,,,,,,


In [564]:
df_14 = df_14.drop(columns=['datacataloglongname', 'noteondata', 'noteoncountryandpopulation','noteonagegroups', 'noteother',
                             'including_consensual_unions','isocode', 'datacatalogid', 'noteonmaritalstatus', 'series_contains_nonstandard_agegroups','nonstandard_agegroups'])

df_14.rename(columns={
    "countryorarea": "country",
    "agegroup": "age_group",
    "maritalstatus": "marital_status",
    "yearstart": "year_start",
    "yearend": "year_end",
    }, inplace =True
    )

df_14.sample(10)

Unnamed: 0,country,year_start,year_end,sex,marital_status,age_group,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datasource
121254,Ireland,1991,1991,Women,Divorced or Separated,[60-64],60,64,2.02,Census,1991 Census,UNSD
190896,Paraguay,1992,1992,Women,Married,[45-49],45,49,63.97,Census,1992 Census,UNSD
167401,Namibia,2016,2016,Women,Married,[55-59],55,59,42.31,Survey,2016 NIDS,National statistics
233796,Sweden,1980,1980,Women,Widowed,[70-74],70,74,37.63,Estimate,1980 Estimate,UNSD
26637,Burkina Faso,1998,1999,Women,Divorced,[20-24],20,24,0.2,Survey,1998-1999 DHS,DHS_STATcompiler
169769,Netherlands,1976,1976,Women,Divorced,[20-24],20,24,0.9,Estimate,1976 Estimate,UNSD
75829,Fiji,2007,2007,Women,Separated,[65-69],65,69,1.97,Census,2007 Census,IPUMS
229838,State of Palestine,2007,2007,Men,Widowed,[15-19],15,19,0.01,Census,2007 Census,UNSD
162157,Mongolia,2010,2010,Women,Divorced,[40-44],40,44,6.58,Survey,2010 MICS,MICS
197358,Portugal,1991,1991,Men,Consensual union,[25-29],25,29,3.4,Census,1991 Census,UNSD


In [565]:
df_14.drop_duplicates(inplace=True)
df_14.isnull().sum()

country                 0
year_start              0
year_end                0
sex                     0
marital_status          0
age_group               0
agestart                0
ageend                  0
datavalue               0
dataprocess             0
datacatalogshortname    0
datasource              0
dtype: int64

In [566]:
#df_14.to_csv("cleaned_marital_status_by_age_un.csv", index=False)

In [567]:
#df_14.to_sql('marital_status_by_age_un', engine, if_exists='replace', index=False)

In [568]:
df_15 = pd.read_csv('../data/processed/regions_un.csv', header=5, low_memory= False)
df_15.head(10)

Unnamed: 0,Region and subregion,ISO code,Regional Classification,Indicator,Year,AgeGroup,Percentage,Number,DataProcess
0,World,900,M49,Married or in-union women,1970,15-19,22.576683,71867.82,Estimate
1,World,900,M49,Married or in-union women,1970,20-24,63.802057,162860.4,Estimate
2,World,900,M49,Married or in-union women,1970,25-29,87.174827,182681.1,Estimate
3,World,900,M49,Married or in-union women,1970,30-34,90.825027,179121.4,Estimate
4,World,900,M49,Married or in-union women,1970,35-39,90.284386,161526.3,Estimate
5,World,900,M49,Married or in-union women,1970,40-44,86.483531,139334.4,Estimate
6,World,900,M49,Married or in-union women,1970,45-49,82.680237,116088.4,Estimate
7,World,900,M49,Married or in-union women,1970,15-49,69.379111,1013480.0,Estimate
8,World,900,M49,Married or in-union women,1971,15-19,22.630416,74127.62,Estimate
9,World,900,M49,Married or in-union women,1971,20-24,63.613178,170087.3,Estimate


In [569]:
df_15.columns = (df_15.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(','')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
    )
df_15.sample(6)

Unnamed: 0,regionandsubregion,isocode,regionalclassification,indicator,year,agegroup,percentage,number,dataprocess
9767,Eastern Asia,906,SDG-M49,Married or in-union women,1975,15-49,66.968224,322432.884345,Estimate
4699,Africa,903,M49,Married or in-union women,1990,30-34,85.131874,34126.342615,Estimate
26198,Lower-middle-income countries,1501,Income group,Married or in-union women,2004,45-49,83.860749,101218.224667,Estimate
18342,Northern America,905,SDG-M49,Married or in-union women,1994,45-49,74.839975,7242.685734,Estimate
23582,Developing countries,902,Development group,Married or in-union women,2001,45-49,85.65871,109649.936047,Estimate
24645,Least developed countries,941,Development group,Married or in-union women,1972,40-44,81.811706,6124.304019,Estimate


In [570]:
df_15 = df_15.drop(columns=['regionalclassification'])

df_15.rename(columns={
    "regionandsubregion": "region",
    "isocode": "iso_code",
    "agegroup": "age_group",
    "dataprocess": "process"
}, inplace=True)

df_15.sample(10)

Unnamed: 0,region,iso_code,indicator,year,age_group,percentage,number,process
10472,Southern Asia,5501,Married or in-union women,1983,15-19,43.846486,44053.400027,Estimate
20757,Melanesia,928,Married or in-union women,1972,40-44,88.485776,146.312115,Estimate
2356,Central and Southern Asia,62,Married or in-union women,2021,35-39,91.94047,136964.708592,Estimate
6198,Middle Africa,911,Married or in-union women,2015,45-49,72.67201,4107.098613,Estimate
28014,No income group available,1518,Married or in-union women,1988,45-49,67.624518,480.184118,Estimate
26508,Lower-middle-income countries,1501,Married or in-union women,2043,35-39,87.264577,258730.839269,Projection
3142,Eastern and South-Eastern Asia,753,Married or in-union women,2038,45-49,90.970357,152315.920372,Projection
9799,Eastern Asia,906,Married or in-union women,1979,15-49,67.668969,360776.086395,Estimate
22300,Polynesia,957,Married or in-union women,2003,35-39,80.992791,35.15978,Estimate
13372,Eastern Europe,923,Married or in-union women,2021,35-39,71.82951,8447.925465,Estimate


In [571]:
df_15.dropna(inplace=True)
df_15.isnull().sum()

region        0
iso_code      0
indicator     0
year          0
age_group     0
percentage    0
number        0
process       0
dtype: int64

In [572]:
print(df_15['number'] % 1 != 0)

0        True
1        True
2        True
3        True
4        True
         ... 
28507    True
28508    True
28509    True
28510    True
28511    True
Name: number, Length: 28512, dtype: bool


In [573]:
df_15['percentage'] = df_15['percentage'].round(2)
df_15['number'] = df_15['number'].astype(int)
df_15.head(10)

Unnamed: 0,region,iso_code,indicator,year,age_group,percentage,number,process
0,World,900,Married or in-union women,1970,15-19,22.58,71867,Estimate
1,World,900,Married or in-union women,1970,20-24,63.8,162860,Estimate
2,World,900,Married or in-union women,1970,25-29,87.17,182681,Estimate
3,World,900,Married or in-union women,1970,30-34,90.83,179121,Estimate
4,World,900,Married or in-union women,1970,35-39,90.28,161526,Estimate
5,World,900,Married or in-union women,1970,40-44,86.48,139334,Estimate
6,World,900,Married or in-union women,1970,45-49,82.68,116088,Estimate
7,World,900,Married or in-union women,1970,15-49,69.38,1013479,Estimate
8,World,900,Married or in-union women,1971,15-19,22.63,74127,Estimate
9,World,900,Married or in-union women,1971,20-24,63.61,170087,Estimate


In [574]:
#df_15.to_csv('cleaned_regions_un.csv', index=False)



In [575]:
#df_15.to_sql('regions_un', engine, if_exists='replace',index=False)

In [576]:
#pip install "xlrd==1.2.0"


In [577]:
excel_1_1 = pd.read_excel('../data/Raw/OECD/SF_1_1_Family_size_and_composition.xlsx')
excel_1_1.head(10)

Unnamed: 0.1,Unnamed: 0,"Chart SF1.1.A. Average size of households by household type, 2024a",Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,"Data for Chart SF1.1.A. Average size of households by household type, 2024a",Unnamed: 13,Unnamed: 14,Unnamed: 15
0,,"Mean average number of people per household, b...",,,,,,,,,,,"Mean average number of people per household, b...",,,
1,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,All households,Couple households with children,Single parent households with children
3,,,,,,,,,,,,,Mexico,3.56,4.08,2.76
4,,,,,,,,,,,,,Costa Rica,3.462513,4.372663,3.443867
5,,,,,,,,,,,,,Türkiye,3.2,4.1,2.8
6,,,,,,,,,,,,,Israel,3.19,4.649476,2.863297
7,,,,,,,,,,,,,Columbia,3.100732,,
8,,,,,,,,,,,,,Slovak Republic,3.1,3.8,2.5
9,,,,,,,,,,,,,Chile,2.8,,


In [578]:
file_path  = '../data/Raw/OECD/SF_1_1_Family_size_and_composition.xlsx'
output_csv = '../data/processed/household_oecd_combined.csv'

xls = pd.ExcelFile(file_path)

# ----------------- yardımcılar -----------------
def find_row_with_value(df, pattern):
    """"Dataframe first row that matches the pattern."""
    pat = re.compile(pattern, re.IGNORECASE)
    for i in range(df.shape[0]):
        row = df.iloc[i].astype(str).tolist()
        if any(pat.search(str(x)) for x in row):
            return i
    return 0

def pick_country_column(df):
    """'Unnamed' kolonlar arasından ülke adlarının olduğu en dolu olanı seç."""
    candidates = [c for c in df.columns if str(c).lower().startswith("unnamed")]
    if not candidates:
        return df.columns[0]
    return max(candidates, key=lambda c: df[c].astype(str).ne("nan").sum())

# ----------------- Sheet 1: Chart SF1.1.A -----------------
# Automatical finding of header row
_chart_raw = pd.read_excel(xls, sheet_name='Chart SF1.1.A', header=None)
chart_header_row = find_row_with_value(_chart_raw, r"\bAll households\b")

chart_df = pd.read_excel(xls, sheet_name='Chart SF1.1.A', header=chart_header_row)
country_col_chart = pick_country_column(chart_df)

chart_df = chart_df.rename(columns={country_col_chart: "country"})
rename_chart = {}
for c in chart_df.columns:
    lc = str(c).lower()
    if "all households" in lc:
        rename_chart[c] = "avg_size_all"
    elif "couple households with children" in lc:
        rename_chart[c] = "avg_size_couple_with_children"
    elif "single parent households with children" in lc:
        rename_chart[c] = "avg_size_single_parent_with_children"

chart_df = chart_df[["country"] + list(rename_chart.keys())].rename(columns=rename_chart)
chart_df["country"] = chart_df["country"].astype(str).str.strip()
for col in rename_chart.values():
    chart_df[col] = pd.to_numeric(chart_df[col], errors="coerce")
chart_df = chart_df.dropna(subset=[c for c in chart_df.columns if c != "country"], how="all")

# ----------------- Sheet 2: Table SF1.1.A (çok satırlı başlık) -----------------
_table_a_raw = pd.read_excel(xls, sheet_name='Table SF1.1.A', header=None)
table_a_header_row_top = find_row_with_value(_table_a_raw, r"Couple households")
table_a = pd.read_excel(xls, sheet_name='Table SF1.1.A',
                        header=[table_a_header_row_top, table_a_header_row_top+1])

# Ülke kolonu: en dolu Unnamed çifti
mi_cols = table_a.columns
unnamed_pairs = [c for c in mi_cols
                 if str(c[0]).lower().startswith("unnamed")
                 and str(c[1]).lower().startswith("unnamed")]
country_multi_col = max(unnamed_pairs, key=lambda c: table_a[c].astype(str).ne("nan").sum())

def match_best(mi_cols, top_label, sub_keyword):
    """Find best matching column in multi-index columns."""
    for c in mi_cols:
        top = str(c[0]).strip()
        sub = str(c[1]).strip()
        if top == top_label:
            if not sub_keyword:
                return c  # Get first match (e.g., "Single person" / "Other" sub-headings)
            if sub_keyword.lower() in sub.lower():
                return c
    return None

specs = [
    ("Couple households:", "Total",             "share_couple_total"),
    ("Couple households:", "With children",     "share_couple_with_children"),
    ("Couple households:", "Without children",  "share_couple_without_children"),
    ("Single parent households:", "Total",      "share_single_parent_total"),
    ("Single parent households:", "Single mother households", "share_single_mother"),
    ("Single parent households:", "Single father households", "share_single_father"),
    ("Single person households", "",            "share_single_person"),
    ("Other household types", "",              "share_other_types"),
]

selected_cols = {country_multi_col: "country"}
for top, sub, name in specs:
    c = match_best(mi_cols, top, sub)
    if c is not None:
        selected_cols[c] = name

table_a_clean = table_a.loc[:, list(selected_cols.keys())].copy()
table_a_clean.columns = list(selected_cols.values())
table_a_clean["country"] = table_a_clean["country"].astype(str).str.strip()
for col in [c for c in table_a_clean.columns if c != "country"]:
    table_a_clean[col] = pd.to_numeric(table_a_clean[col], errors="coerce")
table_a_clean = table_a_clean.dropna(subset=[c for c in table_a_clean.columns if c != "country"], how="all")

# ----------------- Sheet 3: Table SF1.1.B -----------------
_table_b_raw = pd.read_excel(xls, sheet_name='Table SF1.1.B', header=None)
table_b_header_row = find_row_with_value(_table_b_raw, r"^0\s*children$")
table_b = pd.read_excel(xls, sheet_name='Table SF1.1.B', header=table_b_header_row)

country_col_b = pick_country_column(table_b)
table_b = table_b.rename(columns={country_col_b: "country"})

rename_b = {}
for c in table_b.columns:
    lc = str(c).lower()
    if lc == "country":
        continue
    if re.search(r"^0\s*children", lc):
        rename_b[c] = "share_hh_0_children"
    elif re.search(r"^1\s*child", lc):
        rename_b[c] = "share_hh_1_child"
    elif re.search(r"^2\s*children", lc):
        rename_b[c] = "share_hh_2_children"
    elif "3 or more children" in lc:
        rename_b[c] = "share_hh_3plus_children"
    elif ("under 6" in lc) or ("under six" in lc):
        rename_b[c] = "share_hh_with_child_under6"

table_b = table_b[["country"] + list(rename_b.keys())].rename(columns=rename_b)
table_b["country"] = table_b["country"].astype(str).str.strip()
for col in rename_b.values():
    table_b[col] = pd.to_numeric(table_b[col], errors="coerce")
table_b = table_b.dropna(subset=[c for c in table_b.columns if c != "country"], how="all")

# ----------------- Together+ Save -----------------
combined = (
    chart_df
    .merge(table_a_clean, on="country", how="outer")
    .merge(table_b,       on="country", how="outer")
    .sort_values("country")
    .reset_index(drop=True)
)

# Save the combined DataFrame to CSV
Path(output_csv).parent.mkdir(parents=True, exist_ok=True)
combined.to_csv(output_csv, index=False)
print(f"✅ Saved: {output_csv}")
display(combined.head(12))

✅ Saved: ../data/processed/household_oecd_combined.csv


Unnamed: 0,country,avg_size_all,avg_size_couple_with_children,avg_size_single_parent_with_children,share_couple_total,share_couple_with_children,share_couple_without_children,share_single_parent_total,share_single_mother,share_single_father,share_single_person,share_other_types,share_hh_0_children,share_hh_1_child,share_hh_2_children,share_hh_3plus_children
0,Australia,2.52758,3.932863,2.775636,55.926052,29.904701,26.021351,10.373549,,,25.124159,8.576123,,,,
1,Austria,2.2,3.8,2.5,48.927373,21.129493,27.79788,5.628256,4.77958,0.848677,38.337314,7.107057,77.781493,10.524053,8.574977,3.119476
2,Belgium,2.2,3.9,2.6,52.219429,23.979842,28.239587,7.423107,6.077464,1.345643,35.502304,4.855161,73.974236,11.757354,10.153817,4.112671
3,Bulgaria,2.2,3.5,2.3,40.303059,16.35041,23.95265,4.603051,3.875726,0.727325,35.80958,19.284309,78.210294,12.932273,7.480467,1.376966
4,Canada,2.425303,,,50.919441,25.300814,25.618627,8.715567,,,29.347961,11.017031,,,,
5,Chile,2.8,,,,,,,,,,,,,,
6,Columbia,3.100732,,,,,,,,,,,,,,
7,Costa Rica,3.462513,4.372663,3.443867,52.441873,38.147069,14.294803,10.548101,9.489556,1.058545,11.270909,25.739118,30.290198,23.077315,24.608947,22.02354
8,Croatia,2.6,3.9,2.6,51.508875,24.776445,26.732361,5.42231,4.386251,1.036059,27.80158,15.267514,74.183029,11.964128,10.095795,3.757049
9,Cyprus,2.5,3.7,2.4,56.920863,27.422456,29.498128,6.16526,4.936008,1.229252,24.490862,12.426368,71.356113,13.881474,11.665777,3.096636


In [579]:
df_16 = pd.read_csv('../data/processed/household_oecd_combined.csv')
df_16

Unnamed: 0,country,avg_size_all,avg_size_couple_with_children,avg_size_single_parent_with_children,share_couple_total,share_couple_with_children,share_couple_without_children,share_single_parent_total,share_single_mother,share_single_father,share_single_person,share_other_types,share_hh_0_children,share_hh_1_child,share_hh_2_children,share_hh_3plus_children
0,Australia,2.52758,3.932863,2.775636,55.926052,29.904701,26.021351,10.373549,,,25.124159,8.576123,,,,
1,Austria,2.2,3.8,2.5,48.927373,21.129493,27.79788,5.628256,4.77958,0.848677,38.337314,7.107057,77.781493,10.524053,8.574977,3.119476
2,Belgium,2.2,3.9,2.6,52.219429,23.979842,28.239587,7.423107,6.077464,1.345643,35.502304,4.855161,73.974236,11.757354,10.153817,4.112671
3,Bulgaria,2.2,3.5,2.3,40.303059,16.35041,23.95265,4.603051,3.875726,0.727325,35.80958,19.284309,78.210294,12.932273,7.480467,1.376966
4,Canada,2.425303,,,50.919441,25.300814,25.618627,8.715567,,,29.347961,11.017031,,,,
5,Chile,2.8,,,,,,,,,,,,,,
6,Columbia,3.100732,,,,,,,,,,,,,,
7,Costa Rica,3.462513,4.372663,3.443867,52.441873,38.147069,14.294803,10.548101,9.489556,1.058545,11.270909,25.739118,30.290198,23.077315,24.608947,22.02354
8,Croatia,2.6,3.9,2.6,51.508875,24.776445,26.732361,5.42231,4.386251,1.036059,27.80158,15.267514,74.183029,11.964128,10.095795,3.757049
9,Cyprus,2.5,3.7,2.4,56.920863,27.422456,29.498128,6.16526,4.936008,1.229252,24.490862,12.426368,71.356113,13.881474,11.665777,3.096636


In [580]:
# To numeric columns, except 'country'
metric_cols = [c for c in df_16.columns if c != 'country']
for c in metric_cols:
    df_16[c] = pd.to_numeric(df_16[c], errors='coerce')

# 3) Long/tidy verse format
tidy_16 = (df_16
           .melt(id_vars='country',
                 value_vars=metric_cols,
                 var_name='metric',
                 value_name='value')
           .dropna(subset=['value'])
           .sort_values(['country','metric'])
           .reset_index(drop=True))

# (opsiyonel) 
tidy_16['unit'] = tidy_16['metric'].apply(lambda m: 'persons' if m.startswith('avg_size') else 'percent')
def metric_group(m):
    if m.startswith('avg_size'): return 'average_size'
    if m.startswith('share_hh_'): return 'children_count_distribution'
    if m.startswith('share_couple'): return 'couple_households'
    if m.startswith('share_single_parent'): return 'single_parent_households'
    if m in ('share_single_mother','share_single_father'): return 'single_parent_gender'
    if m == 'share_single_person': return 'single_person_households'
    if m == 'share_other_types': return 'other_types'
    return 'other'
tidy_16['group'] = tidy_16['metric'].map(metric_group)

# 4) Gerekirse tidy üstünde oynadıktan sonra tekrar WIDE'a dön ve df_16'ya geri yaz
df_16 = (tidy_16.pivot_table(index='country', columns='metric', values='value', aggfunc='first')
         .reset_index()
         .reindex(columns=['country'] + metric_cols)   # orijinal kolon sırası
        )

In [581]:
df_16

metric,country,avg_size_all,avg_size_couple_with_children,avg_size_single_parent_with_children,share_couple_total,share_couple_with_children,share_couple_without_children,share_single_parent_total,share_single_mother,share_single_father,share_single_person,share_other_types,share_hh_0_children,share_hh_1_child,share_hh_2_children,share_hh_3plus_children
0,Australia,2.52758,3.932863,2.775636,55.926052,29.904701,26.021351,10.373549,,,25.124159,8.576123,,,,
1,Austria,2.2,3.8,2.5,48.927373,21.129493,27.79788,5.628256,4.77958,0.848677,38.337314,7.107057,77.781493,10.524053,8.574977,3.119476
2,Belgium,2.2,3.9,2.6,52.219429,23.979842,28.239587,7.423107,6.077464,1.345643,35.502304,4.855161,73.974236,11.757354,10.153817,4.112671
3,Bulgaria,2.2,3.5,2.3,40.303059,16.35041,23.95265,4.603051,3.875726,0.727325,35.80958,19.284309,78.210294,12.932273,7.480467,1.376966
4,Canada,2.425303,,,50.919441,25.300814,25.618627,8.715567,,,29.347961,11.017031,,,,
5,Chile,2.8,,,,,,,,,,,,,,
6,Columbia,3.100732,,,,,,,,,,,,,,
7,Costa Rica,3.462513,4.372663,3.443867,52.441873,38.147069,14.294803,10.548101,9.489556,1.058545,11.270909,25.739118,30.290198,23.077315,24.608947,22.02354
8,Croatia,2.6,3.9,2.6,51.508875,24.776445,26.732361,5.42231,4.386251,1.036059,27.80158,15.267514,74.183029,11.964128,10.095795,3.757049
9,Cyprus,2.5,3.7,2.4,56.920863,27.422456,29.498128,6.16526,4.936008,1.229252,24.490862,12.426368,71.356113,13.881474,11.665777,3.096636


In [582]:
# 1) Identify columns
metric_cols = [c for c in df_16.columns if c != 'country']
pct_cols    = [c for c in metric_cols if str(c).startswith('share_') or '(%)' in str(c)]
size_cols   = [c for c in metric_cols if str(c).startswith('avg_size')]

# 2) Normalize percentage columns to 0–100 NUMERIC (do NOT divide by 100)
for c in pct_cols:
    s = (df_16[c].astype(str)
                    .str.replace('%', '', regex=False)   # drop percent sign if present
                    .str.replace(',', '.', regex=False)  # handle decimal comma
                    .str.strip()
                    .replace({'': np.nan}))
    s = pd.to_numeric(s, errors='coerce')

    # If column appears to be 0–1 scale, scale UP to 0–100
    maxv = s.max(skipna=True)
    if pd.notna(maxv) and maxv <= 1.5:
        s = s * 100.0

    df_16[c] = s  # write back as float (e.g., 55.93)

# 3) Ensure avg_size columns stay numeric (unchanged values)
for c in size_cols:
    df_16[c] = pd.to_numeric(df_16[c], errors='coerce')

# 4) (Optional) add "(%)" to headers of percentage columns (header-only; no value changes)
df_16.rename(columns=lambda x: f'{x} (%)' if x in pct_cols and '(%)' not in str(x) else x, inplace=True)

# 5) Display with blanks instead of NaN (values remain numeric under the hood)
display(df_16.style.format(na_rep=''))


metric,country,avg_size_all,avg_size_couple_with_children,avg_size_single_parent_with_children,share_couple_total (%),share_couple_with_children (%),share_couple_without_children (%),share_single_parent_total (%),share_single_mother (%),share_single_father (%),share_single_person (%),share_other_types (%),share_hh_0_children (%),share_hh_1_child (%),share_hh_2_children (%),share_hh_3plus_children (%)
0,Australia,2.52758,3.932863,2.775636,55.926052,29.904701,26.021351,10.373549,,,25.124159,8.576123,,,,
1,Austria,2.2,3.8,2.5,48.927373,21.129493,27.79788,5.628256,4.77958,0.848677,38.337314,7.107057,77.781493,10.524053,8.574977,3.119476
2,Belgium,2.2,3.9,2.6,52.219429,23.979842,28.239587,7.423107,6.077464,1.345643,35.502304,4.855161,73.974236,11.757354,10.153817,4.112671
3,Bulgaria,2.2,3.5,2.3,40.303059,16.35041,23.95265,4.603051,3.875726,0.727325,35.80958,19.284309,78.210294,12.932273,7.480467,1.376966
4,Canada,2.425303,,,50.919441,25.300814,25.618627,8.715567,,,29.347961,11.017031,,,,
5,Chile,2.8,,,,,,,,,,,,,,
6,Columbia,3.100732,,,,,,,,,,,,,,
7,Costa Rica,3.462513,4.372663,3.443867,52.441873,38.147069,14.294803,10.548101,9.489556,1.058545,11.270909,25.739118,30.290198,23.077315,24.608947,22.02354
8,Croatia,2.6,3.9,2.6,51.508875,24.776445,26.732361,5.42231,4.386251,1.036059,27.80158,15.267514,74.183029,11.964128,10.095795,3.757049
9,Cyprus,2.5,3.7,2.4,56.920863,27.422456,29.498128,6.16526,4.936008,1.229252,24.490862,12.426368,71.356113,13.881474,11.665777,3.096636


In [583]:
df_16.sample(10)

metric,country,avg_size_all,avg_size_couple_with_children,avg_size_single_parent_with_children,share_couple_total (%),share_couple_with_children (%),share_couple_without_children (%),share_single_parent_total (%),share_single_mother (%),share_single_father (%),share_single_person (%),share_other_types (%),share_hh_0_children (%),share_hh_1_child (%),share_hh_2_children (%),share_hh_3plus_children (%)
19,Iceland,2.701931,4.119078,2.608646,45.194843,25.421669,19.773174,7.354279,6.23161,1.122668,29.161858,18.28902,,,,
44,Türkiye,3.2,4.1,2.8,54.379403,40.839223,13.54018,10.064812,7.754038,2.310773,18.877345,16.67844,57.619229,17.424742,14.503294,10.453113
39,Slovak Republic,3.1,3.8,2.5,37.153958,16.994268,20.15969,6.230566,5.389511,0.841055,31.404297,25.211179,64.409823,17.085602,14.485115,4.01946
17,Greece,2.6,3.8,2.5,52.138469,24.027235,28.111233,4.662701,3.820244,0.842457,32.347616,10.851376,74.311386,11.832867,9.969559,3.888643
24,Korea,2.214024,3.553442,2.343395,43.480489,26.248858,17.23163,9.125677,6.849305,2.276371,35.468577,11.925258,78.516649,9.000955,9.824172,2.658224
20,Ireland,2.4,4.0,2.7,53.027121,29.44911,23.578011,6.925656,6.122436,0.803221,23.135621,16.911603,69.020563,12.418831,12.184343,6.376263
41,Spain,2.5,3.7,2.4,50.872979,25.4583,25.41469,8.57236,6.886923,1.685437,26.976136,13.578514,74.607276,13.543846,8.948054,2.900824
42,Sweden,2.1,3.9,2.6,49.272143,22.490881,26.781261,6.666267,4.906521,1.759745,39.240233,4.821357,74.84036,10.772303,9.825904,4.559353
13,Estonia,1.8,3.8,2.6,46.199713,25.464743,20.734971,6.828569,6.09075,0.737819,36.99157,9.980148,75.755102,12.530612,8.734694,2.979592
10,Czechia,2.3,3.7,2.4,47.025112,21.703808,25.321304,7.154102,6.112638,1.041464,39.150397,6.670389,71.952265,13.852443,11.560185,2.635107


In [584]:
df_16.columns.name = None
df_16.index.name = None

In [585]:
df_16.sample(10)

Unnamed: 0,country,avg_size_all,avg_size_couple_with_children,avg_size_single_parent_with_children,share_couple_total (%),share_couple_with_children (%),share_couple_without_children (%),share_single_parent_total (%),share_single_mother (%),share_single_father (%),share_single_person (%),share_other_types (%),share_hh_0_children (%),share_hh_1_child (%),share_hh_2_children (%),share_hh_3plus_children (%)
29,Mexico,3.56,4.08,2.76,50.357352,39.016163,11.341189,11.216618,,,12.464293,25.961737,50.23388,22.83395,17.403083,9.529087
45,United Kingdom,2.3,3.9,2.8,53.717778,20.285277,33.432479,7.4807,,,30.812606,7.988916,72.057972,12.09535,11.314672,4.532006
2,Belgium,2.2,3.9,2.6,52.219429,23.979842,28.239587,7.423107,6.077464,1.345643,35.502304,4.855161,73.974236,11.757354,10.153817,4.112671
9,Cyprus,2.5,3.7,2.4,56.920863,27.422456,29.498128,6.16526,4.936008,1.229252,24.490862,12.426368,71.356113,13.881474,11.665777,3.096636
16,Germany,2.0,3.8,2.5,45.781523,17.888933,27.892603,5.411383,4.436141,0.975242,43.139667,5.667427,79.862795,9.914962,7.716348,2.505657
10,Czechia,2.3,3.7,2.4,47.025112,21.703808,25.321304,7.154102,6.112638,1.041464,39.150397,6.670389,71.952265,13.852443,11.560185,2.635107
23,Japan,2.210988,3.854805,2.727759,45.073248,15.668254,29.404994,2.503514,2.257534,0.24598,37.969772,14.453466,81.938221,8.782047,7.165944,2.113788
46,United States,2.51,,,53.185494,19.846267,33.339226,6.797399,5.212952,1.584447,27.611972,12.405135,,,,
40,Slovenia,2.4,3.9,2.5,45.413954,20.971828,24.442126,6.932339,5.570598,1.361741,33.997106,13.656601,74.99711,11.247255,10.195353,3.560282
3,Bulgaria,2.2,3.5,2.3,40.303059,16.35041,23.95265,4.603051,3.875726,0.727325,35.80958,19.284309,78.210294,12.932273,7.480467,1.376966


In [586]:
df_16.rename(columns=lambda c: re.sub(r'(?i)^share[_\s]*', '', c) if isinstance(c, str) else c, inplace=True)
df_16.drop_duplicates(inplace=True)
df_16.replace('', pd.NA, inplace=True)

metric_cols = [c for c in df_16.columns if c != 'country']

# A) Drop only rows where ALL metric columns are missing (safe):
df_16.dropna(subset=metric_cols, how='all', inplace=True)

# B) (optional) Also drop columns that are entirely missing:
df_16.dropna(axis=1, how='all', inplace=True)
df_16.isnull().sum()

country                                  0
avg_size_all                             2
avg_size_couple_with_children            8
avg_size_single_parent_with_children     8
couple_total (%)                         4
couple_with_children (%)                 5
couple_without_children (%)              5
single_parent_total (%)                  4
single_mother (%)                       11
single_father (%)                       11
single_person (%)                        4
other_types (%)                          5
hh_0_children (%)                       10
hh_1_child (%)                          10
hh_2_children (%)                       10
hh_3plus_children (%)                   10
dtype: int64

In [587]:
# Drop exactly these columns and create df_16_general
cols_to_drop = [
    "avg_size_couple_with_children",
    "avg_size_single_parent_with_children",
    "single_mother (%)",
    "single_father (%)",
    "hh_0_children (%)",
    "hh_1_child (%)",
    "hh_2_children (%)",
    "hh_3plus_children (%)",
]

df_16_general = df_16.drop(columns=cols_to_drop, errors="ignore").copy()

# quick check (optional)
print(df_16_general.columns.tolist())

['country', 'avg_size_all', 'couple_total (%)', 'couple_with_children (%)', 'couple_without_children (%)', 'single_parent_total (%)', 'single_person (%)', 'other_types (%)']


In [588]:
# Remove selected countries (exact matches)
df_16_general['country'] = df_16_general['country'].str.strip()  # trims spaces

countries_to_remove = ["Chile", "Columbia", "OECD average", "OECD-30 average", "OECD-36 average", "Israel"]
df_16_general = df_16_general[~df_16_general['country'].isin(countries_to_remove)].copy()


In [589]:
df_16_general = df_16_general.reset_index(drop=True)
df_16_general

Unnamed: 0,country,avg_size_all,couple_total (%),couple_with_children (%),couple_without_children (%),single_parent_total (%),single_person (%),other_types (%)
0,Australia,2.52758,55.926052,29.904701,26.021351,10.373549,25.124159,8.576123
1,Austria,2.2,48.927373,21.129493,27.79788,5.628256,38.337314,7.107057
2,Belgium,2.2,52.219429,23.979842,28.239587,7.423107,35.502304,4.855161
3,Bulgaria,2.2,40.303059,16.35041,23.95265,4.603051,35.80958,19.284309
4,Canada,2.425303,50.919441,25.300814,25.618627,8.715567,29.347961,11.017031
5,Costa Rica,3.462513,52.441873,38.147069,14.294803,10.548101,11.270909,25.739118
6,Croatia,2.6,51.508875,24.776445,26.732361,5.42231,27.80158,15.267514
7,Cyprus,2.5,56.920863,27.422456,29.498128,6.16526,24.490862,12.426368
8,Czechia,2.3,47.025112,21.703808,25.321304,7.154102,39.150397,6.670389
9,Denmark,1.9,48.59698,20.408734,28.188133,6.308205,37.574211,7.520455


In [590]:
df_16_general.isnull().sum()

country                        0
avg_size_all                   0
couple_total (%)               0
couple_with_children (%)       0
couple_without_children (%)    0
single_parent_total (%)        0
single_person (%)              0
other_types (%)                0
dtype: int64

In [591]:
drop_countries = ['OECD average', 'OECD-30 average', 'OECD-36 average', 'Canada', 'Chile', 'Columbia', 'United States','Iceland','Israel','EU average', 'New Zealand', 'Mexico', 'United Kingdom', 'Switzerland','Australia']

before = len(df_16)
df_16 = df_16[~df_16['country'].isin(drop_countries)].reset_index(drop=True)
print(f"Removed {before - len(df_16)} rows")

Removed 15 rows


In [592]:
df_16.isnull().sum()

country                                 0
avg_size_all                            0
avg_size_couple_with_children           0
avg_size_single_parent_with_children    0
couple_total (%)                        0
couple_with_children (%)                0
couple_without_children (%)             0
single_parent_total (%)                 0
single_mother (%)                       0
single_father (%)                       0
single_person (%)                       0
other_types (%)                         0
hh_0_children (%)                       0
hh_1_child (%)                          0
hh_2_children (%)                       0
hh_3plus_children (%)                   0
dtype: int64

In [593]:
df_16

Unnamed: 0,country,avg_size_all,avg_size_couple_with_children,avg_size_single_parent_with_children,couple_total (%),couple_with_children (%),couple_without_children (%),single_parent_total (%),single_mother (%),single_father (%),single_person (%),other_types (%),hh_0_children (%),hh_1_child (%),hh_2_children (%),hh_3plus_children (%)
0,Austria,2.2,3.8,2.5,48.927373,21.129493,27.79788,5.628256,4.77958,0.848677,38.337314,7.107057,77.781493,10.524053,8.574977,3.119476
1,Belgium,2.2,3.9,2.6,52.219429,23.979842,28.239587,7.423107,6.077464,1.345643,35.502304,4.855161,73.974236,11.757354,10.153817,4.112671
2,Bulgaria,2.2,3.5,2.3,40.303059,16.35041,23.95265,4.603051,3.875726,0.727325,35.80958,19.284309,78.210294,12.932273,7.480467,1.376966
3,Costa Rica,3.462513,4.372663,3.443867,52.441873,38.147069,14.294803,10.548101,9.489556,1.058545,11.270909,25.739118,30.290198,23.077315,24.608947,22.02354
4,Croatia,2.6,3.9,2.6,51.508875,24.776445,26.732361,5.42231,4.386251,1.036059,27.80158,15.267514,74.183029,11.964128,10.095795,3.757049
5,Cyprus,2.5,3.7,2.4,56.920863,27.422456,29.498128,6.16526,4.936008,1.229252,24.490862,12.426368,71.356113,13.881474,11.665777,3.096636
6,Czechia,2.3,3.7,2.4,47.025112,21.703808,25.321304,7.154102,6.112638,1.041464,39.150397,6.670389,71.952265,13.852443,11.560185,2.635107
7,Denmark,1.9,3.9,2.5,48.59698,20.408734,28.188133,6.308205,5.114202,1.194003,37.574211,7.520455,77.775278,10.541878,8.944527,2.738317
8,Estonia,1.8,3.8,2.6,46.199713,25.464743,20.734971,6.828569,6.09075,0.737819,36.99157,9.980148,75.755102,12.530612,8.734694,2.979592
9,Finland,1.9,4.0,2.6,45.63784,17.056364,28.581477,5.427769,4.497123,0.930646,45.335962,3.598356,81.983051,7.888136,6.986441,3.142373


In [None]:
#df_16_general.to_csv('../data/Cleaned/general/cleaned_household_general.csv', index=False)

In [594]:
df_16['country'] = df_16['country'].astype(str).str.strip()

# 2) Identify column groups
metric_cols = [c for c in df_16.columns if c != 'country']
pct_cols    = [c for c in metric_cols if '(%)' in str(c) or re.match(r'^share[_\s]', str(c) or '')]
size_cols   = [c for c in metric_cols if str(c).startswith('avg_size')]

# 3) Coerce to proper numeric types (DO NOT rescale percentages)
def to_float(s: pd.Series) -> pd.Series:
    return pd.to_numeric(
        s.astype(str)
         .str.replace('%', '', regex=False)   # strip a trailing % if any
         .str.replace(',', '.', regex=False)  # handle decimal comma
         .str.strip()
         .replace({'': np.nan}),
        errors='coerce'
    )

for c in pct_cols:
    df_16[c] = to_float(df_16[c])            # stays on 0–100 scale

for c in size_cols:
    df_16[c] = pd.to_numeric(df_16[c], errors='coerce')

# 4) Round to 2 decimals (all numeric metrics)
df_16[metric_cols] = df_16[metric_cols].round(2)

In [595]:
df_16

Unnamed: 0,country,avg_size_all,avg_size_couple_with_children,avg_size_single_parent_with_children,couple_total (%),couple_with_children (%),couple_without_children (%),single_parent_total (%),single_mother (%),single_father (%),single_person (%),other_types (%),hh_0_children (%),hh_1_child (%),hh_2_children (%),hh_3plus_children (%)
0,Austria,2.2,3.8,2.5,48.93,21.13,27.8,5.63,4.78,0.85,38.34,7.11,77.78,10.52,8.57,3.12
1,Belgium,2.2,3.9,2.6,52.22,23.98,28.24,7.42,6.08,1.35,35.5,4.86,73.97,11.76,10.15,4.11
2,Bulgaria,2.2,3.5,2.3,40.3,16.35,23.95,4.6,3.88,0.73,35.81,19.28,78.21,12.93,7.48,1.38
3,Costa Rica,3.46,4.37,3.44,52.44,38.15,14.29,10.55,9.49,1.06,11.27,25.74,30.29,23.08,24.61,22.02
4,Croatia,2.6,3.9,2.6,51.51,24.78,26.73,5.42,4.39,1.04,27.8,15.27,74.18,11.96,10.1,3.76
5,Cyprus,2.5,3.7,2.4,56.92,27.42,29.5,6.17,4.94,1.23,24.49,12.43,71.36,13.88,11.67,3.1
6,Czechia,2.3,3.7,2.4,47.03,21.7,25.32,7.15,6.11,1.04,39.15,6.67,71.95,13.85,11.56,2.64
7,Denmark,1.9,3.9,2.5,48.6,20.41,28.19,6.31,5.11,1.19,37.57,7.52,77.78,10.54,8.94,2.74
8,Estonia,1.8,3.8,2.6,46.2,25.46,20.73,6.83,6.09,0.74,36.99,9.98,75.76,12.53,8.73,2.98
9,Finland,1.9,4.0,2.6,45.64,17.06,28.58,5.43,4.5,0.93,45.34,3.6,81.98,7.89,6.99,3.14


In [596]:
#df_16.to_csv('../data/Cleaned/cleaned_household_oecd.csv', index=False)

In [597]:
#df_16.to_sql('household_oecd', engine, if_exists= 'replace', index= False)