In [339]:
import pandas as pd
import os, re
from pathlib import Path
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from sqlalchemy import create_engine, types
from sqlalchemy import text 
from openpyxl import load_workbook

In [340]:
from dotenv import dotenv_values

config = dotenv_values()

# define variables for the login
pg_user = config['POSTGRES_USER']  # align the key label with your .env file !
pg_host = config['POSTGRES_HOST']
pg_port = config['POSTGRES_PORT']
pg_db = config['POSTGRES_DB']
pg_schema = config['POSTGRES_SCHEMA']
pg_pass = config['POSTGRES_PASS']

In [341]:
url = f'postgresql://{pg_user}:{pg_pass}@{pg_host}:{pg_port}/{pg_db}'
engine = create_engine(url, echo=False)

In [342]:
my_schema = 'team_5' # update it to your schema

with engine.begin() as conn: 
    result = conn.execute(text(f'SET search_path TO {my_schema};'))

In [343]:
df_1= pd.read_csv('../data/Raw/World_Marriage_Dataset.csv')

In [344]:
df_1.drop(columns=["Sr.No."], inplace=True)

In [345]:
df_1.columns = df_1.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [346]:
df_1.rename(columns={
    "agegroup": "age_group",
    "maritalstatus": "marital_status",
    "dataprocess": "data_process",
    "Data Collection (Start Year)": "data_collection_start_year",
    "Data Collection (End Year)": "data_collection_end_year",
    "Data Source": "data_source",
    "Country": "country",
    "Sex": "sex"
}, inplace=True)

In [347]:
df_1.drop_duplicates(inplace=True)

df_1['data_collection_start_year'] = df_1['data_collection_start_year'].astype(str).str.replace(',', '').astype(int)
df_1['data_collection_end_year'] = df_1['data_collection_end_year'].astype(str).str.replace(',', '').astype(int)

In [348]:
df_1.isnull().sum()

country                       0
age_group                     0
sex                           0
marital_status                0
data_process                  0
data_collection_start_year    0
data_collection_end_year      0
data_source                   0
dtype: int64

In [349]:
#df_1.to_csv("cleaned_world_marriage.csv", index=False)

In [350]:
#df_1.to_sql('world_marriage', engine, if_exists='replace', index=False)

In [351]:
df_2 = pd.read_csv('../data/Raw/age-at-marriage-women.csv')

In [352]:
df_2.columns = df_2.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [353]:
df_2 = df_2.drop(columns=['1005564annotations'])

df_2.rename(columns={
    "entity": "country",   
}, inplace=True)

In [354]:
df_2.drop_duplicates(inplace=True)


In [355]:
df_2['year'] = df_2['year'].astype(str).str.replace(',', '').astype(int)

In [356]:
df_2.isnull().sum()

country                                0
code                                   0
year                                   0
mean_age_of_women_at_first_marriage    0
dtype: int64

In [357]:
#df_2.to_csv("cleaned_age_at_marriage_women.csv", index=False)

In [358]:
#df_2.to_sql('age_at_marriage_women', engine, if_exists='replace', index=False)

In [359]:
df_3= pd.read_csv('../data/Raw/marriage-rate-per-1000-inhabitants.csv')

In [360]:
df_3.columns = df_3.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [361]:
df_3.rename(columns={
    "entity": "country",   
}, inplace=True)

In [362]:
df_3['year'] = df_3['year'].astype(str).str.replace(',', '').astype(int)

In [363]:
df_3.drop_duplicates(inplace=True)


In [364]:
df_3.isnull().sum()

country                                          0
code                                             0
year                                             0
crude_marriage_rate_marriages_per_1000_people    0
dtype: int64

In [365]:
#df_3.to_csv("cleaned_marriage-rate-per-1000-inhabitants.csv", index=False)

In [366]:
#df_3.to_sql('married_rate_per_1000', engine, if_exists='replace', index=False)

In [367]:
df_4= pd.read_csv('../data/Raw/marriage-rates-in-1990-vs-2020.csv')

In [368]:
df_4.columns = df_4.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [369]:
df_4 = df_4.drop(columns=['worldregionsaccordingtoowid'])

df_4.rename(columns={
    "crudemarriageratemarriagesper1000people": "crude_marriage_rate",
    "crudemarriageratemarriagesper1000people1": "crude_marriage_rate_people1",
    "year1": "year_1",
    "entity": "country"
}, inplace=True)

In [370]:
df_4.drop_duplicates(inplace=True)
df_4.dropna(inplace=True)

In [371]:
df_4['year_1'] = pd.to_numeric(df_4['year_1'], errors='coerce').astype('Int64')

In [372]:
df_4.isnull().sum()

country                        0
code                           0
year                           0
crude_marriage_rate            0
crude_marriage_rate_people1    0
year_1                         0
dtype: int64

In [373]:
#df_4.to_csv("cleaned_marriage-rates-in-1990-vs-2020.csv", index=False)

In [374]:
#df_4.to_sql('marriage_rates_in_1990_vs_2020', engine, if_exists='replace', index=False)

In [375]:
df_5 = pd.read_csv('../data/Raw/share-of-births-outside-marriage.csv')

In [376]:
df_5.columns = df_5.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [377]:

df_5.rename(columns={
    "shareofbirthsoutsideofmarriageofallbirths": "share_of_births_outside_of_marriage",
    "entity": "country"
}, inplace=True)

df_5.drop_duplicates(inplace=True)

In [378]:
df_5.isnull().sum()

country                                0
code                                   0
year                                   0
share_of_births_outside_of_marriage    0
dtype: int64

In [379]:
#df_5.to_csv("cleaned_share-of-births-outside-marriage.csv", index=False)

In [380]:
#df_5.to_sql('share_of_births_outside_marriage', engine, if_exists='replace', index=False)

In [381]:
df_6 = pd.read_csv('../data/Raw/share-of-men-in-england-and-wales-who-have-ever-married-by-age.csv')

In [382]:
df_6.columns = df_6.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

df_6.drop_duplicates(inplace=True)
df_6.sample(5)

Unnamed: 0,entity,code,year,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort
6,Men,,23,21.4,26.8,38.1,26.2,10.5,3.9,2.1,0.8
31,Men,,48,92.5,91.7,92.3,81.3,69.9,,,
32,Men,,49,92.7,91.8,92.3,81.5,70.3,,,
23,Men,,40,89.6,89.9,91.1,78.0,64.0,53.9,,
58,Women,,41,82.9,90.7,95.1,85.7,72.7,62.0,,


In [383]:
df_6 = df_6.drop(columns=['code','proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort'])

df_6.rename(columns={
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort": "1900_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort": "1920_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort": "1940_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort": "1960_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort": "1970_birthcohort",
    "entity": "sex"
}, inplace=True)

In [384]:
df_6.isnull().sum()

sex                 0
year                0
1900_birthcohort    0
1920_birthcohort    0
1940_birthcohort    0
1960_birthcohort    0
1970_birthcohort    0
dtype: int64

In [385]:
#df_6.to_csv("cleaned_share-of-men-in-england-and-wales-who-have-ever-married-by-age.csv", index=False)

In [386]:
#df_6.to_sql('men_in_england_and_wales_married_by_age', engine, if_exists='replace', index=False)

In [387]:
df_7 = pd.read_csv('../data/Raw/share-of-births-outside-marriage.csv')

In [388]:
df_7.columns = df_7.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [389]:
df_7.rename(columns={
    "shareofsingleparenthouseholds": "share_of_single_parent_households",
    "entity": "country"
}, inplace=True)

df_7.drop_duplicates(inplace=True)
df_7.sample(5)

Unnamed: 0,country,code,year,shareofbirthsoutsideofmarriageofallbirths
2092,United States,USA,1974,13.2
1561,Poland,POL,2011,21.2
1468,New Zealand,NZL,2015,46.7
1707,Slovakia,SVK,2007,28.8
1250,Luxembourg,LUX,2009,32.1


In [390]:
df_7.isnull().sum()

country                                      0
code                                         0
year                                         0
shareofbirthsoutsideofmarriageofallbirths    0
dtype: int64

In [391]:
#df_7.to_csv("cleaned_share-of-single-parent-households.csv", index=False)

In [392]:
#df_7.to_sql('single_parent_households', engine, if_exists='replace', index=False)

In [393]:
df_8 = pd.read_csv('../data/Raw/share-of-women-in-england-and-wales-who-have-ever-married-by-age.csv')

In [394]:
df_8.columns = df_8.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [395]:
df_8['code'] = df_8['code'].fillna('GBR')
df_8.sample(5)

Unnamed: 0,entity,code,year,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort
30,Men,GBR,47,92.3,91.6,92.2,81.0,69.4,,,
2,Men,GBR,19,0.8,0.6,2.0,2.5,0.7,0.3,0.1,0.0
26,Men,GBR,43,91.1,90.8,91.7,79.5,66.7,56.6,,
14,Men,GBR,31,76.5,79.7,85.1,66.6,45.1,29.6,21.3,
54,Women,GBR,37,80.6,89.4,94.4,84.1,69.4,57.8,,


In [396]:
df_8 = df_8.drop(columns=['code','proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort'])

df_8.rename(columns={
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort": "1900_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort": "1920_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort": "1940_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort": "1960_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort": "1970_birthcohort",
    "entity": "sex"
}, inplace=True)

df_8.drop_duplicates(inplace=True)
df_8.sample(5)

Unnamed: 0,sex,year,1900_birthcohort,1920_birthcohort,1940_birthcohort,1960_birthcohort,1970_birthcohort
41,Women,24,40.8,56.1,75.5,55.1,29.7
60,Women,43,83.7,91.1,95.3,86.2,73.7
1,Men,18,0.1,0.1,0.4,0.6,0.1
0,Men,17,0.0,0.0,0.0,0.1,0.0
4,Men,21,6.1,7.4,13.6,11.9,3.9


In [397]:
df_8.isnull().sum()

sex                 0
year                0
1900_birthcohort    0
1920_birthcohort    0
1940_birthcohort    0
1960_birthcohort    0
1970_birthcohort    0
dtype: int64

In [398]:
#df_8.to_csv("cleaned_share-of-women-in-england-and-wales-who-have-ever-married-by-age.csv", index=False)

In [399]:
#df_8.to_sql('women_in_england_and_wales_married_by_age', engine, if_exists='replace', index=False)

In [400]:
#pip install openpyxl pywin32

In [401]:
df_excel_1 = pd.read_excel('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx')

In [402]:
#all_sheets = pd.read_excel('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx', sheet_name=None)

In [403]:
xls_1 = pd.ExcelFile('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx')
print(xls_1.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'MARITAL_STATUS_BY_AGE', 'CURRENTLY MARRIED', 'EVER_MARRIED', 'SMAM']


In [404]:
excel_1 = '../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx'

# Output directory (make sure it exists)
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)

# List of sheets you want to extract
sheets_to_extract = ['MARITAL_STATUS_BY_AGE', 'CURRENTLY MARRIED', 'EVER_MARRIED', 'SMAM']

In [405]:
"""for sheet in sheets_to_extract:
    # Read just this sheet into a DataFrame
    df_excel_1 = pd.read_excel(excel_1, sheet_name=sheet)
    
    # Optional: Clean the filename (replace spaces with underscores, etc.)
    csv_name = sheet.replace(' ', '_').lower() + '.csv'
    csv_path = os.path.join(output_dir, csv_name)
    
    # Save the DataFrame as CSV
    df_excel_1.to_csv(csv_path, index=False)
    print(f"Saved: {csv_path}")
"""

'for sheet in sheets_to_extract:\n    # Read just this sheet into a DataFrame\n    df_excel_1 = pd.read_excel(excel_1, sheet_name=sheet)\n    \n    # Optional: Clean the filename (replace spaces with underscores, etc.)\n    csv_name = sheet.replace(\' \', \'_\').lower() + \'.csv\'\n    csv_path = os.path.join(output_dir, csv_name)\n    \n    # Save the DataFrame as CSV\n    df_excel_1.to_csv(csv_path, index=False)\n    print(f"Saved: {csv_path}")\n'

In [406]:
xls_2 = pd.ExcelFile('../data/Raw/undesa_pd_2019_world_fertility_dataset.xlsx')
print(xls_2.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'FERTILITY INDICATORS']


In [407]:
excel_2 = '../data/Raw/undesa_pd_2019_world_fertility_dataset.xlsx'
sheet_name = 'FERTILITY INDICATORS'
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)

df_excel_2 = pd.read_excel(excel_2, sheet_name=sheet_name)


In [408]:
"""csv_name = sheet_name.replace(' ', '_').lower() + '.csv'
csv_path = os.path.join(output_dir, csv_name)
df_excel_2.to_csv(csv_path, index=False)
print(f"Saved: {csv_path}")
"""

'csv_name = sheet_name.replace(\' \', \'_\').lower() + \'.csv\'\ncsv_path = os.path.join(output_dir, csv_name)\ndf_excel_2.to_csv(csv_path, index=False)\nprint(f"Saved: {csv_path}")\n'

In [409]:
xls_3 = pd.ExcelFile('../data/Raw/undesa_pd_ds_1970-2030_fp_rev-2024_rev.xlsx')
print(xls_3.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'Countries', 'Regions']


In [410]:
excel_3 = '../data/Raw/undesa_pd_ds_1970-2030_fp_rev-2024_rev.xlsx'
sheets_to_extract = ['Countries', 'Regions']
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)


In [411]:
"""
for sheet in sheets_to_extract:
    df = pd.read_excel(excel_3, sheet_name=sheet)
    csv_name = sheet.replace(' ', '_').lower() + '.csv'
    csv_path = os.path.join(output_dir, csv_name)
    df.to_csv(csv_path, index=False)
    print(f"Saved: {csv_path}")

"""

'\nfor sheet in sheets_to_extract:\n    df = pd.read_excel(excel_3, sheet_name=sheet)\n    csv_name = sheet.replace(\' \', \'_\').lower() + \'.csv\'\n    csv_path = os.path.join(output_dir, csv_name)\n    df.to_csv(csv_path, index=False)\n    print(f"Saved: {csv_path}")\n\n'

In [412]:
df_9 = pd.read_csv('../data/Raw/unpopulation_dataportal_20250728095844.csv')
df_9.sample(5)

Unnamed: 0,IndicatorId,IndicatorName,IndicatorShortName,Source,SourceYear,Author,LocationId,Location,Iso2,Iso3,...,AgeStart,AgeEnd,Age,CategoryId,Category,EstimateTypeId,EstimateType,EstimateMethodId,EstimateMethod,Value
2838,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,96,Brunei Darussalam,BN,BRN,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,58.82
2409,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,76,Brazil,BR,BRA,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,60.84
19256,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,666,Saint Pierre and Miquelon,PM,SPM,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,3,Projection,62.08
13896,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,474,Martinique,MQ,MTQ,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,45.64
2612,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,90,Solomon Islands,SB,SLB,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,63.49


In [413]:
df_9.columns = df_9.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_9.sample(5)

Unnamed: 0,indicatorid,indicatorname,indicatorshortname,source,sourceyear,author,locationid,location,iso2,iso3,...,agestart,ageend,age,categoryid,category,estimatetypeid,estimatetype,estimatemethodid,estimatemethod,value
3178,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,108,Burundi,BI,BDI,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,60.44
15589,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,531,Curaçao,CW,CUW,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,42.51
20654,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,706,Somalia,SO,SOM,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,65.27
10649,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,376,Israel,IL,ISR,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,64.37
9165,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,316,Guam,GU,GUM,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,42.91


In [414]:
df_9 = df_9.drop(columns=['indicatorid','indicatorshortname',
    'source',
    'author', 'locationid', 'iso2','estimatetypeid','category','categoryid','agestart','ageend','author','ageid', 'estimatetype','variantid','sexid','timeid'])

df_9.rename(columns={
    "sourceyear": "year",
    "location": "country",
    "estimatemethodid": "estimate_method",
    "iso3": "code",
}, inplace=True)



In [415]:
df_9.drop_duplicates(inplace=True)

In [416]:
df_9

Unnamed: 0,indicatorname,year,country,code,time,variant,sex,age,estimate_method,estimatemethod,value
0,Currently married (Percent),2024,Afghanistan,AFG,1970,Median,Female,15-49,2,Interpolation,80.94
2,Currently married (Percent),2024,Afghanistan,AFG,1971,Median,Female,15-49,2,Interpolation,80.90
4,Currently married (Percent),2024,Afghanistan,AFG,1972,Median,Female,15-49,2,Interpolation,80.87
6,Currently married (Percent),2024,Afghanistan,AFG,1973,Median,Female,15-49,2,Interpolation,80.84
8,Currently married (Percent),2024,Afghanistan,AFG,1974,Median,Female,15-49,2,Interpolation,80.53
...,...,...,...,...,...,...,...,...,...,...,...
25078,Currently married (Percent),2024,Zambia,ZMB,2021,Median,Female,15-49,3,Projection,54.31
25080,Currently married (Percent),2024,Zambia,ZMB,2022,Median,Female,15-49,3,Projection,53.82
25082,Currently married (Percent),2024,Zambia,ZMB,2023,Median,Female,15-49,3,Projection,53.35
25084,Currently married (Percent),2024,Zambia,ZMB,2024,Median,Female,15-49,3,Projection,52.91


In [417]:
df_9.isnull().sum()

indicatorname      0
year               0
country            0
code               0
time               0
variant            0
sex                0
age                0
estimate_method    0
estimatemethod     0
value              0
dtype: int64

In [418]:
#df_9.to_csv("cleaned_unpopulation_dataportal.csv", index=False)

In [419]:
#df_9.to_sql('unpopulation_dataportal', engine, if_exists='replace', index=False)

In [420]:
df_10 = pd.read_csv('../data/processed/countries_un.csv',  header=5, low_memory=False)

In [421]:
df_10.columns = (
    df_10.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
)
df_10.sample(10)

Unnamed: 0,countryorarea,isocode,indicator,year,agegroup,percentage,number,dataprocess
129314,Togo,768,Married or in-union women,2015,25-29,81.563637,242.35085,Estimate
1654,Algeria,12,Married or in-union women,2014,45-49,78.610945,831.876346,Estimate
94471,Nicaragua,558,Married or in-union women,2033,15-49,55.36164,1151.693187,Projection
104014,Portugal,620,Married or in-union women,2011,45-49,76.670593,308.455764,Estimate
140654,Burkina Faso,854,Married or in-union women,1974,45-49,85.82,105.743542,Estimate
58360,Iceland,352,Married or in-union women,1975,15-19,4.688571,0.511453,Estimate
78067,Malawi,454,Married or in-union women,2008,30-34,81.55913,310.636707,Estimate
131901,United Arab Emirates,784,Married or in-union women,2014,40-44,92.961434,217.310831,Estimate
39033,Eritrea,232,Married or in-union women,1989,20-24,69.4,61.818744,Estimate
124476,Suriname,740,Married or in-union women,1977,35-39,76.0925,6.781744,Estimate


In [422]:
df_10.rename(columns={
    "dataprocess": "data_process",
}, inplace=True)

df_10.drop_duplicates(inplace=True)
df_10.sample(5)

Unnamed: 0,countryorarea,isocode,indicator,year,agegroup,percentage,number,data_process
102628,Philippines,608,Married or in-union women,2000,35-39,86.077143,2245.595566,Estimate
64556,Jamaica,388,Married or in-union women,2020,35-39,46.571429,52.161164,Estimate
92525,New Caledonia,540,Married or in-union women,2033,40-44,71.056828,7.743063,Projection
88900,Nauru,520,Married or in-union women,1985,35-39,88.1,0.277075,Estimate
102637,Philippines,608,Married or in-union women,2001,40-44,85.025,1907.300781,Estimate


In [423]:
for col in ['percentage', 'number']:
    if col in df_10.columns:
        df_10[col] = (
            df_10[col]
            .astype(str)
            .str.replace(',', '.', regex=False)
            .str.extract(r'([-+]?[0-9]*\.?[0-9]+)', expand=False)
            .astype(float)
            .round(2)
        )

In [424]:
unnamed_cols = [col for col in df_10.columns if 'unnamed' in col.lower()]
df_10.drop(columns=unnamed_cols, inplace=True)

In [425]:
df_10.dropna(inplace=True)

In [426]:
df_10.isnull().sum()

countryorarea    0
isocode          0
indicator        0
year             0
agegroup         0
percentage       0
number           0
data_process     0
dtype: int64

In [427]:
df_10.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145800 entries, 0 to 145799
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   countryorarea  145800 non-null  object 
 1   isocode        145800 non-null  int64  
 2   indicator      145800 non-null  object 
 3   year           145800 non-null  int64  
 4   agegroup       145800 non-null  object 
 5   percentage     145800 non-null  float64
 6   number         145800 non-null  float64
 7   data_process   145800 non-null  object 
dtypes: float64(2), int64(2), object(4)
memory usage: 8.9+ MB


In [428]:
#df_10.to_csv("cleaned_countries_1970_2025_un.csv", index=False)

In [429]:
#df_10.to_sql('countries_1970_2025_un', engine, if_exists='replace', index=False)

In [430]:
df_11 = pd.read_csv('../data/processed/currently_married_un.csv',  header=2, low_memory=False)

In [431]:
df_11.sample(8)

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,AgeGroup,AgeStart,AgeEnd,DataValue,DataProcess,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Data,Note on Country and Population
34878,Nicaragua,558,2005,2005,Men,[75+],75,999,57.72,Census,2005 Census,1189,Nicaragua 2005 Census,UNSD,1.0,,
34285,Netherlands,528,2015,2015,Women,[60-64],60,64,69.23,Estimate,2015 Estimate,2170,Netherlands 2015 Estimate,UNSD,1.0,,
26038,Kazakhstan,398,2006,2006,Women,[30-34],30,34,73.66,Survey,2006 MICS,1882,Kazakhstan 2006 Multiple Indicator Cluster Survey,MICS,1.0,,
1684,Austria,40,1974,1974,Men,[25-29],25,29,60.47,Estimate,1974 Estimate,2038,Austria 1974 Estimate,UNSD,,,
7927,China,156,2000,2000,Women,[45-49],45,49,95.74,Census,2000 Census,272,China 2000 Census,UNSD,1.0,,
20036,Hungary,348,1980,1980,Men,[30-34],30,34,83.36,Census,1980 Census,1092,Hungary 1980 Census,UNSD,,,
25985,Kazakhstan,398,1989,1989,Women,[60-64],60,64,52.95,Census,1989 Census,1003,Kazakhstan 1989 Census,UNSD,,,
3385,Belize,84,1970,1970,Men,[45-49],45,49,65.41,Census,1970 Census,940,Belize 1970 Census,US Census Bureau,,,


In [432]:
df_11.columns = (
    df_11.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
    )
df_11.sample(8)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,agegroup,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteondata,noteoncountryandpopulation
2463,Bahamas,44,2000,2000,Men,[30-34],30,34,54.87,Census,2000 Census,415,Bahamas 2000 Census,UNSD,1.0,,
29609,Luxembourg,442,2011,2011,Women,[60-64],60,64,70.59,Census,2011 Census,4832,Luxembourg 2011 Census,Eurostat,1.0,Estimates computed based on data on marital st...,
35612,Norway,578,1975,1975,Men,[55-59],55,59,82.08,Estimate,1975 Estimate,2180,Norway 1975 Estimate,UNSD,,,
23466,Ireland,372,1983,1983,Men,[25-29],25,29,51.81,Estimate,1983 Estimate,2126,Ireland 1983 Estimate,UNSD,,Based on the results of a labour force survey.,
9800,Côte d'Ivoire,384,1994,1994,Men,[35-39],35,39,81.8,Survey,1994 DHS,1779,Cote d'Ivoire 1994 Demographic and Health Survey,DHS_STATcompiler,,,
6163,Cameroon,120,1998,1998,Women,[15-19],15,19,33.6,Survey,1998 DHS,1674,Cameroon 1998 Demographic and Health Survey,DHS_STATcompiler,1.0,,
39385,Republic of Moldova,498,2004,2004,Men,[40-44],40,44,86.6,Census,2004 Census,845,Moldova 2004 Census,UNSD,1.0,,
20825,Hungary,348,2006,2006,Men,[55-59],55,59,74.68,Estimate,2006 Estimate,2120,Hungary 2006 Estimate,UNSD,,,


In [433]:
df_11 = df_11.drop(columns = ['datacataloglongname', 'datacatalogid', 'yearstart' , 'yearend', 'noteondata', 'noteoncountryandpopulation', 'including_consensual_unions'])

df_11.rename(columns={
    "agestart": "age_start",
    "countryorarea": "country",
    "datasource": "data_source",
    "datavalue" : "data_value"
}, inplace=True)

df_11.sample(10)

Unnamed: 0,country,isocode,sex,agegroup,age_start,ageend,data_value,dataprocess,datacatalogshortname,data_source
2416,Bahamas,44,Men,[70-74],70,74,69.85,Census,1980 Census,UNSD
17291,Germany,276,Men,[55-59],55,59,71.72,Estimate,2014 Estimate,UNSD
34534,New Caledonia,540,Men,[40-44],40,44,44.45,Census,2014 Census,National statistics
42463,Senegal,686,Women,[35-39],35,39,93.02,Survey,1960-1961 ED,INED
42588,Senegal,686,Men,[40-44],40,44,91.7,Survey,1997 DHS,DHS_STATcompiler
11450,Denmark,208,Men,[75+],75,999,62.53,Census,1991 Census,UNSD
50357,Turks and Caicos Islands,796,Men,[35-39],35,39,77.66,Census,1970 Census,US Census Bureau
46965,Sweden,752,Men,[65-69],65,69,71.61,Estimate,1994 Estimate,UNSD
5177,Burkina Faso,854,Men,[75+],75,999,81.21,Census,1985 Census,UNSD
28769,Liechtenstein,438,Men,[65-69],65,69,79.12,Census,2010 Census,UNSD


In [434]:
df_11.drop_duplicates(inplace=True)

In [435]:
df_11.isnull().sum()

country                 0
isocode                 0
sex                     0
agegroup                0
age_start               0
ageend                  0
data_value              0
dataprocess             0
datacatalogshortname    0
data_source             0
dtype: int64

In [436]:
#df_11.to_csv("cleaned_currently_married_un.csv", index=False)

In [437]:
#df_11.to_sql('currently_married_un', engine, if_exists='replace', index=False)

In [438]:
df_12 = pd.read_csv('../data/processed/ever_married_un.csv', header= 2, low_memory = False)
df_12.head()

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,AgeGroup,AgeStart,AgeEnd,DataValue,DataProcess,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Data,Note on Country and Population
0,Afghanistan,4,1972,1974,Men,[15-19],15,19,7.7,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
1,Afghanistan,4,1972,1974,Men,[20-24],20,24,32.6,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
2,Afghanistan,4,1972,1974,Men,[25-29],25,29,61.4,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
3,Afghanistan,4,1972,1974,Men,[30-34],30,34,83.0,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
4,Afghanistan,4,1972,1974,Men,[35-39],35,39,91.2,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,


In [439]:
df_12.columns = (
    df_12.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
    )
df_12.sample(8)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,agegroup,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteondata,noteoncountryandpopulation
33231,Mexico,484,2016,2016,Men,[75+],75,999,96.26,Estimate,2016 Estimate,2158,Mexico 2016 Estimate,UNSD,1.0,,
54949,Western Sahara,732,1970,1970,Men,[55-59],55,59,90.86,Census,1970 Census,1106,Western Sahara 1970 Census,UNSD,1.0,,Comprises of Northern Region (former Saguia el...
37741,Norway,578,1979,1979,Women,[40-44],40,44,94.77,Estimate,1979 Estimate,2180,Norway 1979 Estimate,UNSD,,,
42705,Saint Lucia,662,1991,1991,Men,[75+],75,999,85.19,Census,1991 Census,1136,Saint Lucia 1991 Census,UNSD,,,
21525,Hungary,348,1985,1985,Women,[40-44],40,44,95.8,Estimate,1985 Estimate,2120,Hungary 1985 Estimate,UNSD,,,
20534,Guyana,328,2000,2000,Women,[15-19],15,19,31.65,Survey,2000 MICS,1939,Guyana 2000 Multiple Indicator Cluster Survey,MICS,,,
12054,Denmark,208,1979,1979,Men,[75+],75,999,92.59,Estimate,1979 Estimate,2081,Denmark 1979 Estimate,UNSD,,,Excluding Faeroe Islands and Greenland shown s...
3107,Belgium,56,1970,1970,Men,[15-19],15,19,1.03,Census,1970 Census,290,Belgium 1970 Census,UNSD,,,


In [440]:
df_12 = df_12.drop(columns = ['yearstart', 'yearend', 'datacatalogshortname', 'datacatalogid', 'datacataloglongname', 'including_consensual_unions', 'noteondata', 'noteoncountryandpopulation'])

df_12.rename(columns={
    "agestart": "age_start",
    "ageend": "age_end",
    "countryorarea": "country"
}, inplace=True)
df_12.sample(8)

Unnamed: 0,country,isocode,sex,agegroup,age_start,age_end,datavalue,dataprocess,datasource
47998,Sudan,729,Women,[45-49],45,49,96.08,Estimate,UNSD
53346,United Kingdom,826,Men,[10-14],10,14,0.0,Estimate,UNSD
8385,China,156,Women,[65+],65,999,99.72,Survey,National statistics
51548,Tonga,776,Men,[65-69],65,69,94.32,Census,UNSD
24647,Iraq,368,Women,[35-39],35,39,87.21,Survey,MICS
46324,Slovenia,705,Women,[35-39],35,39,72.46,Estimate,UNSD
25209,Ireland,372,Women,[15-19],15,19,0.37,Estimate,UNSD
47029,South Africa,710,Men,[65-69],65,69,94.7,Census,UNSD


In [441]:
df_12.dropna(inplace=True)

In [442]:
df_12.isnull().sum()

country        0
isocode        0
sex            0
agegroup       0
age_start      0
age_end        0
datavalue      0
dataprocess    0
datasource     0
dtype: int64

In [443]:
#df_12.to_csv("cleaned_ever_married_un.csv", index=False)

In [444]:
#df_12.to_sql('ever_married_un', engine, if_exists= 'replace', index= False)

In [445]:
df_13 = pd.read_csv('../data/processed/fertility_indicators_un.csv', header=6, low_memory=False)
df_13.head()

Unnamed: 0,Country or Area,Country or Area Code,Age Group,Indicator,Date,Value,Series,DataType,Data Source Type,Survey Programme,Data Source Inventory ID,Data Source Name,Data Source Name (short),Data Source Start Year,Data Source End Year,Reference,Reference Year
0,Afghanistan,4,[Total],TFR,1964.977051,7.966653,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
1,Afghanistan,4,[Total],TFR,1965.977051,8.212275,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
2,Afghanistan,4,[Total],TFR,1966.977051,8.317603,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
3,Afghanistan,4,[Total],TFR,1967.977051,8.225812,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
4,Afghanistan,4,[Total],TFR,1968.977051,8.068459,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012


In [446]:
df_13.columns = (df_13.columns
        .str.lower()
        .str.strip()
        .str.replace(' ', '')
        .str.replace('(', '')
        .str.replace(')', '')
        .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
        )

df_13.sample(6)

Unnamed: 0,countryorarea,countryorareacode,agegroup,indicator,date,value,series,datatype,datasourcetype,surveyprogramme,datasourceinventoryid,datasourcename,datasourcenameshort,datasourcestartyear,datasourceendyear,reference,referenceyear
53686,Norway,578,[20-24],ASFR2024,1995.5,77.49001,Eurostat.20190531,Official estimates,Estimate,Estimate,2180,All sources of estimates,Estimates,1995,1995,"Eurostat Statistics, Fertility rates by age [d...",2019
30235,Guam,316,[20-24],ASFR2024,2010.5,148.8,NVSR.61.1,Direct,Register,VR,589,Vital Registration,Register,2010,2010,"National Vital Statistics Reports. Vol61, N1 (...",2012
54975,Paraguay,600,[Total],TFR,1986.869995,4.655122,"2004 ENDSSR-RHS,Birth Histories (Extrapolated)...",Extrapolated from Truncated Birth Histories,Survey,RHS,649,Paraguay 2004 Encuesta Nacional de Demografía ...,2004 ENDSSR-RHS,2004,2004,DHS Statcompiler,2012
48489,Morocco,504,[25-29],ASFR2529,1978.5,281.831,"1980 WFS,Birth Histories,FBH analysis 2018,771...",Birth histories,Survey,WFS,771,Morocco 1980 World Fertility Survey,1980 WFS,1979,1980,Fertility rates from full birth histories anal...,2018
44802,Malaysia,458,[Total],TFR,2010.50137,2.1,"Register, VS Malaysia",Direct,Register,VR,616,Vital Registration,Register,2010,2010,Vital Statistics Malaysia,2017
12664,Canada,124,[Total],MAC,1992.5,27.9495,"Register,Computed rate from DYB,DYB,427-135-48",Computed rate from DYB,Register,VR,427,Vital Registration,Register,1992,1992,Demographic Yearbook,1995


In [447]:
df_13 = df_13.drop(columns=['countryorareacode','indicator','datasourceinventoryid','surveyprogramme','series','datasourcename','reference','referenceyear'])

df_13.replace({
    "agegroup": "age_group",
    "countryorarea": "country",
    "datatype": "data_type",
},inplace=True)

In [448]:
df_13['date'] = df_13['date'].astype(int)
df_13['value'] = df_13['value'].round(2)
df_13.sample(12)

Unnamed: 0,countryorarea,agegroup,date,value,datatype,datasourcetype,datasourcenameshort,datasourcestartyear,datasourceendyear
50462,Nepal,[20-24],1993,289.0,Direct,Survey,2001 DHS,2001,2001
24897,Finland,[35-39],1979,27.54,Official estimates,Estimate,Estimates,1979,1979
20333,Dem. People's Rep. of Korea,[Total],2000,1.87,Reverse survival method,Survey,2009 MICS,2009,2009
60463,Rwanda,[20-24],1991,227.0,Direct,Survey,1992 DHS,1992,1992
6822,Belarus,[15-19],1988,36.0,Fertility data (adjusted),Estimate,Estimates,1988,1988
49606,Namibia,[20-24],1996,172.67,Birth histories,Survey,2013 DHS,2013,2013
43632,Luxembourg,[30-34],1994,97.97,Official estimates,Estimate,Estimates,1994,1994
49146,Myanmar,[35-39],1982,138.0,Own-children method,Survey,1991 PCFS,1991,1991
50327,Nepal,[35-39],1988,122.0,Own-children method,Survey,1995-1996 LSMS,1995,1996
69070,Tajikistan,[Total],1996,4.28,Reverse survival method,Survey,2000 MICS,2000,2000


In [449]:
#df_13.to_csv("cleaned_fertility_indicators.csv", index=False)

In [450]:
#df_13.to_sql('fertility_indicators_un',engine, if_exists='replace', index=False)

In [451]:
df_14 = pd.read_csv('../data/processed/marital_status_by_age_un.csv', header= 2, low_memory=False)
df_14.head()

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,MaritalStatus,Non-standard_AgeGroups,Series_contains_Non-standard_AgeGroups,AgeGroup,AgeStart,...,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Age groups,Note on Marital Status,Note on Data,Note on Country and Population,Note Other
0,Afghanistan,4,1972,1974,Men,Divorced,,,[15-19],15,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
1,Afghanistan,4,1972,1974,Men,Divorced,,,[20-24],20,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
2,Afghanistan,4,1972,1974,Men,Divorced,,,[25-29],25,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
3,Afghanistan,4,1972,1974,Men,Divorced,,,[30-34],30,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
4,Afghanistan,4,1972,1974,Men,Divorced,,,[35-39],35,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,


In [452]:
df_14.columns= (df_14.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '' , regex=True)  
    )
df_14.sample(5)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,maritalstatus,nonstandard_agegroups,series_contains_nonstandard_agegroups,agegroup,agestart,...,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteonagegroups,noteonmaritalstatus,noteondata,noteoncountryandpopulation,noteother
100734,Guyana,328,2002,2002,Men,Widowed,1.0,,[55+],55,...,2002 Census,1225,Guyana 2002 Census,National statistics,,,,,,
56635,Czechia,203,2006,2006,Women,Single,,,[15-19],15,...,2006 Estimate,2079,Czech Republic 2006 Estimate,UNSD,,,,,,
111745,Iceland,352,1990,1990,Men,Widowed,,,[15-19],15,...,1990 Estimate,2121,Iceland 1990 Estimate,UNSD,,,,,,
204690,Russian Federation,643,2010,2010,Women,Married,,,[35-39],35,...,2010 Census,4841,Russian Federation 2010 Census,UNSD,,,,,,
82333,France,250,1979,1979,Women,Single,,,[75+],75,...,1979 Estimate,2094,France 1979 Estimate,UNSD,,,,,Excluding diplomatic personnel outside the cou...,


In [453]:
df_14 = df_14.drop(columns=['datacataloglongname', 'noteondata', 'noteoncountryandpopulation','noteonagegroups', 'noteother',
                             'including_consensual_unions','isocode', 'datacatalogid', 'noteonmaritalstatus', 'series_contains_nonstandard_agegroups','nonstandard_agegroups'])

df_14.rename(columns={
    "countryorarea": "country",
    "agegroup": "age_group",
    "maritalstatus": "marital_status",
    "yearstart": "year_start",
    "yearend": "year_end",
    }, inplace =True
    )

df_14.sample(10)

Unnamed: 0,country,year_start,year_end,sex,marital_status,age_group,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datasource
69991,Egypt,2014,2014,Women,Divorced,[45-49],45,49,2.9,Survey,2014 DHS,DHS_STATcompiler
192067,Peru,2003,2006,Women,Never married,[20-24],20,24,55.9,Survey,2004-2006 DHS,DHS_STATcompiler
89340,Germany,2013,2013,Women,Single,[65-69],65,69,4.73,Estimate,2013 Estimate,UNSD
227992,Spain,1978,1978,Women,Widowed,[45-49],45,49,4.97,Estimate,1978 Estimate,UNSD
170732,Netherlands,1985,1985,Men,Divorced,[75+],75,999,2.16,Estimate,1985 Estimate,UNSD
170310,Netherlands,1981,1981,Men,Single,[65-69],65,69,6.25,Estimate,1981 Estimate,UNSD
170773,Netherlands,1985,1985,Men,Widowed,[70-74],70,74,11.34,Estimate,1985 Estimate,UNSD
124724,Israel,2002,2002,Women,Widowed,[65+],65,999,51.38,Estimate,2002 Estimate,UNSD
114847,Iceland,2016,2016,Men,Separated,[60-64],60,64,2.26,Estimate,2016 Estimate,UNSD
69193,Egypt,2000,2000,Women,Not living together,[50-54],50,54,0.6,Survey,2000 DHS,DHS_HH


In [454]:
df_14.drop_duplicates(inplace=True)
df_14.isnull().sum()

country                 0
year_start              0
year_end                0
sex                     0
marital_status          0
age_group               0
agestart                0
ageend                  0
datavalue               0
dataprocess             0
datacatalogshortname    0
datasource              0
dtype: int64

In [455]:
#df_14.to_csv("cleaned_marital_status_by_age_un.csv", index=False)

In [456]:
#df_14.to_sql('marital_status_by_age_un', engine, if_exists='replace', index=False)

In [457]:
df_15 = pd.read_csv('../data/processed/regions_un.csv', header=5, low_memory= False)
df_15.head(10)

Unnamed: 0,Region and subregion,ISO code,Regional Classification,Indicator,Year,AgeGroup,Percentage,Number,DataProcess
0,World,900,M49,Married or in-union women,1970,15-19,22.576683,71867.82,Estimate
1,World,900,M49,Married or in-union women,1970,20-24,63.802057,162860.4,Estimate
2,World,900,M49,Married or in-union women,1970,25-29,87.174827,182681.1,Estimate
3,World,900,M49,Married or in-union women,1970,30-34,90.825027,179121.4,Estimate
4,World,900,M49,Married or in-union women,1970,35-39,90.284386,161526.3,Estimate
5,World,900,M49,Married or in-union women,1970,40-44,86.483531,139334.4,Estimate
6,World,900,M49,Married or in-union women,1970,45-49,82.680237,116088.4,Estimate
7,World,900,M49,Married or in-union women,1970,15-49,69.379111,1013480.0,Estimate
8,World,900,M49,Married or in-union women,1971,15-19,22.630416,74127.62,Estimate
9,World,900,M49,Married or in-union women,1971,20-24,63.613178,170087.3,Estimate


In [458]:
df_15.columns = (df_15.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(','')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
    )
df_15.sample(6)

Unnamed: 0,regionandsubregion,isocode,regionalclassification,indicator,year,agegroup,percentage,number,dataprocess
12637,Europe,908,SDG-M49,Married or in-union women,2010,40-44,76.070509,20398.824756,Estimate
23624,Developing countries,902,Development group,Married or in-union women,2007,15-19,15.227034,40421.086732,Estimate
23296,Developed countries,901,Development group,Married or in-union women,2047,15-19,4.022915,1177.695196,Projection
2032,Central and Southern Asia,62,SDG,Married or in-union women,1981,15-19,44.051122,44493.023964,Estimate
6135,Middle Africa,911,M49,Married or in-union women,2007,15-49,64.385284,36111.968909,Estimate
20998,Melanesia,928,SDG-M49,Married or in-union women,2002,45-49,84.680038,241.650577,Estimate


In [459]:
df_15 = df_15.drop(columns=['regionalclassification'])

df_15.rename(columns={
    "regionandsubregion": "region",
    "isocode": "iso_code",
    "agegroup": "age_group",
    "dataprocess": "process"
}, inplace=True)

df_15.sample(10)

Unnamed: 0,region,iso_code,indicator,year,age_group,percentage,number,process
6873,Northern Africa,912,Married or in-union women,2019,20-24,41.550468,8459.036327,Estimate
7117,Northern Africa,912,Married or in-union women,2049,40-44,80.600859,18626.113813,Projection
3803,Oceania excluding Australia and New Zealand,543,Married or in-union women,2040,30-34,77.66787,997.640682,Projection
890,Sub-Saharan Africa,202,Married or in-union women,2000,25-29,78.632949,38945.935539,Estimate
5606,Eastern Africa,910,Married or in-union women,2022,45-49,73.598118,12858.895407,Estimate
1527,Northern Africa and Western Asia,747,Married or in-union women,1998,15-49,61.250642,105350.164543,Estimate
568,World,900,Married or in-union women,2041,15-19,10.031292,58957.478892,Projection
9771,Eastern Asia,906,Married or in-union women,1976,30-34,97.081497,57392.704987,Estimate
5058,Africa,903,Married or in-union women,2035,25-29,72.068406,114435.034553,Projection
12484,Europe,908,Married or in-union women,1991,35-39,82.534949,22328.09397,Estimate


In [460]:
df_15.dropna(inplace=True)
df_15.isnull().sum()

region        0
iso_code      0
indicator     0
year          0
age_group     0
percentage    0
number        0
process       0
dtype: int64

In [461]:
print(df_15['number'] % 1 != 0)

0        True
1        True
2        True
3        True
4        True
         ... 
28507    True
28508    True
28509    True
28510    True
28511    True
Name: number, Length: 28512, dtype: bool


In [462]:
df_15['percentage'] = df_15['percentage'].round(2)
df_15['number'] = df_15['number'].astype(int)
df_15.head(10)

Unnamed: 0,region,iso_code,indicator,year,age_group,percentage,number,process
0,World,900,Married or in-union women,1970,15-19,22.58,71867,Estimate
1,World,900,Married or in-union women,1970,20-24,63.8,162860,Estimate
2,World,900,Married or in-union women,1970,25-29,87.17,182681,Estimate
3,World,900,Married or in-union women,1970,30-34,90.83,179121,Estimate
4,World,900,Married or in-union women,1970,35-39,90.28,161526,Estimate
5,World,900,Married or in-union women,1970,40-44,86.48,139334,Estimate
6,World,900,Married or in-union women,1970,45-49,82.68,116088,Estimate
7,World,900,Married or in-union women,1970,15-49,69.38,1013479,Estimate
8,World,900,Married or in-union women,1971,15-19,22.63,74127,Estimate
9,World,900,Married or in-union women,1971,20-24,63.61,170087,Estimate


In [463]:
#df_15.to_csv('cleaned_regions_un.csv', index=False)



In [464]:
#df_15.to_sql('regions_un', engine, if_exists='replace',index=False)

In [465]:
df_16_1 = pd.read_csv('../data/Raw/OECD/SF_1_1_Family_size_and_composition - Sayfa1.csv')
df_16_1
#Data for Chart SF1.1.A. Average size of households by household type, 2024a
# avg_size_all	avg_size_couple_with_children	avg_size_single_parent_with_children		

Unnamed: 0,Country,All households,Couple households with children,Single parent households with children
0,Mexico,356,408.0,276.0
1,Costa Rica,346,437.0,344.0
2,Türkiye,320,410.0,280.0
3,Israel,319,465.0,286.0
4,Columbia,310,,
5,Slovak Republic,310,380.0,250.0
6,Chile,280,,
7,Iceland,270,412.0,261.0
8,New Zealand,261,388.0,267.0
9,Greece,260,380.0,250.0


In [466]:
df_16_1.columns = df_16_1.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [467]:
df_16_1.rename(columns={
        "All households": "avg_size_all",
        "Couple with children": "avg_size_couple_with_children",
        "Single parent with children": "avg_size_single_parent_with_children"
}, inplace=True)

In [468]:
df_16_1.drop_duplicates(inplace=True)
df_16_1.dropna(inplace=True)

In [469]:
for col in df_16_1.columns:
    if col != 'country':
        # Replace commas with dots if necessary, remove non-numeric chars, convert to float
        df_16_1[col] = (
            df_16_1[col]
            .astype(str)  # ensure string for replace
            .str.replace(',', '.', regex=False)  # decimal commas to dots
            .str.replace(r'[^\d\.\-]', '', regex=True)  # remove non-numeric chars except dot and minus
            .replace('', None)  # empty to NaN
            .astype(float)  # convert to float
        )

# Check updated dtypes
print(df_16_1.dtypes)

country                                    object
all_households                            float64
couple_households_with_children           float64
single_parent_households_with_children    float64
dtype: object


In [470]:
info_16_1 = pd.DataFrame({
    'dtype': df_16_1.dtypes,
    'null_count': df_16_1.isnull().sum(),
    'unique_count': df_16_1.nunique()
})
print(info_16_1)

                                          dtype  null_count  unique_count
country                                  object           0            39
all_households                          float64           0            19
couple_households_with_children         float64           0            16
single_parent_households_with_children  float64           0            15


In [471]:
df_16_1

Unnamed: 0,country,all_households,couple_households_with_children,single_parent_households_with_children
0,Mexico,3.56,4.08,2.76
1,Costa Rica,3.46,4.37,3.44
2,Türkiye,3.2,4.1,2.8
3,Israel,3.19,4.65,2.86
5,Slovak Republic,3.1,3.8,2.5
7,Iceland,2.7,4.12,2.61
8,New Zealand,2.61,3.88,2.67
9,Greece,2.6,3.8,2.5
10,Croatia,2.6,3.9,2.6
11,Australia,2.53,3.93,2.78


In [472]:
df_16_2 = pd.read_csv('../data/Raw/OECD/SF_1_1_Family_size_and_composition - Sayfa2.csv', header=1)
df_16_2
#Table SF1.1.A. Types of household, 2021a
# share_couple_total	share_couple_with_children	share_couple_without_children	share_single_parent_total	share_single_mother	share_single_father	share_single_person	share_other_types						

Unnamed: 0,Country,Total,With children,Without children,Total.1,Single mother households,Single father households,Single person households,Other households types
0,Australia,5593,2990,2602,1037,,,2512,858
1,Austria,4893,2113,2780,563,478,085,3834,711
2,Belgium,5222,2398,2824,742,608,135,3550,486
3,Canada,5092,2530,2562,872,,,2935,1102
4,Chile,..,..,..,..,..,..,..,..
5,Columbia,..,..,..,..,..,..,..,..
6,Costa Rica,5244,3815,1429,1055,949,106,1127,2574
7,Czechia,4703,2170,2532,715,611,104,3915,667
8,Denmark,4860,2041,2819,631,511,119,3757,752
9,Estonia,4620,2546,2073,683,609,074,3699,998


In [473]:
df_16_2.rename(columns={
    "Total": "couple_total(%)",
    "Couple with children": "couple_with_children(%)",
    "Couple without children": "couple_without_children(%)",
    "Total.1": "single_parent_total(%)",
    "Single mother households": "single_mother(%)",
    "Single father households": "single_father(%)",
    "Single person households": "single_person(%)",
    "Other types of households": "other_household_types(%)"
}, inplace=True)

In [474]:
# Normalize column names (lowercase, underscores, remove () and non-ascii)
df_16_2.columns = (
    df_16_2.columns.str.strip().str.lower()
    .str.replace(' ', '_')
    .str.replace('[()%]', '', regex=True)
    .str.replace('[^0-9a-z_]', '', regex=True)
)

In [475]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_16_2.columns if c != "country"]

df_16_2[num_cols] = (
    df_16_2[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [476]:
df_16_2.drop_duplicates(inplace=True)
df_16_2.dropna(inplace=True)
df_16_2.dropna(how="all", subset=num_cols, inplace=True)

In [477]:
info_16_2 = pd.DataFrame({
    "dtype": df_16_2.dtypes,
    "null_count": df_16_2.isna().sum(),
    "unique_count": df_16_2.nunique()
})
print(info_16_2)
print(df_16_2.dtypes)

                          dtype  null_count  unique_count
country                  object           0            36
couple_total            float64           0            36
with_children           float64           0            35
without_children        float64           0            36
single_parent_total     float64           0            34
single_mother           float64           0            32
single_father           float64           0            31
single_person           float64           0            35
other_households_types  float64           0            36
country                    object
couple_total              float64
with_children             float64
without_children          float64
single_parent_total       float64
single_mother             float64
single_father             float64
single_person             float64
other_households_types    float64
dtype: object


In [478]:
df_16_2

Unnamed: 0,country,couple_total,with_children,without_children,single_parent_total,single_mother,single_father,single_person,other_households_types
1,Austria,48.93,21.13,27.8,5.63,4.78,0.85,38.34,7.11
2,Belgium,52.22,23.98,28.24,7.42,6.08,1.35,35.5,4.86
6,Costa Rica,52.44,38.15,14.29,10.55,9.49,1.06,11.27,25.74
7,Czechia,47.03,21.7,25.32,7.15,6.11,1.04,39.15,6.67
8,Denmark,48.6,20.41,28.19,6.31,5.11,1.19,37.57,7.52
9,Estonia,46.2,25.46,20.73,6.83,6.09,0.74,36.99,9.98
10,Finland,45.64,17.06,28.58,5.43,4.5,0.93,45.34,3.6
11,France,49.73,22.19,27.54,7.68,6.23,1.45,37.78,4.81
12,Germany,45.78,17.89,27.89,5.41,4.44,0.98,43.14,5.67
13,Greece,52.14,24.03,28.11,4.66,3.82,0.84,32.35,10.85


In [479]:
df_16_3 = pd.read_csv('../data/Raw/OECD/SF_1_1_Family_size_and_composition - Sayfa3.csv', header=1)
df_16_3
#Table SF1.1.B. Households by number of children, 2024a
# share_hh_0_children	share_hh_1_child	share_hh_2_children	share_hh_3plus_children		

Unnamed: 0,country,0 children,1 child,2 children,3 or more children,Children under 6
0,Australia,..,..,..,..,..
1,Austria,7778,1052,857,312,944
2,Belgium,7397,1176,1015,411,1040
3,Canada,..,..,..,..,..
4,Chile,..,..,..,..,..
5,Columbia,..,..,..,..,..
6,Costa Rica,3029,2308,2461,2202,2630
7,Czechia,7195,1385,1156,264,1229
8,Denmark,7778,1054,894,274,815
9,Estonia,7576,1253,873,298,985


In [480]:
df_16_3.rename(columns={
    "0 children": "households_0_children(%)",
    "1 child": "households_1_child(%)",
    "2 children": "households_2_children(%)"
}, inplace=True)

In [481]:
# Normalize column names (lowercase, underscores, remove () and non-ascii)
df_16_3.columns = (
    df_16_3.columns.str.strip().str.lower()
    .str.replace(' ', '_')
    .str.replace('[()%]', '', regex=True)
    .str.replace('[^0-9a-z_]', '', regex=True)
)

In [482]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_16_3.columns if c != "country"]

df_16_3[num_cols] = (
    df_16_3[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [483]:
df_16_3.drop_duplicates(inplace=True)
df_16_3.dropna(inplace=True)

In [484]:
info_16_3 = pd.DataFrame({
    "dtype": df_16_3.dtypes,
    "null_count": df_16_3.isna().sum(),
    "unique_count": df_16_3.nunique()
})
print(info_16_3)
print(df_16_3.dtypes)

                         dtype  null_count  unique_count
country                 object           0            33
households_0_children  float64           0            32
households_1_child     float64           0            32
households_2_children  float64           0            33
3_or_more_children     float64           0            31
children_under_6       float64           0            31
country                   object
households_0_children    float64
households_1_child       float64
households_2_children    float64
3_or_more_children       float64
children_under_6         float64
dtype: object


In [485]:
df_16_3

Unnamed: 0,country,households_0_children,households_1_child,households_2_children,3_or_more_children,children_under_6
1,Austria,77.78,10.52,8.57,3.12,9.44
2,Belgium,73.97,11.76,10.15,4.11,10.4
6,Costa Rica,30.29,23.08,24.61,22.02,26.3
7,Czechia,71.95,13.85,11.56,2.64,12.29
8,Denmark,77.78,10.54,8.94,2.74,8.15
9,Estonia,75.76,12.53,8.73,2.98,9.85
10,Finland,81.98,7.89,6.99,3.14,7.14
11,France,75.36,11.43,9.23,3.99,9.86
12,Germany,79.86,9.91,7.72,2.51,8.57
13,Greece,74.31,11.83,9.97,3.89,9.37


In [486]:
df_17_1 = pd.read_csv('../data/Raw/OECD/SF_2_1_Fertility_rates_Births_by_birth_order_S2.csv')
df_17_1

Unnamed: 0,Country,Birth order,1987,1988,1989,1990,1991,1992,1993,1994,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Austria,First birth,476,478,467,462,465,461,452,445,...,480,483,473,475,471,472,477,476,484,481
1,,Second birth,337,337,343,349,345,348,358,364,...,355,353,356,353,353,351,353,355,349,351
2,,Third birth or higher,188,185,190,189,190,191,189,191,...,165,164,171,172,176,177,170,169,167,168
3,Belgium,First birth,468,469,473,473,481,472,469,472,...,423,435,441,436,429,426,450,440,447,455
4,,Second birth,330,329,327,328,323,328,335,330,...,351,348,345,346,345,347,342,351,343,341
5,,Third birth or higher,202,202,199,199,196,200,196,198,...,226,218,214,219,226,226,208,209,209,204
6,Czechia,First birth,467,466,474,478,501,498,485,477,...,474,481,487,487,480,478,476,464,463,463
7,,Second birth,377,376,374,372,355,358,368,369,...,375,373,367,366,372,376,376,390,386,391
8,,Third birth or higher,156,158,152,150,144,144,148,154,...,151,147,146,147,147,146,148,146,15,146
9,Estonia,First birth,435,435,440,462,495,503,496,496,...,419,423,408,402,367,388,380,372,398,397


In [487]:
df_17_2 = pd.read_csv('../data/Raw/OECD/SF_2_1_Total_Fertility_rates_S1.csv')
df_17_2

Unnamed: 0,Country,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Australia,345,355,343,334,315,297,289,285,289,...,179,179,179,174,174,167,159,170,163.0,150.0
1,Austria,269,278,280,282,279,270,266,262,258,...,146,149,153,152,148,146,144,148,141.0,132.0
2,Belgium,254,263,259,268,271,261,252,241,231,...,174,170,168,165,162,160,155,160,153.0,147.0
3,Canada,390,384,376,367,350,315,281,260,245,...,161,160,159,155,151,147,141,144,133.0,126.0
4,Chile,470,466,460,454,446,436,426,414,403,...,177,174,169,156,154,143,131,118,126.0,117.0
5,Colombia,674,671,666,658,648,633,616,596,574,...,182,177,172,172,172,171,169,168,166.0,165.0
6,Costa Rica,671,665,654,639,619,596,570,542,503,...,180,180,175,171,171,160,145,135,134.0,133.0
7,Czechia,211,213,214,233,236,218,201,190,183,...,153,157,163,169,171,171,171,183,162.0,145.0
8,Denmark,254,255,254,264,260,261,262,235,212,...,169,171,179,175,173,170,167,172,155.0,150.0
9,Estonia,198,198,195,189,194,188,187,190,203,...,154,158,160,159,167,166,158,161,141.0,131.0


In [488]:
df_6666 = pd.read_csv('../data/Raw/OECD/OECD_df_famliy_selected.csv')
df_6666

Unnamed: 0,STRUCTURE,STRUCTURE_ID,STRUCTURE_NAME,ACTION,COU,Country,SEX,Sex,IND,Indicator,...,OBS_VALUE,Observation Value,OBS_STATUS,Observation Status,UNIT_MEASURE,Unit of Measures,UNIT_MULT,Multiplier,BASE_PER,Base reference period
0,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,LVA,Latvia,TOTAL,Total,FAM3,Share of births outside of marriage (% of all ...,...,39.5,,A,,PC,Percentage,0,Units,,
1,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,GRC,Greece,TOTAL,Total,FAM3,Share of births outside of marriage (% of all ...,...,11.1,,A,,PC,Percentage,0,Units,,
2,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,CHL,Chile,TOTAL,Total,FAM3,Share of births outside of marriage (% of all ...,...,74.8,,A,,PC,Percentage,0,Units,,
3,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,NLD,Netherlands,TOTAL,Total,FAM3,Share of births outside of marriage (% of all ...,...,51.9,,A,,PC,Percentage,0,Units,,
4,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,LTU,Lithuania,TOTAL,Total,FAM3,Share of births outside of marriage (% of all ...,...,26.4,,A,,PC,Percentage,0,Units,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
500,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,COL,Colombia,TOTAL,Total,FAM2,Mean age of women at childbirth,...,30.4,,A,,YR,Years,0,Units,,
501,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,OAVG,OECD - Average,TOTAL,Total,FAM2,Mean age of women at childbirth,...,30.5,,A,,YR,Years,0,Units,,
502,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,OAVG,OECD - Average,TOTAL,Total,FAM2,Mean age of women at childbirth,...,30.6,,A,,YR,Years,0,Units,,
503,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,OAVG,OECD - Average,TOTAL,Total,FAM2,Mean age of women at childbirth,...,30.7,,A,,YR,Years,0,Units,,


In [489]:
df_18 = pd.read_csv('../data/Raw/OECD/sf1_2_wide_from_df18.csv')
df_18

Unnamed: 0,country,year,Living with two parents,Living with a single parent,Other
0,Australia,2003,80.1,19.5,0.5
1,Australia,2006,81.5,18.0,0.5
2,Australia,2009,82.0,17.6,0.4
3,Australia,2012,81.3,18.0,0.6
4,Austria,2003,81.2,16.8,2.0
...,...,...,...,...,...
470,United States,2014,68.7,27.5,3.8
471,United States,2015,69.2,26.8,3.9
472,United States,2016,68.7,27.4,3.8
473,United States,2017,68.9,27.1,4.0


In [490]:
for col in df_18.select_dtypes(include=['object']).columns:
    df_18[col] = df_18[col].astype(str).str.strip()

# 2) Define placeholders representing missing data in OECD exports
placeholders = ['..', '...', '.', ' .', '…', 'Na', 'nan', 'None']

# 3) Replace placeholders with NaN directly in df_18
df_18.replace(placeholders, pd.NA, inplace=True)

In [491]:
# 1) Ensure 'year' is integer
df_18["year"] = pd.to_numeric(df_18["year"], errors="coerce").astype("Int64")

# 2) Convert all non-key columns to numeric and round(2)
for col in df_18.columns:
    if col not in ["country", "year"]:
        df_18[col] = pd.to_numeric(df_18[col], errors="coerce").round(2)

In [492]:
# 1) Drop rows with missing key fields
df_7777.dropna(subset=["country", "year"], inplace=True)

# 2) Drop duplicate country-year rows, keep the first
df_7777.drop_duplicates(subset=["country", "year"], keep="first", inplace=True)

# 3) Drop rows where all value columns are NaN
value_cols = [c for c in df_18.columns if c not in ["country", "year"]]
df_7777.dropna(subset=value_cols, how="all", inplace=True)

# 4) Sort and reset index
df_7777.sort_values(["country", "year"], inplace=True)
df_7777.reset_index(drop=True, inplace=True)


In [493]:
df_18

Unnamed: 0,country,year,Living with two parents,Living with a single parent,Other
0,Australia,2003,80.1,19.5,0.5
1,Australia,2006,81.5,18.0,0.5
2,Australia,2009,82.0,17.6,0.4
3,Australia,2012,81.3,18.0,0.6
4,Austria,2003,81.2,16.8,2.0
...,...,...,...,...,...
470,United States,2014,68.7,27.5,3.8
471,United States,2015,69.2,26.8,3.9
472,United States,2016,68.7,27.4,3.8
473,United States,2017,68.9,27.1,4.0


In [494]:
df_18['Other'] = pd.to_numeric(df_18['Other'], errors='coerce')

In [495]:
df_info = pd.DataFrame({
    'dtype': df_18.dtypes,
    'null_count': df_18.isnull().sum(),
    'unique_count': df_18.nunique()
})
print(df_info)

                               dtype  null_count  unique_count
country                       object           0            39
year                           Int64           0            18
Living with two parents      float64           0           211
Living with a single parent  float64           0           203
Other                        float64           1            50


In [496]:
print(repr(df_18.loc[df_18['Other'].notnull(), 'Other'].unique()))

array([0.5, 0.4, 0.6, 2. , 1. , 1.9, 0.3, 0.1, 0.8, 0.7, 8.7, 3.5, 2.5,
       2.1, 2.4, 2.6, 6.7, 5.1, 1.4, 1.2, 1.7, 1.5, 3.4, 2.9, 2.3, 3. ,
       4.2, 2.8, 1.3, 9. , 0.2, 0.9, 1.1, 4.5, 4.7, 1.6, 3.8, 3.6, 3.3,
       2.2, 0. , 1.8, 2.7, 3.2, 3.9, 4.1, 4.4, 3.7, 4. , 4.3])


In [497]:
df_18['Other'] = pd.to_numeric(df_18['Other'], errors='coerce')

df_18.dropna(inplace=True, subset=['Other'])

df_18.isnull().sum()

country                        0
year                           0
Living with two parents        0
Living with a single parent    0
Other                          0
dtype: int64

In [498]:
#df_18.to_csv('../data/Cleaned/cleaned_household_children.csv', index=False)

In [499]:
#df_18.to_sql('household_children_oecd', engine, if_exists= 'replace', index= False)

In [500]:
df_19_1 =pd.read_csv('../data/Raw/OECD/SF_2_3_Age_mothers_childbirth_mean_age_birth_S1.csv')
df_19_1

Unnamed: 0,Country,1963,1964,1965,1966,1967,1968,1969,1970,1971,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Australia,275,275,274,273,273,272,272,271,269,...,301,301,302,303,305,306,307,308,309,311
1,Austria,274,274,273,271,270,268,268,267,267,...,302,303,304,306,306,307,309,310,310,312
2,Belgium,278,277,276,275,274,273,272,272,270,...,300,302,303,304,305,306,307,308,308,310
3,Canada,278,279,278,277,275,273,273,272,270,...,303,304,305,306,307,309,310,312,313,314
4,Chile,292,291,291,290,288,287,286,284,282,...,281,283,285,288,291,294,296,299,301,..
5,Czech Republic,257,258,255,252,250,249,248,248,249,...,298,299,299,300,300,300,301,302,302,304
6,Costa Rica,293,293,293,293,292,291,289,287,285,...,265,267,268,271,272,274,276,279,284,287
7,Denmark,273,268,268,266,265,265,266,267,267,...,307,308,309,310,310,311,312,313,314,316
8,Estonia,276,274,273,273,271,269,269,267,267,...,296,295,296,299,302,304,305,306,307,310
9,Finland,281,280,280,278,277,275,274,271,269,...,304,305,305,306,308,309,310,311,312,314


In [501]:
df_19_2 = pd.read_csv('../data/Raw/OECD/SF_2_3_Age_mothers_childbirth_fertility_by_age_1960_S2.csv')
df_19_2

Unnamed: 0,Country,Age group,1960,1961,1962,1963,1964,1965,1966,1967,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Australia,15-19,443,474,447,459,470,475,489,484,...,161,146,129,120,105,103,95,88,79,71
1,Australia,20-24,2201,2258,2160,2082,1905,1793,1731,1708,...,532,513,474,473,447,431,428,401,377,388
2,Australia,25-29,2163,2212,2167,2112,1981,1885,1839,1850,...,1026,991,948,934,922,897,893,843,803,867
3,Australia,30-34,1275,1311,1277,1239,1191,1101,1051,1028,...,1269,1248,1204,1217,1236,1191,1201,1156,1114,1206
4,Australia,35-39,623,634,614,597,584,530,506,478,...,715,709,692,698,720,713,716,693,663,709
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,Bulgaria,25-29,1201,1193,1148,1135,1077,1045,1021,1013,...,889,867,881,881,885,886,888,913,898,917
143,Bulgaria,30-34,513,521,517,499,482,451,418,403,...,658,646,685,697,712,715,723,728,713,736
144,Bulgaria,35-39,197,194,192,181,169,167,153,146,...,276,271,301,306,312,327,329,340,340,348
145,Bulgaria,40-44,72,64,63,54,50,44,38,38,...,44,47,54,59,66,71,72,77,78,78


In [502]:
df_19_3 = pd.read_csv('../data/Raw/OECD/SF_2_3_Age_mothers_childbirth_fertility_by_age_2000_S3.csv')
df_19_3

Unnamed: 0,Country,Age group,2000,2001,2002,2003,2004,2005,2006,2007,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,OECD-Average,15-19,226,220,211,205,203,201,200,205,...,179,168,162,152,144,135,126,117,102,95
1,OECD-Average,20-24,717,693,668,655,647,632,629,630,...,564,538,533,519,504,488,470,450,420,405
2,OECD-Average,25-29,1079,1050,1031,1035,1034,1023,1026,1034,...,994,965,969,961,949,928,907,884,855,869
3,OECD-Average,30-34,881,872,886,911,934,946,976,1000,...,1036,1019,1040,1049,1053,1041,1033,1017,996,1036
4,OECD-Average,35-39,381,386,395,406,422,435,456,477,...,531,534,551,563,571,570,574,575,559,587
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296,Romania,25-29,782,770,786,820,848,908,923,930,...,918,883,944,989,1001,1090,1083,1091,1094,1109
297,Romania,30-34,388,381,388,388,416,475,511,542,...,666,648,715,754,785,866,859,864,871,875
298,Romania,35-39,134,138,152,194,232,251,257,249,...,273,274,299,321,330,368,367,383,406,411
299,Romania,40-44,31,31,30,29,31,31,28,31,...,49,48,56,61,68,73,78,80,85,82


In [503]:
df_888= pd.read_csv('../data/Raw/OECD/Households-by-type,-presence-of-children-and-country,-2015-2024.csv')
df_888

Unnamed: 0,Category,Single adult with children,Single adult without children,Couple with children,Couple without children,Other type of household with children,Other type of household without children
0,2015,6147.3,64181.3,31679.8,46641.6,11698.9,30771.6
1,2016,6148.5,63891.1,31907.3,47308.2,11766.3,30559.5
2,2017,6108.5,65353.9,32091.5,47426.1,11530.2,30297.5
3,2018,6163.6,66165.5,31720.2,48194.8,11342.5,30224.0
4,2019,6246.4,67417.9,31710.1,48503.6,11285.7,30134.8
5,2020,6136.4,67412.9,31622.2,48831.2,11212.9,30445.2
6,2021,5691.9,70200.4,30558.3,47447.4,11611.8,30700.7
7,2022,5984.9,72134.3,30469.3,47995.5,11513.6,30412.1
8,2023,5924.8,73396.2,30313.0,48477.5,11443.5,30608.8
9,2024,6077.7,75049.7,30286.5,49058.4,11311.9,30487.3


In [506]:
df_20= pd.read_csv('../data/Raw/OECD/SF_2_4_Share_births_outside_marriage_1960.csv')
df_20

Unnamed: 0,Country,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,Austria,130,126,120,116,113,112,114,115,120,...,404,415,414,417,421,422,420,413,406,412
1,Belgium,21,20,21,22,23,24,25,25,27,...,470,477,495,494,480,490,528,524,..,..
2,Czech Republic,49,46,45,47,48,50,53,53,54,...,418,434,450,467,478,486,490,485,482,485
3,Denmark,78,80,83,89,93,95,102,111,111,...,490,506,515,525,538,540,542,542,541,542
4,Finland,40,41,40,42,44,46,48,51,53,...,409,415,421,428,443,449,448,446,454,461
5,Germany,76,71,66,61,59,58,57,58,61,...,339,345,348,350,350,355,347,339,333,331
6,Greece,12,12,12,12,11,11,10,10,11,...,74,76,70,82,88,94,103,111,124,138
7,Hungary,55,55,54,53,52,52,51,50,50,...,423,445,456,473,479,467,447,439,387,304
8,Iceland,253,253,245,251,267,269,284,300,305,...,650,669,..,..,..,696,712,705,694,..
9,Ireland,16,16,18,18,20,22,23,25,26,...,339,351,353,363,366,367,376,379,384,..


In [507]:
df_21_1= pd.read_csv('../data/Raw/OECD/SF_3_1_Marriage_divorce_rate_mean_age_first_marriage_S1.csv')
df_21_1

Unnamed: 0,Country,Gender,1990,1991,1992,1993,1994,1995,1996,1997,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,Australia,Male,265,267,269,270,272,273,276,278,...,297,298,299,300,301,303,304,307,307,306
1,Australia,Female,243,245,247,248,251,253,257,259,...,280,281,283,284,285,287,288,292,293,292
2,Czechia,Male,243,243,245,247,251,255,259,265,...,310,312,313,314,316,317,318,319,320,324
3,Czechia,Female,216,216,219,221,224,228,231,236,...,281,283,285,287,288,290,291,292,294,297
4,Denmark,Male,305,306,310,314,318,319,325,322,...,338,343,344,344,343,347,348,349,351,353
5,Denmark,Female,278,280,283,288,292,292,299,301,...,314,318,319,319,319,322,324,325,328,330
6,Greece,Male,290,293,296,297,299,301,302,306,...,327,328,329,330,332,332,333,334,337,338
7,Greece,Female,249,252,255,255,258,260,263,266,...,294,295,297,299,301,301,303,303,307,307
8,Japan,Male,284,284,284,284,285,285,285,285,...,307,308,309,311,311,311,311,311,312,310
9,Japan,Female,259,259,260,261,262,263,264,266,...,290,292,293,294,294,294,294,294,296,294


In [508]:
df_21_2 = pd.read_csv('../data/Raw/OECD/SF_3_1_Marriage_divorce_rates_S2.csv')
df_21_2

Unnamed: 0,Country,1970,1971,1972,1973,1974,1975,1976,1977,1978,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,Austria,14,13,13,13,14,14,15,15,16,...,19,19,19,18,18,18.0,18.0,17,16,15
1,Belgium,07,7,8,9,10,11,13,13,14,...,22,22,22,21,20,20.0,20.0,18,19,17
2,Czechia,22,24,23,25,25,26,25,25,26,...,27,25,25,24,24,23.0,23.0,20,20,19
3,Denmark,19,27,26,25,26,26,26,26,26,...,34,34,29,30,26,26.0,18.0,27,22,21
4,Estonia,32,32,33,32,33,34,36,39,38,...,25,24,26,25,25,24.0,21.0,19,,19
5,Finland,13,16,18,19,21,20,21,21,22,...,25,25,25,25,24,24.0,24.0,24,22,20
6,Germany,13,14,15,16,18,19,20,15,10,...,21,21,20,20,19,18.0,18.0,17,17,16
7,Greece,04,4,4,5,4,4,4,5,5,...,15,13,14,10,18,,,,,
8,Hungary,22,23,23,24,23,25,26,26,27,...,20,20,21,20,19,17.0,18.0,15,19,18
9,Italy,..,3,6,3,3,2,2,2,2,...,9,9,14,16,15,15.0,14.0,11,14,14


In [512]:
df_21_3= pd.read_csv('../data/Raw/OECD/SF_3_1_Marriage_divorce_rates_prev_marital_status_S3.csv')
df_21_3

Unnamed: 0,Country,Previous marital status,2000,2001,2002,2003,2004,2005,2006,2008,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Australia,Single never married,759,761,755,756,762,769,773,782,...,796,797,800,805,805,801,803,801,803,807
1,Australia,Divorced,220,218,224,223,218,213,209,202,...,190,188,186,182,181,185,183,185,183,180
2,Australia,Widowed,21,21,21,21,19,18,18,17,...,15,15,14,13,14,14,14,14,13,13
3,Austria,Single never married,766,747,741,737,729,731,739,748,...,755,757,767,771,775,777,781,781,782,780
4,Austria,Divorced,222,242,247,252,259,257,249,242,...,235,234,223,220,215,215,209,210,210,216
5,Austria,Widowed,12,11,12,11,12,12,11,10,...,10,9,10,9,10,8,9,9,8,4
6,Czechia,Single never married,749,745,743,740,739,742,745,726,...,740,740,752,756,766,767,764,764,761,759
7,Czechia,Divorced,237,242,244,247,247,245,244,261,...,249,249,238,234,224,223,226,226,229,230
8,Czechia,Widowed,14,13,13,13,14,12,11,13,...,12,11,10,10,10,10,10,10,10,11
9,Denmark,Single never married,759,760,762,764,760,756,756,763,...,772,760,750,762,761,769,764,771,776,783


In [515]:
df_22_1 = pd.read_csv('../data/Raw/OECD/SF3_3_A_in_private_households_by_partnership_status_S1.csv')
df_22_1

Unnamed: 0,Country,20+_All_Total_Living_with_a_partner,20+_All_Married or in a civil or registered partnership_living_with_a_partner,20+_All_Cohabiting_living_with_a_partner,20+_All_Not living with a partner,Unnamed: 5,20/34_Total_living_with_a_partner,20/34_Married or in a civil or registered partnership_living_with_a_partner,20/34_Cohabiting_living_with_a_partner,Not living with a partner_Total,Living with at least one parent
0,Australia (c),6379,5359,1020,3621,,4706,2941,1765,5294,..
1,Austria,5880,4910,970,4120,,3911,2215,1697,6089,3382
2,Belgium,6215,5351,864,3785,,4528,2933,1594,5472,3134
3,Canada (d),6689,5446,1243,3311,,5534,3355,2179,4466,..
4,Czech Republic,5117,4539,579,4883,,3078,2132,946,6922,3620
5,Denmark,6415,5002,1412,3585,,5054,2186,2868,4946,1067
6,Estonia,5393,3730,1664,4607,,4531,1781,2750,5469,2646
7,France,6414,4941,1472,3586,,5042,2189,2853,4958,2208
8,Germany,6261,5391,869,3739,,3953,2215,1739,5974,2754
9,Greece,6023,5852,171,3977,,3313,2924,390,6687,4543


In [516]:
df_22_2 = pd.read_csv('../data/Raw/OECD/SF3_3_B_ by level of educational attainment_S2.csv')
df_22_2

Unnamed: 0,Country,Low_Education_Total_living_with_a_partner,Low_educationMarried or in a civil or registered partnership_living_with_a_partner,Low_education_Cohabiting_living_with_a_partner,Not living with a partner_Low_education,Medium education_Total_Living with a partner,Medium education_Married or in a civil or registered partnership_Living with a partner,Medium education_Cohabiting_Living with a partner,Not living with a partner_Medium education,Unnamed: 9,High education_Total_Living with a partner,High education_Married or in a civil or registered partnership_Living with a partner,High education_Cohabiting_Living with a partner,Not living with a partner_High education_
0,Austria,5681,5049,632,4319,5927,4873,1054,,,6003,4838,1165,3997
1,Belgium,6228,5611,617,3772,6079,4980,1099,,,6709,5658,1051,3291
2,Czech Republic,4081,3655,426,5919,5399,4787,612,4601.0,,5729,5026,703,4271
3,Estonia,4217,2639,1578,5783,5441,3661,1779,4559.0,,6014,4445,1569,3986
4,France,6112,5193,918,3888,6568,4917,1651,3432.0,,6558,4660,1898,3442
5,Germany,5446,4879,567,4554,6238,5313,925,3762.0,,6889,5916,974,3111
6,Greece,6381,6288,93,3619,5700,5488,212,4300.0,,5833,5570,263,4167
7,Hungary,5033,4038,995,4967,5794,4678,1115,4206.0,,5956,5102,855,4044
8,Iceland,5186,4102,1084,4814,5831,4657,1174,4169.0,,6972,5453,1519,3028
9,Latvia,3627,2592,1035,6373,4932,3954,978,5068.0,,5291,4539,752,4709


In [504]:
df_999 = pd.read_csv('../data/Raw/OECD/Households-with-children-by-number-of-children,-2024.csv')
df_999

Unnamed: 0,Category,1 child,2 children,3 children or more
0,European Union,11.7,8.9,3.0
1,,,,
2,Slovakia,17.1,14.5,4.0
3,Ireland,12.4,12.2,6.4
4,Cyprus,13.9,11.7,3.1
5,Czechia,13.9,11.6,2.6
6,Romania,14.3,9.2,4.0
7,Luxembourg,12.5,12.1,2.4
8,Belgium,11.8,10.2,4.1
9,Croatia,12.0,10.1,3.8


In [505]:
import pandas as pd

df = pd.read_csv('../data/Raw/OECD/OECD,DF_FAMILY,+all.csv')

df_wide = df.pivot_table(
    index=['Country', 'TIME_PERIOD', 'COU'],
    columns='Indicator',
    values='OBS_VALUE'
).reset_index()

df_wide.columns.name = None

df_wide.to_csv("WIDE_FORMAT.csv", index=False)

print(df_wide)

# df_wide = df.pivot(index='id', columns='variable', values='value')
# df = df.drop(columns='indicator')

            Country  TIME_PERIOD  COU  Child poverty rate  \
0         Argentina         2001  ARG                 NaN   
1         Argentina         2002  ARG                 NaN   
2         Argentina         2003  ARG                 NaN   
3         Argentina         2004  ARG                 NaN   
4         Argentina         2005  ARG                 NaN   
...             ...          ...  ...                 ...   
1170  United States         2018  USA                 NaN   
1171  United States         2019  USA                 NaN   
1172  United States         2020  USA                 NaN   
1173  United States         2021  USA                 NaN   
1174  United States         2022  USA                 NaN   

      Country mean average score in mathematics, by sex  \
0                                                   NaN   
1                                                   NaN   
2                                                   NaN   
3                              