In [2]:
import pandas as pd
import os, re
from pathlib import Path
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from sqlalchemy import create_engine, types
from sqlalchemy import text 
from openpyxl import load_workbook

In [3]:
from dotenv import dotenv_values

config = dotenv_values()

# define variables for the login
pg_user = config['POSTGRES_USER']  # align the key label with your .env file !
pg_host = config['POSTGRES_HOST']
pg_port = config['POSTGRES_PORT']
pg_db = config['POSTGRES_DB']
pg_schema = config['POSTGRES_SCHEMA']
pg_pass = config['POSTGRES_PASS']

In [4]:
url = f'postgresql://{pg_user}:{pg_pass}@{pg_host}:{pg_port}/{pg_db}'
engine = create_engine(url, echo=False)

In [5]:
my_schema = 'team_5' # update it to your schema

with engine.begin() as conn: 
    result = conn.execute(text(f'SET search_path TO {my_schema};'))

In [6]:
df_1= pd.read_csv('../data/Raw/World_Marriage_Dataset.csv')

In [7]:
df_1.drop(columns=["Sr.No."], inplace=True)

In [8]:
df_1.columns = df_1.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [9]:
df_1.rename(columns={
    "agegroup": "age_group",
    "maritalstatus": "marital_status",
    "dataprocess": "data_process",
    "Data Collection (Start Year)": "data_collection_start_year",
    "Data Collection (End Year)": "data_collection_end_year",
    "Data Source": "data_source",
    "Country": "country",
    "Sex": "sex"
}, inplace=True)

In [10]:
df_1.drop_duplicates(inplace=True)

df_1['data_collection_start_year'] = df_1['data_collection_start_year'].astype(str).str.replace(',', '').astype(int)
df_1['data_collection_end_year'] = df_1['data_collection_end_year'].astype(str).str.replace(',', '').astype(int)

In [11]:
df_1.isnull().sum()

country                       0
age_group                     0
sex                           0
marital_status                0
data_process                  0
data_collection_start_year    0
data_collection_end_year      0
data_source                   0
dtype: int64

In [12]:
#df_1.to_csv("cleaned_world_marriage.csv", index=False)

In [13]:
#df_1.to_sql('world_marriage', engine, if_exists='replace', index=False)

In [14]:
df_2 = pd.read_csv('../data/Raw/age-at-marriage-women.csv')

In [15]:
df_2.columns = df_2.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [16]:
df_2 = df_2.drop(columns=['1005564annotations'])

df_2.rename(columns={
    "entity": "country",   
}, inplace=True)

In [17]:
df_2.drop_duplicates(inplace=True)


In [18]:
df_2['year'] = df_2['year'].astype(str).str.replace(',', '').astype(int)

In [19]:
df_2.isnull().sum()

country                                0
code                                   0
year                                   0
mean_age_of_women_at_first_marriage    0
dtype: int64

In [20]:
#df_2.to_csv("cleaned_age_at_marriage_women.csv", index=False)

In [21]:
#df_2.to_sql('age_at_marriage_women', engine, if_exists='replace', index=False)

In [22]:
df_3= pd.read_csv('../data/Raw/marriage-rate-per-1000-inhabitants.csv')

In [23]:
df_3.columns = df_3.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [24]:
df_3.rename(columns={
    "entity": "country",   
}, inplace=True)

In [25]:
df_3['year'] = df_3['year'].astype(str).str.replace(',', '').astype(int)

In [26]:
df_3.drop_duplicates(inplace=True)


In [27]:
df_3.isnull().sum()

country                                          0
code                                             0
year                                             0
crude_marriage_rate_marriages_per_1000_people    0
dtype: int64

In [28]:
#df_3.to_csv("cleaned_marriage-rate-per-1000-inhabitants.csv", index=False)

In [29]:
#df_3.to_sql('married_rate_per_1000', engine, if_exists='replace', index=False)

In [30]:
df_4= pd.read_csv('../data/Raw/marriage-rates-in-1990-vs-2020.csv')

In [31]:
df_4.columns = df_4.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [32]:
df_4 = df_4.drop(columns=['worldregionsaccordingtoowid'])

df_4.rename(columns={
    "crudemarriageratemarriagesper1000people": "crude_marriage_rate",
    "crudemarriageratemarriagesper1000people1": "crude_marriage_rate_people1",
    "year1": "year_1",
    "entity": "country"
}, inplace=True)

In [33]:
df_4.drop_duplicates(inplace=True)
df_4.dropna(inplace=True)

In [34]:
df_4['year_1'] = pd.to_numeric(df_4['year_1'], errors='coerce').astype('Int64')

In [35]:
df_4.isnull().sum()

country                        0
code                           0
year                           0
crude_marriage_rate            0
crude_marriage_rate_people1    0
year_1                         0
dtype: int64

In [36]:
#df_4.to_csv("cleaned_marriage-rates-in-1990-vs-2020.csv", index=False)

In [37]:
#df_4.to_sql('marriage_rates_in_1990_vs_2020', engine, if_exists='replace', index=False)

In [38]:
df_5 = pd.read_csv('../data/Raw/share-of-births-outside-marriage.csv')

In [39]:
df_5.columns = df_5.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [40]:

df_5.rename(columns={
    "shareofbirthsoutsideofmarriageofallbirths": "share_of_births_outside_of_marriage",
    "entity": "country"
}, inplace=True)

df_5.drop_duplicates(inplace=True)

In [41]:
df_5.isnull().sum()

country                                0
code                                   0
year                                   0
share_of_births_outside_of_marriage    0
dtype: int64

In [42]:
#df_5.to_csv("cleaned_share-of-births-outside-marriage.csv", index=False)

In [43]:
#df_5.to_sql('share_of_births_outside_marriage', engine, if_exists='replace', index=False)

In [44]:
df_6 = pd.read_csv('../data/Raw/share-of-men-in-england-and-wales-who-have-ever-married-by-age.csv')

In [45]:
df_6.columns = df_6.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

df_6.drop_duplicates(inplace=True)
df_6.sample(5)

Unnamed: 0,entity,code,year,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort
53,Women,,36,79.9,88.9,94.2,83.5,68.2,56.2,,
13,Men,,30,72.9,76.4,83.3,63.9,41.4,25.5,20.1,
15,Men,,32,79.3,82.2,86.5,68.8,48.2,33.5,25.1,
21,Men,,38,88.1,89.0,90.5,76.6,61.3,50.9,,
64,Women,,47,84.8,91.7,95.6,87.0,75.4,,,


In [46]:
df_6 = df_6.drop(columns=['code','proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort'])

df_6.rename(columns={
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort": "1900_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort": "1920_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort": "1940_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort": "1960_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort": "1970_birthcohort",
    "entity": "sex"
}, inplace=True)

In [47]:
df_6.isnull().sum()

sex                 0
year                0
1900_birthcohort    0
1920_birthcohort    0
1940_birthcohort    0
1960_birthcohort    0
1970_birthcohort    0
dtype: int64

In [48]:
#df_6.to_csv("cleaned_share-of-men-in-england-and-wales-who-have-ever-married-by-age.csv", index=False)

In [49]:
#df_6.to_sql('men_in_england_and_wales_married_by_age', engine, if_exists='replace', index=False)

In [50]:
df_7 = pd.read_csv('../data/Raw/share-of-births-outside-marriage.csv')

In [51]:
df_7.columns = df_7.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [52]:
df_7.rename(columns={
    "shareofsingleparenthouseholds": "share_of_single_parent_households",
    "entity": "country"
}, inplace=True)

df_7.drop_duplicates(inplace=True)
df_7.sample(5)

Unnamed: 0,country,code,year,shareofbirthsoutsideofmarriageofallbirths
517,Denmark,DNK,1972,14.4
256,Canada,CAN,2006,30.6
1281,Malta,MLT,1979,0.9
1274,Malta,MLT,1972,1.3
1245,Luxembourg,LUX,2004,26.1


In [53]:
df_7.isnull().sum()

country                                      0
code                                         0
year                                         0
shareofbirthsoutsideofmarriageofallbirths    0
dtype: int64

In [54]:
#df_7.to_csv("cleaned_share-of-single-parent-households.csv", index=False)

In [55]:
#df_7.to_sql('single_parent_households', engine, if_exists='replace', index=False)

In [56]:
df_8 = pd.read_csv('../data/Raw/share-of-women-in-england-and-wales-who-have-ever-married-by-age.csv')

In [57]:
df_8.columns = df_8.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [58]:
df_8['code'] = df_8['code'].fillna('GBR')
df_8.sample(5)

Unnamed: 0,entity,code,year,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort
59,Women,GBR,42,83.3,90.9,95.2,85.9,73.2,62.8,,
58,Women,GBR,41,82.9,90.7,95.1,85.7,72.7,62.0,,
12,Men,GBR,29,68.3,72.0,81.0,60.6,37.4,21.5,16.4,
15,Men,GBR,32,79.3,82.2,86.5,68.8,48.2,33.5,25.1,
36,Women,GBR,19,2.1,5.3,13.4,12.1,3.8,1.4,0.4,0.2


In [59]:
df_8 = df_8.drop(columns=['code','proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort'])

df_8.rename(columns={
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort": "1900_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort": "1920_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort": "1940_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort": "1960_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort": "1970_birthcohort",
    "entity": "sex"
}, inplace=True)

df_8.drop_duplicates(inplace=True)
df_8.sample(5)

Unnamed: 0,sex,year,1900_birthcohort,1920_birthcohort,1940_birthcohort,1960_birthcohort,1970_birthcohort
50,Women,33,76.8,86.8,93.0,81.1,63.1
25,Men,42,90.7,90.6,91.6,79.1,65.8
66,Women,49,85.2,91.9,95.7,87.3,76.0
31,Men,48,92.5,91.7,92.3,81.3,69.9
51,Women,34,78.0,87.7,93.4,82.0,64.9


In [60]:
df_8.isnull().sum()

sex                 0
year                0
1900_birthcohort    0
1920_birthcohort    0
1940_birthcohort    0
1960_birthcohort    0
1970_birthcohort    0
dtype: int64

In [61]:
#df_8.to_csv("cleaned_share-of-women-in-england-and-wales-who-have-ever-married-by-age.csv", index=False)

In [62]:
#df_8.to_sql('women_in_england_and_wales_married_by_age', engine, if_exists='replace', index=False)

In [63]:
#pip install openpyxl pywin32

In [64]:
df_excel_1 = pd.read_excel('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx')

In [65]:
#all_sheets = pd.read_excel('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx', sheet_name=None)

In [66]:
xls_1 = pd.ExcelFile('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx')
print(xls_1.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'MARITAL_STATUS_BY_AGE', 'CURRENTLY MARRIED', 'EVER_MARRIED', 'SMAM']


In [67]:
excel_1 = '../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx'

# Output directory (make sure it exists)
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)

# List of sheets you want to extract
sheets_to_extract = ['MARITAL_STATUS_BY_AGE', 'CURRENTLY MARRIED', 'EVER_MARRIED', 'SMAM']

In [68]:
"""for sheet in sheets_to_extract:
    # Read just this sheet into a DataFrame
    df_excel_1 = pd.read_excel(excel_1, sheet_name=sheet)
    
    # Optional: Clean the filename (replace spaces with underscores, etc.)
    csv_name = sheet.replace(' ', '_').lower() + '.csv'
    csv_path = os.path.join(output_dir, csv_name)
    
    # Save the DataFrame as CSV
    df_excel_1.to_csv(csv_path, index=False)
    print(f"Saved: {csv_path}")
"""

'for sheet in sheets_to_extract:\n    # Read just this sheet into a DataFrame\n    df_excel_1 = pd.read_excel(excel_1, sheet_name=sheet)\n    \n    # Optional: Clean the filename (replace spaces with underscores, etc.)\n    csv_name = sheet.replace(\' \', \'_\').lower() + \'.csv\'\n    csv_path = os.path.join(output_dir, csv_name)\n    \n    # Save the DataFrame as CSV\n    df_excel_1.to_csv(csv_path, index=False)\n    print(f"Saved: {csv_path}")\n'

In [69]:
xls_2 = pd.ExcelFile('../data/Raw/undesa_pd_2019_world_fertility_dataset.xlsx')
print(xls_2.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'FERTILITY INDICATORS']


In [70]:
excel_2 = '../data/Raw/undesa_pd_2019_world_fertility_dataset.xlsx'
sheet_name = 'FERTILITY INDICATORS'
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)

df_excel_2 = pd.read_excel(excel_2, sheet_name=sheet_name)


In [71]:
"""csv_name = sheet_name.replace(' ', '_').lower() + '.csv'
csv_path = os.path.join(output_dir, csv_name)
df_excel_2.to_csv(csv_path, index=False)
print(f"Saved: {csv_path}")
"""

'csv_name = sheet_name.replace(\' \', \'_\').lower() + \'.csv\'\ncsv_path = os.path.join(output_dir, csv_name)\ndf_excel_2.to_csv(csv_path, index=False)\nprint(f"Saved: {csv_path}")\n'

In [72]:
xls_3 = pd.ExcelFile('../data/Raw/undesa_pd_ds_1970-2030_fp_rev-2024_rev.xlsx')
print(xls_3.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'Countries', 'Regions']


In [73]:
excel_3 = '../data/Raw/undesa_pd_ds_1970-2030_fp_rev-2024_rev.xlsx'
sheets_to_extract = ['Countries', 'Regions']
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)


In [74]:
"""
for sheet in sheets_to_extract:
    df = pd.read_excel(excel_3, sheet_name=sheet)
    csv_name = sheet.replace(' ', '_').lower() + '.csv'
    csv_path = os.path.join(output_dir, csv_name)
    df.to_csv(csv_path, index=False)
    print(f"Saved: {csv_path}")

"""

'\nfor sheet in sheets_to_extract:\n    df = pd.read_excel(excel_3, sheet_name=sheet)\n    csv_name = sheet.replace(\' \', \'_\').lower() + \'.csv\'\n    csv_path = os.path.join(output_dir, csv_name)\n    df.to_csv(csv_path, index=False)\n    print(f"Saved: {csv_path}")\n\n'

In [75]:
df_9 = pd.read_csv('../data/Raw/unpopulation_dataportal_20250728095844.csv')
df_9.sample(5)

Unnamed: 0,IndicatorId,IndicatorName,IndicatorShortName,Source,SourceYear,Author,LocationId,Location,Iso2,Iso3,...,AgeStart,AgeEnd,Age,CategoryId,Category,EstimateTypeId,EstimateType,EstimateMethodId,EstimateMethod,Value
7188,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,242,Fiji,FJ,FJI,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,62.46
16836,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,584,Marshall Islands,MH,MHL,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,67.68
11209,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,398,Kazakhstan,KZ,KAZ,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,65.21
18526,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,642,Romania,RO,ROU,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,64.37
19206,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,666,Saint Pierre and Miquelon,PM,SPM,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,64.79


In [76]:
df_9.columns = df_9.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_9.sample(5)

Unnamed: 0,indicatorid,indicatorname,indicatorshortname,source,sourceyear,author,locationid,location,iso2,iso3,...,agestart,ageend,age,categoryid,category,estimatetypeid,estimatetype,estimatemethodid,estimatemethod,value
23131,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,796,Turks and Caicos Islands,TC,TCA,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,58.22
22576,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,780,Trinidad and Tobago,TT,TTO,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,48.12
12472,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,428,Latvia,LV,LVA,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,62.64
1263,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,48,Bahrain,BH,BHR,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,53.74
13171,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,446,"China, Macao SAR",MO,MAC,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,56.22


In [77]:
df_9 = df_9.drop(columns=['indicatorid','indicatorshortname',
    'source',
    'author', 'locationid', 'iso2','estimatetypeid','category','categoryid','agestart','ageend','author','ageid', 'estimatetype','variantid','sexid','timeid'])

df_9.rename(columns={
    "sourceyear": "year",
    "location": "country",
    "estimatemethodid": "estimate_method",
    "iso3": "code",
}, inplace=True)



In [78]:
df_9.drop_duplicates(inplace=True)

In [79]:
df_9

Unnamed: 0,indicatorname,year,country,code,time,variant,sex,age,estimate_method,estimatemethod,value
0,Currently married (Percent),2024,Afghanistan,AFG,1970,Median,Female,15-49,2,Interpolation,80.94
2,Currently married (Percent),2024,Afghanistan,AFG,1971,Median,Female,15-49,2,Interpolation,80.90
4,Currently married (Percent),2024,Afghanistan,AFG,1972,Median,Female,15-49,2,Interpolation,80.87
6,Currently married (Percent),2024,Afghanistan,AFG,1973,Median,Female,15-49,2,Interpolation,80.84
8,Currently married (Percent),2024,Afghanistan,AFG,1974,Median,Female,15-49,2,Interpolation,80.53
...,...,...,...,...,...,...,...,...,...,...,...
25078,Currently married (Percent),2024,Zambia,ZMB,2021,Median,Female,15-49,3,Projection,54.31
25080,Currently married (Percent),2024,Zambia,ZMB,2022,Median,Female,15-49,3,Projection,53.82
25082,Currently married (Percent),2024,Zambia,ZMB,2023,Median,Female,15-49,3,Projection,53.35
25084,Currently married (Percent),2024,Zambia,ZMB,2024,Median,Female,15-49,3,Projection,52.91


In [80]:
df_9.isnull().sum()

indicatorname      0
year               0
country            0
code               0
time               0
variant            0
sex                0
age                0
estimate_method    0
estimatemethod     0
value              0
dtype: int64

In [81]:
#df_9.to_csv("cleaned_unpopulation_dataportal.csv", index=False)

In [82]:
#df_9.to_sql('unpopulation_dataportal', engine, if_exists='replace', index=False)

In [83]:
df_10 = pd.read_csv('../data/processed/countries_un.csv',  header=5, low_memory=False)

In [84]:
df_10.columns = (
    df_10.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
)
df_10.sample(10)

Unnamed: 0,countryorarea,isocode,indicator,year,agegroup,percentage,number,dataprocess
136419,Ukraine,804,Married or in-union women,2012,30-34,76.7,1336.938525,Estimate
91634,Aruba,533,Married or in-union women,2003,25-29,46.360987,1.37159,Estimate
114182,Saudi Arabia,682,Married or in-union women,1986,45-49,84.4,108.109648,Estimate
122442,Spain,724,Married or in-union women,2047,25-29,31.967739,352.126726,Projection
65398,Japan,392,Married or in-union women,2044,45-49,70.670168,1976.383474,Projection
90297,Netherlands,528,Married or in-union women,1998,20-24,35.919756,173.993503,Estimate
10145,Belgium,56,Married or in-union women,2023,20-24,20.71445,67.463961,Projection
16895,Bulgaria,100,Married or in-union women,1975,15-49,73.982453,1617.335214,Estimate
117474,Singapore,702,Married or in-union women,1993,25-29,57.806545,103.560425,Estimate
27771,Comoros,174,Married or in-union women,2039,30-34,87.01591,34.108931,Projection


In [85]:
df_10.rename(columns={
    "dataprocess": "data_process",
}, inplace=True)

df_10.drop_duplicates(inplace=True)
df_10.sample(5)

Unnamed: 0,countryorarea,isocode,indicator,year,agegroup,percentage,number,data_process
98295,Marshall Islands,584,Married or in-union women,2025,15-49,60.165596,5.190486,Projection
73907,Libya,434,Married or in-union women,1974,30-34,93.807273,59.110308,Estimate
78955,Malaysia,458,Married or in-union women,2038,30-34,72.39458,1131.109929,Projection
104700,Guinea-Bissau,624,Married or in-union women,2016,35-39,85.423012,44.976924,Estimate
133430,Türkiye,792,Married or in-union women,2043,45-49,69.920594,2354.033058,Projection


In [86]:
for col in ['percentage', 'number']:
    if col in df_10.columns:
        df_10[col] = (
            df_10[col]
            .astype(str)
            .str.replace(',', '.', regex=False)
            .str.extract(r'([-+]?[0-9]*\.?[0-9]+)', expand=False)
            .astype(float)
            .round(2)
        )

In [87]:
unnamed_cols = [col for col in df_10.columns if 'unnamed' in col.lower()]
df_10.drop(columns=unnamed_cols, inplace=True)

In [88]:
df_10.dropna(inplace=True)

In [89]:
df_10.isnull().sum()

countryorarea    0
isocode          0
indicator        0
year             0
agegroup         0
percentage       0
number           0
data_process     0
dtype: int64

In [90]:
df_10.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145800 entries, 0 to 145799
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   countryorarea  145800 non-null  object 
 1   isocode        145800 non-null  int64  
 2   indicator      145800 non-null  object 
 3   year           145800 non-null  int64  
 4   agegroup       145800 non-null  object 
 5   percentage     145800 non-null  float64
 6   number         145800 non-null  float64
 7   data_process   145800 non-null  object 
dtypes: float64(2), int64(2), object(4)
memory usage: 8.9+ MB


In [91]:
#df_10.to_csv("cleaned_countries_1970_2025_un.csv", index=False)

In [92]:
#df_10.to_sql('countries_1970_2025_un', engine, if_exists='replace', index=False)

In [93]:
df_11 = pd.read_csv('../data/processed/currently_married_un.csv',  header=2, low_memory=False)

In [94]:
df_11.sample(8)

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,AgeGroup,AgeStart,AgeEnd,DataValue,DataProcess,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Data,Note on Country and Population
37275,Panama,591,1990,1990,Women,[30-34],30,34,75.09,Census,1990 Census,63,Panama 1990 Census,UNSD,1.0,,
29121,Lithuania,440,2008,2008,Men,[30-34],30,34,55.76,Estimate,2008 Estimate,2146,Lithuania 2008 Estimate,UNSD,,,
10400,Czechia,203,1970,1970,Women,[55-59],55,59,70.2,Census,1970 Census,392,Czech Republic 1970 Census,National statistics,,,
18514,Greenland,304,2018,2018,Women,[60-64],60,64,52.53,Estimate,2018 Estimate,2109,Greenland 2018 Estimate,UNSD,,,
21850,Iceland,352,1999,1999,Men,[20-24],20,24,14.01,Estimate,1999 Estimate,2121,Iceland 1999 Estimate,UNSD,1.0,,
29699,Madagascar,450,1992,1992,Women,[35-39],35,39,77.5,Survey,1992 DHS,1708,Madagascar 1992 Demographic and Health Survey,DHS_STATcompiler,1.0,,
56,Afghanistan,4,2010,2011,Women,[30-34],30,34,95.43,Survey,2010-2011 MICS,4983,Afghanistan 2010-2011 Multiple Indicator Clust...,MICS,,,
12365,Dominican Republic,214,1999,1999,Men,[25-29],25,29,56.6,Survey,1999 DHS,1684,Dominican Republic 1999 Demographic and Health...,DHS_STATcompiler,1.0,,


In [95]:
df_11.columns = (
    df_11.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
    )
df_11.sample(8)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,agegroup,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteondata,noteoncountryandpopulation
38959,Qatar,634,1986,1986,Women,[10-14],10,14,0.04,Census,1986 Census,307,Qatar 1986 Census,UNSD,,,
39474,Réunion,638,1974,1974,Women,[20-24],20,24,46.98,Census,1974 Census,1249,Reunion 1974 Census,UNSD,1.0,,
2842,Bangladesh,50,2011,2011,Women,[35-39],35,39,94.22,Census,2011 Census,4776,Bangladesh 2011 Census,IPUMS,1.0,Data are based on a 5 per cent sample.,
15820,France,250,1982,1982,Men,[25-29],25,29,57.15,Census,1982 Census,363,France 1982 Census,UNSD,,Based on a sample of census returns.,Excluding diplomatic personnel outside the cou...
47818,Switzerland,756,1993,1993,Men,[25-29],25,29,34.17,Estimate,1993 Estimate,2228,Switzerland 1993 Estimate,UNSD,,,
8876,Comoros,174,1991,1991,Women,[40-44],40,44,80.9,Census,1991 Census,1019,Comoros 1991 Census,UNSD,,,Excluding persons for the Island of Mayotte.
21500,Iceland,352,1985,1985,Women,[10-14],10,14,0.0,Estimate,1985 Estimate,2121,Iceland 1985 Estimate,UNSD,,,
10876,Czechia,203,2011,2011,Women,[65-69],65,69,57.41,Estimate,2011 Estimate,2079,Czech Republic 2011 Estimate,UNSD,,,


In [96]:
df_11 = df_11.drop(columns = ['datacataloglongname', 'datacatalogid', 'yearstart' , 'yearend', 'noteondata', 'noteoncountryandpopulation', 'including_consensual_unions'])

df_11.rename(columns={
    "agestart": "age_start",
    "countryorarea": "country",
    "datasource": "data_source",
    "datavalue" : "data_value"
}, inplace=True)

df_11.sample(10)

Unnamed: 0,country,isocode,sex,agegroup,age_start,ageend,data_value,dataprocess,datacatalogshortname,data_source
36241,Norway,578,Women,[50-54],50,54,71.07,Estimate,1999 Estimate,UNSD
10247,Cyprus,196,Men,[20-24],20,24,17.48,Census,1973 Census,UNSD
53488,Zimbabwe,716,Women,[45-49],45,49,75.4,Survey,1999 DHS,DHS_STATcompiler
38802,Portugal,620,Women,[50-54],50,54,80.77,Census,2001 Census,UNSD
52006,Uruguay,858,Men,[30-34],30,34,67.19,Census,2011 Census,UNSD
40123,Romania,642,Women,[30-34],30,34,72.88,Estimate,2011 Estimate,UNSD
12433,Dominican Republic,214,Women,[35-39],35,39,79.18,Census,2002 Census,UNSD
2196,Azerbaijan,31,Women,[20-24],20,24,44.19,Estimate,2012 Estimate,UNSD
19956,Hungary,348,Men,[50-54],50,54,89.27,Estimate,1977 Estimate,UNSD
13901,Faeroe Islands,234,Men,[75+],75,999,64.03,Estimate,2008 Estimate,UNSD


In [97]:
df_11.drop_duplicates(inplace=True)

In [98]:
df_11.isnull().sum()

country                 0
isocode                 0
sex                     0
agegroup                0
age_start               0
ageend                  0
data_value              0
dataprocess             0
datacatalogshortname    0
data_source             0
dtype: int64

In [99]:
#df_11.to_csv("cleaned_currently_married_un.csv", index=False)

In [100]:
#df_11.to_sql('currently_married_un', engine, if_exists='replace', index=False)

In [101]:
df_12 = pd.read_csv('../data/processed/ever_married_un.csv', header= 2, low_memory = False)
df_12.head()

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,AgeGroup,AgeStart,AgeEnd,DataValue,DataProcess,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Data,Note on Country and Population
0,Afghanistan,4,1972,1974,Men,[15-19],15,19,7.7,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
1,Afghanistan,4,1972,1974,Men,[20-24],20,24,32.6,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
2,Afghanistan,4,1972,1974,Men,[25-29],25,29,61.4,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
3,Afghanistan,4,1972,1974,Men,[30-34],30,34,83.0,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
4,Afghanistan,4,1972,1974,Men,[35-39],35,39,91.2,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,


In [102]:
df_12.columns = (
    df_12.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
    )
df_12.sample(8)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,agegroup,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteondata,noteoncountryandpopulation
21412,Hungary,348,1981,1981,Women,[35-39],35,39,95.18,Estimate,1981 Estimate,2120,Hungary 1981 Estimate,UNSD,,,
810,Argentina,32,2010,2010,Men,[55-59],55,59,90.27,Census,2010 Census,4763,Argentina 2010 Census,National statistics,1.0,,
21874,Hungary,348,1998,1998,Men,[35-39],35,39,81.9,Estimate,1998 Estimate,2120,Hungary 1998 Estimate,UNSD,,,
40522,Poland,616,1996,1996,Women,[50-54],50,54,94.8,Estimate,1996 Estimate,2192,Poland 1996 Estimate,UNSD,,,Data pertain to nationals only.
16341,Finland,246,2017,2017,Men,[25-29],25,29,16.02,Estimate,2017 Estimate,2093,Finland 2017 Estimate,UNSD,,,
2301,Azerbaijan,31,2016,2016,Men,[55-59],55,59,98.76,Estimate,2016 Estimate,2035,Azerbaijan 2016 Estimate,UNSD,,,
36739,Nicaragua,558,1995,1995,Men,[60-64],60,64,92.78,Census,1995 Census,1252,Nicaragua 1995 Census,National statistics,1.0,,
31371,Madagascar,450,2003,2004,Men,[45-49],45,49,97.4,Survey,2003-2004 DHS,1734,Madagascar 2003-2004 Demographic and Health Su...,DHS_STATcompiler,1.0,,


In [103]:
df_12 = df_12.drop(columns = ['yearstart', 'yearend', 'datacatalogshortname', 'datacatalogid', 'datacataloglongname', 'including_consensual_unions', 'noteondata', 'noteoncountryandpopulation'])

df_12.rename(columns={
    "agestart": "age_start",
    "ageend": "age_end",
    "countryorarea": "country"
}, inplace=True)
df_12.sample(8)

Unnamed: 0,country,isocode,sex,agegroup,age_start,age_end,datavalue,dataprocess,datasource
25123,Ireland,372,Men,[65-69],65,69,83.65,Census,UNSD
7986,Chile,152,Men,[45-49],45,49,90.7,Estimate,UNSD
4798,Brunei Darussalam,96,Women,[30-34],30,34,80.53,Census,UNSD
20912,Haiti,332,Men,[75+],75,999,98.93,Survey,DHS_HH
37074,Niger,562,Men,[20-24],20,24,32.2,Survey,DHS_STATcompiler
34575,Namibia,516,Women,[45-49],45,49,82.5,Survey,National statistics
50005,Switzerland,756,Women,[15-19],15,19,1.16,Census,UNSD
28243,Kuwait,414,Women,[55-59],55,59,96.33,Census,UNSD


In [104]:
df_12.dropna(inplace=True)

In [105]:
df_12.isnull().sum()

country        0
isocode        0
sex            0
agegroup       0
age_start      0
age_end        0
datavalue      0
dataprocess    0
datasource     0
dtype: int64

In [106]:
#df_12.to_csv("cleaned_ever_married_un.csv", index=False)

In [107]:
#df_12.to_sql('ever_married_un', engine, if_exists= 'replace', index= False)

In [108]:
df_13 = pd.read_csv('../data/processed/fertility_indicators_un.csv', header=6, low_memory=False)
df_13.head()

Unnamed: 0,Country or Area,Country or Area Code,Age Group,Indicator,Date,Value,Series,DataType,Data Source Type,Survey Programme,Data Source Inventory ID,Data Source Name,Data Source Name (short),Data Source Start Year,Data Source End Year,Reference,Reference Year
0,Afghanistan,4,[Total],TFR,1964.977051,7.966653,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
1,Afghanistan,4,[Total],TFR,1965.977051,8.212275,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
2,Afghanistan,4,[Total],TFR,1966.977051,8.317603,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
3,Afghanistan,4,[Total],TFR,1967.977051,8.225812,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
4,Afghanistan,4,[Total],TFR,1968.977051,8.068459,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012


In [109]:
df_13.columns = (df_13.columns
        .str.lower()
        .str.strip()
        .str.replace(' ', '')
        .str.replace('(', '')
        .str.replace(')', '')
        .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
        )

df_13.sample(6)

Unnamed: 0,countryorarea,countryorareacode,agegroup,indicator,date,value,series,datatype,datasourcetype,surveyprogramme,datasourceinventoryid,datasourcename,datasourcenameshort,datasourcestartyear,datasourceendyear,reference,referenceyear
55599,Peru,604,[30-34],ASFR3034,2005.081055,113.0,"2012 DHS,Direct,DHS,5555-16-39167",Direct,Survey,DHS,5555,Peru 2012 Demographic and Health Survey (Conti...,2012 DHS,2012,2012,DHS Statcompiler,2012
21435,Dominican Republic,214,[15-19],ASFR1519,1979.33374,122.0,"1996 DHS,Direct,DHS,1685-16-39167",Direct,Survey,DHS,1685,Dominican Republic 1996 Demographic and Health...,1996 DHS,1996,1996,DHS Statcompiler,2012
40333,Kyrgyzstan,417,[Total],TFR,1997.50137,2.59,"NSO 2019, Direct",Direct,Register,VR,583,Vital Registration,Register,1997,1997,Dynamic Tables 5.01.00.06 Fertility rate of bi...,2019
58851,Romania,642,[Total],MAC,1968.5,26.90993,"Estimates,Fertility data (Adjusted),HFC-ODE,21...",Fertility data (adjusted),Estimate,Estimate,2199,All sources of estimates,Estimates,1968,1968,European Demographic Observatory (ODE). Data c...,2011
3701,Austria,40,[40-44],ASFR4044,1973.5,9.78,Eurostat.20190531,Official estimates,Estimate,Estimate,2038,All sources of estimates,Estimates,1973,1973,"Eurostat Statistics, Fertility rates by age [d...",2019
71053,Trinidad and Tobago,780,[15-19],ASFR1519,1963.5,116.6196,"Register,Computed rate from DYB,DYB,567-135-31",Computed rate from DYB,Register,VR,567,Vital Registration,Register,1963,1963,Demographic Yearbook,1978


In [110]:
df_13 = df_13.drop(columns=['countryorareacode','indicator','datasourceinventoryid','surveyprogramme','series','datasourcename','reference','referenceyear'])

df_13.replace({
    "agegroup": "age_group",
    "countryorarea": "country",
    "datatype": "data_type",
},inplace=True)

In [111]:
df_13['date'] = df_13['date'].astype(int)
df_13['value'] = df_13['value'].round(2)
df_13.sample(12)

Unnamed: 0,countryorarea,agegroup,date,value,datatype,datasourcetype,datasourcenameshort,datasourcestartyear,datasourceendyear
26914,Gambia,[Total],1984,28.4,P/F Ratio method (Feeney),Census,1983 Census,1983,1983.0
16186,Colombia,[15-19],1973,83.0,Direct,Survey,1990 DHS,1990,1990.0
2295,Armenia,[40-44],1975,15.1,Direct,Register,Register,1975,1975.0
54699,Panama,[Total],1968,5.58,Direct,Estimate,Estimates,1968,1968.0
12874,Central African Republic,[20-24],1987,283.0,Direct,Survey,1994-1995 DHS,1994,1995.0
38351,Japan,[45-49],1996,0.08,Fertility data (adjusted),Estimate,Estimates,1996,1996.0
5847,Bangladesh,[30-34],1993,140.26,Birth histories,Survey,2001 DHS Special,2001,2001.0
7472,Belgium,[15-19],2003,10.73,Official estimates,Estimate,Estimates,2003,2003.0
49265,Myanmar,[40-44],1998,50.4,Extrapolated from Truncated Birth Histories,Survey,2015-2016 DHS,2015,2016.0
6302,Bangladesh,[45-49],2013,10.0,Direct,SRS,SVRS,1980,


In [112]:
#df_13.to_csv("cleaned_fertility_indicators.csv", index=False)

In [113]:
#df_13.to_sql('fertility_indicators_un',engine, if_exists='replace', index=False)

In [114]:
df_14 = pd.read_csv('../data/processed/marital_status_by_age_un.csv', header= 2, low_memory=False)
df_14.head()

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,MaritalStatus,Non-standard_AgeGroups,Series_contains_Non-standard_AgeGroups,AgeGroup,AgeStart,...,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Age groups,Note on Marital Status,Note on Data,Note on Country and Population,Note Other
0,Afghanistan,4,1972,1974,Men,Divorced,,,[15-19],15,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
1,Afghanistan,4,1972,1974,Men,Divorced,,,[20-24],20,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
2,Afghanistan,4,1972,1974,Men,Divorced,,,[25-29],25,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
3,Afghanistan,4,1972,1974,Men,Divorced,,,[30-34],30,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
4,Afghanistan,4,1972,1974,Men,Divorced,,,[35-39],35,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,


In [115]:
df_14.columns= (df_14.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '' , regex=True)  
    )
df_14.sample(5)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,maritalstatus,nonstandard_agegroups,series_contains_nonstandard_agegroups,agegroup,agestart,...,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteonagegroups,noteonmaritalstatus,noteondata,noteoncountryandpopulation,noteother
187774,Pakistan,586,2001,2001,Women,Divorced,,,[50-54],50,...,2001 Estimate,2185,Pakistan 2001 Estimate,UNSD,,,,,,
46963,Congo,178,1960,1960,Men,Widowed,,1.0,[40-44],40,...,1960 Survey,3070,Congo 1960 Survey,INED,,,,,Excluding Brazzaville and Pointe Noire.,
214709,Saudi Arabia,682,2016,2016,Women,Divorced,,,[70-74],70,...,2016 DS,7336,Saudi Arabia 2016 Demographic Survey,National statistics,,,,,,
191952,Peru,604,2003,2006,Men,Widowed,,,[35-39],35,...,2004-2006 DHS,2515,Peru 2004-2006 Demographic and Health Survey (...,DHS_HH,,,,,,
205505,Rwanda,646,2002,2002,Women,Divorced or Separated,,,[40-44],40,...,2002 Census,304,Rwanda 2002 Census,UNSD,,,,,,


In [116]:
df_14 = df_14.drop(columns=['datacataloglongname', 'noteondata', 'noteoncountryandpopulation','noteonagegroups', 'noteother',
                             'including_consensual_unions','isocode', 'datacatalogid', 'noteonmaritalstatus', 'series_contains_nonstandard_agegroups','nonstandard_agegroups'])

df_14.rename(columns={
    "countryorarea": "country",
    "agegroup": "age_group",
    "maritalstatus": "marital_status",
    "yearstart": "year_start",
    "yearend": "year_end",
    }, inplace =True
    )

df_14.sample(10)

Unnamed: 0,country,year_start,year_end,sex,marital_status,age_group,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datasource
91719,Gibraltar,1970,1970,Women,Single,[25-29],25,29,19.63,Census,1970 Census,UNSD
140962,Lesotho,2000,2000,Women,Married or Living together,[15-19],15,19,2.67,Survey,2000 MICS,MICS
22787,Brazil,1991,1991,Men,Married,[35-39],35,39,71.84,Census,1991 Census,UNSD
143111,Liberia,2008,2008,Women,Separated,[75+],75,999,2.2,Census,2008 Census,UNSD
56884,Czechia,2008,2008,Women,Widowed,[70-74],70,74,43.52,Estimate,2008 Estimate,UNSD
67008,Dominican Republic,2013,2013,Men,Widowed,[35-39],35,39,0.6,Survey,2013 DHS,DHS_HH
241683,Syrian Arab Republic,1970,1970,Women,Divorced,[45-49],45,49,1.02,Census,1970 Census,UNSD
158461,Mexico,1980,1980,Men,Single,[10-14],10,14,97.58,Census,1980 Census,UNSD
155764,Malta,2011,2011,Women,Single,[60-64],60,64,13.28,Estimate,2011 Estimate,UNSD
207856,Saint Vincent and the Grenadines,1980,1980,Men,Divorced,[25-29],25,29,0.18,Census,1980 Census,UNSD


In [117]:
df_14.drop_duplicates(inplace=True)
df_14.isnull().sum()

country                 0
year_start              0
year_end                0
sex                     0
marital_status          0
age_group               0
agestart                0
ageend                  0
datavalue               0
dataprocess             0
datacatalogshortname    0
datasource              0
dtype: int64

In [118]:
#df_14.to_csv("cleaned_marital_status_by_age_un.csv", index=False)

In [119]:
#df_14.to_sql('marital_status_by_age_un', engine, if_exists='replace', index=False)

In [120]:
df_15 = pd.read_csv('../data/processed/regions_un.csv', header=5, low_memory= False)
df_15.head(10)

Unnamed: 0,Region and subregion,ISO code,Regional Classification,Indicator,Year,AgeGroup,Percentage,Number,DataProcess
0,World,900,M49,Married or in-union women,1970,15-19,22.576683,71867.82,Estimate
1,World,900,M49,Married or in-union women,1970,20-24,63.802057,162860.4,Estimate
2,World,900,M49,Married or in-union women,1970,25-29,87.174827,182681.1,Estimate
3,World,900,M49,Married or in-union women,1970,30-34,90.825027,179121.4,Estimate
4,World,900,M49,Married or in-union women,1970,35-39,90.284386,161526.3,Estimate
5,World,900,M49,Married or in-union women,1970,40-44,86.483531,139334.4,Estimate
6,World,900,M49,Married or in-union women,1970,45-49,82.680237,116088.4,Estimate
7,World,900,M49,Married or in-union women,1970,15-49,69.379111,1013480.0,Estimate
8,World,900,M49,Married or in-union women,1971,15-19,22.630416,74127.62,Estimate
9,World,900,M49,Married or in-union women,1971,20-24,63.613178,170087.3,Estimate


In [121]:
df_15.columns = (df_15.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(','')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
    )
df_15.sample(6)

Unnamed: 0,regionandsubregion,isocode,regionalclassification,indicator,year,agegroup,percentage,number,dataprocess
20998,Melanesia,928,SDG-M49,Married or in-union women,2002,45-49,84.680038,241.650577,Estimate
21947,Micronesia,954,SDG-M49,Married or in-union women,2040,30-34,64.063815,25.681261,Projection
5467,Eastern Africa,910,M49,Married or in-union women,2005,30-34,82.246769,15148.321841,Estimate
8091,Western Africa,914,M49,Married or in-union women,2009,30-34,89.196247,17888.410708,Estimate
10010,Eastern Asia,906,SDG-M49,Married or in-union women,2006,25-29,76.005477,79883.342546,Estimate
20436,Australia and New Zealand,927,SDG,Married or in-union women,2032,25-29,46.591351,500.315862,Projection


In [122]:
df_15 = df_15.drop(columns=['regionalclassification'])

df_15.rename(columns={
    "regionandsubregion": "region",
    "isocode": "iso_code",
    "agegroup": "age_group",
    "dataprocess": "process"
}, inplace=True)

df_15.sample(10)

Unnamed: 0,region,iso_code,indicator,year,age_group,percentage,number,process
6233,Middle Africa,911,Married or in-union women,2020,20-24,52.194548,8583.758184,Estimate
20414,Australia and New Zealand,927,Married or in-union women,2030,15-49,49.618209,3791.082952,Projection
26976,Upper-middle-income countries,1502,Married or in-union women,2021,15-19,6.070829,9033.542723,Estimate
16820,Caribbean,915,Married or in-union women,2047,35-39,60.056821,1861.513412,Projection
6791,Northern Africa,912,Married or in-union women,2008,15-49,57.63654,61700.642599,Estimate
28453,No income group available,1518,Married or in-union women,2043,40-44,61.096464,1197.936087,Projection
12489,Europe,908,Married or in-union women,1992,20-24,43.16632,11210.661779,Estimate
16638,Caribbean,915,Married or in-union women,2024,45-49,63.715388,1692.176213,Estimate
12076,South-Eastern Asia,35,Married or in-union women,2021,35-39,86.067626,43933.408958,Estimate
3106,Eastern and South-Eastern Asia,753,Married or in-union women,2034,25-29,51.808014,71714.821577,Projection


In [123]:
df_15.dropna(inplace=True)
df_15.isnull().sum()

region        0
iso_code      0
indicator     0
year          0
age_group     0
percentage    0
number        0
process       0
dtype: int64

In [124]:
print(df_15['number'] % 1 != 0)

0        True
1        True
2        True
3        True
4        True
         ... 
28507    True
28508    True
28509    True
28510    True
28511    True
Name: number, Length: 28512, dtype: bool


In [125]:
df_15['percentage'] = df_15['percentage'].round(2)
df_15['number'] = df_15['number'].astype(int)
df_15.head(10)

Unnamed: 0,region,iso_code,indicator,year,age_group,percentage,number,process
0,World,900,Married or in-union women,1970,15-19,22.58,71867,Estimate
1,World,900,Married or in-union women,1970,20-24,63.8,162860,Estimate
2,World,900,Married or in-union women,1970,25-29,87.17,182681,Estimate
3,World,900,Married or in-union women,1970,30-34,90.83,179121,Estimate
4,World,900,Married or in-union women,1970,35-39,90.28,161526,Estimate
5,World,900,Married or in-union women,1970,40-44,86.48,139334,Estimate
6,World,900,Married or in-union women,1970,45-49,82.68,116088,Estimate
7,World,900,Married or in-union women,1970,15-49,69.38,1013479,Estimate
8,World,900,Married or in-union women,1971,15-19,22.63,74127,Estimate
9,World,900,Married or in-union women,1971,20-24,63.61,170087,Estimate


In [126]:
#df_15.to_csv('cleaned_regions_un.csv', index=False)



In [127]:
#df_15.to_sql('regions_un', engine, if_exists='replace',index=False)

In [None]:
df_16_1 = pd.read_csv('../data/Raw/OECD/SF_1_1_Family_size_and_composition - Sayfa1.csv')
df_16_1
#Data for Chart SF1.1.A. Average size of households by household type, 2024a
# avg_size_all	avg_size_couple_with_children	avg_size_single_parent_with_children		

Unnamed: 0,Country,All households,Couple households with children,Single parent households with children
0,Mexico,356,408.0,276.0
1,Costa Rica,346,437.0,344.0
2,Türkiye,320,410.0,280.0
3,Israel,319,465.0,286.0
4,Columbia,310,,
5,Slovak Republic,310,380.0,250.0
6,Chile,280,,
7,Iceland,270,412.0,261.0
8,New Zealand,261,388.0,267.0
9,Greece,260,380.0,250.0


In [None]:
df_16_2 = pd.read_csv('../data/Raw/OECD/SF_1_1_Family_size_and_composition - Sayfa2.csv', header=1)
df_16_2
#Table SF1.1.A. Types of household, 2021a
# share_couple_total	share_couple_with_children	share_couple_without_children	share_single_parent_total	share_single_mother	share_single_father	share_single_person	share_other_types						

Unnamed: 0,Country,Total,With children,Without children,Total.1,Single mother households,Single father households,Single person households,Other households types
0,Australia,5593,2990,2602,1037,,,2512,858
1,Austria,4893,2113,2780,563,478,085,3834,711
2,Belgium,5222,2398,2824,742,608,135,3550,486
3,Canada,5092,2530,2562,872,,,2935,1102
4,Chile,..,..,..,..,..,..,..,..
5,Columbia,..,..,..,..,..,..,..,..
6,Costa Rica,5244,3815,1429,1055,949,106,1127,2574
7,Czechia,4703,2170,2532,715,611,104,3915,667
8,Denmark,4860,2041,2819,631,511,119,3757,752
9,Estonia,4620,2546,2073,683,609,074,3699,998


In [None]:
df_16_3 = pd.read_csv('../data/Raw/OECD/SF_1_1_Family_size_and_composition - Sayfa3.csv', header=1)
df_16_3
#Table SF1.1.B. Households by number of children, 2024a
# share_hh_0_children	share_hh_1_child	share_hh_2_children	share_hh_3plus_children		

Unnamed: 0.1,Unnamed: 0,0 children,1 child,2 children,3 or more children,Children under 6
0,Australia,..,..,..,..,..
1,Austria,7778,1052,857,312,944
2,Belgium,7397,1176,1015,411,1040
3,Canada,..,..,..,..,..
4,Chile,..,..,..,..,..
5,Columbia,..,..,..,..,..
6,Costa Rica,3029,2308,2461,2202,2630
7,Czechia,7195,1385,1156,264,1229
8,Denmark,7778,1054,894,274,815
9,Estonia,7576,1253,873,298,985


In [128]:
excel_1_1 = pd.read_excel('../data/Raw/OECD/SF_1_1_Family_size_and_composition.xlsx')
excel_1_1.head(10)

Unnamed: 0.1,Unnamed: 0,"Chart SF1.1.A. Average size of households by household type, 2024a",Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,"Data for Chart SF1.1.A. Average size of households by household type, 2024a",Unnamed: 13,Unnamed: 14,Unnamed: 15
0,,"Mean average number of people per household, b...",,,,,,,,,,,"Mean average number of people per household, b...",,,
1,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,All households,Couple households with children,Single parent households with children
3,,,,,,,,,,,,,Mexico,3.56,4.08,2.76
4,,,,,,,,,,,,,Costa Rica,3.462513,4.372663,3.443867
5,,,,,,,,,,,,,Türkiye,3.2,4.1,2.8
6,,,,,,,,,,,,,Israel,3.19,4.649476,2.863297
7,,,,,,,,,,,,,Columbia,3.100732,,
8,,,,,,,,,,,,,Slovak Republic,3.1,3.8,2.5
9,,,,,,,,,,,,,Chile,2.8,,


In [129]:
file_path  = '../data/Raw/OECD/SF_1_1_Family_size_and_composition.xlsx'
output_csv = '../data/processed/household_oecd_combined.csv'

xls = pd.ExcelFile(file_path)

# ----------------- yardımcılar -----------------
def find_row_with_value(df, pattern):
    """"Dataframe first row that matches the pattern."""
    pat = re.compile(pattern, re.IGNORECASE)
    for i in range(df.shape[0]):
        row = df.iloc[i].astype(str).tolist()
        if any(pat.search(str(x)) for x in row):
            return i
    return 0

def pick_country_column(df):
    """'Unnamed' kolonlar arasından ülke adlarının olduğu en dolu olanı seç."""
    candidates = [c for c in df.columns if str(c).lower().startswith("unnamed")]
    if not candidates:
        return df.columns[0]
    return max(candidates, key=lambda c: df[c].astype(str).ne("nan").sum())

# ----------------- Sheet 1: Chart SF1.1.A -----------------
# Automatical finding of header row
_chart_raw = pd.read_excel(xls, sheet_name='Chart SF1.1.A', header=None)
chart_header_row = find_row_with_value(_chart_raw, r"\bAll households\b")

chart_df = pd.read_excel(xls, sheet_name='Chart SF1.1.A', header=chart_header_row)
country_col_chart = pick_country_column(chart_df)

chart_df = chart_df.rename(columns={country_col_chart: "country"})
rename_chart = {}
for c in chart_df.columns:
    lc = str(c).lower()
    if "all households" in lc:
        rename_chart[c] = "avg_size_all"
    elif "couple households with children" in lc:
        rename_chart[c] = "avg_size_couple_with_children"
    elif "single parent households with children" in lc:
        rename_chart[c] = "avg_size_single_parent_with_children"

chart_df = chart_df[["country"] + list(rename_chart.keys())].rename(columns=rename_chart)
chart_df["country"] = chart_df["country"].astype(str).str.strip()
for col in rename_chart.values():
    chart_df[col] = pd.to_numeric(chart_df[col], errors="coerce")
chart_df = chart_df.dropna(subset=[c for c in chart_df.columns if c != "country"], how="all")

# ----------------- Sheet 2: Table SF1.1.A (çok satırlı başlık) -----------------
_table_a_raw = pd.read_excel(xls, sheet_name='Table SF1.1.A', header=None)
table_a_header_row_top = find_row_with_value(_table_a_raw, r"Couple households")
table_a = pd.read_excel(xls, sheet_name='Table SF1.1.A',
                        header=[table_a_header_row_top, table_a_header_row_top+1])

# Ülke kolonu: en dolu Unnamed çifti
mi_cols = table_a.columns
unnamed_pairs = [c for c in mi_cols
                 if str(c[0]).lower().startswith("unnamed")
                 and str(c[1]).lower().startswith("unnamed")]
country_multi_col = max(unnamed_pairs, key=lambda c: table_a[c].astype(str).ne("nan").sum())

def match_best(mi_cols, top_label, sub_keyword):
    """Find best matching column in multi-index columns."""
    for c in mi_cols:
        top = str(c[0]).strip()
        sub = str(c[1]).strip()
        if top == top_label:
            if not sub_keyword:
                return c  # Get first match (e.g., "Single person" / "Other" sub-headings)
            if sub_keyword.lower() in sub.lower():
                return c
    return None

specs = [
    ("Couple households:", "Total",             "share_couple_total"),
    ("Couple households:", "With children",     "share_couple_with_children"),
    ("Couple households:", "Without children",  "share_couple_without_children"),
    ("Single parent households:", "Total",      "share_single_parent_total"),
    ("Single parent households:", "Single mother households", "share_single_mother"),
    ("Single parent households:", "Single father households", "share_single_father"),
    ("Single person households", "",            "share_single_person"),
    ("Other household types", "",              "share_other_types"),
]

selected_cols = {country_multi_col: "country"}
for top, sub, name in specs:
    c = match_best(mi_cols, top, sub)
    if c is not None:
        selected_cols[c] = name

table_a_clean = table_a.loc[:, list(selected_cols.keys())].copy()
table_a_clean.columns = list(selected_cols.values())
table_a_clean["country"] = table_a_clean["country"].astype(str).str.strip()
for col in [c for c in table_a_clean.columns if c != "country"]:
    table_a_clean[col] = pd.to_numeric(table_a_clean[col], errors="coerce")
table_a_clean = table_a_clean.dropna(subset=[c for c in table_a_clean.columns if c != "country"], how="all")

# ----------------- Sheet 3: Table SF1.1.B -----------------
_table_b_raw = pd.read_excel(xls, sheet_name='Table SF1.1.B', header=None)
table_b_header_row = find_row_with_value(_table_b_raw, r"^0\s*children$")
table_b = pd.read_excel(xls, sheet_name='Table SF1.1.B', header=table_b_header_row)

country_col_b = pick_country_column(table_b)
table_b = table_b.rename(columns={country_col_b: "country"})

rename_b = {}
for c in table_b.columns:
    lc = str(c).lower()
    if lc == "country":
        continue
    if re.search(r"^0\s*children", lc):
        rename_b[c] = "share_hh_0_children"
    elif re.search(r"^1\s*child", lc):
        rename_b[c] = "share_hh_1_child"
    elif re.search(r"^2\s*children", lc):
        rename_b[c] = "share_hh_2_children"
    elif "3 or more children" in lc:
        rename_b[c] = "share_hh_3plus_children"
    elif ("under 6" in lc) or ("under six" in lc):
        rename_b[c] = "share_hh_with_child_under6"

table_b = table_b[["country"] + list(rename_b.keys())].rename(columns=rename_b)
table_b["country"] = table_b["country"].astype(str).str.strip()
for col in rename_b.values():
    table_b[col] = pd.to_numeric(table_b[col], errors="coerce")
table_b = table_b.dropna(subset=[c for c in table_b.columns if c != "country"], how="all")

# ----------------- Together+ Save -----------------
combined = (
    chart_df
    .merge(table_a_clean, on="country", how="outer")
    .merge(table_b,       on="country", how="outer")
    .sort_values("country")
    .reset_index(drop=True)
)

# Save the combined DataFrame to CSV
Path(output_csv).parent.mkdir(parents=True, exist_ok=True)
combined.to_csv(output_csv, index=False)
print(f"✅ Saved: {output_csv}")
display(combined.head(12))

✅ Saved: ../data/processed/household_oecd_combined.csv


Unnamed: 0,country,avg_size_all,avg_size_couple_with_children,avg_size_single_parent_with_children,share_couple_total,share_couple_with_children,share_couple_without_children,share_single_parent_total,share_single_mother,share_single_father,share_single_person,share_other_types,share_hh_0_children,share_hh_1_child,share_hh_2_children,share_hh_3plus_children
0,Australia,2.52758,3.932863,2.775636,55.926052,29.904701,26.021351,10.373549,,,25.124159,8.576123,,,,
1,Austria,2.2,3.8,2.5,48.927373,21.129493,27.79788,5.628256,4.77958,0.848677,38.337314,7.107057,77.781493,10.524053,8.574977,3.119476
2,Belgium,2.2,3.9,2.6,52.219429,23.979842,28.239587,7.423107,6.077464,1.345643,35.502304,4.855161,73.974236,11.757354,10.153817,4.112671
3,Bulgaria,2.2,3.5,2.3,40.303059,16.35041,23.95265,4.603051,3.875726,0.727325,35.80958,19.284309,78.210294,12.932273,7.480467,1.376966
4,Canada,2.425303,,,50.919441,25.300814,25.618627,8.715567,,,29.347961,11.017031,,,,
5,Chile,2.8,,,,,,,,,,,,,,
6,Columbia,3.100732,,,,,,,,,,,,,,
7,Costa Rica,3.462513,4.372663,3.443867,52.441873,38.147069,14.294803,10.548101,9.489556,1.058545,11.270909,25.739118,30.290198,23.077315,24.608947,22.02354
8,Croatia,2.6,3.9,2.6,51.508875,24.776445,26.732361,5.42231,4.386251,1.036059,27.80158,15.267514,74.183029,11.964128,10.095795,3.757049
9,Cyprus,2.5,3.7,2.4,56.920863,27.422456,29.498128,6.16526,4.936008,1.229252,24.490862,12.426368,71.356113,13.881474,11.665777,3.096636


In [130]:
df_16 = pd.read_csv('../data/processed/household_oecd_combined.csv')
df_16

Unnamed: 0,country,avg_size_all,avg_size_couple_with_children,avg_size_single_parent_with_children,share_couple_total,share_couple_with_children,share_couple_without_children,share_single_parent_total,share_single_mother,share_single_father,share_single_person,share_other_types,share_hh_0_children,share_hh_1_child,share_hh_2_children,share_hh_3plus_children
0,Australia,2.52758,3.932863,2.775636,55.926052,29.904701,26.021351,10.373549,,,25.124159,8.576123,,,,
1,Austria,2.2,3.8,2.5,48.927373,21.129493,27.79788,5.628256,4.77958,0.848677,38.337314,7.107057,77.781493,10.524053,8.574977,3.119476
2,Belgium,2.2,3.9,2.6,52.219429,23.979842,28.239587,7.423107,6.077464,1.345643,35.502304,4.855161,73.974236,11.757354,10.153817,4.112671
3,Bulgaria,2.2,3.5,2.3,40.303059,16.35041,23.95265,4.603051,3.875726,0.727325,35.80958,19.284309,78.210294,12.932273,7.480467,1.376966
4,Canada,2.425303,,,50.919441,25.300814,25.618627,8.715567,,,29.347961,11.017031,,,,
5,Chile,2.8,,,,,,,,,,,,,,
6,Columbia,3.100732,,,,,,,,,,,,,,
7,Costa Rica,3.462513,4.372663,3.443867,52.441873,38.147069,14.294803,10.548101,9.489556,1.058545,11.270909,25.739118,30.290198,23.077315,24.608947,22.02354
8,Croatia,2.6,3.9,2.6,51.508875,24.776445,26.732361,5.42231,4.386251,1.036059,27.80158,15.267514,74.183029,11.964128,10.095795,3.757049
9,Cyprus,2.5,3.7,2.4,56.920863,27.422456,29.498128,6.16526,4.936008,1.229252,24.490862,12.426368,71.356113,13.881474,11.665777,3.096636


In [131]:
# To numeric columns, except 'country'
metric_cols = [c for c in df_16.columns if c != 'country']
for c in metric_cols:
    df_16[c] = pd.to_numeric(df_16[c], errors='coerce')

# 3) Long/tidy verse format
tidy_16 = (df_16
           .melt(id_vars='country',
                 value_vars=metric_cols,
                 var_name='metric',
                 value_name='value')
           .dropna(subset=['value'])
           .sort_values(['country','metric'])
           .reset_index(drop=True))

# (opsiyonel) 
tidy_16['unit'] = tidy_16['metric'].apply(lambda m: 'persons' if m.startswith('avg_size') else 'percent')
def metric_group(m):
    if m.startswith('avg_size'): return 'average_size'
    if m.startswith('share_hh_'): return 'children_count_distribution'
    if m.startswith('share_couple'): return 'couple_households'
    if m.startswith('share_single_parent'): return 'single_parent_households'
    if m in ('share_single_mother','share_single_father'): return 'single_parent_gender'
    if m == 'share_single_person': return 'single_person_households'
    if m == 'share_other_types': return 'other_types'
    return 'other'
tidy_16['group'] = tidy_16['metric'].map(metric_group)

# 4) Gerekirse tidy üstünde oynadıktan sonra tekrar WIDE'a dön ve df_16'ya geri yaz
df_16 = (tidy_16.pivot_table(index='country', columns='metric', values='value', aggfunc='first')
         .reset_index()
         .reindex(columns=['country'] + metric_cols)   # orijinal kolon sırası
        )

In [132]:
df_16

metric,country,avg_size_all,avg_size_couple_with_children,avg_size_single_parent_with_children,share_couple_total,share_couple_with_children,share_couple_without_children,share_single_parent_total,share_single_mother,share_single_father,share_single_person,share_other_types,share_hh_0_children,share_hh_1_child,share_hh_2_children,share_hh_3plus_children
0,Australia,2.52758,3.932863,2.775636,55.926052,29.904701,26.021351,10.373549,,,25.124159,8.576123,,,,
1,Austria,2.2,3.8,2.5,48.927373,21.129493,27.79788,5.628256,4.77958,0.848677,38.337314,7.107057,77.781493,10.524053,8.574977,3.119476
2,Belgium,2.2,3.9,2.6,52.219429,23.979842,28.239587,7.423107,6.077464,1.345643,35.502304,4.855161,73.974236,11.757354,10.153817,4.112671
3,Bulgaria,2.2,3.5,2.3,40.303059,16.35041,23.95265,4.603051,3.875726,0.727325,35.80958,19.284309,78.210294,12.932273,7.480467,1.376966
4,Canada,2.425303,,,50.919441,25.300814,25.618627,8.715567,,,29.347961,11.017031,,,,
5,Chile,2.8,,,,,,,,,,,,,,
6,Columbia,3.100732,,,,,,,,,,,,,,
7,Costa Rica,3.462513,4.372663,3.443867,52.441873,38.147069,14.294803,10.548101,9.489556,1.058545,11.270909,25.739118,30.290198,23.077315,24.608947,22.02354
8,Croatia,2.6,3.9,2.6,51.508875,24.776445,26.732361,5.42231,4.386251,1.036059,27.80158,15.267514,74.183029,11.964128,10.095795,3.757049
9,Cyprus,2.5,3.7,2.4,56.920863,27.422456,29.498128,6.16526,4.936008,1.229252,24.490862,12.426368,71.356113,13.881474,11.665777,3.096636


In [133]:
# 1) Identify columns
metric_cols = [c for c in df_16.columns if c != 'country']
pct_cols    = [c for c in metric_cols if str(c).startswith('share_') or '(%)' in str(c)]
size_cols   = [c for c in metric_cols if str(c).startswith('avg_size')]

# 2) Normalize percentage columns to 0–100 NUMERIC (do NOT divide by 100)
for c in pct_cols:
    s = (df_16[c].astype(str)
                    .str.replace('%', '', regex=False)   # drop percent sign if present
                    .str.replace(',', '.', regex=False)  # handle decimal comma
                    .str.strip()
                    .replace({'': np.nan}))
    s = pd.to_numeric(s, errors='coerce')

    # If column appears to be 0–1 scale, scale UP to 0–100
    maxv = s.max(skipna=True)
    if pd.notna(maxv) and maxv <= 1.5:
        s = s * 100.0

    df_16[c] = s  # write back as float (e.g., 55.93)

# 3) Ensure avg_size columns stay numeric (unchanged values)
for c in size_cols:
    df_16[c] = pd.to_numeric(df_16[c], errors='coerce')

# 4) (Optional) add "(%)" to headers of percentage columns (header-only; no value changes)
df_16.rename(columns=lambda x: f'{x} (%)' if x in pct_cols and '(%)' not in str(x) else x, inplace=True)

# 5) Display with blanks instead of NaN (values remain numeric under the hood)
display(df_16.style.format(na_rep=''))


metric,country,avg_size_all,avg_size_couple_with_children,avg_size_single_parent_with_children,share_couple_total (%),share_couple_with_children (%),share_couple_without_children (%),share_single_parent_total (%),share_single_mother (%),share_single_father (%),share_single_person (%),share_other_types (%),share_hh_0_children (%),share_hh_1_child (%),share_hh_2_children (%),share_hh_3plus_children (%)
0,Australia,2.52758,3.932863,2.775636,55.926052,29.904701,26.021351,10.373549,,,25.124159,8.576123,,,,
1,Austria,2.2,3.8,2.5,48.927373,21.129493,27.79788,5.628256,4.77958,0.848677,38.337314,7.107057,77.781493,10.524053,8.574977,3.119476
2,Belgium,2.2,3.9,2.6,52.219429,23.979842,28.239587,7.423107,6.077464,1.345643,35.502304,4.855161,73.974236,11.757354,10.153817,4.112671
3,Bulgaria,2.2,3.5,2.3,40.303059,16.35041,23.95265,4.603051,3.875726,0.727325,35.80958,19.284309,78.210294,12.932273,7.480467,1.376966
4,Canada,2.425303,,,50.919441,25.300814,25.618627,8.715567,,,29.347961,11.017031,,,,
5,Chile,2.8,,,,,,,,,,,,,,
6,Columbia,3.100732,,,,,,,,,,,,,,
7,Costa Rica,3.462513,4.372663,3.443867,52.441873,38.147069,14.294803,10.548101,9.489556,1.058545,11.270909,25.739118,30.290198,23.077315,24.608947,22.02354
8,Croatia,2.6,3.9,2.6,51.508875,24.776445,26.732361,5.42231,4.386251,1.036059,27.80158,15.267514,74.183029,11.964128,10.095795,3.757049
9,Cyprus,2.5,3.7,2.4,56.920863,27.422456,29.498128,6.16526,4.936008,1.229252,24.490862,12.426368,71.356113,13.881474,11.665777,3.096636


In [134]:
df_16.sample(10)

metric,country,avg_size_all,avg_size_couple_with_children,avg_size_single_parent_with_children,share_couple_total (%),share_couple_with_children (%),share_couple_without_children (%),share_single_parent_total (%),share_single_mother (%),share_single_father (%),share_single_person (%),share_other_types (%),share_hh_0_children (%),share_hh_1_child (%),share_hh_2_children (%),share_hh_3plus_children (%)
22,Italy,2.2,3.6,2.4,46.704125,20.906322,25.797803,7.274459,5.649393,1.625066,36.638065,9.38335,77.789563,12.263053,8.28296,1.664424
25,Latvia,2.1,3.7,2.5,27.800585,12.205404,15.595181,13.436529,11.207919,2.228611,41.083861,17.679025,74.798867,14.050992,8.31728,2.832861
1,Austria,2.2,3.8,2.5,48.927373,21.129493,27.79788,5.628256,4.77958,0.848677,38.337314,7.107057,77.781493,10.524053,8.574977,3.119476
21,Israel,3.19,4.649476,2.863297,64.5,45.8,18.7,9.1,,,26.4,,,,,
13,Estonia,1.8,3.8,2.6,46.199713,25.464743,20.734971,6.828569,6.09075,0.737819,36.99157,9.980148,75.755102,12.530612,8.734694,2.979592
24,Korea,2.214024,3.553442,2.343395,43.480489,26.248858,17.23163,9.125677,6.849305,2.276371,35.468577,11.925258,78.516649,9.000955,9.824172,2.658224
45,United Kingdom,2.3,3.9,2.8,53.717778,20.285277,33.432479,7.4807,,,30.812606,7.988916,72.057972,12.09535,11.314672,4.532006
5,Chile,2.8,,,,,,,,,,,,,,
2,Belgium,2.2,3.9,2.6,52.219429,23.979842,28.239587,7.423107,6.077464,1.345643,35.502304,4.855161,73.974236,11.757354,10.153817,4.112671
0,Australia,2.52758,3.932863,2.775636,55.926052,29.904701,26.021351,10.373549,,,25.124159,8.576123,,,,


In [135]:
df_16.columns.name = None
df_16.index.name = None

In [136]:
df_16.sample(10)

Unnamed: 0,country,avg_size_all,avg_size_couple_with_children,avg_size_single_parent_with_children,share_couple_total (%),share_couple_with_children (%),share_couple_without_children (%),share_single_parent_total (%),share_single_mother (%),share_single_father (%),share_single_person (%),share_other_types (%),share_hh_0_children (%),share_hh_1_child (%),share_hh_2_children (%),share_hh_3plus_children (%)
12,EU average,2.262963,,,47.974518,22.090625,25.883877,6.707648,5.532233,1.175415,34.001383,11.31672,75.100758,12.283532,9.461296,3.151375
38,Romania,2.5,3.8,2.4,45.661026,20.733677,24.927349,6.503113,4.564712,1.938401,33.625882,14.209979,72.458002,14.287791,9.236178,4.01935
13,Estonia,1.8,3.8,2.6,46.199713,25.464743,20.734971,6.828569,6.09075,0.737819,36.99157,9.980148,75.755102,12.530612,8.734694,2.979592
29,Mexico,3.56,4.08,2.76,50.357352,39.016163,11.341189,11.216618,,,12.464293,25.961737,50.23388,22.83395,17.403083,9.529087
36,Poland,2.3,3.7,2.5,48.321089,23.920366,24.400723,5.969036,5.147508,0.821529,23.435506,22.274353,74.390275,12.909212,9.842287,2.858225
10,Czechia,2.3,3.7,2.4,47.025112,21.703808,25.321304,7.154102,6.112638,1.041464,39.150397,6.670389,71.952265,13.852443,11.560185,2.635107
20,Ireland,2.4,4.0,2.7,53.027121,29.44911,23.578011,6.925656,6.122436,0.803221,23.135621,16.911603,69.020563,12.418831,12.184343,6.376263
26,Lithuania,1.7,3.7,2.4,50.089192,23.741196,26.347996,7.179025,6.676129,0.502896,35.155839,7.575945,80.437001,11.057086,6.99922,1.506693
28,Malta,2.5,3.7,2.5,46.920827,21.114001,25.806826,5.680812,4.557909,1.122903,32.512251,14.889819,76.48917,12.680505,7.806859,2.978339
37,Portugal,2.4,3.5,2.4,57.173852,25.765203,31.408649,7.221597,6.237527,0.98407,24.773372,10.831179,74.352268,15.871951,8.177408,1.598373


In [137]:
df_16.rename(columns=lambda c: re.sub(r'(?i)^share[_\s]*', '', c) if isinstance(c, str) else c, inplace=True)
df_16.drop_duplicates(inplace=True)
df_16.replace('', pd.NA, inplace=True)

metric_cols = [c for c in df_16.columns if c != 'country']

# A) Drop only rows where ALL metric columns are missing (safe):
df_16.dropna(subset=metric_cols, how='all', inplace=True)

# B) (optional) Also drop columns that are entirely missing:
df_16.dropna(axis=1, how='all', inplace=True)
df_16.isnull().sum()

country                                  0
avg_size_all                             2
avg_size_couple_with_children            8
avg_size_single_parent_with_children     8
couple_total (%)                         4
couple_with_children (%)                 5
couple_without_children (%)              5
single_parent_total (%)                  4
single_mother (%)                       11
single_father (%)                       11
single_person (%)                        4
other_types (%)                          5
hh_0_children (%)                       10
hh_1_child (%)                          10
hh_2_children (%)                       10
hh_3plus_children (%)                   10
dtype: int64

In [138]:
# Drop exactly these columns and create df_16_general
cols_to_drop = [
    "avg_size_couple_with_children",
    "avg_size_single_parent_with_children",
    "single_mother (%)",
    "single_father (%)",
    "hh_0_children (%)",
    "hh_1_child (%)",
    "hh_2_children (%)",
    "hh_3plus_children (%)",
]

df_16_general = df_16.drop(columns=cols_to_drop, errors="ignore").copy()

# quick check (optional)
print(df_16_general.columns.tolist())

['country', 'avg_size_all', 'couple_total (%)', 'couple_with_children (%)', 'couple_without_children (%)', 'single_parent_total (%)', 'single_person (%)', 'other_types (%)']


In [139]:
# Remove selected countries (exact matches)
df_16_general['country'] = df_16_general['country'].str.strip()  # trims spaces

countries_to_remove = ["Chile", "Columbia", "OECD average", "OECD-30 average", "OECD-36 average", "Israel"]
df_16_general = df_16_general[~df_16_general['country'].isin(countries_to_remove)].copy()


In [140]:
df_16_general = df_16_general.reset_index(drop=True)
df_16_general

Unnamed: 0,country,avg_size_all,couple_total (%),couple_with_children (%),couple_without_children (%),single_parent_total (%),single_person (%),other_types (%)
0,Australia,2.52758,55.926052,29.904701,26.021351,10.373549,25.124159,8.576123
1,Austria,2.2,48.927373,21.129493,27.79788,5.628256,38.337314,7.107057
2,Belgium,2.2,52.219429,23.979842,28.239587,7.423107,35.502304,4.855161
3,Bulgaria,2.2,40.303059,16.35041,23.95265,4.603051,35.80958,19.284309
4,Canada,2.425303,50.919441,25.300814,25.618627,8.715567,29.347961,11.017031
5,Costa Rica,3.462513,52.441873,38.147069,14.294803,10.548101,11.270909,25.739118
6,Croatia,2.6,51.508875,24.776445,26.732361,5.42231,27.80158,15.267514
7,Cyprus,2.5,56.920863,27.422456,29.498128,6.16526,24.490862,12.426368
8,Czechia,2.3,47.025112,21.703808,25.321304,7.154102,39.150397,6.670389
9,Denmark,1.9,48.59698,20.408734,28.188133,6.308205,37.574211,7.520455


In [141]:
df_16_general.isnull().sum()

country                        0
avg_size_all                   0
couple_total (%)               0
couple_with_children (%)       0
couple_without_children (%)    0
single_parent_total (%)        0
single_person (%)              0
other_types (%)                0
dtype: int64

In [142]:
drop_countries = ['OECD average', 'OECD-30 average', 'OECD-36 average', 'Canada', 'Chile', 'Columbia', 'United States','Iceland','Israel','EU average', 'New Zealand', 'Mexico', 'United Kingdom', 'Switzerland','Australia']

before = len(df_16)
df_16 = df_16[~df_16['country'].isin(drop_countries)].reset_index(drop=True)
print(f"Removed {before - len(df_16)} rows")

Removed 15 rows


In [143]:
df_16.isnull().sum()

country                                 0
avg_size_all                            0
avg_size_couple_with_children           0
avg_size_single_parent_with_children    0
couple_total (%)                        0
couple_with_children (%)                0
couple_without_children (%)             0
single_parent_total (%)                 0
single_mother (%)                       0
single_father (%)                       0
single_person (%)                       0
other_types (%)                         0
hh_0_children (%)                       0
hh_1_child (%)                          0
hh_2_children (%)                       0
hh_3plus_children (%)                   0
dtype: int64

In [144]:
df_16

Unnamed: 0,country,avg_size_all,avg_size_couple_with_children,avg_size_single_parent_with_children,couple_total (%),couple_with_children (%),couple_without_children (%),single_parent_total (%),single_mother (%),single_father (%),single_person (%),other_types (%),hh_0_children (%),hh_1_child (%),hh_2_children (%),hh_3plus_children (%)
0,Austria,2.2,3.8,2.5,48.927373,21.129493,27.79788,5.628256,4.77958,0.848677,38.337314,7.107057,77.781493,10.524053,8.574977,3.119476
1,Belgium,2.2,3.9,2.6,52.219429,23.979842,28.239587,7.423107,6.077464,1.345643,35.502304,4.855161,73.974236,11.757354,10.153817,4.112671
2,Bulgaria,2.2,3.5,2.3,40.303059,16.35041,23.95265,4.603051,3.875726,0.727325,35.80958,19.284309,78.210294,12.932273,7.480467,1.376966
3,Costa Rica,3.462513,4.372663,3.443867,52.441873,38.147069,14.294803,10.548101,9.489556,1.058545,11.270909,25.739118,30.290198,23.077315,24.608947,22.02354
4,Croatia,2.6,3.9,2.6,51.508875,24.776445,26.732361,5.42231,4.386251,1.036059,27.80158,15.267514,74.183029,11.964128,10.095795,3.757049
5,Cyprus,2.5,3.7,2.4,56.920863,27.422456,29.498128,6.16526,4.936008,1.229252,24.490862,12.426368,71.356113,13.881474,11.665777,3.096636
6,Czechia,2.3,3.7,2.4,47.025112,21.703808,25.321304,7.154102,6.112638,1.041464,39.150397,6.670389,71.952265,13.852443,11.560185,2.635107
7,Denmark,1.9,3.9,2.5,48.59698,20.408734,28.188133,6.308205,5.114202,1.194003,37.574211,7.520455,77.775278,10.541878,8.944527,2.738317
8,Estonia,1.8,3.8,2.6,46.199713,25.464743,20.734971,6.828569,6.09075,0.737819,36.99157,9.980148,75.755102,12.530612,8.734694,2.979592
9,Finland,1.9,4.0,2.6,45.63784,17.056364,28.581477,5.427769,4.497123,0.930646,45.335962,3.598356,81.983051,7.888136,6.986441,3.142373


In [145]:
#df_16_general.to_csv('../data/Cleaned/general/cleaned_household_general.csv', index=False)

In [146]:
df_16['country'] = df_16['country'].astype(str).str.strip()

# 2) Identify column groups
metric_cols = [c for c in df_16.columns if c != 'country']
pct_cols    = [c for c in metric_cols if '(%)' in str(c) or re.match(r'^share[_\s]', str(c) or '')]
size_cols   = [c for c in metric_cols if str(c).startswith('avg_size')]

# 3) Coerce to proper numeric types (DO NOT rescale percentages)
def to_float(s: pd.Series) -> pd.Series:
    return pd.to_numeric(
        s.astype(str)
         .str.replace('%', '', regex=False)   # strip a trailing % if any
         .str.replace(',', '.', regex=False)  # handle decimal comma
         .str.strip()
         .replace({'': np.nan}),
        errors='coerce'
    )

for c in pct_cols:
    df_16[c] = to_float(df_16[c])            # stays on 0–100 scale

for c in size_cols:
    df_16[c] = pd.to_numeric(df_16[c], errors='coerce')

# 4) Round to 2 decimals (all numeric metrics)
df_16[metric_cols] = df_16[metric_cols].round(2)

In [147]:
df_16

Unnamed: 0,country,avg_size_all,avg_size_couple_with_children,avg_size_single_parent_with_children,couple_total (%),couple_with_children (%),couple_without_children (%),single_parent_total (%),single_mother (%),single_father (%),single_person (%),other_types (%),hh_0_children (%),hh_1_child (%),hh_2_children (%),hh_3plus_children (%)
0,Austria,2.2,3.8,2.5,48.93,21.13,27.8,5.63,4.78,0.85,38.34,7.11,77.78,10.52,8.57,3.12
1,Belgium,2.2,3.9,2.6,52.22,23.98,28.24,7.42,6.08,1.35,35.5,4.86,73.97,11.76,10.15,4.11
2,Bulgaria,2.2,3.5,2.3,40.3,16.35,23.95,4.6,3.88,0.73,35.81,19.28,78.21,12.93,7.48,1.38
3,Costa Rica,3.46,4.37,3.44,52.44,38.15,14.29,10.55,9.49,1.06,11.27,25.74,30.29,23.08,24.61,22.02
4,Croatia,2.6,3.9,2.6,51.51,24.78,26.73,5.42,4.39,1.04,27.8,15.27,74.18,11.96,10.1,3.76
5,Cyprus,2.5,3.7,2.4,56.92,27.42,29.5,6.17,4.94,1.23,24.49,12.43,71.36,13.88,11.67,3.1
6,Czechia,2.3,3.7,2.4,47.03,21.7,25.32,7.15,6.11,1.04,39.15,6.67,71.95,13.85,11.56,2.64
7,Denmark,1.9,3.9,2.5,48.6,20.41,28.19,6.31,5.11,1.19,37.57,7.52,77.78,10.54,8.94,2.74
8,Estonia,1.8,3.8,2.6,46.2,25.46,20.73,6.83,6.09,0.74,36.99,9.98,75.76,12.53,8.73,2.98
9,Finland,1.9,4.0,2.6,45.64,17.06,28.58,5.43,4.5,0.93,45.34,3.6,81.98,7.89,6.99,3.14


In [148]:
#df_16.to_csv('../data/Cleaned/cleaned_household_oecd.csv', index=False)

In [149]:
#df_16.to_sql('household_oecd', engine, if_exists= 'replace', index= False)

In [150]:
df_17 = pd.read_csv('../data/Raw/OECD/OECD_df_famliy_selected.csv')
df_17

Unnamed: 0,STRUCTURE,STRUCTURE_ID,STRUCTURE_NAME,ACTION,COU,Country,SEX,Sex,IND,Indicator,...,OBS_VALUE,Observation Value,OBS_STATUS,Observation Status,UNIT_MEASURE,Unit of Measures,UNIT_MULT,Multiplier,BASE_PER,Base reference period
0,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,LVA,Latvia,TOTAL,Total,FAM3,Share of births outside of marriage (% of all ...,...,39.5,,A,,PC,Percentage,0,Units,,
1,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,GRC,Greece,TOTAL,Total,FAM3,Share of births outside of marriage (% of all ...,...,11.1,,A,,PC,Percentage,0,Units,,
2,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,CHL,Chile,TOTAL,Total,FAM3,Share of births outside of marriage (% of all ...,...,74.8,,A,,PC,Percentage,0,Units,,
3,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,NLD,Netherlands,TOTAL,Total,FAM3,Share of births outside of marriage (% of all ...,...,51.9,,A,,PC,Percentage,0,Units,,
4,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,LTU,Lithuania,TOTAL,Total,FAM3,Share of births outside of marriage (% of all ...,...,26.4,,A,,PC,Percentage,0,Units,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
500,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,COL,Colombia,TOTAL,Total,FAM2,Mean age of women at childbirth,...,30.4,,A,,YR,Years,0,Units,,
501,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,OAVG,OECD - Average,TOTAL,Total,FAM2,Mean age of women at childbirth,...,30.5,,A,,YR,Years,0,Units,,
502,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,OAVG,OECD - Average,TOTAL,Total,FAM2,Mean age of women at childbirth,...,30.6,,A,,YR,Years,0,Units,,
503,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,OAVG,OECD - Average,TOTAL,Total,FAM2,Mean age of women at childbirth,...,30.7,,A,,YR,Years,0,Units,,


In [151]:
df_18 = pd.read_csv('../data/Raw/OECD/sf1_2_wide_from_df18.csv')
df_18

Unnamed: 0,country,year,Living with two parents,Living with a single parent,Other
0,Australia,2003,80.1,19.5,0.5
1,Australia,2006,81.5,18.0,0.5
2,Australia,2009,82.0,17.6,0.4
3,Australia,2012,81.3,18.0,0.6
4,Austria,2003,81.2,16.8,2.0
...,...,...,...,...,...
470,United States,2014,68.7,27.5,3.8
471,United States,2015,69.2,26.8,3.9
472,United States,2016,68.7,27.4,3.8
473,United States,2017,68.9,27.1,4.0


In [152]:
for col in df_18.select_dtypes(include=['object']).columns:
    df_18[col] = df_18[col].astype(str).str.strip()

# 2) Define placeholders representing missing data in OECD exports
placeholders = ['..', '...', '.', ' .', '…', 'Na', 'nan', 'None']

# 3) Replace placeholders with NaN directly in df_18
df_18.replace(placeholders, pd.NA, inplace=True)

In [153]:
# 1) Ensure 'year' is integer
df_18["year"] = pd.to_numeric(df_18["year"], errors="coerce").astype("Int64")

# 2) Convert all non-key columns to numeric and round(2)
for col in df_18.columns:
    if col not in ["country", "year"]:
        df_18[col] = pd.to_numeric(df_18[col], errors="coerce").round(2)

In [154]:
# 1) Drop rows with missing key fields
df_18.dropna(subset=["country", "year"], inplace=True)

# 2) Drop duplicate country-year rows, keep the first
df_18.drop_duplicates(subset=["country", "year"], keep="first", inplace=True)

# 3) Drop rows where all value columns are NaN
value_cols = [c for c in df_18.columns if c not in ["country", "year"]]
df_18.dropna(subset=value_cols, how="all", inplace=True)

# 4) Sort and reset index
df_18.sort_values(["country", "year"], inplace=True)
df_18.reset_index(drop=True, inplace=True)


In [155]:
df_18

Unnamed: 0,country,year,Living with two parents,Living with a single parent,Other
0,Australia,2003,80.1,19.5,0.5
1,Australia,2006,81.5,18.0,0.5
2,Australia,2009,82.0,17.6,0.4
3,Australia,2012,81.3,18.0,0.6
4,Austria,2003,81.2,16.8,2.0
...,...,...,...,...,...
470,United States,2014,68.7,27.5,3.8
471,United States,2015,69.2,26.8,3.9
472,United States,2016,68.7,27.4,3.8
473,United States,2017,68.9,27.1,4.0


In [156]:
df_18['Other'] = pd.to_numeric(df_18['Other'], errors='coerce')

In [157]:
df_info = pd.DataFrame({
    'dtype': df_18.dtypes,
    'null_count': df_18.isnull().sum(),
    'unique_count': df_18.nunique()
})
print(df_info)

                               dtype  null_count  unique_count
country                       object           0            39
year                           Int64           0            18
Living with two parents      float64           0           211
Living with a single parent  float64           0           203
Other                        float64           1            50


In [158]:
print(repr(df_18.loc[df_18['Other'].notnull(), 'Other'].unique()))

array([0.5, 0.4, 0.6, 2. , 1. , 1.9, 0.3, 0.1, 0.8, 0.7, 8.7, 3.5, 2.5,
       2.1, 2.4, 2.6, 6.7, 5.1, 1.4, 1.2, 1.7, 1.5, 3.4, 2.9, 2.3, 3. ,
       4.2, 2.8, 1.3, 9. , 0.2, 0.9, 1.1, 4.5, 4.7, 1.6, 3.8, 3.6, 3.3,
       2.2, 0. , 1.8, 2.7, 3.2, 3.9, 4.1, 4.4, 3.7, 4. , 4.3])


In [159]:
df_18['Other'] = pd.to_numeric(df_18['Other'], errors='coerce')

df_18.dropna(inplace=True, subset=['Other'])

df_18.isnull().sum()

country                        0
year                           0
Living with two parents        0
Living with a single parent    0
Other                          0
dtype: int64

In [160]:
#df_18.to_csv('../data/Cleaned/cleaned_household_children.csv', index=False)

In [161]:
#df_18.to_sql('household_children_oecd', engine, if_exists= 'replace', index= False)

In [162]:
df_888= pd.read_csv('../data/Raw/OECD/Households-by-type,-presence-of-children-and-country,-2015-2024.csv')
df_888

Unnamed: 0,Category,Single adult with children,Single adult without children,Couple with children,Couple without children,Other type of household with children,Other type of household without children
0,2015,6147.3,64181.3,31679.8,46641.6,11698.9,30771.6
1,2016,6148.5,63891.1,31907.3,47308.2,11766.3,30559.5
2,2017,6108.5,65353.9,32091.5,47426.1,11530.2,30297.5
3,2018,6163.6,66165.5,31720.2,48194.8,11342.5,30224.0
4,2019,6246.4,67417.9,31710.1,48503.6,11285.7,30134.8
5,2020,6136.4,67412.9,31622.2,48831.2,11212.9,30445.2
6,2021,5691.9,70200.4,30558.3,47447.4,11611.8,30700.7
7,2022,5984.9,72134.3,30469.3,47995.5,11513.6,30412.1
8,2023,5924.8,73396.2,30313.0,48477.5,11443.5,30608.8
9,2024,6077.7,75049.7,30286.5,49058.4,11311.9,30487.3


In [163]:
df_999 = pd.read_csv('../data/Raw/OECD/Households-with-children-by-number-of-children,-2024.csv')
df_999

Unnamed: 0,Category,1 child,2 children,3 children or more
0,European Union,11.7,8.9,3.0
1,,,,
2,Slovakia,17.1,14.5,4.0
3,Ireland,12.4,12.2,6.4
4,Cyprus,13.9,11.7,3.1
5,Czechia,13.9,11.6,2.6
6,Romania,14.3,9.2,4.0
7,Luxembourg,12.5,12.1,2.4
8,Belgium,11.8,10.2,4.1
9,Croatia,12.0,10.1,3.8


In [164]:
import pandas as pd

df = pd.read_csv('../data/Raw/OECD/OECD,DF_FAMILY,+all.csv')

df_wide = df.pivot_table(
    index=['Country', 'TIME_PERIOD', 'COU'],
    columns='Indicator',
    values='OBS_VALUE'
).reset_index()

df_wide.columns.name = None

df_wide.to_csv("WIDE_FORMAT.csv", index=False)

print(df_wide)

# df_wide = df.pivot(index='id', columns='variable', values='value')
# df = df.drop(columns='indicator')

            Country  TIME_PERIOD  COU  Child poverty rate  \
0         Argentina         2001  ARG                 NaN   
1         Argentina         2002  ARG                 NaN   
2         Argentina         2003  ARG                 NaN   
3         Argentina         2004  ARG                 NaN   
4         Argentina         2005  ARG                 NaN   
...             ...          ...  ...                 ...   
1170  United States         2018  USA                 NaN   
1171  United States         2019  USA                 NaN   
1172  United States         2020  USA                 NaN   
1173  United States         2021  USA                 NaN   
1174  United States         2022  USA                 NaN   

      Country mean average score in mathematics, by sex  \
0                                                   NaN   
1                                                   NaN   
2                                                   NaN   
3                              