In [1490]:
import pandas as pd
import os, re
from pathlib import Path
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from sqlalchemy import create_engine, types
from sqlalchemy import text 
from openpyxl import load_workbook

In [1491]:
from dotenv import dotenv_values

config = dotenv_values()

# define variables for the login
pg_user = config['POSTGRES_USER']  # align the key label with your .env file !
pg_host = config['POSTGRES_HOST']
pg_port = config['POSTGRES_PORT']
pg_db = config['POSTGRES_DB']
pg_schema = config['POSTGRES_SCHEMA']
pg_pass = config['POSTGRES_PASS']

In [1492]:
url = f'postgresql://{pg_user}:{pg_pass}@{pg_host}:{pg_port}/{pg_db}'
engine = create_engine(url, echo=False)

In [1493]:
my_schema = 'team_5' # update it to your schema

with engine.begin() as conn: 
    result = conn.execute(text(f'SET search_path TO {my_schema};'))

In [1494]:
df_1= pd.read_csv('../data/Raw/World_Marriage_Dataset.csv')

In [1495]:
df_1.drop(columns=["Sr.No."], inplace=True)

In [1496]:
df_1.columns = df_1.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [1497]:
df_1.rename(columns={
    "agegroup": "age_group",
    "maritalstatus": "marital_status",
    "dataprocess": "data_process",
    "Data Collection (Start Year)": "data_collection_start_year",
    "Data Collection (End Year)": "data_collection_end_year",
    "Data Source": "data_source",
    "Country": "country",
    "Sex": "sex"
}, inplace=True)

In [1498]:
df_1.drop_duplicates(inplace=True)
df_1.dropna(inplace=True)

df_1['data_collection_start_year'] = df_1['data_collection_start_year'].astype(str).str.replace(',', '').astype(int)
df_1['data_collection_end_year'] = df_1['data_collection_end_year'].astype(str).str.replace(',', '').astype(int)

In [1499]:
df_info = pd.DataFrame({
    'datatypes': df_1.dtypes,
    'null_count': df_1.isnull().sum(),
    'unique_count': df_1.nunique()
})
print(df_info)

                           datatypes  null_count  unique_count
country                       object           0           235
age_group                     object           0            63
sex                           object           0             2
marital_status                object           0            35
data_process                  object           0             6
data_collection_start_year     int32           0            62
data_collection_end_year       int32           0            60
data_source                   object           0            15


In [1500]:
#df_1.to_csv("cleaned_world_marriage.csv", index=False)

In [1501]:
#df_1.to_sql('world_marriage', engine, if_exists='replace', index=False)

In [1502]:
df_2 = pd.read_csv('../data/Raw/age-at-marriage-women.csv')

In [1503]:
df_2.columns = df_2.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [1504]:
df_2 = df_2.drop(columns=['1005564annotations'])

df_2.rename(columns={
    "entity": "country",   
}, inplace=True)

In [1505]:
df_2.drop_duplicates(inplace=True)
df_2.dropna(inplace=True)


In [1506]:
df_2['year'] = df_2['year'].astype(str).str.replace(',', '').astype(int)

In [1507]:
df_info = pd.DataFrame({
    'datatypes': df_2.dtypes,
    'null_count': df_2.isnull().sum(),
    'unique_count': df_2.nunique()
})
print(df_info)

                                    datatypes  null_count  unique_count
country                                object           0            41
code                                   object           0            41
year                                    int32           0            32
mean_age_of_women_at_first_marriage   float64           0           179


In [1508]:
#df_2.to_csv("cleaned_age_at_marriage_women.csv", index=False)

In [1509]:
#df_2.to_sql('age_at_marriage_women', engine, if_exists='replace', index=False)

In [1510]:
df_3= pd.read_csv('../data/Raw/marriage-rate-per-1000-inhabitants.csv')

In [1511]:
df_3.columns = df_3.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [1512]:
df_3.rename(columns={
    "entity": "country",   
}, inplace=True)

In [1513]:
df_3['year'] = df_3['year'].astype(str).str.replace(',', '').astype(int)

In [1514]:
df_3.drop_duplicates(inplace=True)
df_3.dropna(inplace=True)


In [1515]:
df_info = pd.DataFrame({
    'datatypes': df_3.dtypes,
    'null_count': df_3.isnull().sum(),
    'unique_count': df_3.nunique()
})
print(df_info)

                                              datatypes  null_count  \
country                                          object           0   
code                                             object           0   
year                                              int32           0   
crude_marriage_rate_marriages_per_1000_people   float64           0   

                                               unique_count  
country                                                  45  
code                                                     45  
year                                                    127  
crude_marriage_rate_marriages_per_1000_people           109  


In [1516]:
#df_3.to_csv("cleaned_marriage-rate-per-1000-inhabitants.csv", index=False)

In [1517]:
#df_3.to_sql('married_rate_per_1000', engine, if_exists='replace', index=False)

In [1518]:
df_4= pd.read_csv('../data/Raw/marriage-rates-in-1990-vs-2020.csv')

In [1519]:
df_4.columns = df_4.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [1520]:
df_4 = df_4.drop(columns=['worldregionsaccordingtoowid'])

df_4.rename(columns={
    "crudemarriageratemarriagesper1000people": "crude_marriage_rate",
    "crudemarriageratemarriagesper1000people1": "crude_marriage_rate_people1",
    "year1": "year_1",
    "entity": "country"
}, inplace=True)

In [1521]:
df_4.drop_duplicates(inplace=True)
df_4.dropna(inplace=True)

In [1522]:
df_4['year_1'] = pd.to_numeric(df_4['year_1'], errors='coerce').astype('Int64')

In [1523]:
df_info = pd.DataFrame({
    'datatypes': df_4.dtypes,
    'null_count': df_4.isnull().sum(),
    'unique_count': df_4.nunique()
})
print(df_info)

                            datatypes  null_count  unique_count
country                        object           0            38
code                           object           0            38
year                            int64           0            61
crude_marriage_rate           float64           0           101
crude_marriage_rate_people1   float64           0            28
year_1                          Int64           0             1


In [1524]:
#df_4.to_csv("cleaned_marriage-rates-in-1990-vs-2020.csv", index=False)

In [1525]:
#df_4.to_sql('marriage_rates_in_1990_vs_2020', engine, if_exists='replace', index=False)

In [1526]:
df_5 = pd.read_csv('../data/Raw/share-of-births-outside-marriage.csv')

In [1527]:
df_5.columns = df_5.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [1528]:

df_5.rename(columns={
    "shareofbirthsoutsideofmarriageofallbirths": "share_of_births_outside_of_marriage",
    "entity": "country"
}, inplace=True)

df_5.drop_duplicates(inplace=True)
df_5.dropna(inplace=True)

In [1529]:
df_info = pd.DataFrame({
    'datatypes': df_5.dtypes,
    'null_count': df_5.isnull().sum(),
    'unique_count': df_5.nunique()
})
print(df_info)

                                    datatypes  null_count  unique_count
country                                object           0            42
code                                   object           0            42
year                                    int64           0            62
share_of_births_outside_of_marriage   float64           0           610


In [1530]:
#df_5.to_csv("cleaned_share-of-births-outside-marriage.csv", index=False)

In [1531]:
#df_5.to_sql('share_of_births_outside_marriage', engine, if_exists='replace', index=False)

In [1532]:
df_6 = pd.read_csv('../data/Raw/share-of-men-in-england-and-wales-who-have-ever-married-by-age.csv')
df_6

Unnamed: 0,Entity,Code,Year,Proportions of men or women who had ever married by a certain age for 1900 birth cohort,Proportions of men or women who had ever married by a certain age for 1920 birth cohort,Proportions of men or women who had ever married by a certain age for 1940 birth cohort,Proportions of men or women who had ever married by a certain age for 1960 birth cohort,Proportions of men or women who had ever married by a certain age for 1970 birth cohort,Proportions of men or women who had ever married by a certain age for 1980 birth cohort,Proportions of men or women who had ever married by a certain age for 1990 birth cohort,Proportions of men or women who had ever married by a certain age for 2000 birth cohort
0,Men,,17,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0
1,Men,,18,0.1,0.1,0.4,0.6,0.1,0.0,0.0,0.0
2,Men,,19,0.8,0.6,2.0,2.5,0.7,0.3,0.1,0.0
3,Men,,20,2.4,2.2,6.0,6.2,1.9,0.7,0.3,0.1
4,Men,,21,6.1,7.4,13.6,11.9,3.9,1.4,0.6,0.2
...,...,...,...,...,...,...,...,...,...,...,...
63,Women,,46,84.5,91.6,95.5,86.9,75.0,,,
64,Women,,47,84.8,91.7,95.6,87.0,75.4,,,
65,Women,,48,85.0,91.8,95.6,87.2,75.7,,,
66,Women,,49,85.2,91.9,95.7,87.3,76.0,,,


In [1533]:
df_6.columns = df_6.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

df_6.drop_duplicates(inplace=True)

df_6.head()

Unnamed: 0,entity,code,year,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort
0,Men,,17,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0
1,Men,,18,0.1,0.1,0.4,0.6,0.1,0.0,0.0,0.0
2,Men,,19,0.8,0.6,2.0,2.5,0.7,0.3,0.1,0.0
3,Men,,20,2.4,2.2,6.0,6.2,1.9,0.7,0.3,0.1
4,Men,,21,6.1,7.4,13.6,11.9,3.9,1.4,0.6,0.2


In [1534]:
df_6 = df_6.drop(columns=['code','proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort'])

df_6.rename(columns={
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort": "1900_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort": "1920_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort": "1940_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort": "1960_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort": "1970_birthcohort",
    "entity": "sex"
}, inplace=True)
df_6

Unnamed: 0,sex,year,1900_birthcohort,1920_birthcohort,1940_birthcohort,1960_birthcohort,1970_birthcohort
0,Men,17,0.0,0.0,0.0,0.1,0.0
1,Men,18,0.1,0.1,0.4,0.6,0.1
2,Men,19,0.8,0.6,2.0,2.5,0.7
3,Men,20,2.4,2.2,6.0,6.2,1.9
4,Men,21,6.1,7.4,13.6,11.9,3.9
...,...,...,...,...,...,...,...
63,Women,46,84.5,91.6,95.5,86.9,75.0
64,Women,47,84.8,91.7,95.6,87.0,75.4
65,Women,48,85.0,91.8,95.6,87.2,75.7
66,Women,49,85.2,91.9,95.7,87.3,76.0


In [1535]:
df_6.dropna(inplace=True)
df_6.describe

<bound method NDFrame.describe of       sex  year  1900_birthcohort  1920_birthcohort  1940_birthcohort  \
0     Men    17               0.0               0.0               0.0   
1     Men    18               0.1               0.1               0.4   
2     Men    19               0.8               0.6               2.0   
3     Men    20               2.4               2.2               6.0   
4     Men    21               6.1               7.4              13.6   
..    ...   ...               ...               ...               ...   
63  Women    46              84.5              91.6              95.5   
64  Women    47              84.8              91.7              95.6   
65  Women    48              85.0              91.8              95.6   
66  Women    49              85.2              91.9              95.7   
67  Women    50              85.4              92.0              95.7   

    1960_birthcohort  1970_birthcohort  
0                0.1               0.0  
1      

In [1536]:
df_info = pd.DataFrame({
    'datatypes': df_6.dtypes,
    'null_count': df_6.isnull().sum(),
    'unique_count': df_6.nunique()
})
print(df_info)

                 datatypes  null_count  unique_count
sex                 object           0             2
year                 int64           0            34
1900_birthcohort   float64           0            66
1920_birthcohort   float64           0            61
1940_birthcohort   float64           0            62
1960_birthcohort   float64           0            67
1970_birthcohort   float64           0            65


In [1537]:
df_6.sample(12)

Unnamed: 0,sex,year,1900_birthcohort,1920_birthcohort,1940_birthcohort,1960_birthcohort,1970_birthcohort
57,Women,40,82.4,90.5,95.0,85.3,72.0
15,Men,32,79.3,82.2,86.5,68.8,48.2
46,Women,29,68.7,80.4,89.8,74.5,52.4
39,Women,22,24.3,39.3,57.5,40.6,18.2
35,Women,18,0.4,1.6,4.6,4.6,1.3
21,Men,38,88.1,89.0,90.5,76.6,61.3
60,Women,43,83.7,91.1,95.3,86.2,73.7
47,Women,30,71.6,82.7,90.9,76.7,55.8
49,Women,32,75.5,85.8,92.4,79.9,61.0
31,Men,48,92.5,91.7,92.3,81.3,69.9


In [1538]:
#df_6.to_csv("cleaned_share-of-men-in-england-and-wales-who-have-ever-married-by-age.csv", index=False)

In [1539]:
#df_6.to_sql('men_in_england_and_wales_married_by_age', engine, if_exists='replace', index=False)

In [1540]:
df_7 = pd.read_csv('../data/Raw/share-of-births-outside-marriage.csv')

In [1541]:
df_7.columns = df_7.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [1542]:
df_7.rename(columns={
    "shareofsingleparenthouseholds": "share_of_single_parent_households",
    "entity": "country"
}, inplace=True)

df_7.drop_duplicates(inplace=True)
df_7.dropna(inplace=True)
df_7.sample(5)

Unnamed: 0,country,code,year,shareofbirthsoutsideofmarriageofallbirths
429,Cyprus,CYP,2005,4.4
1999,Switzerland,CHE,2014,21.7
1359,Netherlands,NLD,1967,2.1
479,Czechia,CZE,1995,15.6
2045,United Kingdom,GBR,1989,26.6


In [1543]:
df_info = pd.DataFrame({
    'datatypes': df_7.dtypes,
    'null_count': df_7.isnull().sum(),
    'unique_count': df_7.nunique()
})
print(df_info)

                                          datatypes  null_count  unique_count
country                                      object           0            42
code                                         object           0            42
year                                          int64           0            62
shareofbirthsoutsideofmarriageofallbirths   float64           0           610


In [1544]:
#df_7.to_csv("cleaned_share-of-single-parent-households.csv", index=False)

In [1545]:
#df_7.to_sql('single_parent_households', engine, if_exists='replace', index=False)

In [1546]:
df_8 = pd.read_csv('../data/Raw/share-of-women-in-england-and-wales-who-have-ever-married-by-age.csv')

In [1547]:
df_8.columns = df_8.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [1548]:
df_8['code'] = df_8['code'].fillna('GBR')
df_8.sample(5)

Unnamed: 0,entity,code,year,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort
64,Women,GBR,47,84.8,91.7,95.6,87.0,75.4,,,
35,Women,GBR,18,0.4,1.6,4.6,4.6,1.3,0.4,0.1,0.0
60,Women,GBR,43,83.7,91.1,95.3,86.2,73.7,63.7,,
58,Women,GBR,41,82.9,90.7,95.1,85.7,72.7,62.0,,
48,Women,GBR,31,73.9,84.5,91.7,78.4,58.7,42.0,30.2,


In [1549]:
df_8 = df_8.drop(columns=['code','proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort'])

df_8.rename(columns={
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort": "1900_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort": "1920_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort": "1940_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort": "1960_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort": "1970_birthcohort",
    "entity": "sex"
}, inplace=True)

df_8.drop_duplicates(inplace=True)
df_8.dropna(inplace=True)
df_8.sample(5)

Unnamed: 0,sex,year,1900_birthcohort,1920_birthcohort,1940_birthcohort,1960_birthcohort,1970_birthcohort
28,Men,45,91.8,91.3,92.0,80.5,68.2
23,Men,40,89.6,89.9,91.1,78.0,64.0
54,Women,37,80.6,89.4,94.4,84.1,69.4
45,Women,28,65.0,77.4,88.4,72.0,48.8
13,Men,30,72.9,76.4,83.3,63.9,41.4


In [1550]:
df_info = pd.DataFrame({
    'datatypes': df_8.dtypes,
    'null_count': df_8.isnull().sum(),
    'unique_count': df_8.nunique()
})
print(df_info)

                 datatypes  null_count  unique_count
sex                 object           0             2
year                 int64           0            34
1900_birthcohort   float64           0            66
1920_birthcohort   float64           0            61
1940_birthcohort   float64           0            62
1960_birthcohort   float64           0            67
1970_birthcohort   float64           0            65


In [1551]:
#df_8.to_csv("cleaned_share-of-women-in-england-and-wales-who-have-ever-married-by-age.csv", index=False)

In [1552]:
#df_8.to_sql('women_in_england_and_wales_married_by_age', engine, if_exists='replace', index=False)

In [1553]:
#pip install openpyxl pywin32

In [1554]:
df_excel_1 = pd.read_excel('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx')

In [1555]:
#all_sheets = pd.read_excel('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx', sheet_name=None)

In [1556]:
xls_1 = pd.ExcelFile('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx')
print(xls_1.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'MARITAL_STATUS_BY_AGE', 'CURRENTLY MARRIED', 'EVER_MARRIED', 'SMAM']


In [1557]:
excel_1 = '../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx'

# Output directory (make sure it exists)
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)

# List of sheets you want to extract
sheets_to_extract = ['MARITAL_STATUS_BY_AGE', 'CURRENTLY MARRIED', 'EVER_MARRIED', 'SMAM']

In [1558]:
"""for sheet in sheets_to_extract:
    # Read just this sheet into a DataFrame
    df_excel_1 = pd.read_excel(excel_1, sheet_name=sheet)
    
    # Optional: Clean the filename (replace spaces with underscores, etc.)
    csv_name = sheet.replace(' ', '_').lower() + '.csv'
    csv_path = os.path.join(output_dir, csv_name)
    
    # Save the DataFrame as CSV
    df_excel_1.to_csv(csv_path, index=False)
    print(f"Saved: {csv_path}")
"""

'for sheet in sheets_to_extract:\n    # Read just this sheet into a DataFrame\n    df_excel_1 = pd.read_excel(excel_1, sheet_name=sheet)\n    \n    # Optional: Clean the filename (replace spaces with underscores, etc.)\n    csv_name = sheet.replace(\' \', \'_\').lower() + \'.csv\'\n    csv_path = os.path.join(output_dir, csv_name)\n    \n    # Save the DataFrame as CSV\n    df_excel_1.to_csv(csv_path, index=False)\n    print(f"Saved: {csv_path}")\n'

In [1559]:
xls_2 = pd.ExcelFile('../data/Raw/undesa_pd_2019_world_fertility_dataset.xlsx')
print(xls_2.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'FERTILITY INDICATORS']


In [1560]:
excel_2 = '../data/Raw/undesa_pd_2019_world_fertility_dataset.xlsx'
sheet_name = 'FERTILITY INDICATORS'
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)

df_excel_2 = pd.read_excel(excel_2, sheet_name=sheet_name)


In [1561]:
"""csv_name = sheet_name.replace(' ', '_').lower() + '.csv'
csv_path = os.path.join(output_dir, csv_name)
df_excel_2.to_csv(csv_path, index=False)
print(f"Saved: {csv_path}")
"""

'csv_name = sheet_name.replace(\' \', \'_\').lower() + \'.csv\'\ncsv_path = os.path.join(output_dir, csv_name)\ndf_excel_2.to_csv(csv_path, index=False)\nprint(f"Saved: {csv_path}")\n'

In [1562]:
xls_3 = pd.ExcelFile('../data/Raw/undesa_pd_ds_1970-2030_fp_rev-2024_rev.xlsx')
print(xls_3.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'Countries', 'Regions']


In [1563]:
excel_3 = '../data/Raw/undesa_pd_ds_1970-2030_fp_rev-2024_rev.xlsx'
sheets_to_extract = ['Countries', 'Regions']
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)


In [1564]:
"""
for sheet in sheets_to_extract:
    df = pd.read_excel(excel_3, sheet_name=sheet)
    csv_name = sheet.replace(' ', '_').lower() + '.csv'
    csv_path = os.path.join(output_dir, csv_name)
    df.to_csv(csv_path, index=False)
    print(f"Saved: {csv_path}")

"""

'\nfor sheet in sheets_to_extract:\n    df = pd.read_excel(excel_3, sheet_name=sheet)\n    csv_name = sheet.replace(\' \', \'_\').lower() + \'.csv\'\n    csv_path = os.path.join(output_dir, csv_name)\n    df.to_csv(csv_path, index=False)\n    print(f"Saved: {csv_path}")\n\n'

In [1565]:
df_9 = pd.read_csv('../data/Raw/unpopulation_dataportal_20250728095844.csv')
df_9.sample(5)

Unnamed: 0,IndicatorId,IndicatorName,IndicatorShortName,Source,SourceYear,Author,LocationId,Location,Iso2,Iso3,...,AgeStart,AgeEnd,Age,CategoryId,Category,EstimateTypeId,EstimateType,EstimateMethodId,EstimateMethod,Value
6390,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,222,El Salvador,SV,SLV,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,57.67
22203,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,768,Togo,TG,TGO,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,72.1
13982,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,474,Martinique,MQ,MTQ,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,33.11
10284,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,360,Indonesia,ID,IDN,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,70.99
9495,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,328,Guyana,GY,GUY,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,57.87


In [1566]:
df_9.columns = df_9.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_9.sample(5)

Unnamed: 0,indicatorid,indicatorname,indicatorshortname,source,sourceyear,author,locationid,location,iso2,iso3,...,agestart,ageend,age,categoryid,category,estimatetypeid,estimatetype,estimatemethodid,estimatemethod,value
21565,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,748,Eswatini,SZ,SWZ,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,44.4
15421,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,524,Nepal,NP,NPL,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,76.01
9440,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,328,Guyana,GY,GUY,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,55.54
21265,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,729,Sudan,SD,SDN,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,3,Projection,61.63
6122,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,212,Dominica,DM,DMA,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,48.48


In [1567]:
df_9 = df_9.drop(columns=['indicatorid','indicatorshortname',
    'source',
    'author', 'locationid', 'iso2','estimatetypeid','category','categoryid','agestart','ageend','author','ageid', 'estimatetype','variantid','sexid','timeid'])

df_9.rename(columns={
    "sourceyear": "year",
    "location": "country",
    "estimatemethodid": "estimate_method",
    "iso3": "code",
}, inplace=True)



In [1568]:
df_9.drop_duplicates(inplace=True)
df_9.dropna(inplace = True)

df_9

Unnamed: 0,indicatorname,year,country,code,time,variant,sex,age,estimate_method,estimatemethod,value
0,Currently married (Percent),2024,Afghanistan,AFG,1970,Median,Female,15-49,2,Interpolation,80.94
2,Currently married (Percent),2024,Afghanistan,AFG,1971,Median,Female,15-49,2,Interpolation,80.90
4,Currently married (Percent),2024,Afghanistan,AFG,1972,Median,Female,15-49,2,Interpolation,80.87
6,Currently married (Percent),2024,Afghanistan,AFG,1973,Median,Female,15-49,2,Interpolation,80.84
8,Currently married (Percent),2024,Afghanistan,AFG,1974,Median,Female,15-49,2,Interpolation,80.53
...,...,...,...,...,...,...,...,...,...,...,...
25078,Currently married (Percent),2024,Zambia,ZMB,2021,Median,Female,15-49,3,Projection,54.31
25080,Currently married (Percent),2024,Zambia,ZMB,2022,Median,Female,15-49,3,Projection,53.82
25082,Currently married (Percent),2024,Zambia,ZMB,2023,Median,Female,15-49,3,Projection,53.35
25084,Currently married (Percent),2024,Zambia,ZMB,2024,Median,Female,15-49,3,Projection,52.91


In [1569]:
df_info = pd.DataFrame({
    'datatypes': df_9.dtypes,
    'null_count': df_9.isnull().sum(),
    'unique_count': df_9.nunique()
})
print(df_info)

                datatypes  null_count  unique_count
indicatorname      object           0             1
year                int64           0             1
country            object           0           224
code               object           0           224
time                int64           0            56
variant            object           0             1
sex                object           0             1
age                object           0             1
estimate_method     int64           0             2
estimatemethod     object           0             2
value             float64           0          3867


In [1570]:
#df_9.to_csv("cleaned_unpopulation_dataportal.csv", index=False)

In [1571]:
#df_9.to_sql('unpopulation_dataportal', engine, if_exists='replace', index=False)

In [1572]:
df_10 = pd.read_csv('../data/processed/countries_un.csv',  header=5, low_memory=False)

In [1573]:
df_10.columns = (
    df_10.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
)
df_10.sample(10)

Unnamed: 0,countryorarea,isocode,indicator,year,agegroup,percentage,number,dataprocess
10791,Bermuda,60,Married or in-union women,2022,15-49,47.253411,6.399766,Projection
38829,Ethiopia,231,Married or in-union women,2044,40-44,86.150626,5172.445222,Projection
54363,Guatemala,320,Married or in-union women,2042,30-34,65.328324,605.349236,Projection
54675,Guinea,324,Married or in-union women,2000,30-34,95.814435,247.975903,Estimate
3978,Azerbaijan,31,Married or in-union women,1981,25-29,78.321579,204.771377,Estimate
51153,Greece,300,Married or in-union women,2046,20-24,4.708345,9.268588,Projection
52821,Guadeloupe,312,Married or in-union women,2011,40-44,52.365909,9.193883,Estimate
16541,Brunei Darussalam,96,Married or in-union women,2012,40-44,80.221802,12.041293,Projection
74039,Libya,434,Married or in-union women,1990,15-49,52.849501,497.521235,Estimate
58000,Hungary,348,Married or in-union women,2011,15-19,4.252439,12.212749,Estimate


In [1574]:
df_10.rename(columns={
    "dataprocess": "data_process",
}, inplace=True)

df_10.drop_duplicates(inplace=True)
df_10.sample(5)

Unnamed: 0,countryorarea,isocode,indicator,year,agegroup,percentage,number,data_process
97399,Micronesia (Fed. States of),583,Married or in-union women,1994,15-49,51.041554,12.94031,Estimate
6549,Bahamas,44,Married or in-union women,1978,40-44,65.699174,3.737297,Estimate
142243,Uzbekistan,860,Married or in-union women,2011,30-34,89.926923,958.554454,Estimate
127097,Syrian Arab Republic,760,Married or in-union women,1981,20-24,59.587097,229.121623,Estimate
93509,New Zealand,554,Married or in-union women,1994,40-44,78.05344,98.633791,Estimate


In [1575]:
for col in ['percentage', 'number']:
    if col in df_10.columns:
        df_10[col] = (
            df_10[col]
            .astype(str)
            .str.replace(',', '.', regex=False)
            .str.extract(r'([-+]?[0-9]*\.?[0-9]+)', expand=False)
            .astype(float)
            .round(2)
        )

In [1576]:
unnamed_cols = [col for col in df_10.columns if 'unnamed' in col.lower()]
df_10.drop(columns=unnamed_cols, inplace=True)

In [1577]:
df_10.dropna(inplace=True)

In [1578]:
df_info = pd.DataFrame({
    'datatypes': df_10.dtypes,
    'null_count': df_10.isnull().sum(),
    'unique_count': df_10.nunique()
})
print(df_info)

              datatypes  null_count  unique_count
countryorarea    object           0           225
isocode           int64           0           225
indicator        object           0             1
year              int64           0            81
agegroup         object           0             8
percentage      float64           0          9667
number          float64           0         65394
data_process     object           0             2


In [1579]:
df_10.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145800 entries, 0 to 145799
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   countryorarea  145800 non-null  object 
 1   isocode        145800 non-null  int64  
 2   indicator      145800 non-null  object 
 3   year           145800 non-null  int64  
 4   agegroup       145800 non-null  object 
 5   percentage     145800 non-null  float64
 6   number         145800 non-null  float64
 7   data_process   145800 non-null  object 
dtypes: float64(2), int64(2), object(4)
memory usage: 8.9+ MB


In [1580]:
#df_10.to_csv("cleaned_countries_1970_2025_un.csv", index=False)

In [1581]:
#df_10.to_sql('countries_1970_2025_un', engine, if_exists='replace', index=False)

In [1582]:
df_11 = pd.read_csv('../data/processed/currently_married_un.csv',  header=2, low_memory=False)

In [1583]:
df_11.sample(8)

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,AgeGroup,AgeStart,AgeEnd,DataValue,DataProcess,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Data,Note on Country and Population
1386,Australia,36,1997,1997,Men,[25-29],25,29,34.7,Estimate,1997 Estimate,2037,Australia 1997 Estimate,UNSD,,,
35760,Norway,578,1980,1980,Women,[25-29],25,29,70.22,Estimate,1980 Estimate,2180,Norway 1980 Estimate,UNSD,,,
8406,"China, Macao SAR",446,2006,2006,Men,[50-54],50,54,91.78,Census,2006 Census,3177,"China, Macao (SAR) 2006 Census",UNSD,,,
44543,Slovenia,705,2013,2013,Women,[40-44],40,44,58.9,Estimate,2013 Estimate,2218,Slovenia 2013 Estimate,UNSD,,,
47031,Sweden,752,1996,1996,Women,[45-49],45,49,64.16,Estimate,1996 Estimate,2227,Sweden 1996 Estimate,UNSD,,,
11805,Denmark,208,2009,2009,Men,[60-64],60,64,72.71,Estimate,2009 Estimate,2081,Denmark 2009 Estimate,UNSD,,Based on data compiled from registers.,Excluding Faeroe Islands and Greenland shown s...
35077,Niger,562,2006,2006,Women,[20-24],20,24,86.4,Survey,2006 DHS,1736,Niger 2006 Demographic and Health Survey,DHS_STATcompiler,1.0,,
5846,Cabo Verde,132,2000,2000,Men,[30-34],30,34,59.4,Census,2000 Census,1219,Cape Verde 2000 Census,National statistics,1.0,,


In [1584]:
df_11.columns = (
    df_11.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
    )
df_11.sample(8)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,agegroup,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteondata,noteoncountryandpopulation
24565,Israel,376,2013,2013,Women,[40-44],40,44,74.57,Estimate,2013 Estimate,2127,Israel 2013 Estimate,UNSD,,,
14109,Falkland Islands (Malvinas),238,2016,2016,Men,[15-19],15,19,1.64,Census,2016 Census,7296,Falkland Islands (Malvinas) 2016 Census,National statistics,1.0,,
47633,Switzerland,756,1986,1986,Women,[10-14],10,14,0.0,Estimate,1986 Estimate,2228,Switzerland 1986 Estimate,UNSD,,,
43165,Seychelles,690,1994,1994,Men,[35-39],35,39,71.95,Census,1994 Census,2562,Seychelles 1994 Census,UNSD,1.0,,
18957,Guinea,324,2012,2012,Men,[30-34],30,34,79.5,Survey,2012 DHS,5422,Guinea 2012 Demographic and Health Survey,DHS_STATcompiler,1.0,,
4851,Bulgaria,100,1985,1985,Women,[15-19],15,19,15.83,Census,1985 Census,846,Bulgaria 1985 Census,UNSD,1.0,,
16016,France,250,1988,1988,Men,[25-29],25,29,45.94,Estimate,1988 Estimate,2094,France 1988 Estimate,UNSD,,,Excluding diplomatic personnel outside the cou...
15241,Finland,246,2011,2011,Men,[70-74],70,74,70.81,Estimate,2011 Estimate,2093,Finland 2011 Estimate,UNSD,1.0,,Excluding Åland Islands.


In [1585]:
df_11 = df_11.drop(columns = ['datacataloglongname', 'datacatalogid', 'yearstart' , 'yearend', 'noteondata', 'noteoncountryandpopulation', 'including_consensual_unions'])

df_11.rename(columns={
    "agestart": "age_start",
    "countryorarea": "country",
    "datasource": "data_source",
    "datavalue" : "data_value"
}, inplace=True)

df_11.sample(10)

Unnamed: 0,country,isocode,sex,agegroup,age_start,ageend,data_value,dataprocess,datacatalogshortname,data_source
39361,Republic of Moldova,498,Men,[50-54],50,54,91.02,Survey,2000 MICS_HH,MICS_HH
37147,Pakistan,586,Men,[30-34],30,34,80.87,Survey,2017 DHS,DHS_HH
14978,Finland,246,Women,[75+],75,999,19.91,Census,2000 Census,UNSD
4905,Bulgaria,100,Women,[45-49],45,49,81.38,Census,2001 Census,UNSD
52202,Vanuatu,548,Women,[25-29],25,29,86.05,Survey,2007 MICS,MICS
36886,Pakistan,586,Women,[35-39],35,39,94.75,Census,1981 Census,UNSD
49695,Trinidad and Tobago,780,Women,[65+],65,999,31.29,Census,2000 Census,UNSD
40934,Saint Vincent and the Grenadines,670,Women,[75+],75,999,18.53,Census,2012 Census,National statistics
45705,State of Palestine,275,Women,[15-19],15,19,26.9,Census,1997 Census,UNSD
41647,San Marino,674,Men,[30-34],30,34,76.34,Estimate,1986 Estimate,UNSD


In [1586]:
df_11.drop_duplicates(inplace=True)
df_11.dropna(inplace=True)

In [1587]:
df_info = pd.DataFrame({
    'datatypes': df_11.dtypes,
    'null_count': df_11.isnull().sum(),
    'unique_count': df_11.nunique()
})
print(df_info)

                     datatypes  null_count  unique_count
country                 object           0           233
isocode                  int64           0           230
sex                     object           0             2
agegroup                object           0            23
age_start                int64           0            17
ageend                   int64           0            15
data_value             float64           0          9213
dataprocess             object           0             6
datacatalogshortname    object           0           412
data_source             object           0            15


In [1588]:
#df_11.to_csv("cleaned_currently_married_un.csv", index=False)

In [1589]:
#df_11.to_sql('currently_married_un', engine, if_exists='replace', index=False)

In [1590]:
df_12 = pd.read_csv('../data/processed/ever_married_un.csv', header= 2, low_memory = False)
df_12.head()

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,AgeGroup,AgeStart,AgeEnd,DataValue,DataProcess,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Data,Note on Country and Population
0,Afghanistan,4,1972,1974,Men,[15-19],15,19,7.7,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
1,Afghanistan,4,1972,1974,Men,[20-24],20,24,32.6,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
2,Afghanistan,4,1972,1974,Men,[25-29],25,29,61.4,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
3,Afghanistan,4,1972,1974,Men,[30-34],30,34,83.0,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
4,Afghanistan,4,1972,1974,Men,[35-39],35,39,91.2,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,


In [1591]:
df_12.columns = (
    df_12.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
    )
df_12.sample(8)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,agegroup,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteondata,noteoncountryandpopulation
7914,Chile,152,1970,1970,Men,[70-74],70,74,90.14,Census,1970 Census,1285,Chile 1970 Census,UNSD,1.0,Based on a sample of census returns. | Data ha...,
42888,Saint Vincent and the Grenadines,670,1991,1991,Women,[35-39],35,39,48.11,Census,1991 Census,377,Saint Vincent and the Grenadines 1991 Census,UNSD,,,
31189,Luxembourg,442,2011,2011,Men,[50-54],50,54,88.91,Census,2011 Census,4832,Luxembourg 2011 Census,Eurostat,1.0,Estimates computed based on data on marital st...,
13196,Dominica,212,2001,2001,Women,[50-54],50,54,60.42,Census,2001 Census,2345,Dominica 2001 Census,UNSD,,,
41781,Romania,642,1995,1995,Women,[70-74],70,74,96.81,Estimate,1995 Estimate,2199,Romania 1995 Estimate,UNSD,,,
32597,Mauritania,478,1976,1977,Men,[25-29],25,29,52.98,Census,1976-1977 Census,1063,Mauritania 1976-1977 Census,UNSD,,,Including nomads.
31906,Maldives,462,2009,2009,Women,[25-29],25,29,89.2,Survey,2009 DHS,4699,Maldives 2009 Demographic and Health Survey,DHS_HH,,,
26923,Japan,392,1975,1975,Men,[75+],75,999,98.81,Census,1975 Census,1608,Japan 1975 Census,UNSD,,,Excluding diplomatic personnel outside the cou...


In [1592]:
df_12 = df_12.drop(columns = ['yearstart', 'yearend', 'datacatalogshortname', 'datacatalogid', 'datacataloglongname', 'including_consensual_unions', 'noteondata', 'noteoncountryandpopulation'])

df_12.rename(columns={
    "agestart": "age_start",
    "ageend": "age_end",
    "countryorarea": "country"
}, inplace=True)
df_12.sample(8)

Unnamed: 0,country,isocode,sex,agegroup,age_start,age_end,datavalue,dataprocess,datasource
4529,Brazil,76,Women,[15-19],15,19,15.54,Census,UNSD
39510,Paraguay,600,Women,[35-39],35,39,81.44,Census,UNSD
10768,Cyprus,196,Men,[25-29],25,29,64.76,Census,UNSD
26223,Italy,380,Women,[45-49],45,49,87.13,Estimate,UNSD
23031,Iceland,352,Women,[20-24],20,24,8.65,Estimate,UNSD
13099,Djibouti,262,Women,[20-24],20,24,14.2,Survey,National statistics
32068,Mali,466,Women,[35-39],35,39,100.0,Survey,DHS_STATcompiler
11004,Czechia,203,Men,[25-29],25,29,68.09,Estimate,UNSD


In [1593]:
df_12.dropna(inplace=True)

df_info = pd.DataFrame({
    'datatypes': df_12.dtypes,
    'null_count': df_12.isnull().sum(),
    'unique_count': df_12.nunique()
})
print(df_info)

            datatypes  null_count  unique_count
country        object           0           233
isocode         int64           0           230
sex            object           0             2
agegroup       object           0            23
age_start       int64           0            17
age_end         int64           0            15
datavalue     float64           0          8396
dataprocess    object           0             6
datasource     object           0            15


In [1594]:
#df_12.to_csv("cleaned_ever_married_un.csv", index=False)

In [1595]:
#df_12.to_sql('ever_married_un', engine, if_exists= 'replace', index= False)

In [1596]:
df_13 = pd.read_csv('../data/processed/fertility_indicators_un.csv', header=6, low_memory=False)
df_13.head()

Unnamed: 0,Country or Area,Country or Area Code,Age Group,Indicator,Date,Value,Series,DataType,Data Source Type,Survey Programme,Data Source Inventory ID,Data Source Name,Data Source Name (short),Data Source Start Year,Data Source End Year,Reference,Reference Year
0,Afghanistan,4,[Total],TFR,1964.977051,7.966653,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
1,Afghanistan,4,[Total],TFR,1965.977051,8.212275,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
2,Afghanistan,4,[Total],TFR,1966.977051,8.317603,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
3,Afghanistan,4,[Total],TFR,1967.977051,8.225812,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
4,Afghanistan,4,[Total],TFR,1968.977051,8.068459,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012


In [1597]:
df_13.columns = (df_13.columns
        .str.lower()
        .str.strip()
        .str.replace(' ', '')
        .str.replace('(', '')
        .str.replace(')', '')
        .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
        )

df_13.sample(6)

Unnamed: 0,countryorarea,countryorareacode,agegroup,indicator,date,value,series,datatype,datasourcetype,surveyprogramme,datasourceinventoryid,datasourcename,datasourcenameshort,datasourcestartyear,datasourceendyear,reference,referenceyear
19400,Cyprus,196,[30-34],ASFR3034,1998.5,90.8,"Estimates,Fertility data (Adjusted),HFC-ODE,20...",Fertility data (adjusted),Estimate,Estimate,2078,All sources of estimates,Estimates,1998,1998,European Demographic Observatory (ODE). Data c...,2011.0
77014,Venezuela (Bolivarian Republic of),862,[20-24],ASFR2024,2002.5,129.0069,"Register,Computed rate from DYB,DYB,556-135-56",Computed rate from DYB,Register,VR,556,Vital Registration,Register,2002,2002,Demographic Yearbook,2003.0
1840,Argentina,32,[20-24],ASFR2024,1987.50137,154.151,VR_Census,Direct,Register,VR,434,Vital Registration,Register,1987,1987,INDEC,2018.0
557,Albania,8,[40-44],ASFR4044,1979.5,33.712,"Estimates,Fertility data (Adjusted),HFC-ODE,20...",Fertility data (adjusted),Estimate,Estimate,2029,All sources of estimates,Estimates,1979,1979,European Demographic Observatory (ODE). Data c...,2011.0
75237,United States Virgin Islands,850,[Total],TFR,1999.5,1.9575,"Register, Direct, National Center for Health S...",Direct,Register,VR,553,Vital Registration,Register,1999,1999,National Center for Health Statistics. NVSRs.,
64944,Somalia,706,[20-24],ASFR2024,2000.084,221.0,"2002-2003 Reproductive Health Survey,Direct es...",Birth histories,Survey,Survey,2567,Somalia 2002-2003 Reproductive Health Survey,2002-2003 Survey,2002,2003,2002-2003 Reproductive Health Survey Final report,


In [1598]:
df_13 = df_13.drop(columns=['countryorareacode','indicator','datasourceinventoryid','surveyprogramme','series','datasourcename','reference','referenceyear'])

df_13.replace({
    "agegroup": "age_group",
    "countryorarea": "country",
    "datatype": "data_type",
},inplace=True)

In [1599]:
df_13['date'] = df_13['date'].astype(int)
df_13['value'] = df_13['value'].round(2)
df_13.sample(12)

Unnamed: 0,countryorarea,agegroup,date,value,datatype,datasourcetype,datasourcenameshort,datasourcestartyear,datasourceendyear
59363,Russian Federation,[Total],1951,2.92,Direct,Register,Register,1951,1951
4730,Bahamas,[Total],1990,2.52,Direct,Estimate,Estimates,1990,1990
65875,Spain,[15-19],2006,12.0,Official estimates,Estimate,Estimates,2006,2006
3864,Austria,[15-19],1994,18.91,Official estimates,Estimate,Estimates,1994,1994
42787,Liberia,[40-44],2007,81.0,Direct,Survey,2009 MIS,2008,2009
65454,Spain,[25-29],1958,177.31,Fertility data (adjusted),Estimate,Estimates,1958,1958
58027,Republic of Korea,[30-34],2003,79.7,Direct,Register,Register,2003,2003
40686,Lao People's Dem. Republic,[30-34],1988,275.0,Recent births,Panel,1986-1988 Survey,1986,1988
61450,Senegal,[45-49],1979,63.37,Extrapolated from Truncated Birth Histories,Survey,1997 DHS,1997,1997
33777,India,[45-49],1987,4.43,Extrapolated from Truncated Birth Histories,Survey,1998-1999 NFHS,1998,1999


In [1600]:
df_13.drop_duplicates(inplace=True)
df_13.dropna(inplace=True)

df_info = pd.DataFrame({
    'datatypes': df_13.dtypes,
    'null_count': df_13.isnull().sum(),
    'unique_count': df_13.nunique()
})
print(df_info)

                    datatypes  null_count  unique_count
countryorarea          object           0           201
agegroup               object           0             8
date                    int32           0            69
value                 float64           0         18752
datatype               object           0            30
datasourcetype         object           0             7
datasourcenameshort    object           0           539
datasourcestartyear     int64           0            69
datasourceendyear      object           0            70


In [1601]:
#df_13.to_csv("../data/Cleaned/cleaned_fertility_indicators_un.csv", index=False)

In [1602]:
#df_13.to_sql('fertility_indicators_un',engine, if_exists='replace', index=False)

In [1603]:
df_14 = pd.read_csv('../data/processed/marital_status_by_age_un.csv', header= 2, low_memory=False)
df_14.head()

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,MaritalStatus,Non-standard_AgeGroups,Series_contains_Non-standard_AgeGroups,AgeGroup,AgeStart,...,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Age groups,Note on Marital Status,Note on Data,Note on Country and Population,Note Other
0,Afghanistan,4,1972,1974,Men,Divorced,,,[15-19],15,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
1,Afghanistan,4,1972,1974,Men,Divorced,,,[20-24],20,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
2,Afghanistan,4,1972,1974,Men,Divorced,,,[25-29],25,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
3,Afghanistan,4,1972,1974,Men,Divorced,,,[30-34],30,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
4,Afghanistan,4,1972,1974,Men,Divorced,,,[35-39],35,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,


In [1604]:
df_14.columns= (df_14.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '' , regex=True)  
    )
df_14.sample(5)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,maritalstatus,nonstandard_agegroups,series_contains_nonstandard_agegroups,agegroup,agestart,...,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteonagegroups,noteonmaritalstatus,noteondata,noteoncountryandpopulation,noteother
16075,Belgium,56,2001,2001,Women,Single,,,[15-19],15,...,2001 Census,3045,Belgium 2001 Census,UNSD,,,,,,
15904,Belgium,56,2000,2000,Men,Married,,,[45-49],45,...,2000 Estimate,2044,Belgium 2000 Estimate,UNSD,,,,,,
167647,Nepal,524,1976,1976,Women,Divorced,,,[50-54],50,...,1976 WFS,818,Nepal 1976 World Fertility Survey,National statistics,,,,,,
24437,Bulgaria,100,1985,1985,Men,Married,,,[65-69],65,...,1985 Census,846,Bulgaria 1985 Census,UNSD,,,,,,
2756,Angola,24,2001,2001,Women,Divorced,,,[35-39],35,...,2001 MICS_HH,1937,Angola 2001 Multiple Indicator Cluster Survey,MICS_HH,,,,,,


In [1605]:
df_14 = df_14.drop(columns=['datacataloglongname', 'noteondata', 'noteoncountryandpopulation','noteonagegroups', 'noteother',
                             'including_consensual_unions','isocode', 'datacatalogid', 'noteonmaritalstatus', 'series_contains_nonstandard_agegroups','nonstandard_agegroups'])

df_14.rename(columns={
    "countryorarea": "country",
    "agegroup": "age_group",
    "maritalstatus": "marital_status",
    "yearstart": "year_start",
    "yearend": "year_end",
    }, inplace =True
    )

df_14.sample(10)

Unnamed: 0,country,year_start,year_end,sex,marital_status,age_group,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datasource
60482,Denmark,1979,1979,Women,Married or married but separated,[50-54],50,54,78.35,Estimate,1979 Estimate,UNSD
202874,Romania,2003,2003,Men,Single,[20-24],20,24,90.3,Estimate,2003 Estimate,UNSD
151439,Malawi,2004,2005,Men,Divorced,[25-29],25,29,1.8,Survey,2004 DHS,DHS_STATcompiler
244572,Timor-Leste,2015,2015,Men,Married,[40-44],40,44,89.41,Census,2015 Census,National statistics
226565,South Africa,2003,2004,Women,Single,[20-24],20,24,79.4,Survey,2003 DHS,INED
245189,Togo,1988,1988,Women,Divorced,[30-34],30,34,4.6,Survey,1988 DHS,DHS_STATcompiler
134252,Kenya,2008,2009,Women,Divorced,[45-49],45,49,2.8,Survey,2008-2009 DHS,DHS_STATcompiler
213567,Sao Tome and Principe,2000,2000,Women,Divorced,[60-64],60,64,0.0,Survey,2000 MICS_HH,MICS_HH
266405,Viet Nam,2007,2007,Women,Separated,[25-29],25,29,0.59,Survey,2007 Annual APC,National statistics
151649,Malawi,2010,2010,Men,Married,[15-19],15,19,1.9,Survey,2010 DHS,DHS_STATcompiler


In [1606]:
df_14.drop_duplicates(inplace=True)
df_14.dropna(inplace=True)

df_info = pd.DataFrame({
    'datatypes': df_14.dtypes,
    'null_count': df_14.isnull().sum(),
    'unique_count': df_14.nunique()
})
print(df_info)

                     datatypes  null_count  unique_count
country                 object           0           235
year_start               int64           0            62
year_end                 int64           0            60
sex                     object           0             2
marital_status          object           0            35
age_group               object           0            63
agestart                 int64           0            21
ageend                   int64           0            20
datavalue              float64           0          9994
dataprocess             object           0             6
datacatalogshortname    object           0           443
datasource              object           0            15


In [1607]:
#df_14.to_csv("cleaned_marital_status_by_age_un.csv", index=False)

In [1608]:
#df_14.to_sql('marital_status_by_age_un', engine, if_exists='replace', index=False)

In [1609]:
df_15 = pd.read_csv('../data/processed/regions_un.csv', header=5, low_memory= False)
df_15.head(10)

Unnamed: 0,Region and subregion,ISO code,Regional Classification,Indicator,Year,AgeGroup,Percentage,Number,DataProcess
0,World,900,M49,Married or in-union women,1970,15-19,22.576683,71867.82,Estimate
1,World,900,M49,Married or in-union women,1970,20-24,63.802057,162860.4,Estimate
2,World,900,M49,Married or in-union women,1970,25-29,87.174827,182681.1,Estimate
3,World,900,M49,Married or in-union women,1970,30-34,90.825027,179121.4,Estimate
4,World,900,M49,Married or in-union women,1970,35-39,90.284386,161526.3,Estimate
5,World,900,M49,Married or in-union women,1970,40-44,86.483531,139334.4,Estimate
6,World,900,M49,Married or in-union women,1970,45-49,82.680237,116088.4,Estimate
7,World,900,M49,Married or in-union women,1970,15-49,69.379111,1013480.0,Estimate
8,World,900,M49,Married or in-union women,1971,15-19,22.630416,74127.62,Estimate
9,World,900,M49,Married or in-union women,1971,20-24,63.613178,170087.3,Estimate


In [1610]:
df_15.columns = (df_15.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(','')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
    )
df_15.sample(6)

Unnamed: 0,regionandsubregion,isocode,regionalclassification,indicator,year,agegroup,percentage,number,dataprocess
2264,Central and Southern Asia,62,SDG,Married or in-union women,2010,15-19,20.917967,37018.10493,Estimate
11728,South-Eastern Asia,35,SDG-M49,Married or in-union women,1978,15-19,19.945482,7403.985559,Estimate
11982,South-Eastern Asia,35,SDG-M49,Married or in-union women,2009,45-49,82.905225,29714.638645,Estimate
22701,Developed countries,901,Development group,Married or in-union women,1972,40-44,83.880085,29083.318654,Estimate
14932,Western Europe,926,M49,Married or in-union women,1973,35-39,84.469603,4583.709655,Estimate
5280,Eastern Africa,910,M49,Married or in-union women,1982,15-19,34.560942,5637.985147,Estimate


In [1611]:
df_15 = df_15.drop(columns=['regionalclassification'])

df_15.rename(columns={
    "regionandsubregion": "region",
    "isocode": "iso_code",
    "agegroup": "age_group",
    "dataprocess": "process"
}, inplace=True)

df_15.sample(10)

Unnamed: 0,region,iso_code,indicator,year,age_group,percentage,number,process
4748,Africa,903,Married or in-union women,1996,35-39,83.885329,33393.70768,Estimate
25194,Least developed countries,941,Married or in-union women,2041,25-29,73.332935,49589.70622,Projection
18718,Northern America,905,Married or in-union women,2041,45-49,69.774072,8798.008483,Projection
16762,Caribbean,915,Married or in-union women,2040,25-29,42.399112,1351.398341,Projection
17438,Central America,916,Married or in-union women,2043,45-49,63.935546,9400.222135,Projection
13638,Northern Europe,924,Married or in-union women,1973,45-49,83.868229,2277.444691,Estimate
564,World,900,Married or in-union women,2040,35-39,82.05416,456697.750675,Projection
4310,Europe and Northern America,513,Married or in-union women,2022,45-49,70.730919,26332.78257,Estimate
11743,South-Eastern Asia,35,Married or in-union women,1979,15-49,63.610093,104972.554321,Estimate
13142,Eastern Europe,923,Married or in-union women,1992,45-49,77.798057,5743.34975,Estimate


In [1613]:
print(df_15['number'] % 1 != 0)

0        True
1        True
2        True
3        True
4        True
         ... 
28507    True
28508    True
28509    True
28510    True
28511    True
Name: number, Length: 28512, dtype: bool


In [1614]:
df_15['percentage'] = df_15['percentage'].round(2)
df_15['number'] = df_15['number'].astype(int)
df_15.head(10)

Unnamed: 0,region,iso_code,indicator,year,age_group,percentage,number,process
0,World,900,Married or in-union women,1970,15-19,22.58,71867,Estimate
1,World,900,Married or in-union women,1970,20-24,63.8,162860,Estimate
2,World,900,Married or in-union women,1970,25-29,87.17,182681,Estimate
3,World,900,Married or in-union women,1970,30-34,90.83,179121,Estimate
4,World,900,Married or in-union women,1970,35-39,90.28,161526,Estimate
5,World,900,Married or in-union women,1970,40-44,86.48,139334,Estimate
6,World,900,Married or in-union women,1970,45-49,82.68,116088,Estimate
7,World,900,Married or in-union women,1970,15-49,69.38,1013479,Estimate
8,World,900,Married or in-union women,1971,15-19,22.63,74127,Estimate
9,World,900,Married or in-union women,1971,20-24,63.61,170087,Estimate


In [1751]:
df_15.dropna(inplace=True)
df_15.dropna(inplace=True)

df_info = pd.DataFrame({
    'datatypes': df_15.dtypes,
    'null_count': df_15.isnull().sum(),
    'unique_count': df_15.nunique()
})
print(df_info)

           datatypes  null_count  unique_count
region        object           0            43
iso_code       int64           0            44
indicator     object           0             1
year           int64           0            81
age_group     object           0             8
percentage   float64           0          7796
number         int32           0         20311
process       object           0             2


In [1615]:
#df_15.to_csv('cleaned_regions_un.csv', index=False)



In [1616]:
#df_15.to_sql('regions_un', engine, if_exists='replace',index=False)

In [1617]:
df_16_1 = pd.read_csv('../data/Raw/OECD/SF_1_1_Family_size_and_composition - Sayfa1.csv')
df_16_1
#Data for Chart SF1.1.A. Average size of households by household type, 2024a
# avg_size_all	avg_size_couple_with_children	avg_size_single_parent_with_children		

Unnamed: 0,Country,All households,Couple households with children,Single parent households with children
0,Mexico,356,408.0,276.0
1,Costa Rica,346,437.0,344.0
2,Türkiye,320,410.0,280.0
3,Israel,319,465.0,286.0
4,Columbia,310,,
5,Slovak Republic,310,380.0,250.0
6,Chile,280,,
7,Iceland,270,412.0,261.0
8,New Zealand,261,388.0,267.0
9,Greece,260,380.0,250.0


In [1618]:
df_16_1.columns = df_16_1.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [1619]:
df_16_1.rename(columns={
        "All households": "avg_size_all",
        "Couple with children": "avg_size_couple_with_children",
        "Single parent with children": "avg_size_single_parent_with_children"
}, inplace=True)

In [1620]:
df_16_1.drop_duplicates(inplace=True)
df_16_1.dropna(inplace=True)

In [1621]:
for col in df_16_1.columns:
    if col != 'country':
        # Replace commas with dots if necessary, remove non-numeric chars, convert to float
        df_16_1[col] = (
            df_16_1[col]
            .astype(str)  # ensure string for replace
            .str.replace(',', '.', regex=False)  # decimal commas to dots
            .str.replace(r'[^\d\.\-]', '', regex=True)  # remove non-numeric chars except dot and minus
            .replace('', None)  # empty to NaN
            .astype(float)  # convert to float
        )

# Check updated dtypes
print(df_16_1.dtypes)

country                                    object
all_households                            float64
couple_households_with_children           float64
single_parent_households_with_children    float64
dtype: object


In [1622]:
info_16_1 = pd.DataFrame({
    'dtype': df_16_1.dtypes,
    'null_count': df_16_1.isnull().sum(),
    'unique_count': df_16_1.nunique()
})
print(info_16_1)

                                          dtype  null_count  unique_count
country                                  object           0            39
all_households                          float64           0            19
couple_households_with_children         float64           0            16
single_parent_households_with_children  float64           0            15


In [1623]:
df_16_1.sample(10)

Unnamed: 0,country,all_households,couple_households_with_children,single_parent_households_with_children
13,Spain,2.5,3.7,2.4
16,Romania,2.5,3.8,2.4
39,Netherlands,2.0,3.9,2.6
28,Switzerland,2.21,4.02,2.58
14,Cyprus,2.5,3.7,2.4
23,Hungary,2.3,3.7,2.4
26,United Kingdom,2.3,3.9,2.8
20,Slovenia,2.4,3.9,2.5
41,Denmark,1.9,3.9,2.5
5,Slovak Republic,3.1,3.8,2.5


In [1624]:
#df_16_1.to_csv('../data/Cleaned/cleaned_average_size_of_households_type_2024_oecd.csv', index=False)

In [1625]:
#df_16_1.to_sql('average_size_of_households_type_2024_oecd', engine, if_exists = 'replace', index= False)

In [1626]:
df_16_2 = pd.read_csv('../data/Raw/OECD/SF_1_1_Family_size_and_composition - Sayfa2.csv', header=1)
df_16_2
#Table SF1.1.A. Types of household, 2021a
# share_couple_total	share_couple_with_children	share_couple_without_children	share_single_parent_total	share_single_mother	share_single_father	share_single_person	share_other_types						

Unnamed: 0,Country,Total,With children,Without children,Total.1,Single mother households,Single father households,Single person households,Other households types
0,Australia,5593,2990,2602,1037,,,2512,858
1,Austria,4893,2113,2780,563,478,085,3834,711
2,Belgium,5222,2398,2824,742,608,135,3550,486
3,Canada,5092,2530,2562,872,,,2935,1102
4,Chile,..,..,..,..,..,..,..,..
5,Columbia,..,..,..,..,..,..,..,..
6,Costa Rica,5244,3815,1429,1055,949,106,1127,2574
7,Czechia,4703,2170,2532,715,611,104,3915,667
8,Denmark,4860,2041,2819,631,511,119,3757,752
9,Estonia,4620,2546,2073,683,609,074,3699,998


In [1627]:
df_16_2.rename(columns={
    "Total": "couple_total(%)",
    "Couple with children": "couple_with_children(%)",
    "Couple without children": "couple_without_children(%)",
    "Total.1": "single_parent_total(%)",
    "Single mother households": "single_mother(%)",
    "Single father households": "single_father(%)",
    "Single person households": "single_person(%)",
    "Other types of households": "other_household_types(%)"
}, inplace=True)

In [1628]:
# Normalize column names (lowercase, underscores, remove () and non-ascii)
df_16_2.columns = (
    df_16_2.columns.str.strip().str.lower()
    .str.replace(' ', '_')
    .str.replace('[()%]', '', regex=True)
    .str.replace('[^0-9a-z_]', '', regex=True)
)

In [1629]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_16_2.columns if c != "country"]

df_16_2[num_cols] = (
    df_16_2[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [1630]:
df_16_2.drop_duplicates(inplace=True)
df_16_2.dropna(inplace=True)
df_16_2.dropna(how="all", subset=num_cols, inplace=True)

In [1631]:
df_16_2.rename(columns={
   "couple_total" : "couple_total(%)",
   "with_children" : "with_children(%)",
   "without_children" : "without_children(%)",
    "single_parent_total" : "single_parent_total(%)",
    "single_mother" : "single_mother(%)",
    "single_father" : "single_father(%)",
    "single_person" : "single_person(%)",
    "other_household_types" : "other_household_types(%)"
}, inplace=True)

In [1632]:
info_16_2 = pd.DataFrame({
    "dtype": df_16_2.dtypes,
    "null_count": df_16_2.isna().sum(),
    "unique_count": df_16_2.nunique()
})
print(info_16_2)
print(df_16_2.dtypes)

                          dtype  null_count  unique_count
country                  object           0            36
couple_total(%)         float64           0            36
with_children(%)        float64           0            35
without_children(%)     float64           0            36
single_parent_total(%)  float64           0            34
single_mother(%)        float64           0            32
single_father(%)        float64           0            31
single_person(%)        float64           0            35
other_households_types  float64           0            36
country                    object
couple_total(%)           float64
with_children(%)          float64
without_children(%)       float64
single_parent_total(%)    float64
single_mother(%)          float64
single_father(%)          float64
single_person(%)          float64
other_households_types    float64
dtype: object


In [1633]:
df_16_2.sample(10)

Unnamed: 0,country,couple_total(%),with_children(%),without_children(%),single_parent_total(%),single_mother(%),single_father(%),single_person(%),other_households_types
43,Romania,45.66,20.73,24.93,6.5,4.56,1.94,33.63,14.21
30,Slovak Republic,37.15,16.99,20.16,6.23,5.39,0.84,31.4,25.21
22,Lithuania,50.09,23.74,26.35,7.18,6.68,0.5,35.16,7.58
8,Denmark,48.6,20.41,28.19,6.31,5.11,1.19,37.57,7.52
40,Croatia,51.51,24.78,26.73,5.42,4.39,1.04,27.8,15.27
14,Hungary,45.25,20.43,24.82,7.07,5.71,1.37,34.42,13.25
7,Czechia,47.03,21.7,25.32,7.15,6.11,1.04,39.15,6.67
25,Netherlands,53.6,23.01,30.59,6.1,5.0,1.09,38.5,1.8
1,Austria,48.93,21.13,27.8,5.63,4.78,0.85,38.34,7.11
13,Greece,52.14,24.03,28.11,4.66,3.82,0.84,32.35,10.85


In [1634]:
#df_16_2.to_csv('../data/Cleaned/cleaned_types_of_household_2021_oecd.csv', index = False)

In [1635]:
#df_16_2.to_sql('types_of_household_2021_oecd', engine, if_exists = 'replace', index= False)

In [1636]:
df_16_3 = pd.read_csv('../data/Raw/OECD/SF_1_1_Family_size_and_composition - Sayfa3.csv', header=1)
df_16_3
#Table SF1.1.B. Households by number of children, 2024
# share_hh_0_children	share_hh_1_child	share_hh_2_children	share_hh_3plus_children		

Unnamed: 0,country,0 children,1 child,2 children,3 or more children,Children under 6
0,Australia,..,..,..,..,..
1,Austria,7778,1052,857,312,944
2,Belgium,7397,1176,1015,411,1040
3,Canada,..,..,..,..,..
4,Chile,..,..,..,..,..
5,Columbia,..,..,..,..,..
6,Costa Rica,3029,2308,2461,2202,2630
7,Czechia,7195,1385,1156,264,1229
8,Denmark,7778,1054,894,274,815
9,Estonia,7576,1253,873,298,985


In [1637]:
# Normalize column names (lowercase, underscores, remove () and non-ascii)
df_16_3.columns = (
    df_16_3.columns.str.strip().str.lower()
    .str.replace(' ', '_')
    .str.replace('[^0-9a-z_]', '', regex=True)
)

In [1638]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_16_3.columns if c != "country"]

df_16_3[num_cols] = (
    df_16_3[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [1639]:
df_16_3.drop_duplicates(inplace=True)
df_16_3.dropna(inplace=True)

In [1640]:
df_16_3.sample(10)

Unnamed: 0,country,0_children,1_child,2_children,3_or_more_children,children_under_6
29,Portugal,74.35,15.87,8.18,1.6,9.85
23,Luxembourg,73.0,12.49,12.07,2.41,11.54
28,Poland,74.39,12.91,9.84,2.86,9.97
32,Spain,74.61,13.54,8.95,2.9,8.79
22,Lithuania,80.44,11.06,7.0,1.51,8.12
41,Cyprus,71.36,13.88,11.67,3.1,12.71
35,Türkiye,57.62,17.42,14.5,10.45,19.65
39,Bulgaria,78.21,12.93,7.48,1.38,6.9
2,Belgium,73.97,11.76,10.15,4.11,10.4
9,Estonia,75.76,12.53,8.73,2.98,9.85


In [1641]:
df_16_3.rename(columns={
    "0_children": "households_0_children(%)",
    "1_child": "households_1_child(%)",
    "2_children": "households_2_children(%)",
    "3_or_more_children": "households_3_or_more_children(%)"
}, inplace=True)

In [1642]:
info_16_3 = pd.DataFrame({
    "dtype": df_16_3.dtypes,
    "null_count": df_16_3.isna().sum(),
    "unique_count": df_16_3.nunique()
})
print(info_16_3)
print(df_16_3.dtypes)

                                    dtype  null_count  unique_count
country                            object           0            33
households_0_children(%)          float64           0            32
households_1_child(%)             float64           0            32
households_2_children(%)          float64           0            33
households_3_or_more_children(%)  float64           0            31
children_under_6                  float64           0            31
country                              object
households_0_children(%)            float64
households_1_child(%)               float64
households_2_children(%)            float64
households_3_or_more_children(%)    float64
children_under_6                    float64
dtype: object


In [1643]:
df_16_3.sample(10)

Unnamed: 0,country,households_0_children(%),households_1_child(%),households_2_children(%),households_3_or_more_children(%),children_under_6
25,Netherlands,78.65,8.78,9.27,3.3,8.79
12,Germany,79.86,9.91,7.72,2.51,8.57
16,Ireland,69.02,12.42,12.18,6.38,11.81
29,Portugal,74.35,15.87,8.18,1.6,9.85
1,Austria,77.78,10.52,8.57,3.12,9.44
6,Costa Rica,30.29,23.08,24.61,22.02,26.3
9,Estonia,75.76,12.53,8.73,2.98,9.85
27,Norway,76.87,10.53,9.14,3.47,8.88
31,Slovenia,75.0,11.25,10.2,3.56,9.93
18,Italy,77.79,12.26,8.28,1.66,8.05


In [1644]:
#df_16_3.to_csv('../data/Cleaned/cleaned_households_by_number_of_children_2024_oecd.csv', index=False)

In [1645]:
#df_16_3.to_sql('households_by_number_of_children_2024_oecd', engine, index= False)

In [1646]:
df_17_1 = pd.read_csv('../data/Raw/OECD/SF_2_1_Total_Fertility_rates_S1.csv')
#total_fertility_rates_from_1960_oecd
df_17_1.head()

Unnamed: 0,Country,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Australia,345,355,343,334,315,297,289,285,289,...,179,179,179,174,174,167,159,170,163,150
1,Austria,269,278,280,282,279,270,266,262,258,...,146,149,153,152,148,146,144,148,141,132
2,Belgium,254,263,259,268,271,261,252,241,231,...,174,170,168,165,162,160,155,160,153,147
3,Canada,390,384,376,367,350,315,281,260,245,...,161,160,159,155,151,147,141,144,133,126
4,Chile,470,466,460,454,446,436,426,414,403,...,177,174,169,156,154,143,131,118,126,117


In [1647]:
df_info = pd.DataFrame({
    'dtype': df_17_1.dtypes,
    'null_count': df_17_1.isnull().sum(),
    'unique_count': df_17_1.nunique()
})
print(df_info)

          dtype  null_count  unique_count
Country  object           0            49
1960     object           0            47
1961     object           0            47
1962     object           0            47
1963     object           0            46
...         ...         ...           ...
2019     object           0            37
2020     object           0            39
2021     object           0            40
2022     object           0            34
2023     object           0            35

[65 rows x 3 columns]


In [1648]:
df_17_1.columns = df_17_1.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [1649]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_17_1.columns if c != "country"]

df_17_1[num_cols] = (
    df_17_1[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [1650]:
df_17_1.drop_duplicates(inplace=True)
df_17_1.dropna(inplace=True)

In [1651]:
df_info = pd.DataFrame({
    'dtype': df_17_1.dtypes,
    'null_count': df_17_1.isnull().sum(),
    'unique_count': df_17_1.nunique()
})
print(df_info)

           dtype  null_count  unique_count
country   object           0            49
1960     float64           0            47
1961     float64           0            47
1962     float64           0            47
1963     float64           0            46
...          ...         ...           ...
2019     float64           0            37
2020     float64           0            39
2021     float64           0            40
2022     float64           0            34
2023     float64           0            35

[65 rows x 3 columns]


In [1652]:
df_17_1.sample(10)

Unnamed: 0,country,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
14,Hungary,2.02,1.94,1.79,1.82,1.8,1.81,1.88,2.01,2.06,...,1.42,1.45,1.51,1.51,1.5,1.51,1.59,1.61,1.55,1.51
25,New Zealand,4.24,4.31,4.19,4.05,3.8,3.54,3.41,3.35,3.34,...,1.92,1.99,1.87,1.81,1.71,1.72,1.61,1.64,1.66,1.56
30,Slovenia,2.18,2.26,2.27,2.28,2.32,2.45,2.48,2.38,2.28,...,1.58,1.57,1.58,1.62,1.6,1.61,1.59,1.64,1.55,1.51
33,Switzerland,2.44,2.53,2.6,2.67,2.68,2.61,2.52,2.41,2.3,...,1.54,1.54,1.54,1.52,1.52,1.48,1.46,1.52,1.39,1.33
20,Latvia,1.94,1.94,1.91,1.85,1.79,1.74,1.76,1.8,1.83,...,1.65,1.7,1.74,1.69,1.6,1.61,1.55,1.57,1.47,1.36
31,Spain,2.86,2.76,2.8,2.88,3.01,2.94,2.99,3.03,2.96,...,1.32,1.33,1.34,1.31,1.26,1.23,1.19,1.19,1.16,1.12
32,Sweden,2.2,2.23,2.26,2.34,2.48,2.42,2.36,2.27,2.07,...,1.88,1.85,1.85,1.78,1.75,1.7,1.66,1.67,1.52,1.45
17,Italy,2.41,2.41,2.46,2.56,2.7,2.67,2.63,2.54,2.49,...,1.38,1.36,1.36,1.34,1.31,1.27,1.24,1.25,1.24,1.2
23,Mexico,6.77,6.76,6.76,6.75,6.75,6.76,6.77,6.79,6.81,...,2.21,2.14,2.09,2.04,2.03,2.02,1.99,1.97,1.94,1.91
44,Argentina,3.14,3.12,3.13,3.11,3.08,3.01,2.97,2.98,3.01,...,2.39,2.35,2.24,2.17,2.07,1.88,1.6,1.59,1.48,1.5


In [1653]:
#df_17_1.to_csv('../data/Cleaned/cleaned_total_fertility_rates_oecd.csv', index=False)

In [1654]:
#df_17_1.to_sql('total_fertility_rates_oecd', engine, if_exists='replace', index=False)

In [1655]:
df_17_2 = pd.read_csv('../data/Raw/OECD/SF_2_1_Fertility_rates_Births_by_birth_order_S2.csv')
df_17_2

Unnamed: 0,Country,Birth order,1987,1988,1989,1990,1991,1992,1993,1994,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Austria,First birth,476,478,467,462,465,461,452,445,...,480,483,473,475,471,472,477,476,484,481
1,Austria,Second birth,337,337,343,349,345,348,358,364,...,355,353,356,353,353,351,353,355,349,351
2,Austria,Third birth or higher,188,185,190,189,190,191,189,191,...,165,164,171,172,176,177,170,169,167,168
3,Belgium,First birth,468,469,473,473,481,472,469,472,...,423,435,441,436,429,426,450,440,447,455
4,Belgium,Second birth,330,329,327,328,323,328,335,330,...,351,348,345,346,345,347,342,351,343,341
5,Belgium,Third birth or higher,202,202,199,199,196,200,196,198,...,226,218,214,219,226,226,208,209,209,204
6,Czechia,First birth,467,466,474,478,501,498,485,477,...,474,481,487,487,480,478,476,464,463,463
7,Czechia,Second birth,377,376,374,372,355,358,368,369,...,375,373,367,366,372,376,376,390,386,391
8,Czechia,Third birth or higher,156,158,152,150,144,144,148,154,...,151,147,146,147,147,146,148,146,15,146
9,Estonia,First birth,435,435,440,462,495,503,496,496,...,419,423,408,402,367,388,380,372,398,397


In [1656]:
df_info = pd.DataFrame({
    'dtype': df_17_2.dtypes,
    'null_count': df_17_2.isnull().sum(),
    'unique_count': df_17_2.nunique()
})
print(df_info)

              dtype  null_count  unique_count
Country      object           0            17
Birth order  object           0             3
1987         object           0            48
1988         object           0            49
1989         object           0            48
1990         object           0            44
1991         object           0            48
1992         object           0            46
1993         object           0            47
1994         object           0            47
1995         object           0            48
1996         object           0            47
1997         object           0            49
1998         object           0            50
1999         object           0            49
2000         object           0            48
2001         object           0            50
2002         object           0            47
2003         object           0            50
2004         object           0            49
2005         object           0   

In [1657]:
df_17_2.columns = df_17_2.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_17_2.head()

Unnamed: 0,country,birth_order,1987,1988,1989,1990,1991,1992,1993,1994,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Austria,First birth,476,478,467,462,465,461,452,445,...,480,483,473,475,471,472,477,476,484,481
1,Austria,Second birth,337,337,343,349,345,348,358,364,...,355,353,356,353,353,351,353,355,349,351
2,Austria,Third birth or higher,188,185,190,189,190,191,189,191,...,165,164,171,172,176,177,170,169,167,168
3,Belgium,First birth,468,469,473,473,481,472,469,472,...,423,435,441,436,429,426,450,440,447,455
4,Belgium,Second birth,330,329,327,328,323,328,335,330,...,351,348,345,346,345,347,342,351,343,341


In [1658]:
# --- Ensure "country" and "birth order" are strings (tidy casing/spacing) ---
df_17_2["country"] = df_17_2["country"].astype(str).str.strip().str.title()
df_17_2["birth_order"] = df_17_2["birth_order"].astype(str).str.strip()

# --- Identify numeric columns: everything except country and birth order ---
num_cols = [c for c in df_17_2.columns if c not in ["country", "birth_order"]]
# --- Robust cleaning -> convert to float ---
df_17_2[num_cols] = (
    df_17_2[num_cols].astype(str)
    .replace({"\xa0": "", "\u202f": "", ",": "."}, regex=True)
    .replace(r"[^\d\.\-]", "", regex=True)
    .replace(r"\.\.+", ".", regex=True)
    .replace(r"^\.$|^\s*$", np.nan, regex=True)
    .apply(pd.to_numeric, errors="coerce")
)
df_17_2[num_cols] = df_17_2[num_cols].round(2)


In [1659]:
df_17_2.drop_duplicates(inplace=True)
df_17_2.dropna(inplace=True)

In [1660]:
df_17_2["birth_order"] = df_17_2["birth_order"].astype(str) + "(%)"

In [1661]:
df_17_2.head(10)

Unnamed: 0,country,birth_order,1987,1988,1989,1990,1991,1992,1993,1994,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Austria,First birth(%),47.6,47.8,46.7,46.2,46.5,46.1,45.2,44.5,...,48.0,48.3,47.3,47.5,47.1,47.2,47.7,47.6,48.4,48.1
1,Austria,Second birth(%),33.7,33.7,34.3,34.9,34.5,34.8,35.8,36.4,...,35.5,35.3,35.6,35.3,35.3,35.1,35.3,35.5,34.9,35.1
2,Austria,Third birth or higher(%),18.8,18.5,19.0,18.9,19.0,19.1,18.9,19.1,...,16.5,16.4,17.1,17.2,17.6,17.7,17.0,16.9,16.7,16.8
3,Belgium,First birth(%),46.8,46.9,47.3,47.3,48.1,47.2,46.9,47.2,...,42.3,43.5,44.1,43.6,42.9,42.6,45.0,44.0,44.7,45.5
4,Belgium,Second birth(%),33.0,32.9,32.7,32.8,32.3,32.8,33.5,33.0,...,35.1,34.8,34.5,34.6,34.5,34.7,34.2,35.1,34.3,34.1
5,Belgium,Third birth or higher(%),20.2,20.2,19.9,19.9,19.6,20.0,19.6,19.8,...,22.6,21.8,21.4,21.9,22.6,22.6,20.8,20.9,20.9,20.4
6,Czechia,First birth(%),46.7,46.6,47.4,47.8,50.1,49.8,48.5,47.7,...,47.4,48.1,48.7,48.7,48.0,47.8,47.6,46.4,46.3,46.3
7,Czechia,Second birth(%),37.7,37.6,37.4,37.2,35.5,35.8,36.8,36.9,...,37.5,37.3,36.7,36.6,37.2,37.6,37.6,39.0,38.6,39.1
8,Czechia,Third birth or higher(%),15.6,15.8,15.2,15.0,14.4,14.4,14.8,15.4,...,15.1,14.7,14.6,14.7,14.7,14.6,14.8,14.6,15.0,14.6
9,Estonia,First birth(%),43.5,43.5,44.0,46.2,49.5,50.3,49.6,49.6,...,41.9,42.3,40.8,40.2,36.7,38.8,38.0,37.2,39.8,39.7


In [1662]:
df_info = pd.DataFrame({
    'dtype': df_17_2.dtypes,
    'null_count': df_17_2.isnull().sum(),
    'unique_count': df_17_2.nunique()
})
print(df_info)

               dtype  null_count  unique_count
country       object           0            17
birth_order   object           0             3
1987         float64           0            48
1988         float64           0            49
1989         float64           0            48
1990         float64           0            44
1991         float64           0            48
1992         float64           0            46
1993         float64           0            47
1994         float64           0            47
1995         float64           0            48
1996         float64           0            47
1997         float64           0            49
1998         float64           0            50
1999         float64           0            49
2000         float64           0            48
2001         float64           0            50
2002         float64           0            47
2003         float64           0            50
2004         float64           0            49
2005         

In [1663]:
#df_17_2.to_csv('../data/Cleaned/cleaned_births_by_birth_order_oecd.csv', index=False)

In [1664]:
#df_17_2.to_sql('births_by_birth_order_oecd', engine, if_exists='replace', index=False)

In [1665]:
df_18 = pd.read_csv('../data/Raw/OECD/sf1_2_wide_from_df18.csv')
df_18

Unnamed: 0,country,year,Living with two parents,Living with a single parent,Other
0,Australia,2003,80.1,19.5,0.5
1,Australia,2006,81.5,18.0,0.5
2,Australia,2009,82.0,17.6,0.4
3,Australia,2012,81.3,18.0,0.6
4,Austria,2003,81.2,16.8,2.0
...,...,...,...,...,...
470,United States,2014,68.7,27.5,3.8
471,United States,2015,69.2,26.8,3.9
472,United States,2016,68.7,27.4,3.8
473,United States,2017,68.9,27.1,4.0


In [1666]:
for col in df_18.select_dtypes(include=['object']).columns:
    df_18[col] = df_18[col].astype(str).str.strip()

# 2) Define placeholders representing missing data in OECD exports
placeholders = ['..', '...', '.', ' .', '…', 'Na', 'nan', 'None']

# 3) Replace placeholders with NaN directly in df_18
df_18.replace(placeholders, pd.NA, inplace=True)

In [1667]:
# 1) Ensure 'year' is integer
df_18["year"] = pd.to_numeric(df_18["year"], errors="coerce").astype("Int64")

# 2) Convert all non-key columns to numeric and round(2)
for col in df_18.columns:
    if col not in ["country", "year"]:
        df_18[col] = pd.to_numeric(df_18[col], errors="coerce").round(2)

In [1668]:
# 1) Drop rows with missing key fields
df_18.dropna(subset=["country", "year"], inplace=True)

# 2) Drop duplicate country-year rows, keep the first
df_18.drop_duplicates(subset=["country", "year"], keep="first", inplace=True)

# 3) Drop rows where all value columns are NaN
value_cols = [c for c in df_18.columns if c not in ["country", "year"]]
df_18.dropna(subset=value_cols, how="all", inplace=True)

# 4) Sort and reset index
df_18.sort_values(["country", "year"], inplace=True)
df_18.reset_index(drop=True, inplace=True)


In [1669]:
df_18

Unnamed: 0,country,year,Living with two parents,Living with a single parent,Other
0,Australia,2003,80.1,19.5,0.5
1,Australia,2006,81.5,18.0,0.5
2,Australia,2009,82.0,17.6,0.4
3,Australia,2012,81.3,18.0,0.6
4,Austria,2003,81.2,16.8,2.0
...,...,...,...,...,...
470,United States,2014,68.7,27.5,3.8
471,United States,2015,69.2,26.8,3.9
472,United States,2016,68.7,27.4,3.8
473,United States,2017,68.9,27.1,4.0


In [1670]:
df_18['Other'] = pd.to_numeric(df_18['Other'], errors='coerce')

In [1671]:
df_info = pd.DataFrame({
    'dtype': df_18.dtypes,
    'null_count': df_18.isnull().sum(),
    'unique_count': df_18.nunique()
})
print(df_info)

                               dtype  null_count  unique_count
country                       object           0            39
year                           Int64           0            18
Living with two parents      float64           0           211
Living with a single parent  float64           0           203
Other                        float64           1            50


In [1672]:
print(repr(df_18.loc[df_18['Other'].notnull(), 'Other'].unique()))

array([0.5, 0.4, 0.6, 2. , 1. , 1.9, 0.3, 0.1, 0.8, 0.7, 8.7, 3.5, 2.5,
       2.1, 2.4, 2.6, 6.7, 5.1, 1.4, 1.2, 1.7, 1.5, 3.4, 2.9, 2.3, 3. ,
       4.2, 2.8, 1.3, 9. , 0.2, 0.9, 1.1, 4.5, 4.7, 1.6, 3.8, 3.6, 3.3,
       2.2, 0. , 1.8, 2.7, 3.2, 3.9, 4.1, 4.4, 3.7, 4. , 4.3])


In [1673]:
df_18['Other'] = pd.to_numeric(df_18['Other'], errors='coerce')

df_18.dropna(inplace=True, subset=['Other'])

df_18.isnull().sum()

country                        0
year                           0
Living with two parents        0
Living with a single parent    0
Other                          0
dtype: int64

In [1674]:
#df_18.to_csv('../data/Cleaned/cleaned_household_children.csv', index=False)

In [1675]:
#df_18.to_sql('household_children_oecd', engine, if_exists= 'replace', index= False)

In [1676]:
df_19_1 =pd.read_csv('../data/Raw/OECD/SF_2_3_Age_mothers_childbirth_mean_age_birth_S1.csv')
#age_of_mothers_at_childbirth
df_19_1

Unnamed: 0,Country,1963,1964,1965,1966,1967,1968,1969,1970,1971,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Australia,275,275,274,273,273,272,272,271,269,...,301,301,302,303,305,306,307,308,309,311
1,Austria,274,274,273,271,270,268,268,267,267,...,302,303,304,306,306,307,309,310,310,312
2,Belgium,278,277,276,275,274,273,272,272,270,...,300,302,303,304,305,306,307,308,308,310
3,Canada,278,279,278,277,275,273,273,272,270,...,303,304,305,306,307,309,310,312,313,314
4,Chile,292,291,291,290,288,287,286,284,282,...,281,283,285,288,291,294,296,299,301,..
5,Czech Republic,257,258,255,252,250,249,248,248,249,...,298,299,299,300,300,300,301,302,302,304
6,Costa Rica,293,293,293,293,292,291,289,287,285,...,265,267,268,271,272,274,276,279,284,287
7,Denmark,273,268,268,266,265,265,266,267,267,...,307,308,309,310,310,311,312,313,314,316
8,Estonia,276,274,273,273,271,269,269,267,267,...,296,295,296,299,302,304,305,306,307,310
9,Finland,281,280,280,278,277,275,274,271,269,...,304,305,305,306,308,309,310,311,312,314


In [1677]:
df_info = pd.DataFrame({
    'dtype': df_19_1.dtypes,
    'null_count': df_19_1.isnull().sum(),
    'unique_count': df_19_1.nunique()
})
print(df_info)

          dtype  null_count  unique_count
Country  object           0            26
1963     object           0            19
1964     object           0            22
1965     object           0            22
1966     object           0            22
1967     object           0            22
1968     object           0            20
1969     object           0            21
1970     object           0            19
1971     object           0            19
1972     object           0            20
1973     object           0            20
1974     object           0            24
1975     object           0            21
1976     object           0            22
1977     object           0            20
1978     object           0            22
1979     object           0            23
1980     object           0            22
1981     object           0            20
1982     object           0            18
1983     object           0            20
1984     object           0       

In [1678]:
df_19_1.columns = df_19_1.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [1679]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_19_1.columns if c != "country"]

df_19_1[num_cols] = (
    df_19_1[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [1680]:
df_19_1.drop_duplicates(inplace=True)
df_19_1.dropna(inplace=True)

In [1681]:
df_info = pd.DataFrame({
    'dtype': df_19_1.dtypes,
    'null_count': df_19_1.isnull().sum(),
    'unique_count': df_19_1.nunique()
})
print(df_info)

           dtype  null_count  unique_count
country   object           0            22
1963     float64           0            16
1964     float64           0            18
1965     float64           0            18
1966     float64           0            18
1967     float64           0            18
1968     float64           0            17
1969     float64           0            17
1970     float64           0            15
1971     float64           0            17
1972     float64           0            18
1973     float64           0            18
1974     float64           0            20
1975     float64           0            18
1976     float64           0            18
1977     float64           0            16
1978     float64           0            18
1979     float64           0            21
1980     float64           0            20
1981     float64           0            17
1982     float64           0            17
1983     float64           0            18
1984     fl

In [1682]:
df_19_1.sample(10)

Unnamed: 0,country,1963,1964,1965,1966,1967,1968,1969,1970,1971,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
9,Finland,28.1,28.0,28.0,27.8,27.7,27.5,27.4,27.1,26.9,...,30.4,30.5,30.5,30.6,30.8,30.9,31.0,31.1,31.2,31.4
23,United States,26.5,26.6,26.6,26.4,26.3,26.3,26.2,26.1,26.0,...,28.0,28.2,28.3,28.5,28.7,28.8,29.0,29.1,29.2,29.4
0,Australia,27.5,27.5,27.4,27.3,27.3,27.2,27.2,27.1,26.9,...,30.1,30.1,30.2,30.3,30.5,30.6,30.7,30.8,30.9,31.1
2,Belgium,27.8,27.7,27.6,27.5,27.4,27.3,27.2,27.2,27.0,...,30.0,30.2,30.3,30.4,30.5,30.6,30.7,30.8,30.8,31.0
13,Ireland,31.1,31.1,31.0,30.9,30.8,30.6,30.5,30.4,30.2,...,31.5,31.7,31.8,31.9,32.1,32.1,32.2,32.4,32.6,32.7
22,Switzerland,27.8,27.8,27.7,27.7,27.5,27.5,27.9,27.8,27.7,...,31.5,31.6,31.8,31.8,31.9,31.9,32.0,32.1,32.2,32.3
17,Netherlands,29.3,29.2,29.0,28.8,28.5,28.4,28.3,28.2,28.0,...,30.9,31.0,31.1,31.2,31.3,31.4,31.5,31.6,31.7,31.8
8,Estonia,27.6,27.4,27.3,27.3,27.1,26.9,26.9,26.7,26.7,...,29.6,29.5,29.6,29.9,30.2,30.4,30.5,30.6,30.7,31.0
18,New Zealand,27.5,27.4,27.2,27.0,26.9,26.8,26.8,26.7,26.5,...,29.8,29.9,30.1,30.2,30.4,30.4,30.6,30.7,30.8,31.0
19,Norway,27.8,27.8,27.7,27.6,27.5,27.3,27.2,27.0,26.8,...,30.3,30.5,30.6,30.7,30.8,31.0,31.1,31.3,31.4,31.5


In [1683]:
#df_19_1.to_csv('../data/Cleaned/age_of_mothers_at_childbirth_oecd.csv', index=False)

In [1684]:
#df_19_1.to_sql('age_of_mothers_at_childbirth_oecd', engine, if_exists='replace', index=False)

In [1685]:
df_19_2 = pd.read_csv('../data/Raw/OECD/SF_2_3_Age_mothers_childbirth_fertility_by_age_1960_S2.csv')
#fertility_per_1000_from 1960
df_19_2.head()

Unnamed: 0,Country,Age group,1960,1961,1962,1963,1964,1965,1966,1967,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Australia,15-19,443,474,447,459,470,475,489,484,...,161,146,129,120,105,103,95,88,79,71
1,Australia,20-24,2201,2258,2160,2082,1905,1793,1731,1708,...,532,513,474,473,447,431,428,401,377,388
2,Australia,25-29,2163,2212,2167,2112,1981,1885,1839,1850,...,1026,991,948,934,922,897,893,843,803,867
3,Australia,30-34,1275,1311,1277,1239,1191,1101,1051,1028,...,1269,1248,1204,1217,1236,1191,1201,1156,1114,1206
4,Australia,35-39,623,634,614,597,584,530,506,478,...,715,709,692,698,720,713,716,693,663,709


In [1686]:
df_info = pd.DataFrame({
    'dtype': df_19_2.dtypes,
    'null_count': df_19_2.isnull().sum(),
    'unique_count': df_19_2.nunique()
})
print(df_info)

            dtype  null_count  unique_count
Country    object           0            21
Age group  object           0             7
1960       object           0           136
1961       object           0           140
1962       object           0           140
...           ...         ...           ...
2017       object           0           124
2018       object           0           128
2019       object           0           126
2020       object           0           121
2021       object           7           119

[64 rows x 3 columns]


In [1687]:
df_19_2.columns = df_19_2.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_19_2.head()

Unnamed: 0,country,age_group,1960,1961,1962,1963,1964,1965,1966,1967,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Australia,15-19,443,474,447,459,470,475,489,484,...,161,146,129,120,105,103,95,88,79,71
1,Australia,20-24,2201,2258,2160,2082,1905,1793,1731,1708,...,532,513,474,473,447,431,428,401,377,388
2,Australia,25-29,2163,2212,2167,2112,1981,1885,1839,1850,...,1026,991,948,934,922,897,893,843,803,867
3,Australia,30-34,1275,1311,1277,1239,1191,1101,1051,1028,...,1269,1248,1204,1217,1236,1191,1201,1156,1114,1206
4,Australia,35-39,623,634,614,597,584,530,506,478,...,715,709,692,698,720,713,716,693,663,709


In [1688]:
# --- Ensure "country" and "age_group" are strings
df_19_2["country"] = df_19_2["country"].astype(str).str.strip().str.title()
df_19_2["age_group"] = df_19_2["age_group"].astype(str).str.strip()

# --- Identify numeric columns: everything except country and birth order ---
num_cols = [c for c in df_19_2.columns if c not in ["country", "age_group"]]
# --- Robust cleaning -> convert to float ---
df_19_2[num_cols] = (
    df_19_2[num_cols].astype(str)
    .replace({"\xa0": "", "\u202f": "", ",": "."}, regex=True)
    .replace(r"[^\d\.\-]", "", regex=True)
    .replace(r"\.\.+", ".", regex=True)
    .replace(r"^\.$|^\s*$", np.nan, regex=True)
    .apply(pd.to_numeric, errors="coerce")
)
df_19_2[num_cols] = df_19_2[num_cols].round(2)

In [1689]:
df_19_2.drop_duplicates(inplace=True)
df_19_2.dropna(inplace = True)

In [1690]:
df_info = pd.DataFrame({
    'dtype': df_19_2.dtypes,
    'null_count': df_19_2.isnull().sum(),
    'unique_count': df_19_2.nunique()
})
print(df_info)

             dtype  null_count  unique_count
country     object           0            19
age_group   object           0             7
1960       float64           0           124
1961       float64           0           126
1962       float64           0           126
...            ...         ...           ...
2017       float64           0           118
2018       float64           0           121
2019       float64           0           120
2020       float64           0           115
2021       float64           0           118

[64 rows x 3 columns]


In [1691]:
#df_19_2.to_csv('../data/Cleaned/fertility_per_1000_by_age_from 1960_oecd.csv', index=False)

In [1692]:
#df_19_2.to_sql('fertility_per_1000_from_1960_oecd', engine, if_exists='replace', index=False)

In [1693]:
df_19_3 = pd.read_csv('../data/Raw/OECD/SF_2_3_Age_mothers_childbirth_fertility_by_age_2000_S3.csv')
#fertility_per_1000_from_2000
df_19_3

Unnamed: 0,Country,Age group,2000,2001,2002,2003,2004,2005,2006,2007,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,OECD-Average,15-19,226,220,211,205,203,201,200,205,...,179,168,162,152,144,135,126,117,102,95
1,OECD-Average,20-24,717,693,668,655,647,632,629,630,...,564,538,533,519,504,488,470,450,420,405
2,OECD-Average,25-29,1079,1050,1031,1035,1034,1023,1026,1034,...,994,965,969,961,949,928,907,884,855,869
3,OECD-Average,30-34,881,872,886,911,934,946,976,1000,...,1036,1019,1040,1049,1053,1041,1033,1017,996,1036
4,OECD-Average,35-39,381,386,395,406,422,435,456,477,...,531,534,551,563,571,570,574,575,559,587
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296,Romania,25-29,782,770,786,820,848,908,923,930,...,918,883,944,989,1001,1090,1083,1091,1094,1109
297,Romania,30-34,388,381,388,388,416,475,511,542,...,666,648,715,754,785,866,859,864,871,875
298,Romania,35-39,134,138,152,194,232,251,257,249,...,273,274,299,321,330,368,367,383,406,411
299,Romania,40-44,31,31,30,29,31,31,28,31,...,49,48,56,61,68,73,78,80,85,82


In [1694]:
df_info = pd.DataFrame({
    'dtype': df_19_3.dtypes,
    'null_count': df_19_3.isnull().sum(),
    'unique_count': df_19_3.nunique()
})
print(df_info)

            dtype  null_count  unique_count
Country    object           0            43
Age group  object           0             7
2000       object           0           233
2001       object           0           248
2002       object           0           240
2003       object           0           239
2004       object           0           245
2005       object           0           240
2006       object           0           239
2007       object           0           242
2008       object           0           252
2009       object           0           251
2010       object           0           239
2011       object           0           235
2012       object           0           242
2013       object           0           234
2014       object           0           238
2015       object           0           237
2016       object           0           248
2017       object           0           236
2018       object           0           245
2019       object           0   

In [1695]:
df_19_3.columns = df_19_3.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_19_3.head()

Unnamed: 0,country,age_group,2000,2001,2002,2003,2004,2005,2006,2007,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,OECD-Average,15-19,226,220,211,205,203,201,200,205,...,179,168,162,152,144,135,126,117,102,95
1,OECD-Average,20-24,717,693,668,655,647,632,629,630,...,564,538,533,519,504,488,470,450,420,405
2,OECD-Average,25-29,1079,1050,1031,1035,1034,1023,1026,1034,...,994,965,969,961,949,928,907,884,855,869
3,OECD-Average,30-34,881,872,886,911,934,946,976,1000,...,1036,1019,1040,1049,1053,1041,1033,1017,996,1036
4,OECD-Average,35-39,381,386,395,406,422,435,456,477,...,531,534,551,563,571,570,574,575,559,587


In [1696]:
# --- Ensure "country" and "age_group" are strings
df_19_3["country"] = df_19_3["country"].astype(str).str.strip().str.title()
df_19_3["age_group"] = df_19_3["age_group"].astype(str).str.strip()

# --- Identify numeric columns: everything except country and birth order ---
num_cols = [c for c in df_19_3.columns if c not in ["country", "age_group"]]
# --- Robust cleaning -> convert to float ---
df_19_3[num_cols] = (
    df_19_3[num_cols].astype(str)
    .replace({"\xa0": "", "\u202f": "", ",": "."}, regex=True)
    .replace(r"[^\d\.\-]", "", regex=True)
    .replace(r"\.\.+", ".", regex=True)
    .replace(r"^\.$|^\s*$", np.nan, regex=True)
    .apply(pd.to_numeric, errors="coerce")
)
df_19_3[num_cols] = df_19_3[num_cols].round(2)

In [1697]:
df_19_3.drop_duplicates(inplace=True)
df_19_3.dropna(inplace=True)

In [1698]:
#Check again
df_info = pd.DataFrame({
    'dtype': df_19_3.dtypes,
    'null_count': df_19_3.isnull().sum(),
    'unique_count': df_19_3.nunique()
})
print(df_info)

             dtype  null_count  unique_count
country     object           0            41
age_group   object           0             7
2000       float64           0           225
2001       float64           0           237
2002       float64           0           232
2003       float64           0           229
2004       float64           0           233
2005       float64           0           229
2006       float64           0           229
2007       float64           0           230
2008       float64           0           238
2009       float64           0           238
2010       float64           0           230
2011       float64           0           227
2012       float64           0           231
2013       float64           0           225
2014       float64           0           226
2015       float64           0           225
2016       float64           0           237
2017       float64           0           227
2018       float64           0           233
2019      

In [1699]:
df_19_3.sample(10)

Unnamed: 0,country,age_group,2000,2001,2002,2003,2004,2005,2006,2007,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
57,Czech Republic,20-24,67.8,61.5,58.5,54.0,51.0,49.1,47.3,48.7,...,42.4,41.9,43.0,45.5,49.4,50.8,51.3,49.7,49.7,47.6
185,Netherlands,30-34,130.2,129.0,131.2,131.3,129.4,127.6,128.8,128.1,...,130.6,127.9,132.1,129.0,129.8,127.8,126.3,125.6,124.9,133.5
276,Bulgaria,30-34,32.3,32.5,34.3,37.7,42.5,48.4,53.6,58.0,...,65.8,64.6,68.5,69.7,71.2,71.5,72.3,72.8,71.3,73.6
92,Germany,20-24,56.9,54.6,52.4,50.5,49.5,47.3,45.3,43.9,...,37.1,35.9,35.7,35.8,39.3,36.7,35.8,34.0,32.5,31.2
73,Estonia,30-34,52.7,51.9,56.7,57.9,64.5,69.8,74.1,81.2,...,85.8,82.4,84.7,89.1,95.4,95.2,103.5,101.3,98.8,102.6
5,Oecd-Average,40-44,7.6,7.8,8.0,8.3,8.6,8.8,9.1,9.6,...,11.3,11.5,11.8,12.2,12.8,12.9,13.1,13.4,13.3,13.7
202,Norway,45-49,0.2,0.3,0.2,0.3,0.3,0.4,0.4,0.4,...,0.6,0.5,0.7,0.8,0.7,0.7,0.8,0.9,0.7,0.8
300,Romania,45-49,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,...,0.3,0.3,0.3,0.3,0.3,0.4,0.4,0.5,0.6,0.5
100,Greece,25-29,81.2,83.1,84.9,84.0,84.0,82.7,83.5,82.2,...,77.2,72.2,70.1,68.7,68.8,65.7,64.4,61.9,64.6,64.1
220,Slovak Republic,30-34,44.2,43.5,46.1,50.5,55.0,59.2,62.9,66.4,...,76.1,76.0,78.6,80.9,84.1,86.4,87.5,87.8,89.0,92.2


In [1700]:
#df_19_3.to_csv('../data/Cleaned/cleaned_fertility_per_1000_from_2000_oecd.csv',index=False)

In [1701]:
#df_19_3.to_sql('fertility_per_1000_from_2000_oecd',engine, if_exists='replace', index=False)

In [1702]:
df_20= pd.read_csv('../data/Raw/OECD/SF_2_4_Share_births_outside_marriage_1960.csv')
#(%)share_of_births_outside_of_marriage
df_20

Unnamed: 0,Country,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,Austria,130,126,120,116,113,112,114,115,120,...,404,415,414,417,421,422,420,413,406,412
1,Belgium,21,20,21,22,23,24,25,25,27,...,470,477,495,494,480,490,528,524,..,..
2,Czech Republic,49,46,45,47,48,50,53,53,54,...,418,434,450,467,478,486,490,485,482,485
3,Denmark,78,80,83,89,93,95,102,111,111,...,490,506,515,525,538,540,542,542,541,542
4,Finland,40,41,40,42,44,46,48,51,53,...,409,415,421,428,443,449,448,446,454,461
5,Germany,76,71,66,61,59,58,57,58,61,...,339,345,348,350,350,355,347,339,333,331
6,Greece,12,12,12,12,11,11,10,10,11,...,74,76,70,82,88,94,103,111,124,138
7,Hungary,55,55,54,53,52,52,51,50,50,...,423,445,456,473,479,467,447,439,387,304
8,Iceland,253,253,245,251,267,269,284,300,305,...,650,669,..,..,..,696,712,705,694,..
9,Ireland,16,16,18,18,20,22,23,25,26,...,339,351,353,363,366,367,376,379,384,..


In [1703]:
df_info = pd.DataFrame({
    'dtype': df_20.dtypes,
    'null_count': df_20.isnull().sum(),
    'unique_count': df_20.nunique()
})
print(df_info)

          dtype  null_count  unique_count
Country  object           0            26
1960     object           0            26
1961     object           0            24
1962     object           0            24
1963     object           0            24
...         ...         ...           ...
2016     object           0            24
2017     object           0            26
2018     object           0            25
2019     object           0            25
2020     object           0            24

[62 rows x 3 columns]


In [1704]:
df_20.columns = df_20.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [1705]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_20.columns if c != "country"]

df_20[num_cols] = (
    df_20[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [1706]:
df_20.drop_duplicates(inplace=True)
df_20.dropna(inplace=True)

df_info = pd.DataFrame({
    'dtype': df_20.dtypes,
    'null_count': df_20.isnull().sum(),
    'unique_count': df_20.nunique()
})
print(df_info)

           dtype  null_count  unique_count
country   object           0            22
1960     float64           0            22
1961     float64           0            20
1962     float64           0            21
1963     float64           0            21
...          ...         ...           ...
2016     float64           0            20
2017     float64           0            22
2018     float64           0            21
2019     float64           0            22
2020     float64           0            22

[62 rows x 3 columns]


In [1707]:
df_20.sample(10)

Unnamed: 0,country,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
23,United States,5.3,5.6,5.9,6.3,6.9,7.7,8.4,9.0,9.7,...,40.7,40.7,40.6,40.2,40.3,39.8,39.8,39.6,40.0,40.5
11,Latvia,11.9,12.3,12.4,12.3,12.8,13.3,12.6,12.3,12.1,...,44.6,45.0,44.6,44.0,41.5,40.9,40.4,39.5,38.4,39.5
17,Slovak Republic,4.7,4.4,4.6,4.7,5.0,5.3,5.3,5.7,5.9,...,34.0,35.4,37.0,38.9,39.2,40.2,40.1,40.0,40.1,40.7
2,Czech Republic,4.9,4.6,4.5,4.7,4.8,5.0,5.3,5.3,5.4,...,41.8,43.4,45.0,46.7,47.8,48.6,49.0,48.5,48.2,48.5
12,Luxembourg,3.2,3.4,3.1,3.1,3.2,3.7,3.2,3.5,3.2,...,34.1,37.1,37.8,39.1,38.8,40.7,40.8,39.5,40.4,41.6
20,Sweden,11.3,11.7,12.4,12.6,13.1,13.8,14.5,15.1,16.0,...,54.3,54.5,54.4,54.6,54.7,54.9,54.5,54.5,54.5,55.2
10,Italy,2.4,2.4,2.2,2.2,2.0,2.0,2.0,2.0,2.0,...,23.7,25.0,26.9,28.8,30.0,31.5,32.8,34.0,35.4,33.8
21,Switzerland,3.8,4.0,4.2,4.1,4.2,3.9,3.8,3.9,3.8,...,19.3,20.2,21.1,21.7,22.9,24.2,25.2,25.7,26.5,27.7
6,Greece,1.2,1.2,1.2,1.2,1.1,1.1,1.0,1.0,1.1,...,7.4,7.6,7.0,8.2,8.8,9.4,10.3,11.1,12.4,13.8
7,Hungary,5.5,5.5,5.4,5.3,5.2,5.2,5.1,5.0,5.0,...,42.3,44.5,45.6,47.3,47.9,46.7,44.7,43.9,38.7,30.4


In [1708]:
#df_20.to_csv('../data/Cleaned/cleaned_share_of_births_outside_of_marriage_oecd.csv', index=False)

In [1709]:
#df_20.to_sql('share_of_births_outside_of_marriage_oecd',engine, if_exists='replace', index=False)

In [1710]:
df_21_1= pd.read_csv('../data/Raw/OECD/SF_3_1_Marriage_divorce_rate_mean_age_first_marriage_S1.csv')
#mean_age_first_marriage
df_21_1

Unnamed: 0,Country,Gender,1990,1991,1992,1993,1994,1995,1996,1997,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,Australia,Male,265,267,269,270,272,273,276,278,...,297,298,299,300,301,303,304,307,307,306
1,Australia,Female,243,245,247,248,251,253,257,259,...,280,281,283,284,285,287,288,292,293,292
2,Czechia,Male,243,243,245,247,251,255,259,265,...,310,312,313,314,316,317,318,319,320,324
3,Czechia,Female,216,216,219,221,224,228,231,236,...,281,283,285,287,288,290,291,292,294,297
4,Denmark,Male,305,306,310,314,318,319,325,322,...,338,343,344,344,343,347,348,349,351,353
5,Denmark,Female,278,280,283,288,292,292,299,301,...,314,318,319,319,319,322,324,325,328,330
6,Greece,Male,290,293,296,297,299,301,302,306,...,327,328,329,330,332,332,333,334,337,338
7,Greece,Female,249,252,255,255,258,260,263,266,...,294,295,297,299,301,301,303,303,307,307
8,Japan,Male,284,284,284,284,285,285,285,285,...,307,308,309,311,311,311,311,311,312,310
9,Japan,Female,259,259,260,261,262,263,264,266,...,290,292,293,294,294,294,294,294,296,294


In [1711]:
df_info = pd.DataFrame({
    'datatypes': df_21_1.dtypes,
    'null_count': df_21_1.isnull().sum(),
    'unique_count': df_21_1.nunique()
})
print(df_info)

        datatypes  null_count  unique_count
Country    object           0            10
Gender     object           0             2
1990       object           0            17
1991       object           0            18
1992       object           0            18
1993       object           0            19
1994       object           0            16
1995       object           0            18
1996       object           0            19
1997       object           0            17
1998       object           0            14
1999       object           0            19
2000       object           0            18
2001       object           0            18
2002       object           0            19
2003       object           0            19
2004       object           0            16
2005       object           0            18
2006       object           0            18
2007       object           0            19
2008       object           0            18
2009       object           0   

In [1712]:
df_21_1.columns = df_21_1.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [1713]:
# --- Ensure "country" and "gender" are strings
df_21_1["country"] = df_21_1["country"].astype(str).str.strip().str.title()
df_21_1["gender"] = df_21_1["gender"].astype(str).str.strip()

# --- Identify numeric columns: everything except country and birth order ---
num_cols = [c for c in df_21_1.columns if c not in ["country", "gender"]]
# --- Robust cleaning -> convert to float ---
df_21_1[num_cols] = (
    df_21_1[num_cols].astype(str)
    .replace({"\xa0": "", "\u202f": "", ",": "."}, regex=True)
    .replace(r"[^\d\.\-]", "", regex=True)
    .replace(r"\.\.+", ".", regex=True)
    .replace(r"^\.$|^\s*$", np.nan, regex=True)
    .apply(pd.to_numeric, errors="coerce")
)
df_21_1[num_cols] = df_21_1[num_cols].round(2)

In [1714]:
df_21_1.drop_duplicates(inplace=True)
df_21_1.dropna(inplace=True)

df_info = pd.DataFrame({
    'datatypes': df_21_1.dtypes,
    'null_count': df_21_1.isnull().sum(),
    'unique_count': df_21_1.nunique()
})
print(df_info)

        datatypes  null_count  unique_count
country    object           0             9
gender     object           0             2
1990      float64           0            15
1991      float64           0            16
1992      float64           0            16
1993      float64           0            17
1994      float64           0            15
1995      float64           0            16
1996      float64           0            17
1997      float64           0            15
1998      float64           0            13
1999      float64           0            17
2000      float64           0            16
2001      float64           0            16
2002      float64           0            17
2003      float64           0            17
2004      float64           0            15
2005      float64           0            17
2006      float64           0            17
2007      float64           0            17
2008      float64           0            16
2009      float64           0   

In [1715]:
#df_21_1.to_csv('../data/Cleaned/cleaned_mean_age_first_marriage_oecd.csv',index=False)

In [1716]:
#df_21_1.to_sql('mean_age_first_marriage_oecd', engine, if_exists='replace', index= False)

In [1717]:
df_21_2 = pd.read_csv('../data/Raw/OECD/SF_3_1_Marriage_divorce_rates_S2.csv')
#divorce_rates_per_1000_oecd
df_21_2

Unnamed: 0,Country,1970,1971,1972,1973,1974,1975,1976,1977,1978,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,Austria,14,13,13,13,14,14,15,15,16,...,19,19,19,18,18,18.0,18.0,17,16,15
1,Belgium,07,7,8,9,10,11,13,13,14,...,22,22,22,21,20,20.0,20.0,18,19,17
2,Czechia,22,24,23,25,25,26,25,25,26,...,27,25,25,24,24,23.0,23.0,20,20,19
3,Denmark,19,27,26,25,26,26,26,26,26,...,34,34,29,30,26,26.0,18.0,27,22,21
4,Estonia,32,32,33,32,33,34,36,39,38,...,25,24,26,25,25,24.0,21.0,19,,19
5,Finland,13,16,18,19,21,20,21,21,22,...,25,25,25,25,24,24.0,24.0,24,22,20
6,Germany,13,14,15,16,18,19,20,15,10,...,21,21,20,20,19,18.0,18.0,17,17,16
7,Greece,04,4,4,5,4,4,4,5,5,...,15,13,14,10,18,,,,,
8,Hungary,22,23,23,24,23,25,26,26,27,...,20,20,21,20,19,17.0,18.0,15,19,18
9,Italy,..,3,6,3,3,2,2,2,2,...,9,9,14,16,15,15.0,14.0,11,14,14


In [1718]:
df_info = pd.DataFrame({
    'datatypes': df_21_2.dtypes,
    'null_count': df_21_2.isnull().sum(),
    'unique_count': df_21_2.nunique()
})
print(df_info)

        datatypes  null_count  unique_count
Country    object           0            28
1970       object           0            18
1971       object           0            19
1972       object           0            19
1973       object           0            18
1974       object           0            18
1975       object           0            19
1976       object           0            18
1977       object           0            18
1978       object           0            18
1979       object           0            15
1980       object           0            18
1981       object           0            20
1982       object           0            22
1983       object           0            24
1984       object           0            20
1985       object           0            19
1986       object           0            20
1987       object           0            20
1988       object           0            20
1989       object           0            19
1990       object           0   

In [1719]:
df_21_2.columns = df_21_2.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [1720]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_21_2.columns if c != "country"]

df_21_2[num_cols] = (
    df_21_2[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [1721]:
df_21_2.drop_duplicates(inplace=True)
df_21_2.dropna(inplace=True)

df_info = pd.DataFrame({
    'datatypes': df_21_2.dtypes,
    'null_count': df_21_2.isnull().sum(),
    'unique_count': df_21_2.nunique()
})
print(df_info)

        datatypes  null_count  unique_count
country    object           0            23
1970      float64           0            15
1971      float64           0            17
1972      float64           0            15
1973      float64           0            14
1974      float64           0            15
1975      float64           0            16
1976      float64           0            14
1977      float64           0            13
1978      float64           0            15
1979      float64           0            12
1980      float64           0            14
1981      float64           0            17
1982      float64           0            17
1983      float64           0            19
1984      float64           0            16
1985      float64           0            15
1986      float64           0            16
1987      float64           0            16
1988      float64           0            15
1989      float64           0            15
1990      float64           0   

In [1722]:
df_21_2.head(8)

Unnamed: 0,country,1970,1971,1972,1973,1974,1975,1976,1977,1978,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,Austria,1.4,1.3,1.3,1.3,1.4,1.4,1.5,1.5,1.6,...,1.9,1.9,1.9,1.8,1.8,1.8,1.8,1.7,1.6,1.5
1,Belgium,0.7,0.7,0.8,0.9,1.0,1.1,1.3,1.3,1.4,...,2.2,2.2,2.2,2.1,2.0,2.0,2.0,1.8,1.9,1.7
2,Czechia,2.2,2.4,2.3,2.5,2.5,2.6,2.5,2.5,2.6,...,2.7,2.5,2.5,2.4,2.4,2.3,2.3,2.0,2.0,1.9
3,Denmark,1.9,2.7,2.6,2.5,2.6,2.6,2.6,2.6,2.6,...,3.4,3.4,2.9,3.0,2.6,2.6,1.8,2.7,2.2,2.1
5,Finland,1.3,1.6,1.8,1.9,2.1,2.0,2.1,2.1,2.2,...,2.5,2.5,2.5,2.5,2.4,2.4,2.4,2.4,2.2,2.0
6,Germany,1.3,1.4,1.5,1.6,1.8,1.9,2.0,1.5,1.0,...,2.1,2.1,2.0,2.0,1.9,1.8,1.8,1.7,1.7,1.6
8,Hungary,2.2,2.3,2.3,2.4,2.3,2.5,2.6,2.6,2.7,...,2.0,2.0,2.1,2.0,1.9,1.7,1.8,1.5,1.9,1.8
10,Japan,0.9,1.0,1.0,1.0,1.0,1.1,1.1,1.1,1.2,...,1.8,1.8,1.8,1.7,1.7,1.7,1.7,1.6,1.47,1.52


In [1723]:
#df_21_2.to_csv('../data/Cleaned/cleaned_divorce_rates_per_1000_oecd.csv', index=False)

In [1724]:
#df_21_2.to_sql('divorce_rates_per_1000_oecd',engine, if_exists= 'replace' , index=False)

In [1725]:
df_21_3= pd.read_csv('../data/Raw/OECD/SF_3_1_Marriage_divorce_rates_prev_marital_status_S3.csv')
#share_of_previous_marital_status
df_21_3

Unnamed: 0,Country,Previous marital status,2000,2001,2002,2003,2004,2005,2006,2008,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Australia,Single never married,759,761,755,756,762,769,773,782,...,796,797,800,805,805,801,803,801,803,807
1,Australia,Divorced,220,218,224,223,218,213,209,202,...,190,188,186,182,181,185,183,185,183,180
2,Australia,Widowed,21,21,21,21,19,18,18,17,...,15,15,14,13,14,14,14,14,13,13
3,Austria,Single never married,766,747,741,737,729,731,739,748,...,755,757,767,771,775,777,781,781,782,780
4,Austria,Divorced,222,242,247,252,259,257,249,242,...,235,234,223,220,215,215,209,210,210,216
5,Austria,Widowed,12,11,12,11,12,12,11,10,...,10,9,10,9,10,8,9,9,8,4
6,Czechia,Single never married,749,745,743,740,739,742,745,726,...,740,740,752,756,766,767,764,764,761,759
7,Czechia,Divorced,237,242,244,247,247,245,244,261,...,249,249,238,234,224,223,226,226,229,230
8,Czechia,Widowed,14,13,13,13,14,12,11,13,...,12,11,10,10,10,10,10,10,10,11
9,Denmark,Single never married,759,760,762,764,760,756,756,763,...,772,760,750,762,761,769,764,771,776,783


In [1726]:
df_info = pd.DataFrame({
    'datatypes': df_21_3.dtypes,
    'null_count': df_21_3.isnull().sum(),
    'unique_count': df_21_3.nunique()
})
print(df_info)

                        datatypes  null_count  unique_count
Country                    object           0            20
Previous marital status    object           0             3
2000                       object           0            47
2001                       object           0            51
2002                       object           0            56
2003                       object           0            50
2004                       object           0            50
2005                       object           0            52
2006                       object           0            49
2008                       object           0            47
2009                       object           0            50
2010                       object           0            49
2011                       object           0            49
2012                       object           0            53
2013                       object           0            49
2014                       object       

In [1727]:
df_21_3.columns = df_21_3.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_21_3.head()

Unnamed: 0,country,previous_marital_status,2000,2001,2002,2003,2004,2005,2006,2008,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Australia,Single never married,759,761,755,756,762,769,773,782,...,796,797,800,805,805,801,803,801,803,807
1,Australia,Divorced,220,218,224,223,218,213,209,202,...,190,188,186,182,181,185,183,185,183,180
2,Australia,Widowed,21,21,21,21,19,18,18,17,...,15,15,14,13,14,14,14,14,13,13
3,Austria,Single never married,766,747,741,737,729,731,739,748,...,755,757,767,771,775,777,781,781,782,780
4,Austria,Divorced,222,242,247,252,259,257,249,242,...,235,234,223,220,215,215,209,210,210,216


In [1728]:
# --- Ensure "country" and "previous_marital_status" are strings
df_21_3["country"] = df_21_3["country"].astype(str).str.strip().str.title()
df_21_3["previous_marital_status"] = df_21_3["previous_marital_status"].astype(str).str.strip()

# --- Identify numeric columns: everything except country and birth order ---
num_cols = [c for c in df_21_3.columns if c not in ["country", "previous_marital_status"]]
# --- Robust cleaning -> convert to float ---
df_21_3[num_cols] = (
    df_21_3[num_cols].astype(str)
    .replace({"\xa0": "", "\u202f": "", ",": "."}, regex=True)
    .replace(r"[^\d\.\-]", "", regex=True)
    .replace(r"\.\.+", ".", regex=True)
    .replace(r"^\.$|^\s*$", np.nan, regex=True)
    .apply(pd.to_numeric, errors="coerce")
)
df_21_3[num_cols] = df_21_3[num_cols].round(2)

In [1729]:
df_21_3.drop_duplicates(inplace=True)
df_21_3.dropna(inplace=True)

df_info = pd.DataFrame({
    'datatypes': df_21_3.dtypes,
    'null_count': df_21_3.isnull().sum(),
    'unique_count': df_21_3.nunique()
})
print(df_info)

                        datatypes  null_count  unique_count
country                    object           0            20
previous_marital_status    object           0             3
2000                      float64           0            47
2001                      float64           0            51
2002                      float64           0            56
2003                      float64           0            50
2004                      float64           0            50
2005                      float64           0            52
2006                      float64           0            49
2008                      float64           0            47
2009                      float64           0            50
2010                      float64           0            49
2011                      float64           0            49
2012                      float64           0            53
2013                      float64           0            49
2014                      float64       

In [1730]:
#df_21_3.to_csv('../data/Cleaned/cleaned_share_of_previous_marital_status_oecd.csv', index=False)

In [1731]:
#df_21_3.to_sql('share_of_previous_marital_status_oecd', engine, if_exists= 'replace', index =  False)

In [1732]:
df_22_1 = pd.read_csv('../data/Raw/OECD/SF3_3_A_in_private_households_by_partnership_status_S1.csv')
#hauseholds_by_partnership_status_oecd
df_22_1

Unnamed: 0,Country,20+_All_Total_Living_with_a_partner(%),20+_All_Married or in a civil or registered partnership_living_with_a_partner(%),20+_All_Cohabiting_living_with_a_partner(%),20+_All_Not living with a partner(%),20/34_Total_living_with_a_partner(%),20/34_Married or in a civil or registered partnership_living_with_a_partner(%),20/34_Cohabiting_living_with_a_partner(%),Not living with a partner_Total(%),Living with at least one parent(%)
0,Australia (c),6379,5359,1020,3621,4706,2941,1765,5294,..
1,Austria,5880,4910,970,4120,3911,2215,1697,6089,3382
2,Belgium,6215,5351,864,3785,4528,2933,1594,5472,3134
3,Canada (d),6689,5446,1243,3311,5534,3355,2179,4466,..
4,Czech Republic,5117,4539,579,4883,3078,2132,946,6922,3620
5,Denmark,6415,5002,1412,3585,5054,2186,2868,4946,1067
6,Estonia,5393,3730,1664,4607,4531,1781,2750,5469,2646
7,France,6414,4941,1472,3586,5042,2189,2853,4958,2208
8,Germany,6261,5391,869,3739,3953,2215,1739,5974,2754
9,Greece,6023,5852,171,3977,3313,2924,390,6687,4543


In [1733]:
df_info = pd.DataFrame({
    'datatypes': df_22_1.dtypes,
    'null_count': df_22_1.isnull().sum(),
    'unique_count': df_22_1.nunique()
})
print(df_info)

                                                   datatypes  null_count  \
Country                                               object           0   
20+_All_Total_Living_with_a_partner(%)                object           0   
20+_All_Married or in a civil or registered par...    object           0   
20+_All_Cohabiting_living_with_a_partner(%)           object           0   
20+_All_Not living with a partner(%)                  object           0   
20/34_Total_living_with_a_partner(%)                  object           0   
20/34_Married or in a civil or registered partn...    object           0   
20/34_Cohabiting_living_with_a_partner(%)             object           0   
Not living with a partner_Total(%)                    object           0   
Living with at least one parent(%)                    object           0   

                                                    unique_count  
Country                                                       37  
20+_All_Total_Living_with_a_p

In [1734]:
df_22_1.columns = df_22_1.columns.str.lower() \
                .str.replace(' ', '_') \


df_22_1.head()

Unnamed: 0,country,20+_all_total_living_with_a_partner(%),20+_all_married_or_in_a_civil_or_registered_partnership_living_with_a_partner(%),20+_all_cohabiting_living_with_a_partner(%),20+_all_not_living_with_a_partner(%),20/34_total_living_with_a_partner(%),20/34_married_or_in_a_civil_or_registered_partnership_living_with_a_partner(%),20/34_cohabiting_living_with_a_partner(%),not_living_with_a_partner_total(%),living_with_at_least_one_parent(%)
0,Australia (c),6379,5359,1020,3621,4706,2941,1765,5294,..
1,Austria,5880,4910,970,4120,3911,2215,1697,6089,3382
2,Belgium,6215,5351,864,3785,4528,2933,1594,5472,3134
3,Canada (d),6689,5446,1243,3311,5534,3355,2179,4466,..
4,Czech Republic,5117,4539,579,4883,3078,2132,946,6922,3620


In [1735]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_22_1.columns if c != "country"]

df_22_1[num_cols] = (
    df_22_1[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [1736]:
df_22_1["country"] = df_22_1["country"].str.replace(r"\s*\(.*?\)", "", regex=True)
print(df_22_1["country"].unique())

['Australia' 'Austria' 'Belgium' 'Canada' 'Czech Republic' 'Denmark'
 'Estonia' 'France' 'Germany' 'Greece' 'Hungary' 'Iceland' 'Ireland'
 'Italy' 'Latvia' 'Luxembourg' 'Netherlands' 'New Zealand' 'Norway'
 'Poland' 'Portugal' 'Slovak Republic' 'Slovenia' 'Spain' 'Sweden'
 'Switzerland' 'United Kingdom' 'United States' 'OECD-28 average'
 'Bulgaria' 'Croatia' 'Cyprus' 'Lithuania' 'Malta' 'Romania' 'EU average'
 'Eurozone average']


In [1737]:
df_22_1.drop_duplicates(inplace = True)
df_22_1.dropna(inplace=True)

df_info = pd.DataFrame({
    'datatypes': df_22_1.dtypes,
    'null_count': df_22_1.isnull().sum(),
    'unique_count': df_22_1.nunique()
})
print(df_info)

                                                   datatypes  null_count  \
country                                               object           0   
20+_all_total_living_with_a_partner(%)               float64           0   
20+_all_married_or_in_a_civil_or_registered_par...   float64           0   
20+_all_cohabiting_living_with_a_partner(%)          float64           0   
20+_all_not_living_with_a_partner(%)                 float64           0   
20/34_total_living_with_a_partner(%)                 float64           0   
20/34_married_or_in_a_civil_or_registered_partn...   float64           0   
20/34_cohabiting_living_with_a_partner(%)            float64           0   
not_living_with_a_partner_total(%)                   float64           0   
living_with_at_least_one_parent(%)                   float64           0   

                                                    unique_count  
country                                                       34  
20+_all_total_living_with_a_p

In [1738]:
df_22_1.sample(8)

Unnamed: 0,country,20+_all_total_living_with_a_partner(%),20+_all_married_or_in_a_civil_or_registered_partnership_living_with_a_partner(%),20+_all_cohabiting_living_with_a_partner(%),20+_all_not_living_with_a_partner(%),20/34_total_living_with_a_partner(%),20/34_married_or_in_a_civil_or_registered_partnership_living_with_a_partner(%),20/34_cohabiting_living_with_a_partner(%),not_living_with_a_partner_total(%),living_with_at_least_one_parent(%)
18,Norway,60.96,46.07,14.89,39.04,41.99,18.94,23.05,58.01,25.54
19,Poland,57.72,55.6,2.12,42.28,37.6,34.2,3.4,62.4,45.65
14,Latvia,47.87,38.55,9.32,52.13,33.13,20.71,12.41,66.87,41.78
8,Germany,62.61,53.91,8.69,37.39,39.53,22.15,17.39,59.74,27.54
25,Switzerland,64.15,53.46,10.69,35.85,42.43,25.4,17.03,55.0,31.44
16,Netherlands,66.82,53.1,13.72,33.18,46.76,21.4,25.35,53.23,23.7
12,Ireland,58.99,50.23,8.75,41.01,37.8,21.4,16.4,57.25,28.18
23,Spain,61.43,52.55,8.88,38.57,38.43,22.51,15.92,61.57,45.02


In [1739]:
#df_22_1.to_csv('../data/Cleaned/cleaned_hauseholds_by_partnership_status_oecd.csv', index=False)

In [1740]:
#df_22_1.to_sql('hauseholds_by_partnership_status_oecd', engine, if_exists='replace', index= False)

In [1741]:
df_22_2 = pd.read_csv('../data/Raw/OECD/SF3_3_B_ by level of educational attainment_S2.csv')
#level_of_educational_attainment
df_22_2

Unnamed: 0,Country,Low_Education_Total_living_with_a_partner(%),Low_educationMarried or in a civil or registered partnership_living_with_a_partner(%),Low_education_Cohabiting_living_with_a_partner(%),Not living with a partner_Low_education(%),Medium education_Total_Living with a partner(%),Medium education_Married or in a civil or registered partnership_Living with a partner(%),Medium education_Cohabiting_Living with a partner(%),Not living with a partner_Medium education(%),High education_Total_Living with a partner(%),High education_Married or in a civil or registered partnership_Living with a partner(%),High education_Cohabiting_Living with a partner(%),Not living with a partner_High education(%)
0,Austria,5681,5049,632,4319,5927,4873,1054,,6003,4838,1165,3997
1,Belgium,6228,5611,617,3772,6079,4980,1099,,6709,5658,1051,3291
2,Czech Republic,4081,3655,426,5919,5399,4787,612,4601.0,5729,5026,703,4271
3,Estonia,4217,2639,1578,5783,5441,3661,1779,4559.0,6014,4445,1569,3986
4,France,6112,5193,918,3888,6568,4917,1651,3432.0,6558,4660,1898,3442
5,Germany,5446,4879,567,4554,6238,5313,925,3762.0,6889,5916,974,3111
6,Greece,6381,6288,93,3619,5700,5488,212,4300.0,5833,5570,263,4167
7,Hungary,5033,4038,995,4967,5794,4678,1115,4206.0,5956,5102,855,4044
8,Iceland,5186,4102,1084,4814,5831,4657,1174,4169.0,6972,5453,1519,3028
9,Latvia,3627,2592,1035,6373,4932,3954,978,5068.0,5291,4539,752,4709


In [1742]:
df_info = pd.DataFrame({
    'datatypes': df_22_2.dtypes,
    'null_count': df_22_2.isnull().sum(),
    'unique_count': df_22_2.nunique()
})
print(df_info)

                                                   datatypes  null_count  \
Country                                               object           0   
Low_Education_Total_living_with_a_partner(%)          object           0   
Low_educationMarried or in a civil or registere...    object           0   
Low_education_Cohabiting_living_with_a_partner(%)     object           0   
Not living with a partner_Low_education(%)            object           0   
Medium education_Total_Living with a partner(%)       object           0   
Medium education_Married or in a civil or regis...    object           0   
Medium education_Cohabiting_Living with a partn...    object           0   
Not living with a partner_Medium education(%)         object           2   
High education_Total_Living with a partner(%)         object           0   
High education_Married or in a civil or registe...    object           0   
High education_Cohabiting_Living with a partner(%)    object           0   
Not living w

In [1743]:
df_22_2.columns = df_22_2.columns.str.lower() \
                .str.replace(' ', '_') \


df_22_2.head()

Unnamed: 0,country,low_education_total_living_with_a_partner(%),low_educationmarried_or_in_a_civil_or_registered_partnership_living_with_a_partner(%),low_education_cohabiting_living_with_a_partner(%),not_living_with_a_partner_low_education(%),medium_education_total_living_with_a_partner(%),medium_education_married_or_in_a_civil_or_registered_partnership_living_with_a_partner(%),medium_education_cohabiting_living_with_a_partner(%),not_living_with_a_partner_medium_education(%),high_education_total_living_with_a_partner(%),high_education_married_or_in_a_civil_or_registered_partnership_living_with_a_partner(%),high_education_cohabiting_living_with_a_partner(%),not_living_with_a_partner_high_education(%)
0,Austria,5681,5049,632,4319,5927,4873,1054,,6003,4838,1165,3997
1,Belgium,6228,5611,617,3772,6079,4980,1099,,6709,5658,1051,3291
2,Czech Republic,4081,3655,426,5919,5399,4787,612,4601.0,5729,5026,703,4271
3,Estonia,4217,2639,1578,5783,5441,3661,1779,4559.0,6014,4445,1569,3986
4,France,6112,5193,918,3888,6568,4917,1651,3432.0,6558,4660,1898,3442


In [1744]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_22_2.columns if c != "country"]

df_22_2[num_cols] = (
    df_22_2[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [1745]:
df_22_2["country"] = df_22_2["country"].str.replace(r"\s*\(.*?\)", "", regex=True)
print(df_22_2["country"].unique())

['Austria' 'Belgium' 'Czech Republic' 'Estonia' 'France' 'Germany'
 'Greece' 'Hungary' 'Iceland' 'Latvia' 'Luxembourg' 'Norway' 'Poland'
 'Portugal' 'Slovenia' 'Spain' 'Sweden' 'United Kingdom' 'OECD-19 average'
 'Bulgaria' 'Croatia' 'Cyprus' 'Lithuania' 'Malta' 'Romania' 'EU average'
 'Eurozone average']


In [1746]:
df_22_2.drop_duplicates(inplace=True)
df_22_2.dropna(inplace=True)


df_info = pd.DataFrame({
    'datatypes': df_22_2.dtypes,
    'null_count': df_22_2.isnull().sum(),
    'unique_count': df_22_2.nunique()
})
print(df_info)

                                                   datatypes  null_count  \
country                                               object           0   
low_education_total_living_with_a_partner(%)         float64           0   
low_educationmarried_or_in_a_civil_or_registere...   float64           0   
low_education_cohabiting_living_with_a_partner(%)    float64           0   
not_living_with_a_partner_low_education(%)           float64           0   
medium_education_total_living_with_a_partner(%)      float64           0   
medium_education_married_or_in_a_civil_or_regis...   float64           0   
medium_education_cohabiting_living_with_a_partn...   float64           0   
not_living_with_a_partner_medium_education(%)        float64           0   
high_education_total_living_with_a_partner(%)        float64           0   
high_education_married_or_in_a_civil_or_registe...   float64           0   
high_education_cohabiting_living_with_a_partner(%)   float64           0   
not_living_w

In [1747]:
#df_22_2.to_csv('../data/Cleaned/cleaned_level_of_educational_attainment_oecd.csv', index=False)

In [1748]:
#df_22_2.to_sql('level_of_educational_attainment_oecd',engine, if_exists='replace', index= False)

In [1749]:
df_6666 = pd.read_csv('../data/Raw/OECD/OECD_df_famliy_selected.csv')
df_6666

Unnamed: 0,STRUCTURE,STRUCTURE_ID,STRUCTURE_NAME,ACTION,COU,Country,SEX,Sex,IND,Indicator,...,OBS_VALUE,Observation Value,OBS_STATUS,Observation Status,UNIT_MEASURE,Unit of Measures,UNIT_MULT,Multiplier,BASE_PER,Base reference period
0,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,LVA,Latvia,TOTAL,Total,FAM3,Share of births outside of marriage (% of all ...,...,39.5,,A,,PC,Percentage,0,Units,,
1,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,GRC,Greece,TOTAL,Total,FAM3,Share of births outside of marriage (% of all ...,...,11.1,,A,,PC,Percentage,0,Units,,
2,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,CHL,Chile,TOTAL,Total,FAM3,Share of births outside of marriage (% of all ...,...,74.8,,A,,PC,Percentage,0,Units,,
3,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,NLD,Netherlands,TOTAL,Total,FAM3,Share of births outside of marriage (% of all ...,...,51.9,,A,,PC,Percentage,0,Units,,
4,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,LTU,Lithuania,TOTAL,Total,FAM3,Share of births outside of marriage (% of all ...,...,26.4,,A,,PC,Percentage,0,Units,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
500,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,COL,Colombia,TOTAL,Total,FAM2,Mean age of women at childbirth,...,30.4,,A,,YR,Years,0,Units,,
501,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,OAVG,OECD - Average,TOTAL,Total,FAM2,Mean age of women at childbirth,...,30.5,,A,,YR,Years,0,Units,,
502,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,OAVG,OECD - Average,TOTAL,Total,FAM2,Mean age of women at childbirth,...,30.6,,A,,YR,Years,0,Units,,
503,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,OAVG,OECD - Average,TOTAL,Total,FAM2,Mean age of women at childbirth,...,30.7,,A,,YR,Years,0,Units,,
