In [860]:
import pandas as pd
import os, re
from pathlib import Path
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from sqlalchemy import create_engine, types
from sqlalchemy import text 
from openpyxl import load_workbook
from pathlib import Path

In [861]:
from dotenv import dotenv_values

config = dotenv_values()

# define variables for the login
pg_user = config['POSTGRES_USER']  # align the key label with your .env file !
pg_host = config['POSTGRES_HOST']
pg_port = config['POSTGRES_PORT']
pg_db = config['POSTGRES_DB']
pg_schema = config['POSTGRES_SCHEMA']
pg_pass = config['POSTGRES_PASS']

In [862]:
url = f'postgresql://{pg_user}:{pg_pass}@{pg_host}:{pg_port}/{pg_db}'
engine = create_engine(url, echo=False)

In [863]:
my_schema = 'team_5' # update it to your schema

with engine.begin() as conn: 
    result = conn.execute(text(f'SET search_path TO {my_schema};'))

In [864]:
df_1= pd.read_csv('../data/Raw/World_Marriage_Dataset.csv')

In [865]:
df_1.drop(columns=["Sr.No."], inplace=True)

In [866]:
df_1.columns = df_1.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [867]:
df_1.rename(columns={
    "agegroup": "age_group",
    "maritalstatus": "marital_status",
    "dataprocess": "data_process",
    "Data Collection (Start Year)": "data_collection_start_year",
    "Data Collection (End Year)": "data_collection_end_year",
    "Data Source": "data_source",
    "Country": "country",
    "Sex": "sex"
}, inplace=True)

In [868]:
df_1.drop_duplicates(inplace=True)
df_1.dropna(inplace=True)

df_1['data_collection_start_year'] = df_1['data_collection_start_year'].astype(str).str.replace(',', '').astype(int)
df_1['data_collection_end_year'] = df_1['data_collection_end_year'].astype(str).str.replace(',', '').astype(int)

In [869]:
df_info = pd.DataFrame({
    'datatypes': df_1.dtypes,
    'null_count': df_1.isnull().sum(),
    'unique_count': df_1.nunique()
})
print(df_info)

                           datatypes  null_count  unique_count
country                       object           0           235
age_group                     object           0            63
sex                           object           0             2
marital_status                object           0            35
data_process                  object           0             6
data_collection_start_year     int32           0            62
data_collection_end_year       int32           0            60
data_source                   object           0            15


In [870]:
#df_1.to_csv("cleaned_world_marriage.csv", index=False)

In [871]:
#df_1.to_sql('world_marriage', engine, if_exists='replace', index=False)

In [872]:
s_1 = ('../data/Cleaned/cleaned_world_marriage.csv')

In [873]:
AGE_COL = "age_group"

# 0) Ensure s_1 is a DataFrame
if isinstance(s_1, (str, Path)):
    s_1 = pd.read_csv(s_1)
elif isinstance(s_1, tuple) and len(s_1) == 1 and isinstance(s_1[0], (str, Path)):
    s_1 = pd.read_csv(s_1[0])

# 1) Normalize age labels; keep [+75]; map any 65+ to [65-69]

def norm_age(x):
    if pd.isna(x): return x
    x = str(x).replace("–","-").replace("—","-").replace("to","-").replace("_","-")
    x = re.sub(r"[()]", "", x)
    x = re.sub(r"\s+", "", x)
    if re.search(r"\[\+75\]|\[75\+\]|75\+|\+75", x):   # preserve 75+
        return "[+75]"
    if re.fullmatch(r"\+?65\+?|\[?\+65\]?|\[?65\+\]?", x, flags=re.I):  # merge 65+
        return "[65-69]"
    m = re.match(r"^\[?(\d{1,3})-(\d{1,3})\]?$", x)   # standard ranges
    if m:
        a, b = map(int, m.groups())
        return f"[{a}-{b}]"
    m = re.search(r"(\d{1,3})", x)                    # fallback: first number
    return f"[{m.group(1)}]" if m else x

s_1[AGE_COL] = s_1[AGE_COL].astype(str).map(norm_age)

# 2) Keep top-14 age buckets by frequency (delete all others)
top14 = s_1[AGE_COL].value_counts(dropna=False).nlargest(14).index.tolist()
s_1 = s_1[s_1[AGE_COL].isin(top14)].copy()

# 3) Natural ordering (put [+75] last)
def start_num(lbl):
    return 10**9 if lbl == "[+75]" else int(re.search(r"\d+", str(lbl)).group())
cats = sorted(top14, key=start_num)
s_1[AGE_COL] = pd.Categorical(s_1[AGE_COL], categories=cats, ordered=True)
s_1 = s_1.sort_values(AGE_COL).reset_index(drop=True)

print("Kept age buckets (14):", list(s_1[AGE_COL].cat.categories))

Kept age buckets (14): ['[10-14]', '[15-19]', '[20-24]', '[25-29]', '[30-34]', '[35-39]', '[40-44]', '[45-49]', '[50-54]', '[55-59]', '[60-64]', '[65-69]', '[70-74]', '[+75]']


In [874]:
df_info = pd.DataFrame({
    'datatypes': s_1.dtypes,
    'null_count': s_1.isnull().sum(),
    'unique_count': s_1.nunique()
})
print(df_info)

                           datatypes  null_count  unique_count
country                       object           0           235
age_group                   category           0            14
sex                           object           0             2
marital_status                object           0            35
data_process                  object           0             6
data_collection_start_year     int64           0            62
data_collection_end_year       int64           0            60
data_source                   object           0            15


In [875]:
#s_1.to_csv('../data/Prep/prep_world_marriage.csv', index= False)

In [876]:
#s_1.to_sql('prep_world_marriage', engine, if_exists='replace', index=False)

In [877]:
df_2 = pd.read_csv('../data/Raw/age-at-marriage-women.csv')

In [878]:
df_2.columns = df_2.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [879]:
df_2 = df_2.drop(columns=['1005564annotations'])

df_2.rename(columns={
    "entity": "country",   
}, inplace=True)

In [880]:
df_2.drop_duplicates(inplace=True)
df_2.dropna(inplace=True)


In [881]:
df_2['year'] = df_2['year'].astype(str).str.replace(',', '').astype(int)

In [882]:
df_info = pd.DataFrame({
    'datatypes': df_2.dtypes,
    'null_count': df_2.isnull().sum(),
    'unique_count': df_2.nunique()
})
print(df_info)

                                    datatypes  null_count  unique_count
country                                object           0            41
code                                   object           0            41
year                                    int32           0            32
mean_age_of_women_at_first_marriage   float64           0           179


In [883]:
#df_2.to_csv("cleaned_age_at_marriage_women.csv", index=False)

In [884]:
#df_2.to_sql('age_at_marriage_women', engine, if_exists='replace', index=False)

In [885]:
df_3= pd.read_csv('../data/Raw/marriage-rate-per-1000-inhabitants.csv')

In [886]:
df_3.columns = df_3.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [887]:
df_3.rename(columns={
    "entity": "country",   
}, inplace=True)

In [888]:
df_3['year'] = df_3['year'].astype(str).str.replace(',', '').astype(int)

In [889]:
df_3.drop_duplicates(inplace=True)
df_3.dropna(inplace=True)


In [890]:
df_info = pd.DataFrame({
    'datatypes': df_3.dtypes,
    'null_count': df_3.isnull().sum(),
    'unique_count': df_3.nunique()
})
print(df_info)

                                              datatypes  null_count  \
country                                          object           0   
code                                             object           0   
year                                              int32           0   
crude_marriage_rate_marriages_per_1000_people   float64           0   

                                               unique_count  
country                                                  45  
code                                                     45  
year                                                    127  
crude_marriage_rate_marriages_per_1000_people           109  


In [891]:
#df_3.to_csv("cleaned_marriage-rate-per-1000-inhabitants.csv", index=False)

In [892]:
#df_3.to_sql('married_rate_per_1000', engine, if_exists='replace', index=False)

In [893]:
df_4= pd.read_csv('../data/Raw/marriage-rates-in-1990-vs-2020.csv')

In [894]:
df_4.columns = df_4.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [895]:
df_4 = df_4.drop(columns=['worldregionsaccordingtoowid'])

df_4.rename(columns={
    "crudemarriageratemarriagesper1000people": "crude_marriage_rate",
    "crudemarriageratemarriagesper1000people1": "crude_marriage_rate_people1",
    "year1": "year_1",
    "entity": "country"
}, inplace=True)

In [896]:
df_4.drop_duplicates(inplace=True)
df_4.dropna(inplace=True)

In [897]:
df_4['year_1'] = pd.to_numeric(df_4['year_1'], errors='coerce').astype('Int64')

In [898]:
df_info = pd.DataFrame({
    'datatypes': df_4.dtypes,
    'null_count': df_4.isnull().sum(),
    'unique_count': df_4.nunique()
})
print(df_info)

                            datatypes  null_count  unique_count
country                        object           0            38
code                           object           0            38
year                            int64           0            61
crude_marriage_rate           float64           0           101
crude_marriage_rate_people1   float64           0            28
year_1                          Int64           0             1


In [899]:
#df_4.to_csv("cleaned_marriage-rates-in-1990-vs-2020.csv", index=False)

In [900]:
#df_4.to_sql('marriage_rates_in_1990_vs_2020', engine, if_exists='replace', index=False)

In [901]:
df_5 = pd.read_csv('../data/Raw/share-of-births-outside-marriage.csv')

In [902]:
df_5.columns = df_5.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [903]:

df_5.rename(columns={
    "shareofbirthsoutsideofmarriageofallbirths": "share_of_births_outside_of_marriage",
    "entity": "country"
}, inplace=True)

df_5.drop_duplicates(inplace=True)
df_5.dropna(inplace=True)

In [904]:
df_info = pd.DataFrame({
    'datatypes': df_5.dtypes,
    'null_count': df_5.isnull().sum(),
    'unique_count': df_5.nunique()
})
print(df_info)

                                    datatypes  null_count  unique_count
country                                object           0            42
code                                   object           0            42
year                                    int64           0            62
share_of_births_outside_of_marriage   float64           0           610


In [905]:
#df_5.to_csv("cleaned_share-of-births-outside-marriage.csv", index=False)

In [906]:
#df_5.to_sql('share_of_births_outside_marriage', engine, if_exists='replace', index=False)

In [907]:
df_6 = pd.read_csv('../data/Raw/share-of-men-in-england-and-wales-who-have-ever-married-by-age.csv')
df_6

Unnamed: 0,Entity,Code,Year,Proportions of men or women who had ever married by a certain age for 1900 birth cohort,Proportions of men or women who had ever married by a certain age for 1920 birth cohort,Proportions of men or women who had ever married by a certain age for 1940 birth cohort,Proportions of men or women who had ever married by a certain age for 1960 birth cohort,Proportions of men or women who had ever married by a certain age for 1970 birth cohort,Proportions of men or women who had ever married by a certain age for 1980 birth cohort,Proportions of men or women who had ever married by a certain age for 1990 birth cohort,Proportions of men or women who had ever married by a certain age for 2000 birth cohort
0,Men,,17,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0
1,Men,,18,0.1,0.1,0.4,0.6,0.1,0.0,0.0,0.0
2,Men,,19,0.8,0.6,2.0,2.5,0.7,0.3,0.1,0.0
3,Men,,20,2.4,2.2,6.0,6.2,1.9,0.7,0.3,0.1
4,Men,,21,6.1,7.4,13.6,11.9,3.9,1.4,0.6,0.2
...,...,...,...,...,...,...,...,...,...,...,...
63,Women,,46,84.5,91.6,95.5,86.9,75.0,,,
64,Women,,47,84.8,91.7,95.6,87.0,75.4,,,
65,Women,,48,85.0,91.8,95.6,87.2,75.7,,,
66,Women,,49,85.2,91.9,95.7,87.3,76.0,,,


In [908]:
df_6.columns = df_6.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

df_6.drop_duplicates(inplace=True)

df_6.head()

Unnamed: 0,entity,code,year,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort
0,Men,,17,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0
1,Men,,18,0.1,0.1,0.4,0.6,0.1,0.0,0.0,0.0
2,Men,,19,0.8,0.6,2.0,2.5,0.7,0.3,0.1,0.0
3,Men,,20,2.4,2.2,6.0,6.2,1.9,0.7,0.3,0.1
4,Men,,21,6.1,7.4,13.6,11.9,3.9,1.4,0.6,0.2


In [909]:
df_6 = df_6.drop(columns=['code','proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort'])

df_6.rename(columns={
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort": "1900_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort": "1920_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort": "1940_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort": "1960_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort": "1970_birthcohort",
    "entity": "sex"
}, inplace=True)
df_6

Unnamed: 0,sex,year,1900_birthcohort,1920_birthcohort,1940_birthcohort,1960_birthcohort,1970_birthcohort
0,Men,17,0.0,0.0,0.0,0.1,0.0
1,Men,18,0.1,0.1,0.4,0.6,0.1
2,Men,19,0.8,0.6,2.0,2.5,0.7
3,Men,20,2.4,2.2,6.0,6.2,1.9
4,Men,21,6.1,7.4,13.6,11.9,3.9
...,...,...,...,...,...,...,...
63,Women,46,84.5,91.6,95.5,86.9,75.0
64,Women,47,84.8,91.7,95.6,87.0,75.4
65,Women,48,85.0,91.8,95.6,87.2,75.7
66,Women,49,85.2,91.9,95.7,87.3,76.0


In [910]:
df_6.dropna(inplace=True)
df_6.describe

<bound method NDFrame.describe of       sex  year  1900_birthcohort  1920_birthcohort  1940_birthcohort  \
0     Men    17               0.0               0.0               0.0   
1     Men    18               0.1               0.1               0.4   
2     Men    19               0.8               0.6               2.0   
3     Men    20               2.4               2.2               6.0   
4     Men    21               6.1               7.4              13.6   
..    ...   ...               ...               ...               ...   
63  Women    46              84.5              91.6              95.5   
64  Women    47              84.8              91.7              95.6   
65  Women    48              85.0              91.8              95.6   
66  Women    49              85.2              91.9              95.7   
67  Women    50              85.4              92.0              95.7   

    1960_birthcohort  1970_birthcohort  
0                0.1               0.0  
1      

In [911]:
df_info = pd.DataFrame({
    'datatypes': df_6.dtypes,
    'null_count': df_6.isnull().sum(),
    'unique_count': df_6.nunique()
})
print(df_info)

                 datatypes  null_count  unique_count
sex                 object           0             2
year                 int64           0            34
1900_birthcohort   float64           0            66
1920_birthcohort   float64           0            61
1940_birthcohort   float64           0            62
1960_birthcohort   float64           0            67
1970_birthcohort   float64           0            65


In [912]:
df_6.sample(12)

Unnamed: 0,sex,year,1900_birthcohort,1920_birthcohort,1940_birthcohort,1960_birthcohort,1970_birthcohort
17,Men,34,83.2,85.5,88.5,72.4,53.7
56,Women,39,81.8,90.2,94.8,85.0,71.3
37,Women,20,6.8,13.3,27.0,21.5,7.6
67,Women,50,85.4,92.0,95.7,87.5,76.3
31,Men,48,92.5,91.7,92.3,81.3,69.9
32,Men,49,92.7,91.8,92.3,81.5,70.3
52,Women,35,79.0,88.3,93.8,82.8,66.7
40,Women,23,32.7,49.5,68.2,48.4,24.0
45,Women,28,65.0,77.4,88.4,72.0,48.8
54,Women,37,80.6,89.4,94.4,84.1,69.4


In [913]:
#df_6.to_csv("cleaned_share-of-men-in-england-and-wales-who-have-ever-married-by-age.csv", index=False)

In [914]:
#df_6.to_sql('men_in_england_and_wales_married_by_age', engine, if_exists='replace', index=False)

In [915]:
df_7 = pd.read_csv('../data/Raw/share-of-births-outside-marriage.csv')

In [916]:
df_7.columns = df_7.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [917]:
df_7.rename(columns={
    "shareofsingleparenthouseholds": "share_of_single_parent_households",
    "entity": "country"
}, inplace=True)

df_7.drop_duplicates(inplace=True)
df_7.dropna(inplace=True)
df_7.sample(5)

Unnamed: 0,country,code,year,shareofbirthsoutsideofmarriageofallbirths
1130,Latvia,LVA,2001,42.1
18,Australia,AUS,1993,24.9
1471,New Zealand,NZL,2018,48.2
1290,Malta,MLT,1988,1.7
1592,Portugal,PRT,1981,9.5


In [918]:
df_info = pd.DataFrame({
    'datatypes': df_7.dtypes,
    'null_count': df_7.isnull().sum(),
    'unique_count': df_7.nunique()
})
print(df_info)

                                          datatypes  null_count  unique_count
country                                      object           0            42
code                                         object           0            42
year                                          int64           0            62
shareofbirthsoutsideofmarriageofallbirths   float64           0           610


In [919]:
#df_7.to_csv("cleaned_share-of-single-parent-households.csv", index=False)

In [920]:
#df_7.to_sql('single_parent_households', engine, if_exists='replace', index=False)

In [921]:
df_8 = pd.read_csv('../data/Raw/share-of-women-in-england-and-wales-who-have-ever-married-by-age.csv')

In [922]:
df_8.columns = df_8.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [923]:
df_8['code'] = df_8['code'].fillna('GBR')
df_8.sample(5)

Unnamed: 0,entity,code,year,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort
37,Women,GBR,20,6.8,13.3,27.0,21.5,7.6,2.8,0.9,0.4
66,Women,GBR,49,85.2,91.9,95.7,87.3,76.0,,,
9,Men,GBR,26,48.3,51.0,67.7,46.8,24.1,11.2,7.1,
11,Men,GBR,28,62.7,66.3,77.7,56.8,33.1,17.8,12.9,
6,Men,GBR,23,21.4,26.8,38.1,26.2,10.5,3.9,2.1,0.8


In [924]:
df_8 = df_8.drop(columns=['code','proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort'])

df_8.rename(columns={
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort": "1900_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort": "1920_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort": "1940_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort": "1960_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort": "1970_birthcohort",
    "entity": "sex"
}, inplace=True)

df_8.drop_duplicates(inplace=True)
df_8.dropna(inplace=True)
df_8.sample(5)

Unnamed: 0,sex,year,1900_birthcohort,1920_birthcohort,1940_birthcohort,1960_birthcohort,1970_birthcohort
46,Women,29,68.7,80.4,89.8,74.5,52.4
31,Men,48,92.5,91.7,92.3,81.3,69.9
62,Women,45,84.2,91.4,95.4,86.7,74.5
63,Women,46,84.5,91.6,95.5,86.9,75.0
58,Women,41,82.9,90.7,95.1,85.7,72.7


In [925]:
df_info = pd.DataFrame({
    'datatypes': df_8.dtypes,
    'null_count': df_8.isnull().sum(),
    'unique_count': df_8.nunique()
})
print(df_info)

                 datatypes  null_count  unique_count
sex                 object           0             2
year                 int64           0            34
1900_birthcohort   float64           0            66
1920_birthcohort   float64           0            61
1940_birthcohort   float64           0            62
1960_birthcohort   float64           0            67
1970_birthcohort   float64           0            65


In [926]:
#df_8.to_csv("cleaned_share-of-women-in-england-and-wales-who-have-ever-married-by-age.csv", index=False)

In [927]:
#df_8.to_sql('women_in_england_and_wales_married_by_age', engine, if_exists='replace', index=False)

In [928]:
#pip install openpyxl pywin32

In [929]:
df_excel_1 = pd.read_excel('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx')

In [930]:
#all_sheets = pd.read_excel('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx', sheet_name=None)

In [931]:
xls_1 = pd.ExcelFile('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx')
print(xls_1.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'MARITAL_STATUS_BY_AGE', 'CURRENTLY MARRIED', 'EVER_MARRIED', 'SMAM']


In [932]:
excel_1 = '../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx'

# Output directory (make sure it exists)
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)

# List of sheets you want to extract
sheets_to_extract = ['MARITAL_STATUS_BY_AGE', 'CURRENTLY MARRIED', 'EVER_MARRIED', 'SMAM']

In [933]:
"""for sheet in sheets_to_extract:
    # Read just this sheet into a DataFrame
    df_excel_1 = pd.read_excel(excel_1, sheet_name=sheet)
    
    # Optional: Clean the filename (replace spaces with underscores, etc.)
    csv_name = sheet.replace(' ', '_').lower() + '.csv'
    csv_path = os.path.join(output_dir, csv_name)
    
    # Save the DataFrame as CSV
    df_excel_1.to_csv(csv_path, index=False)
    print(f"Saved: {csv_path}")
"""

'for sheet in sheets_to_extract:\n    # Read just this sheet into a DataFrame\n    df_excel_1 = pd.read_excel(excel_1, sheet_name=sheet)\n    \n    # Optional: Clean the filename (replace spaces with underscores, etc.)\n    csv_name = sheet.replace(\' \', \'_\').lower() + \'.csv\'\n    csv_path = os.path.join(output_dir, csv_name)\n    \n    # Save the DataFrame as CSV\n    df_excel_1.to_csv(csv_path, index=False)\n    print(f"Saved: {csv_path}")\n'

In [934]:
xls_2 = pd.ExcelFile('../data/Raw/undesa_pd_2019_world_fertility_dataset.xlsx')
print(xls_2.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'FERTILITY INDICATORS']


In [935]:
excel_2 = '../data/Raw/undesa_pd_2019_world_fertility_dataset.xlsx'
sheet_name = 'FERTILITY INDICATORS'
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)

df_excel_2 = pd.read_excel(excel_2, sheet_name=sheet_name)


In [936]:
"""csv_name = sheet_name.replace(' ', '_').lower() + '.csv'
csv_path = os.path.join(output_dir, csv_name)
df_excel_2.to_csv(csv_path, index=False)
print(f"Saved: {csv_path}")
"""

'csv_name = sheet_name.replace(\' \', \'_\').lower() + \'.csv\'\ncsv_path = os.path.join(output_dir, csv_name)\ndf_excel_2.to_csv(csv_path, index=False)\nprint(f"Saved: {csv_path}")\n'

In [937]:
xls_3 = pd.ExcelFile('../data/Raw/undesa_pd_ds_1970-2030_fp_rev-2024_rev.xlsx')
print(xls_3.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'Countries', 'Regions']


In [938]:
excel_3 = '../data/Raw/undesa_pd_ds_1970-2030_fp_rev-2024_rev.xlsx'
sheets_to_extract = ['Countries', 'Regions']
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)


In [939]:
"""
for sheet in sheets_to_extract:
    df = pd.read_excel(excel_3, sheet_name=sheet)
    csv_name = sheet.replace(' ', '_').lower() + '.csv'
    csv_path = os.path.join(output_dir, csv_name)
    df.to_csv(csv_path, index=False)
    print(f"Saved: {csv_path}")

"""

'\nfor sheet in sheets_to_extract:\n    df = pd.read_excel(excel_3, sheet_name=sheet)\n    csv_name = sheet.replace(\' \', \'_\').lower() + \'.csv\'\n    csv_path = os.path.join(output_dir, csv_name)\n    df.to_csv(csv_path, index=False)\n    print(f"Saved: {csv_path}")\n\n'

In [940]:
df_9 = pd.read_csv('../data/Raw/unpopulation_dataportal_20250728095844.csv')
df_9.sample(5)

Unnamed: 0,IndicatorId,IndicatorName,IndicatorShortName,Source,SourceYear,Author,LocationId,Location,Iso2,Iso3,...,AgeStart,AgeEnd,Age,CategoryId,Category,EstimateTypeId,EstimateType,EstimateMethodId,EstimateMethod,Value
2426,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,76,Brazil,BR,BRA,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,59.78
16334,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,562,Niger,NE,NER,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,83.14
9277,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,320,Guatemala,GT,GTM,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,56.5
3941,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,140,Central African Republic,CF,CAF,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,72.17
13352,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,454,Malawi,MW,MWI,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,72.93


In [941]:
df_9.columns = df_9.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_9.sample(5)

Unnamed: 0,indicatorid,indicatorname,indicatorshortname,source,sourceyear,author,locationid,location,iso2,iso3,...,agestart,ageend,age,categoryid,category,estimatetypeid,estimatetype,estimatemethodid,estimatemethod,value
10168,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,356,India,IN,IND,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,73.19
7148,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,238,Falkland Islands (Malvinas),FK,FLK,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,47.71
20348,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,703,Slovakia,SK,SVK,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,54.22
22222,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,768,Togo,TG,TGO,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,67.41
4847,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,175,Mayotte,YT,MYT,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,51.63


In [942]:
df_9 = df_9.drop(columns=['indicatorid','indicatorshortname',
    'source',
    'author', 'locationid', 'iso2','estimatetypeid','category','categoryid','agestart','ageend','author','ageid', 'estimatetype','variantid','sexid','timeid'])

df_9.rename(columns={
    "sourceyear": "year",
    "location": "country",
    "estimatemethodid": "estimate_method",
    "iso3": "code",
}, inplace=True)



In [943]:
df_9.drop_duplicates(inplace=True)
df_9.dropna(inplace = True)

df_9

Unnamed: 0,indicatorname,year,country,code,time,variant,sex,age,estimate_method,estimatemethod,value
0,Currently married (Percent),2024,Afghanistan,AFG,1970,Median,Female,15-49,2,Interpolation,80.94
2,Currently married (Percent),2024,Afghanistan,AFG,1971,Median,Female,15-49,2,Interpolation,80.90
4,Currently married (Percent),2024,Afghanistan,AFG,1972,Median,Female,15-49,2,Interpolation,80.87
6,Currently married (Percent),2024,Afghanistan,AFG,1973,Median,Female,15-49,2,Interpolation,80.84
8,Currently married (Percent),2024,Afghanistan,AFG,1974,Median,Female,15-49,2,Interpolation,80.53
...,...,...,...,...,...,...,...,...,...,...,...
25078,Currently married (Percent),2024,Zambia,ZMB,2021,Median,Female,15-49,3,Projection,54.31
25080,Currently married (Percent),2024,Zambia,ZMB,2022,Median,Female,15-49,3,Projection,53.82
25082,Currently married (Percent),2024,Zambia,ZMB,2023,Median,Female,15-49,3,Projection,53.35
25084,Currently married (Percent),2024,Zambia,ZMB,2024,Median,Female,15-49,3,Projection,52.91


In [944]:
df_info = pd.DataFrame({
    'datatypes': df_9.dtypes,
    'null_count': df_9.isnull().sum(),
    'unique_count': df_9.nunique()
})
print(df_info)

                datatypes  null_count  unique_count
indicatorname      object           0             1
year                int64           0             1
country            object           0           224
code               object           0           224
time                int64           0            56
variant            object           0             1
sex                object           0             1
age                object           0             1
estimate_method     int64           0             2
estimatemethod     object           0             2
value             float64           0          3867


In [945]:
#df_9.to_csv("cleaned_unpopulation_dataportal.csv", index=False)

In [946]:
#df_9.to_sql('unpopulation_dataportal', engine, if_exists='replace', index=False)

In [947]:
df_10 = pd.read_csv('../data/processed/countries_un.csv',  header=5, low_memory=False)

In [948]:
df_10.columns = (
    df_10.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
)
df_10.sample(10)

Unnamed: 0,countryorarea,isocode,indicator,year,agegroup,percentage,number,dataprocess
136073,Uganda,800,Married or in-union women,2050,20-24,38.86858,1491.353203,Projection
125683,Swaziland,748,Married or in-union women,2047,30-34,56.519441,32.949704,Projection
27999,Mayotte,175,Married or in-union women,1986,15-49,52.018778,8.000228,Estimate
18022,Myanmar,104,Married or in-union women,2035,45-49,71.236833,1405.29755,Projection
139895,United States of America,840,Married or in-union women,2041,15-49,52.533451,41578.134847,Projection
122230,Spain,724,Married or in-union women,2020,45-49,72.523,1397.232469,Estimate
60010,Indonesia,360,Married or in-union women,2019,25-29,83.006566,8621.901585,Estimate
113590,Sao Tome and Principe,678,Married or in-union women,1993,45-49,68.41319,1.287878,Estimate
139970,United States Virgin Islands,850,Married or in-union women,1970,25-29,39.49,1.281451,Estimate
61070,Iraq,368,Married or in-union women,1989,45-49,82.462414,237.389086,Estimate


In [949]:
df_10.rename(columns={
    "dataprocess": "data_process",
    "countryorarea": "country"
}, inplace=True)

df_10.drop_duplicates(inplace=True)
df_10.sample(5)

Unnamed: 0,country,isocode,indicator,year,agegroup,percentage,number,data_process
100266,Panama,591,Married or in-union women,2029,25-29,56.434706,99.849643,Projection
52405,Grenada,308,Married or in-union women,2040,40-44,67.231045,2.844546,Projection
15,Afghanistan,4,Married or in-union women,1971,15-49,80.901721,2080.26682,Estimate
62530,Israel,376,Married or in-union women,2010,25-29,56.298558,153.699005,Estimate
19417,Belarus,112,Married or in-union women,2048,20-24,28.372882,41.008744,Projection


In [950]:
for col in ['percentage', 'number']:
    if col in df_10.columns:
        df_10[col] = (
            df_10[col]
            .astype(str)
            .str.replace(',', '.', regex=False)
            .str.extract(r'([-+]?[0-9]*\.?[0-9]+)', expand=False)
            .astype(float)
            .round(2)
        )

In [951]:
unnamed_cols = [col for col in df_10.columns if 'unnamed' in col.lower()]
df_10.drop(columns=unnamed_cols, inplace=True)

In [952]:
df_10.dropna(inplace=True)

In [953]:
df_info = pd.DataFrame({
    'datatypes': df_10.dtypes,
    'null_count': df_10.isnull().sum(),
    'unique_count': df_10.nunique()
})
print(df_info)

             datatypes  null_count  unique_count
country         object           0           225
isocode          int64           0           225
indicator       object           0             1
year             int64           0            81
agegroup        object           0             8
percentage     float64           0          9667
number         float64           0         65394
data_process    object           0             2


In [954]:
df_10.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145800 entries, 0 to 145799
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   country       145800 non-null  object 
 1   isocode       145800 non-null  int64  
 2   indicator     145800 non-null  object 
 3   year          145800 non-null  int64  
 4   agegroup      145800 non-null  object 
 5   percentage    145800 non-null  float64
 6   number        145800 non-null  float64
 7   data_process  145800 non-null  object 
dtypes: float64(2), int64(2), object(4)
memory usage: 8.9+ MB


In [955]:
#df_10.to_csv("../data/Cleaned/cleaned_countries_1970_2025_un.csv", index=False)

In [956]:
#df_10.to_sql('countries_1970_2025_un', engine, if_exists='replace', index=False)

In [957]:
df_11 = pd.read_csv('../data/processed/currently_married_un.csv',  header=2, low_memory=False)

In [958]:
df_11.sample(8)

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,AgeGroup,AgeStart,AgeEnd,DataValue,DataProcess,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Data,Note on Country and Population
3039,Belarus,112,1999,1999,Men,[40-44],40,44,81.58,Census,1999 Census,1668,Belarus 1999 Census,UNSD,1.0,,
23578,Ireland,372,1988,1988,Men,[65+],65,999,59.24,Estimate,1988 Estimate,2126,Ireland 1988 Estimate,UNSD,,,
32591,Mozambique,508,2009,2009,Women,[70-74],70,74,25.9,Survey,2009 AIS,4693,Mozambique 2009 HIV-AIDS Indicator Survey,DHS_HH,1.0,,
47861,Switzerland,756,1994,1994,Women,[30-34],30,34,70.28,Estimate,1994 Estimate,2228,Switzerland 1994 Estimate,UNSD,,,
17963,Greenland,304,1970,1970,Men,[10-14],10,14,0.0,Census,1970 Census,2390,Greenland 1970 Census,UNSD,,,
2448,Bahamas,44,1990,1990,Women,[20-24],20,24,24.14,Census,1990 Census,1010,Bahamas 1990 Census,UNSD,1.0,,
19378,Haiti,332,1971,1971,Men,[55-59],55,59,81.82,Census,1971 Census,1656,Haiti 1971 Census,UNSD,1.0,,
19010,Guinea,324,2014,2014,Women,[35-39],35,39,92.5,Census,2014 Census,5254,Guinea 2014 Census,UNSD,1.0,,


In [959]:
df_11.columns = (
    df_11.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
    )
df_11.sample(8)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,agegroup,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteondata,noteoncountryandpopulation
44525,Slovenia,705,2013,2013,Men,[20-24],20,24,1.81,Estimate,2013 Estimate,2218,Slovenia 2013 Estimate,UNSD,,,
41763,San Marino,674,1991,1991,Men,[50-54],50,54,90.52,Estimate,1991 Estimate,2208,San Marino 1991 Estimate,UNSD,,,
34300,Netherlands,528,2016,2016,Men,[65-69],65,69,75.95,Estimate,2016 Estimate,2170,Netherlands 2016 Estimate,UNSD,1.0,,
14560,Finland,246,1982,1982,Women,[55-59],55,59,64.91,Estimate,1982 Estimate,2093,Finland 1982 Estimate,UNSD,,,
17156,Germany,276,2004,2004,Women,[60-64],60,64,70.87,Estimate,2004 Estimate,2102,Germany 2004 Estimate,UNSD,,,
1533,Australia,36,2011,2011,Women,[30-34],30,34,54.95,Census,2011 Census,4857,Australia 2011 Census,UNSD,,Data randomly rounded to protect confidentiali...,"Including population in off-shore, migratory a..."
3864,Bermuda,60,2016,2016,Men,[30-34],30,34,40.26,Census,2016 Census,7266,Bermuda 2016 Census,UNSD,,,
51855,United States of America,840,2010,2010,Men,[30-34],30,34,54.36,Census,2010 Census,4761,United States of America 2010 Census,US Census Bureau,,,


In [960]:
df_11 = df_11.drop(columns = ['datacataloglongname', 'datacatalogid', 'yearstart' , 'yearend', 'noteondata', 'noteoncountryandpopulation', 'including_consensual_unions'])

df_11.rename(columns={
    "agestart": "age_start",
    "countryorarea": "country",
    "datasource": "data_source",
    "datavalue" : "data_value"
}, inplace=True)

df_11.sample(10)

Unnamed: 0,country,isocode,sex,agegroup,age_start,ageend,data_value,dataprocess,datacatalogshortname,data_source
31399,Mexico,484,Women,[75+],75,999,24.44,Census,1995 Sample Census,IPUMS
53171,Zambia,894,Men,[35-39],35,39,87.68,Survey,1999 MICS_HH,MICS_HH
26229,Kenya,404,Women,[35-39],35,39,79.3,Survey,2003 DHS,DHS_STATcompiler
42658,Senegal,686,Women,[25-29],25,29,74.96,Census,2002 Census,UNSD
29512,Luxembourg,442,Men,[55-59],55,59,83.12,Estimate,1987 Estimate,UNSD
31601,Mexico,484,Women,[70-74],70,74,44.29,Estimate,2016 Estimate,UNSD
27352,Latvia,428,Men,[35-39],35,39,64.64,Census,2011 Census,Eurostat
45281,South Sudan,728,Women,[75+],75,999,61.86,Census,2008 Census,UNSD
1568,Australia,36,Women,[75+],75,999,34.45,Census,2016 Census,National statistics
50732,Uganda,800,Men,[45-49],45,49,89.2,Survey,2016 DHS,DHS_STATcompiler


In [961]:
df_11.drop_duplicates(inplace=True)
df_11.dropna(inplace=True)

In [962]:
df_info = pd.DataFrame({
    'datatypes': df_11.dtypes,
    'null_count': df_11.isnull().sum(),
    'unique_count': df_11.nunique()
})
print(df_info)

                     datatypes  null_count  unique_count
country                 object           0           233
isocode                  int64           0           230
sex                     object           0             2
agegroup                object           0            23
age_start                int64           0            17
ageend                   int64           0            15
data_value             float64           0          9213
dataprocess             object           0             6
datacatalogshortname    object           0           412
data_source             object           0            15


In [963]:
#df_11.to_csv("cleaned_currently_married_un.csv", index=False)

In [964]:
#df_11.to_sql('currently_married_un', engine, if_exists='replace', index=False)

In [965]:
df_12 = pd.read_csv('../data/processed/ever_married_un.csv', header= 2, low_memory = False)
df_12.head()

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,AgeGroup,AgeStart,AgeEnd,DataValue,DataProcess,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Data,Note on Country and Population
0,Afghanistan,4,1972,1974,Men,[15-19],15,19,7.7,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
1,Afghanistan,4,1972,1974,Men,[20-24],20,24,32.6,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
2,Afghanistan,4,1972,1974,Men,[25-29],25,29,61.4,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
3,Afghanistan,4,1972,1974,Men,[30-34],30,34,83.0,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
4,Afghanistan,4,1972,1974,Men,[35-39],35,39,91.2,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,


In [966]:
df_12.columns = (
    df_12.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
    )
df_12.sample(8)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,agegroup,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteondata,noteoncountryandpopulation
50313,Switzerland,756,2013,2013,Women,[25-29],25,29,31.96,Estimate,2013 Estimate,2228,Switzerland 2013 Estimate,UNSD,1.0,,
8962,"China, Taiwan Province of China",158,1980,1980,Women,[40-44],40,44,98.77,Census,1980 Census,4418,Taiwan (Province of China) 1980 Census,US Census Bureau,1.0,De jure population.,
29497,Lesotho,426,2011,2011,Men,[30-34],30,34,72.0,Survey,2011 DS,5608,Lesotho 2011 Demographic Survey,National statistics,1.0,Totals may not add up to the sum of the respec...,
50510,Syrian Arab Republic,760,1981,1981,Women,[30-34],30,34,91.79,Census,1981 Census,1160,Syrian Arab Republic 1981 Census,UNSD,,,Including Palestinian refugees.
47462,Spain,724,1986,1986,Women,[75+],75,999,85.91,Estimate,1986 Estimate,2222,Spain 1986 Estimate,UNSD,,,
25544,Israel,376,1984,1984,Women,[50-54],50,54,96.15,Estimate,1984 Estimate,2127,Israel 1984 Estimate,UNSD,,,Including data for East Jerusalem and Israeli ...
33251,Mexico,484,2018,2018,Men,[35-39],35,39,85.22,Estimate,2018 Estimate,2158,Mexico 2018 Estimate,UNSD,1.0,,
46032,Slovakia,703,2015,2015,Men,[70-74],70,74,96.11,Estimate,2015 Estimate,2216,Slovakia 2015 Estimate,UNSD,,,


In [967]:
df_12 = df_12.drop(columns = ['yearstart', 'yearend', 'datacatalogshortname', 'datacatalogid', 'datacataloglongname', 'including_consensual_unions', 'noteondata', 'noteoncountryandpopulation'])

df_12.rename(columns={
    "agestart": "age_start",
    "ageend": "age_end",
    "countryorarea": "country"
}, inplace=True)
df_12.sample(8)

Unnamed: 0,country,isocode,sex,agegroup,age_start,age_end,datavalue,dataprocess,datasource
47066,South Africa,710,Women,[40-44],40,44,78.89,Census,UNSD
31031,Luxembourg,442,Men,[75+],75,999,89.62,Census,UNSD
44928,Senegal,686,Women,[70-74],70,74,99.67,Survey,DHS_HH
28127,Kuwait,414,Men,[30-34],30,34,84.62,Census,UNSD
38018,Norway,578,Women,[25-29],25,29,50.73,Census,UNSD
50862,Thailand,764,Men,[40-44],40,44,84.7,Census,UNSD
39152,Pakistan,586,Men,[25-29],25,29,55.5,Survey,DHS_STATcompiler
10749,Cyprus,196,Men,[70-74],70,74,95.35,Census,UNSD


In [968]:
df_12.dropna(inplace=True)

df_info = pd.DataFrame({
    'datatypes': df_12.dtypes,
    'null_count': df_12.isnull().sum(),
    'unique_count': df_12.nunique()
})
print(df_info)

            datatypes  null_count  unique_count
country        object           0           233
isocode         int64           0           230
sex            object           0             2
agegroup       object           0            23
age_start       int64           0            17
age_end         int64           0            15
datavalue     float64           0          8396
dataprocess    object           0             6
datasource     object           0            15


In [969]:
#df_12.to_csv("cleaned_ever_married_un.csv", index=False)

In [970]:
#df_12.to_sql('ever_married_un', engine, if_exists= 'replace', index= False)

In [971]:
df_13 = pd.read_csv('../data/processed/fertility_indicators_un.csv', header=6, low_memory=False)
df_13.head()

Unnamed: 0,Country or Area,Country or Area Code,Age Group,Indicator,Date,Value,Series,DataType,Data Source Type,Survey Programme,Data Source Inventory ID,Data Source Name,Data Source Name (short),Data Source Start Year,Data Source End Year,Reference,Reference Year
0,Afghanistan,4,[Total],TFR,1964.977051,7.966653,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
1,Afghanistan,4,[Total],TFR,1965.977051,8.212275,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
2,Afghanistan,4,[Total],TFR,1966.977051,8.317603,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
3,Afghanistan,4,[Total],TFR,1967.977051,8.225812,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
4,Afghanistan,4,[Total],TFR,1968.977051,8.068459,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012


In [972]:
df_13.columns = (df_13.columns
        .str.lower()
        .str.strip()
        .str.replace(' ', '')
        .str.replace('(', '')
        .str.replace(')', '')
        .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
        )

df_13.sample(6)

Unnamed: 0,countryorarea,countryorareacode,agegroup,indicator,date,value,series,datatype,datasourcetype,surveyprogramme,datasourceinventoryid,datasourcename,datasourcenameshort,datasourcestartyear,datasourceendyear,reference,referenceyear
14868,China,156,[20-24],ASFR2024,2001.0,145.0,"2001 FPRHS,Birth Histories,China's recent fert...",Birth histories,Survey,Survey,2912,China 2001 National Family Planning and Reprod...,2001 FPRHS,2001,2001,China's-recent-fertility-decline Evidence-from...,2010
59856,Russian Federation,643,[45-49],ASFR4549,1990.5,0.1,NSO.20170421,Direct,Register,VR,545,Vital Registration,Register,1990,1990,Central Statistics Database,2017
13356,Chile,152,[Total],MAC,1985.5,27.2079,"Estimates,Fertility data (Adjusted),HFC-LAFD,2...",Fertility data (adjusted),Estimate,Estimate,2067,All sources of estimates,Estimates,1985,1985,Age-specific fertility rates calculated from o...,1989
26106,France,250,[20-24],ASFR2024,2000.5,64.1,Eurostat.20190531,Official estimates,Estimate,Estimate,2244,All sources of estimates,Estimates,2000,2000,"Eurostat Statistics, Fertility rates by age [d...",2019
75175,United States of America,840,[20-24],ASFR2024,2013.5,80.7,NSO.2017,Direct,Register,VR,552,Vital Registration,Register,2013,2013,NVSR Report,2017
63532,Singapore,702,[15-19],ASFR1519,1988.50274,7.3,"NSO.2019, Direct",Direct,Register,VR,528,Vital Registration,Register,1988,1988,"SingStat Table Builder, M810091-Births And Fer...",2019


In [973]:
df_13 = df_13.drop(columns=['countryorareacode','indicator','datasourceinventoryid','surveyprogramme','series','datasourcename','reference','referenceyear'])

df_13 = df_13.rename(columns={
    "agegroup": "age_group",
    "countryorarea": "country",
    "datatype": "data_type"
})

In [974]:
df_13.head()

Unnamed: 0,country,age_group,date,value,data_type,datasourcetype,datasourcenameshort,datasourcestartyear,datasourceendyear
0,Afghanistan,[Total],1964.977051,7.966653,Reverse survival method,Census,1979 Census,1979,1979
1,Afghanistan,[Total],1965.977051,8.212275,Reverse survival method,Census,1979 Census,1979,1979
2,Afghanistan,[Total],1966.977051,8.317603,Reverse survival method,Census,1979 Census,1979,1979
3,Afghanistan,[Total],1967.977051,8.225812,Reverse survival method,Census,1979 Census,1979,1979
4,Afghanistan,[Total],1968.977051,8.068459,Reverse survival method,Census,1979 Census,1979,1979


In [975]:
df_13['date'] = df_13['date'].astype(int)
df_13['value'] = df_13['value'].round(2)
df_13.sample(12)

Unnamed: 0,country,age_group,date,value,data_type,datasourcetype,datasourcenameshort,datasourcestartyear,datasourceendyear
68286,Switzerland,[45-49],2003,0.32,Official estimates,Estimate,Estimates,2003,2003
10702,Burkina Faso,[20-24],1985,330.0,Direct,Survey,1993 DHS,1992,1993
20445,Dem. Rep. of the Congo,[35-39],1994,221.0,Own-children method,Survey,1995 MICS,1995,1995
64765,Solomon Islands,[25-29],1994,257.0,Birth histories,Survey,2006-2007 DHS,2006,2007
67962,Switzerland,[25-29],1963,177.08,Official estimates,Estimate,Estimates,1963,1963
16567,Colombia,[Total],2005,1.83,Computed rate from DYB,Register,Register,2005,2005
50335,Nepal,[Total],1988,5.43,Own-children method,Survey,1995-1996 LSMS,1995,1996
53771,Norway,[45-49],2005,0.37,Official estimates,Estimate,Estimates,2005,2005
21219,Djibouti,[30-34],2005,210.0,Birth histories,Survey,2012 FHS,2012,2012
1794,Argentina,[20-24],1982,159.99,Direct,Register,Register,1982,1982


In [976]:
df_13.drop_duplicates(inplace=True)
df_13.dropna(inplace=True)

df_info = pd.DataFrame({
    'datatypes': df_13.dtypes,
    'null_count': df_13.isnull().sum(),
    'unique_count': df_13.nunique()
})
print(df_info)

                    datatypes  null_count  unique_count
country                object           0           201
age_group              object           0             8
date                    int32           0            69
value                 float64           0         18752
data_type              object           0            30
datasourcetype         object           0             7
datasourcenameshort    object           0           539
datasourcestartyear     int64           0            69
datasourceendyear      object           0            70


In [977]:
#df_13.to_csv("../data/Cleaned/cleaned_fertility_indicators_un.csv", index=False)

In [978]:
#df_13.to_sql('fertility_indicators_un',engine, if_exists='replace', index=False)

In [979]:
df_14 = pd.read_csv('../data/processed/marital_status_by_age_un.csv', header= 2, low_memory=False)
df_14.head()

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,MaritalStatus,Non-standard_AgeGroups,Series_contains_Non-standard_AgeGroups,AgeGroup,AgeStart,...,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Age groups,Note on Marital Status,Note on Data,Note on Country and Population,Note Other
0,Afghanistan,4,1972,1974,Men,Divorced,,,[15-19],15,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
1,Afghanistan,4,1972,1974,Men,Divorced,,,[20-24],20,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
2,Afghanistan,4,1972,1974,Men,Divorced,,,[25-29],25,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
3,Afghanistan,4,1972,1974,Men,Divorced,,,[30-34],30,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
4,Afghanistan,4,1972,1974,Men,Divorced,,,[35-39],35,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,


In [980]:
df_14.columns= (df_14.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '' , regex=True)  
    )
df_14.sample(5)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,maritalstatus,nonstandard_agegroups,series_contains_nonstandard_agegroups,agegroup,agestart,...,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteonagegroups,noteonmaritalstatus,noteondata,noteoncountryandpopulation,noteother
180753,Norway,578,1975,1975,Men,Married,,,[20-24],20,...,1975 Estimate,2180,Norway 1975 Estimate,UNSD,,,,,,
194274,Philippines,608,2000,2000,Men,Divorced or Separated,,1.0,[60-64],60,...,2000 Census,1297,Philippines 2000 Census,UNSD,,,,,,
218888,Sierra Leone,694,2000,2000,Women,Widowed,,,[30-34],30,...,2000 MICS_HH,1936,Sierra Leone 2000 Multiple Indicator Cluster S...,MICS_HH,,,,,,
53289,Croatia,191,1991,1991,Women,Single,,,[15-19],15,...,1991 Census,897,Croatia 1991 Census,UNSD,,,,,,
8982,Austria,40,1978,1978,Men,Married,,,[40-44],40,...,1978 Estimate,2038,Austria 1978 Estimate,UNSD,,,,,,


In [981]:
df_14 = df_14.drop(columns=['datacataloglongname', 'noteondata', 'noteoncountryandpopulation','noteonagegroups', 'noteother',
                             'including_consensual_unions','isocode', 'datacatalogid', 'noteonmaritalstatus', 'series_contains_nonstandard_agegroups','nonstandard_agegroups'])

df_14.rename(columns={
    "countryorarea": "country",
    "agegroup": "age_group",
    "maritalstatus": "marital_status",
    "yearstart": "year_start",
    "yearend": "year_end",
    }, inplace =True
    )

df_14.sample(10)

Unnamed: 0,country,year_start,year_end,sex,marital_status,age_group,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datasource
210162,San Marino,1975,1975,Men,Widowed,[55-59],55,59,2.13,Estimate,1975 Estimate,UNSD
114960,Iceland,2016,2016,Women,Widowed,[65-69],65,69,8.88,Estimate,2016 Estimate,UNSD
209628,Samoa,2016,2016,Women,Married,[45-49],45,49,81.3,Census,2016 Census,UNSD
58565,Democratic Republic of the Congo,2001,2001,Women,Never married,[75+],75,999,0.65,Survey,2001 MICS_HH,MICS_HH
176496,Nicaragua,1998,1998,Women,Divorced,[55-59],55,59,2.23,Survey,1997-1998 DHS,DHS_HH
19112,Bermuda,2000,2000,Men,Single,[65-69],65,69,6.16,Census,2000 Census,UNSD
94113,Greenland,1982,1982,Men,Single,[15-19],15,19,100.0,Estimate,1982 Estimate,UNSD
7212,Australia,1996,1996,Women,Widowed,[60-64],60,64,15.73,Census,1996 Census,UNSD
236763,Sweden,2010,2010,Men,Married,[35-39],35,39,43.33,Estimate,2010 Estimate,UNSD
115359,India,1971,1971,Women,Single,[60-64],60,64,0.35,Census,1971 Census,UNSD


In [982]:
df_14.drop_duplicates(inplace=True)
df_14.dropna(inplace=True)

df_info = pd.DataFrame({
    'datatypes': df_14.dtypes,
    'null_count': df_14.isnull().sum(),
    'unique_count': df_14.nunique()
})
print(df_info)

                     datatypes  null_count  unique_count
country                 object           0           235
year_start               int64           0            62
year_end                 int64           0            60
sex                     object           0             2
marital_status          object           0            35
age_group               object           0            63
agestart                 int64           0            21
ageend                   int64           0            20
datavalue              float64           0          9994
dataprocess             object           0             6
datacatalogshortname    object           0           443
datasource              object           0            15


In [983]:
#df_14.to_csv("cleaned_marital_status_by_age_un.csv", index=False)

In [984]:
#df_14.to_sql('marital_status_by_age_un', engine, if_exists='replace', index=False)

In [985]:
df_15 = pd.read_csv('../data/processed/regions_un.csv', header=5, low_memory= False)
df_15.head(10)

Unnamed: 0,Region and subregion,ISO code,Regional Classification,Indicator,Year,AgeGroup,Percentage,Number,DataProcess
0,World,900,M49,Married or in-union women,1970,15-19,22.576683,71867.82,Estimate
1,World,900,M49,Married or in-union women,1970,20-24,63.802057,162860.4,Estimate
2,World,900,M49,Married or in-union women,1970,25-29,87.174827,182681.1,Estimate
3,World,900,M49,Married or in-union women,1970,30-34,90.825027,179121.4,Estimate
4,World,900,M49,Married or in-union women,1970,35-39,90.284386,161526.3,Estimate
5,World,900,M49,Married or in-union women,1970,40-44,86.483531,139334.4,Estimate
6,World,900,M49,Married or in-union women,1970,45-49,82.680237,116088.4,Estimate
7,World,900,M49,Married or in-union women,1970,15-49,69.379111,1013480.0,Estimate
8,World,900,M49,Married or in-union women,1971,15-19,22.630416,74127.62,Estimate
9,World,900,M49,Married or in-union women,1971,20-24,63.613178,170087.3,Estimate


In [986]:
df_15.columns = (df_15.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(','')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
    )
df_15.sample(6)

Unnamed: 0,regionandsubregion,isocode,regionalclassification,indicator,year,agegroup,percentage,number,dataprocess
265,World,900,M49,Married or in-union women,2003,20-24,52.82535,263258.951259,Estimate
14488,Southern Europe,925,SDG-M49,Married or in-union women,1999,15-19,4.11641,186.24222,Estimate
20349,Australia and New Zealand,1834,SDG-M49,Married or in-union women,2026,45-49,67.228452,679.583846,Projection
9246,Central Asia,5500,SDG-M49,Married or in-union women,1991,45-49,80.041644,1211.118913,Estimate
3682,Oceania excluding Australia and New Zealand,543,SDG,Married or in-union women,2025,25-29,77.171306,875.796318,Projection
7099,Northern Africa,912,SDG-M49,Married or in-union women,2047,30-34,81.445827,22757.612416,Projection


In [987]:
df_15 = df_15.drop(columns=['regionalclassification'])

df_15.rename(columns={
    "regionandsubregion": "region",
    "isocode": "iso_code",
    "agegroup": "age_group",
    "dataprocess": "process"
}, inplace=True)

df_15.sample(10)

Unnamed: 0,region,iso_code,indicator,year,age_group,percentage,number,process
10727,Southern Asia,5501,Married or in-union women,2014,15-49,72.364553,689655.575474,Estimate
6769,Northern Africa,912,Married or in-union women,2006,20-24,42.337098,8177.990878,Estimate
18843,Oceania,909,Married or in-union women,1976,30-34,82.041414,713.335737,Estimate
24994,Least developed countries,941,Married or in-union women,2016,25-29,79.922423,32885.225358,Estimate
23273,Developed countries,901,Married or in-union women,2044,20-24,14.497811,4916.764045,Projection
28313,No income group available,1518,Married or in-union women,2026,20-24,36.753227,906.025119,Projection
24613,Other developing countries,934,Married or in-union women,2049,40-44,82.391437,172191.087816,Projection
20357,Australia and New Zealand,1834,Married or in-union women,2027,25-29,48.361719,494.217592,Projection
14653,Southern Europe,925,Married or in-union women,2019,40-44,76.729765,4374.244966,Estimate
10942,Southern Asia,5501,Married or in-union women,2041,45-49,87.342604,138416.985178,Projection


In [988]:
print(df_15['number'] % 1 != 0)

0        True
1        True
2        True
3        True
4        True
         ... 
28507    True
28508    True
28509    True
28510    True
28511    True
Name: number, Length: 28512, dtype: bool


In [989]:
df_15['percentage'] = df_15['percentage'].round(2)
df_15['number'] = df_15['number'].astype(int)
df_15.head(10)

Unnamed: 0,region,iso_code,indicator,year,age_group,percentage,number,process
0,World,900,Married or in-union women,1970,15-19,22.58,71867,Estimate
1,World,900,Married or in-union women,1970,20-24,63.8,162860,Estimate
2,World,900,Married or in-union women,1970,25-29,87.17,182681,Estimate
3,World,900,Married or in-union women,1970,30-34,90.83,179121,Estimate
4,World,900,Married or in-union women,1970,35-39,90.28,161526,Estimate
5,World,900,Married or in-union women,1970,40-44,86.48,139334,Estimate
6,World,900,Married or in-union women,1970,45-49,82.68,116088,Estimate
7,World,900,Married or in-union women,1970,15-49,69.38,1013479,Estimate
8,World,900,Married or in-union women,1971,15-19,22.63,74127,Estimate
9,World,900,Married or in-union women,1971,20-24,63.61,170087,Estimate


In [990]:
df_15.dropna(inplace=True)
df_15.dropna(inplace=True)

df_info = pd.DataFrame({
    'datatypes': df_15.dtypes,
    'null_count': df_15.isnull().sum(),
    'unique_count': df_15.nunique()
})
print(df_info)

           datatypes  null_count  unique_count
region        object           0            43
iso_code       int64           0            44
indicator     object           0             1
year           int64           0            81
age_group     object           0             8
percentage   float64           0          7796
number         int32           0         20311
process       object           0             2


In [991]:
#df_15.to_csv('cleaned_regions_un.csv', index=False)



In [992]:
#df_15.to_sql('regions_un', engine, if_exists='replace',index=False)

In [993]:
df_16_1 = pd.read_csv('../data/Raw/OECD/SF_1_1_Family_size_and_composition_S1.csv')
df_16_1
#Data for Chart SF1.1.A. Average size of households by household type, 2024a
# avg_size_all	avg_size_couple_with_children	avg_size_single_parent_with_children		

Unnamed: 0,Country,All households,Couple households with children,Single parent households with children
0,Mexico,356,408.0,276.0
1,Costa Rica,346,437.0,344.0
2,Türkiye,320,410.0,280.0
3,Israel,319,465.0,286.0
4,Columbia,310,,
5,Slovak Republic,310,380.0,250.0
6,Chile,280,,
7,Iceland,270,412.0,261.0
8,New Zealand,261,388.0,267.0
9,Greece,260,380.0,250.0


In [994]:
df_16_1.columns = df_16_1.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [995]:
df_16_1.rename(columns={
        "All households": "avg_size_all",
        "Couple with children": "avg_size_couple_with_children",
        "Single parent with children": "avg_size_single_parent_with_children"
}, inplace=True)

In [996]:
df_16_1.drop_duplicates(inplace=True)
df_16_1.dropna(inplace=True)

In [997]:
for col in df_16_1.columns:
    if col != 'country':
        # Replace commas with dots if necessary, remove non-numeric chars, convert to float
        df_16_1[col] = (
            df_16_1[col]
            .astype(str)  # ensure string for replace
            .str.replace(',', '.', regex=False)  # decimal commas to dots
            .str.replace(r'[^\d\.\-]', '', regex=True)  # remove non-numeric chars except dot and minus
            .replace('', None)  # empty to NaN
            .astype(float)  # convert to float
        )

# Check updated dtypes
print(df_16_1.dtypes)

country                                    object
all_households                            float64
couple_households_with_children           float64
single_parent_households_with_children    float64
dtype: object


In [998]:
info_16_1 = pd.DataFrame({
    'dtype': df_16_1.dtypes,
    'null_count': df_16_1.isnull().sum(),
    'unique_count': df_16_1.nunique()
})
print(info_16_1)

                                          dtype  null_count  unique_count
country                                  object           0            39
all_households                          float64           0            19
couple_households_with_children         float64           0            16
single_parent_households_with_children  float64           0            15


In [999]:
if "year" not in df_16_1.columns:
    df_16_1["year"] = 2024
df_16_1['source'] = 'OECD'
df_16_1.sample(10)

Unnamed: 0,country,all_households,couple_households_with_children,single_parent_households_with_children,year,source
20,Slovenia,2.4,3.9,2.5,2024,OECD
43,Estonia,1.8,3.8,2.6,2024,OECD
41,Denmark,1.9,3.9,2.5,2024,OECD
0,Mexico,3.56,4.08,2.76,2024,OECD
34,Bulgaria,2.2,3.5,2.3,2024,OECD
42,Finland,1.9,4.0,2.6,2024,OECD
26,United Kingdom,2.3,3.9,2.8,2024,OECD
29,Korea,2.21,3.55,2.34,2024,OECD
37,Sweden,2.1,3.9,2.6,2024,OECD
28,Switzerland,2.21,4.02,2.58,2024,OECD


In [1000]:
df_16_1.to_csv('../data/Cleaned/cleaned_average_size_of_households_type_2024_oecd.csv', index=False)

In [1001]:
df_16_1.to_sql('average_size_of_households_type_2024_oecd', engine, if_exists = 'replace', index= False)

39

In [1002]:
df_16_2 = pd.read_csv('../data/Raw/OECD/SF_1_1_Family_size_and_composition_S2.csv', header=1)
df_16_2.head(10)
#Table SF1.1.A. Types of household, 2021a
# share_couple_total	share_couple_with_children	share_couple_without_children	share_single_parent_total	share_single_mother	share_single_father	share_single_person	share_other_types						

Unnamed: 0,Country,Total,With children,Without children,Total.1,Single mother households,Single father households,Single person households,Other households types
0,Australia,5593,2990,2602,1037,,,2512,858
1,Austria,4893,2113,2780,563,478,085,3834,711
2,Belgium,5222,2398,2824,742,608,135,3550,486
3,Canada,5092,2530,2562,872,,,2935,1102
4,Chile,..,..,..,..,..,..,..,..
5,Columbia,..,..,..,..,..,..,..,..
6,Costa Rica,5244,3815,1429,1055,949,106,1127,2574
7,Czechia,4703,2170,2532,715,611,104,3915,667
8,Denmark,4860,2041,2819,631,511,119,3757,752
9,Estonia,4620,2546,2073,683,609,074,3699,998


In [1003]:
# Normalize column names (lowercase, underscores, remove () and non-ascii)
df_16_2.columns = (
    df_16_2.columns.str.strip().str.lower()
    .str.replace(' ', '_')
    .str.replace('[()%]', '', regex=True)
    .str.replace('[^0-9a-z_]', '', regex=True)
)

In [1004]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_16_2.columns if c != "country"]

df_16_2[num_cols] = (
    df_16_2[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)
df_16_2.sample(10)

Unnamed: 0,country,total,with_children,without_children,total1,single_mother_households,single_father_households,single_person_households,other_households_types
15,Iceland,45.19,25.42,19.77,7.35,6.23,1.12,29.16,18.29
11,France,49.73,22.19,27.54,7.68,6.23,1.45,37.78,4.81
38,OECD-36 average,49.05,,,7.38,,,32.11,11.46
14,Hungary,45.25,20.43,24.82,7.07,5.71,1.37,34.42,13.25
23,Luxembourg,53.06,26.86,26.2,6.63,5.39,1.24,28.87,11.44
16,Ireland,53.03,29.45,23.58,6.93,6.12,0.8,23.14,16.91
42,Malta,46.92,21.11,25.81,5.68,4.56,1.12,32.51,14.89
26,New Zealand,57.33,29.25,28.07,10.39,,,22.79,9.49
25,Netherlands,53.6,23.01,30.59,6.1,5.0,1.09,38.5,1.8
17,Israel,64.5,45.8,18.7,9.1,,,26.4,


In [1005]:
df_16_2.drop(columns=["total", "total1"], errors="ignore", inplace=True)

In [1006]:
df_16_2.drop_duplicates(inplace=True)
df_16_2.dropna(inplace=True)


In [1007]:
df_16_2['unit'] = '%'
df_16_2['source'] = 'OECD'

In [1008]:
if "year" not in df_16_2.columns:
    df_16_2["year"] = 2021
df_16_2.sample(10)

Unnamed: 0,country,with_children,without_children,single_mother_households,single_father_households,single_person_households,other_households_types,unit,source,year
21,Latvia,12.21,15.6,11.21,2.23,41.08,17.68,%,OECD,2021
35,Türkiye,40.84,13.54,7.75,2.31,18.88,16.68,%,OECD,2021
30,Slovak Republic,16.99,20.16,5.39,0.84,31.4,25.21,%,OECD,2021
10,Finland,17.06,28.58,4.5,0.93,45.34,3.6,%,OECD,2021
44,EU average,22.09,25.88,5.53,1.18,34.0,11.32,%,OECD,2021
37,United States,19.85,33.34,5.21,1.58,27.61,12.41,%,OECD,2021
33,Sweden,22.49,26.78,4.91,1.76,39.24,4.82,%,OECD,2021
20,Korea,26.25,17.23,6.85,2.28,35.47,11.93,%,OECD,2021
43,Romania,20.73,24.93,4.56,1.94,33.63,14.21,%,OECD,2021
2,Belgium,23.98,28.24,6.08,1.35,35.5,4.86,%,OECD,2021


In [1009]:
info_16_2 = pd.DataFrame({
    "dtype": df_16_2.dtypes,
    "null_count": df_16_2.isna().sum(),
    "unique_count": df_16_2.nunique()
})
print(info_16_2)
print(df_16_2.dtypes)

                            dtype  null_count  unique_count
country                    object           0            36
with_children             float64           0            35
without_children          float64           0            36
single_mother_households  float64           0            32
single_father_households  float64           0            31
single_person_households  float64           0            35
other_households_types    float64           0            36
unit                       object           0             1
source                     object           0             1
year                        int64           0             1
country                      object
with_children               float64
without_children            float64
single_mother_households    float64
single_father_households    float64
single_person_households    float64
other_households_types      float64
unit                         object
source                       object
year            

In [1010]:
#df_16_2.to_csv('../data/Cleaned/cleaned_types_of_household_2021_oecd.csv', index = False)

In [1011]:
#df_16_2.to_sql('types_of_household_2021_oecd', engine, if_exists = 'replace', index= False)

In [1012]:
df_16_3 = pd.read_csv('../data/Raw/OECD/SF_1_1_Family_size_and_composition_S3.csv', header=1)
df_16_3
#Table SF1.1.B. Households by number of children, 2024
# share_hh_0_children	share_hh_1_child	share_hh_2_children	share_hh_3plus_children		

Unnamed: 0,country,0 children,1 child,2 children,3 or more children,Children under 6
0,Australia,..,..,..,..,..
1,Austria,7778,1052,857,312,944
2,Belgium,7397,1176,1015,411,1040
3,Canada,..,..,..,..,..
4,Chile,..,..,..,..,..
5,Columbia,..,..,..,..,..
6,Costa Rica,3029,2308,2461,2202,2630
7,Czechia,7195,1385,1156,264,1229
8,Denmark,7778,1054,894,274,815
9,Estonia,7576,1253,873,298,985


In [1013]:
# Normalize column names (lowercase, underscores, remove () and non-ascii)
df_16_3.columns = (
    df_16_3.columns.str.strip().str.lower()
    .str.replace(' ', '_')
    .str.replace('[^0-9a-z_]', '', regex=True)
)

In [1014]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_16_3.columns if c != "country"]

df_16_3[num_cols] = (
    df_16_3[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [1015]:
df_16_3.drop_duplicates(inplace=True)
df_16_3.dropna(inplace=True)

In [1016]:
if "year" not in df_16_3.columns:
    df_16_3["year"] = 2024
df_16_3.sample(10)

Unnamed: 0,country,0_children,1_child,2_children,3_or_more_children,children_under_6,year
43,Romania,72.46,14.29,9.24,4.02,9.64,2024
23,Luxembourg,73.0,12.49,12.07,2.41,11.54,2024
16,Ireland,69.02,12.42,12.18,6.38,11.81,2024
41,Cyprus,71.36,13.88,11.67,3.1,12.71,2024
32,Spain,74.61,13.54,8.95,2.9,8.79,2024
22,Lithuania,80.44,11.06,7.0,1.51,8.12,2024
9,Estonia,75.76,12.53,8.73,2.98,9.85,2024
10,Finland,81.98,7.89,6.99,3.14,7.14,2024
11,France,75.36,11.43,9.23,3.99,9.86,2024
33,Sweden,74.84,10.77,9.83,4.56,9.95,2024


In [1017]:
df_16_3.rename(columns={
    "0_children": "households_0_children",
    "1_child": "households_1_child",
    "2_children": "households_2_children",
    "3_or_more_children": "households_3_or_more_children"
}, inplace=True)

In [1018]:
df_16_3["unit"] = "%"
df_16_3["source"] = "OECD"
df_16_3.sample(10)

Unnamed: 0,country,households_0_children,households_1_child,households_2_children,households_3_or_more_children,children_under_6,year,unit,source
7,Czechia,71.95,13.85,11.56,2.64,12.29,2024,%,OECD
25,Netherlands,78.65,8.78,9.27,3.3,8.79,2024,%,OECD
13,Greece,74.31,11.83,9.97,3.89,9.37,2024,%,OECD
40,Croatia,74.18,11.96,10.1,3.76,10.44,2024,%,OECD
10,Finland,81.98,7.89,6.99,3.14,7.14,2024,%,OECD
42,Malta,76.49,12.68,7.81,2.98,9.61,2024,%,OECD
19,Japan,81.94,8.78,7.17,2.11,7.58,2024,%,OECD
31,Slovenia,75.0,11.25,10.2,3.56,9.93,2024,%,OECD
27,Norway,76.87,10.53,9.14,3.47,8.88,2024,%,OECD
22,Lithuania,80.44,11.06,7.0,1.51,8.12,2024,%,OECD


In [1019]:
info_16_3 = pd.DataFrame({
    "dtype": df_16_3.dtypes,
    "null_count": df_16_3.isna().sum(),
    "unique_count": df_16_3.nunique()
})
print(info_16_3)
print(df_16_3.dtypes)

                                 dtype  null_count  unique_count
country                         object           0            33
households_0_children          float64           0            32
households_1_child             float64           0            32
households_2_children          float64           0            33
households_3_or_more_children  float64           0            31
children_under_6               float64           0            31
year                             int64           0             1
unit                            object           0             1
source                          object           0             1
country                           object
households_0_children            float64
households_1_child               float64
households_2_children            float64
households_3_or_more_children    float64
children_under_6                 float64
year                               int64
unit                              object
source                

In [1020]:
#df_16_3.to_csv('../data/Cleaned/cleaned_households_by_number_of_children_2024_oecd.csv', index=False)

In [1021]:
#df_16_3.to_sql('households_by_number_of_children_2024_oecd', engine, index= False)

In [1022]:
df_17_1 = pd.read_csv('../data/Raw/OECD/SF_2_1_Total_Fertility_rates_S1.csv')
#total_fertility_rates_from_1960_oecd
df_17_1.head()

Unnamed: 0,Country,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Australia,345,355,343,334,315,297,289,285,289,...,179,179,179,174,174,167,159,170,163,150
1,Austria,269,278,280,282,279,270,266,262,258,...,146,149,153,152,148,146,144,148,141,132
2,Belgium,254,263,259,268,271,261,252,241,231,...,174,170,168,165,162,160,155,160,153,147
3,Canada,390,384,376,367,350,315,281,260,245,...,161,160,159,155,151,147,141,144,133,126
4,Chile,470,466,460,454,446,436,426,414,403,...,177,174,169,156,154,143,131,118,126,117


In [1023]:
df_info = pd.DataFrame({
    'dtype': df_17_1.dtypes,
    'null_count': df_17_1.isnull().sum(),
    'unique_count': df_17_1.nunique()
})
print(df_info)

          dtype  null_count  unique_count
Country  object           0            49
1960     object           0            47
1961     object           0            47
1962     object           0            47
1963     object           0            46
...         ...         ...           ...
2019     object           0            37
2020     object           0            39
2021     object           0            40
2022     object           0            34
2023     object           0            35

[65 rows x 3 columns]


In [1024]:
df_17_1.columns = df_17_1.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [1025]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_17_1.columns if c != "country"]

df_17_1[num_cols] = (
    df_17_1[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [1026]:
df_17_1.drop_duplicates(inplace=True)
df_17_1.dropna(inplace=True)

In [1027]:
# Identify id and year columns
id_cols = ["country"]
year_cols = [c for c in df_17_1.columns if re.fullmatch(r"\d{4}", str(c))]

# Wide → Long
df_17_1 = df_17_1.melt(
    id_vars=id_cols,
    value_vars=year_cols,
    var_name="year",
    value_name="value"
)

# Fix types
df_17_1["year"] = df_17_1["year"].astype(int)
df_17_1["value"] = pd.to_numeric(df_17_1["value"], errors="coerce")
df_17_1 = df_17_1.dropna(subset=["value"])

# Add metadata
df_17_1["indicator"] = "total_fertility_rate"
df_17_1["source"] = "OECD"

# Final column order
df_17_1 = df_17_1[["country","year","indicator","value","source"]]

df_17_1.sample(10)

Unnamed: 0,country,year,indicator,value,source
1067,China,1981,total_fertility_rate,2.79,OECD
3085,Croatia,2022,total_fertility_rate,1.53,OECD
1638,Lithuania,1993,total_fertility_rate,1.74,OECD
2733,China,2015,total_fertility_rate,1.67,OECD
1762,Croatia,1995,total_fertility_rate,1.5,OECD
1138,France,1983,total_fertility_rate,1.78,OECD
900,Japan,1978,total_fertility_rate,1.79,OECD
2553,Colombia,2012,total_fertility_rate,1.9,OECD
1608,Indonesia,1992,total_fertility_rate,2.95,OECD
1101,Mexico,1982,total_fertility_rate,4.46,OECD


In [1028]:
df_info = pd.DataFrame({
    'dtype': df_17_1.dtypes,
    'null_count': df_17_1.isnull().sum(),
    'unique_count': df_17_1.nunique()
})
print(df_info)

             dtype  null_count  unique_count
country     object           0            49
year         int32           0            64
indicator   object           0             1
value      float64           0           490
source      object           0             1


In [1029]:
#df_17_1.to_csv('../data/Cleaned/cleaned_total_fertility_rates_oecd.csv', index=False)

In [1030]:
#df_17_1.to_sql('total_fertility_rates_oecd', engine, if_exists='replace', index=False)

In [1031]:
df_17_2 = pd.read_csv('../data/Raw/OECD/SF_2_1_Fertility_rates_Births_by_birth_order_S2.csv')
df_17_2

Unnamed: 0,Country,Birth order,1987,1988,1989,1990,1991,1992,1993,1994,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Austria,First birth,476,478,467,462,465,461,452,445,...,480,483,473,475,471,472,477,476,484,481
1,Austria,Second birth,337,337,343,349,345,348,358,364,...,355,353,356,353,353,351,353,355,349,351
2,Austria,Third birth or higher,188,185,190,189,190,191,189,191,...,165,164,171,172,176,177,170,169,167,168
3,Belgium,First birth,468,469,473,473,481,472,469,472,...,423,435,441,436,429,426,450,440,447,455
4,Belgium,Second birth,330,329,327,328,323,328,335,330,...,351,348,345,346,345,347,342,351,343,341
5,Belgium,Third birth or higher,202,202,199,199,196,200,196,198,...,226,218,214,219,226,226,208,209,209,204
6,Czechia,First birth,467,466,474,478,501,498,485,477,...,474,481,487,487,480,478,476,464,463,463
7,Czechia,Second birth,377,376,374,372,355,358,368,369,...,375,373,367,366,372,376,376,390,386,391
8,Czechia,Third birth or higher,156,158,152,150,144,144,148,154,...,151,147,146,147,147,146,148,146,15,146
9,Estonia,First birth,435,435,440,462,495,503,496,496,...,419,423,408,402,367,388,380,372,398,397


In [1032]:
df_info = pd.DataFrame({
    'dtype': df_17_2.dtypes,
    'null_count': df_17_2.isnull().sum(),
    'unique_count': df_17_2.nunique()
})
print(df_info)

              dtype  null_count  unique_count
Country      object           0            17
Birth order  object           0             3
1987         object           0            48
1988         object           0            49
1989         object           0            48
1990         object           0            44
1991         object           0            48
1992         object           0            46
1993         object           0            47
1994         object           0            47
1995         object           0            48
1996         object           0            47
1997         object           0            49
1998         object           0            50
1999         object           0            49
2000         object           0            48
2001         object           0            50
2002         object           0            47
2003         object           0            50
2004         object           0            49
2005         object           0   

In [1033]:
df_17_2.columns = df_17_2.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_17_2.head()

Unnamed: 0,country,birth_order,1987,1988,1989,1990,1991,1992,1993,1994,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Austria,First birth,476,478,467,462,465,461,452,445,...,480,483,473,475,471,472,477,476,484,481
1,Austria,Second birth,337,337,343,349,345,348,358,364,...,355,353,356,353,353,351,353,355,349,351
2,Austria,Third birth or higher,188,185,190,189,190,191,189,191,...,165,164,171,172,176,177,170,169,167,168
3,Belgium,First birth,468,469,473,473,481,472,469,472,...,423,435,441,436,429,426,450,440,447,455
4,Belgium,Second birth,330,329,327,328,323,328,335,330,...,351,348,345,346,345,347,342,351,343,341


In [1034]:
# 0) Detect year columns first (keep as-is)
year_cols = [c for c in df_17_2.columns.astype(str) if re.fullmatch(r"\d{4}", c)]

# 1) Normalize only non-year columns
df_17_2.columns = [
    c.strip().lower().replace(" ", "_").replace("(", "").replace(")", "")
    if c not in year_cols else c
    for c in df_17_2.columns.astype(str)
]

# 2) Wide → Long
df_17_2 = df_17_2.melt(
    id_vars=[c for c in ["country", "birth_order"] if c in df_17_2.columns],
    value_vars=year_cols,
    var_name="year",
    value_name="value"
)

# 3) Types
df_17_2["year"] = df_17_2["year"].astype(int)
df_17_2["value"] = pd.to_numeric(df_17_2["value"].astype(str).str.replace(",", ".", regex=False),
                                 errors="coerce")
df_17_2 = df_17_2.dropna(subset=["value"]).drop_duplicates()

# 4) Mart fields + order
df_17_2["indicator"] = "births_by_birth_order"
df_17_2["unit"] = "%"
df_17_2["source"] = "OECD"
df_17_2 = df_17_2[["country", "year", "indicator", "birth_order", "unit", "value", "source"]]


In [1035]:
df_17_2

Unnamed: 0,country,year,indicator,birth_order,unit,value,source
0,Austria,1987,births_by_birth_order,First birth,%,47.6,OECD
1,Austria,1987,births_by_birth_order,Second birth,%,33.7,OECD
2,Austria,1987,births_by_birth_order,Third birth or higher,%,18.8,OECD
3,Belgium,1987,births_by_birth_order,First birth,%,46.8,OECD
4,Belgium,1987,births_by_birth_order,Second birth,%,33.0,OECD
...,...,...,...,...,...,...,...
1882,Switzerland,2023,births_by_birth_order,Second birth,%,37.1,OECD
1883,Switzerland,2023,births_by_birth_order,Third birth or higher,%,14.4,OECD
1884,Romania,2023,births_by_birth_order,First birth,%,49.9,OECD
1885,Romania,2023,births_by_birth_order,Second birth,%,30.8,OECD


In [1036]:
df_info = pd.DataFrame({
    'dtype': df_17_2.dtypes,
    'null_count': df_17_2.isnull().sum(),
    'unique_count': df_17_2.nunique()
})
print(df_info)

               dtype  null_count  unique_count
country       object           0            17
year           int32           0            37
indicator     object           0             1
birth_order   object           0             3
unit          object           0             1
value        float64           0           422
source        object           0             1


In [1037]:
#df_17_2.to_csv('../data/Cleaned/cleaned_births_by_birth_order_oecd.csv', index=False)

In [1038]:
#df_17_2.to_sql('births_by_birth_order_oecd', engine, if_exists='replace', index=False)

In [1039]:
df_18 = pd.read_csv('../data/Raw/OECD/sf1_2_wide_from_df18.csv')
df_18

Unnamed: 0,country,year,Living with two parents,Living with a single parent,Other
0,Australia,2003,80.1,19.5,0.5
1,Australia,2006,81.5,18.0,0.5
2,Australia,2009,82.0,17.6,0.4
3,Australia,2012,81.3,18.0,0.6
4,Austria,2003,81.2,16.8,2.0
...,...,...,...,...,...
470,United States,2014,68.7,27.5,3.8
471,United States,2015,69.2,26.8,3.9
472,United States,2016,68.7,27.4,3.8
473,United States,2017,68.9,27.1,4.0


In [1040]:
for col in df_18.select_dtypes(include=['object']).columns:
    df_18[col] = df_18[col].astype(str).str.strip()

# 2) Define placeholders representing missing data in OECD exports
placeholders = ['..', '...', '.', ' .', '…', 'Na', 'nan', 'None']

# 3) Replace placeholders with NaN directly in df_18
df_18.replace(placeholders, pd.NA, inplace=True)

In [1041]:
# 1) Ensure 'year' is integer
df_18["year"] = pd.to_numeric(df_18["year"], errors="coerce").astype("Int64")

# 2) Convert all non-key columns to numeric and round(2)
for col in df_18.columns:
    if col not in ["country", "year"]:
        df_18[col] = pd.to_numeric(df_18[col], errors="coerce").round(2)

In [1042]:
# 1) Drop rows with missing key fields
df_18.dropna(subset=["country", "year"], inplace=True)

# 2) Drop duplicate country-year rows, keep the first
df_18.drop_duplicates(subset=["country", "year"], keep="first", inplace=True)

# 3) Drop rows where all value columns are NaN
value_cols = [c for c in df_18.columns if c not in ["country", "year"]]
df_18.dropna(subset=value_cols, how="all", inplace=True)

# 4) Sort and reset index
df_18.sort_values(["country", "year"], inplace=True)
df_18.reset_index(drop=True, inplace=True)


In [1043]:
df_18

Unnamed: 0,country,year,Living with two parents,Living with a single parent,Other
0,Australia,2003,80.1,19.5,0.5
1,Australia,2006,81.5,18.0,0.5
2,Australia,2009,82.0,17.6,0.4
3,Australia,2012,81.3,18.0,0.6
4,Austria,2003,81.2,16.8,2.0
...,...,...,...,...,...
470,United States,2014,68.7,27.5,3.8
471,United States,2015,69.2,26.8,3.9
472,United States,2016,68.7,27.4,3.8
473,United States,2017,68.9,27.1,4.0


In [1044]:
df_18['Other'] = pd.to_numeric(df_18['Other'], errors='coerce')

In [1045]:
print(repr(df_18.loc[df_18['Other'].notnull(), 'Other'].unique()))

array([0.5, 0.4, 0.6, 2. , 1. , 1.9, 0.3, 0.1, 0.8, 0.7, 8.7, 3.5, 2.5,
       2.1, 2.4, 2.6, 6.7, 5.1, 1.4, 1.2, 1.7, 1.5, 3.4, 2.9, 2.3, 3. ,
       4.2, 2.8, 1.3, 9. , 0.2, 0.9, 1.1, 4.5, 4.7, 1.6, 3.8, 3.6, 3.3,
       2.2, 0. , 1.8, 2.7, 3.2, 3.9, 4.1, 4.4, 3.7, 4. , 4.3])


In [1046]:
df_18['Other'] = pd.to_numeric(df_18['Other'], errors='coerce')

df_18.dropna(inplace=True, subset=['Other'])

df_18.isnull().sum()

country                        0
year                           0
Living with two parents        0
Living with a single parent    0
Other                          0
dtype: int64

In [1047]:
df_18['unit'] = '%'
df_18['source'] = 'OECD'

df_18.sample(10)

Unnamed: 0,country,year,Living with two parents,Living with a single parent,Other,unit,source
131,Finland,2012,86.5,13.1,0.4,%,OECD
434,Switzerland,2013,88.8,10.8,0.4,%,OECD
162,Germany,2015,82.8,16.0,1.3,%,OECD
330,Poland,2005,86.6,12.1,1.2,%,OECD
305,Netherlands,2011,86.7,12.5,0.8,%,OECD
199,Iceland,2008,82.7,16.8,0.5,%,OECD
155,Germany,2008,83.9,15.0,1.0,%,OECD
356,Portugal,2016,78.4,20.0,1.6,%,OECD
31,Belgium,2014,74.5,23.6,1.9,%,OECD
314,Norway,2003,80.8,17.7,1.6,%,OECD


In [1048]:

df_info = pd.DataFrame({
    'dtype': df_18.dtypes,
    'null_count': df_18.isnull().sum(),
    'unique_count': df_18.nunique()
})
print(df_info)

                               dtype  null_count  unique_count
country                       object           0            38
year                           Int64           0            18
Living with two parents      float64           0           211
Living with a single parent  float64           0           203
Other                        float64           0            50
unit                          object           0             1
source                        object           0             1


In [1049]:
#df_18.to_csv('../data/Cleaned/cleaned_household_children_oecd.csv', index=False)

In [1050]:
#df_18.to_sql('household_children_oecd', engine, if_exists= 'replace', index= False)

In [1051]:
df_19_1 =pd.read_csv('../data/Raw/OECD/SF_2_3_Age_mothers_childbirth_mean_age_birth_S1.csv')
#age_of_mothers_at_childbirth
df_19_1

Unnamed: 0,Country,1963,1964,1965,1966,1967,1968,1969,1970,1971,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Australia,275,275,274,273,273,272,272,271,269,...,301,301,302,303,305,306,307,308,309,311
1,Austria,274,274,273,271,270,268,268,267,267,...,302,303,304,306,306,307,309,310,310,312
2,Belgium,278,277,276,275,274,273,272,272,270,...,300,302,303,304,305,306,307,308,308,310
3,Canada,278,279,278,277,275,273,273,272,270,...,303,304,305,306,307,309,310,312,313,314
4,Chile,292,291,291,290,288,287,286,284,282,...,281,283,285,288,291,294,296,299,301,..
5,Czech Republic,257,258,255,252,250,249,248,248,249,...,298,299,299,300,300,300,301,302,302,304
6,Costa Rica,293,293,293,293,292,291,289,287,285,...,265,267,268,271,272,274,276,279,284,287
7,Denmark,273,268,268,266,265,265,266,267,267,...,307,308,309,310,310,311,312,313,314,316
8,Estonia,276,274,273,273,271,269,269,267,267,...,296,295,296,299,302,304,305,306,307,310
9,Finland,281,280,280,278,277,275,274,271,269,...,304,305,305,306,308,309,310,311,312,314


In [1052]:
df_info = pd.DataFrame({
    'dtype': df_19_1.dtypes,
    'null_count': df_19_1.isnull().sum(),
    'unique_count': df_19_1.nunique()
})
print(df_info)

          dtype  null_count  unique_count
Country  object           0            26
1963     object           0            19
1964     object           0            22
1965     object           0            22
1966     object           0            22
1967     object           0            22
1968     object           0            20
1969     object           0            21
1970     object           0            19
1971     object           0            19
1972     object           0            20
1973     object           0            20
1974     object           0            24
1975     object           0            21
1976     object           0            22
1977     object           0            20
1978     object           0            22
1979     object           0            23
1980     object           0            22
1981     object           0            20
1982     object           0            18
1983     object           0            20
1984     object           0       

In [1053]:
df_19_1.columns = df_19_1.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [1054]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_19_1.columns if c != "country"]

df_19_1[num_cols] = (
    df_19_1[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [1055]:
df_19_1.drop_duplicates(inplace=True)
df_19_1.dropna(inplace=True)

In [1056]:
# --- Identify id columns and year columns
id_cols = [c for c in ['country','age','age_group','indicator'] if c in df_19_1.columns]
year_cols = [c for c in df_19_1.columns if re.fullmatch(r'\d{4}', c)]

# --- Wide → Long
df_19_1 = df_19_1.melt(
    id_vars=id_cols,
    value_vars=year_cols,
    var_name='year',
    value_name='value'
)

# --- Fix types
df_19_1['year'] = df_19_1['year'].astype(int)
df_19_1['age'] = pd.to_numeric(df_19_1['value'], errors='coerce')
df_19_1 = df_19_1.dropna(subset=['value'])

# --- Add unit & indicator if missing
if 'indicator' not in df_19_1.columns:
    df_19_1['indicator'] = 'fertility_by_age'
df_19_1['unit'] = 'age'
df_19_1['source'] = 'OECD'
df_19_1.sample(10)

Unnamed: 0,country,year,value,age,indicator,unit,source
1018,Estonia,2009,28.9,28.9,fertility_by_age,age,OECD
419,Austria,1982,26.3,26.3,fertility_by_age,age,OECD
462,Australia,1984,27.6,27.6,fertility_by_age,age,OECD
1071,New Zealand,2011,29.7,29.7,fertility_by_age,age,OECD
449,Hungary,1983,24.8,24.8,fertility_by_age,age,OECD
823,Hungary,2000,27.3,27.3,fertility_by_age,age,OECD
857,Bulgaria,2001,25.1,25.1,fertility_by_age,age,OECD
1167,Austria,2016,30.6,30.6,fertility_by_age,age,OECD
911,Hungary,2004,28.2,28.2,fertility_by_age,age,OECD
770,Australia,1998,29.3,29.3,fertility_by_age,age,OECD


In [1057]:
df_info = pd.DataFrame({
    'dtype': df_19_1.dtypes,
    'null_count': df_19_1.isnull().sum(),
    'unique_count': df_19_1.nunique()
})
print(df_info)

             dtype  null_count  unique_count
country     object           0            22
year         int32           0            59
value      float64           0            90
age        float64           0            90
indicator   object           0             1
unit        object           0             1
source      object           0             1


In [1058]:
#df_19_1.to_csv('../data/Cleaned/age_of_mothers_at_childbirth_oecd.csv', index=False)

In [1059]:
#df_19_1.to_sql('age_of_mothers_at_childbirth_oecd', engine, if_exists='replace', index=False)

In [1060]:
df_19_2 = pd.read_csv('../data/Raw/OECD/SF_2_3_Age_mothers_childbirth_fertility_by_age_1960_S2.csv')
#fertility_per_1000_from 1960
df_19_2.head()

Unnamed: 0,Country,Age group,1960,1961,1962,1963,1964,1965,1966,1967,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Australia,15-19,443,474,447,459,470,475,489,484,...,161,146,129,120,105,103,95,88,79,71
1,Australia,20-24,2201,2258,2160,2082,1905,1793,1731,1708,...,532,513,474,473,447,431,428,401,377,388
2,Australia,25-29,2163,2212,2167,2112,1981,1885,1839,1850,...,1026,991,948,934,922,897,893,843,803,867
3,Australia,30-34,1275,1311,1277,1239,1191,1101,1051,1028,...,1269,1248,1204,1217,1236,1191,1201,1156,1114,1206
4,Australia,35-39,623,634,614,597,584,530,506,478,...,715,709,692,698,720,713,716,693,663,709


In [1061]:
df_info = pd.DataFrame({
    'dtype': df_19_2.dtypes,
    'null_count': df_19_2.isnull().sum(),
    'unique_count': df_19_2.nunique()
})
print(df_info)

            dtype  null_count  unique_count
Country    object           0            21
Age group  object           0             7
1960       object           0           136
1961       object           0           140
1962       object           0           140
...           ...         ...           ...
2017       object           0           124
2018       object           0           128
2019       object           0           126
2020       object           0           121
2021       object           7           119

[64 rows x 3 columns]


In [1062]:
df_19_2.columns = df_19_2.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_19_2.head()

Unnamed: 0,country,age_group,1960,1961,1962,1963,1964,1965,1966,1967,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Australia,15-19,443,474,447,459,470,475,489,484,...,161,146,129,120,105,103,95,88,79,71
1,Australia,20-24,2201,2258,2160,2082,1905,1793,1731,1708,...,532,513,474,473,447,431,428,401,377,388
2,Australia,25-29,2163,2212,2167,2112,1981,1885,1839,1850,...,1026,991,948,934,922,897,893,843,803,867
3,Australia,30-34,1275,1311,1277,1239,1191,1101,1051,1028,...,1269,1248,1204,1217,1236,1191,1201,1156,1114,1206
4,Australia,35-39,623,634,614,597,584,530,506,478,...,715,709,692,698,720,713,716,693,663,709


In [1063]:
# --- Ensure "country" and "age_group" are strings
df_19_2["country"] = df_19_2["country"].astype(str).str.strip().str.title()
df_19_2["age_group"] = df_19_2["age_group"].astype(str).str.strip()

# --- Identify numeric columns: everything except country and birth order ---
num_cols = [c for c in df_19_2.columns if c not in ["country", "age_group"]]
# --- Robust cleaning -> convert to float ---
df_19_2[num_cols] = (
    df_19_2[num_cols].astype(str)
    .replace({"\xa0": "", "\u202f": "", ",": "."}, regex=True)
    .replace(r"[^\d\.\-]", "", regex=True)
    .replace(r"\.\.+", ".", regex=True)
    .replace(r"^\.$|^\s*$", np.nan, regex=True)
    .apply(pd.to_numeric, errors="coerce")
)
df_19_2[num_cols] = df_19_2[num_cols].round(2)

In [1064]:
df_19_2.drop_duplicates(inplace=True)
df_19_2.dropna(inplace = True)

In [1065]:
df_info = pd.DataFrame({
    'dtype': df_19_2.dtypes,
    'null_count': df_19_2.isnull().sum(),
    'unique_count': df_19_2.nunique()
})
print(df_info)

             dtype  null_count  unique_count
country     object           0            19
age_group   object           0             7
1960       float64           0           124
1961       float64           0           126
1962       float64           0           126
...            ...         ...           ...
2017       float64           0           118
2018       float64           0           121
2019       float64           0           120
2020       float64           0           115
2021       float64           0           118

[64 rows x 3 columns]


In [1066]:
# --- Identify id columns and year columns
id_cols = [c for c in ['country','age','age_group','indicator'] if c in df_19_2.columns]
year_cols = [c for c in df_19_2.columns if re.fullmatch(r'\d{4}', c)]

# --- Wide → Long
df_19_2 = df_19_2.melt(
    id_vars=id_cols,
    value_vars=year_cols,
    var_name='year',
    value_name='value'
)

# --- Fix types
df_19_2['year'] = df_19_2['year'].astype(int)
df_19_2['value'] = pd.to_numeric(df_19_2['value'], errors='coerce')
df_19_2 = df_19_2.dropna(subset=['value'])

# --- Add unit & indicator if missing
if 'indicator' not in df_19_2.columns:
    df_19_2['indicator'] = 'fertility_by_age'
df_19_2['unit'] = '‰'
df_19_2['source'] = 'OECD'

In [1067]:
df_19_2

Unnamed: 0,country,age_group,year,value,indicator,unit,source
0,Australia,15-19,1960,44.3,fertility_by_age,‰,OECD
1,Australia,20-24,1960,220.1,fertility_by_age,‰,OECD
2,Australia,25-29,1960,216.3,fertility_by_age,‰,OECD
3,Australia,30-34,1960,127.5,fertility_by_age,‰,OECD
4,Australia,35-39,1960,62.3,fertility_by_age,‰,OECD
...,...,...,...,...,...,...,...
8241,Bulgaria,25-29,2021,91.7,fertility_by_age,‰,OECD
8242,Bulgaria,30-34,2021,73.6,fertility_by_age,‰,OECD
8243,Bulgaria,35-39,2021,34.8,fertility_by_age,‰,OECD
8244,Bulgaria,40-44,2021,7.8,fertility_by_age,‰,OECD


In [1068]:
df_19_2.to_csv('../data/Cleaned/cleaned_fertility_per_1000_from_1960_oecd.csv', index=False)

In [1069]:
df_19_2.to_sql('fertility_per_1000_from_1960_oecd', engine, if_exists='replace', index=False)

246

In [1070]:
df_19_3 = pd.read_csv('../data/Raw/OECD/SF_2_3_Age_mothers_childbirth_fertility_by_age_2000_S3.csv')
#fertility_per_1000_from_2000
df_19_3

Unnamed: 0,Country,Age group,2000,2001,2002,2003,2004,2005,2006,2007,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,OECD-Average,15-19,226,220,211,205,203,201,200,205,...,179,168,162,152,144,135,126,117,102,95
1,OECD-Average,20-24,717,693,668,655,647,632,629,630,...,564,538,533,519,504,488,470,450,420,405
2,OECD-Average,25-29,1079,1050,1031,1035,1034,1023,1026,1034,...,994,965,969,961,949,928,907,884,855,869
3,OECD-Average,30-34,881,872,886,911,934,946,976,1000,...,1036,1019,1040,1049,1053,1041,1033,1017,996,1036
4,OECD-Average,35-39,381,386,395,406,422,435,456,477,...,531,534,551,563,571,570,574,575,559,587
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296,Romania,25-29,782,770,786,820,848,908,923,930,...,918,883,944,989,1001,1090,1083,1091,1094,1109
297,Romania,30-34,388,381,388,388,416,475,511,542,...,666,648,715,754,785,866,859,864,871,875
298,Romania,35-39,134,138,152,194,232,251,257,249,...,273,274,299,321,330,368,367,383,406,411
299,Romania,40-44,31,31,30,29,31,31,28,31,...,49,48,56,61,68,73,78,80,85,82


In [1071]:
df_info = pd.DataFrame({
    'dtype': df_19_3.dtypes,
    'null_count': df_19_3.isnull().sum(),
    'unique_count': df_19_3.nunique()
})
print(df_info)

            dtype  null_count  unique_count
Country    object           0            43
Age group  object           0             7
2000       object           0           233
2001       object           0           248
2002       object           0           240
2003       object           0           239
2004       object           0           245
2005       object           0           240
2006       object           0           239
2007       object           0           242
2008       object           0           252
2009       object           0           251
2010       object           0           239
2011       object           0           235
2012       object           0           242
2013       object           0           234
2014       object           0           238
2015       object           0           237
2016       object           0           248
2017       object           0           236
2018       object           0           245
2019       object           0   

In [1072]:
df_19_3.columns = df_19_3.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_19_3.head()

Unnamed: 0,country,age_group,2000,2001,2002,2003,2004,2005,2006,2007,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,OECD-Average,15-19,226,220,211,205,203,201,200,205,...,179,168,162,152,144,135,126,117,102,95
1,OECD-Average,20-24,717,693,668,655,647,632,629,630,...,564,538,533,519,504,488,470,450,420,405
2,OECD-Average,25-29,1079,1050,1031,1035,1034,1023,1026,1034,...,994,965,969,961,949,928,907,884,855,869
3,OECD-Average,30-34,881,872,886,911,934,946,976,1000,...,1036,1019,1040,1049,1053,1041,1033,1017,996,1036
4,OECD-Average,35-39,381,386,395,406,422,435,456,477,...,531,534,551,563,571,570,574,575,559,587


In [1073]:
# --- Ensure "country" and "age_group" are strings
df_19_3["country"] = df_19_3["country"].astype(str).str.strip().str.title()
df_19_3["age_group"] = df_19_3["age_group"].astype(str).str.strip()

# --- Identify numeric columns: everything except country and birth order ---
num_cols = [c for c in df_19_3.columns if c not in ["country", "age_group"]]
# --- Robust cleaning -> convert to float ---
df_19_3[num_cols] = (
    df_19_3[num_cols].astype(str)
    .replace({"\xa0": "", "\u202f": "", ",": "."}, regex=True)
    .replace(r"[^\d\.\-]", "", regex=True)
    .replace(r"\.\.+", ".", regex=True)
    .replace(r"^\.$|^\s*$", np.nan, regex=True)
    .apply(pd.to_numeric, errors="coerce")
)
df_19_3[num_cols] = df_19_3[num_cols].round(2)

In [1074]:
df_19_3.drop_duplicates(inplace=True)
df_19_3.dropna(inplace=True)

In [1075]:
# --- Identify id columns and year columns
id_cols = [c for c in ['country','age','age_group','indicator'] if c in df_19_3.columns]
year_cols = [c for c in df_19_3.columns if re.fullmatch(r'\d{4}', c)]

# --- Wide → Long
df_19_3 = df_19_3.melt(
    id_vars=id_cols,
    value_vars=year_cols,
    var_name='year',
    value_name='value'
)

# --- Fix types
df_19_3['year'] = df_19_3['year'].astype(int)
df_19_3['value'] = pd.to_numeric(df_19_3['value'], errors='coerce')
df_19_3 = df_19_3.dropna(subset=['value'])

# --- Add unit & indicator if missing
if 'indicator' not in df_19_3.columns:
    df_19_3['indicator'] = 'fertility_by_age'
df_19_3['unit'] = '‰'
df_19_3['source'] = 'OECD'

In [1076]:
df_19_3

Unnamed: 0,country,age_group,year,value,indicator,unit,source
0,Oecd-Average,15-19,2000,22.6,fertility_by_age,‰,OECD
1,Oecd-Average,20-24,2000,71.7,fertility_by_age,‰,OECD
2,Oecd-Average,25-29,2000,107.9,fertility_by_age,‰,OECD
3,Oecd-Average,30-34,2000,88.1,fertility_by_age,‰,OECD
4,Oecd-Average,35-39,2000,38.1,fertility_by_age,‰,OECD
...,...,...,...,...,...,...,...
6287,Romania,25-29,2021,110.9,fertility_by_age,‰,OECD
6288,Romania,30-34,2021,87.5,fertility_by_age,‰,OECD
6289,Romania,35-39,2021,41.1,fertility_by_age,‰,OECD
6290,Romania,40-44,2021,8.2,fertility_by_age,‰,OECD


In [1077]:
#Check again
df_info = pd.DataFrame({
    'dtype': df_19_3.dtypes,
    'null_count': df_19_3.isnull().sum(),
    'unique_count': df_19_3.nunique()
})
print(df_info)

             dtype  null_count  unique_count
country     object           0            41
age_group   object           0             7
year         int32           0            22
value      float64           0          1337
indicator   object           0             1
unit        object           0             1
source      object           0             1


In [1078]:
df_19_3.to_csv('../data/Cleaned/cleaned_fertility_per_1000_from_2000_oecd.csv',index=False)

In [1079]:
df_19_3.to_sql('fertility_per_1000_from_2000_oecd',engine, if_exists='replace', index=False)

292

In [1080]:
df_20= pd.read_csv('../data/Raw/OECD/SF_2_4_Share_births_outside_marriage_1960.csv')
#(%)share_of_births_outside_of_marriage
df_20

Unnamed: 0,Country,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,Austria,130,126,120,116,113,112,114,115,120,...,404,415,414,417,421,422,420,413,406,412
1,Belgium,21,20,21,22,23,24,25,25,27,...,470,477,495,494,480,490,528,524,..,..
2,Czech Republic,49,46,45,47,48,50,53,53,54,...,418,434,450,467,478,486,490,485,482,485
3,Denmark,78,80,83,89,93,95,102,111,111,...,490,506,515,525,538,540,542,542,541,542
4,Finland,40,41,40,42,44,46,48,51,53,...,409,415,421,428,443,449,448,446,454,461
5,Germany,76,71,66,61,59,58,57,58,61,...,339,345,348,350,350,355,347,339,333,331
6,Greece,12,12,12,12,11,11,10,10,11,...,74,76,70,82,88,94,103,111,124,138
7,Hungary,55,55,54,53,52,52,51,50,50,...,423,445,456,473,479,467,447,439,387,304
8,Iceland,253,253,245,251,267,269,284,300,305,...,650,669,..,..,..,696,712,705,694,..
9,Ireland,16,16,18,18,20,22,23,25,26,...,339,351,353,363,366,367,376,379,384,..


In [1081]:
df_info = pd.DataFrame({
    'dtype': df_20.dtypes,
    'null_count': df_20.isnull().sum(),
    'unique_count': df_20.nunique()
})
print(df_info)

          dtype  null_count  unique_count
Country  object           0            26
1960     object           0            26
1961     object           0            24
1962     object           0            24
1963     object           0            24
...         ...         ...           ...
2016     object           0            24
2017     object           0            26
2018     object           0            25
2019     object           0            25
2020     object           0            24

[62 rows x 3 columns]


In [1082]:
df_20.columns = df_20.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [1083]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_20.columns if c != "country"]

df_20[num_cols] = (
    df_20[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [1084]:
df_20.drop_duplicates(inplace=True)
df_20.dropna(inplace=True)



In [1085]:

# Identify columns
id_cols = [c for c in ['country','category','sex'] if c in df_20.columns]
year_cols = [c for c in df_20.columns if re.fullmatch(r'\d{4}', c)]

# Melt wide → long
df_20 = df_20.melt(id_vars=id_cols, value_vars=year_cols,
                       var_name='year', value_name='value')

# Fix dtypes
df_20['year'] = df_20['year'].astype(int)
df_20['value'] = pd.to_numeric(df_20['value'], errors='coerce')
df_20 = df_20.dropna(subset=['value'])

# Add required columns
if 'category' not in df_20.columns: df_20['category'] = 'total'
if 'sex' not in df_20.columns:      df_20['sex'] = 'all'
df_20['indicator'] = 'mean_age_first_marriage'
df_20['unit'] = '%'
df_20['source'] = 'OECD'

df_20.sample(10)

Unnamed: 0,country,year,value,category,sex,indicator,unit,source
865,Italy,1999,8.7,total,all,mean_age_first_marriage,%,OECD
1184,Switzerland,2013,21.1,total,all,mean_age_first_marriage,%,OECD
465,Finland,1981,13.3,total,all,mean_age_first_marriage,%,OECD
519,Portugal,1983,10.7,total,all,mean_age_first_marriage,%,OECD
1245,Portugal,2016,52.8,total,all,mean_age_first_marriage,%,OECD
627,New Zealand,1988,30.6,total,all,mean_age_first_marriage,%,OECD
551,Czech Republic,1985,7.3,total,all,mean_age_first_marriage,%,OECD
1160,Spain,2012,39.0,total,all,mean_age_first_marriage,%,OECD
147,Slovenia,1966,8.7,total,all,mean_age_first_marriage,%,OECD
684,Denmark,1991,46.5,total,all,mean_age_first_marriage,%,OECD


In [1086]:


df_info = pd.DataFrame({
    'dtype': df_20.dtypes,
    'null_count': df_20.isnull().sum(),
    'unique_count': df_20.nunique()
})
print(df_info)

             dtype  null_count  unique_count
country     object           0            22
year         int32           0            61
value      float64           0           488
category    object           0             1
sex         object           0             1
indicator   object           0             1
unit        object           0             1
source      object           0             1


In [1087]:
#df_20.to_csv('../data/Cleaned/cleaned_share_of_births_outside_of_marriage_oecd.csv', index=False)

In [1088]:
#df_20.to_sql('share_of_births_outside_of_marriage_oecd',engine, if_exists='replace', index=False)

In [1089]:
df_21_1= pd.read_csv('../data/Raw/OECD/SF_3_1_Marriage_divorce_rate_mean_age_first_marriage_S1.csv')
#mean_age_first_marriage
df_21_1

Unnamed: 0,Country,Gender,1990,1991,1992,1993,1994,1995,1996,1997,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,Australia,Male,265,267,269,270,272,273,276,278,...,297,298,299,300,301,303,304,307,307,306
1,Australia,Female,243,245,247,248,251,253,257,259,...,280,281,283,284,285,287,288,292,293,292
2,Czechia,Male,243,243,245,247,251,255,259,265,...,310,312,313,314,316,317,318,319,320,324
3,Czechia,Female,216,216,219,221,224,228,231,236,...,281,283,285,287,288,290,291,292,294,297
4,Denmark,Male,305,306,310,314,318,319,325,322,...,338,343,344,344,343,347,348,349,351,353
5,Denmark,Female,278,280,283,288,292,292,299,301,...,314,318,319,319,319,322,324,325,328,330
6,Greece,Male,290,293,296,297,299,301,302,306,...,327,328,329,330,332,332,333,334,337,338
7,Greece,Female,249,252,255,255,258,260,263,266,...,294,295,297,299,301,301,303,303,307,307
8,Japan,Male,284,284,284,284,285,285,285,285,...,307,308,309,311,311,311,311,311,312,310
9,Japan,Female,259,259,260,261,262,263,264,266,...,290,292,293,294,294,294,294,294,296,294


In [1090]:
df_info = pd.DataFrame({
    'datatypes': df_21_1.dtypes,
    'null_count': df_21_1.isnull().sum(),
    'unique_count': df_21_1.nunique()
})
print(df_info)

        datatypes  null_count  unique_count
Country    object           0            10
Gender     object           0             2
1990       object           0            17
1991       object           0            18
1992       object           0            18
1993       object           0            19
1994       object           0            16
1995       object           0            18
1996       object           0            19
1997       object           0            17
1998       object           0            14
1999       object           0            19
2000       object           0            18
2001       object           0            18
2002       object           0            19
2003       object           0            19
2004       object           0            16
2005       object           0            18
2006       object           0            18
2007       object           0            19
2008       object           0            18
2009       object           0   

In [1091]:
df_21_1.columns = df_21_1.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [1092]:
# --- Ensure "country" and "gender" are strings
df_21_1["country"] = df_21_1["country"].astype(str).str.strip().str.title()
df_21_1["gender"] = df_21_1["gender"].astype(str).str.strip()

# --- Identify numeric columns: everything except country and birth order ---
num_cols = [c for c in df_21_1.columns if c not in ["country", "gender"]]
# --- Robust cleaning -> convert to float ---
df_21_1[num_cols] = (
    df_21_1[num_cols].astype(str)
    .replace({"\xa0": "", "\u202f": "", ",": "."}, regex=True)
    .replace(r"[^\d\.\-]", "", regex=True)
    .replace(r"\.\.+", ".", regex=True)
    .replace(r"^\.$|^\s*$", np.nan, regex=True)
    .apply(pd.to_numeric, errors="coerce")
)
df_21_1[num_cols] = df_21_1[num_cols].round(2)

In [1093]:
# 1) Gender -> sex
df_21_1.columns = df_21_1.columns.astype(str).str.strip()
df_21_1.rename(columns={'Gender':'sex','gender':'sex'}, inplace=True)
df_21_1['sex'] = df_21_1['sex'].astype(str).str.strip().str.lower()

# 2) Year columns
year_cols = [c for c in df_21_1.columns if re.fullmatch(r'\d{4}', str(c))]

# 3) Wide -> Long (keep sex)
df_21_1 = df_21_1.melt(
    id_vars=[c for c in ['country','sex'] if c in df_21_1.columns],
    value_vars=year_cols,
    var_name='year',
    value_name='value'
)

# 4) Types
df_21_1['year']  = df_21_1['year'].astype(int)
df_21_1['value'] = pd.to_numeric(df_21_1['value'], errors='coerce')
df_21_1.dropna(subset=['value'], inplace=True)

# 5) Metadata
df_21_1['indicator'] = 'divorce_rates_per_1000'
df_21_1['unit']      = '‰'
df_21_1['source']    = 'OECD'


# 7) Final columns (only those that exist)
final_cols = [c for c in ['country','year','indicator','sex','unit','value','source'] if c in df_21_1.columns]
df_21_1 = df_21_1[final_cols]
df_21_1.sample(10)

Unnamed: 0,country,year,indicator,sex,unit,value,source
332,New Zealand,2006,divorce_rates_per_1000,male,‰,30.0,OECD
288,Japan,2004,divorce_rates_per_1000,male,‰,29.6,OECD
39,United States,1991,divorce_rates_per_1000,female,‰,24.1,OECD
356,United Kingdom,2007,divorce_rates_per_1000,male,‰,32.0,OECD
338,United States,2006,divorce_rates_per_1000,male,‰,27.5,OECD
308,Japan,2005,divorce_rates_per_1000,male,‰,29.8,OECD
68,Japan,1993,divorce_rates_per_1000,male,‰,28.4,OECD
257,United Kingdom,2002,divorce_rates_per_1000,female,‰,28.7,OECD
440,Australia,2012,divorce_rates_per_1000,male,‰,29.8,OECD
368,Japan,2008,divorce_rates_per_1000,male,‰,30.2,OECD


In [1094]:
df_21_1.drop_duplicates(inplace=True)
df_21_1.dropna(inplace=True)

df_info = pd.DataFrame({
    'datatypes': df_21_1.dtypes,
    'null_count': df_21_1.isnull().sum(),
    'unique_count': df_21_1.nunique()
})
print(df_info)

          datatypes  null_count  unique_count
country      object           0            10
year          int32           0            31
indicator    object           0             1
sex          object           0             2
unit         object           0             1
value       float64           0           114
source       object           0             1


In [1095]:
df_21_1.to_csv('../data/Cleaned/cleaned_mean_age_first_marriage_oecd.csv',index=False)

In [1096]:
df_21_1.to_sql('mean_age_first_marriage_oecd', engine, if_exists='replace', index= False)

618

In [1097]:
df_21_2 = pd.read_csv('../data/Raw/OECD/SF_3_1_Marriage_divorce_rates_S2.csv')
#divorce_rates_per_1000_oecd
df_21_2

Unnamed: 0,Country,1970,1971,1972,1973,1974,1975,1976,1977,1978,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,Austria,14,13,13,13,14,14,15,15,16,...,19,19,19,18,18,18.0,18.0,17,16,15
1,Belgium,07,7,8,9,10,11,13,13,14,...,22,22,22,21,20,20.0,20.0,18,19,17
2,Czechia,22,24,23,25,25,26,25,25,26,...,27,25,25,24,24,23.0,23.0,20,20,19
3,Denmark,19,27,26,25,26,26,26,26,26,...,34,34,29,30,26,26.0,18.0,27,22,21
4,Estonia,32,32,33,32,33,34,36,39,38,...,25,24,26,25,25,24.0,21.0,19,,19
5,Finland,13,16,18,19,21,20,21,21,22,...,25,25,25,25,24,24.0,24.0,24,22,20
6,Germany,13,14,15,16,18,19,20,15,10,...,21,21,20,20,19,18.0,18.0,17,17,16
7,Greece,04,4,4,5,4,4,4,5,5,...,15,13,14,10,18,,,,,
8,Hungary,22,23,23,24,23,25,26,26,27,...,20,20,21,20,19,17.0,18.0,15,19,18
9,Italy,..,3,6,3,3,2,2,2,2,...,9,9,14,16,15,15.0,14.0,11,14,14


In [1098]:
df_info = pd.DataFrame({
    'datatypes': df_21_2.dtypes,
    'null_count': df_21_2.isnull().sum(),
    'unique_count': df_21_2.nunique()
})
print(df_info)

        datatypes  null_count  unique_count
Country    object           0            28
1970       object           0            18
1971       object           0            19
1972       object           0            19
1973       object           0            18
1974       object           0            18
1975       object           0            19
1976       object           0            18
1977       object           0            18
1978       object           0            18
1979       object           0            15
1980       object           0            18
1981       object           0            20
1982       object           0            22
1983       object           0            24
1984       object           0            20
1985       object           0            19
1986       object           0            20
1987       object           0            20
1988       object           0            20
1989       object           0            19
1990       object           0   

In [1099]:
df_21_2.columns = df_21_2.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [1100]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_21_2.columns if c != "country"]

df_21_2[num_cols] = (
    df_21_2[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [1101]:
df_21_2.drop_duplicates(inplace=True)
df_21_2.dropna(inplace=True)

df_info = pd.DataFrame({
    'datatypes': df_21_2.dtypes,
    'null_count': df_21_2.isnull().sum(),
    'unique_count': df_21_2.nunique()
})
print(df_info)

        datatypes  null_count  unique_count
country    object           0            23
1970      float64           0            15
1971      float64           0            17
1972      float64           0            15
1973      float64           0            14
1974      float64           0            15
1975      float64           0            16
1976      float64           0            14
1977      float64           0            13
1978      float64           0            15
1979      float64           0            12
1980      float64           0            14
1981      float64           0            17
1982      float64           0            17
1983      float64           0            19
1984      float64           0            16
1985      float64           0            15
1986      float64           0            16
1987      float64           0            16
1988      float64           0            15
1989      float64           0            15
1990      float64           0   

In [1102]:
# Identify columns
id_cols = [c for c in ['country','category','sex'] if c in df_21_2.columns]
year_cols = [c for c in df_21_2.columns if re.fullmatch(r'\d{4}', c)]

# Melt wide → long
df_21_2 = df_21_2.melt(id_vars=id_cols, value_vars=year_cols,
                       var_name='year', value_name='value')

# Fix dtypes
df_21_2['year'] = df_21_2['year'].astype(int)
df_21_2['value'] = pd.to_numeric(df_21_2['value'], errors='coerce')
df_21_2 = df_21_2.dropna(subset=['value'])

# Add required columns
if 'sex' not in df_21_2.columns:      df_21_2['sex'] = 'all'
df_21_2['indicator'] = 'divorce_rates_per_1000'
df_21_2['unit'] = '‰'
df_21_2['source'] = 'OECD'
df_21_2

Unnamed: 0,country,year,value,sex,indicator,unit,source
0,Austria,1970,1.4,all,divorce_rates_per_1000,‰,OECD
1,Belgium,1970,0.7,all,divorce_rates_per_1000,‰,OECD
2,Czechia,1970,2.2,all,divorce_rates_per_1000,‰,OECD
3,Denmark,1970,1.9,all,divorce_rates_per_1000,‰,OECD
4,Finland,1970,1.3,all,divorce_rates_per_1000,‰,OECD
...,...,...,...,...,...,...,...
1214,Sweden,2022,2.1,all,divorce_rates_per_1000,‰,OECD
1215,Switzerland,2022,1.8,all,divorce_rates_per_1000,‰,OECD
1216,Bulgaria,2022,1.4,all,divorce_rates_per_1000,‰,OECD
1217,Croatia,2022,1.2,all,divorce_rates_per_1000,‰,OECD


In [1103]:
#df_21_2.to_csv('../data/Cleaned/cleaned_divorce_rates_per_1000_oecd.csv', index=False)

In [1104]:
#df_21_2.to_sql('divorce_rates_per_1000_oecd',engine, if_exists= 'replace' , index=False)

In [1105]:
df_21_3= pd.read_csv('../data/Raw/OECD/SF_3_1_Marriage_divorce_rates_prev_marital_status_S3.csv')
#share_of_previous_marital_status
df_21_3.sample

<bound method NDFrame.sample of             Country Previous marital status  2000  2001  2002  2003  2004  \
0         Australia    Single never married  75,9  76,1  75,5  75,6  76,2   
1         Australia                Divorced  22,0  21,8  22,4  22,3  21,8   
2         Australia                 Widowed   2,1   2,1   2,1   2,1   1,9   
3           Austria    Single never married  76,6  74,7  74,1  73,7  72,9   
4           Austria                Divorced  22,2  24,2  24,7  25,2  25,9   
5           Austria                 Widowed   1,2   1,1   1,2   1,1   1,2   
6           Czechia    Single never married  74,9  74,5  74,3  74,0  73,9   
7           Czechia                Divorced  23,7  24,2  24,4  24,7  24,7   
8           Czechia                 Widowed   1,4   1,3   1,3   1,3   1,4   
9           Denmark    Single never married  75,9  76,0  76,2  76,4  76,0   
10          Denmark                Divorced  22,0  21,9  21,8  21,7  22,1   
11          Denmark                 Widowed 

In [1106]:
df_info = pd.DataFrame({
    'datatypes': df_21_3.dtypes,
    'null_count': df_21_3.isnull().sum(),
    'unique_count': df_21_3.nunique()
})
print(df_info)

                        datatypes  null_count  unique_count
Country                    object           0            20
Previous marital status    object           0             3
2000                       object           0            47
2001                       object           0            51
2002                       object           0            56
2003                       object           0            50
2004                       object           0            50
2005                       object           0            52
2006                       object           0            49
2008                       object           0            47
2009                       object           0            50
2010                       object           0            49
2011                       object           0            49
2012                       object           0            53
2013                       object           0            49
2014                       object       

In [1107]:
df_21_3.columns = df_21_3.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

df_21_3.head()

Unnamed: 0,country,previous_marital_status,2000,2001,2002,2003,2004,2005,2006,2008,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Australia,Single never married,759,761,755,756,762,769,773,782,...,796,797,800,805,805,801,803,801,803,807
1,Australia,Divorced,220,218,224,223,218,213,209,202,...,190,188,186,182,181,185,183,185,183,180
2,Australia,Widowed,21,21,21,21,19,18,18,17,...,15,15,14,13,14,14,14,14,13,13
3,Austria,Single never married,766,747,741,737,729,731,739,748,...,755,757,767,771,775,777,781,781,782,780
4,Austria,Divorced,222,242,247,252,259,257,249,242,...,235,234,223,220,215,215,209,210,210,216


In [1108]:
# --- Ensure id cols are strings
df_21_3["country"] = df_21_3["country"].astype(str).str.strip().str.title()
df_21_3["previous_marital_status"] = df_21_3["previous_marital_status"].astype(str).str.strip()

# --- Identify numeric columns
id_cols = {"country", "previous_marital_status"}
num_cols = [c for c in df_21_3.columns if c not in id_cols]

# --- Clean numeric cols
for c in num_cols:
    s = df_21_3[c].astype(str)

    # Normalize spaces & decimal separator
    s = (s.replace({"\xa0": "", "\u202f": ""}, regex=True)   # remove no-break/narrow spaces
           .str.replace(",", ".", regex=False))              # comma → dot

    # Keep only digits, dot, minus; collapse multiple dots
    s = (s.str.replace(r"[^\d\.\-]", "", regex=True)
           .str.replace(r"\.\.+", ".", regex=True)
           .str.replace(r"^\.$|^\s*$", "", regex=True))      # lone dot/empty → ""

    # Convert to numeric ("" → NaN)
    df_21_3[c] = pd.to_numeric(s, errors="coerce")

# --- Round numeric cols
df_21_3[num_cols] = df_21_3[num_cols].round(2)

# --- Round after numeric conversion
df_21_3[num_cols] = df_21_3[num_cols].round(2)

In [1109]:
df_21_3.head()

Unnamed: 0,country,previous_marital_status,2000,2001,2002,2003,2004,2005,2006,2008,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Australia,Single never married,75.9,76.1,75.5,75.6,76.2,76.9,77.3,78.2,...,79.6,79.7,80.0,80.5,80.5,80.1,80.3,80.1,80.3,80.7
1,Australia,Divorced,22.0,21.8,22.4,22.3,21.8,21.3,20.9,20.2,...,19.0,18.8,18.6,18.2,18.1,18.5,18.3,18.5,18.3,18.0
2,Australia,Widowed,2.1,2.1,2.1,2.1,1.9,1.8,1.8,1.7,...,1.5,1.5,1.4,1.3,1.4,1.4,1.4,1.4,1.3,1.3
3,Austria,Single never married,76.6,74.7,74.1,73.7,72.9,73.1,73.9,74.8,...,75.5,75.7,76.7,77.1,77.5,77.7,78.1,78.1,78.2,78.0
4,Austria,Divorced,22.2,24.2,24.7,25.2,25.9,25.7,24.9,24.2,...,23.5,23.4,22.3,22.0,21.5,21.5,20.9,21.0,21.0,21.6


In [1110]:
df_21_3.drop_duplicates(inplace=True)
df_21_3.dropna(inplace=True)

df_info = pd.DataFrame({
    'datatypes': df_21_3.dtypes,
    'null_count': df_21_3.isnull().sum(),
    'unique_count': df_21_3.nunique()
})
print(df_info)

                        datatypes  null_count  unique_count
country                    object           0            20
previous_marital_status    object           0             3
2000                      float64           0            47
2001                      float64           0            51
2002                      float64           0            56
2003                      float64           0            50
2004                      float64           0            50
2005                      float64           0            52
2006                      float64           0            49
2008                      float64           0            47
2009                      float64           0            50
2010                      float64           0            49
2011                      float64           0            49
2012                      float64           0            53
2013                      float64           0            49
2014                      float64       

In [1111]:
df_21_3.sample(10)

Unnamed: 0,country,previous_marital_status,2000,2001,2002,2003,2004,2005,2006,2008,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
10,Denmark,Divorced,22.0,21.9,21.8,21.7,22.1,22.2,22.4,21.8,...,21.0,22.1,23.0,22.0,22.3,21.4,21.9,21.3,21.0,20.4
22,Lithuania,Divorced,18.6,19.1,19.2,18.7,20.0,20.2,20.5,21.0,...,19.1,20.2,19.6,20.0,20.7,20.5,21.6,22.0,22.7,23.6
32,Poland,Widowed,2.5,2.5,2.3,2.2,2.1,1.9,1.8,8.9,...,1.6,1.7,1.7,1.8,1.8,1.8,1.8,1.8,1.7,1.9
11,Denmark,Widowed,2.1,2.1,2.0,2.0,1.9,2.2,1.9,1.9,...,1.8,1.9,1.9,1.8,1.6,1.7,1.7,1.6,1.4,1.4
29,New Zealand,Widowed,2.5,2.9,2.7,2.5,2.7,2.6,2.3,2.4,...,2.2,2.2,2.1,2.4,2.3,2.1,2.0,2.0,2.1,2.1
23,Lithuania,Widowed,2.4,2.1,1.8,2.0,1.8,1.9,2.0,1.7,...,1.6,1.6,1.6,1.6,1.6,1.4,1.6,1.6,1.6,1.6
8,Czechia,Widowed,1.4,1.3,1.3,1.3,1.4,1.2,1.1,1.3,...,1.2,1.1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.1
24,Luxembourg,Single never married,76.3,75.3,76.1,74.0,75.6,74.8,74.5,75.9,...,76.2,77.9,76.8,74.9,75.6,75.3,74.3,75.5,76.9,77.3
46,Sweden,Divorced,20.6,22.0,21.9,21.4,21.9,21.0,20.3,20.6,...,20.1,19.8,20.3,21.6,21.4,21.0,20.9,20.4,20.0,20.3
7,Czechia,Divorced,23.7,24.2,24.4,24.7,24.7,24.5,24.4,26.1,...,24.9,24.9,23.8,23.4,22.4,22.3,22.6,22.6,22.9,23.0


In [1112]:
year_cols = [c for c in df_21_3.columns if re.fullmatch(r"\d{4}", str(c))]

# Wide → Long
df_21_3 = df_21_3.melt(
    id_vars=["country", "previous_marital_status"],
    value_vars=year_cols,
    var_name="year",
    value_name="value"
)

df_21_3["year"] = df_21_3["year"].astype(int)
df_21_3["value"] = pd.to_numeric(df_21_3["value"], errors="coerce")
df_21_3 = df_21_3.dropna(subset=["value"])

# Standardize
df_21_3 = df_21_3.rename(columns={"previous_marital_status": "category"})
df_21_3["sex"] = "all"
df_21_3["indicator"] = "share_of_previous_marital_status"
df_21_3["unit"] = "%"
df_21_3["source"] = "OECD"

# Final column order
df_21_3 = df_21_3[["country","year","indicator","category","sex","unit","value","source"]]

df_21_3.sample(15)

Unnamed: 0,country,year,indicator,category,sex,unit,value,source
1120,Slovenia,2019,share_of_previous_marital_status,Divorced,all,%,10.3,OECD
589,Switzerland,2010,share_of_previous_marital_status,Divorced,all,%,21.0,OECD
316,Greece,2005,share_of_previous_marital_status,Divorced,all,%,10.9,OECD
724,Austria,2013,share_of_previous_marital_status,Divorced,all,%,22.0,OECD
87,New Zealand,2001,share_of_previous_marital_status,Single never married,all,%,73.5,OECD
129,Denmark,2002,share_of_previous_marital_status,Single never married,all,%,76.2,OECD
746,Luxembourg,2013,share_of_previous_marital_status,Widowed,all,%,1.7,OECD
1080,Australia,2019,share_of_previous_marital_status,Single never married,all,%,80.7,OECD
42,Spain,2000,share_of_previous_marital_status,Single never married,all,%,94.0,OECD
312,Finland,2005,share_of_previous_marital_status,Single never married,all,%,77.2,OECD


In [1113]:
#df_21_3.to_csv('../data/Cleaned/cleaned_share_of_previous_marital_status_oecd.csv', index=False)

In [1114]:
#df_21_3.to_sql('share_of_previous_marital_status_oecd', engine, if_exists= 'replace', index =  False)

In [1115]:
df_22_1 = pd.read_csv('../data/Raw/OECD/SF3_3_A_in_private_households_by_partnership_status_S1.csv')
#hauseholds_by_partnership_status_oecd
df_22_1

Unnamed: 0,Country,20+_All_Total_Living_with_a_partner(%),20+_All_Married or in a civil or registered partnership_living_with_a_partner(%),20+_All_Cohabiting_living_with_a_partner(%),20+_All_Not living with a partner(%),20/34_Total_living_with_a_partner(%),20/34_Married or in a civil or registered partnership_living_with_a_partner(%),20/34_Cohabiting_living_with_a_partner(%),Not living with a partner_Total(%),Living with at least one parent(%)
0,Australia (c),6379,5359,1020,3621,4706,2941,1765,5294,..
1,Austria,5880,4910,970,4120,3911,2215,1697,6089,3382
2,Belgium,6215,5351,864,3785,4528,2933,1594,5472,3134
3,Canada (d),6689,5446,1243,3311,5534,3355,2179,4466,..
4,Czech Republic,5117,4539,579,4883,3078,2132,946,6922,3620
5,Denmark,6415,5002,1412,3585,5054,2186,2868,4946,1067
6,Estonia,5393,3730,1664,4607,4531,1781,2750,5469,2646
7,France,6414,4941,1472,3586,5042,2189,2853,4958,2208
8,Germany,6261,5391,869,3739,3953,2215,1739,5974,2754
9,Greece,6023,5852,171,3977,3313,2924,390,6687,4543


In [1116]:
df_info = pd.DataFrame({
    'datatypes': df_22_1.dtypes,
    'null_count': df_22_1.isnull().sum(),
    'unique_count': df_22_1.nunique()
})
print(df_info)

                                                   datatypes  null_count  \
Country                                               object           0   
20+_All_Total_Living_with_a_partner(%)                object           0   
20+_All_Married or in a civil or registered par...    object           0   
20+_All_Cohabiting_living_with_a_partner(%)           object           0   
20+_All_Not living with a partner(%)                  object           0   
20/34_Total_living_with_a_partner(%)                  object           0   
20/34_Married or in a civil or registered partn...    object           0   
20/34_Cohabiting_living_with_a_partner(%)             object           0   
Not living with a partner_Total(%)                    object           0   
Living with at least one parent(%)                    object           0   

                                                    unique_count  
Country                                                       37  
20+_All_Total_Living_with_a_p

In [1117]:
df_22_1.columns = df_22_1.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)


df_22_1.head()

Unnamed: 0,country,20_all_total_living_with_a_partner,20_all_married_or_in_a_civil_or_registered_partnership_living_with_a_partner,20_all_cohabiting_living_with_a_partner,20_all_not_living_with_a_partner,2034_total_living_with_a_partner,2034_married_or_in_a_civil_or_registered_partnership_living_with_a_partner,2034_cohabiting_living_with_a_partner,not_living_with_a_partner_total,living_with_at_least_one_parent
0,Australia (c),6379,5359,1020,3621,4706,2941,1765,5294,..
1,Austria,5880,4910,970,4120,3911,2215,1697,6089,3382
2,Belgium,6215,5351,864,3785,4528,2933,1594,5472,3134
3,Canada (d),6689,5446,1243,3311,5534,3355,2179,4466,..
4,Czech Republic,5117,4539,579,4883,3078,2132,946,6922,3620


In [1118]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_22_1.columns if c != "country"]

df_22_1[num_cols] = (
    df_22_1[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [1119]:
df_22_1["country"] = df_22_1["country"].str.replace(r"\s*\(.*?\)", "", regex=True)
print(df_22_1["country"].unique())

['Australia' 'Austria' 'Belgium' 'Canada' 'Czech Republic' 'Denmark'
 'Estonia' 'France' 'Germany' 'Greece' 'Hungary' 'Iceland' 'Ireland'
 'Italy' 'Latvia' 'Luxembourg' 'Netherlands' 'New Zealand' 'Norway'
 'Poland' 'Portugal' 'Slovak Republic' 'Slovenia' 'Spain' 'Sweden'
 'Switzerland' 'United Kingdom' 'United States' 'OECD-28 average'
 'Bulgaria' 'Croatia' 'Cyprus' 'Lithuania' 'Malta' 'Romania' 'EU average'
 'Eurozone average']


In [1120]:
df_22_1.drop_duplicates(inplace = True)
df_22_1.dropna(inplace=True)

if "year" not in df_22_1.columns :
    df_22_1["year"] = 2021
df_22_1["unit"] = "%"
df_22_1["source"] = "OECD"  
df_22_1.sample(8)

Unnamed: 0,country,20_all_total_living_with_a_partner,20_all_married_or_in_a_civil_or_registered_partnership_living_with_a_partner,20_all_cohabiting_living_with_a_partner,20_all_not_living_with_a_partner,2034_total_living_with_a_partner,2034_married_or_in_a_civil_or_registered_partnership_living_with_a_partner,2034_cohabiting_living_with_a_partner,not_living_with_a_partner_total,living_with_at_least_one_parent,year,unit,source
33,Malta,61.79,59.23,2.56,38.21,34.61,30.44,4.16,65.39,55.11,2021,%,OECD
6,Estonia,53.93,37.3,16.64,46.07,45.31,17.81,27.5,54.69,26.46,2021,%,OECD
27,United States,59.5,52.4,7.1,40.5,41.9,29.75,12.15,58.11,29.68,2021,%,OECD
29,Bulgaria,60.18,50.79,9.39,39.82,43.93,24.41,19.51,56.07,34.56,2021,%,OECD
26,United Kingdom,60.68,48.42,12.26,39.32,43.7,21.84,21.86,56.3,24.99,2021,%,OECD
36,Eurozone average,59.15,51.1,8.05,40.85,37.87,24.08,13.79,61.77,38.93,2021,%,OECD
34,Romania,62.74,58.37,4.37,37.26,47.19,40.44,6.75,52.75,35.7,2021,%,OECD
2,Belgium,62.15,53.51,8.64,37.85,45.28,29.33,15.94,54.72,31.34,2021,%,OECD


In [1121]:
df_info = pd.DataFrame({
    'datatypes': df_22_1.dtypes,
    'null_count': df_22_1.isnull().sum(),
    'unique_count': df_22_1.nunique()
})
print(df_info)

                                                   datatypes  null_count  \
country                                               object           0   
20_all_total_living_with_a_partner                   float64           0   
20_all_married_or_in_a_civil_or_registered_part...   float64           0   
20_all_cohabiting_living_with_a_partner              float64           0   
20_all_not_living_with_a_partner                     float64           0   
2034_total_living_with_a_partner                     float64           0   
2034_married_or_in_a_civil_or_registered_partne...   float64           0   
2034_cohabiting_living_with_a_partner                float64           0   
not_living_with_a_partner_total                      float64           0   
living_with_at_least_one_parent                      float64           0   
year                                                   int64           0   
unit                                                  object           0   
source      

In [1122]:
#df_22_1.to_csv('../data/Cleaned/cleaned_households_by_partnership_status_oecd.csv', index=False)

In [1123]:
#df_22_1.to_sql('households_by_partnership_status_oecd', engine, if_exists='replace', index= False)

In [1124]:
df_22_2 = pd.read_csv('../data/Raw/OECD/SF3_3_B_ by level of educational attainment_S2.csv')
#level_of_educational_attainment
df_22_2

Unnamed: 0,Country,Low_Education_Total_living_with_a_partner(%),Low_educationMarried or in a civil or registered partnership_living_with_a_partner(%),Low_education_Cohabiting_living_with_a_partner(%),Not living with a partner_Low_education(%),Medium education_Total_Living with a partner(%),Medium education_Married or in a civil or registered partnership_Living with a partner(%),Medium education_Cohabiting_Living with a partner(%),Not living with a partner_Medium education(%),High education_Total_Living with a partner(%),High education_Married or in a civil or registered partnership_Living with a partner(%),High education_Cohabiting_Living with a partner(%),Not living with a partner_High education(%)
0,Austria,5681,5049,632,4319,5927,4873,1054,,6003,4838,1165,3997
1,Belgium,6228,5611,617,3772,6079,4980,1099,,6709,5658,1051,3291
2,Czech Republic,4081,3655,426,5919,5399,4787,612,4601.0,5729,5026,703,4271
3,Estonia,4217,2639,1578,5783,5441,3661,1779,4559.0,6014,4445,1569,3986
4,France,6112,5193,918,3888,6568,4917,1651,3432.0,6558,4660,1898,3442
5,Germany,5446,4879,567,4554,6238,5313,925,3762.0,6889,5916,974,3111
6,Greece,6381,6288,93,3619,5700,5488,212,4300.0,5833,5570,263,4167
7,Hungary,5033,4038,995,4967,5794,4678,1115,4206.0,5956,5102,855,4044
8,Iceland,5186,4102,1084,4814,5831,4657,1174,4169.0,6972,5453,1519,3028
9,Latvia,3627,2592,1035,6373,4932,3954,978,5068.0,5291,4539,752,4709


In [1125]:
df_info = pd.DataFrame({
    'datatypes': df_22_2.dtypes,
    'null_count': df_22_2.isnull().sum(),
    'unique_count': df_22_2.nunique()
})
print(df_info)

                                                   datatypes  null_count  \
Country                                               object           0   
Low_Education_Total_living_with_a_partner(%)          object           0   
Low_educationMarried or in a civil or registere...    object           0   
Low_education_Cohabiting_living_with_a_partner(%)     object           0   
Not living with a partner_Low_education(%)            object           0   
Medium education_Total_Living with a partner(%)       object           0   
Medium education_Married or in a civil or regis...    object           0   
Medium education_Cohabiting_Living with a partn...    object           0   
Not living with a partner_Medium education(%)         object           2   
High education_Total_Living with a partner(%)         object           0   
High education_Married or in a civil or registe...    object           0   
High education_Cohabiting_Living with a partner(%)    object           0   
Not living w

In [1126]:
df_22_2.columns = df_22_2.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)


df_22_2.head()

Unnamed: 0,country,low_education_total_living_with_a_partner,low_educationmarried_or_in_a_civil_or_registered_partnership_living_with_a_partner,low_education_cohabiting_living_with_a_partner,not_living_with_a_partner_low_education,medium_education_total_living_with_a_partner,medium_education_married_or_in_a_civil_or_registered_partnership_living_with_a_partner,medium_education_cohabiting_living_with_a_partner,not_living_with_a_partner_medium_education,high_education_total_living_with_a_partner,high_education_married_or_in_a_civil_or_registered_partnership_living_with_a_partner,high_education_cohabiting_living_with_a_partner,not_living_with_a_partner_high_education
0,Austria,5681,5049,632,4319,5927,4873,1054,,6003,4838,1165,3997
1,Belgium,6228,5611,617,3772,6079,4980,1099,,6709,5658,1051,3291
2,Czech Republic,4081,3655,426,5919,5399,4787,612,4601.0,5729,5026,703,4271
3,Estonia,4217,2639,1578,5783,5441,3661,1779,4559.0,6014,4445,1569,3986
4,France,6112,5193,918,3888,6568,4917,1651,3432.0,6558,4660,1898,3442


In [1127]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_22_2.columns if c != "country"]

df_22_2[num_cols] = (
    df_22_2[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [1128]:
df_22_2["country"] = df_22_2["country"].str.replace(r"\s*\(.*?\)", "", regex=True)

print(df_22_2["country"].unique())

['Austria' 'Belgium' 'Czech Republic' 'Estonia' 'France' 'Germany'
 'Greece' 'Hungary' 'Iceland' 'Latvia' 'Luxembourg' 'Norway' 'Poland'
 'Portugal' 'Slovenia' 'Spain' 'Sweden' 'United Kingdom' 'OECD-19 average'
 'Bulgaria' 'Croatia' 'Cyprus' 'Lithuania' 'Malta' 'Romania' 'EU average'
 'Eurozone average']


In [1129]:
df_22_2.drop_duplicates(inplace=True)
df_22_2.dropna(inplace=True)


In [1130]:
if "year" not in df_22_1.columns :
    df_22_2["year"] = 2021
df_22_2["unit"] = "%"
df_22_2["source"] = "OECD"    

df_22_2.sample(10)

Unnamed: 0,country,low_education_total_living_with_a_partner,low_educationmarried_or_in_a_civil_or_registered_partnership_living_with_a_partner,low_education_cohabiting_living_with_a_partner,not_living_with_a_partner_low_education,medium_education_total_living_with_a_partner,medium_education_married_or_in_a_civil_or_registered_partnership_living_with_a_partner,medium_education_cohabiting_living_with_a_partner,not_living_with_a_partner_medium_education,high_education_total_living_with_a_partner,high_education_married_or_in_a_civil_or_registered_partnership_living_with_a_partner,high_education_cohabiting_living_with_a_partner,not_living_with_a_partner_high_education,unit,source
14,Slovenia,51.15,45.57,5.59,48.85,51.64,43.8,7.83,48.36,55.28,45.94,9.34,44.72,%,OECD
8,Iceland,51.86,41.02,10.84,48.14,58.31,46.57,11.74,41.69,69.72,54.53,15.19,30.28,%,OECD
4,France,61.12,51.93,9.18,38.88,65.68,49.17,16.51,34.32,65.58,46.6,18.98,34.42,%,OECD
21,Cyprus,66.81,64.65,2.17,33.19,64.46,59.71,4.75,35.54,65.75,59.86,5.89,34.25,%,OECD
23,Malta,65.14,63.3,1.84,34.86,54.84,51.17,3.67,45.16,57.23,53.14,4.09,42.77,%,OECD
3,Estonia,42.17,26.39,15.78,57.83,54.41,36.61,17.79,45.59,60.14,44.45,15.69,39.86,%,OECD
9,Latvia,36.27,25.92,10.35,63.73,49.32,39.54,9.78,50.68,52.91,45.39,7.52,47.09,%,OECD
26,Eurozone average,57.4,51.11,6.29,42.6,58.18,48.92,9.26,41.82,61.13,51.69,9.45,38.87,%,OECD
7,Hungary,50.33,40.38,9.95,49.67,57.94,46.78,11.15,42.06,59.56,51.02,8.55,40.44,%,OECD
19,Bulgaria,58.16,45.7,12.46,41.84,59.72,51.6,8.12,40.28,63.75,55.52,8.23,36.25,%,OECD


In [1131]:
df_info = pd.DataFrame({
    'datatypes': df_22_2.dtypes,
    'null_count': df_22_2.isnull().sum(),
    'unique_count': df_22_2.nunique()
})
print(df_info)

                                                   datatypes  null_count  \
country                                               object           0   
low_education_total_living_with_a_partner            float64           0   
low_educationmarried_or_in_a_civil_or_registere...   float64           0   
low_education_cohabiting_living_with_a_partner       float64           0   
not_living_with_a_partner_low_education              float64           0   
medium_education_total_living_with_a_partner         float64           0   
medium_education_married_or_in_a_civil_or_regis...   float64           0   
medium_education_cohabiting_living_with_a_partner    float64           0   
not_living_with_a_partner_medium_education           float64           0   
high_education_total_living_with_a_partner           float64           0   
high_education_married_or_in_a_civil_or_registe...   float64           0   
high_education_cohabiting_living_with_a_partner      float64           0   
not_living_w

In [1132]:
#df_22_2.to_csv('../data/Cleaned/cleaned_level_of_educational_attainment_oecd.csv', index=False)

In [1133]:
#df_22_2.to_sql('level_of_educational_attainment_oecd',engine, if_exists='replace', index= False)

In [None]:
df_23_1 = pd.read_csv('../data/Raw/OECD/SF_1_3_Living_arrangements_of_children_by_income_status.csv')

df_23_1.head()

Unnamed: 0,country,Married_living with two parents,Cohabiting_living with two parents,Mother_living with one parent,Father_living with one parent,Other,Married_living with two parents.1,Cohabiting_living with two parents.1,Mother_living with one parent.1,Fatherl_iving with one parent,Other.1
0,Austria,6712,347,2710,169,62,7093,1368,1320,144,75
1,Belgium,2824,1559,4727,646,245,5817,2164,1509,432,78
2,Czech Republic,2954,2958,3756,238,94,7182,1785,884,91,58
3,Estonia,4198,2625,3019,33,125,5447,3195,1226,91,41
4,Finland,3293,1330,3892,1261,224,7011,1791,989,171,37


In [1135]:
df_info = pd.DataFrame({
    'datatypes': df_23_1.dtypes,
    'null_count': df_23_1.isnull().sum(),
    'unique_count': df_23_1.nunique()
})
print(df_info)

                                               datatypes  null_count  \
Unnamed: 0                                        object           2   
Children living in relative income poverty        object           0   
Unnamed: 2                                        object           1   
Unnamed: 3                                        object           0   
Unnamed: 4                                        object           1   
Unnamed: 5                                        object           1   
Children not living in relative income poverty    object           0   
Unnamed: 7                                        object           1   
Unnamed: 8                                        object           0   
Unnamed: 9                                        object           1   
Unnamed: 10                                       object           1   

                                                unique_count  
Unnamed: 0                                                31  
Children 

In [1136]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_23_1.columns if c != "country"]

df_23_1[num_cols] = (
    df_23_1[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)
df_23_1.head()

  df_23_1[num_cols].astype(str)


Unnamed: 0.1,Unnamed: 0,Children living in relative income poverty,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Children not living in relative income poverty,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10
0,,,,,,,,,,,
1,,,,,,,,,,,
2,,67.12,3.47,27.1,1.69,0.62,70.93,13.68,13.2,1.44,0.75
3,,28.24,15.59,47.27,6.46,2.45,58.17,21.64,15.09,4.32,0.78
4,,29.54,29.58,37.56,2.38,0.94,71.82,17.85,8.84,0.91,0.58


In [1140]:
# 3) Long format
df_23_1 = df_23_1.melt(id_vars="country",
                       var_name="living_arrangement",
                       value_name="value")

# 4) Add indicator based on block position (left block = poverty, right block = not poverty)
# After header=1 the column names repeat; split by original order:
unique_cols = [c for c in df_23_1["living_arrangement"].unique()]
half = len(unique_cols) // 2
poverty_cols = set(unique_cols[:half])         # first block
not_pov_cols = set(unique_cols[half:])         # second block

df_23_1["indicator"] = df_23_1["living_arrangement"].apply(
    lambda c: "living_in_poverty" if c in poverty_cols else "not_in_poverty"
)

# 5) Clean numbers (comma -> dot)
df_23_1["value"] = (
    df_23_1["value"].astype(str)
    .str.replace("\xa0","", regex=False)
    .str.replace("\u202f","", regex=False)
    .str.replace(",", ".", regex=False)
)
df_23_1["value"] = pd.to_numeric(df_23_1["value"], errors="coerce")

df_23_1.head(10)
# tidy.head()

Unnamed: 0,country,living_arrangement,value,indicator
0,,Living with two parents,,living_in_poverty
1,Austria,Living with two parents,67.12,living_in_poverty
2,Belgium,Living with two parents,28.24,living_in_poverty
3,Czech Republic,Living with two parents,29.54,living_in_poverty
4,Estonia,Living with two parents,41.98,living_in_poverty
5,Finland,Living with two parents,32.93,living_in_poverty
6,France,Living with two parents,32.0,living_in_poverty
7,Greece,Living with two parents,85.64,living_in_poverty
8,Hungary,Living with two parents,40.02,living_in_poverty
9,Iceland,Living with two parents,21.46,living_in_poverty


In [None]:
df_23_2 = pd.read_csv('../data/Raw/OECD/SF_1_3_Living_arrangements_of_children_by_mothers_level_of_education.csv')
df_23_2.sample(10)

In [None]:
df_info = pd.DataFrame({
    'datatypes': df_23_2.dtypes,
    'null_count': df_23_2.isnull().sum(),
    'unique_count': df_23_2.nunique()
})
print(df_info)

In [None]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_22_1.columns if c != "country"]

df_23_2[num_cols] = (
    df_23_2[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)