In [367]:
import pandas as pd
import os, re
from pathlib import Path
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from sqlalchemy import create_engine, types
from sqlalchemy import text 
from openpyxl import load_workbook

In [368]:
from dotenv import dotenv_values

config = dotenv_values()

# define variables for the login
pg_user = config['POSTGRES_USER']  # align the key label with your .env file !
pg_host = config['POSTGRES_HOST']
pg_port = config['POSTGRES_PORT']
pg_db = config['POSTGRES_DB']
pg_schema = config['POSTGRES_SCHEMA']
pg_pass = config['POSTGRES_PASS']

In [369]:
url = f'postgresql://{pg_user}:{pg_pass}@{pg_host}:{pg_port}/{pg_db}'
engine = create_engine(url, echo=False)

In [370]:
my_schema = 'team_5' # update it to your schema

with engine.begin() as conn: 
    result = conn.execute(text(f'SET search_path TO {my_schema};'))

In [371]:
df_1= pd.read_csv('../data/Raw/World_Marriage_Dataset.csv')

In [372]:
df_1.drop(columns=["Sr.No."], inplace=True)

In [373]:
df_1.columns = df_1.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [374]:
df_1.rename(columns={
    "agegroup": "age_group",
    "maritalstatus": "marital_status",
    "dataprocess": "data_process",
    "Data Collection (Start Year)": "data_collection_start_year",
    "Data Collection (End Year)": "data_collection_end_year",
    "Data Source": "data_source",
    "Country": "country",
    "Sex": "sex"
}, inplace=True)

In [375]:
df_1.drop_duplicates(inplace=True)
df_1.dropna(inplace=True)

df_1['data_collection_start_year'] = df_1['data_collection_start_year'].astype(str).str.replace(',', '').astype(int)
df_1['data_collection_end_year'] = df_1['data_collection_end_year'].astype(str).str.replace(',', '').astype(int)

In [376]:
df_info = pd.DataFrame({
    'datatypes': df_1.dtypes,
    'null_count': df_1.isnull().sum(),
    'unique_count': df_1.nunique()
})
print(df_info)

                           datatypes  null_count  unique_count
country                       object           0           235
age_group                     object           0            63
sex                           object           0             2
marital_status                object           0            35
data_process                  object           0             6
data_collection_start_year     int32           0            62
data_collection_end_year       int32           0            60
data_source                   object           0            15


In [377]:
#df_1.to_csv("cleaned_world_marriage.csv", index=False)

In [378]:
#df_1.to_sql('world_marriage', engine, if_exists='replace', index=False)

In [379]:
s_1 = ('../data/Cleaned/cleaned_world_marriage.csv')

In [None]:
AGE_COL = "age_group"

# 0) Ensure s_1 is a DataFrame
if isinstance(s_1, (str, Path)):
    s_1 = pd.read_csv(s_1)
elif isinstance(s_1, tuple) and len(s_1) == 1 and isinstance(s_1[0], (str, Path)):
    s_1 = pd.read_csv(s_1[0])

# 1) Normalize age labels; keep [+75]; map any 65+ to [65-69]

def norm_age(x):
    if pd.isna(x): return x
    x = str(x).replace("–","-").replace("—","-").replace("to","-").replace("_","-")
    x = re.sub(r"[()]", "", x)
    x = re.sub(r"\s+", "", x)
    if re.search(r"\[\+75\]|\[75\+\]|75\+|\+75", x):   # preserve 75+
        return "[+75]"
    if re.fullmatch(r"\+?65\+?|\[?\+65\]?|\[?65\+\]?", x, flags=re.I):  # merge 65+
        return "[65-69]"
    m = re.match(r"^\[?(\d{1,3})-(\d{1,3})\]?$", x)   # standard ranges
    if m:
        a, b = map(int, m.groups())
        return f"[{a}-{b}]"
    m = re.search(r"(\d{1,3})", x)                    # fallback: first number
    return f"[{m.group(1)}]" if m else x

s_1[AGE_COL] = s_1[AGE_COL].astype(str).map(norm_age)

# 2) Keep top-14 age buckets by frequency (delete all others)
top14 = s_1[AGE_COL].value_counts(dropna=False).nlargest(14).index.tolist()
s_1 = s_1[s_1[AGE_COL].isin(top14)].copy()

# 3) Natural ordering (put [+75] last)
def start_num(lbl):
    return 10**9 if lbl == "[+75]" else int(re.search(r"\d+", str(lbl)).group())
cats = sorted(top14, key=start_num)
s_1[AGE_COL] = pd.Categorical(s_1[AGE_COL], categories=cats, ordered=True)
s_1 = s_1.sort_values(AGE_COL).reset_index(drop=True)

print("Kept age buckets (14):", list(s_1[AGE_COL].cat.categories))

Kept age buckets (14): ['[10-14]', '[15-19]', '[20-24]', '[25-29]', '[30-34]', '[35-39]', '[40-44]', '[45-49]', '[50-54]', '[55-59]', '[60-64]', '[65-69]', '[70-74]', '[+75]']


In [None]:
#s_1.to_csv('../data/Staging/staging_world_marriage.csv', index= False)

In [None]:
#s_1.to_sql('staging_world_marriage', engine, if_exists='replace', index=False)

249

In [383]:
df_2 = pd.read_csv('../data/Raw/age-at-marriage-women.csv')

In [384]:
df_2.columns = df_2.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [385]:
df_2 = df_2.drop(columns=['1005564annotations'])

df_2.rename(columns={
    "entity": "country",   
}, inplace=True)

In [386]:
df_2.drop_duplicates(inplace=True)
df_2.dropna(inplace=True)


In [387]:
df_2['year'] = df_2['year'].astype(str).str.replace(',', '').astype(int)

In [388]:
df_info = pd.DataFrame({
    'datatypes': df_2.dtypes,
    'null_count': df_2.isnull().sum(),
    'unique_count': df_2.nunique()
})
print(df_info)

                                    datatypes  null_count  unique_count
country                                object           0            41
code                                   object           0            41
year                                    int32           0            32
mean_age_of_women_at_first_marriage   float64           0           179


In [389]:
#df_2.to_csv("cleaned_age_at_marriage_women.csv", index=False)

In [390]:
#df_2.to_sql('age_at_marriage_women', engine, if_exists='replace', index=False)

In [391]:
df_3= pd.read_csv('../data/Raw/marriage-rate-per-1000-inhabitants.csv')

In [392]:
df_3.columns = df_3.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [393]:
df_3.rename(columns={
    "entity": "country",   
}, inplace=True)

In [394]:
df_3['year'] = df_3['year'].astype(str).str.replace(',', '').astype(int)

In [395]:
df_3.drop_duplicates(inplace=True)
df_3.dropna(inplace=True)


In [396]:
df_info = pd.DataFrame({
    'datatypes': df_3.dtypes,
    'null_count': df_3.isnull().sum(),
    'unique_count': df_3.nunique()
})
print(df_info)

                                              datatypes  null_count  \
country                                          object           0   
code                                             object           0   
year                                              int32           0   
crude_marriage_rate_marriages_per_1000_people   float64           0   

                                               unique_count  
country                                                  45  
code                                                     45  
year                                                    127  
crude_marriage_rate_marriages_per_1000_people           109  


In [397]:
#df_3.to_csv("cleaned_marriage-rate-per-1000-inhabitants.csv", index=False)

In [398]:
#df_3.to_sql('married_rate_per_1000', engine, if_exists='replace', index=False)

In [399]:
df_4= pd.read_csv('../data/Raw/marriage-rates-in-1990-vs-2020.csv')

In [400]:
df_4.columns = df_4.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [401]:
df_4 = df_4.drop(columns=['worldregionsaccordingtoowid'])

df_4.rename(columns={
    "crudemarriageratemarriagesper1000people": "crude_marriage_rate",
    "crudemarriageratemarriagesper1000people1": "crude_marriage_rate_people1",
    "year1": "year_1",
    "entity": "country"
}, inplace=True)

In [402]:
df_4.drop_duplicates(inplace=True)
df_4.dropna(inplace=True)

In [403]:
df_4['year_1'] = pd.to_numeric(df_4['year_1'], errors='coerce').astype('Int64')

In [404]:
df_info = pd.DataFrame({
    'datatypes': df_4.dtypes,
    'null_count': df_4.isnull().sum(),
    'unique_count': df_4.nunique()
})
print(df_info)

                            datatypes  null_count  unique_count
country                        object           0            38
code                           object           0            38
year                            int64           0            61
crude_marriage_rate           float64           0           101
crude_marriage_rate_people1   float64           0            28
year_1                          Int64           0             1


In [405]:
#df_4.to_csv("cleaned_marriage-rates-in-1990-vs-2020.csv", index=False)

In [406]:
#df_4.to_sql('marriage_rates_in_1990_vs_2020', engine, if_exists='replace', index=False)

In [407]:
df_5 = pd.read_csv('../data/Raw/share-of-births-outside-marriage.csv')

In [408]:
df_5.columns = df_5.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [409]:

df_5.rename(columns={
    "shareofbirthsoutsideofmarriageofallbirths": "share_of_births_outside_of_marriage",
    "entity": "country"
}, inplace=True)

df_5.drop_duplicates(inplace=True)
df_5.dropna(inplace=True)

In [410]:
df_info = pd.DataFrame({
    'datatypes': df_5.dtypes,
    'null_count': df_5.isnull().sum(),
    'unique_count': df_5.nunique()
})
print(df_info)

                                    datatypes  null_count  unique_count
country                                object           0            42
code                                   object           0            42
year                                    int64           0            62
share_of_births_outside_of_marriage   float64           0           610


In [411]:
#df_5.to_csv("cleaned_share-of-births-outside-marriage.csv", index=False)

In [412]:
#df_5.to_sql('share_of_births_outside_marriage', engine, if_exists='replace', index=False)

In [413]:
df_6 = pd.read_csv('../data/Raw/share-of-men-in-england-and-wales-who-have-ever-married-by-age.csv')
df_6

Unnamed: 0,Entity,Code,Year,Proportions of men or women who had ever married by a certain age for 1900 birth cohort,Proportions of men or women who had ever married by a certain age for 1920 birth cohort,Proportions of men or women who had ever married by a certain age for 1940 birth cohort,Proportions of men or women who had ever married by a certain age for 1960 birth cohort,Proportions of men or women who had ever married by a certain age for 1970 birth cohort,Proportions of men or women who had ever married by a certain age for 1980 birth cohort,Proportions of men or women who had ever married by a certain age for 1990 birth cohort,Proportions of men or women who had ever married by a certain age for 2000 birth cohort
0,Men,,17,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0
1,Men,,18,0.1,0.1,0.4,0.6,0.1,0.0,0.0,0.0
2,Men,,19,0.8,0.6,2.0,2.5,0.7,0.3,0.1,0.0
3,Men,,20,2.4,2.2,6.0,6.2,1.9,0.7,0.3,0.1
4,Men,,21,6.1,7.4,13.6,11.9,3.9,1.4,0.6,0.2
...,...,...,...,...,...,...,...,...,...,...,...
63,Women,,46,84.5,91.6,95.5,86.9,75.0,,,
64,Women,,47,84.8,91.7,95.6,87.0,75.4,,,
65,Women,,48,85.0,91.8,95.6,87.2,75.7,,,
66,Women,,49,85.2,91.9,95.7,87.3,76.0,,,


In [414]:
df_6.columns = df_6.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

df_6.drop_duplicates(inplace=True)

df_6.head()

Unnamed: 0,entity,code,year,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort
0,Men,,17,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0
1,Men,,18,0.1,0.1,0.4,0.6,0.1,0.0,0.0,0.0
2,Men,,19,0.8,0.6,2.0,2.5,0.7,0.3,0.1,0.0
3,Men,,20,2.4,2.2,6.0,6.2,1.9,0.7,0.3,0.1
4,Men,,21,6.1,7.4,13.6,11.9,3.9,1.4,0.6,0.2


In [415]:
df_6 = df_6.drop(columns=['code','proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort'])

df_6.rename(columns={
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort": "1900_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort": "1920_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort": "1940_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort": "1960_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort": "1970_birthcohort",
    "entity": "sex"
}, inplace=True)
df_6

Unnamed: 0,sex,year,1900_birthcohort,1920_birthcohort,1940_birthcohort,1960_birthcohort,1970_birthcohort
0,Men,17,0.0,0.0,0.0,0.1,0.0
1,Men,18,0.1,0.1,0.4,0.6,0.1
2,Men,19,0.8,0.6,2.0,2.5,0.7
3,Men,20,2.4,2.2,6.0,6.2,1.9
4,Men,21,6.1,7.4,13.6,11.9,3.9
...,...,...,...,...,...,...,...
63,Women,46,84.5,91.6,95.5,86.9,75.0
64,Women,47,84.8,91.7,95.6,87.0,75.4
65,Women,48,85.0,91.8,95.6,87.2,75.7
66,Women,49,85.2,91.9,95.7,87.3,76.0


In [416]:
df_6.dropna(inplace=True)
df_6.describe

<bound method NDFrame.describe of       sex  year  1900_birthcohort  1920_birthcohort  1940_birthcohort  \
0     Men    17               0.0               0.0               0.0   
1     Men    18               0.1               0.1               0.4   
2     Men    19               0.8               0.6               2.0   
3     Men    20               2.4               2.2               6.0   
4     Men    21               6.1               7.4              13.6   
..    ...   ...               ...               ...               ...   
63  Women    46              84.5              91.6              95.5   
64  Women    47              84.8              91.7              95.6   
65  Women    48              85.0              91.8              95.6   
66  Women    49              85.2              91.9              95.7   
67  Women    50              85.4              92.0              95.7   

    1960_birthcohort  1970_birthcohort  
0                0.1               0.0  
1      

In [417]:
df_info = pd.DataFrame({
    'datatypes': df_6.dtypes,
    'null_count': df_6.isnull().sum(),
    'unique_count': df_6.nunique()
})
print(df_info)

                 datatypes  null_count  unique_count
sex                 object           0             2
year                 int64           0            34
1900_birthcohort   float64           0            66
1920_birthcohort   float64           0            61
1940_birthcohort   float64           0            62
1960_birthcohort   float64           0            67
1970_birthcohort   float64           0            65


In [418]:
df_6.sample(12)

Unnamed: 0,sex,year,1900_birthcohort,1920_birthcohort,1940_birthcohort,1960_birthcohort,1970_birthcohort
36,Women,19,2.1,5.3,13.4,12.1,3.8
67,Women,50,85.4,92.0,95.7,87.5,76.3
61,Women,44,84.0,91.3,95.4,86.5,74.1
0,Men,17,0.0,0.0,0.0,0.1,0.0
43,Women,26,55.1,68.3,84.2,65.2,40.1
9,Men,26,48.3,51.0,67.7,46.8,24.1
3,Men,20,2.4,2.2,6.0,6.2,1.9
53,Women,36,79.9,88.9,94.2,83.5,68.2
40,Women,23,32.7,49.5,68.2,48.4,24.0
18,Men,35,84.8,86.7,89.1,73.7,56.3


In [419]:
#df_6.to_csv("cleaned_share-of-men-in-england-and-wales-who-have-ever-married-by-age.csv", index=False)

In [420]:
#df_6.to_sql('men_in_england_and_wales_married_by_age', engine, if_exists='replace', index=False)

In [421]:
df_7 = pd.read_csv('../data/Raw/share-of-births-outside-marriage.csv')

In [422]:
df_7.columns = df_7.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [423]:
df_7.rename(columns={
    "shareofsingleparenthouseholds": "share_of_single_parent_households",
    "entity": "country"
}, inplace=True)

df_7.drop_duplicates(inplace=True)
df_7.dropna(inplace=True)
df_7.sample(5)

Unnamed: 0,country,code,year,shareofbirthsoutsideofmarriageofallbirths
1946,Switzerland,CHE,1961,4.0
38,Australia,AUS,2013,34.4
616,Finland,FIN,1979,12.0
744,Greece,GRC,1962,1.2
1966,Switzerland,CHE,1981,5.2


In [424]:
df_info = pd.DataFrame({
    'datatypes': df_7.dtypes,
    'null_count': df_7.isnull().sum(),
    'unique_count': df_7.nunique()
})
print(df_info)

                                          datatypes  null_count  unique_count
country                                      object           0            42
code                                         object           0            42
year                                          int64           0            62
shareofbirthsoutsideofmarriageofallbirths   float64           0           610


In [425]:
#df_7.to_csv("cleaned_share-of-single-parent-households.csv", index=False)

In [426]:
#df_7.to_sql('single_parent_households', engine, if_exists='replace', index=False)

In [427]:
df_8 = pd.read_csv('../data/Raw/share-of-women-in-england-and-wales-who-have-ever-married-by-age.csv')

In [428]:
df_8.columns = df_8.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [429]:
df_8['code'] = df_8['code'].fillna('GBR')
df_8.sample(5)

Unnamed: 0,entity,code,year,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort
47,Women,GBR,30,71.6,82.7,90.9,76.7,55.8,37.8,28.9,
28,Men,GBR,45,91.8,91.3,92.0,80.5,68.2,,,
50,Women,GBR,33,76.8,86.8,93.0,81.1,63.1,49.0,38.7,
9,Men,GBR,26,48.3,51.0,67.7,46.8,24.1,11.2,7.1,
13,Men,GBR,30,72.9,76.4,83.3,63.9,41.4,25.5,20.1,


In [430]:
df_8 = df_8.drop(columns=['code','proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort'])

df_8.rename(columns={
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort": "1900_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort": "1920_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort": "1940_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort": "1960_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort": "1970_birthcohort",
    "entity": "sex"
}, inplace=True)

df_8.drop_duplicates(inplace=True)
df_8.dropna(inplace=True)
df_8.sample(5)

Unnamed: 0,sex,year,1900_birthcohort,1920_birthcohort,1940_birthcohort,1960_birthcohort,1970_birthcohort
31,Men,48,92.5,91.7,92.3,81.3,69.9
57,Women,40,82.4,90.5,95.0,85.3,72.0
49,Women,32,75.5,85.8,92.4,79.9,61.0
8,Men,25,39.6,41.6,59.7,40.6,19.4
27,Men,44,91.5,91.1,91.8,80.0,67.5


In [431]:
df_info = pd.DataFrame({
    'datatypes': df_8.dtypes,
    'null_count': df_8.isnull().sum(),
    'unique_count': df_8.nunique()
})
print(df_info)

                 datatypes  null_count  unique_count
sex                 object           0             2
year                 int64           0            34
1900_birthcohort   float64           0            66
1920_birthcohort   float64           0            61
1940_birthcohort   float64           0            62
1960_birthcohort   float64           0            67
1970_birthcohort   float64           0            65


In [432]:
#df_8.to_csv("cleaned_share-of-women-in-england-and-wales-who-have-ever-married-by-age.csv", index=False)

In [433]:
#df_8.to_sql('women_in_england_and_wales_married_by_age', engine, if_exists='replace', index=False)

In [434]:
#pip install openpyxl pywin32

In [435]:
df_excel_1 = pd.read_excel('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx')

In [436]:
#all_sheets = pd.read_excel('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx', sheet_name=None)

In [437]:
xls_1 = pd.ExcelFile('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx')
print(xls_1.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'MARITAL_STATUS_BY_AGE', 'CURRENTLY MARRIED', 'EVER_MARRIED', 'SMAM']


In [438]:
excel_1 = '../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx'

# Output directory (make sure it exists)
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)

# List of sheets you want to extract
sheets_to_extract = ['MARITAL_STATUS_BY_AGE', 'CURRENTLY MARRIED', 'EVER_MARRIED', 'SMAM']

In [439]:
"""for sheet in sheets_to_extract:
    # Read just this sheet into a DataFrame
    df_excel_1 = pd.read_excel(excel_1, sheet_name=sheet)
    
    # Optional: Clean the filename (replace spaces with underscores, etc.)
    csv_name = sheet.replace(' ', '_').lower() + '.csv'
    csv_path = os.path.join(output_dir, csv_name)
    
    # Save the DataFrame as CSV
    df_excel_1.to_csv(csv_path, index=False)
    print(f"Saved: {csv_path}")
"""

'for sheet in sheets_to_extract:\n    # Read just this sheet into a DataFrame\n    df_excel_1 = pd.read_excel(excel_1, sheet_name=sheet)\n    \n    # Optional: Clean the filename (replace spaces with underscores, etc.)\n    csv_name = sheet.replace(\' \', \'_\').lower() + \'.csv\'\n    csv_path = os.path.join(output_dir, csv_name)\n    \n    # Save the DataFrame as CSV\n    df_excel_1.to_csv(csv_path, index=False)\n    print(f"Saved: {csv_path}")\n'

In [440]:
xls_2 = pd.ExcelFile('../data/Raw/undesa_pd_2019_world_fertility_dataset.xlsx')
print(xls_2.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'FERTILITY INDICATORS']


In [441]:
excel_2 = '../data/Raw/undesa_pd_2019_world_fertility_dataset.xlsx'
sheet_name = 'FERTILITY INDICATORS'
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)

df_excel_2 = pd.read_excel(excel_2, sheet_name=sheet_name)


In [442]:
"""csv_name = sheet_name.replace(' ', '_').lower() + '.csv'
csv_path = os.path.join(output_dir, csv_name)
df_excel_2.to_csv(csv_path, index=False)
print(f"Saved: {csv_path}")
"""

'csv_name = sheet_name.replace(\' \', \'_\').lower() + \'.csv\'\ncsv_path = os.path.join(output_dir, csv_name)\ndf_excel_2.to_csv(csv_path, index=False)\nprint(f"Saved: {csv_path}")\n'

In [443]:
xls_3 = pd.ExcelFile('../data/Raw/undesa_pd_ds_1970-2030_fp_rev-2024_rev.xlsx')
print(xls_3.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'Countries', 'Regions']


In [444]:
excel_3 = '../data/Raw/undesa_pd_ds_1970-2030_fp_rev-2024_rev.xlsx'
sheets_to_extract = ['Countries', 'Regions']
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)


In [445]:
"""
for sheet in sheets_to_extract:
    df = pd.read_excel(excel_3, sheet_name=sheet)
    csv_name = sheet.replace(' ', '_').lower() + '.csv'
    csv_path = os.path.join(output_dir, csv_name)
    df.to_csv(csv_path, index=False)
    print(f"Saved: {csv_path}")

"""

'\nfor sheet in sheets_to_extract:\n    df = pd.read_excel(excel_3, sheet_name=sheet)\n    csv_name = sheet.replace(\' \', \'_\').lower() + \'.csv\'\n    csv_path = os.path.join(output_dir, csv_name)\n    df.to_csv(csv_path, index=False)\n    print(f"Saved: {csv_path}")\n\n'

In [446]:
df_9 = pd.read_csv('../data/Raw/unpopulation_dataportal_20250728095844.csv')
df_9.sample(5)

Unnamed: 0,IndicatorId,IndicatorName,IndicatorShortName,Source,SourceYear,Author,LocationId,Location,Iso2,Iso3,...,AgeStart,AgeEnd,Age,CategoryId,Category,EstimateTypeId,EstimateType,EstimateMethodId,EstimateMethod,Value
22451,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,776,Tonga,TO,TON,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,51.71
13303,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,450,Madagascar,MG,MDG,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,66.03
17274,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,598,Papua New Guinea,PG,PNG,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,70.18
22213,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,768,Togo,TG,TGO,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,69.71
7934,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,266,Gabon,GA,GAB,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,49.71


In [447]:
df_9.columns = df_9.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_9.sample(5)

Unnamed: 0,indicatorid,indicatorname,indicatorshortname,source,sourceyear,author,locationid,location,iso2,iso3,...,agestart,ageend,age,categoryid,category,estimatetypeid,estimatetype,estimatemethodid,estimatemethod,value
5843,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,204,Benin,BJ,BEN,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,80.97
3854,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,136,Cayman Islands,KY,CYM,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,55.27
12726,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,434,Libya,LY,LBY,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,46.41
11073,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,388,Jamaica,JM,JAM,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,35.55
15720,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,533,Aruba,AW,ABW,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,55.78


In [448]:
df_9 = df_9.drop(columns=['indicatorid','indicatorshortname',
    'source',
    'author', 'locationid', 'iso2','estimatetypeid','category','categoryid','agestart','ageend','author','ageid', 'estimatetype','variantid','sexid','timeid'])

df_9.rename(columns={
    "sourceyear": "year",
    "location": "country",
    "estimatemethodid": "estimate_method",
    "iso3": "code",
}, inplace=True)



In [449]:
df_9.drop_duplicates(inplace=True)
df_9.dropna(inplace = True)

df_9

Unnamed: 0,indicatorname,year,country,code,time,variant,sex,age,estimate_method,estimatemethod,value
0,Currently married (Percent),2024,Afghanistan,AFG,1970,Median,Female,15-49,2,Interpolation,80.94
2,Currently married (Percent),2024,Afghanistan,AFG,1971,Median,Female,15-49,2,Interpolation,80.90
4,Currently married (Percent),2024,Afghanistan,AFG,1972,Median,Female,15-49,2,Interpolation,80.87
6,Currently married (Percent),2024,Afghanistan,AFG,1973,Median,Female,15-49,2,Interpolation,80.84
8,Currently married (Percent),2024,Afghanistan,AFG,1974,Median,Female,15-49,2,Interpolation,80.53
...,...,...,...,...,...,...,...,...,...,...,...
25078,Currently married (Percent),2024,Zambia,ZMB,2021,Median,Female,15-49,3,Projection,54.31
25080,Currently married (Percent),2024,Zambia,ZMB,2022,Median,Female,15-49,3,Projection,53.82
25082,Currently married (Percent),2024,Zambia,ZMB,2023,Median,Female,15-49,3,Projection,53.35
25084,Currently married (Percent),2024,Zambia,ZMB,2024,Median,Female,15-49,3,Projection,52.91


In [450]:
df_info = pd.DataFrame({
    'datatypes': df_9.dtypes,
    'null_count': df_9.isnull().sum(),
    'unique_count': df_9.nunique()
})
print(df_info)

                datatypes  null_count  unique_count
indicatorname      object           0             1
year                int64           0             1
country            object           0           224
code               object           0           224
time                int64           0            56
variant            object           0             1
sex                object           0             1
age                object           0             1
estimate_method     int64           0             2
estimatemethod     object           0             2
value             float64           0          3867


In [451]:
#df_9.to_csv("cleaned_unpopulation_dataportal.csv", index=False)

In [452]:
#df_9.to_sql('unpopulation_dataportal', engine, if_exists='replace', index=False)

In [453]:
df_10 = pd.read_csv('../data/processed/countries_un.csv',  header=5, low_memory=False)

In [454]:
df_10.columns = (
    df_10.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
)
df_10.sample(10)

Unnamed: 0,countryorarea,isocode,indicator,year,agegroup,percentage,number,dataprocess
63963,Côte d'Ivoire,384,Married or in-union women,2027,30-34,83.550418,909.7358,Projection
61871,Ireland,372,Married or in-union women,2008,15-49,48.520619,576.089924,Estimate
105948,Puerto Rico,630,Married or in-union women,2010,35-39,58.81741,73.443829,Estimate
36799,Ecuador,218,Married or in-union women,2033,15-49,44.295833,2272.981305,Projection
93181,Vanuatu,548,Married or in-union women,2034,40-44,88.504628,11.164859,Projection
68205,Republic of Korea,410,Married or in-union women,1990,40-44,86.238571,1042.043943,Estimate
92053,New Caledonia,540,Married or in-union women,1974,40-44,65.1,2.012567,Estimate
88050,Oman,512,Married or in-union women,2041,25-29,48.666962,130.401907,Projection
8517,Armenia,51,Married or in-union women,1981,40-44,83.156667,89.180951,Estimate
27277,Comoros,174,Married or in-union women,1977,40-44,77.72,4.653874,Estimate


In [455]:
df_10.rename(columns={
    "dataprocess": "data_process",
}, inplace=True)

df_10.drop_duplicates(inplace=True)
df_10.sample(5)

Unnamed: 0,countryorarea,isocode,indicator,year,agegroup,percentage,number,data_process
78733,Malaysia,458,Married or in-union women,2010,40-44,88.0459,755.581299,Estimate
107321,Réunion,638,Married or in-union women,2020,20-24,22.6,5.756559,Estimate
129423,Togo,768,Married or in-union women,2028,15-49,60.560894,1539.134218,Projection
98893,Palau,585,Married or in-union women,2019,40-44,67.862428,0.422104,Estimate
558,Afghanistan,4,Married or in-union women,2039,45-49,86.738172,1154.617773,Projection


In [456]:
for col in ['percentage', 'number']:
    if col in df_10.columns:
        df_10[col] = (
            df_10[col]
            .astype(str)
            .str.replace(',', '.', regex=False)
            .str.extract(r'([-+]?[0-9]*\.?[0-9]+)', expand=False)
            .astype(float)
            .round(2)
        )

In [457]:
unnamed_cols = [col for col in df_10.columns if 'unnamed' in col.lower()]
df_10.drop(columns=unnamed_cols, inplace=True)

In [458]:
df_10.dropna(inplace=True)

In [459]:
df_info = pd.DataFrame({
    'datatypes': df_10.dtypes,
    'null_count': df_10.isnull().sum(),
    'unique_count': df_10.nunique()
})
print(df_info)

              datatypes  null_count  unique_count
countryorarea    object           0           225
isocode           int64           0           225
indicator        object           0             1
year              int64           0            81
agegroup         object           0             8
percentage      float64           0          9667
number          float64           0         65394
data_process     object           0             2


In [460]:
df_10.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145800 entries, 0 to 145799
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   countryorarea  145800 non-null  object 
 1   isocode        145800 non-null  int64  
 2   indicator      145800 non-null  object 
 3   year           145800 non-null  int64  
 4   agegroup       145800 non-null  object 
 5   percentage     145800 non-null  float64
 6   number         145800 non-null  float64
 7   data_process   145800 non-null  object 
dtypes: float64(2), int64(2), object(4)
memory usage: 8.9+ MB


In [461]:
#df_10.to_csv("cleaned_countries_1970_2025_un.csv", index=False)

In [462]:
#df_10.to_sql('countries_1970_2025_un', engine, if_exists='replace', index=False)

In [463]:
df_11 = pd.read_csv('../data/processed/currently_married_un.csv',  header=2, low_memory=False)

In [464]:
df_11.sample(8)

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,AgeGroup,AgeStart,AgeEnd,DataValue,DataProcess,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Data,Note on Country and Population
45580,Sri Lanka,144,2001,2001,Men,[40-44],40,44,90.05,Census,2001 Census,242,Sri Lanka 2001 Census,UNSD,,,Excluding persons from areas restricted by sec...
5306,Burkina Faso,854,1998,1999,Women,[30-34],30,34,96.9,Survey,1998-1999 DHS,1777,Burkina Faso 1998-1999 Demographic and Health ...,DHS_STATcompiler,1.0,,
14948,Finland,246,1998,1998,Women,[65-69],65,69,54.13,Estimate,1998 Estimate,2093,Finland 1998 Estimate,UNSD,,,
4657,British Virgin Islands,92,1991,1991,Men,[50-54],50,54,71.68,Census,1991 Census,2304,British Virgin Islands 1991 Census,UNSD,,,Excluding the institutional population.
33852,Netherlands,528,1995,1995,Women,[65-69],65,69,60.84,Estimate,1995 Estimate,2170,Netherlands 1995 Estimate,UNSD,,,
25796,Jordan,400,2007,2007,Women,[35-39],35,39,81.8,Survey,2007 DHS,2407,Jordan 2007 Demographic and Health Survey,DHS_HH,,,
29202,Lithuania,440,2011,2011,Men,[20-24],20,24,10.65,Census,2011 Census,4831,Lithuania 2011 Census,Eurostat,1.0,Estimates computed based on data on marital st...,
28790,Liechtenstein,438,2010,2010,Men,[30-34],30,34,43.57,Estimate,2010 Estimate,2145,Liechtenstein 2010 Estimate,UNSD,,,


In [465]:
df_11.columns = (
    df_11.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
    )
df_11.sample(8)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,agegroup,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteondata,noteoncountryandpopulation
7366,Channel Islands,830,1986,1986,Women,[40-44],40,44,76.09,Census,1986 Census,352,Channel Islands 1986 Census,US Census Bureau,,,Data pertain to resident population of Jersey ...
40095,Romania,642,2011,2011,Women,[30-34],30,34,74.07,Census,2011 Census,4840,Romania 2011 Census,UNSD,1.0,,
3736,Benin,204,2017,2018,Women,[20-24],20,24,67.2,Survey,2017-2018 DHS,6055,Benin 2017-2018 Demographic and Health Survey ...,DHS_STATcompiler,1.0,,
31310,Mexico,484,1970,1970,Women,[65-69],65,69,52.4,Census,1970 Census,1015,Mexico 1970 Census,UNSD,1.0,Data have not been adjusted for underenumeration.,
44865,Solomon Islands,90,2015,2015,Men,[15-19],15,19,1.6,Survey,2015 DHS,7516,Solomon Islands 2015 Demographic and Health Su...,National statistics,,,
29513,Luxembourg,442,1987,1987,Men,[60-64],60,64,83.46,Estimate,1987 Estimate,2147,Luxembourg 1987 Estimate,UNSD,1.0,,
31561,Mexico,484,2015,2015,Women,[45-49],45,49,72.36,Survey,2015 Intercensal Survey,7250,Mexico 2015 Intercensal Survey,National statistics,1.0,,
6717,Canada,124,2003,2003,Men,[20-24],20,24,12.83,Estimate,2003 Estimate,2061,Canada 2003 Estimate,UNSD,1.0,,


In [466]:
df_11 = df_11.drop(columns = ['datacataloglongname', 'datacatalogid', 'yearstart' , 'yearend', 'noteondata', 'noteoncountryandpopulation', 'including_consensual_unions'])

df_11.rename(columns={
    "agestart": "age_start",
    "countryorarea": "country",
    "datasource": "data_source",
    "datavalue" : "data_value"
}, inplace=True)

df_11.sample(10)

Unnamed: 0,country,isocode,sex,agegroup,age_start,ageend,data_value,dataprocess,datacatalogshortname,data_source
33827,Netherlands,528,Men,[10-14],10,14,0.0,Estimate,1995 Estimate,UNSD
31388,Mexico,484,Women,[20-24],20,24,46.88,Census,1995 Sample Census,IPUMS
755,Argentina,32,Men,[25-29],25,29,59.03,Census,1991 Census,UNSD
34385,Netherlands Antilles,530,Women,[20-24],20,24,19.07,Census,1981 Census,US Census Bureau
4557,Brazil,76,Men,[75+],75,999,70.13,Census,2000 Census,UNSD
31490,Mexico,484,Men,[40-44],40,44,83.51,Estimate,2014 Estimate,UNSD
35304,Nigeria,566,Women,[35-39],35,39,91.3,Survey,2013 DHS,DHS_HH
27168,Latvia,428,Women,[20-24],20,24,17.98,Estimate,2002 Estimate,UNSD
40322,Rwanda,646,Men,[20-24],20,24,27.25,Survey,2000 MICS_HH,MICS_HH
8625,Colombia,170,Women,[20-24],20,24,48.6,Survey,1995 DHS,DHS_STATcompiler


In [467]:
df_11.drop_duplicates(inplace=True)
df_11.dropna(inplace=True)

In [468]:
df_info = pd.DataFrame({
    'datatypes': df_11.dtypes,
    'null_count': df_11.isnull().sum(),
    'unique_count': df_11.nunique()
})
print(df_info)

                     datatypes  null_count  unique_count
country                 object           0           233
isocode                  int64           0           230
sex                     object           0             2
agegroup                object           0            23
age_start                int64           0            17
ageend                   int64           0            15
data_value             float64           0          9213
dataprocess             object           0             6
datacatalogshortname    object           0           412
data_source             object           0            15


In [469]:
#df_11.to_csv("cleaned_currently_married_un.csv", index=False)

In [470]:
#df_11.to_sql('currently_married_un', engine, if_exists='replace', index=False)

In [471]:
df_12 = pd.read_csv('../data/processed/ever_married_un.csv', header= 2, low_memory = False)
df_12.head()

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,AgeGroup,AgeStart,AgeEnd,DataValue,DataProcess,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Data,Note on Country and Population
0,Afghanistan,4,1972,1974,Men,[15-19],15,19,7.7,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
1,Afghanistan,4,1972,1974,Men,[20-24],20,24,32.6,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
2,Afghanistan,4,1972,1974,Men,[25-29],25,29,61.4,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
3,Afghanistan,4,1972,1974,Men,[30-34],30,34,83.0,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
4,Afghanistan,4,1972,1974,Men,[35-39],35,39,91.2,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,


In [472]:
df_12.columns = (
    df_12.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
    )
df_12.sample(8)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,agegroup,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteondata,noteoncountryandpopulation
7534,Cayman Islands,136,2010,2010,Women,[40-44],40,44,75.07,Census,2010 Census,4740,Cayman Islands 2010 Census,UNSD,,,Excluding the institutional population.
30086,Liechtenstein,438,1987,1987,Women,[40-44],40,44,89.11,Estimate,1987 Estimate,2145,Liechtenstein 1987 Estimate,UNSD,,Unrevised data.,
1433,Australia,36,1999,1999,Women,[60-64],60,64,96.39,Estimate,1999 Estimate,2037,Australia 1999 Estimate,UNSD,,,
31198,Luxembourg,442,2011,2011,Women,[30-34],30,34,72.82,Census,2011 Census,4832,Luxembourg 2011 Census,Eurostat,1.0,Estimates computed based on data on marital st...,
38119,Norway,578,1993,1993,Men,[40-44],40,44,85.5,Estimate,1993 Estimate,2180,Norway 1993 Estimate,UNSD,,,
33416,Monaco,492,2000,2000,Men,[35-39],35,39,66.97,Census,2000 Census,2453,Monaco 2000 Census,UNSD,,,
27611,Kazakhstan,398,2006,2006,Women,[30-34],30,34,90.84,Survey,2006 MICS,1882,Kazakhstan 2006 Multiple Indicator Cluster Survey,MICS,1.0,,
25275,Ireland,372,2016,2016,Women,[75+],75,999,87.0,Estimate,2016 Estimate,2126,Ireland 2016 Estimate,UNSD,1.0,,


In [473]:
df_12 = df_12.drop(columns = ['yearstart', 'yearend', 'datacatalogshortname', 'datacatalogid', 'datacataloglongname', 'including_consensual_unions', 'noteondata', 'noteoncountryandpopulation'])

df_12.rename(columns={
    "agestart": "age_start",
    "ageend": "age_end",
    "countryorarea": "country"
}, inplace=True)
df_12.sample(8)

Unnamed: 0,country,isocode,sex,agegroup,age_start,age_end,datavalue,dataprocess,datasource
12177,Denmark,208,Women,[60-64],60,64,93.54,Estimate,UNSD
1806,Austria,40,Men,[75+],75,999,93.39,Estimate,UNSD
31491,Malawi,454,Men,[25-29],25,29,82.19,Census,UNSD
30636,Lithuania,440,Men,[50-54],50,54,93.2,Estimate,UNSD
31803,Maldives,462,Women,[45-49],45,49,99.37,Census,UNSD
32850,Mauritius,480,Men,[20-24],20,24,11.5,Census,UNSD
9929,Costa Rica,188,Men,[45-49],45,49,87.41,Estimate,UNSD
11666,Democratic Republic of the Congo,180,Women,[60-64],60,64,99.51,Survey,MICS_HH


In [474]:
df_12.dropna(inplace=True)

df_info = pd.DataFrame({
    'datatypes': df_12.dtypes,
    'null_count': df_12.isnull().sum(),
    'unique_count': df_12.nunique()
})
print(df_info)

            datatypes  null_count  unique_count
country        object           0           233
isocode         int64           0           230
sex            object           0             2
agegroup       object           0            23
age_start       int64           0            17
age_end         int64           0            15
datavalue     float64           0          8396
dataprocess    object           0             6
datasource     object           0            15


In [475]:
#df_12.to_csv("cleaned_ever_married_un.csv", index=False)

In [476]:
#df_12.to_sql('ever_married_un', engine, if_exists= 'replace', index= False)

In [477]:
df_13 = pd.read_csv('../data/processed/fertility_indicators_un.csv', header=6, low_memory=False)
df_13.head()

Unnamed: 0,Country or Area,Country or Area Code,Age Group,Indicator,Date,Value,Series,DataType,Data Source Type,Survey Programme,Data Source Inventory ID,Data Source Name,Data Source Name (short),Data Source Start Year,Data Source End Year,Reference,Reference Year
0,Afghanistan,4,[Total],TFR,1964.977051,7.966653,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
1,Afghanistan,4,[Total],TFR,1965.977051,8.212275,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
2,Afghanistan,4,[Total],TFR,1966.977051,8.317603,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
3,Afghanistan,4,[Total],TFR,1967.977051,8.225812,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
4,Afghanistan,4,[Total],TFR,1968.977051,8.068459,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012


In [478]:
df_13.columns = (df_13.columns
        .str.lower()
        .str.strip()
        .str.replace(' ', '')
        .str.replace('(', '')
        .str.replace(')', '')
        .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
        )

df_13.sample(6)

Unnamed: 0,countryorarea,countryorareacode,agegroup,indicator,date,value,series,datatype,datasourcetype,surveyprogramme,datasourceinventoryid,datasourcename,datasourcenameshort,datasourcestartyear,datasourceendyear,reference,referenceyear
13775,China,156,[35-39],ASFR3539,1967.000732,205.8,"1982 One-per-Thousand FS,Birth Histories,One-p...",Birth histories,Survey,Survey,717,China 1982 National One-per-Thousand-Populatio...,1982 One-per-Thousand FS,1982,1982,China (1984). Analysis on China's national One...,1984
39778,Kenya,404,[15-19],ASFR1519,2006.5,106.0,"2008-2009 DHS,Direct,DHS,4689-16-39167",Direct,Survey,DHS,4689,Kenya 2008-2009 Demographic and Health Survey,2008-2009 DHS,2008,2009,DHS Statcompiler,2012
44251,Malawi,454,[25-29],ASFR2529,1991.295044,268.0,"1992 DHS,Direct,DHS,1834-16-39167",Direct,Survey,DHS,1834,Malawi 1992 Demographic and Health Survey,1992 DHS,1992,1992,DHS Statcompiler,2012
61867,Senegal,686,[Total],MAC,2011.920044,30.04138,"2014 DHS,Direct,DHS,5779-16-39167",Direct,Survey,DHS,5779,Senegal 2014 Demographic and Health Survey,2014 DHS,2014,2014,DHS Statcompiler,2012
14324,China,156,[15-19],ASFR1519,1989.998657,22.0,"1990 Census,Reconstructed Birth Histories data...",Reconstructed Birth Histories data,Census,Census,1569,China 1990 Census,1990 Census,1990,1990,Recent fertility trends in China: Results fro...,1994
69184,Tajikistan,762,[Total],TFR,2009.5,2.66,"Estimates,Direct,DYB,2230-16-39443",Direct,Estimate,Estimate,2230,All sources of estimates,Estimates,2009,2009,Demographic Yearbook,2015


In [479]:
df_13 = df_13.drop(columns=['countryorareacode','indicator','datasourceinventoryid','surveyprogramme','series','datasourcename','reference','referenceyear'])

df_13.replace({
    "agegroup": "age_group",
    "countryorarea": "country",
    "datatype": "data_type",
},inplace=True)

In [480]:
df_13['date'] = df_13['date'].astype(int)
df_13['value'] = df_13['value'].round(2)
df_13.sample(12)

Unnamed: 0,countryorarea,agegroup,date,value,datatype,datasourcetype,datasourcenameshort,datasourcestartyear,datasourceendyear
73004,Ukraine,[30-34],2005,36.0,Direct,Survey,2012 MICS,2012,2012
79571,Zimbabwe,[25-29],2001,196.55,Birth histories,Survey,2014 MICS,2014,2014
65388,Spain,[25-29],1952,160.27,Fertility data (adjusted),Estimate,Estimates,1952,1952
64182,Slovakia,[25-29],1995,90.59,Official estimates,Estimate,Estimates,1995,1995
12796,Canada,[35-39],2009,50.6,Direct,Register,Register,2009,2009
75499,Uruguay,[40-44],1975,21.08,Fertility data (adjusted),Estimate,Estimates,1975,1975
65425,Spain,[45-49],1955,4.55,Fertility data (adjusted),Estimate,Estimates,1955,1955
71120,Trinidad and Tobago,[35-39],1970,78.4,Computed rate from DYB,Register,Register,1970,1970
58208,Republic of Moldova,[30-34],1969,99.8,Official,Register,Register,1969,1969
49324,Myanmar,[20-24],2006,67.9,Birth histories,Survey,2007 FRHS,2007,2007


In [481]:
df_13.drop_duplicates(inplace=True)
df_13.dropna(inplace=True)

df_info = pd.DataFrame({
    'datatypes': df_13.dtypes,
    'null_count': df_13.isnull().sum(),
    'unique_count': df_13.nunique()
})
print(df_info)

                    datatypes  null_count  unique_count
countryorarea          object           0           201
agegroup               object           0             8
date                    int32           0            69
value                 float64           0         18752
datatype               object           0            30
datasourcetype         object           0             7
datasourcenameshort    object           0           539
datasourcestartyear     int64           0            69
datasourceendyear      object           0            70


In [482]:
#df_13.to_csv("../data/Cleaned/cleaned_fertility_indicators_un.csv", index=False)

In [483]:
#df_13.to_sql('fertility_indicators_un',engine, if_exists='replace', index=False)

In [484]:
df_14 = pd.read_csv('../data/processed/marital_status_by_age_un.csv', header= 2, low_memory=False)
df_14.head()

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,MaritalStatus,Non-standard_AgeGroups,Series_contains_Non-standard_AgeGroups,AgeGroup,AgeStart,...,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Age groups,Note on Marital Status,Note on Data,Note on Country and Population,Note Other
0,Afghanistan,4,1972,1974,Men,Divorced,,,[15-19],15,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
1,Afghanistan,4,1972,1974,Men,Divorced,,,[20-24],20,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
2,Afghanistan,4,1972,1974,Men,Divorced,,,[25-29],25,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
3,Afghanistan,4,1972,1974,Men,Divorced,,,[30-34],30,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
4,Afghanistan,4,1972,1974,Men,Divorced,,,[35-39],35,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,


In [485]:
df_14.columns= (df_14.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '' , regex=True)  
    )
df_14.sample(5)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,maritalstatus,nonstandard_agegroups,series_contains_nonstandard_agegroups,agegroup,agestart,...,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteonagegroups,noteonmaritalstatus,noteondata,noteoncountryandpopulation,noteother
170158,Netherlands,528,1979,1979,Women,Widowed,,,[75+],75,...,1979 Estimate,2170,Netherlands 1979 Estimate,UNSD,,,,,,
247567,Trinidad and Tobago,780,1990,1990,Men,Married,,,[50-54],50,...,1990 Census,1423,Trinidad and Tobago 1990 Census,UNSD,,,,,,
172333,Netherlands,528,2001,2001,Men,Widowed,,,[50-54],50,...,2001 Census,1347,Netherlands 2001 Census,UNSD,,,,,,
158746,Mexico,484,1990,1990,Women,Single,,,[45-49],45,...,1990 Census,1350,Mexico 1990 Census,UNSD,,,,Data have not been adjusted for underenumeration.,,
73593,Ethiopia,231,2016,2016,Men,Married,,,[45-49],45,...,2016 DHS,5808,Ethiopia 2016 Demographic and Health Survey,DHS_HH,,,,,,


In [486]:
df_14 = df_14.drop(columns=['datacataloglongname', 'noteondata', 'noteoncountryandpopulation','noteonagegroups', 'noteother',
                             'including_consensual_unions','isocode', 'datacatalogid', 'noteonmaritalstatus', 'series_contains_nonstandard_agegroups','nonstandard_agegroups'])

df_14.rename(columns={
    "countryorarea": "country",
    "agegroup": "age_group",
    "maritalstatus": "marital_status",
    "yearstart": "year_start",
    "yearend": "year_end",
    }, inplace =True
    )

df_14.sample(10)

Unnamed: 0,country,year_start,year_end,sex,marital_status,age_group,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datasource
214653,Saudi Arabia,2016,2016,Men,Divorced,[45-49],45,49,1.63,Survey,2016 DS,National statistics
69407,Egypt,2003,2003,Women,Widowed,[20-24],20,24,0.1,Survey,2003 DHS Interim,DHS_STATcompiler
45252,Colombia,1995,1995,Women,Married,[30-34],30,34,37.2,Survey,1995 DHS,DHS_STATcompiler
198059,Puerto Rico,1980,1980,Women,Consensual union,[70-74],70,74,1.0,Census,1980 Census,UNSD
137540,Lao People’s Democratic Republic,2015,2015,Men,Consensual union,[25-29],25,29,0.82,Census,2015 Census,National statistics
5516,Aruba,1981,1981,Women,Widowed,[60-64],60,64,21.27,Census,1981 Census,UNSD
234286,Sweden,1985,1985,Men,Married,[70-74],70,74,69.95,Census,1985 Census,UNSD
165358,Mozambique,2017,2017,Men,Single,[50-54],50,54,5.19,Census,2017 Census,National statistics
165067,Mozambique,2011,2011,Men,Not living together,[50-54],50,54,1.3,Survey,2011 DHS,DHS_STATcompiler
23183,Brazil,2000,2000,Women,Consensual union,[65-69],65,69,4.43,Census,2000 Census,UNSD


In [487]:
df_14.drop_duplicates(inplace=True)
df_14.dropna(inplace=True)

df_info = pd.DataFrame({
    'datatypes': df_14.dtypes,
    'null_count': df_14.isnull().sum(),
    'unique_count': df_14.nunique()
})
print(df_info)

                     datatypes  null_count  unique_count
country                 object           0           235
year_start               int64           0            62
year_end                 int64           0            60
sex                     object           0             2
marital_status          object           0            35
age_group               object           0            63
agestart                 int64           0            21
ageend                   int64           0            20
datavalue              float64           0          9994
dataprocess             object           0             6
datacatalogshortname    object           0           443
datasource              object           0            15


In [488]:
#df_14.to_csv("cleaned_marital_status_by_age_un.csv", index=False)

In [489]:
#df_14.to_sql('marital_status_by_age_un', engine, if_exists='replace', index=False)

In [490]:
df_15 = pd.read_csv('../data/processed/regions_un.csv', header=5, low_memory= False)
df_15.head(10)

Unnamed: 0,Region and subregion,ISO code,Regional Classification,Indicator,Year,AgeGroup,Percentage,Number,DataProcess
0,World,900,M49,Married or in-union women,1970,15-19,22.576683,71867.82,Estimate
1,World,900,M49,Married or in-union women,1970,20-24,63.802057,162860.4,Estimate
2,World,900,M49,Married or in-union women,1970,25-29,87.174827,182681.1,Estimate
3,World,900,M49,Married or in-union women,1970,30-34,90.825027,179121.4,Estimate
4,World,900,M49,Married or in-union women,1970,35-39,90.284386,161526.3,Estimate
5,World,900,M49,Married or in-union women,1970,40-44,86.483531,139334.4,Estimate
6,World,900,M49,Married or in-union women,1970,45-49,82.680237,116088.4,Estimate
7,World,900,M49,Married or in-union women,1970,15-49,69.379111,1013480.0,Estimate
8,World,900,M49,Married or in-union women,1971,15-19,22.630416,74127.62,Estimate
9,World,900,M49,Married or in-union women,1971,20-24,63.613178,170087.3,Estimate


In [491]:
df_15.columns = (df_15.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(','')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
    )
df_15.sample(6)

Unnamed: 0,regionandsubregion,isocode,regionalclassification,indicator,year,agegroup,percentage,number,dataprocess
21651,Micronesia,954,SDG-M49,Married or in-union women,2003,30-34,69.308804,30.014177,Estimate
7683,Southern Africa,913,M49,Married or in-union women,2039,30-34,36.798986,2400.978515,Projection
26808,Upper-middle-income countries,1502,Income group,Married or in-union women,2000,15-19,7.286791,13219.772592,Estimate
12396,Europe,908,SDG-M49,Married or in-union women,1980,35-39,84.578843,17066.753352,Estimate
14981,Western Europe,926,M49,Married or in-union women,1979,40-44,83.430863,4604.829227,Estimate
21595,Micronesia,954,SDG-M49,Married or in-union women,1996,30-34,72.48645,27.439021,Estimate


In [492]:
df_15 = df_15.drop(columns=['regionalclassification'])

df_15.rename(columns={
    "regionandsubregion": "region",
    "isocode": "iso_code",
    "agegroup": "age_group",
    "dataprocess": "process"
}, inplace=True)

df_15.sample(10)

Unnamed: 0,region,iso_code,indicator,year,age_group,percentage,number,process
16841,Caribbean,915,Married or in-union women,2050,20-24,23.112448,652.577198,Projection
27157,Upper-middle-income countries,1502,Married or in-union women,2043,40-44,78.584952,116059.702009,Projection
27071,Upper-middle-income countries,1502,Married or in-union women,2032,15-49,56.893002,652525.380949,Projection
27435,High-income countries,1503,Married or in-union women,1997,30-34,75.95331,35570.142408,Estimate
21245,Melanesia,928,Married or in-union women,2033,40-44,82.765933,774.665134,Projection
27766,High-income countries,1503,Married or in-union women,2038,45-49,67.37002,31163.563407,Projection
22790,Developed countries,901,Married or in-union women,1983,45-49,79.640419,26792.26908,Estimate
28169,No income group available,1518,Married or in-union women,2008,20-24,42.302436,1096.021841,Estimate
12062,South-Eastern Asia,35,Married or in-union women,2019,45-49,83.538665,35294.215422,Estimate
14520,Southern Europe,925,Married or in-union women,2003,15-19,3.768015,153.316268,Estimate


In [493]:
print(df_15['number'] % 1 != 0)

0        True
1        True
2        True
3        True
4        True
         ... 
28507    True
28508    True
28509    True
28510    True
28511    True
Name: number, Length: 28512, dtype: bool


In [494]:
df_15['percentage'] = df_15['percentage'].round(2)
df_15['number'] = df_15['number'].astype(int)
df_15.head(10)

Unnamed: 0,region,iso_code,indicator,year,age_group,percentage,number,process
0,World,900,Married or in-union women,1970,15-19,22.58,71867,Estimate
1,World,900,Married or in-union women,1970,20-24,63.8,162860,Estimate
2,World,900,Married or in-union women,1970,25-29,87.17,182681,Estimate
3,World,900,Married or in-union women,1970,30-34,90.83,179121,Estimate
4,World,900,Married or in-union women,1970,35-39,90.28,161526,Estimate
5,World,900,Married or in-union women,1970,40-44,86.48,139334,Estimate
6,World,900,Married or in-union women,1970,45-49,82.68,116088,Estimate
7,World,900,Married or in-union women,1970,15-49,69.38,1013479,Estimate
8,World,900,Married or in-union women,1971,15-19,22.63,74127,Estimate
9,World,900,Married or in-union women,1971,20-24,63.61,170087,Estimate


In [495]:
df_15.dropna(inplace=True)
df_15.dropna(inplace=True)

df_info = pd.DataFrame({
    'datatypes': df_15.dtypes,
    'null_count': df_15.isnull().sum(),
    'unique_count': df_15.nunique()
})
print(df_info)

           datatypes  null_count  unique_count
region        object           0            43
iso_code       int64           0            44
indicator     object           0             1
year           int64           0            81
age_group     object           0             8
percentage   float64           0          7796
number         int32           0         20311
process       object           0             2


In [496]:
#df_15.to_csv('cleaned_regions_un.csv', index=False)



In [497]:
#df_15.to_sql('regions_un', engine, if_exists='replace',index=False)

In [498]:
df_16_1 = pd.read_csv('../data/Raw/OECD/SF_1_1_Family_size_and_composition - Sayfa1.csv')
df_16_1
#Data for Chart SF1.1.A. Average size of households by household type, 2024a
# avg_size_all	avg_size_couple_with_children	avg_size_single_parent_with_children		

Unnamed: 0,Country,All households,Couple households with children,Single parent households with children
0,Mexico,356,408.0,276.0
1,Costa Rica,346,437.0,344.0
2,Türkiye,320,410.0,280.0
3,Israel,319,465.0,286.0
4,Columbia,310,,
5,Slovak Republic,310,380.0,250.0
6,Chile,280,,
7,Iceland,270,412.0,261.0
8,New Zealand,261,388.0,267.0
9,Greece,260,380.0,250.0


In [499]:
df_16_1.columns = df_16_1.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [500]:
df_16_1.rename(columns={
        "All households": "avg_size_all",
        "Couple with children": "avg_size_couple_with_children",
        "Single parent with children": "avg_size_single_parent_with_children"
}, inplace=True)

In [501]:
df_16_1.drop_duplicates(inplace=True)
df_16_1.dropna(inplace=True)

In [502]:
for col in df_16_1.columns:
    if col != 'country':
        # Replace commas with dots if necessary, remove non-numeric chars, convert to float
        df_16_1[col] = (
            df_16_1[col]
            .astype(str)  # ensure string for replace
            .str.replace(',', '.', regex=False)  # decimal commas to dots
            .str.replace(r'[^\d\.\-]', '', regex=True)  # remove non-numeric chars except dot and minus
            .replace('', None)  # empty to NaN
            .astype(float)  # convert to float
        )

# Check updated dtypes
print(df_16_1.dtypes)

country                                    object
all_households                            float64
couple_households_with_children           float64
single_parent_households_with_children    float64
dtype: object


In [503]:
info_16_1 = pd.DataFrame({
    'dtype': df_16_1.dtypes,
    'null_count': df_16_1.isnull().sum(),
    'unique_count': df_16_1.nunique()
})
print(info_16_1)

                                          dtype  null_count  unique_count
country                                  object           0            39
all_households                          float64           0            19
couple_households_with_children         float64           0            16
single_parent_households_with_children  float64           0            15


In [504]:
df_16_1.sample(10)

Unnamed: 0,country,all_households,couple_households_with_children,single_parent_households_with_children
18,Ireland,2.4,4.0,2.7
11,Australia,2.53,3.93,2.78
8,New Zealand,2.61,3.88,2.67
24,Luxembourg,2.3,3.7,2.5
23,Hungary,2.3,3.7,2.4
28,Switzerland,2.21,4.02,2.58
35,France,2.1,3.9,2.6
34,Bulgaria,2.2,3.5,2.3
37,Sweden,2.1,3.9,2.6
43,Estonia,1.8,3.8,2.6


In [505]:
#df_16_1.to_csv('../data/Cleaned/cleaned_average_size_of_households_type_2024_oecd.csv', index=False)

In [506]:
#df_16_1.to_sql('average_size_of_households_type_2024_oecd', engine, if_exists = 'replace', index= False)

In [507]:
df_16_2 = pd.read_csv('../data/Raw/OECD/SF_1_1_Family_size_and_composition - Sayfa2.csv', header=1)
df_16_2
#Table SF1.1.A. Types of household, 2021a
# share_couple_total	share_couple_with_children	share_couple_without_children	share_single_parent_total	share_single_mother	share_single_father	share_single_person	share_other_types						

Unnamed: 0,Country,Total,With children,Without children,Total.1,Single mother households,Single father households,Single person households,Other households types
0,Australia,5593,2990,2602,1037,,,2512,858
1,Austria,4893,2113,2780,563,478,085,3834,711
2,Belgium,5222,2398,2824,742,608,135,3550,486
3,Canada,5092,2530,2562,872,,,2935,1102
4,Chile,..,..,..,..,..,..,..,..
5,Columbia,..,..,..,..,..,..,..,..
6,Costa Rica,5244,3815,1429,1055,949,106,1127,2574
7,Czechia,4703,2170,2532,715,611,104,3915,667
8,Denmark,4860,2041,2819,631,511,119,3757,752
9,Estonia,4620,2546,2073,683,609,074,3699,998


In [508]:
df_16_2.rename(columns={
    "Total": "couple_total(%)",
    "Couple with children": "couple_with_children(%)",
    "Couple without children": "couple_without_children(%)",
    "Total.1": "single_parent_total(%)",
    "Single mother households": "single_mother(%)",
    "Single father households": "single_father(%)",
    "Single person households": "single_person(%)",
    "Other types of households": "other_household_types(%)"
}, inplace=True)

In [509]:
# Normalize column names (lowercase, underscores, remove () and non-ascii)
df_16_2.columns = (
    df_16_2.columns.str.strip().str.lower()
    .str.replace(' ', '_')
    .str.replace('[()%]', '', regex=True)
    .str.replace('[^0-9a-z_]', '', regex=True)
)

In [510]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_16_2.columns if c != "country"]

df_16_2[num_cols] = (
    df_16_2[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [511]:
df_16_2.drop_duplicates(inplace=True)
df_16_2.dropna(inplace=True)
df_16_2.dropna(how="all", subset=num_cols, inplace=True)

In [512]:
df_16_2.rename(columns={
   "couple_total" : "couple_total(%)",
   "with_children" : "with_children(%)",
   "without_children" : "without_children(%)",
    "single_parent_total" : "single_parent_total(%)",
    "single_mother" : "single_mother(%)",
    "single_father" : "single_father(%)",
    "single_person" : "single_person(%)",
    "other_household_types" : "other_household_types(%)"
}, inplace=True)

In [513]:
info_16_2 = pd.DataFrame({
    "dtype": df_16_2.dtypes,
    "null_count": df_16_2.isna().sum(),
    "unique_count": df_16_2.nunique()
})
print(info_16_2)
print(df_16_2.dtypes)

                          dtype  null_count  unique_count
country                  object           0            36
couple_total(%)         float64           0            36
with_children(%)        float64           0            35
without_children(%)     float64           0            36
single_parent_total(%)  float64           0            34
single_mother(%)        float64           0            32
single_father(%)        float64           0            31
single_person(%)        float64           0            35
other_households_types  float64           0            36
country                    object
couple_total(%)           float64
with_children(%)          float64
without_children(%)       float64
single_parent_total(%)    float64
single_mother(%)          float64
single_father(%)          float64
single_person(%)          float64
other_households_types    float64
dtype: object


In [514]:
df_16_2.sample(10)

Unnamed: 0,country,couple_total(%),with_children(%),without_children(%),single_parent_total(%),single_mother(%),single_father(%),single_person(%),other_households_types
8,Denmark,48.6,20.41,28.19,6.31,5.11,1.19,37.57,7.52
37,United States,53.19,19.85,33.34,6.8,5.21,1.58,27.61,12.41
21,Latvia,27.8,12.21,15.6,13.44,11.21,2.23,41.08,17.68
15,Iceland,45.19,25.42,19.77,7.35,6.23,1.12,29.16,18.29
22,Lithuania,50.09,23.74,26.35,7.18,6.68,0.5,35.16,7.58
20,Korea,43.48,26.25,17.23,9.13,6.85,2.28,35.47,11.93
43,Romania,45.66,20.73,24.93,6.5,4.56,1.94,33.63,14.21
9,Estonia,46.2,25.46,20.73,6.83,6.09,0.74,36.99,9.98
10,Finland,45.64,17.06,28.58,5.43,4.5,0.93,45.34,3.6
1,Austria,48.93,21.13,27.8,5.63,4.78,0.85,38.34,7.11


In [515]:
#df_16_2.to_csv('../data/Cleaned/cleaned_types_of_household_2021_oecd.csv', index = False)

In [516]:
#df_16_2.to_sql('types_of_household_2021_oecd', engine, if_exists = 'replace', index= False)

In [517]:
df_16_3 = pd.read_csv('../data/Raw/OECD/SF_1_1_Family_size_and_composition - Sayfa3.csv', header=1)
df_16_3
#Table SF1.1.B. Households by number of children, 2024
# share_hh_0_children	share_hh_1_child	share_hh_2_children	share_hh_3plus_children		

Unnamed: 0,country,0 children,1 child,2 children,3 or more children,Children under 6
0,Australia,..,..,..,..,..
1,Austria,7778,1052,857,312,944
2,Belgium,7397,1176,1015,411,1040
3,Canada,..,..,..,..,..
4,Chile,..,..,..,..,..
5,Columbia,..,..,..,..,..
6,Costa Rica,3029,2308,2461,2202,2630
7,Czechia,7195,1385,1156,264,1229
8,Denmark,7778,1054,894,274,815
9,Estonia,7576,1253,873,298,985


In [518]:
# Normalize column names (lowercase, underscores, remove () and non-ascii)
df_16_3.columns = (
    df_16_3.columns.str.strip().str.lower()
    .str.replace(' ', '_')
    .str.replace('[^0-9a-z_]', '', regex=True)
)

In [519]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_16_3.columns if c != "country"]

df_16_3[num_cols] = (
    df_16_3[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [520]:
df_16_3.drop_duplicates(inplace=True)
df_16_3.dropna(inplace=True)

In [521]:
df_16_3.sample(10)

Unnamed: 0,country,0_children,1_child,2_children,3_or_more_children,children_under_6
7,Czechia,71.95,13.85,11.56,2.64,12.29
18,Italy,77.79,12.26,8.28,1.66,8.05
28,Poland,74.39,12.91,9.84,2.86,9.97
36,United Kingdom,72.06,12.1,11.31,4.53,12.73
10,Finland,81.98,7.89,6.99,3.14,7.14
44,EU average,75.1,12.28,9.46,3.15,9.9
22,Lithuania,80.44,11.06,7.0,1.51,8.12
33,Sweden,74.84,10.77,9.83,4.56,9.95
32,Spain,74.61,13.54,8.95,2.9,8.79
12,Germany,79.86,9.91,7.72,2.51,8.57


In [522]:
df_16_3.rename(columns={
    "0_children": "households_0_children(%)",
    "1_child": "households_1_child(%)",
    "2_children": "households_2_children(%)",
    "3_or_more_children": "households_3_or_more_children(%)"
}, inplace=True)

In [523]:
info_16_3 = pd.DataFrame({
    "dtype": df_16_3.dtypes,
    "null_count": df_16_3.isna().sum(),
    "unique_count": df_16_3.nunique()
})
print(info_16_3)
print(df_16_3.dtypes)

                                    dtype  null_count  unique_count
country                            object           0            33
households_0_children(%)          float64           0            32
households_1_child(%)             float64           0            32
households_2_children(%)          float64           0            33
households_3_or_more_children(%)  float64           0            31
children_under_6                  float64           0            31
country                              object
households_0_children(%)            float64
households_1_child(%)               float64
households_2_children(%)            float64
households_3_or_more_children(%)    float64
children_under_6                    float64
dtype: object


In [524]:
df_16_3.sample(10)

Unnamed: 0,country,households_0_children(%),households_1_child(%),households_2_children(%),households_3_or_more_children(%),children_under_6
8,Denmark,77.78,10.54,8.94,2.74,8.15
13,Greece,74.31,11.83,9.97,3.89,9.37
21,Latvia,74.8,14.05,8.32,2.83,10.07
25,Netherlands,78.65,8.78,9.27,3.3,8.79
9,Estonia,75.76,12.53,8.73,2.98,9.85
31,Slovenia,75.0,11.25,10.2,3.56,9.93
35,Türkiye,57.62,17.42,14.5,10.45,19.65
23,Luxembourg,73.0,12.49,12.07,2.41,11.54
41,Cyprus,71.36,13.88,11.67,3.1,12.71
32,Spain,74.61,13.54,8.95,2.9,8.79


In [525]:
#df_16_3.to_csv('../data/Cleaned/cleaned_households_by_number_of_children_2024_oecd.csv', index=False)

In [526]:
#df_16_3.to_sql('households_by_number_of_children_2024_oecd', engine, index= False)

In [527]:
df_17_1 = pd.read_csv('../data/Raw/OECD/SF_2_1_Total_Fertility_rates_S1.csv')
#total_fertility_rates_from_1960_oecd
df_17_1.head()

Unnamed: 0,Country,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Australia,345,355,343,334,315,297,289,285,289,...,179,179,179,174,174,167,159,170,163,150
1,Austria,269,278,280,282,279,270,266,262,258,...,146,149,153,152,148,146,144,148,141,132
2,Belgium,254,263,259,268,271,261,252,241,231,...,174,170,168,165,162,160,155,160,153,147
3,Canada,390,384,376,367,350,315,281,260,245,...,161,160,159,155,151,147,141,144,133,126
4,Chile,470,466,460,454,446,436,426,414,403,...,177,174,169,156,154,143,131,118,126,117


In [528]:
df_info = pd.DataFrame({
    'dtype': df_17_1.dtypes,
    'null_count': df_17_1.isnull().sum(),
    'unique_count': df_17_1.nunique()
})
print(df_info)

          dtype  null_count  unique_count
Country  object           0            49
1960     object           0            47
1961     object           0            47
1962     object           0            47
1963     object           0            46
...         ...         ...           ...
2019     object           0            37
2020     object           0            39
2021     object           0            40
2022     object           0            34
2023     object           0            35

[65 rows x 3 columns]


In [529]:
df_17_1.columns = df_17_1.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [530]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_17_1.columns if c != "country"]

df_17_1[num_cols] = (
    df_17_1[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [531]:
df_17_1.drop_duplicates(inplace=True)
df_17_1.dropna(inplace=True)

In [532]:
df_info = pd.DataFrame({
    'dtype': df_17_1.dtypes,
    'null_count': df_17_1.isnull().sum(),
    'unique_count': df_17_1.nunique()
})
print(df_info)

           dtype  null_count  unique_count
country   object           0            49
1960     float64           0            47
1961     float64           0            47
1962     float64           0            47
1963     float64           0            46
...          ...         ...           ...
2019     float64           0            37
2020     float64           0            39
2021     float64           0            40
2022     float64           0            34
2023     float64           0            35

[65 rows x 3 columns]


In [533]:
df_17_1.sample(10)

Unnamed: 0,country,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
29,Slovak Republic,3.07,2.96,2.83,2.93,2.91,2.8,2.67,2.49,2.4,...,1.37,1.4,1.48,1.52,1.54,1.57,1.59,1.63,1.57,1.49
21,Lithuania,2.4,2.4,2.4,2.4,2.4,2.4,2.4,2.4,2.4,...,1.57,1.63,1.63,1.57,1.53,1.43,1.36,1.36,1.27,1.18
28,Portugal,3.1,3.16,3.21,3.11,3.21,3.14,3.12,3.08,3.0,...,1.23,1.31,1.36,1.38,1.42,1.43,1.41,1.35,1.42,1.44
30,Slovenia,2.18,2.26,2.27,2.28,2.32,2.45,2.48,2.38,2.28,...,1.58,1.57,1.58,1.62,1.6,1.61,1.59,1.64,1.55,1.51
14,Hungary,2.02,1.94,1.79,1.82,1.8,1.81,1.88,2.01,2.06,...,1.42,1.45,1.51,1.51,1.5,1.51,1.59,1.61,1.55,1.51
7,Czechia,2.11,2.13,2.14,2.33,2.36,2.18,2.01,1.9,1.83,...,1.53,1.57,1.63,1.69,1.71,1.71,1.71,1.83,1.62,1.45
17,Italy,2.41,2.41,2.46,2.56,2.7,2.67,2.63,2.54,2.49,...,1.38,1.36,1.36,1.34,1.31,1.27,1.24,1.25,1.24,1.2
4,Chile,4.7,4.66,4.6,4.54,4.46,4.36,4.26,4.14,4.03,...,1.77,1.74,1.69,1.56,1.54,1.43,1.31,1.18,1.26,1.17
27,Poland,2.98,2.83,2.72,2.7,2.57,2.52,2.34,2.33,2.24,...,1.29,1.29,1.36,1.45,1.44,1.42,1.39,1.33,1.26,1.16
15,Iceland,4.27,3.88,3.98,3.98,3.86,3.71,3.58,3.28,3.07,...,1.99,1.86,1.8,1.76,1.76,1.81,1.79,1.9,1.67,1.59


In [534]:
#df_17_1.to_csv('../data/Cleaned/cleaned_total_fertility_rates_oecd.csv', index=False)

In [535]:
#df_17_1.to_sql('total_fertility_rates_oecd', engine, if_exists='replace', index=False)

In [536]:
df_17_2 = pd.read_csv('../data/Raw/OECD/SF_2_1_Fertility_rates_Births_by_birth_order_S2.csv')
df_17_2

Unnamed: 0,Country,Birth order,1987,1988,1989,1990,1991,1992,1993,1994,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Austria,First birth,476,478,467,462,465,461,452,445,...,480,483,473,475,471,472,477,476,484,481
1,Austria,Second birth,337,337,343,349,345,348,358,364,...,355,353,356,353,353,351,353,355,349,351
2,Austria,Third birth or higher,188,185,190,189,190,191,189,191,...,165,164,171,172,176,177,170,169,167,168
3,Belgium,First birth,468,469,473,473,481,472,469,472,...,423,435,441,436,429,426,450,440,447,455
4,Belgium,Second birth,330,329,327,328,323,328,335,330,...,351,348,345,346,345,347,342,351,343,341
5,Belgium,Third birth or higher,202,202,199,199,196,200,196,198,...,226,218,214,219,226,226,208,209,209,204
6,Czechia,First birth,467,466,474,478,501,498,485,477,...,474,481,487,487,480,478,476,464,463,463
7,Czechia,Second birth,377,376,374,372,355,358,368,369,...,375,373,367,366,372,376,376,390,386,391
8,Czechia,Third birth or higher,156,158,152,150,144,144,148,154,...,151,147,146,147,147,146,148,146,15,146
9,Estonia,First birth,435,435,440,462,495,503,496,496,...,419,423,408,402,367,388,380,372,398,397


In [537]:
df_info = pd.DataFrame({
    'dtype': df_17_2.dtypes,
    'null_count': df_17_2.isnull().sum(),
    'unique_count': df_17_2.nunique()
})
print(df_info)

              dtype  null_count  unique_count
Country      object           0            17
Birth order  object           0             3
1987         object           0            48
1988         object           0            49
1989         object           0            48
1990         object           0            44
1991         object           0            48
1992         object           0            46
1993         object           0            47
1994         object           0            47
1995         object           0            48
1996         object           0            47
1997         object           0            49
1998         object           0            50
1999         object           0            49
2000         object           0            48
2001         object           0            50
2002         object           0            47
2003         object           0            50
2004         object           0            49
2005         object           0   

In [538]:
df_17_2.columns = df_17_2.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_17_2.head()

Unnamed: 0,country,birth_order,1987,1988,1989,1990,1991,1992,1993,1994,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Austria,First birth,476,478,467,462,465,461,452,445,...,480,483,473,475,471,472,477,476,484,481
1,Austria,Second birth,337,337,343,349,345,348,358,364,...,355,353,356,353,353,351,353,355,349,351
2,Austria,Third birth or higher,188,185,190,189,190,191,189,191,...,165,164,171,172,176,177,170,169,167,168
3,Belgium,First birth,468,469,473,473,481,472,469,472,...,423,435,441,436,429,426,450,440,447,455
4,Belgium,Second birth,330,329,327,328,323,328,335,330,...,351,348,345,346,345,347,342,351,343,341


In [539]:
# --- Ensure "country" and "birth order" are strings (tidy casing/spacing) ---
df_17_2["country"] = df_17_2["country"].astype(str).str.strip().str.title()
df_17_2["birth_order"] = df_17_2["birth_order"].astype(str).str.strip()

# --- Identify numeric columns: everything except country and birth order ---
num_cols = [c for c in df_17_2.columns if c not in ["country", "birth_order"]]
# --- Robust cleaning -> convert to float ---
df_17_2[num_cols] = (
    df_17_2[num_cols].astype(str)
    .replace({"\xa0": "", "\u202f": "", ",": "."}, regex=True)
    .replace(r"[^\d\.\-]", "", regex=True)
    .replace(r"\.\.+", ".", regex=True)
    .replace(r"^\.$|^\s*$", np.nan, regex=True)
    .apply(pd.to_numeric, errors="coerce")
)
df_17_2[num_cols] = df_17_2[num_cols].round(2)


In [540]:
df_17_2.drop_duplicates(inplace=True)
df_17_2.dropna(inplace=True)

In [541]:
df_17_2["birth_order"] = df_17_2["birth_order"].astype(str) + "(%)"

In [542]:
df_17_2.head(10)

Unnamed: 0,country,birth_order,1987,1988,1989,1990,1991,1992,1993,1994,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Austria,First birth(%),47.6,47.8,46.7,46.2,46.5,46.1,45.2,44.5,...,48.0,48.3,47.3,47.5,47.1,47.2,47.7,47.6,48.4,48.1
1,Austria,Second birth(%),33.7,33.7,34.3,34.9,34.5,34.8,35.8,36.4,...,35.5,35.3,35.6,35.3,35.3,35.1,35.3,35.5,34.9,35.1
2,Austria,Third birth or higher(%),18.8,18.5,19.0,18.9,19.0,19.1,18.9,19.1,...,16.5,16.4,17.1,17.2,17.6,17.7,17.0,16.9,16.7,16.8
3,Belgium,First birth(%),46.8,46.9,47.3,47.3,48.1,47.2,46.9,47.2,...,42.3,43.5,44.1,43.6,42.9,42.6,45.0,44.0,44.7,45.5
4,Belgium,Second birth(%),33.0,32.9,32.7,32.8,32.3,32.8,33.5,33.0,...,35.1,34.8,34.5,34.6,34.5,34.7,34.2,35.1,34.3,34.1
5,Belgium,Third birth or higher(%),20.2,20.2,19.9,19.9,19.6,20.0,19.6,19.8,...,22.6,21.8,21.4,21.9,22.6,22.6,20.8,20.9,20.9,20.4
6,Czechia,First birth(%),46.7,46.6,47.4,47.8,50.1,49.8,48.5,47.7,...,47.4,48.1,48.7,48.7,48.0,47.8,47.6,46.4,46.3,46.3
7,Czechia,Second birth(%),37.7,37.6,37.4,37.2,35.5,35.8,36.8,36.9,...,37.5,37.3,36.7,36.6,37.2,37.6,37.6,39.0,38.6,39.1
8,Czechia,Third birth or higher(%),15.6,15.8,15.2,15.0,14.4,14.4,14.8,15.4,...,15.1,14.7,14.6,14.7,14.7,14.6,14.8,14.6,15.0,14.6
9,Estonia,First birth(%),43.5,43.5,44.0,46.2,49.5,50.3,49.6,49.6,...,41.9,42.3,40.8,40.2,36.7,38.8,38.0,37.2,39.8,39.7


In [543]:
df_info = pd.DataFrame({
    'dtype': df_17_2.dtypes,
    'null_count': df_17_2.isnull().sum(),
    'unique_count': df_17_2.nunique()
})
print(df_info)

               dtype  null_count  unique_count
country       object           0            17
birth_order   object           0             3
1987         float64           0            48
1988         float64           0            49
1989         float64           0            48
1990         float64           0            44
1991         float64           0            48
1992         float64           0            46
1993         float64           0            47
1994         float64           0            47
1995         float64           0            48
1996         float64           0            47
1997         float64           0            49
1998         float64           0            50
1999         float64           0            49
2000         float64           0            48
2001         float64           0            50
2002         float64           0            47
2003         float64           0            50
2004         float64           0            49
2005         

In [544]:
#df_17_2.to_csv('../data/Cleaned/cleaned_births_by_birth_order_oecd.csv', index=False)

In [545]:
#df_17_2.to_sql('births_by_birth_order_oecd', engine, if_exists='replace', index=False)

In [546]:
df_18 = pd.read_csv('../data/Raw/OECD/sf1_2_wide_from_df18.csv')
df_18

Unnamed: 0,country,year,Living with two parents,Living with a single parent,Other
0,Australia,2003,80.1,19.5,0.5
1,Australia,2006,81.5,18.0,0.5
2,Australia,2009,82.0,17.6,0.4
3,Australia,2012,81.3,18.0,0.6
4,Austria,2003,81.2,16.8,2.0
...,...,...,...,...,...
470,United States,2014,68.7,27.5,3.8
471,United States,2015,69.2,26.8,3.9
472,United States,2016,68.7,27.4,3.8
473,United States,2017,68.9,27.1,4.0


In [547]:
for col in df_18.select_dtypes(include=['object']).columns:
    df_18[col] = df_18[col].astype(str).str.strip()

# 2) Define placeholders representing missing data in OECD exports
placeholders = ['..', '...', '.', ' .', '…', 'Na', 'nan', 'None']

# 3) Replace placeholders with NaN directly in df_18
df_18.replace(placeholders, pd.NA, inplace=True)

In [548]:
# 1) Ensure 'year' is integer
df_18["year"] = pd.to_numeric(df_18["year"], errors="coerce").astype("Int64")

# 2) Convert all non-key columns to numeric and round(2)
for col in df_18.columns:
    if col not in ["country", "year"]:
        df_18[col] = pd.to_numeric(df_18[col], errors="coerce").round(2)

In [549]:
# 1) Drop rows with missing key fields
df_18.dropna(subset=["country", "year"], inplace=True)

# 2) Drop duplicate country-year rows, keep the first
df_18.drop_duplicates(subset=["country", "year"], keep="first", inplace=True)

# 3) Drop rows where all value columns are NaN
value_cols = [c for c in df_18.columns if c not in ["country", "year"]]
df_18.dropna(subset=value_cols, how="all", inplace=True)

# 4) Sort and reset index
df_18.sort_values(["country", "year"], inplace=True)
df_18.reset_index(drop=True, inplace=True)


In [550]:
df_18

Unnamed: 0,country,year,Living with two parents,Living with a single parent,Other
0,Australia,2003,80.1,19.5,0.5
1,Australia,2006,81.5,18.0,0.5
2,Australia,2009,82.0,17.6,0.4
3,Australia,2012,81.3,18.0,0.6
4,Austria,2003,81.2,16.8,2.0
...,...,...,...,...,...
470,United States,2014,68.7,27.5,3.8
471,United States,2015,69.2,26.8,3.9
472,United States,2016,68.7,27.4,3.8
473,United States,2017,68.9,27.1,4.0


In [551]:
df_18['Other'] = pd.to_numeric(df_18['Other'], errors='coerce')

In [552]:
df_info = pd.DataFrame({
    'dtype': df_18.dtypes,
    'null_count': df_18.isnull().sum(),
    'unique_count': df_18.nunique()
})
print(df_info)

                               dtype  null_count  unique_count
country                       object           0            39
year                           Int64           0            18
Living with two parents      float64           0           211
Living with a single parent  float64           0           203
Other                        float64           1            50


In [553]:
print(repr(df_18.loc[df_18['Other'].notnull(), 'Other'].unique()))

array([0.5, 0.4, 0.6, 2. , 1. , 1.9, 0.3, 0.1, 0.8, 0.7, 8.7, 3.5, 2.5,
       2.1, 2.4, 2.6, 6.7, 5.1, 1.4, 1.2, 1.7, 1.5, 3.4, 2.9, 2.3, 3. ,
       4.2, 2.8, 1.3, 9. , 0.2, 0.9, 1.1, 4.5, 4.7, 1.6, 3.8, 3.6, 3.3,
       2.2, 0. , 1.8, 2.7, 3.2, 3.9, 4.1, 4.4, 3.7, 4. , 4.3])


In [554]:
df_18['Other'] = pd.to_numeric(df_18['Other'], errors='coerce')

df_18.dropna(inplace=True, subset=['Other'])

df_18.isnull().sum()

country                        0
year                           0
Living with two parents        0
Living with a single parent    0
Other                          0
dtype: int64

In [555]:
#df_18.to_csv('../data/Cleaned/cleaned_household_children.csv', index=False)

In [556]:
#df_18.to_sql('household_children_oecd', engine, if_exists= 'replace', index= False)

In [557]:
df_19_1 =pd.read_csv('../data/Raw/OECD/SF_2_3_Age_mothers_childbirth_mean_age_birth_S1.csv')
#age_of_mothers_at_childbirth
df_19_1

Unnamed: 0,Country,1963,1964,1965,1966,1967,1968,1969,1970,1971,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Australia,275,275,274,273,273,272,272,271,269,...,301,301,302,303,305,306,307,308,309,311
1,Austria,274,274,273,271,270,268,268,267,267,...,302,303,304,306,306,307,309,310,310,312
2,Belgium,278,277,276,275,274,273,272,272,270,...,300,302,303,304,305,306,307,308,308,310
3,Canada,278,279,278,277,275,273,273,272,270,...,303,304,305,306,307,309,310,312,313,314
4,Chile,292,291,291,290,288,287,286,284,282,...,281,283,285,288,291,294,296,299,301,..
5,Czech Republic,257,258,255,252,250,249,248,248,249,...,298,299,299,300,300,300,301,302,302,304
6,Costa Rica,293,293,293,293,292,291,289,287,285,...,265,267,268,271,272,274,276,279,284,287
7,Denmark,273,268,268,266,265,265,266,267,267,...,307,308,309,310,310,311,312,313,314,316
8,Estonia,276,274,273,273,271,269,269,267,267,...,296,295,296,299,302,304,305,306,307,310
9,Finland,281,280,280,278,277,275,274,271,269,...,304,305,305,306,308,309,310,311,312,314


In [558]:
df_info = pd.DataFrame({
    'dtype': df_19_1.dtypes,
    'null_count': df_19_1.isnull().sum(),
    'unique_count': df_19_1.nunique()
})
print(df_info)

          dtype  null_count  unique_count
Country  object           0            26
1963     object           0            19
1964     object           0            22
1965     object           0            22
1966     object           0            22
1967     object           0            22
1968     object           0            20
1969     object           0            21
1970     object           0            19
1971     object           0            19
1972     object           0            20
1973     object           0            20
1974     object           0            24
1975     object           0            21
1976     object           0            22
1977     object           0            20
1978     object           0            22
1979     object           0            23
1980     object           0            22
1981     object           0            20
1982     object           0            18
1983     object           0            20
1984     object           0       

In [559]:
df_19_1.columns = df_19_1.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [560]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_19_1.columns if c != "country"]

df_19_1[num_cols] = (
    df_19_1[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [561]:
df_19_1.drop_duplicates(inplace=True)
df_19_1.dropna(inplace=True)

In [562]:
df_info = pd.DataFrame({
    'dtype': df_19_1.dtypes,
    'null_count': df_19_1.isnull().sum(),
    'unique_count': df_19_1.nunique()
})
print(df_info)

           dtype  null_count  unique_count
country   object           0            22
1963     float64           0            16
1964     float64           0            18
1965     float64           0            18
1966     float64           0            18
1967     float64           0            18
1968     float64           0            17
1969     float64           0            17
1970     float64           0            15
1971     float64           0            17
1972     float64           0            18
1973     float64           0            18
1974     float64           0            20
1975     float64           0            18
1976     float64           0            18
1977     float64           0            16
1978     float64           0            18
1979     float64           0            21
1980     float64           0            20
1981     float64           0            17
1982     float64           0            17
1983     float64           0            18
1984     fl

In [563]:
df_19_1.sample(10)

Unnamed: 0,country,1963,1964,1965,1966,1967,1968,1969,1970,1971,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
17,Netherlands,29.3,29.2,29.0,28.8,28.5,28.4,28.3,28.2,28.0,...,30.9,31.0,31.1,31.2,31.3,31.4,31.5,31.6,31.7,31.8
13,Ireland,31.1,31.1,31.0,30.9,30.8,30.6,30.5,30.4,30.2,...,31.5,31.7,31.8,31.9,32.1,32.1,32.2,32.4,32.6,32.7
22,Switzerland,27.8,27.8,27.7,27.7,27.5,27.5,27.9,27.8,27.7,...,31.5,31.6,31.8,31.8,31.9,31.9,32.0,32.1,32.2,32.3
21,Slovak Republic,26.8,26.9,26.8,26.6,26.5,26.4,26.3,26.2,26.2,...,28.7,28.8,28.8,28.8,28.8,28.8,28.8,28.8,28.9,28.9
9,Finland,28.1,28.0,28.0,27.8,27.7,27.5,27.4,27.1,26.9,...,30.4,30.5,30.5,30.6,30.8,30.9,31.0,31.1,31.2,31.4
3,Canada,27.8,27.9,27.8,27.7,27.5,27.3,27.3,27.2,27.0,...,30.3,30.4,30.5,30.6,30.7,30.9,31.0,31.2,31.3,31.4
12,Iceland,27.6,27.7,27.7,27.5,27.5,27.4,27.3,27.2,27.0,...,30.1,30.4,30.2,30.3,30.6,30.6,30.6,30.9,30.7,30.9
23,United States,26.5,26.6,26.6,26.4,26.3,26.3,26.2,26.1,26.0,...,28.0,28.2,28.3,28.5,28.7,28.8,29.0,29.1,29.2,29.4
5,Czech Republic,25.7,25.8,25.5,25.2,25.0,24.9,24.8,24.8,24.9,...,29.8,29.9,29.9,30.0,30.0,30.0,30.1,30.2,30.2,30.4
2,Belgium,27.8,27.7,27.6,27.5,27.4,27.3,27.2,27.2,27.0,...,30.0,30.2,30.3,30.4,30.5,30.6,30.7,30.8,30.8,31.0


In [564]:
#df_19_1.to_csv('../data/Cleaned/age_of_mothers_at_childbirth_oecd.csv', index=False)

In [565]:
#df_19_1.to_sql('age_of_mothers_at_childbirth_oecd', engine, if_exists='replace', index=False)

In [566]:
df_19_2 = pd.read_csv('../data/Raw/OECD/SF_2_3_Age_mothers_childbirth_fertility_by_age_1960_S2.csv')
#fertility_per_1000_from 1960
df_19_2.head()

Unnamed: 0,Country,Age group,1960,1961,1962,1963,1964,1965,1966,1967,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Australia,15-19,443,474,447,459,470,475,489,484,...,161,146,129,120,105,103,95,88,79,71
1,Australia,20-24,2201,2258,2160,2082,1905,1793,1731,1708,...,532,513,474,473,447,431,428,401,377,388
2,Australia,25-29,2163,2212,2167,2112,1981,1885,1839,1850,...,1026,991,948,934,922,897,893,843,803,867
3,Australia,30-34,1275,1311,1277,1239,1191,1101,1051,1028,...,1269,1248,1204,1217,1236,1191,1201,1156,1114,1206
4,Australia,35-39,623,634,614,597,584,530,506,478,...,715,709,692,698,720,713,716,693,663,709


In [567]:
df_info = pd.DataFrame({
    'dtype': df_19_2.dtypes,
    'null_count': df_19_2.isnull().sum(),
    'unique_count': df_19_2.nunique()
})
print(df_info)

            dtype  null_count  unique_count
Country    object           0            21
Age group  object           0             7
1960       object           0           136
1961       object           0           140
1962       object           0           140
...           ...         ...           ...
2017       object           0           124
2018       object           0           128
2019       object           0           126
2020       object           0           121
2021       object           7           119

[64 rows x 3 columns]


In [568]:
df_19_2.columns = df_19_2.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_19_2.head()

Unnamed: 0,country,age_group,1960,1961,1962,1963,1964,1965,1966,1967,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Australia,15-19,443,474,447,459,470,475,489,484,...,161,146,129,120,105,103,95,88,79,71
1,Australia,20-24,2201,2258,2160,2082,1905,1793,1731,1708,...,532,513,474,473,447,431,428,401,377,388
2,Australia,25-29,2163,2212,2167,2112,1981,1885,1839,1850,...,1026,991,948,934,922,897,893,843,803,867
3,Australia,30-34,1275,1311,1277,1239,1191,1101,1051,1028,...,1269,1248,1204,1217,1236,1191,1201,1156,1114,1206
4,Australia,35-39,623,634,614,597,584,530,506,478,...,715,709,692,698,720,713,716,693,663,709


In [569]:
# --- Ensure "country" and "age_group" are strings
df_19_2["country"] = df_19_2["country"].astype(str).str.strip().str.title()
df_19_2["age_group"] = df_19_2["age_group"].astype(str).str.strip()

# --- Identify numeric columns: everything except country and birth order ---
num_cols = [c for c in df_19_2.columns if c not in ["country", "age_group"]]
# --- Robust cleaning -> convert to float ---
df_19_2[num_cols] = (
    df_19_2[num_cols].astype(str)
    .replace({"\xa0": "", "\u202f": "", ",": "."}, regex=True)
    .replace(r"[^\d\.\-]", "", regex=True)
    .replace(r"\.\.+", ".", regex=True)
    .replace(r"^\.$|^\s*$", np.nan, regex=True)
    .apply(pd.to_numeric, errors="coerce")
)
df_19_2[num_cols] = df_19_2[num_cols].round(2)

In [570]:
df_19_2.drop_duplicates(inplace=True)
df_19_2.dropna(inplace = True)

In [571]:
df_info = pd.DataFrame({
    'dtype': df_19_2.dtypes,
    'null_count': df_19_2.isnull().sum(),
    'unique_count': df_19_2.nunique()
})
print(df_info)

             dtype  null_count  unique_count
country     object           0            19
age_group   object           0             7
1960       float64           0           124
1961       float64           0           126
1962       float64           0           126
...            ...         ...           ...
2017       float64           0           118
2018       float64           0           121
2019       float64           0           120
2020       float64           0           115
2021       float64           0           118

[64 rows x 3 columns]


In [572]:
#df_19_2.to_csv('../data/Cleaned/fertility_per_1000_by_age_from 1960_oecd.csv', index=False)

In [573]:
#df_19_2.to_sql('fertility_per_1000_from_1960_oecd', engine, if_exists='replace', index=False)

In [574]:
df_19_3 = pd.read_csv('../data/Raw/OECD/SF_2_3_Age_mothers_childbirth_fertility_by_age_2000_S3.csv')
#fertility_per_1000_from_2000
df_19_3

Unnamed: 0,Country,Age group,2000,2001,2002,2003,2004,2005,2006,2007,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,OECD-Average,15-19,226,220,211,205,203,201,200,205,...,179,168,162,152,144,135,126,117,102,95
1,OECD-Average,20-24,717,693,668,655,647,632,629,630,...,564,538,533,519,504,488,470,450,420,405
2,OECD-Average,25-29,1079,1050,1031,1035,1034,1023,1026,1034,...,994,965,969,961,949,928,907,884,855,869
3,OECD-Average,30-34,881,872,886,911,934,946,976,1000,...,1036,1019,1040,1049,1053,1041,1033,1017,996,1036
4,OECD-Average,35-39,381,386,395,406,422,435,456,477,...,531,534,551,563,571,570,574,575,559,587
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296,Romania,25-29,782,770,786,820,848,908,923,930,...,918,883,944,989,1001,1090,1083,1091,1094,1109
297,Romania,30-34,388,381,388,388,416,475,511,542,...,666,648,715,754,785,866,859,864,871,875
298,Romania,35-39,134,138,152,194,232,251,257,249,...,273,274,299,321,330,368,367,383,406,411
299,Romania,40-44,31,31,30,29,31,31,28,31,...,49,48,56,61,68,73,78,80,85,82


In [575]:
df_info = pd.DataFrame({
    'dtype': df_19_3.dtypes,
    'null_count': df_19_3.isnull().sum(),
    'unique_count': df_19_3.nunique()
})
print(df_info)

            dtype  null_count  unique_count
Country    object           0            43
Age group  object           0             7
2000       object           0           233
2001       object           0           248
2002       object           0           240
2003       object           0           239
2004       object           0           245
2005       object           0           240
2006       object           0           239
2007       object           0           242
2008       object           0           252
2009       object           0           251
2010       object           0           239
2011       object           0           235
2012       object           0           242
2013       object           0           234
2014       object           0           238
2015       object           0           237
2016       object           0           248
2017       object           0           236
2018       object           0           245
2019       object           0   

In [576]:
df_19_3.columns = df_19_3.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_19_3.head()

Unnamed: 0,country,age_group,2000,2001,2002,2003,2004,2005,2006,2007,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,OECD-Average,15-19,226,220,211,205,203,201,200,205,...,179,168,162,152,144,135,126,117,102,95
1,OECD-Average,20-24,717,693,668,655,647,632,629,630,...,564,538,533,519,504,488,470,450,420,405
2,OECD-Average,25-29,1079,1050,1031,1035,1034,1023,1026,1034,...,994,965,969,961,949,928,907,884,855,869
3,OECD-Average,30-34,881,872,886,911,934,946,976,1000,...,1036,1019,1040,1049,1053,1041,1033,1017,996,1036
4,OECD-Average,35-39,381,386,395,406,422,435,456,477,...,531,534,551,563,571,570,574,575,559,587


In [577]:
# --- Ensure "country" and "age_group" are strings
df_19_3["country"] = df_19_3["country"].astype(str).str.strip().str.title()
df_19_3["age_group"] = df_19_3["age_group"].astype(str).str.strip()

# --- Identify numeric columns: everything except country and birth order ---
num_cols = [c for c in df_19_3.columns if c not in ["country", "age_group"]]
# --- Robust cleaning -> convert to float ---
df_19_3[num_cols] = (
    df_19_3[num_cols].astype(str)
    .replace({"\xa0": "", "\u202f": "", ",": "."}, regex=True)
    .replace(r"[^\d\.\-]", "", regex=True)
    .replace(r"\.\.+", ".", regex=True)
    .replace(r"^\.$|^\s*$", np.nan, regex=True)
    .apply(pd.to_numeric, errors="coerce")
)
df_19_3[num_cols] = df_19_3[num_cols].round(2)

In [578]:
df_19_3.drop_duplicates(inplace=True)
df_19_3.dropna(inplace=True)

In [579]:
#Check again
df_info = pd.DataFrame({
    'dtype': df_19_3.dtypes,
    'null_count': df_19_3.isnull().sum(),
    'unique_count': df_19_3.nunique()
})
print(df_info)

             dtype  null_count  unique_count
country     object           0            41
age_group   object           0             7
2000       float64           0           225
2001       float64           0           237
2002       float64           0           232
2003       float64           0           229
2004       float64           0           233
2005       float64           0           229
2006       float64           0           229
2007       float64           0           230
2008       float64           0           238
2009       float64           0           238
2010       float64           0           230
2011       float64           0           227
2012       float64           0           231
2013       float64           0           225
2014       float64           0           226
2015       float64           0           225
2016       float64           0           237
2017       float64           0           227
2018       float64           0           233
2019      

In [580]:
df_19_3.sample(10)

Unnamed: 0,country,age_group,2000,2001,2002,2003,2004,2005,2006,2007,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
142,Japan,25-29,99.5,92.4,92.4,92.4,92.4,85.3,86.4,86.4,...,87.2,86.7,84.8,85.1,83.5,82.1,81.1,77.2,74.7,72.2
129,Israel,30-34,169.2,169.2,172.8,177.0,175.3,172.5,175.2,177.1,...,178.5,177.2,186.1,189.0,183.4,185.5,184.17,178.23,174.0,184.4
212,Portugal,25-29,100.7,92.6,92.8,89.4,85.5,85.0,81.0,78.2,...,71.4,66.9,65.9,67.8,67.8,68.8,69.5,68.1,68.3,64.1
101,Greece,30-34,76.9,74.0,76.9,78.6,81.7,86.9,92.6,92.8,...,91.4,89.6,91.4,94.7,98.8,96.1,95.8,94.6,96.4,101.5
187,Netherlands,40-44,6.6,6.8,7.0,7.0,7.5,7.5,7.6,7.9,...,9.0,8.7,9.5,9.3,10.1,10.4,10.9,11.3,11.6,12.4
263,United Kingdom,35-39,41.1,41.4,42.7,45.9,48.8,50.3,53.6,56.5,...,63.7,62.9,64.5,66.0,66.9,65.1,63.4,61.9,59.8,62.8
84,France,15-19,12.3,12.7,12.0,11.7,11.7,11.5,11.5,11.4,...,10.7,10.1,9.9,9.2,8.5,8.0,7.7,7.5,7.0,6.4
18,Austria,35-39,25.2,25.8,28.7,30.3,32.5,33.2,34.4,35.8,...,45.9,47.5,49.3,52.2,55.2,55.8,55.1,55.9,53.9,57.9
34,Canada,45-49,0.2,0.2,0.2,0.3,0.3,0.3,0.3,0.4,...,0.5,0.6,0.6,0.7,0.7,0.8,0.8,0.8,0.8,0.8
247,Switzerland,25-29,101.6,91.1,89.9,87.4,87.0,85.4,84.7,84.0,...,80.9,80.8,80.2,79.9,79.6,78.4,76.9,74.4,74.4,75.3


In [581]:
#df_19_3.to_csv('../data/Cleaned/cleaned_fertility_per_1000_from_2000_oecd.csv',index=False)

In [582]:
#df_19_3.to_sql('fertility_per_1000_from_2000_oecd',engine, if_exists='replace', index=False)

In [583]:
df_20= pd.read_csv('../data/Raw/OECD/SF_2_4_Share_births_outside_marriage_1960.csv')
#(%)share_of_births_outside_of_marriage
df_20

Unnamed: 0,Country,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,Austria,130,126,120,116,113,112,114,115,120,...,404,415,414,417,421,422,420,413,406,412
1,Belgium,21,20,21,22,23,24,25,25,27,...,470,477,495,494,480,490,528,524,..,..
2,Czech Republic,49,46,45,47,48,50,53,53,54,...,418,434,450,467,478,486,490,485,482,485
3,Denmark,78,80,83,89,93,95,102,111,111,...,490,506,515,525,538,540,542,542,541,542
4,Finland,40,41,40,42,44,46,48,51,53,...,409,415,421,428,443,449,448,446,454,461
5,Germany,76,71,66,61,59,58,57,58,61,...,339,345,348,350,350,355,347,339,333,331
6,Greece,12,12,12,12,11,11,10,10,11,...,74,76,70,82,88,94,103,111,124,138
7,Hungary,55,55,54,53,52,52,51,50,50,...,423,445,456,473,479,467,447,439,387,304
8,Iceland,253,253,245,251,267,269,284,300,305,...,650,669,..,..,..,696,712,705,694,..
9,Ireland,16,16,18,18,20,22,23,25,26,...,339,351,353,363,366,367,376,379,384,..


In [584]:
df_info = pd.DataFrame({
    'dtype': df_20.dtypes,
    'null_count': df_20.isnull().sum(),
    'unique_count': df_20.nunique()
})
print(df_info)

          dtype  null_count  unique_count
Country  object           0            26
1960     object           0            26
1961     object           0            24
1962     object           0            24
1963     object           0            24
...         ...         ...           ...
2016     object           0            24
2017     object           0            26
2018     object           0            25
2019     object           0            25
2020     object           0            24

[62 rows x 3 columns]


In [585]:
df_20.columns = df_20.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [586]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_20.columns if c != "country"]

df_20[num_cols] = (
    df_20[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [587]:
df_20.drop_duplicates(inplace=True)
df_20.dropna(inplace=True)

df_info = pd.DataFrame({
    'dtype': df_20.dtypes,
    'null_count': df_20.isnull().sum(),
    'unique_count': df_20.nunique()
})
print(df_info)

           dtype  null_count  unique_count
country   object           0            22
1960     float64           0            22
1961     float64           0            20
1962     float64           0            21
1963     float64           0            21
...          ...         ...           ...
2016     float64           0            20
2017     float64           0            22
2018     float64           0            21
2019     float64           0            22
2020     float64           0            22

[62 rows x 3 columns]


In [588]:
df_20.sample(10)

Unnamed: 0,country,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
25,Croatia,7.4,7.1,6.5,6.9,6.8,6.0,5.7,5.4,5.5,...,14.0,15.4,16.1,17.4,18.1,18.9,19.9,20.7,21.5,22.8
7,Hungary,5.5,5.5,5.4,5.3,5.2,5.2,5.1,5.0,5.0,...,42.3,44.5,45.6,47.3,47.9,46.7,44.7,43.9,38.7,30.4
0,Austria,13.0,12.6,12.0,11.6,11.3,11.2,11.4,11.5,12.0,...,40.4,41.5,41.4,41.7,42.1,42.2,42.0,41.3,40.6,41.2
11,Latvia,11.9,12.3,12.4,12.3,12.8,13.3,12.6,12.3,12.1,...,44.6,45.0,44.6,44.0,41.5,40.9,40.4,39.5,38.4,39.5
5,Germany,7.6,7.1,6.6,6.1,5.9,5.8,5.7,5.8,6.1,...,33.9,34.5,34.8,35.0,35.0,35.5,34.7,33.9,33.3,33.1
23,United States,5.3,5.6,5.9,6.3,6.9,7.7,8.4,9.0,9.7,...,40.7,40.7,40.6,40.2,40.3,39.8,39.8,39.6,40.0,40.5
4,Finland,4.0,4.1,4.0,4.2,4.4,4.6,4.8,5.1,5.3,...,40.9,41.5,42.1,42.8,44.3,44.9,44.8,44.6,45.4,46.1
6,Greece,1.2,1.2,1.2,1.2,1.1,1.1,1.0,1.0,1.1,...,7.4,7.6,7.0,8.2,8.8,9.4,10.3,11.1,12.4,13.8
16,Portugal,9.5,8.8,8.5,8.2,8.0,7.8,7.5,7.5,7.4,...,42.8,45.6,47.6,49.3,50.7,52.8,54.9,55.9,56.8,57.9
3,Denmark,7.8,8.0,8.3,8.9,9.3,9.5,10.2,11.1,11.1,...,49.0,50.6,51.5,52.5,53.8,54.0,54.2,54.2,54.1,54.2


In [589]:
#df_20.to_csv('../data/Cleaned/cleaned_share_of_births_outside_of_marriage_oecd.csv', index=False)

In [590]:
#df_20.to_sql('share_of_births_outside_of_marriage_oecd',engine, if_exists='replace', index=False)

In [591]:
df_21_1= pd.read_csv('../data/Raw/OECD/SF_3_1_Marriage_divorce_rate_mean_age_first_marriage_S1.csv')
#mean_age_first_marriage
df_21_1

Unnamed: 0,Country,Gender,1990,1991,1992,1993,1994,1995,1996,1997,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,Australia,Male,265,267,269,270,272,273,276,278,...,297,298,299,300,301,303,304,307,307,306
1,Australia,Female,243,245,247,248,251,253,257,259,...,280,281,283,284,285,287,288,292,293,292
2,Czechia,Male,243,243,245,247,251,255,259,265,...,310,312,313,314,316,317,318,319,320,324
3,Czechia,Female,216,216,219,221,224,228,231,236,...,281,283,285,287,288,290,291,292,294,297
4,Denmark,Male,305,306,310,314,318,319,325,322,...,338,343,344,344,343,347,348,349,351,353
5,Denmark,Female,278,280,283,288,292,292,299,301,...,314,318,319,319,319,322,324,325,328,330
6,Greece,Male,290,293,296,297,299,301,302,306,...,327,328,329,330,332,332,333,334,337,338
7,Greece,Female,249,252,255,255,258,260,263,266,...,294,295,297,299,301,301,303,303,307,307
8,Japan,Male,284,284,284,284,285,285,285,285,...,307,308,309,311,311,311,311,311,312,310
9,Japan,Female,259,259,260,261,262,263,264,266,...,290,292,293,294,294,294,294,294,296,294


In [592]:
df_info = pd.DataFrame({
    'datatypes': df_21_1.dtypes,
    'null_count': df_21_1.isnull().sum(),
    'unique_count': df_21_1.nunique()
})
print(df_info)

        datatypes  null_count  unique_count
Country    object           0            10
Gender     object           0             2
1990       object           0            17
1991       object           0            18
1992       object           0            18
1993       object           0            19
1994       object           0            16
1995       object           0            18
1996       object           0            19
1997       object           0            17
1998       object           0            14
1999       object           0            19
2000       object           0            18
2001       object           0            18
2002       object           0            19
2003       object           0            19
2004       object           0            16
2005       object           0            18
2006       object           0            18
2007       object           0            19
2008       object           0            18
2009       object           0   

In [593]:
df_21_1.columns = df_21_1.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [594]:
# --- Ensure "country" and "gender" are strings
df_21_1["country"] = df_21_1["country"].astype(str).str.strip().str.title()
df_21_1["gender"] = df_21_1["gender"].astype(str).str.strip()

# --- Identify numeric columns: everything except country and birth order ---
num_cols = [c for c in df_21_1.columns if c not in ["country", "gender"]]
# --- Robust cleaning -> convert to float ---
df_21_1[num_cols] = (
    df_21_1[num_cols].astype(str)
    .replace({"\xa0": "", "\u202f": "", ",": "."}, regex=True)
    .replace(r"[^\d\.\-]", "", regex=True)
    .replace(r"\.\.+", ".", regex=True)
    .replace(r"^\.$|^\s*$", np.nan, regex=True)
    .apply(pd.to_numeric, errors="coerce")
)
df_21_1[num_cols] = df_21_1[num_cols].round(2)

In [595]:
df_21_1.drop_duplicates(inplace=True)
df_21_1.dropna(inplace=True)

df_info = pd.DataFrame({
    'datatypes': df_21_1.dtypes,
    'null_count': df_21_1.isnull().sum(),
    'unique_count': df_21_1.nunique()
})
print(df_info)

        datatypes  null_count  unique_count
country    object           0             9
gender     object           0             2
1990      float64           0            15
1991      float64           0            16
1992      float64           0            16
1993      float64           0            17
1994      float64           0            15
1995      float64           0            16
1996      float64           0            17
1997      float64           0            15
1998      float64           0            13
1999      float64           0            17
2000      float64           0            16
2001      float64           0            16
2002      float64           0            17
2003      float64           0            17
2004      float64           0            15
2005      float64           0            17
2006      float64           0            17
2007      float64           0            17
2008      float64           0            16
2009      float64           0   

In [596]:
#df_21_1.to_csv('../data/Cleaned/cleaned_mean_age_first_marriage_oecd.csv',index=False)

In [597]:
#df_21_1.to_sql('mean_age_first_marriage_oecd', engine, if_exists='replace', index= False)

In [598]:
df_21_2 = pd.read_csv('../data/Raw/OECD/SF_3_1_Marriage_divorce_rates_S2.csv')
#divorce_rates_per_1000_oecd
df_21_2

Unnamed: 0,Country,1970,1971,1972,1973,1974,1975,1976,1977,1978,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,Austria,14,13,13,13,14,14,15,15,16,...,19,19,19,18,18,18.0,18.0,17,16,15
1,Belgium,07,7,8,9,10,11,13,13,14,...,22,22,22,21,20,20.0,20.0,18,19,17
2,Czechia,22,24,23,25,25,26,25,25,26,...,27,25,25,24,24,23.0,23.0,20,20,19
3,Denmark,19,27,26,25,26,26,26,26,26,...,34,34,29,30,26,26.0,18.0,27,22,21
4,Estonia,32,32,33,32,33,34,36,39,38,...,25,24,26,25,25,24.0,21.0,19,,19
5,Finland,13,16,18,19,21,20,21,21,22,...,25,25,25,25,24,24.0,24.0,24,22,20
6,Germany,13,14,15,16,18,19,20,15,10,...,21,21,20,20,19,18.0,18.0,17,17,16
7,Greece,04,4,4,5,4,4,4,5,5,...,15,13,14,10,18,,,,,
8,Hungary,22,23,23,24,23,25,26,26,27,...,20,20,21,20,19,17.0,18.0,15,19,18
9,Italy,..,3,6,3,3,2,2,2,2,...,9,9,14,16,15,15.0,14.0,11,14,14


In [599]:
df_info = pd.DataFrame({
    'datatypes': df_21_2.dtypes,
    'null_count': df_21_2.isnull().sum(),
    'unique_count': df_21_2.nunique()
})
print(df_info)

        datatypes  null_count  unique_count
Country    object           0            28
1970       object           0            18
1971       object           0            19
1972       object           0            19
1973       object           0            18
1974       object           0            18
1975       object           0            19
1976       object           0            18
1977       object           0            18
1978       object           0            18
1979       object           0            15
1980       object           0            18
1981       object           0            20
1982       object           0            22
1983       object           0            24
1984       object           0            20
1985       object           0            19
1986       object           0            20
1987       object           0            20
1988       object           0            20
1989       object           0            19
1990       object           0   

In [600]:
df_21_2.columns = df_21_2.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [601]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_21_2.columns if c != "country"]

df_21_2[num_cols] = (
    df_21_2[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [602]:
df_21_2.drop_duplicates(inplace=True)
df_21_2.dropna(inplace=True)

df_info = pd.DataFrame({
    'datatypes': df_21_2.dtypes,
    'null_count': df_21_2.isnull().sum(),
    'unique_count': df_21_2.nunique()
})
print(df_info)

        datatypes  null_count  unique_count
country    object           0            23
1970      float64           0            15
1971      float64           0            17
1972      float64           0            15
1973      float64           0            14
1974      float64           0            15
1975      float64           0            16
1976      float64           0            14
1977      float64           0            13
1978      float64           0            15
1979      float64           0            12
1980      float64           0            14
1981      float64           0            17
1982      float64           0            17
1983      float64           0            19
1984      float64           0            16
1985      float64           0            15
1986      float64           0            16
1987      float64           0            16
1988      float64           0            15
1989      float64           0            15
1990      float64           0   

In [603]:
df_21_2.head(8)

Unnamed: 0,country,1970,1971,1972,1973,1974,1975,1976,1977,1978,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,Austria,1.4,1.3,1.3,1.3,1.4,1.4,1.5,1.5,1.6,...,1.9,1.9,1.9,1.8,1.8,1.8,1.8,1.7,1.6,1.5
1,Belgium,0.7,0.7,0.8,0.9,1.0,1.1,1.3,1.3,1.4,...,2.2,2.2,2.2,2.1,2.0,2.0,2.0,1.8,1.9,1.7
2,Czechia,2.2,2.4,2.3,2.5,2.5,2.6,2.5,2.5,2.6,...,2.7,2.5,2.5,2.4,2.4,2.3,2.3,2.0,2.0,1.9
3,Denmark,1.9,2.7,2.6,2.5,2.6,2.6,2.6,2.6,2.6,...,3.4,3.4,2.9,3.0,2.6,2.6,1.8,2.7,2.2,2.1
5,Finland,1.3,1.6,1.8,1.9,2.1,2.0,2.1,2.1,2.2,...,2.5,2.5,2.5,2.5,2.4,2.4,2.4,2.4,2.2,2.0
6,Germany,1.3,1.4,1.5,1.6,1.8,1.9,2.0,1.5,1.0,...,2.1,2.1,2.0,2.0,1.9,1.8,1.8,1.7,1.7,1.6
8,Hungary,2.2,2.3,2.3,2.4,2.3,2.5,2.6,2.6,2.7,...,2.0,2.0,2.1,2.0,1.9,1.7,1.8,1.5,1.9,1.8
10,Japan,0.9,1.0,1.0,1.0,1.0,1.1,1.1,1.1,1.2,...,1.8,1.8,1.8,1.7,1.7,1.7,1.7,1.6,1.47,1.52


In [604]:
#df_21_2.to_csv('../data/Cleaned/cleaned_divorce_rates_per_1000_oecd.csv', index=False)

In [605]:
#df_21_2.to_sql('divorce_rates_per_1000_oecd',engine, if_exists= 'replace' , index=False)

In [606]:
df_21_3= pd.read_csv('../data/Raw/OECD/SF_3_1_Marriage_divorce_rates_prev_marital_status_S3.csv')
#share_of_previous_marital_status
df_21_3

Unnamed: 0,Country,Previous marital status,2000,2001,2002,2003,2004,2005,2006,2008,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Australia,Single never married,759,761,755,756,762,769,773,782,...,796,797,800,805,805,801,803,801,803,807
1,Australia,Divorced,220,218,224,223,218,213,209,202,...,190,188,186,182,181,185,183,185,183,180
2,Australia,Widowed,21,21,21,21,19,18,18,17,...,15,15,14,13,14,14,14,14,13,13
3,Austria,Single never married,766,747,741,737,729,731,739,748,...,755,757,767,771,775,777,781,781,782,780
4,Austria,Divorced,222,242,247,252,259,257,249,242,...,235,234,223,220,215,215,209,210,210,216
5,Austria,Widowed,12,11,12,11,12,12,11,10,...,10,9,10,9,10,8,9,9,8,4
6,Czechia,Single never married,749,745,743,740,739,742,745,726,...,740,740,752,756,766,767,764,764,761,759
7,Czechia,Divorced,237,242,244,247,247,245,244,261,...,249,249,238,234,224,223,226,226,229,230
8,Czechia,Widowed,14,13,13,13,14,12,11,13,...,12,11,10,10,10,10,10,10,10,11
9,Denmark,Single never married,759,760,762,764,760,756,756,763,...,772,760,750,762,761,769,764,771,776,783


In [607]:
df_info = pd.DataFrame({
    'datatypes': df_21_3.dtypes,
    'null_count': df_21_3.isnull().sum(),
    'unique_count': df_21_3.nunique()
})
print(df_info)

                        datatypes  null_count  unique_count
Country                    object           0            20
Previous marital status    object           0             3
2000                       object           0            47
2001                       object           0            51
2002                       object           0            56
2003                       object           0            50
2004                       object           0            50
2005                       object           0            52
2006                       object           0            49
2008                       object           0            47
2009                       object           0            50
2010                       object           0            49
2011                       object           0            49
2012                       object           0            53
2013                       object           0            49
2014                       object       

In [608]:
df_21_3.columns = df_21_3.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_21_3.head()

Unnamed: 0,country,previous_marital_status,2000,2001,2002,2003,2004,2005,2006,2008,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Australia,Single never married,759,761,755,756,762,769,773,782,...,796,797,800,805,805,801,803,801,803,807
1,Australia,Divorced,220,218,224,223,218,213,209,202,...,190,188,186,182,181,185,183,185,183,180
2,Australia,Widowed,21,21,21,21,19,18,18,17,...,15,15,14,13,14,14,14,14,13,13
3,Austria,Single never married,766,747,741,737,729,731,739,748,...,755,757,767,771,775,777,781,781,782,780
4,Austria,Divorced,222,242,247,252,259,257,249,242,...,235,234,223,220,215,215,209,210,210,216


In [609]:
# --- Ensure "country" and "previous_marital_status" are strings
df_21_3["country"] = df_21_3["country"].astype(str).str.strip().str.title()
df_21_3["previous_marital_status"] = df_21_3["previous_marital_status"].astype(str).str.strip()

# --- Identify numeric columns: everything except country and birth order ---
num_cols = [c for c in df_21_3.columns if c not in ["country", "previous_marital_status"]]
# --- Robust cleaning -> convert to float ---
df_21_3[num_cols] = (
    df_21_3[num_cols].astype(str)
    .replace({"\xa0": "", "\u202f": "", ",": "."}, regex=True)
    .replace(r"[^\d\.\-]", "", regex=True)
    .replace(r"\.\.+", ".", regex=True)
    .replace(r"^\.$|^\s*$", np.nan, regex=True)
    .apply(pd.to_numeric, errors="coerce")
)
df_21_3[num_cols] = df_21_3[num_cols].round(2)

In [610]:
df_21_3.drop_duplicates(inplace=True)
df_21_3.dropna(inplace=True)

df_info = pd.DataFrame({
    'datatypes': df_21_3.dtypes,
    'null_count': df_21_3.isnull().sum(),
    'unique_count': df_21_3.nunique()
})
print(df_info)

                        datatypes  null_count  unique_count
country                    object           0            20
previous_marital_status    object           0             3
2000                      float64           0            47
2001                      float64           0            51
2002                      float64           0            56
2003                      float64           0            50
2004                      float64           0            50
2005                      float64           0            52
2006                      float64           0            49
2008                      float64           0            47
2009                      float64           0            50
2010                      float64           0            49
2011                      float64           0            49
2012                      float64           0            53
2013                      float64           0            49
2014                      float64       

In [611]:
#df_21_3.to_csv('../data/Cleaned/cleaned_share_of_previous_marital_status_oecd.csv', index=False)

In [612]:
#df_21_3.to_sql('share_of_previous_marital_status_oecd', engine, if_exists= 'replace', index =  False)

In [613]:
df_22_1 = pd.read_csv('../data/Raw/OECD/SF3_3_A_in_private_households_by_partnership_status_S1.csv')
#hauseholds_by_partnership_status_oecd
df_22_1

Unnamed: 0,Country,20+_All_Total_Living_with_a_partner(%),20+_All_Married or in a civil or registered partnership_living_with_a_partner(%),20+_All_Cohabiting_living_with_a_partner(%),20+_All_Not living with a partner(%),20/34_Total_living_with_a_partner(%),20/34_Married or in a civil or registered partnership_living_with_a_partner(%),20/34_Cohabiting_living_with_a_partner(%),Not living with a partner_Total(%),Living with at least one parent(%)
0,Australia (c),6379,5359,1020,3621,4706,2941,1765,5294,..
1,Austria,5880,4910,970,4120,3911,2215,1697,6089,3382
2,Belgium,6215,5351,864,3785,4528,2933,1594,5472,3134
3,Canada (d),6689,5446,1243,3311,5534,3355,2179,4466,..
4,Czech Republic,5117,4539,579,4883,3078,2132,946,6922,3620
5,Denmark,6415,5002,1412,3585,5054,2186,2868,4946,1067
6,Estonia,5393,3730,1664,4607,4531,1781,2750,5469,2646
7,France,6414,4941,1472,3586,5042,2189,2853,4958,2208
8,Germany,6261,5391,869,3739,3953,2215,1739,5974,2754
9,Greece,6023,5852,171,3977,3313,2924,390,6687,4543


In [614]:
df_info = pd.DataFrame({
    'datatypes': df_22_1.dtypes,
    'null_count': df_22_1.isnull().sum(),
    'unique_count': df_22_1.nunique()
})
print(df_info)

                                                   datatypes  null_count  \
Country                                               object           0   
20+_All_Total_Living_with_a_partner(%)                object           0   
20+_All_Married or in a civil or registered par...    object           0   
20+_All_Cohabiting_living_with_a_partner(%)           object           0   
20+_All_Not living with a partner(%)                  object           0   
20/34_Total_living_with_a_partner(%)                  object           0   
20/34_Married or in a civil or registered partn...    object           0   
20/34_Cohabiting_living_with_a_partner(%)             object           0   
Not living with a partner_Total(%)                    object           0   
Living with at least one parent(%)                    object           0   

                                                    unique_count  
Country                                                       37  
20+_All_Total_Living_with_a_p

In [615]:
df_22_1.columns = df_22_1.columns.str.lower() \
                .str.replace(' ', '_') \


df_22_1.head()

Unnamed: 0,country,20+_all_total_living_with_a_partner(%),20+_all_married_or_in_a_civil_or_registered_partnership_living_with_a_partner(%),20+_all_cohabiting_living_with_a_partner(%),20+_all_not_living_with_a_partner(%),20/34_total_living_with_a_partner(%),20/34_married_or_in_a_civil_or_registered_partnership_living_with_a_partner(%),20/34_cohabiting_living_with_a_partner(%),not_living_with_a_partner_total(%),living_with_at_least_one_parent(%)
0,Australia (c),6379,5359,1020,3621,4706,2941,1765,5294,..
1,Austria,5880,4910,970,4120,3911,2215,1697,6089,3382
2,Belgium,6215,5351,864,3785,4528,2933,1594,5472,3134
3,Canada (d),6689,5446,1243,3311,5534,3355,2179,4466,..
4,Czech Republic,5117,4539,579,4883,3078,2132,946,6922,3620


In [616]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_22_1.columns if c != "country"]

df_22_1[num_cols] = (
    df_22_1[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [617]:
df_22_1["country"] = df_22_1["country"].str.replace(r"\s*\(.*?\)", "", regex=True)
print(df_22_1["country"].unique())

['Australia' 'Austria' 'Belgium' 'Canada' 'Czech Republic' 'Denmark'
 'Estonia' 'France' 'Germany' 'Greece' 'Hungary' 'Iceland' 'Ireland'
 'Italy' 'Latvia' 'Luxembourg' 'Netherlands' 'New Zealand' 'Norway'
 'Poland' 'Portugal' 'Slovak Republic' 'Slovenia' 'Spain' 'Sweden'
 'Switzerland' 'United Kingdom' 'United States' 'OECD-28 average'
 'Bulgaria' 'Croatia' 'Cyprus' 'Lithuania' 'Malta' 'Romania' 'EU average'
 'Eurozone average']


In [618]:
df_22_1.drop_duplicates(inplace = True)
df_22_1.dropna(inplace=True)

df_info = pd.DataFrame({
    'datatypes': df_22_1.dtypes,
    'null_count': df_22_1.isnull().sum(),
    'unique_count': df_22_1.nunique()
})
print(df_info)

                                                   datatypes  null_count  \
country                                               object           0   
20+_all_total_living_with_a_partner(%)               float64           0   
20+_all_married_or_in_a_civil_or_registered_par...   float64           0   
20+_all_cohabiting_living_with_a_partner(%)          float64           0   
20+_all_not_living_with_a_partner(%)                 float64           0   
20/34_total_living_with_a_partner(%)                 float64           0   
20/34_married_or_in_a_civil_or_registered_partn...   float64           0   
20/34_cohabiting_living_with_a_partner(%)            float64           0   
not_living_with_a_partner_total(%)                   float64           0   
living_with_at_least_one_parent(%)                   float64           0   

                                                    unique_count  
country                                                       34  
20+_all_total_living_with_a_p

In [619]:
df_22_1.sample(8)

Unnamed: 0,country,20+_all_total_living_with_a_partner(%),20+_all_married_or_in_a_civil_or_registered_partnership_living_with_a_partner(%),20+_all_cohabiting_living_with_a_partner(%),20+_all_not_living_with_a_partner(%),20/34_total_living_with_a_partner(%),20/34_married_or_in_a_civil_or_registered_partnership_living_with_a_partner(%),20/34_cohabiting_living_with_a_partner(%),not_living_with_a_partner_total(%),living_with_at_least_one_parent(%)
1,Austria,58.8,49.1,9.7,41.2,39.11,22.15,16.97,60.89,33.82
31,Cyprus,65.14,60.71,4.43,34.86,42.28,33.11,9.16,57.72,37.66
35,EU average,59.27,50.93,8.35,40.73,39.27,24.86,14.42,60.49,37.15
25,Switzerland,64.15,53.46,10.69,35.85,42.43,25.4,17.03,55.0,31.44
8,Germany,62.61,53.91,8.69,37.39,39.53,22.15,17.39,59.74,27.54
33,Malta,61.79,59.23,2.56,38.21,34.61,30.44,4.16,65.39,55.11
4,Czech Republic,51.17,45.39,5.79,48.83,30.78,21.32,9.46,69.22,36.2
19,Poland,57.72,55.6,2.12,42.28,37.6,34.2,3.4,62.4,45.65


In [620]:
#df_22_1.to_csv('../data/Cleaned/cleaned_hauseholds_by_partnership_status_oecd.csv', index=False)

In [621]:
#df_22_1.to_sql('hauseholds_by_partnership_status_oecd', engine, if_exists='replace', index= False)

In [622]:
df_22_2 = pd.read_csv('../data/Raw/OECD/SF3_3_B_ by level of educational attainment_S2.csv')
#level_of_educational_attainment
df_22_2

Unnamed: 0,Country,Low_Education_Total_living_with_a_partner(%),Low_educationMarried or in a civil or registered partnership_living_with_a_partner(%),Low_education_Cohabiting_living_with_a_partner(%),Not living with a partner_Low_education(%),Medium education_Total_Living with a partner(%),Medium education_Married or in a civil or registered partnership_Living with a partner(%),Medium education_Cohabiting_Living with a partner(%),Not living with a partner_Medium education(%),High education_Total_Living with a partner(%),High education_Married or in a civil or registered partnership_Living with a partner(%),High education_Cohabiting_Living with a partner(%),Not living with a partner_High education(%)
0,Austria,5681,5049,632,4319,5927,4873,1054,,6003,4838,1165,3997
1,Belgium,6228,5611,617,3772,6079,4980,1099,,6709,5658,1051,3291
2,Czech Republic,4081,3655,426,5919,5399,4787,612,4601.0,5729,5026,703,4271
3,Estonia,4217,2639,1578,5783,5441,3661,1779,4559.0,6014,4445,1569,3986
4,France,6112,5193,918,3888,6568,4917,1651,3432.0,6558,4660,1898,3442
5,Germany,5446,4879,567,4554,6238,5313,925,3762.0,6889,5916,974,3111
6,Greece,6381,6288,93,3619,5700,5488,212,4300.0,5833,5570,263,4167
7,Hungary,5033,4038,995,4967,5794,4678,1115,4206.0,5956,5102,855,4044
8,Iceland,5186,4102,1084,4814,5831,4657,1174,4169.0,6972,5453,1519,3028
9,Latvia,3627,2592,1035,6373,4932,3954,978,5068.0,5291,4539,752,4709


In [623]:
df_info = pd.DataFrame({
    'datatypes': df_22_2.dtypes,
    'null_count': df_22_2.isnull().sum(),
    'unique_count': df_22_2.nunique()
})
print(df_info)

                                                   datatypes  null_count  \
Country                                               object           0   
Low_Education_Total_living_with_a_partner(%)          object           0   
Low_educationMarried or in a civil or registere...    object           0   
Low_education_Cohabiting_living_with_a_partner(%)     object           0   
Not living with a partner_Low_education(%)            object           0   
Medium education_Total_Living with a partner(%)       object           0   
Medium education_Married or in a civil or regis...    object           0   
Medium education_Cohabiting_Living with a partn...    object           0   
Not living with a partner_Medium education(%)         object           2   
High education_Total_Living with a partner(%)         object           0   
High education_Married or in a civil or registe...    object           0   
High education_Cohabiting_Living with a partner(%)    object           0   
Not living w

In [624]:
df_22_2.columns = df_22_2.columns.str.lower() \
                .str.replace(' ', '_') \


df_22_2.head()

Unnamed: 0,country,low_education_total_living_with_a_partner(%),low_educationmarried_or_in_a_civil_or_registered_partnership_living_with_a_partner(%),low_education_cohabiting_living_with_a_partner(%),not_living_with_a_partner_low_education(%),medium_education_total_living_with_a_partner(%),medium_education_married_or_in_a_civil_or_registered_partnership_living_with_a_partner(%),medium_education_cohabiting_living_with_a_partner(%),not_living_with_a_partner_medium_education(%),high_education_total_living_with_a_partner(%),high_education_married_or_in_a_civil_or_registered_partnership_living_with_a_partner(%),high_education_cohabiting_living_with_a_partner(%),not_living_with_a_partner_high_education(%)
0,Austria,5681,5049,632,4319,5927,4873,1054,,6003,4838,1165,3997
1,Belgium,6228,5611,617,3772,6079,4980,1099,,6709,5658,1051,3291
2,Czech Republic,4081,3655,426,5919,5399,4787,612,4601.0,5729,5026,703,4271
3,Estonia,4217,2639,1578,5783,5441,3661,1779,4559.0,6014,4445,1569,3986
4,France,6112,5193,918,3888,6568,4917,1651,3432.0,6558,4660,1898,3442


In [625]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_22_2.columns if c != "country"]

df_22_2[num_cols] = (
    df_22_2[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [626]:
df_22_2["country"] = df_22_2["country"].str.replace(r"\s*\(.*?\)", "", regex=True)
print(df_22_2["country"].unique())

['Austria' 'Belgium' 'Czech Republic' 'Estonia' 'France' 'Germany'
 'Greece' 'Hungary' 'Iceland' 'Latvia' 'Luxembourg' 'Norway' 'Poland'
 'Portugal' 'Slovenia' 'Spain' 'Sweden' 'United Kingdom' 'OECD-19 average'
 'Bulgaria' 'Croatia' 'Cyprus' 'Lithuania' 'Malta' 'Romania' 'EU average'
 'Eurozone average']


In [627]:
df_22_2.drop_duplicates(inplace=True)
df_22_2.dropna(inplace=True)


df_info = pd.DataFrame({
    'datatypes': df_22_2.dtypes,
    'null_count': df_22_2.isnull().sum(),
    'unique_count': df_22_2.nunique()
})
print(df_info)

                                                   datatypes  null_count  \
country                                               object           0   
low_education_total_living_with_a_partner(%)         float64           0   
low_educationmarried_or_in_a_civil_or_registere...   float64           0   
low_education_cohabiting_living_with_a_partner(%)    float64           0   
not_living_with_a_partner_low_education(%)           float64           0   
medium_education_total_living_with_a_partner(%)      float64           0   
medium_education_married_or_in_a_civil_or_regis...   float64           0   
medium_education_cohabiting_living_with_a_partn...   float64           0   
not_living_with_a_partner_medium_education(%)        float64           0   
high_education_total_living_with_a_partner(%)        float64           0   
high_education_married_or_in_a_civil_or_registe...   float64           0   
high_education_cohabiting_living_with_a_partner(%)   float64           0   
not_living_w

In [628]:
#df_22_2.to_csv('../data/Cleaned/cleaned_level_of_educational_attainment_oecd.csv', index=False)

In [629]:
#df_22_2.to_sql('level_of_educational_attainment_oecd',engine, if_exists='replace', index= False)

In [630]:
df_6666 = pd.read_csv('../data/Raw/OECD/OECD_df_famliy_selected.csv')
df_6666

Unnamed: 0,STRUCTURE,STRUCTURE_ID,STRUCTURE_NAME,ACTION,COU,Country,SEX,Sex,IND,Indicator,...,OBS_VALUE,Observation Value,OBS_STATUS,Observation Status,UNIT_MEASURE,Unit of Measures,UNIT_MULT,Multiplier,BASE_PER,Base reference period
0,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,LVA,Latvia,TOTAL,Total,FAM3,Share of births outside of marriage (% of all ...,...,39.5,,A,,PC,Percentage,0,Units,,
1,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,GRC,Greece,TOTAL,Total,FAM3,Share of births outside of marriage (% of all ...,...,11.1,,A,,PC,Percentage,0,Units,,
2,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,CHL,Chile,TOTAL,Total,FAM3,Share of births outside of marriage (% of all ...,...,74.8,,A,,PC,Percentage,0,Units,,
3,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,NLD,Netherlands,TOTAL,Total,FAM3,Share of births outside of marriage (% of all ...,...,51.9,,A,,PC,Percentage,0,Units,,
4,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,LTU,Lithuania,TOTAL,Total,FAM3,Share of births outside of marriage (% of all ...,...,26.4,,A,,PC,Percentage,0,Units,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
500,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,COL,Colombia,TOTAL,Total,FAM2,Mean age of women at childbirth,...,30.4,,A,,YR,Years,0,Units,,
501,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,OAVG,OECD - Average,TOTAL,Total,FAM2,Mean age of women at childbirth,...,30.5,,A,,YR,Years,0,Units,,
502,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,OAVG,OECD - Average,TOTAL,Total,FAM2,Mean age of women at childbirth,...,30.6,,A,,YR,Years,0,Units,,
503,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,OAVG,OECD - Average,TOTAL,Total,FAM2,Mean age of women at childbirth,...,30.7,,A,,YR,Years,0,Units,,
