In [None]:
import pandas as pd
import os, re
from pathlib import Path
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from sqlalchemy import create_engine, types
from sqlalchemy import text 
from openpyxl import load_workbook
from pathlib import Path

In [1071]:
from dotenv import dotenv_values

config = dotenv_values()

# define variables for the login
pg_user = config['POSTGRES_USER']  # align the key label with your .env file !
pg_host = config['POSTGRES_HOST']
pg_port = config['POSTGRES_PORT']
pg_db = config['POSTGRES_DB']
pg_schema = config['POSTGRES_SCHEMA']
pg_pass = config['POSTGRES_PASS']

In [1072]:
url = f'postgresql://{pg_user}:{pg_pass}@{pg_host}:{pg_port}/{pg_db}'
engine = create_engine(url, echo=False)

In [1073]:
my_schema = 'team_5' # update it to your schema

with engine.begin() as conn: 
    result = conn.execute(text(f'SET search_path TO {my_schema};'))

In [1074]:
df_1= pd.read_csv('../data/Raw/World_Marriage_Dataset.csv')

In [1075]:
df_1.drop(columns=["Sr.No."], inplace=True)

In [1076]:
df_1.columns = df_1.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [1077]:
df_1.rename(columns={
    "agegroup": "age_group",
    "maritalstatus": "marital_status",
    "dataprocess": "data_process",
    "Data Collection (Start Year)": "data_collection_start_year",
    "Data Collection (End Year)": "data_collection_end_year",
    "Data Source": "data_source",
    "Country": "country",
    "Sex": "sex"
}, inplace=True)

In [1078]:
df_1.drop_duplicates(inplace=True)
df_1.dropna(inplace=True)

df_1['data_collection_start_year'] = df_1['data_collection_start_year'].astype(str).str.replace(',', '').astype(int)
df_1['data_collection_end_year'] = df_1['data_collection_end_year'].astype(str).str.replace(',', '').astype(int)

In [1079]:
df_info = pd.DataFrame({
    'datatypes': df_1.dtypes,
    'null_count': df_1.isnull().sum(),
    'unique_count': df_1.nunique()
})
print(df_info)

                           datatypes  null_count  unique_count
country                       object           0           235
age_group                     object           0            63
sex                           object           0             2
marital_status                object           0            35
data_process                  object           0             6
data_collection_start_year     int32           0            62
data_collection_end_year       int32           0            60
data_source                   object           0            15


In [1080]:
#df_1.to_csv("cleaned_world_marriage.csv", index=False)

In [1081]:
#df_1.to_sql('world_marriage', engine, if_exists='replace', index=False)

In [1082]:
s_1 = ('../data/Cleaned/cleaned_world_marriage.csv')

In [1083]:
AGE_COL = "age_group"

# 0) Ensure s_1 is a DataFrame
if isinstance(s_1, (str, Path)):
    s_1 = pd.read_csv(s_1)
elif isinstance(s_1, tuple) and len(s_1) == 1 and isinstance(s_1[0], (str, Path)):
    s_1 = pd.read_csv(s_1[0])

# 1) Normalize age labels; keep [+75]; map any 65+ to [65-69]

def norm_age(x):
    if pd.isna(x): return x
    x = str(x).replace("–","-").replace("—","-").replace("to","-").replace("_","-")
    x = re.sub(r"[()]", "", x)
    x = re.sub(r"\s+", "", x)
    if re.search(r"\[\+75\]|\[75\+\]|75\+|\+75", x):   # preserve 75+
        return "[+75]"
    if re.fullmatch(r"\+?65\+?|\[?\+65\]?|\[?65\+\]?", x, flags=re.I):  # merge 65+
        return "[65-69]"
    m = re.match(r"^\[?(\d{1,3})-(\d{1,3})\]?$", x)   # standard ranges
    if m:
        a, b = map(int, m.groups())
        return f"[{a}-{b}]"
    m = re.search(r"(\d{1,3})", x)                    # fallback: first number
    return f"[{m.group(1)}]" if m else x

s_1[AGE_COL] = s_1[AGE_COL].astype(str).map(norm_age)

# 2) Keep top-14 age buckets by frequency (delete all others)
top14 = s_1[AGE_COL].value_counts(dropna=False).nlargest(14).index.tolist()
s_1 = s_1[s_1[AGE_COL].isin(top14)].copy()

# 3) Natural ordering (put [+75] last)
def start_num(lbl):
    return 10**9 if lbl == "[+75]" else int(re.search(r"\d+", str(lbl)).group())
cats = sorted(top14, key=start_num)
s_1[AGE_COL] = pd.Categorical(s_1[AGE_COL], categories=cats, ordered=True)
s_1 = s_1.sort_values(AGE_COL).reset_index(drop=True)

print("Kept age buckets (14):", list(s_1[AGE_COL].cat.categories))

Kept age buckets (14): ['[10-14]', '[15-19]', '[20-24]', '[25-29]', '[30-34]', '[35-39]', '[40-44]', '[45-49]', '[50-54]', '[55-59]', '[60-64]', '[65-69]', '[70-74]', '[+75]']


In [1084]:
#s_1.to_csv('../data/Staging/staging_world_marriage.csv', index= False)

In [1085]:
#s_1.to_sql('staging_world_marriage', engine, if_exists='replace', index=False)

In [1086]:
df_2 = pd.read_csv('../data/Raw/age-at-marriage-women.csv')

In [1087]:
df_2.columns = df_2.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [1088]:
df_2 = df_2.drop(columns=['1005564annotations'])

df_2.rename(columns={
    "entity": "country",   
}, inplace=True)

In [1089]:
df_2.drop_duplicates(inplace=True)
df_2.dropna(inplace=True)


In [1090]:
df_2['year'] = df_2['year'].astype(str).str.replace(',', '').astype(int)

In [1091]:
df_info = pd.DataFrame({
    'datatypes': df_2.dtypes,
    'null_count': df_2.isnull().sum(),
    'unique_count': df_2.nunique()
})
print(df_info)

                                    datatypes  null_count  unique_count
country                                object           0            41
code                                   object           0            41
year                                    int32           0            32
mean_age_of_women_at_first_marriage   float64           0           179


In [1092]:
#df_2.to_csv("cleaned_age_at_marriage_women.csv", index=False)

In [1093]:
#df_2.to_sql('age_at_marriage_women', engine, if_exists='replace', index=False)

In [1094]:
df_3= pd.read_csv('../data/Raw/marriage-rate-per-1000-inhabitants.csv')

In [1095]:
df_3.columns = df_3.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [1096]:
df_3.rename(columns={
    "entity": "country",   
}, inplace=True)

In [1097]:
df_3['year'] = df_3['year'].astype(str).str.replace(',', '').astype(int)

In [1098]:
df_3.drop_duplicates(inplace=True)
df_3.dropna(inplace=True)


In [1099]:
df_info = pd.DataFrame({
    'datatypes': df_3.dtypes,
    'null_count': df_3.isnull().sum(),
    'unique_count': df_3.nunique()
})
print(df_info)

                                              datatypes  null_count  \
country                                          object           0   
code                                             object           0   
year                                              int32           0   
crude_marriage_rate_marriages_per_1000_people   float64           0   

                                               unique_count  
country                                                  45  
code                                                     45  
year                                                    127  
crude_marriage_rate_marriages_per_1000_people           109  


In [1100]:
#df_3.to_csv("cleaned_marriage-rate-per-1000-inhabitants.csv", index=False)

In [1101]:
#df_3.to_sql('married_rate_per_1000', engine, if_exists='replace', index=False)

In [1102]:
df_4= pd.read_csv('../data/Raw/marriage-rates-in-1990-vs-2020.csv')

In [1103]:
df_4.columns = df_4.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [1104]:
df_4 = df_4.drop(columns=['worldregionsaccordingtoowid'])

df_4.rename(columns={
    "crudemarriageratemarriagesper1000people": "crude_marriage_rate",
    "crudemarriageratemarriagesper1000people1": "crude_marriage_rate_people1",
    "year1": "year_1",
    "entity": "country"
}, inplace=True)

In [1105]:
df_4.drop_duplicates(inplace=True)
df_4.dropna(inplace=True)

In [1106]:
df_4['year_1'] = pd.to_numeric(df_4['year_1'], errors='coerce').astype('Int64')

In [1107]:
df_info = pd.DataFrame({
    'datatypes': df_4.dtypes,
    'null_count': df_4.isnull().sum(),
    'unique_count': df_4.nunique()
})
print(df_info)

                            datatypes  null_count  unique_count
country                        object           0            38
code                           object           0            38
year                            int64           0            61
crude_marriage_rate           float64           0           101
crude_marriage_rate_people1   float64           0            28
year_1                          Int64           0             1


In [1108]:
#df_4.to_csv("cleaned_marriage-rates-in-1990-vs-2020.csv", index=False)

In [1109]:
#df_4.to_sql('marriage_rates_in_1990_vs_2020', engine, if_exists='replace', index=False)

In [1110]:
df_5 = pd.read_csv('../data/Raw/share-of-births-outside-marriage.csv')

In [1111]:
df_5.columns = df_5.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [1112]:

df_5.rename(columns={
    "shareofbirthsoutsideofmarriageofallbirths": "share_of_births_outside_of_marriage",
    "entity": "country"
}, inplace=True)

df_5.drop_duplicates(inplace=True)
df_5.dropna(inplace=True)

In [1113]:
df_info = pd.DataFrame({
    'datatypes': df_5.dtypes,
    'null_count': df_5.isnull().sum(),
    'unique_count': df_5.nunique()
})
print(df_info)

                                    datatypes  null_count  unique_count
country                                object           0            42
code                                   object           0            42
year                                    int64           0            62
share_of_births_outside_of_marriage   float64           0           610


In [1114]:
#df_5.to_csv("cleaned_share-of-births-outside-marriage.csv", index=False)

In [1115]:
#df_5.to_sql('share_of_births_outside_marriage', engine, if_exists='replace', index=False)

In [1116]:
df_6 = pd.read_csv('../data/Raw/share-of-men-in-england-and-wales-who-have-ever-married-by-age.csv')
df_6

Unnamed: 0,Entity,Code,Year,Proportions of men or women who had ever married by a certain age for 1900 birth cohort,Proportions of men or women who had ever married by a certain age for 1920 birth cohort,Proportions of men or women who had ever married by a certain age for 1940 birth cohort,Proportions of men or women who had ever married by a certain age for 1960 birth cohort,Proportions of men or women who had ever married by a certain age for 1970 birth cohort,Proportions of men or women who had ever married by a certain age for 1980 birth cohort,Proportions of men or women who had ever married by a certain age for 1990 birth cohort,Proportions of men or women who had ever married by a certain age for 2000 birth cohort
0,Men,,17,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0
1,Men,,18,0.1,0.1,0.4,0.6,0.1,0.0,0.0,0.0
2,Men,,19,0.8,0.6,2.0,2.5,0.7,0.3,0.1,0.0
3,Men,,20,2.4,2.2,6.0,6.2,1.9,0.7,0.3,0.1
4,Men,,21,6.1,7.4,13.6,11.9,3.9,1.4,0.6,0.2
...,...,...,...,...,...,...,...,...,...,...,...
63,Women,,46,84.5,91.6,95.5,86.9,75.0,,,
64,Women,,47,84.8,91.7,95.6,87.0,75.4,,,
65,Women,,48,85.0,91.8,95.6,87.2,75.7,,,
66,Women,,49,85.2,91.9,95.7,87.3,76.0,,,


In [1117]:
df_6.columns = df_6.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

df_6.drop_duplicates(inplace=True)

df_6.head()

Unnamed: 0,entity,code,year,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort
0,Men,,17,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0
1,Men,,18,0.1,0.1,0.4,0.6,0.1,0.0,0.0,0.0
2,Men,,19,0.8,0.6,2.0,2.5,0.7,0.3,0.1,0.0
3,Men,,20,2.4,2.2,6.0,6.2,1.9,0.7,0.3,0.1
4,Men,,21,6.1,7.4,13.6,11.9,3.9,1.4,0.6,0.2


In [1118]:
df_6 = df_6.drop(columns=['code','proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort'])

df_6.rename(columns={
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort": "1900_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort": "1920_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort": "1940_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort": "1960_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort": "1970_birthcohort",
    "entity": "sex"
}, inplace=True)
df_6

Unnamed: 0,sex,year,1900_birthcohort,1920_birthcohort,1940_birthcohort,1960_birthcohort,1970_birthcohort
0,Men,17,0.0,0.0,0.0,0.1,0.0
1,Men,18,0.1,0.1,0.4,0.6,0.1
2,Men,19,0.8,0.6,2.0,2.5,0.7
3,Men,20,2.4,2.2,6.0,6.2,1.9
4,Men,21,6.1,7.4,13.6,11.9,3.9
...,...,...,...,...,...,...,...
63,Women,46,84.5,91.6,95.5,86.9,75.0
64,Women,47,84.8,91.7,95.6,87.0,75.4
65,Women,48,85.0,91.8,95.6,87.2,75.7
66,Women,49,85.2,91.9,95.7,87.3,76.0


In [1119]:
df_6.dropna(inplace=True)
df_6.describe

<bound method NDFrame.describe of       sex  year  1900_birthcohort  1920_birthcohort  1940_birthcohort  \
0     Men    17               0.0               0.0               0.0   
1     Men    18               0.1               0.1               0.4   
2     Men    19               0.8               0.6               2.0   
3     Men    20               2.4               2.2               6.0   
4     Men    21               6.1               7.4              13.6   
..    ...   ...               ...               ...               ...   
63  Women    46              84.5              91.6              95.5   
64  Women    47              84.8              91.7              95.6   
65  Women    48              85.0              91.8              95.6   
66  Women    49              85.2              91.9              95.7   
67  Women    50              85.4              92.0              95.7   

    1960_birthcohort  1970_birthcohort  
0                0.1               0.0  
1      

In [1120]:
df_info = pd.DataFrame({
    'datatypes': df_6.dtypes,
    'null_count': df_6.isnull().sum(),
    'unique_count': df_6.nunique()
})
print(df_info)

                 datatypes  null_count  unique_count
sex                 object           0             2
year                 int64           0            34
1900_birthcohort   float64           0            66
1920_birthcohort   float64           0            61
1940_birthcohort   float64           0            62
1960_birthcohort   float64           0            67
1970_birthcohort   float64           0            65


In [1121]:
df_6.sample(12)

Unnamed: 0,sex,year,1900_birthcohort,1920_birthcohort,1940_birthcohort,1960_birthcohort,1970_birthcohort
61,Women,44,84.0,91.3,95.4,86.5,74.1
42,Women,25,48.5,61.8,80.6,60.7,35.2
55,Women,38,81.2,89.8,94.6,84.5,70.5
15,Men,32,79.3,82.2,86.5,68.8,48.2
54,Women,37,80.6,89.4,94.4,84.1,69.4
58,Women,41,82.9,90.7,95.1,85.7,72.7
35,Women,18,0.4,1.6,4.6,4.6,1.3
9,Men,26,48.3,51.0,67.7,46.8,24.1
6,Men,23,21.4,26.8,38.1,26.2,10.5
39,Women,22,24.3,39.3,57.5,40.6,18.2


In [1122]:
#df_6.to_csv("cleaned_share-of-men-in-england-and-wales-who-have-ever-married-by-age.csv", index=False)

In [1123]:
#df_6.to_sql('men_in_england_and_wales_married_by_age', engine, if_exists='replace', index=False)

In [1124]:
df_7 = pd.read_csv('../data/Raw/share-of-births-outside-marriage.csv')

In [1125]:
df_7.columns = df_7.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [1126]:
df_7.rename(columns={
    "shareofsingleparenthouseholds": "share_of_single_parent_households",
    "entity": "country"
}, inplace=True)

df_7.drop_duplicates(inplace=True)
df_7.dropna(inplace=True)
df_7.sample(5)

Unnamed: 0,country,code,year,shareofbirthsoutsideofmarriageofallbirths
2093,United States,USA,1975,14.3
1184,Lithuania,LTU,2004,28.5
1751,Slovenia,SVN,1990,24.5
1444,New Zealand,NZL,1991,35.7
1905,Sweden,SWE,1981,41.2


In [1127]:
df_info = pd.DataFrame({
    'datatypes': df_7.dtypes,
    'null_count': df_7.isnull().sum(),
    'unique_count': df_7.nunique()
})
print(df_info)

                                          datatypes  null_count  unique_count
country                                      object           0            42
code                                         object           0            42
year                                          int64           0            62
shareofbirthsoutsideofmarriageofallbirths   float64           0           610


In [1128]:
#df_7.to_csv("cleaned_share-of-single-parent-households.csv", index=False)

In [1129]:
#df_7.to_sql('single_parent_households', engine, if_exists='replace', index=False)

In [1130]:
df_8 = pd.read_csv('../data/Raw/share-of-women-in-england-and-wales-who-have-ever-married-by-age.csv')

In [1131]:
df_8.columns = df_8.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [1132]:
df_8['code'] = df_8['code'].fillna('GBR')
df_8.sample(5)

Unnamed: 0,entity,code,year,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort
66,Women,GBR,49,85.2,91.9,95.7,87.3,76.0,,,
9,Men,GBR,26,48.3,51.0,67.7,46.8,24.1,11.2,7.1,
3,Men,GBR,20,2.4,2.2,6.0,6.2,1.9,0.7,0.3,0.1
67,Women,GBR,50,85.4,92.0,95.7,87.5,76.3,,,
48,Women,GBR,31,73.9,84.5,91.7,78.4,58.7,42.0,30.2,


In [1133]:
df_8 = df_8.drop(columns=['code','proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort'])

df_8.rename(columns={
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort": "1900_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort": "1920_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort": "1940_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort": "1960_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort": "1970_birthcohort",
    "entity": "sex"
}, inplace=True)

df_8.drop_duplicates(inplace=True)
df_8.dropna(inplace=True)
df_8.sample(5)

Unnamed: 0,sex,year,1900_birthcohort,1920_birthcohort,1940_birthcohort,1960_birthcohort,1970_birthcohort
61,Women,44,84.0,91.3,95.4,86.5,74.1
19,Men,36,86.1,87.6,89.7,74.8,58.3
48,Women,31,73.9,84.5,91.7,78.4,58.7
26,Men,43,91.1,90.8,91.7,79.5,66.7
55,Women,38,81.2,89.8,94.6,84.5,70.5


In [1134]:
df_info = pd.DataFrame({
    'datatypes': df_8.dtypes,
    'null_count': df_8.isnull().sum(),
    'unique_count': df_8.nunique()
})
print(df_info)

                 datatypes  null_count  unique_count
sex                 object           0             2
year                 int64           0            34
1900_birthcohort   float64           0            66
1920_birthcohort   float64           0            61
1940_birthcohort   float64           0            62
1960_birthcohort   float64           0            67
1970_birthcohort   float64           0            65


In [1135]:
#df_8.to_csv("cleaned_share-of-women-in-england-and-wales-who-have-ever-married-by-age.csv", index=False)

In [1136]:
#df_8.to_sql('women_in_england_and_wales_married_by_age', engine, if_exists='replace', index=False)

In [1137]:
#pip install openpyxl pywin32

In [1138]:
df_excel_1 = pd.read_excel('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx')

In [1139]:
#all_sheets = pd.read_excel('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx', sheet_name=None)

In [1140]:
xls_1 = pd.ExcelFile('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx')
print(xls_1.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'MARITAL_STATUS_BY_AGE', 'CURRENTLY MARRIED', 'EVER_MARRIED', 'SMAM']


In [1141]:
excel_1 = '../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx'

# Output directory (make sure it exists)
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)

# List of sheets you want to extract
sheets_to_extract = ['MARITAL_STATUS_BY_AGE', 'CURRENTLY MARRIED', 'EVER_MARRIED', 'SMAM']

In [1142]:
"""for sheet in sheets_to_extract:
    # Read just this sheet into a DataFrame
    df_excel_1 = pd.read_excel(excel_1, sheet_name=sheet)
    
    # Optional: Clean the filename (replace spaces with underscores, etc.)
    csv_name = sheet.replace(' ', '_').lower() + '.csv'
    csv_path = os.path.join(output_dir, csv_name)
    
    # Save the DataFrame as CSV
    df_excel_1.to_csv(csv_path, index=False)
    print(f"Saved: {csv_path}")
"""

'for sheet in sheets_to_extract:\n    # Read just this sheet into a DataFrame\n    df_excel_1 = pd.read_excel(excel_1, sheet_name=sheet)\n    \n    # Optional: Clean the filename (replace spaces with underscores, etc.)\n    csv_name = sheet.replace(\' \', \'_\').lower() + \'.csv\'\n    csv_path = os.path.join(output_dir, csv_name)\n    \n    # Save the DataFrame as CSV\n    df_excel_1.to_csv(csv_path, index=False)\n    print(f"Saved: {csv_path}")\n'

In [1143]:
xls_2 = pd.ExcelFile('../data/Raw/undesa_pd_2019_world_fertility_dataset.xlsx')
print(xls_2.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'FERTILITY INDICATORS']


In [1144]:
excel_2 = '../data/Raw/undesa_pd_2019_world_fertility_dataset.xlsx'
sheet_name = 'FERTILITY INDICATORS'
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)

df_excel_2 = pd.read_excel(excel_2, sheet_name=sheet_name)


In [1145]:
"""csv_name = sheet_name.replace(' ', '_').lower() + '.csv'
csv_path = os.path.join(output_dir, csv_name)
df_excel_2.to_csv(csv_path, index=False)
print(f"Saved: {csv_path}")
"""

'csv_name = sheet_name.replace(\' \', \'_\').lower() + \'.csv\'\ncsv_path = os.path.join(output_dir, csv_name)\ndf_excel_2.to_csv(csv_path, index=False)\nprint(f"Saved: {csv_path}")\n'

In [1146]:
xls_3 = pd.ExcelFile('../data/Raw/undesa_pd_ds_1970-2030_fp_rev-2024_rev.xlsx')
print(xls_3.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'Countries', 'Regions']


In [1147]:
excel_3 = '../data/Raw/undesa_pd_ds_1970-2030_fp_rev-2024_rev.xlsx'
sheets_to_extract = ['Countries', 'Regions']
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)


In [1148]:
"""
for sheet in sheets_to_extract:
    df = pd.read_excel(excel_3, sheet_name=sheet)
    csv_name = sheet.replace(' ', '_').lower() + '.csv'
    csv_path = os.path.join(output_dir, csv_name)
    df.to_csv(csv_path, index=False)
    print(f"Saved: {csv_path}")

"""

'\nfor sheet in sheets_to_extract:\n    df = pd.read_excel(excel_3, sheet_name=sheet)\n    csv_name = sheet.replace(\' \', \'_\').lower() + \'.csv\'\n    csv_path = os.path.join(output_dir, csv_name)\n    df.to_csv(csv_path, index=False)\n    print(f"Saved: {csv_path}")\n\n'

In [1149]:
df_9 = pd.read_csv('../data/Raw/unpopulation_dataportal_20250728095844.csv')
df_9.sample(5)

Unnamed: 0,IndicatorId,IndicatorName,IndicatorShortName,Source,SourceYear,Author,LocationId,Location,Iso2,Iso3,...,AgeStart,AgeEnd,Age,CategoryId,Category,EstimateTypeId,EstimateType,EstimateMethodId,EstimateMethod,Value
6680,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,231,Ethiopia,ET,ETH,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,64.25
18326,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,634,Qatar,QA,QAT,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,63.08
3261,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,112,Belarus,BY,BLR,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,67.2
12791,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,438,Liechtenstein,LI,LIE,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,57.7
5726,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,203,Czechia,CZ,CZE,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,73.88


In [1150]:
df_9.columns = df_9.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_9.sample(5)

Unnamed: 0,indicatorid,indicatorname,indicatorshortname,source,sourceyear,author,locationid,location,iso2,iso3,...,agestart,ageend,age,categoryid,category,estimatetypeid,estimatetype,estimatemethodid,estimatemethod,value
8285,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,275,State of Palestine,PS,PSE,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,3,Projection,62.1
3160,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,108,Burundi,BI,BDI,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,62.77
21408,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,740,Suriname,SR,SUR,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,54.58
17048,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,586,Pakistan,PK,PAK,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,72.9
22027,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,762,Tajikistan,TJ,TJK,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,65.76


In [1151]:
df_9 = df_9.drop(columns=['indicatorid','indicatorshortname',
    'source',
    'author', 'locationid', 'iso2','estimatetypeid','category','categoryid','agestart','ageend','author','ageid', 'estimatetype','variantid','sexid','timeid'])

df_9.rename(columns={
    "sourceyear": "year",
    "location": "country",
    "estimatemethodid": "estimate_method",
    "iso3": "code",
}, inplace=True)



In [1152]:
df_9.drop_duplicates(inplace=True)
df_9.dropna(inplace = True)

df_9

Unnamed: 0,indicatorname,year,country,code,time,variant,sex,age,estimate_method,estimatemethod,value
0,Currently married (Percent),2024,Afghanistan,AFG,1970,Median,Female,15-49,2,Interpolation,80.94
2,Currently married (Percent),2024,Afghanistan,AFG,1971,Median,Female,15-49,2,Interpolation,80.90
4,Currently married (Percent),2024,Afghanistan,AFG,1972,Median,Female,15-49,2,Interpolation,80.87
6,Currently married (Percent),2024,Afghanistan,AFG,1973,Median,Female,15-49,2,Interpolation,80.84
8,Currently married (Percent),2024,Afghanistan,AFG,1974,Median,Female,15-49,2,Interpolation,80.53
...,...,...,...,...,...,...,...,...,...,...,...
25078,Currently married (Percent),2024,Zambia,ZMB,2021,Median,Female,15-49,3,Projection,54.31
25080,Currently married (Percent),2024,Zambia,ZMB,2022,Median,Female,15-49,3,Projection,53.82
25082,Currently married (Percent),2024,Zambia,ZMB,2023,Median,Female,15-49,3,Projection,53.35
25084,Currently married (Percent),2024,Zambia,ZMB,2024,Median,Female,15-49,3,Projection,52.91


In [1153]:
df_info = pd.DataFrame({
    'datatypes': df_9.dtypes,
    'null_count': df_9.isnull().sum(),
    'unique_count': df_9.nunique()
})
print(df_info)

                datatypes  null_count  unique_count
indicatorname      object           0             1
year                int64           0             1
country            object           0           224
code               object           0           224
time                int64           0            56
variant            object           0             1
sex                object           0             1
age                object           0             1
estimate_method     int64           0             2
estimatemethod     object           0             2
value             float64           0          3867


In [1154]:
#df_9.to_csv("cleaned_unpopulation_dataportal.csv", index=False)

In [1155]:
#df_9.to_sql('unpopulation_dataportal', engine, if_exists='replace', index=False)

In [1156]:
df_10 = pd.read_csv('../data/processed/countries_un.csv',  header=5, low_memory=False)

In [1157]:
df_10.columns = (
    df_10.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
)
df_10.sample(10)

Unnamed: 0,countryorarea,isocode,indicator,year,agegroup,percentage,number,dataprocess
59759,Indonesia,360,Married or in-union women,1987,15-49,67.880857,29297.111813,Estimate
51212,Greenland,304,Married or in-union women,1972,35-39,79.684249,0.947047,Estimate
86011,Montserrat,500,Married or in-union women,2029,30-34,51.871442,0.053946,Projection
7507,Bahrain,48,Married or in-union women,2017,30-34,70.341488,41.789878,Estimate
32680,Cyprus,196,Married or in-union women,2005,15-19,1.841886,0.755938,Estimate
145368,Zambia,894,Married or in-union women,1997,15-19,23.1,124.794285,Estimate
138657,United Kingdom,826,Married or in-union women,2049,20-24,20.159397,416.584169,Projection
120989,South Africa,710,Married or in-union women,2027,40-44,55.510038,1421.02449,Projection
116366,Seychelles,690,Married or in-union women,2016,45-49,60.313999,2.198144,Estimate
82540,Mauritius,480,Married or in-union women,2000,35-39,83.78,43.440768,Estimate


In [1158]:
df_10.rename(columns={
    "dataprocess": "data_process",
    "countryorarea": "country"
}, inplace=True)

df_10.drop_duplicates(inplace=True)
df_10.sample(5)

Unnamed: 0,country,isocode,indicator,year,agegroup,percentage,number,data_process
9948,Belgium,56,Married or in-union women,1998,35-39,80.613659,320.203095,Estimate
53673,Guam,316,Married or in-union women,2037,20-24,10.412407,0.715645,Projection
12769,Bosnia and Herzegovina,70,Married or in-union women,2027,20-24,33.49258,26.256676,Projection
19674,Cambodia,116,Married or in-union women,1999,25-29,80.93125,366.608851,Estimate
107713,Romania,642,Married or in-union women,1988,20-24,53.493235,432.442256,Estimate


In [1159]:
for col in ['percentage', 'number']:
    if col in df_10.columns:
        df_10[col] = (
            df_10[col]
            .astype(str)
            .str.replace(',', '.', regex=False)
            .str.extract(r'([-+]?[0-9]*\.?[0-9]+)', expand=False)
            .astype(float)
            .round(2)
        )

In [1160]:
unnamed_cols = [col for col in df_10.columns if 'unnamed' in col.lower()]
df_10.drop(columns=unnamed_cols, inplace=True)

In [1161]:
df_10.dropna(inplace=True)

In [1162]:
df_info = pd.DataFrame({
    'datatypes': df_10.dtypes,
    'null_count': df_10.isnull().sum(),
    'unique_count': df_10.nunique()
})
print(df_info)

             datatypes  null_count  unique_count
country         object           0           225
isocode          int64           0           225
indicator       object           0             1
year             int64           0            81
agegroup        object           0             8
percentage     float64           0          9667
number         float64           0         65394
data_process    object           0             2


In [1163]:
df_10.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145800 entries, 0 to 145799
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   country       145800 non-null  object 
 1   isocode       145800 non-null  int64  
 2   indicator     145800 non-null  object 
 3   year          145800 non-null  int64  
 4   agegroup      145800 non-null  object 
 5   percentage    145800 non-null  float64
 6   number        145800 non-null  float64
 7   data_process  145800 non-null  object 
dtypes: float64(2), int64(2), object(4)
memory usage: 8.9+ MB


In [None]:
#df_10.to_csv("../data/Cleaned/cleaned_countries_1970_2025_un.csv", index=False)

In [None]:
#df_10.to_sql('countries_1970_2025_un', engine, if_exists='replace', index=False)

800

In [1166]:
df_11 = pd.read_csv('../data/processed/currently_married_un.csv',  header=2, low_memory=False)

In [1167]:
df_11.sample(8)

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,AgeGroup,AgeStart,AgeEnd,DataValue,DataProcess,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Data,Note on Country and Population
23451,Ireland,372,1981,1981,Women,[20-24],20,24,32.26,Census,1981 Census,73,Ireland 1981 Census,UNSD,,,
39098,Republic of Korea,410,1975,1975,Women,[75+],75,999,12.34,Census,1975 Census,1276,Republic of Korea 1975 Census,UNSD,,,
10969,Czechia,203,2015,2015,Men,[40-44],40,44,58.12,Estimate,2015 Estimate,2079,Czechia 2015 Estimate,UNSD,,,
24180,Israel,376,1995,1995,Men,[70-74],70,74,83.17,Census,1995 Census,1504,Israel 1995 Census,UNSD,,,"Including Israeli citizens, permanent resident..."
41881,San Marino,674,1995,1995,Women,[10-14],10,14,0.0,Estimate,1995 Estimate,2208,San Marino 1995 Estimate,UNSD,,,
53518,Zimbabwe,716,2005,2006,Men,[30-34],30,34,85.2,Survey,2005-2006 DHS,1698,Zimbabwe 2005-2006 Demographic and Health Survey,DHS_HH,1.0,,
4088,Bolivia (Plurinational State of),68,2001,2001,Women,[20-24],20,24,46.07,Census,2001 Census,1511,Bolivia 2001 Census,UNSD,1.0,,
1651,Austria,40,1972,1972,Women,[70-74],70,74,29.69,Estimate,1972 Estimate,2038,Austria 1972 Estimate,UNSD,,,


In [1168]:
df_11.columns = (
    df_11.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
    )
df_11.sample(8)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,agegroup,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteondata,noteoncountryandpopulation
47231,Sweden,752,2007,2007,Men,[25-29],25,29,11.98,Estimate,2007 Estimate,2227,Sweden 2007 Estimate,UNSD,,,
15077,Finland,246,2006,2006,Women,[20-24],20,24,8.66,Estimate,2006 Estimate,2093,Finland 2006 Estimate,UNSD,1.0,,
11642,Denmark,208,2003,2003,Women,[15-19],15,19,0.39,Estimate,2003 Estimate,2081,Denmark 2003 Estimate,UNSD,,,Excluding Faeroe Islands and Greenland shown s...
39721,Romania,642,1995,1995,Men,[10-14],10,14,0.0,Estimate,1995 Estimate,2199,Romania 1995 Estimate,UNSD,,,
3814,Bermuda,60,2000,2000,Men,[45-49],45,49,65.68,Census,2000 Census,2293,Bermuda 2000 Census,UNSD,,,Excluding the institutional population.
1171,Australia,36,1986,1986,Men,[55-59],55,59,80.41,Census,1986 Census,856,Australia 1986 Census,UNSD,,,
52977,Yemen,887,1994,1994,Women,[75+],75,999,30.15,Census,1994 Census,1134,Yemen 1994 Census,UNSD,,,
14449,Finland,246,1978,1978,Women,[60-64],60,64,53.57,Estimate,1978 Estimate,2093,Finland 1978 Estimate,UNSD,,,


In [1169]:
df_11 = df_11.drop(columns = ['datacataloglongname', 'datacatalogid', 'yearstart' , 'yearend', 'noteondata', 'noteoncountryandpopulation', 'including_consensual_unions'])

df_11.rename(columns={
    "agestart": "age_start",
    "countryorarea": "country",
    "datasource": "data_source",
    "datavalue" : "data_value"
}, inplace=True)

df_11.sample(10)

Unnamed: 0,country,isocode,sex,agegroup,age_start,ageend,data_value,dataprocess,datacatalogshortname,data_source
21965,Iceland,352,Men,[35-39],35,39,69.69,Estimate,2008 Estimate,UNSD
30051,Malawi,454,Women,[35-39],35,39,79.37,Survey,2015-2016 DHS,DHS_HH
35892,Norway,578,Men,[55-59],55,59,79.49,Estimate,1986 Estimate,UNSD
1084,Australia,36,Men,[15-19],15,19,1.41,Census,1971 Census,UNSD
18566,Grenada,308,Men,[35-39],35,39,40.13,Census,1991 Census,US Census Bureau
16239,France,250,Women,[35-39],35,39,62.48,Estimate,2001 Estimate,UNSD
3168,Belgium,56,Men,[45-49],45,49,73.18,Estimate,1999 Estimate,UNSD
12877,Egypt,818,Women,[45-49],45,49,80.7,Survey,1992 DHS,DHS_STATcompiler
25562,Japan,392,Men,[75+],75,999,79.55,Census,2015 Census,UNSD
41811,San Marino,674,Men,[10-14],10,14,0.0,Estimate,1993 Estimate,UNSD


In [1170]:
df_11.drop_duplicates(inplace=True)
df_11.dropna(inplace=True)

In [1171]:
df_info = pd.DataFrame({
    'datatypes': df_11.dtypes,
    'null_count': df_11.isnull().sum(),
    'unique_count': df_11.nunique()
})
print(df_info)

                     datatypes  null_count  unique_count
country                 object           0           233
isocode                  int64           0           230
sex                     object           0             2
agegroup                object           0            23
age_start                int64           0            17
ageend                   int64           0            15
data_value             float64           0          9213
dataprocess             object           0             6
datacatalogshortname    object           0           412
data_source             object           0            15


In [1172]:
#df_11.to_csv("cleaned_currently_married_un.csv", index=False)

In [1173]:
#df_11.to_sql('currently_married_un', engine, if_exists='replace', index=False)

In [1174]:
df_12 = pd.read_csv('../data/processed/ever_married_un.csv', header= 2, low_memory = False)
df_12.head()

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,AgeGroup,AgeStart,AgeEnd,DataValue,DataProcess,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Data,Note on Country and Population
0,Afghanistan,4,1972,1974,Men,[15-19],15,19,7.7,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
1,Afghanistan,4,1972,1974,Men,[20-24],20,24,32.6,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
2,Afghanistan,4,1972,1974,Men,[25-29],25,29,61.4,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
3,Afghanistan,4,1972,1974,Men,[30-34],30,34,83.0,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
4,Afghanistan,4,1972,1974,Men,[35-39],35,39,91.2,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,


In [1175]:
df_12.columns = (
    df_12.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
    )
df_12.sample(8)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,agegroup,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteondata,noteoncountryandpopulation
22352,Hungary,348,2014,2014,Men,[55-59],55,59,88.22,Estimate,2014 Estimate,2120,Hungary 2014 Estimate,UNSD,1.0,,
31559,Malawi,454,2000,2000,Men,[50-54],50,54,99.1,Survey,2000 DHS,1833,Malawi 2000 Demographic and Health Survey,DHS_STATcompiler,1.0,,
6692,Canada,124,1987,1987,Women,[50-54],50,54,94.45,Estimate,1987 Estimate,2061,Canada 1987 Estimate,UNSD,,,
2603,Bahrain,48,2001,2001,Women,[55-59],55,59,97.44,Census,2001 Census,317,Bahrain 2001 Census,UNSD,,,
14270,Eritrea,232,1995,1996,Men,[55-59],55,59,100.0,Survey,1995 DHS,1835,Eritrea 1995 Demographic and Health Survey,DHS_STATcompiler,1.0,,
22453,Hungary,348,2017,2017,Women,[70-74],70,74,96.64,Estimate,2017 Estimate,2120,Hungary 2017 Estimate,UNSD,1.0,,
8275,Chile,152,2002,2002,Men,[20-24],20,24,18.1,Census,2002 Census,1037,Chile 2002 Census,UNSD,1.0,,
12200,Denmark,208,1984,1984,Women,[35-39],35,39,91.56,Estimate,1984 Estimate,2081,Denmark 1984 Estimate,UNSD,,,Excluding Faeroe Islands and Greenland shown s...


In [1176]:
df_12 = df_12.drop(columns = ['yearstart', 'yearend', 'datacatalogshortname', 'datacatalogid', 'datacataloglongname', 'including_consensual_unions', 'noteondata', 'noteoncountryandpopulation'])

df_12.rename(columns={
    "agestart": "age_start",
    "ageend": "age_end",
    "countryorarea": "country"
}, inplace=True)
df_12.sample(8)

Unnamed: 0,country,isocode,sex,agegroup,age_start,age_end,datavalue,dataprocess,datasource
29784,Liberia,430,Men,[15-19],15,19,2.6,Survey,DHS_HH
14319,Eritrea,232,Women,[40-44],40,44,98.71,Survey,DHS_HH
19948,Guatemala,320,Men,[25-29],25,29,75.06,Census,UNSD
17797,Georgia,268,Women,[35-39],35,39,91.09,Survey,RHS
45819,Slovakia,703,Women,[45-49],45,49,91.92,Estimate,UNSD
49513,Sweden,752,Women,[75+],75,999,94.56,Estimate,UNSD
52616,Uganda,800,Women,[25-29],25,29,94.4,Survey,DHS_STATcompiler
54748,Viet Nam,704,Women,[45-49],45,49,93.9,Survey,National statistics


In [1177]:
df_12.dropna(inplace=True)

df_info = pd.DataFrame({
    'datatypes': df_12.dtypes,
    'null_count': df_12.isnull().sum(),
    'unique_count': df_12.nunique()
})
print(df_info)

            datatypes  null_count  unique_count
country        object           0           233
isocode         int64           0           230
sex            object           0             2
agegroup       object           0            23
age_start       int64           0            17
age_end         int64           0            15
datavalue     float64           0          8396
dataprocess    object           0             6
datasource     object           0            15


In [1178]:
#df_12.to_csv("cleaned_ever_married_un.csv", index=False)

In [1179]:
#df_12.to_sql('ever_married_un', engine, if_exists= 'replace', index= False)

In [1180]:
df_13 = pd.read_csv('../data/processed/fertility_indicators_un.csv', header=6, low_memory=False)
df_13.head()

Unnamed: 0,Country or Area,Country or Area Code,Age Group,Indicator,Date,Value,Series,DataType,Data Source Type,Survey Programme,Data Source Inventory ID,Data Source Name,Data Source Name (short),Data Source Start Year,Data Source End Year,Reference,Reference Year
0,Afghanistan,4,[Total],TFR,1964.977051,7.966653,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
1,Afghanistan,4,[Total],TFR,1965.977051,8.212275,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
2,Afghanistan,4,[Total],TFR,1966.977051,8.317603,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
3,Afghanistan,4,[Total],TFR,1967.977051,8.225812,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
4,Afghanistan,4,[Total],TFR,1968.977051,8.068459,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012


In [1181]:
df_13.columns = (df_13.columns
        .str.lower()
        .str.strip()
        .str.replace(' ', '')
        .str.replace('(', '')
        .str.replace(')', '')
        .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
        )

df_13.sample(6)

Unnamed: 0,countryorarea,countryorareacode,agegroup,indicator,date,value,series,datatype,datasourcetype,surveyprogramme,datasourceinventoryid,datasourcename,datasourcenameshort,datasourcestartyear,datasourceendyear,reference,referenceyear
45804,Malta,470,[Total],TFR,1976.5,2.2,"Estimates,Direct,DYB,2154-16-53",Direct,Estimate,Estimate,2154,All sources of estimates,Estimates,1976,1976,Demographic Yearbook,2000
36991,Israel,376,[Total],TFR,2017.50137,3.1115,"Register, Direct, Statistical Abstract of Isra...",Direct,Register,VR,587,Vital Registration,Register,2017,2017,Vital Statistics - Statistical Abstract of Isr...,2018
5710,Bangladesh,50,[20-24],ASFR2024,1987.749144,280.0109,"2001 DHS Special ,Birth Histories,FBH analysis...",Birth histories,Survey,DHS-NS,5047,Bangladesh 2001 Maternal Health Services and M...,2001 DHS Special,2001,2001,Fertility rates from full birth histories anal...,2018
54493,Pakistan,586,[20-24],ASFR2024,1999.5,205.6,"Annual PDS,Recent births,Report,13",Recent births,Survey,Annual HH survey,514,Pakistan 1999 Demographic Survey,Annual PDS,1999,1999,"PBS. Social Statistics, Compendium of gender s...",2004
5856,Bangladesh,50,[Total],MAC,1994.249268,26.3012,"2011 DHS,Birth Histories (Extrapolated),DHS,50...",Extrapolated from Truncated Birth Histories,Survey,DHS,5048,Bangladesh 2011 Demographic and Health Survey,2011 DHS,2011,2011,DHS Statcompiler,2012
65860,Spain,724,[20-24],ASFR2024,2004.5,30.3,Eurostat.20190531,Official estimates,Estimate,Estimate,2222,All sources of estimates,Estimates,2004,2004,"Eurostat Statistics, Fertility rates by age [d...",2019


In [1182]:
df_13 = df_13.drop(columns=['countryorareacode','indicator','datasourceinventoryid','surveyprogramme','series','datasourcename','reference','referenceyear'])

df_13 = df_13.rename(columns={
    "agegroup": "age_group",
    "countryorarea": "country",
    "datatype": "data_type"
})

In [1183]:
df_13.head()

Unnamed: 0,country,age_group,date,value,data_type,datasourcetype,datasourcenameshort,datasourcestartyear,datasourceendyear
0,Afghanistan,[Total],1964.977051,7.966653,Reverse survival method,Census,1979 Census,1979,1979
1,Afghanistan,[Total],1965.977051,8.212275,Reverse survival method,Census,1979 Census,1979,1979
2,Afghanistan,[Total],1966.977051,8.317603,Reverse survival method,Census,1979 Census,1979,1979
3,Afghanistan,[Total],1967.977051,8.225812,Reverse survival method,Census,1979 Census,1979,1979
4,Afghanistan,[Total],1968.977051,8.068459,Reverse survival method,Census,1979 Census,1979,1979


In [1184]:
df_13['date'] = df_13['date'].astype(int)
df_13['value'] = df_13['value'].round(2)
df_13.sample(12)

Unnamed: 0,country,age_group,date,value,data_type,datasourcetype,datasourcenameshort,datasourcestartyear,datasourceendyear
70628,Togo,[15-19],1987,131.0,Direct,Survey,1988 DHS,1988,1988.0
2037,Argentina,[35-39],2007,58.05,Direct,Register,Register,2007,2007.0
9318,Bosnia and Herzegovina,[15-19],2006,16.4,Official estimates,Estimate,Estimates,2006,2006.0
30815,Guinea,[Total],1992,5.64,P/F Ratio method (Feeney),Census,1996 Census,1996,1996.0
17689,Côte d'Ivoire,[15-19],1982,193.0,Direct,Survey,1994 DHS,1994,1994.0
41106,Lao People's Dem. Republic,[15-19],2001,100.3,Own-children method,Survey,2003 WHS,2003,2003.0
45181,Maldives,[45-49],2012,1.08,Computed rate from DYB,Register,Register,2012,2012.0
8554,Bolivia (Plurinational State of),[15-19],1976,123.0,Direct,Survey,1989 DHS,1989,1989.0
6249,Bangladesh,[40-44],2011,9.0,Computed rate from reported ASFR,SRS,SVRS,1980,
14375,China,[Total],1990,26.26,Fertility data (adjusted),Estimate,Estimates,1990,1990.0


In [1185]:
df_13.drop_duplicates(inplace=True)
df_13.dropna(inplace=True)

df_info = pd.DataFrame({
    'datatypes': df_13.dtypes,
    'null_count': df_13.isnull().sum(),
    'unique_count': df_13.nunique()
})
print(df_info)

                    datatypes  null_count  unique_count
country                object           0           201
age_group              object           0             8
date                    int32           0            69
value                 float64           0         18752
data_type              object           0            30
datasourcetype         object           0             7
datasourcenameshort    object           0           539
datasourcestartyear     int64           0            69
datasourceendyear      object           0            70


In [1186]:
#df_13.to_csv("../data/Cleaned/cleaned_fertility_indicators_un.csv", index=False)

In [1187]:
#df_13.to_sql('fertility_indicators_un',engine, if_exists='replace', index=False)

In [1188]:
df_14 = pd.read_csv('../data/processed/marital_status_by_age_un.csv', header= 2, low_memory=False)
df_14.head()

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,MaritalStatus,Non-standard_AgeGroups,Series_contains_Non-standard_AgeGroups,AgeGroup,AgeStart,...,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Age groups,Note on Marital Status,Note on Data,Note on Country and Population,Note Other
0,Afghanistan,4,1972,1974,Men,Divorced,,,[15-19],15,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
1,Afghanistan,4,1972,1974,Men,Divorced,,,[20-24],20,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
2,Afghanistan,4,1972,1974,Men,Divorced,,,[25-29],25,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
3,Afghanistan,4,1972,1974,Men,Divorced,,,[30-34],30,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
4,Afghanistan,4,1972,1974,Men,Divorced,,,[35-39],35,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,


In [1189]:
df_14.columns= (df_14.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '' , regex=True)  
    )
df_14.sample(5)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,maritalstatus,nonstandard_agegroups,series_contains_nonstandard_agegroups,agegroup,agestart,...,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteonagegroups,noteonmaritalstatus,noteondata,noteoncountryandpopulation,noteother
214908,Saudi Arabia,682,2017,2017,Women,Single,,,[50-54],50,...,2017 Estimate,2210,Saudi Arabia 2017 Estimate,UNSD,,,,,,
99410,Guinea,324,2012,2012,Women,Divorced,,,[45-49],45,...,2012 DHS,5422,Guinea 2012 Demographic and Health Survey,DHS_STATcompiler,,,,,,
199225,Qatar,634,2010,2010,Women,Single,1.0,1.0,[65-74],65,...,2010 Census,4796,Qatar 2010 Census,UNSD,,,,,,
26313,Burkina Faso,854,1992,1993,Men,Divorced,,,[15-19],15,...,1993 DHS,1776,Burkina Faso 1993 Demographic and Health Survey,DHS_STATcompiler,,,,,,
86310,Gabon,266,2012,2012,Men,Widowed,,,[35-39],35,...,2012 DHS,5054,Gabon 2012 Demographic and Health Survey,DHS_STATcompiler,,,,,,


In [1190]:
df_14 = df_14.drop(columns=['datacataloglongname', 'noteondata', 'noteoncountryandpopulation','noteonagegroups', 'noteother',
                             'including_consensual_unions','isocode', 'datacatalogid', 'noteonmaritalstatus', 'series_contains_nonstandard_agegroups','nonstandard_agegroups'])

df_14.rename(columns={
    "countryorarea": "country",
    "agegroup": "age_group",
    "maritalstatus": "marital_status",
    "yearstart": "year_start",
    "yearend": "year_end",
    }, inplace =True
    )

df_14.sample(10)

Unnamed: 0,country,year_start,year_end,sex,marital_status,age_group,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datasource
150194,Madagascar,1992,1992,Women,Living together,[45-49],45,49,9.2,Survey,1992 DHS,DHS_STATcompiler
244598,Timor-Leste,2015,2015,Men,Single,[35-39],35,39,12.97,Census,2015 Census,National statistics
152627,Maldives,1985,1985,Women,Single,[55-59],55,59,0.62,Census,1985 Census,UNSD
255480,United Kingdom,1993,1993,Men,Divorced,[45-49],45,49,11.26,Estimate,1993 Estimate,UNSD
131591,Jordan,2002,2002,Women,Never married,[25-29],25,29,34.8,Survey,2002 DHS,DHS_HH
260760,United States of America,1993,1993,Women,Single,[10-14],10,14,100.0,Estimate,1993 Estimate,UNSD
83237,France,1986,1986,Women,Widowed,[45-49],45,49,4.79,Estimate,1986 Estimate,UNSD
70178,El Salvador,1971,1971,Men,Married,[10-14],10,14,0.0,Census,1971 Census,UNSD
177364,Nicaragua,2011,2012,Women,Single,[30-34],30,34,7.9,Survey,2011-2012 NDHS,National statistics
148627,Lithuania,2017,2017,Men,Single,[35-39],35,39,37.19,Estimate,2017 Estimate,UNSD


In [1191]:
df_14.drop_duplicates(inplace=True)
df_14.dropna(inplace=True)

df_info = pd.DataFrame({
    'datatypes': df_14.dtypes,
    'null_count': df_14.isnull().sum(),
    'unique_count': df_14.nunique()
})
print(df_info)

                     datatypes  null_count  unique_count
country                 object           0           235
year_start               int64           0            62
year_end                 int64           0            60
sex                     object           0             2
marital_status          object           0            35
age_group               object           0            63
agestart                 int64           0            21
ageend                   int64           0            20
datavalue              float64           0          9994
dataprocess             object           0             6
datacatalogshortname    object           0           443
datasource              object           0            15


In [1192]:
#df_14.to_csv("cleaned_marital_status_by_age_un.csv", index=False)

In [1193]:
#df_14.to_sql('marital_status_by_age_un', engine, if_exists='replace', index=False)

In [1194]:
df_15 = pd.read_csv('../data/processed/regions_un.csv', header=5, low_memory= False)
df_15.head(10)

Unnamed: 0,Region and subregion,ISO code,Regional Classification,Indicator,Year,AgeGroup,Percentage,Number,DataProcess
0,World,900,M49,Married or in-union women,1970,15-19,22.576683,71867.82,Estimate
1,World,900,M49,Married or in-union women,1970,20-24,63.802057,162860.4,Estimate
2,World,900,M49,Married or in-union women,1970,25-29,87.174827,182681.1,Estimate
3,World,900,M49,Married or in-union women,1970,30-34,90.825027,179121.4,Estimate
4,World,900,M49,Married or in-union women,1970,35-39,90.284386,161526.3,Estimate
5,World,900,M49,Married or in-union women,1970,40-44,86.483531,139334.4,Estimate
6,World,900,M49,Married or in-union women,1970,45-49,82.680237,116088.4,Estimate
7,World,900,M49,Married or in-union women,1970,15-49,69.379111,1013480.0,Estimate
8,World,900,M49,Married or in-union women,1971,15-19,22.630416,74127.62,Estimate
9,World,900,M49,Married or in-union women,1971,20-24,63.613178,170087.3,Estimate


In [1195]:
df_15.columns = (df_15.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(','')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
    )
df_15.sample(6)

Unnamed: 0,regionandsubregion,isocode,regionalclassification,indicator,year,agegroup,percentage,number,dataprocess
4493,Europe and Northern America,513,SDG,Married or in-union women,2045,40-44,66.727454,21723.164825,Projection
15163,Western Europe,926,M49,Married or in-union women,2002,30-34,75.086068,5126.090213,Estimate
1395,Northern Africa and Western Asia,747,SDG,Married or in-union women,1982,30-34,87.516762,12370.737541,Estimate
15635,Latin America and the Caribbean,904,SDG-M49,Married or in-union women,1980,30-34,78.515298,18102.041221,Estimate
5866,Middle Africa,911,M49,Married or in-union women,1974,25-29,83.983135,2754.41001,Estimate
25724,Low-income countries,1500,Income group,Married or in-union women,2026,35-39,84.360301,39345.863662,Projection


In [1196]:
df_15 = df_15.drop(columns=['regionalclassification'])

df_15.rename(columns={
    "regionandsubregion": "region",
    "isocode": "iso_code",
    "agegroup": "age_group",
    "dataprocess": "process"
}, inplace=True)

df_15.sample(10)

Unnamed: 0,region,iso_code,indicator,year,age_group,percentage,number,process
12559,Europe,908,Married or in-union women,2000,15-49,61.203184,112782.456458,Estimate
22612,Polynesia,957,Married or in-union women,2042,35-39,81.569428,35.149898,Projection
5822,Eastern Africa,910,Married or in-union women,2049,45-49,72.643307,32179.976769,Projection
800,Sub-Saharan Africa,202,Married or in-union women,1989,15-19,32.00636,16156.367642,Estimate
9773,Eastern Asia,906,Married or in-union women,1976,40-44,94.973211,49985.49937,Estimate
27943,No income group available,1518,Married or in-union women,1979,15-49,54.067038,3887.408154,Estimate
7218,Southern Africa,913,Married or in-union women,1981,25-29,59.878049,1556.705316,Estimate
21584,Micronesia,954,Married or in-union women,1995,15-19,10.726049,4.607482,Estimate
3408,Oceania excluding Australia and New Zealand,543,Married or in-union women,1991,15-19,14.152772,90.976848,Estimate
27083,Upper-middle-income countries,1502,Married or in-union women,2034,30-34,70.86448,104911.745664,Projection


In [1197]:
print(df_15['number'] % 1 != 0)

0        True
1        True
2        True
3        True
4        True
         ... 
28507    True
28508    True
28509    True
28510    True
28511    True
Name: number, Length: 28512, dtype: bool


In [1198]:
df_15['percentage'] = df_15['percentage'].round(2)
df_15['number'] = df_15['number'].astype(int)
df_15.head(10)

Unnamed: 0,region,iso_code,indicator,year,age_group,percentage,number,process
0,World,900,Married or in-union women,1970,15-19,22.58,71867,Estimate
1,World,900,Married or in-union women,1970,20-24,63.8,162860,Estimate
2,World,900,Married or in-union women,1970,25-29,87.17,182681,Estimate
3,World,900,Married or in-union women,1970,30-34,90.83,179121,Estimate
4,World,900,Married or in-union women,1970,35-39,90.28,161526,Estimate
5,World,900,Married or in-union women,1970,40-44,86.48,139334,Estimate
6,World,900,Married or in-union women,1970,45-49,82.68,116088,Estimate
7,World,900,Married or in-union women,1970,15-49,69.38,1013479,Estimate
8,World,900,Married or in-union women,1971,15-19,22.63,74127,Estimate
9,World,900,Married or in-union women,1971,20-24,63.61,170087,Estimate


In [1199]:
df_15.dropna(inplace=True)
df_15.dropna(inplace=True)

df_info = pd.DataFrame({
    'datatypes': df_15.dtypes,
    'null_count': df_15.isnull().sum(),
    'unique_count': df_15.nunique()
})
print(df_info)

           datatypes  null_count  unique_count
region        object           0            43
iso_code       int64           0            44
indicator     object           0             1
year           int64           0            81
age_group     object           0             8
percentage   float64           0          7796
number         int32           0         20311
process       object           0             2


In [1200]:
#df_15.to_csv('cleaned_regions_un.csv', index=False)



In [1201]:
#df_15.to_sql('regions_un', engine, if_exists='replace',index=False)

In [1202]:
df_16_1 = pd.read_csv('../data/Raw/OECD/SF_1_1_Family_size_and_composition - Sayfa1.csv')
df_16_1
#Data for Chart SF1.1.A. Average size of households by household type, 2024a
# avg_size_all	avg_size_couple_with_children	avg_size_single_parent_with_children		

Unnamed: 0,Country,All households,Couple households with children,Single parent households with children
0,Mexico,356,408.0,276.0
1,Costa Rica,346,437.0,344.0
2,Türkiye,320,410.0,280.0
3,Israel,319,465.0,286.0
4,Columbia,310,,
5,Slovak Republic,310,380.0,250.0
6,Chile,280,,
7,Iceland,270,412.0,261.0
8,New Zealand,261,388.0,267.0
9,Greece,260,380.0,250.0


In [1203]:
df_16_1.columns = df_16_1.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [1204]:
df_16_1.rename(columns={
        "All households": "avg_size_all",
        "Couple with children": "avg_size_couple_with_children",
        "Single parent with children": "avg_size_single_parent_with_children"
}, inplace=True)

In [1205]:
df_16_1.drop_duplicates(inplace=True)
df_16_1.dropna(inplace=True)

In [1206]:
for col in df_16_1.columns:
    if col != 'country':
        # Replace commas with dots if necessary, remove non-numeric chars, convert to float
        df_16_1[col] = (
            df_16_1[col]
            .astype(str)  # ensure string for replace
            .str.replace(',', '.', regex=False)  # decimal commas to dots
            .str.replace(r'[^\d\.\-]', '', regex=True)  # remove non-numeric chars except dot and minus
            .replace('', None)  # empty to NaN
            .astype(float)  # convert to float
        )

# Check updated dtypes
print(df_16_1.dtypes)

country                                    object
all_households                            float64
couple_households_with_children           float64
single_parent_households_with_children    float64
dtype: object


In [1207]:
info_16_1 = pd.DataFrame({
    'dtype': df_16_1.dtypes,
    'null_count': df_16_1.isnull().sum(),
    'unique_count': df_16_1.nunique()
})
print(info_16_1)

                                          dtype  null_count  unique_count
country                                  object           0            39
all_households                          float64           0            19
couple_households_with_children         float64           0            16
single_parent_households_with_children  float64           0            15


In [1208]:
df_16_1.sample(10)

Unnamed: 0,country,all_households,couple_households_with_children,single_parent_households_with_children
29,Korea,2.21,3.55,2.34
43,Estonia,1.8,3.8,2.6
31,Austria,2.2,3.8,2.5
16,Romania,2.5,3.8,2.4
14,Cyprus,2.5,3.7,2.4
18,Ireland,2.4,4.0,2.7
23,Hungary,2.3,3.7,2.4
33,Italy,2.2,3.6,2.4
7,Iceland,2.7,4.12,2.61
28,Switzerland,2.21,4.02,2.58


In [1209]:
#df_16_1.to_csv('../data/Cleaned/cleaned_average_size_of_households_type_2024_oecd.csv', index=False)

In [1210]:
#df_16_1.to_sql('average_size_of_households_type_2024_oecd', engine, if_exists = 'replace', index= False)

In [1211]:
df_16_2 = pd.read_csv('../data/Raw/OECD/SF_1_1_Family_size_and_composition - Sayfa2.csv', header=1)
df_16_2
#Table SF1.1.A. Types of household, 2021a
# share_couple_total	share_couple_with_children	share_couple_without_children	share_single_parent_total	share_single_mother	share_single_father	share_single_person	share_other_types						

Unnamed: 0,Country,Total,With children,Without children,Total.1,Single mother households,Single father households,Single person households,Other households types
0,Australia,5593,2990,2602,1037,,,2512,858
1,Austria,4893,2113,2780,563,478,085,3834,711
2,Belgium,5222,2398,2824,742,608,135,3550,486
3,Canada,5092,2530,2562,872,,,2935,1102
4,Chile,..,..,..,..,..,..,..,..
5,Columbia,..,..,..,..,..,..,..,..
6,Costa Rica,5244,3815,1429,1055,949,106,1127,2574
7,Czechia,4703,2170,2532,715,611,104,3915,667
8,Denmark,4860,2041,2819,631,511,119,3757,752
9,Estonia,4620,2546,2073,683,609,074,3699,998


In [1212]:
df_16_2.rename(columns={
    "Total": "couple_total(%)",
    "Couple with children": "couple_with_children(%)",
    "Couple without children": "couple_without_children(%)",
    "Total.1": "single_parent_total(%)",
    "Single mother households": "single_mother(%)",
    "Single father households": "single_father(%)",
    "Single person households": "single_person(%)",
    "Other types of households": "other_household_types(%)"
}, inplace=True)

In [1213]:
# Normalize column names (lowercase, underscores, remove () and non-ascii)
df_16_2.columns = (
    df_16_2.columns.str.strip().str.lower()
    .str.replace(' ', '_')
    .str.replace('[()%]', '', regex=True)
    .str.replace('[^0-9a-z_]', '', regex=True)
)

In [1214]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_16_2.columns if c != "country"]

df_16_2[num_cols] = (
    df_16_2[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [1215]:
df_16_2.drop_duplicates(inplace=True)
df_16_2.dropna(inplace=True)
df_16_2.dropna(how="all", subset=num_cols, inplace=True)

In [1216]:
df_16_2.rename(columns={
   "couple_total" : "couple_total(%)",
   "with_children" : "with_children(%)",
   "without_children" : "without_children(%)",
    "single_parent_total" : "single_parent_total(%)",
    "single_mother" : "single_mother(%)",
    "single_father" : "single_father(%)",
    "single_person" : "single_person(%)",
    "other_household_types" : "other_household_types(%)"
}, inplace=True)

In [1217]:
info_16_2 = pd.DataFrame({
    "dtype": df_16_2.dtypes,
    "null_count": df_16_2.isna().sum(),
    "unique_count": df_16_2.nunique()
})
print(info_16_2)
print(df_16_2.dtypes)

                          dtype  null_count  unique_count
country                  object           0            36
couple_total(%)         float64           0            36
with_children(%)        float64           0            35
without_children(%)     float64           0            36
single_parent_total(%)  float64           0            34
single_mother(%)        float64           0            32
single_father(%)        float64           0            31
single_person(%)        float64           0            35
other_households_types  float64           0            36
country                    object
couple_total(%)           float64
with_children(%)          float64
without_children(%)       float64
single_parent_total(%)    float64
single_mother(%)          float64
single_father(%)          float64
single_person(%)          float64
other_households_types    float64
dtype: object


In [1218]:
df_16_2.sample(10)

Unnamed: 0,country,couple_total(%),with_children(%),without_children(%),single_parent_total(%),single_mother(%),single_father(%),single_person(%),other_households_types
7,Czechia,47.03,21.7,25.32,7.15,6.11,1.04,39.15,6.67
15,Iceland,45.19,25.42,19.77,7.35,6.23,1.12,29.16,18.29
2,Belgium,52.22,23.98,28.24,7.42,6.08,1.35,35.5,4.86
13,Greece,52.14,24.03,28.11,4.66,3.82,0.84,32.35,10.85
10,Finland,45.64,17.06,28.58,5.43,4.5,0.93,45.34,3.6
6,Costa Rica,52.44,38.15,14.29,10.55,9.49,1.06,11.27,25.74
33,Sweden,49.27,22.49,26.78,6.67,4.91,1.76,39.24,4.82
39,Bulgaria,40.3,16.35,23.95,4.6,3.88,0.73,35.81,19.28
18,Italy,46.7,20.91,25.8,7.27,5.65,1.63,36.64,9.38
9,Estonia,46.2,25.46,20.73,6.83,6.09,0.74,36.99,9.98


In [1219]:
#df_16_2.to_csv('../data/Cleaned/cleaned_types_of_household_2021_oecd.csv', index = False)

In [1220]:
#df_16_2.to_sql('types_of_household_2021_oecd', engine, if_exists = 'replace', index= False)

In [1221]:
df_16_3 = pd.read_csv('../data/Raw/OECD/SF_1_1_Family_size_and_composition - Sayfa3.csv', header=1)
df_16_3
#Table SF1.1.B. Households by number of children, 2024
# share_hh_0_children	share_hh_1_child	share_hh_2_children	share_hh_3plus_children		

Unnamed: 0,country,0 children,1 child,2 children,3 or more children,Children under 6
0,Australia,..,..,..,..,..
1,Austria,7778,1052,857,312,944
2,Belgium,7397,1176,1015,411,1040
3,Canada,..,..,..,..,..
4,Chile,..,..,..,..,..
5,Columbia,..,..,..,..,..
6,Costa Rica,3029,2308,2461,2202,2630
7,Czechia,7195,1385,1156,264,1229
8,Denmark,7778,1054,894,274,815
9,Estonia,7576,1253,873,298,985


In [1222]:
# Normalize column names (lowercase, underscores, remove () and non-ascii)
df_16_3.columns = (
    df_16_3.columns.str.strip().str.lower()
    .str.replace(' ', '_')
    .str.replace('[^0-9a-z_]', '', regex=True)
)

In [1223]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_16_3.columns if c != "country"]

df_16_3[num_cols] = (
    df_16_3[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [1224]:
df_16_3.drop_duplicates(inplace=True)
df_16_3.dropna(inplace=True)

In [1225]:
df_16_3.sample(10)

Unnamed: 0,country,0_children,1_child,2_children,3_or_more_children,children_under_6
11,France,75.36,11.43,9.23,3.99,9.86
27,Norway,76.87,10.53,9.14,3.47,8.88
31,Slovenia,75.0,11.25,10.2,3.56,9.93
23,Luxembourg,73.0,12.49,12.07,2.41,11.54
41,Cyprus,71.36,13.88,11.67,3.1,12.71
29,Portugal,74.35,15.87,8.18,1.6,9.85
16,Ireland,69.02,12.42,12.18,6.38,11.81
30,Slovak Republic,64.41,17.09,14.49,4.02,15.56
9,Estonia,75.76,12.53,8.73,2.98,9.85
44,EU average,75.1,12.28,9.46,3.15,9.9


In [1226]:
df_16_3.rename(columns={
    "0_children": "households_0_children(%)",
    "1_child": "households_1_child(%)",
    "2_children": "households_2_children(%)",
    "3_or_more_children": "households_3_or_more_children(%)"
}, inplace=True)

In [1227]:
info_16_3 = pd.DataFrame({
    "dtype": df_16_3.dtypes,
    "null_count": df_16_3.isna().sum(),
    "unique_count": df_16_3.nunique()
})
print(info_16_3)
print(df_16_3.dtypes)

                                    dtype  null_count  unique_count
country                            object           0            33
households_0_children(%)          float64           0            32
households_1_child(%)             float64           0            32
households_2_children(%)          float64           0            33
households_3_or_more_children(%)  float64           0            31
children_under_6                  float64           0            31
country                              object
households_0_children(%)            float64
households_1_child(%)               float64
households_2_children(%)            float64
households_3_or_more_children(%)    float64
children_under_6                    float64
dtype: object


In [1228]:
df_16_3.sample(10)

Unnamed: 0,country,households_0_children(%),households_1_child(%),households_2_children(%),households_3_or_more_children(%),children_under_6
32,Spain,74.61,13.54,8.95,2.9,8.79
16,Ireland,69.02,12.42,12.18,6.38,11.81
42,Malta,76.49,12.68,7.81,2.98,9.61
21,Latvia,74.8,14.05,8.32,2.83,10.07
11,France,75.36,11.43,9.23,3.99,9.86
36,United Kingdom,72.06,12.1,11.31,4.53,12.73
9,Estonia,75.76,12.53,8.73,2.98,9.85
44,EU average,75.1,12.28,9.46,3.15,9.9
2,Belgium,73.97,11.76,10.15,4.11,10.4
10,Finland,81.98,7.89,6.99,3.14,7.14


In [1229]:
#df_16_3.to_csv('../data/Cleaned/cleaned_households_by_number_of_children_2024_oecd.csv', index=False)

In [1230]:
#df_16_3.to_sql('households_by_number_of_children_2024_oecd', engine, index= False)

In [1231]:
df_17_1 = pd.read_csv('../data/Raw/OECD/SF_2_1_Total_Fertility_rates_S1.csv')
#total_fertility_rates_from_1960_oecd
df_17_1.head()

Unnamed: 0,Country,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Australia,345,355,343,334,315,297,289,285,289,...,179,179,179,174,174,167,159,170,163,150
1,Austria,269,278,280,282,279,270,266,262,258,...,146,149,153,152,148,146,144,148,141,132
2,Belgium,254,263,259,268,271,261,252,241,231,...,174,170,168,165,162,160,155,160,153,147
3,Canada,390,384,376,367,350,315,281,260,245,...,161,160,159,155,151,147,141,144,133,126
4,Chile,470,466,460,454,446,436,426,414,403,...,177,174,169,156,154,143,131,118,126,117


In [1232]:
df_info = pd.DataFrame({
    'dtype': df_17_1.dtypes,
    'null_count': df_17_1.isnull().sum(),
    'unique_count': df_17_1.nunique()
})
print(df_info)

          dtype  null_count  unique_count
Country  object           0            49
1960     object           0            47
1961     object           0            47
1962     object           0            47
1963     object           0            46
...         ...         ...           ...
2019     object           0            37
2020     object           0            39
2021     object           0            40
2022     object           0            34
2023     object           0            35

[65 rows x 3 columns]


In [1233]:
df_17_1.columns = df_17_1.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [1234]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_17_1.columns if c != "country"]

df_17_1[num_cols] = (
    df_17_1[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [1235]:
df_17_1.drop_duplicates(inplace=True)
df_17_1.dropna(inplace=True)

In [1236]:
df_info = pd.DataFrame({
    'dtype': df_17_1.dtypes,
    'null_count': df_17_1.isnull().sum(),
    'unique_count': df_17_1.nunique()
})
print(df_info)

           dtype  null_count  unique_count
country   object           0            49
1960     float64           0            47
1961     float64           0            47
1962     float64           0            47
1963     float64           0            46
...          ...         ...           ...
2019     float64           0            37
2020     float64           0            39
2021     float64           0            40
2022     float64           0            34
2023     float64           0            35

[65 rows x 3 columns]


In [1237]:
df_17_1.sample(10)

Unnamed: 0,country,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
31,Spain,2.86,2.76,2.8,2.88,3.01,2.94,2.99,3.03,2.96,...,1.32,1.33,1.34,1.31,1.26,1.23,1.19,1.19,1.16,1.12
41,Peru,6.94,6.92,6.9,6.86,6.81,6.75,6.68,6.6,6.51,...,2.27,2.23,2.19,2.15,2.12,2.09,2.06,2.03,2.0,1.98
16,Ireland,3.76,3.79,3.92,4.01,4.06,4.03,3.95,3.84,3.78,...,1.89,1.85,1.82,1.78,1.75,1.7,1.63,1.72,1.7,1.5
30,Slovenia,2.18,2.26,2.27,2.28,2.32,2.45,2.48,2.38,2.28,...,1.58,1.57,1.58,1.62,1.6,1.61,1.59,1.64,1.55,1.51
36,OECD-38,3.34,3.33,3.3,3.3,3.28,3.2,3.1,3.03,2.95,...,1.68,1.68,1.68,1.65,1.63,1.59,1.55,1.57,1.5,1.43
12,Germany,2.37,2.44,2.44,2.51,2.53,2.5,2.51,2.45,2.36,...,1.47,1.5,1.59,1.57,1.57,1.54,1.53,1.58,1.46,1.35
32,Sweden,2.2,2.23,2.26,2.34,2.48,2.42,2.36,2.27,2.07,...,1.88,1.85,1.85,1.78,1.75,1.7,1.66,1.67,1.52,1.45
5,Colombia,6.74,6.71,6.66,6.58,6.48,6.33,6.16,5.96,5.74,...,1.82,1.77,1.72,1.72,1.72,1.71,1.69,1.68,1.66,1.65
47,Croatia,2.2,2.19,2.17,2.12,2.12,2.21,2.21,2.07,1.99,...,1.46,1.4,1.42,1.42,1.47,1.47,1.56,1.63,1.53,1.47
46,Bulgaria,2.31,2.29,2.24,2.21,2.19,2.09,2.03,2.02,2.27,...,1.62,1.64,1.67,1.71,1.73,1.79,1.77,1.8,1.78,1.81


In [1238]:
#df_17_1.to_csv('../data/Cleaned/cleaned_total_fertility_rates_oecd.csv', index=False)

In [1239]:
#df_17_1.to_sql('total_fertility_rates_oecd', engine, if_exists='replace', index=False)

In [1240]:
df_17_2 = pd.read_csv('../data/Raw/OECD/SF_2_1_Fertility_rates_Births_by_birth_order_S2.csv')
df_17_2

Unnamed: 0,Country,Birth order,1987,1988,1989,1990,1991,1992,1993,1994,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Austria,First birth,476,478,467,462,465,461,452,445,...,480,483,473,475,471,472,477,476,484,481
1,Austria,Second birth,337,337,343,349,345,348,358,364,...,355,353,356,353,353,351,353,355,349,351
2,Austria,Third birth or higher,188,185,190,189,190,191,189,191,...,165,164,171,172,176,177,170,169,167,168
3,Belgium,First birth,468,469,473,473,481,472,469,472,...,423,435,441,436,429,426,450,440,447,455
4,Belgium,Second birth,330,329,327,328,323,328,335,330,...,351,348,345,346,345,347,342,351,343,341
5,Belgium,Third birth or higher,202,202,199,199,196,200,196,198,...,226,218,214,219,226,226,208,209,209,204
6,Czechia,First birth,467,466,474,478,501,498,485,477,...,474,481,487,487,480,478,476,464,463,463
7,Czechia,Second birth,377,376,374,372,355,358,368,369,...,375,373,367,366,372,376,376,390,386,391
8,Czechia,Third birth or higher,156,158,152,150,144,144,148,154,...,151,147,146,147,147,146,148,146,15,146
9,Estonia,First birth,435,435,440,462,495,503,496,496,...,419,423,408,402,367,388,380,372,398,397


In [1241]:
df_info = pd.DataFrame({
    'dtype': df_17_2.dtypes,
    'null_count': df_17_2.isnull().sum(),
    'unique_count': df_17_2.nunique()
})
print(df_info)

              dtype  null_count  unique_count
Country      object           0            17
Birth order  object           0             3
1987         object           0            48
1988         object           0            49
1989         object           0            48
1990         object           0            44
1991         object           0            48
1992         object           0            46
1993         object           0            47
1994         object           0            47
1995         object           0            48
1996         object           0            47
1997         object           0            49
1998         object           0            50
1999         object           0            49
2000         object           0            48
2001         object           0            50
2002         object           0            47
2003         object           0            50
2004         object           0            49
2005         object           0   

In [1242]:
df_17_2.columns = df_17_2.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_17_2.head()

Unnamed: 0,country,birth_order,1987,1988,1989,1990,1991,1992,1993,1994,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Austria,First birth,476,478,467,462,465,461,452,445,...,480,483,473,475,471,472,477,476,484,481
1,Austria,Second birth,337,337,343,349,345,348,358,364,...,355,353,356,353,353,351,353,355,349,351
2,Austria,Third birth or higher,188,185,190,189,190,191,189,191,...,165,164,171,172,176,177,170,169,167,168
3,Belgium,First birth,468,469,473,473,481,472,469,472,...,423,435,441,436,429,426,450,440,447,455
4,Belgium,Second birth,330,329,327,328,323,328,335,330,...,351,348,345,346,345,347,342,351,343,341


In [1243]:
# --- Ensure "country" and "birth order" are strings (tidy casing/spacing) ---
df_17_2["country"] = df_17_2["country"].astype(str).str.strip().str.title()
df_17_2["birth_order"] = df_17_2["birth_order"].astype(str).str.strip()

# --- Identify numeric columns: everything except country and birth order ---
num_cols = [c for c in df_17_2.columns if c not in ["country", "birth_order"]]
# --- Robust cleaning -> convert to float ---
df_17_2[num_cols] = (
    df_17_2[num_cols].astype(str)
    .replace({"\xa0": "", "\u202f": "", ",": "."}, regex=True)
    .replace(r"[^\d\.\-]", "", regex=True)
    .replace(r"\.\.+", ".", regex=True)
    .replace(r"^\.$|^\s*$", np.nan, regex=True)
    .apply(pd.to_numeric, errors="coerce")
)
df_17_2[num_cols] = df_17_2[num_cols].round(2)


In [1244]:
df_17_2.drop_duplicates(inplace=True)
df_17_2.dropna(inplace=True)

In [1245]:
df_17_2["birth_order"] = df_17_2["birth_order"].astype(str) + "(%)"

In [1246]:
df_17_2.head(10)

Unnamed: 0,country,birth_order,1987,1988,1989,1990,1991,1992,1993,1994,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Austria,First birth(%),47.6,47.8,46.7,46.2,46.5,46.1,45.2,44.5,...,48.0,48.3,47.3,47.5,47.1,47.2,47.7,47.6,48.4,48.1
1,Austria,Second birth(%),33.7,33.7,34.3,34.9,34.5,34.8,35.8,36.4,...,35.5,35.3,35.6,35.3,35.3,35.1,35.3,35.5,34.9,35.1
2,Austria,Third birth or higher(%),18.8,18.5,19.0,18.9,19.0,19.1,18.9,19.1,...,16.5,16.4,17.1,17.2,17.6,17.7,17.0,16.9,16.7,16.8
3,Belgium,First birth(%),46.8,46.9,47.3,47.3,48.1,47.2,46.9,47.2,...,42.3,43.5,44.1,43.6,42.9,42.6,45.0,44.0,44.7,45.5
4,Belgium,Second birth(%),33.0,32.9,32.7,32.8,32.3,32.8,33.5,33.0,...,35.1,34.8,34.5,34.6,34.5,34.7,34.2,35.1,34.3,34.1
5,Belgium,Third birth or higher(%),20.2,20.2,19.9,19.9,19.6,20.0,19.6,19.8,...,22.6,21.8,21.4,21.9,22.6,22.6,20.8,20.9,20.9,20.4
6,Czechia,First birth(%),46.7,46.6,47.4,47.8,50.1,49.8,48.5,47.7,...,47.4,48.1,48.7,48.7,48.0,47.8,47.6,46.4,46.3,46.3
7,Czechia,Second birth(%),37.7,37.6,37.4,37.2,35.5,35.8,36.8,36.9,...,37.5,37.3,36.7,36.6,37.2,37.6,37.6,39.0,38.6,39.1
8,Czechia,Third birth or higher(%),15.6,15.8,15.2,15.0,14.4,14.4,14.8,15.4,...,15.1,14.7,14.6,14.7,14.7,14.6,14.8,14.6,15.0,14.6
9,Estonia,First birth(%),43.5,43.5,44.0,46.2,49.5,50.3,49.6,49.6,...,41.9,42.3,40.8,40.2,36.7,38.8,38.0,37.2,39.8,39.7


In [1247]:
df_info = pd.DataFrame({
    'dtype': df_17_2.dtypes,
    'null_count': df_17_2.isnull().sum(),
    'unique_count': df_17_2.nunique()
})
print(df_info)

               dtype  null_count  unique_count
country       object           0            17
birth_order   object           0             3
1987         float64           0            48
1988         float64           0            49
1989         float64           0            48
1990         float64           0            44
1991         float64           0            48
1992         float64           0            46
1993         float64           0            47
1994         float64           0            47
1995         float64           0            48
1996         float64           0            47
1997         float64           0            49
1998         float64           0            50
1999         float64           0            49
2000         float64           0            48
2001         float64           0            50
2002         float64           0            47
2003         float64           0            50
2004         float64           0            49
2005         

In [1248]:
#df_17_2.to_csv('../data/Cleaned/cleaned_births_by_birth_order_oecd.csv', index=False)

In [1249]:
#df_17_2.to_sql('births_by_birth_order_oecd', engine, if_exists='replace', index=False)

In [1250]:
df_18 = pd.read_csv('../data/Raw/OECD/sf1_2_wide_from_df18.csv')
df_18

Unnamed: 0,country,year,Living with two parents,Living with a single parent,Other
0,Australia,2003,80.1,19.5,0.5
1,Australia,2006,81.5,18.0,0.5
2,Australia,2009,82.0,17.6,0.4
3,Australia,2012,81.3,18.0,0.6
4,Austria,2003,81.2,16.8,2.0
...,...,...,...,...,...
470,United States,2014,68.7,27.5,3.8
471,United States,2015,69.2,26.8,3.9
472,United States,2016,68.7,27.4,3.8
473,United States,2017,68.9,27.1,4.0


In [1251]:
for col in df_18.select_dtypes(include=['object']).columns:
    df_18[col] = df_18[col].astype(str).str.strip()

# 2) Define placeholders representing missing data in OECD exports
placeholders = ['..', '...', '.', ' .', '…', 'Na', 'nan', 'None']

# 3) Replace placeholders with NaN directly in df_18
df_18.replace(placeholders, pd.NA, inplace=True)

In [1252]:
# 1) Ensure 'year' is integer
df_18["year"] = pd.to_numeric(df_18["year"], errors="coerce").astype("Int64")

# 2) Convert all non-key columns to numeric and round(2)
for col in df_18.columns:
    if col not in ["country", "year"]:
        df_18[col] = pd.to_numeric(df_18[col], errors="coerce").round(2)

In [1253]:
# 1) Drop rows with missing key fields
df_18.dropna(subset=["country", "year"], inplace=True)

# 2) Drop duplicate country-year rows, keep the first
df_18.drop_duplicates(subset=["country", "year"], keep="first", inplace=True)

# 3) Drop rows where all value columns are NaN
value_cols = [c for c in df_18.columns if c not in ["country", "year"]]
df_18.dropna(subset=value_cols, how="all", inplace=True)

# 4) Sort and reset index
df_18.sort_values(["country", "year"], inplace=True)
df_18.reset_index(drop=True, inplace=True)


In [1254]:
df_18

Unnamed: 0,country,year,Living with two parents,Living with a single parent,Other
0,Australia,2003,80.1,19.5,0.5
1,Australia,2006,81.5,18.0,0.5
2,Australia,2009,82.0,17.6,0.4
3,Australia,2012,81.3,18.0,0.6
4,Austria,2003,81.2,16.8,2.0
...,...,...,...,...,...
470,United States,2014,68.7,27.5,3.8
471,United States,2015,69.2,26.8,3.9
472,United States,2016,68.7,27.4,3.8
473,United States,2017,68.9,27.1,4.0


In [1255]:
df_18['Other'] = pd.to_numeric(df_18['Other'], errors='coerce')

In [1256]:
df_info = pd.DataFrame({
    'dtype': df_18.dtypes,
    'null_count': df_18.isnull().sum(),
    'unique_count': df_18.nunique()
})
print(df_info)

                               dtype  null_count  unique_count
country                       object           0            39
year                           Int64           0            18
Living with two parents      float64           0           211
Living with a single parent  float64           0           203
Other                        float64           1            50


In [1257]:
print(repr(df_18.loc[df_18['Other'].notnull(), 'Other'].unique()))

array([0.5, 0.4, 0.6, 2. , 1. , 1.9, 0.3, 0.1, 0.8, 0.7, 8.7, 3.5, 2.5,
       2.1, 2.4, 2.6, 6.7, 5.1, 1.4, 1.2, 1.7, 1.5, 3.4, 2.9, 2.3, 3. ,
       4.2, 2.8, 1.3, 9. , 0.2, 0.9, 1.1, 4.5, 4.7, 1.6, 3.8, 3.6, 3.3,
       2.2, 0. , 1.8, 2.7, 3.2, 3.9, 4.1, 4.4, 3.7, 4. , 4.3])


In [1258]:
df_18['Other'] = pd.to_numeric(df_18['Other'], errors='coerce')

df_18.dropna(inplace=True, subset=['Other'])

df_18.isnull().sum()

country                        0
year                           0
Living with two parents        0
Living with a single parent    0
Other                          0
dtype: int64

In [1259]:
#df_18.to_csv('../data/Cleaned/cleaned_household_children_oecd.csv', index=False)

In [1260]:
#df_18.to_sql('household_children_oecd', engine, if_exists= 'replace', index= False)

In [1261]:
df_19_1 =pd.read_csv('../data/Raw/OECD/SF_2_3_Age_mothers_childbirth_mean_age_birth_S1.csv')
#age_of_mothers_at_childbirth
df_19_1

Unnamed: 0,Country,1963,1964,1965,1966,1967,1968,1969,1970,1971,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Australia,275,275,274,273,273,272,272,271,269,...,301,301,302,303,305,306,307,308,309,311
1,Austria,274,274,273,271,270,268,268,267,267,...,302,303,304,306,306,307,309,310,310,312
2,Belgium,278,277,276,275,274,273,272,272,270,...,300,302,303,304,305,306,307,308,308,310
3,Canada,278,279,278,277,275,273,273,272,270,...,303,304,305,306,307,309,310,312,313,314
4,Chile,292,291,291,290,288,287,286,284,282,...,281,283,285,288,291,294,296,299,301,..
5,Czech Republic,257,258,255,252,250,249,248,248,249,...,298,299,299,300,300,300,301,302,302,304
6,Costa Rica,293,293,293,293,292,291,289,287,285,...,265,267,268,271,272,274,276,279,284,287
7,Denmark,273,268,268,266,265,265,266,267,267,...,307,308,309,310,310,311,312,313,314,316
8,Estonia,276,274,273,273,271,269,269,267,267,...,296,295,296,299,302,304,305,306,307,310
9,Finland,281,280,280,278,277,275,274,271,269,...,304,305,305,306,308,309,310,311,312,314


In [1262]:
df_info = pd.DataFrame({
    'dtype': df_19_1.dtypes,
    'null_count': df_19_1.isnull().sum(),
    'unique_count': df_19_1.nunique()
})
print(df_info)

          dtype  null_count  unique_count
Country  object           0            26
1963     object           0            19
1964     object           0            22
1965     object           0            22
1966     object           0            22
1967     object           0            22
1968     object           0            20
1969     object           0            21
1970     object           0            19
1971     object           0            19
1972     object           0            20
1973     object           0            20
1974     object           0            24
1975     object           0            21
1976     object           0            22
1977     object           0            20
1978     object           0            22
1979     object           0            23
1980     object           0            22
1981     object           0            20
1982     object           0            18
1983     object           0            20
1984     object           0       

In [1263]:
df_19_1.columns = df_19_1.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [1264]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_19_1.columns if c != "country"]

df_19_1[num_cols] = (
    df_19_1[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [1265]:
df_19_1.drop_duplicates(inplace=True)
df_19_1.dropna(inplace=True)

In [1266]:
df_info = pd.DataFrame({
    'dtype': df_19_1.dtypes,
    'null_count': df_19_1.isnull().sum(),
    'unique_count': df_19_1.nunique()
})
print(df_info)

           dtype  null_count  unique_count
country   object           0            22
1963     float64           0            16
1964     float64           0            18
1965     float64           0            18
1966     float64           0            18
1967     float64           0            18
1968     float64           0            17
1969     float64           0            17
1970     float64           0            15
1971     float64           0            17
1972     float64           0            18
1973     float64           0            18
1974     float64           0            20
1975     float64           0            18
1976     float64           0            18
1977     float64           0            16
1978     float64           0            18
1979     float64           0            21
1980     float64           0            20
1981     float64           0            17
1982     float64           0            17
1983     float64           0            18
1984     fl

In [1267]:
df_19_1.sample(10)

Unnamed: 0,country,1963,1964,1965,1966,1967,1968,1969,1970,1971,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
14,Israel,28.1,28.1,28.2,28.2,28.2,28.5,28.4,28.2,28.5,...,30.1,30.2,30.3,30.3,30.4,30.4,30.4,30.5,30.5,30.6
17,Netherlands,29.3,29.2,29.0,28.8,28.5,28.4,28.3,28.2,28.0,...,30.9,31.0,31.1,31.2,31.3,31.4,31.5,31.6,31.7,31.8
21,Slovak Republic,26.8,26.9,26.8,26.6,26.5,26.4,26.3,26.2,26.2,...,28.7,28.8,28.8,28.8,28.8,28.8,28.8,28.8,28.9,28.9
7,Denmark,27.3,26.8,26.8,26.6,26.5,26.5,26.6,26.7,26.7,...,30.7,30.8,30.9,31.0,31.0,31.1,31.2,31.3,31.4,31.6
12,Iceland,27.6,27.7,27.7,27.5,27.5,27.4,27.3,27.2,27.0,...,30.1,30.4,30.2,30.3,30.6,30.6,30.6,30.9,30.7,30.9
23,United States,26.5,26.6,26.6,26.4,26.3,26.3,26.2,26.1,26.0,...,28.0,28.2,28.3,28.5,28.7,28.8,29.0,29.1,29.2,29.4
11,Hungary,25.8,25.7,25.6,25.6,25.6,25.5,25.5,25.4,25.4,...,29.4,29.5,29.5,29.6,29.6,29.8,29.8,29.9,29.9,30.0
25,Bulgaria,25.0,24.9,24.8,24.7,24.7,24.7,24.8,24.7,24.5,...,27.1,27.1,27.3,27.4,27.6,27.6,27.7,27.8,27.8,27.9
20,Portugal,29.6,29.5,29.5,29.4,29.3,29.2,29.1,29.0,29.0,...,30.2,30.4,30.7,30.9,31.1,31.2,31.4,31.4,31.6,31.8
8,Estonia,27.6,27.4,27.3,27.3,27.1,26.9,26.9,26.7,26.7,...,29.6,29.5,29.6,29.9,30.2,30.4,30.5,30.6,30.7,31.0


In [1268]:
#df_19_1.to_csv('../data/Cleaned/age_of_mothers_at_childbirth_oecd.csv', index=False)

In [1269]:
#df_19_1.to_sql('age_of_mothers_at_childbirth_oecd', engine, if_exists='replace', index=False)

In [1270]:
df_19_2 = pd.read_csv('../data/Raw/OECD/SF_2_3_Age_mothers_childbirth_fertility_by_age_1960_S2.csv')
#fertility_per_1000_from 1960
df_19_2.head()

Unnamed: 0,Country,Age group,1960,1961,1962,1963,1964,1965,1966,1967,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Australia,15-19,443,474,447,459,470,475,489,484,...,161,146,129,120,105,103,95,88,79,71
1,Australia,20-24,2201,2258,2160,2082,1905,1793,1731,1708,...,532,513,474,473,447,431,428,401,377,388
2,Australia,25-29,2163,2212,2167,2112,1981,1885,1839,1850,...,1026,991,948,934,922,897,893,843,803,867
3,Australia,30-34,1275,1311,1277,1239,1191,1101,1051,1028,...,1269,1248,1204,1217,1236,1191,1201,1156,1114,1206
4,Australia,35-39,623,634,614,597,584,530,506,478,...,715,709,692,698,720,713,716,693,663,709


In [1271]:
df_info = pd.DataFrame({
    'dtype': df_19_2.dtypes,
    'null_count': df_19_2.isnull().sum(),
    'unique_count': df_19_2.nunique()
})
print(df_info)

            dtype  null_count  unique_count
Country    object           0            21
Age group  object           0             7
1960       object           0           136
1961       object           0           140
1962       object           0           140
...           ...         ...           ...
2017       object           0           124
2018       object           0           128
2019       object           0           126
2020       object           0           121
2021       object           7           119

[64 rows x 3 columns]


In [1272]:
df_19_2.columns = df_19_2.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_19_2.head()

Unnamed: 0,country,age_group,1960,1961,1962,1963,1964,1965,1966,1967,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Australia,15-19,443,474,447,459,470,475,489,484,...,161,146,129,120,105,103,95,88,79,71
1,Australia,20-24,2201,2258,2160,2082,1905,1793,1731,1708,...,532,513,474,473,447,431,428,401,377,388
2,Australia,25-29,2163,2212,2167,2112,1981,1885,1839,1850,...,1026,991,948,934,922,897,893,843,803,867
3,Australia,30-34,1275,1311,1277,1239,1191,1101,1051,1028,...,1269,1248,1204,1217,1236,1191,1201,1156,1114,1206
4,Australia,35-39,623,634,614,597,584,530,506,478,...,715,709,692,698,720,713,716,693,663,709


In [1273]:
# --- Ensure "country" and "age_group" are strings
df_19_2["country"] = df_19_2["country"].astype(str).str.strip().str.title()
df_19_2["age_group"] = df_19_2["age_group"].astype(str).str.strip()

# --- Identify numeric columns: everything except country and birth order ---
num_cols = [c for c in df_19_2.columns if c not in ["country", "age_group"]]
# --- Robust cleaning -> convert to float ---
df_19_2[num_cols] = (
    df_19_2[num_cols].astype(str)
    .replace({"\xa0": "", "\u202f": "", ",": "."}, regex=True)
    .replace(r"[^\d\.\-]", "", regex=True)
    .replace(r"\.\.+", ".", regex=True)
    .replace(r"^\.$|^\s*$", np.nan, regex=True)
    .apply(pd.to_numeric, errors="coerce")
)
df_19_2[num_cols] = df_19_2[num_cols].round(2)

In [1274]:
df_19_2.drop_duplicates(inplace=True)
df_19_2.dropna(inplace = True)

In [1275]:
df_info = pd.DataFrame({
    'dtype': df_19_2.dtypes,
    'null_count': df_19_2.isnull().sum(),
    'unique_count': df_19_2.nunique()
})
print(df_info)

             dtype  null_count  unique_count
country     object           0            19
age_group   object           0             7
1960       float64           0           124
1961       float64           0           126
1962       float64           0           126
...            ...         ...           ...
2017       float64           0           118
2018       float64           0           121
2019       float64           0           120
2020       float64           0           115
2021       float64           0           118

[64 rows x 3 columns]


In [1276]:
#df_19_2.to_csv('../data/Cleaned/fertility_per_1000_from_1960_oecd.csv', index=False)

In [1277]:
#df_19_2.to_sql('fertility_per_1000_from_1960_oecd', engine, if_exists='replace', index=False)

In [1278]:
df_19_3 = pd.read_csv('../data/Raw/OECD/SF_2_3_Age_mothers_childbirth_fertility_by_age_2000_S3.csv')
#fertility_per_1000_from_2000
df_19_3

Unnamed: 0,Country,Age group,2000,2001,2002,2003,2004,2005,2006,2007,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,OECD-Average,15-19,226,220,211,205,203,201,200,205,...,179,168,162,152,144,135,126,117,102,95
1,OECD-Average,20-24,717,693,668,655,647,632,629,630,...,564,538,533,519,504,488,470,450,420,405
2,OECD-Average,25-29,1079,1050,1031,1035,1034,1023,1026,1034,...,994,965,969,961,949,928,907,884,855,869
3,OECD-Average,30-34,881,872,886,911,934,946,976,1000,...,1036,1019,1040,1049,1053,1041,1033,1017,996,1036
4,OECD-Average,35-39,381,386,395,406,422,435,456,477,...,531,534,551,563,571,570,574,575,559,587
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296,Romania,25-29,782,770,786,820,848,908,923,930,...,918,883,944,989,1001,1090,1083,1091,1094,1109
297,Romania,30-34,388,381,388,388,416,475,511,542,...,666,648,715,754,785,866,859,864,871,875
298,Romania,35-39,134,138,152,194,232,251,257,249,...,273,274,299,321,330,368,367,383,406,411
299,Romania,40-44,31,31,30,29,31,31,28,31,...,49,48,56,61,68,73,78,80,85,82


In [1279]:
df_info = pd.DataFrame({
    'dtype': df_19_3.dtypes,
    'null_count': df_19_3.isnull().sum(),
    'unique_count': df_19_3.nunique()
})
print(df_info)

            dtype  null_count  unique_count
Country    object           0            43
Age group  object           0             7
2000       object           0           233
2001       object           0           248
2002       object           0           240
2003       object           0           239
2004       object           0           245
2005       object           0           240
2006       object           0           239
2007       object           0           242
2008       object           0           252
2009       object           0           251
2010       object           0           239
2011       object           0           235
2012       object           0           242
2013       object           0           234
2014       object           0           238
2015       object           0           237
2016       object           0           248
2017       object           0           236
2018       object           0           245
2019       object           0   

In [1280]:
df_19_3.columns = df_19_3.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_19_3.head()

Unnamed: 0,country,age_group,2000,2001,2002,2003,2004,2005,2006,2007,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,OECD-Average,15-19,226,220,211,205,203,201,200,205,...,179,168,162,152,144,135,126,117,102,95
1,OECD-Average,20-24,717,693,668,655,647,632,629,630,...,564,538,533,519,504,488,470,450,420,405
2,OECD-Average,25-29,1079,1050,1031,1035,1034,1023,1026,1034,...,994,965,969,961,949,928,907,884,855,869
3,OECD-Average,30-34,881,872,886,911,934,946,976,1000,...,1036,1019,1040,1049,1053,1041,1033,1017,996,1036
4,OECD-Average,35-39,381,386,395,406,422,435,456,477,...,531,534,551,563,571,570,574,575,559,587


In [1281]:
# --- Ensure "country" and "age_group" are strings
df_19_3["country"] = df_19_3["country"].astype(str).str.strip().str.title()
df_19_3["age_group"] = df_19_3["age_group"].astype(str).str.strip()

# --- Identify numeric columns: everything except country and birth order ---
num_cols = [c for c in df_19_3.columns if c not in ["country", "age_group"]]
# --- Robust cleaning -> convert to float ---
df_19_3[num_cols] = (
    df_19_3[num_cols].astype(str)
    .replace({"\xa0": "", "\u202f": "", ",": "."}, regex=True)
    .replace(r"[^\d\.\-]", "", regex=True)
    .replace(r"\.\.+", ".", regex=True)
    .replace(r"^\.$|^\s*$", np.nan, regex=True)
    .apply(pd.to_numeric, errors="coerce")
)
df_19_3[num_cols] = df_19_3[num_cols].round(2)

In [1282]:
df_19_3.drop_duplicates(inplace=True)
df_19_3.dropna(inplace=True)

In [1283]:
#Check again
df_info = pd.DataFrame({
    'dtype': df_19_3.dtypes,
    'null_count': df_19_3.isnull().sum(),
    'unique_count': df_19_3.nunique()
})
print(df_info)

             dtype  null_count  unique_count
country     object           0            41
age_group   object           0             7
2000       float64           0           225
2001       float64           0           237
2002       float64           0           232
2003       float64           0           229
2004       float64           0           233
2005       float64           0           229
2006       float64           0           229
2007       float64           0           230
2008       float64           0           238
2009       float64           0           238
2010       float64           0           230
2011       float64           0           227
2012       float64           0           231
2013       float64           0           225
2014       float64           0           226
2015       float64           0           225
2016       float64           0           237
2017       float64           0           227
2018       float64           0           233
2019      

In [1284]:
df_19_3.sample(10)

Unnamed: 0,country,age_group,2000,2001,2002,2003,2004,2005,2006,2007,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
272,United States,45-49,0.5,0.5,0.5,0.5,0.5,0.6,0.6,0.6,...,0.7,0.8,0.8,0.8,0.9,0.9,0.9,0.9,0.9,0.9
32,Canada,35-39,34.0,35.7,36.4,38.8,40.4,42.6,45.8,48.5,...,53.4,53.7,54.8,55.9,56.6,57.1,57.2,57.3,55.2,57.8
107,Hungary,25-29,94.6,92.7,92.5,88.8,88.5,89.2,89.5,85.5,...,77.9,76.4,78.9,77.5,80.6,80.8,80.1,82.8,84.3,87.3
212,Portugal,25-29,100.7,92.6,92.8,89.4,85.5,85.0,81.0,78.2,...,71.4,66.9,65.9,67.8,67.8,68.8,69.5,68.1,68.3,64.1
129,Israel,30-34,169.2,169.2,172.8,177.0,175.3,172.5,175.2,177.1,...,178.5,177.2,186.1,189.0,183.4,185.5,184.17,178.23,174.0,184.4
154,Latvia,15-19,24.1,22.4,21.3,22.4,21.4,21.6,23.7,25.0,...,20.3,19.9,19.8,18.0,15.4,14.7,12.1,12.6,10.5,9.8
67,Denmark,35-39,43.6,44.0,45.6,46.6,47.4,49.2,51.5,55.3,...,55.5,54.3,56.4,57.5,61.1,60.2,60.7,60.7,61.5,67.0
4,Oecd-Average,35-39,38.1,38.6,39.5,40.6,42.2,43.5,45.6,47.7,...,53.1,53.4,55.1,56.3,57.1,57.0,57.4,57.5,55.9,58.7
208,Poland,40-44,4.8,4.8,4.7,4.6,4.8,4.8,4.9,5.2,...,6.3,6.3,6.6,6.7,7.2,7.7,7.9,8.1,8.1,7.9
115,Iceland,30-34,112.0,100.0,107.6,116.3,118.7,114.0,120.4,121.9,...,117.7,117.0,112.1,107.1,108.4,107.2,106.5,112.1,114.2,115.2


In [1285]:
#df_19_3.to_csv('../data/Cleaned/cleaned_fertility_per_1000_from_2000_oecd.csv',index=False)

In [1286]:
#df_19_3.to_sql('fertility_per_1000_from_2000_oecd',engine, if_exists='replace', index=False)

In [1287]:
df_20= pd.read_csv('../data/Raw/OECD/SF_2_4_Share_births_outside_marriage_1960.csv')
#(%)share_of_births_outside_of_marriage
df_20

Unnamed: 0,Country,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,Austria,130,126,120,116,113,112,114,115,120,...,404,415,414,417,421,422,420,413,406,412
1,Belgium,21,20,21,22,23,24,25,25,27,...,470,477,495,494,480,490,528,524,..,..
2,Czech Republic,49,46,45,47,48,50,53,53,54,...,418,434,450,467,478,486,490,485,482,485
3,Denmark,78,80,83,89,93,95,102,111,111,...,490,506,515,525,538,540,542,542,541,542
4,Finland,40,41,40,42,44,46,48,51,53,...,409,415,421,428,443,449,448,446,454,461
5,Germany,76,71,66,61,59,58,57,58,61,...,339,345,348,350,350,355,347,339,333,331
6,Greece,12,12,12,12,11,11,10,10,11,...,74,76,70,82,88,94,103,111,124,138
7,Hungary,55,55,54,53,52,52,51,50,50,...,423,445,456,473,479,467,447,439,387,304
8,Iceland,253,253,245,251,267,269,284,300,305,...,650,669,..,..,..,696,712,705,694,..
9,Ireland,16,16,18,18,20,22,23,25,26,...,339,351,353,363,366,367,376,379,384,..


In [1288]:
df_info = pd.DataFrame({
    'dtype': df_20.dtypes,
    'null_count': df_20.isnull().sum(),
    'unique_count': df_20.nunique()
})
print(df_info)

          dtype  null_count  unique_count
Country  object           0            26
1960     object           0            26
1961     object           0            24
1962     object           0            24
1963     object           0            24
...         ...         ...           ...
2016     object           0            24
2017     object           0            26
2018     object           0            25
2019     object           0            25
2020     object           0            24

[62 rows x 3 columns]


In [1289]:
df_20.columns = df_20.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [1290]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_20.columns if c != "country"]

df_20[num_cols] = (
    df_20[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [1291]:
df_20.drop_duplicates(inplace=True)
df_20.dropna(inplace=True)

df_info = pd.DataFrame({
    'dtype': df_20.dtypes,
    'null_count': df_20.isnull().sum(),
    'unique_count': df_20.nunique()
})
print(df_info)

           dtype  null_count  unique_count
country   object           0            22
1960     float64           0            22
1961     float64           0            20
1962     float64           0            21
1963     float64           0            21
...          ...         ...           ...
2016     float64           0            20
2017     float64           0            22
2018     float64           0            21
2019     float64           0            22
2020     float64           0            22

[62 rows x 3 columns]


In [1292]:
df_20.sample(10)

Unnamed: 0,country,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
6,Greece,1.2,1.2,1.2,1.2,1.1,1.1,1.0,1.0,1.1,...,7.4,7.6,7.0,8.2,8.8,9.4,10.3,11.1,12.4,13.8
12,Luxembourg,3.2,3.4,3.1,3.1,3.2,3.7,3.2,3.5,3.2,...,34.1,37.1,37.8,39.1,38.8,40.7,40.8,39.5,40.4,41.6
3,Denmark,7.8,8.0,8.3,8.9,9.3,9.5,10.2,11.1,11.1,...,49.0,50.6,51.5,52.5,53.8,54.0,54.2,54.2,54.1,54.2
17,Slovak Republic,4.7,4.4,4.6,4.7,5.0,5.3,5.3,5.7,5.9,...,34.0,35.4,37.0,38.9,39.2,40.2,40.1,40.0,40.1,40.7
11,Latvia,11.9,12.3,12.4,12.3,12.8,13.3,12.6,12.3,12.1,...,44.6,45.0,44.6,44.0,41.5,40.9,40.4,39.5,38.4,39.5
20,Sweden,11.3,11.7,12.4,12.6,13.1,13.8,14.5,15.1,16.0,...,54.3,54.5,54.4,54.6,54.7,54.9,54.5,54.5,54.5,55.2
25,Croatia,7.4,7.1,6.5,6.9,6.8,6.0,5.7,5.4,5.5,...,14.0,15.4,16.1,17.4,18.1,18.9,19.9,20.7,21.5,22.8
19,Spain,2.3,2.2,2.1,1.9,1.8,1.7,1.6,1.5,1.4,...,37.4,39.0,40.9,42.5,44.5,45.9,46.8,47.3,48.4,47.6
2,Czech Republic,4.9,4.6,4.5,4.7,4.8,5.0,5.3,5.3,5.4,...,41.8,43.4,45.0,46.7,47.8,48.6,49.0,48.5,48.2,48.5
21,Switzerland,3.8,4.0,4.2,4.1,4.2,3.9,3.8,3.9,3.8,...,19.3,20.2,21.1,21.7,22.9,24.2,25.2,25.7,26.5,27.7


In [1293]:
#df_20.to_csv('../data/Cleaned/cleaned_share_of_births_outside_of_marriage_oecd.csv', index=False)

In [1294]:
#df_20.to_sql('share_of_births_outside_of_marriage_oecd',engine, if_exists='replace', index=False)

In [1295]:
df_21_1= pd.read_csv('../data/Raw/OECD/SF_3_1_Marriage_divorce_rate_mean_age_first_marriage_S1.csv')
#mean_age_first_marriage
df_21_1

Unnamed: 0,Country,Gender,1990,1991,1992,1993,1994,1995,1996,1997,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,Australia,Male,265,267,269,270,272,273,276,278,...,297,298,299,300,301,303,304,307,307,306
1,Australia,Female,243,245,247,248,251,253,257,259,...,280,281,283,284,285,287,288,292,293,292
2,Czechia,Male,243,243,245,247,251,255,259,265,...,310,312,313,314,316,317,318,319,320,324
3,Czechia,Female,216,216,219,221,224,228,231,236,...,281,283,285,287,288,290,291,292,294,297
4,Denmark,Male,305,306,310,314,318,319,325,322,...,338,343,344,344,343,347,348,349,351,353
5,Denmark,Female,278,280,283,288,292,292,299,301,...,314,318,319,319,319,322,324,325,328,330
6,Greece,Male,290,293,296,297,299,301,302,306,...,327,328,329,330,332,332,333,334,337,338
7,Greece,Female,249,252,255,255,258,260,263,266,...,294,295,297,299,301,301,303,303,307,307
8,Japan,Male,284,284,284,284,285,285,285,285,...,307,308,309,311,311,311,311,311,312,310
9,Japan,Female,259,259,260,261,262,263,264,266,...,290,292,293,294,294,294,294,294,296,294


In [1296]:
df_info = pd.DataFrame({
    'datatypes': df_21_1.dtypes,
    'null_count': df_21_1.isnull().sum(),
    'unique_count': df_21_1.nunique()
})
print(df_info)

        datatypes  null_count  unique_count
Country    object           0            10
Gender     object           0             2
1990       object           0            17
1991       object           0            18
1992       object           0            18
1993       object           0            19
1994       object           0            16
1995       object           0            18
1996       object           0            19
1997       object           0            17
1998       object           0            14
1999       object           0            19
2000       object           0            18
2001       object           0            18
2002       object           0            19
2003       object           0            19
2004       object           0            16
2005       object           0            18
2006       object           0            18
2007       object           0            19
2008       object           0            18
2009       object           0   

In [1297]:
df_21_1.columns = df_21_1.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [1298]:
# --- Ensure "country" and "gender" are strings
df_21_1["country"] = df_21_1["country"].astype(str).str.strip().str.title()
df_21_1["gender"] = df_21_1["gender"].astype(str).str.strip()

# --- Identify numeric columns: everything except country and birth order ---
num_cols = [c for c in df_21_1.columns if c not in ["country", "gender"]]
# --- Robust cleaning -> convert to float ---
df_21_1[num_cols] = (
    df_21_1[num_cols].astype(str)
    .replace({"\xa0": "", "\u202f": "", ",": "."}, regex=True)
    .replace(r"[^\d\.\-]", "", regex=True)
    .replace(r"\.\.+", ".", regex=True)
    .replace(r"^\.$|^\s*$", np.nan, regex=True)
    .apply(pd.to_numeric, errors="coerce")
)
df_21_1[num_cols] = df_21_1[num_cols].round(2)

In [1299]:
df_21_1.drop_duplicates(inplace=True)
df_21_1.dropna(inplace=True)

df_info = pd.DataFrame({
    'datatypes': df_21_1.dtypes,
    'null_count': df_21_1.isnull().sum(),
    'unique_count': df_21_1.nunique()
})
print(df_info)

        datatypes  null_count  unique_count
country    object           0             9
gender     object           0             2
1990      float64           0            15
1991      float64           0            16
1992      float64           0            16
1993      float64           0            17
1994      float64           0            15
1995      float64           0            16
1996      float64           0            17
1997      float64           0            15
1998      float64           0            13
1999      float64           0            17
2000      float64           0            16
2001      float64           0            16
2002      float64           0            17
2003      float64           0            17
2004      float64           0            15
2005      float64           0            17
2006      float64           0            17
2007      float64           0            17
2008      float64           0            16
2009      float64           0   

In [1300]:
#df_21_1.to_csv('../data/Cleaned/cleaned_mean_age_first_marriage_oecd.csv',index=False)

In [1301]:
#df_21_1.to_sql('mean_age_first_marriage_oecd', engine, if_exists='replace', index= False)

In [1302]:
df_21_2 = pd.read_csv('../data/Raw/OECD/SF_3_1_Marriage_divorce_rates_S2.csv')
#divorce_rates_per_1000_oecd
df_21_2

Unnamed: 0,Country,1970,1971,1972,1973,1974,1975,1976,1977,1978,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,Austria,14,13,13,13,14,14,15,15,16,...,19,19,19,18,18,18.0,18.0,17,16,15
1,Belgium,07,7,8,9,10,11,13,13,14,...,22,22,22,21,20,20.0,20.0,18,19,17
2,Czechia,22,24,23,25,25,26,25,25,26,...,27,25,25,24,24,23.0,23.0,20,20,19
3,Denmark,19,27,26,25,26,26,26,26,26,...,34,34,29,30,26,26.0,18.0,27,22,21
4,Estonia,32,32,33,32,33,34,36,39,38,...,25,24,26,25,25,24.0,21.0,19,,19
5,Finland,13,16,18,19,21,20,21,21,22,...,25,25,25,25,24,24.0,24.0,24,22,20
6,Germany,13,14,15,16,18,19,20,15,10,...,21,21,20,20,19,18.0,18.0,17,17,16
7,Greece,04,4,4,5,4,4,4,5,5,...,15,13,14,10,18,,,,,
8,Hungary,22,23,23,24,23,25,26,26,27,...,20,20,21,20,19,17.0,18.0,15,19,18
9,Italy,..,3,6,3,3,2,2,2,2,...,9,9,14,16,15,15.0,14.0,11,14,14


In [1303]:
df_info = pd.DataFrame({
    'datatypes': df_21_2.dtypes,
    'null_count': df_21_2.isnull().sum(),
    'unique_count': df_21_2.nunique()
})
print(df_info)

        datatypes  null_count  unique_count
Country    object           0            28
1970       object           0            18
1971       object           0            19
1972       object           0            19
1973       object           0            18
1974       object           0            18
1975       object           0            19
1976       object           0            18
1977       object           0            18
1978       object           0            18
1979       object           0            15
1980       object           0            18
1981       object           0            20
1982       object           0            22
1983       object           0            24
1984       object           0            20
1985       object           0            19
1986       object           0            20
1987       object           0            20
1988       object           0            20
1989       object           0            19
1990       object           0   

In [1304]:
df_21_2.columns = df_21_2.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [1305]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_21_2.columns if c != "country"]

df_21_2[num_cols] = (
    df_21_2[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [1306]:
df_21_2.drop_duplicates(inplace=True)
df_21_2.dropna(inplace=True)

df_info = pd.DataFrame({
    'datatypes': df_21_2.dtypes,
    'null_count': df_21_2.isnull().sum(),
    'unique_count': df_21_2.nunique()
})
print(df_info)

        datatypes  null_count  unique_count
country    object           0            23
1970      float64           0            15
1971      float64           0            17
1972      float64           0            15
1973      float64           0            14
1974      float64           0            15
1975      float64           0            16
1976      float64           0            14
1977      float64           0            13
1978      float64           0            15
1979      float64           0            12
1980      float64           0            14
1981      float64           0            17
1982      float64           0            17
1983      float64           0            19
1984      float64           0            16
1985      float64           0            15
1986      float64           0            16
1987      float64           0            16
1988      float64           0            15
1989      float64           0            15
1990      float64           0   

In [1307]:
df_21_2.head(8)

Unnamed: 0,country,1970,1971,1972,1973,1974,1975,1976,1977,1978,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,Austria,1.4,1.3,1.3,1.3,1.4,1.4,1.5,1.5,1.6,...,1.9,1.9,1.9,1.8,1.8,1.8,1.8,1.7,1.6,1.5
1,Belgium,0.7,0.7,0.8,0.9,1.0,1.1,1.3,1.3,1.4,...,2.2,2.2,2.2,2.1,2.0,2.0,2.0,1.8,1.9,1.7
2,Czechia,2.2,2.4,2.3,2.5,2.5,2.6,2.5,2.5,2.6,...,2.7,2.5,2.5,2.4,2.4,2.3,2.3,2.0,2.0,1.9
3,Denmark,1.9,2.7,2.6,2.5,2.6,2.6,2.6,2.6,2.6,...,3.4,3.4,2.9,3.0,2.6,2.6,1.8,2.7,2.2,2.1
5,Finland,1.3,1.6,1.8,1.9,2.1,2.0,2.1,2.1,2.2,...,2.5,2.5,2.5,2.5,2.4,2.4,2.4,2.4,2.2,2.0
6,Germany,1.3,1.4,1.5,1.6,1.8,1.9,2.0,1.5,1.0,...,2.1,2.1,2.0,2.0,1.9,1.8,1.8,1.7,1.7,1.6
8,Hungary,2.2,2.3,2.3,2.4,2.3,2.5,2.6,2.6,2.7,...,2.0,2.0,2.1,2.0,1.9,1.7,1.8,1.5,1.9,1.8
10,Japan,0.9,1.0,1.0,1.0,1.0,1.1,1.1,1.1,1.2,...,1.8,1.8,1.8,1.7,1.7,1.7,1.7,1.6,1.47,1.52


In [1308]:
#df_21_2.to_csv('../data/Cleaned/cleaned_divorce_rates_per_1000_oecd.csv', index=False)

In [1309]:
#df_21_2.to_sql('divorce_rates_per_1000_oecd',engine, if_exists= 'replace' , index=False)

In [1310]:
df_21_3= pd.read_csv('../data/Raw/OECD/SF_3_1_Marriage_divorce_rates_prev_marital_status_S3.csv')
#share_of_previous_marital_status
df_21_3

Unnamed: 0,Country,Previous marital status,2000,2001,2002,2003,2004,2005,2006,2008,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Australia,Single never married,759,761,755,756,762,769,773,782,...,796,797,800,805,805,801,803,801,803,807
1,Australia,Divorced,220,218,224,223,218,213,209,202,...,190,188,186,182,181,185,183,185,183,180
2,Australia,Widowed,21,21,21,21,19,18,18,17,...,15,15,14,13,14,14,14,14,13,13
3,Austria,Single never married,766,747,741,737,729,731,739,748,...,755,757,767,771,775,777,781,781,782,780
4,Austria,Divorced,222,242,247,252,259,257,249,242,...,235,234,223,220,215,215,209,210,210,216
5,Austria,Widowed,12,11,12,11,12,12,11,10,...,10,9,10,9,10,8,9,9,8,4
6,Czechia,Single never married,749,745,743,740,739,742,745,726,...,740,740,752,756,766,767,764,764,761,759
7,Czechia,Divorced,237,242,244,247,247,245,244,261,...,249,249,238,234,224,223,226,226,229,230
8,Czechia,Widowed,14,13,13,13,14,12,11,13,...,12,11,10,10,10,10,10,10,10,11
9,Denmark,Single never married,759,760,762,764,760,756,756,763,...,772,760,750,762,761,769,764,771,776,783


In [1311]:
df_info = pd.DataFrame({
    'datatypes': df_21_3.dtypes,
    'null_count': df_21_3.isnull().sum(),
    'unique_count': df_21_3.nunique()
})
print(df_info)

                        datatypes  null_count  unique_count
Country                    object           0            20
Previous marital status    object           0             3
2000                       object           0            47
2001                       object           0            51
2002                       object           0            56
2003                       object           0            50
2004                       object           0            50
2005                       object           0            52
2006                       object           0            49
2008                       object           0            47
2009                       object           0            50
2010                       object           0            49
2011                       object           0            49
2012                       object           0            53
2013                       object           0            49
2014                       object       

In [1312]:
df_21_3.columns = df_21_3.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_21_3.head()

Unnamed: 0,country,previous_marital_status,2000,2001,2002,2003,2004,2005,2006,2008,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Australia,Single never married,759,761,755,756,762,769,773,782,...,796,797,800,805,805,801,803,801,803,807
1,Australia,Divorced,220,218,224,223,218,213,209,202,...,190,188,186,182,181,185,183,185,183,180
2,Australia,Widowed,21,21,21,21,19,18,18,17,...,15,15,14,13,14,14,14,14,13,13
3,Austria,Single never married,766,747,741,737,729,731,739,748,...,755,757,767,771,775,777,781,781,782,780
4,Austria,Divorced,222,242,247,252,259,257,249,242,...,235,234,223,220,215,215,209,210,210,216


In [1313]:
# --- Ensure "country" and "previous_marital_status" are strings
df_21_3["country"] = df_21_3["country"].astype(str).str.strip().str.title()
df_21_3["previous_marital_status"] = df_21_3["previous_marital_status"].astype(str).str.strip()

# --- Identify numeric columns: everything except country and birth order ---
num_cols = [c for c in df_21_3.columns if c not in ["country", "previous_marital_status"]]
# --- Robust cleaning -> convert to float ---
df_21_3[num_cols] = (
    df_21_3[num_cols].astype(str)
    .replace({"\xa0": "", "\u202f": "", ",": "."}, regex=True)
    .replace(r"[^\d\.\-]", "", regex=True)
    .replace(r"\.\.+", ".", regex=True)
    .replace(r"^\.$|^\s*$", np.nan, regex=True)
    .apply(pd.to_numeric, errors="coerce")
)
df_21_3[num_cols] = df_21_3[num_cols].round(2)

In [1314]:
df_21_3.drop_duplicates(inplace=True)
df_21_3.dropna(inplace=True)

df_info = pd.DataFrame({
    'datatypes': df_21_3.dtypes,
    'null_count': df_21_3.isnull().sum(),
    'unique_count': df_21_3.nunique()
})
print(df_info)

                        datatypes  null_count  unique_count
country                    object           0            20
previous_marital_status    object           0             3
2000                      float64           0            47
2001                      float64           0            51
2002                      float64           0            56
2003                      float64           0            50
2004                      float64           0            50
2005                      float64           0            52
2006                      float64           0            49
2008                      float64           0            47
2009                      float64           0            50
2010                      float64           0            49
2011                      float64           0            49
2012                      float64           0            53
2013                      float64           0            49
2014                      float64       

In [1315]:
#df_21_3.to_csv('../data/Cleaned/cleaned_share_of_previous_marital_status_oecd.csv', index=False)

In [1316]:
#df_21_3.to_sql('share_of_previous_marital_status_oecd', engine, if_exists= 'replace', index =  False)

In [1317]:
df_22_1 = pd.read_csv('../data/Raw/OECD/SF3_3_A_in_private_households_by_partnership_status_S1.csv')
#hauseholds_by_partnership_status_oecd
df_22_1

Unnamed: 0,Country,20+_All_Total_Living_with_a_partner(%),20+_All_Married or in a civil or registered partnership_living_with_a_partner(%),20+_All_Cohabiting_living_with_a_partner(%),20+_All_Not living with a partner(%),20/34_Total_living_with_a_partner(%),20/34_Married or in a civil or registered partnership_living_with_a_partner(%),20/34_Cohabiting_living_with_a_partner(%),Not living with a partner_Total(%),Living with at least one parent(%)
0,Australia (c),6379,5359,1020,3621,4706,2941,1765,5294,..
1,Austria,5880,4910,970,4120,3911,2215,1697,6089,3382
2,Belgium,6215,5351,864,3785,4528,2933,1594,5472,3134
3,Canada (d),6689,5446,1243,3311,5534,3355,2179,4466,..
4,Czech Republic,5117,4539,579,4883,3078,2132,946,6922,3620
5,Denmark,6415,5002,1412,3585,5054,2186,2868,4946,1067
6,Estonia,5393,3730,1664,4607,4531,1781,2750,5469,2646
7,France,6414,4941,1472,3586,5042,2189,2853,4958,2208
8,Germany,6261,5391,869,3739,3953,2215,1739,5974,2754
9,Greece,6023,5852,171,3977,3313,2924,390,6687,4543


In [1318]:
df_info = pd.DataFrame({
    'datatypes': df_22_1.dtypes,
    'null_count': df_22_1.isnull().sum(),
    'unique_count': df_22_1.nunique()
})
print(df_info)

                                                   datatypes  null_count  \
Country                                               object           0   
20+_All_Total_Living_with_a_partner(%)                object           0   
20+_All_Married or in a civil or registered par...    object           0   
20+_All_Cohabiting_living_with_a_partner(%)           object           0   
20+_All_Not living with a partner(%)                  object           0   
20/34_Total_living_with_a_partner(%)                  object           0   
20/34_Married or in a civil or registered partn...    object           0   
20/34_Cohabiting_living_with_a_partner(%)             object           0   
Not living with a partner_Total(%)                    object           0   
Living with at least one parent(%)                    object           0   

                                                    unique_count  
Country                                                       37  
20+_All_Total_Living_with_a_p

In [1319]:
df_22_1.columns = df_22_1.columns.str.lower() \
                .str.replace(' ', '_') \


df_22_1.head()

Unnamed: 0,country,20+_all_total_living_with_a_partner(%),20+_all_married_or_in_a_civil_or_registered_partnership_living_with_a_partner(%),20+_all_cohabiting_living_with_a_partner(%),20+_all_not_living_with_a_partner(%),20/34_total_living_with_a_partner(%),20/34_married_or_in_a_civil_or_registered_partnership_living_with_a_partner(%),20/34_cohabiting_living_with_a_partner(%),not_living_with_a_partner_total(%),living_with_at_least_one_parent(%)
0,Australia (c),6379,5359,1020,3621,4706,2941,1765,5294,..
1,Austria,5880,4910,970,4120,3911,2215,1697,6089,3382
2,Belgium,6215,5351,864,3785,4528,2933,1594,5472,3134
3,Canada (d),6689,5446,1243,3311,5534,3355,2179,4466,..
4,Czech Republic,5117,4539,579,4883,3078,2132,946,6922,3620


In [1320]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_22_1.columns if c != "country"]

df_22_1[num_cols] = (
    df_22_1[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [1321]:
df_22_1["country"] = df_22_1["country"].str.replace(r"\s*\(.*?\)", "", regex=True)
print(df_22_1["country"].unique())

['Australia' 'Austria' 'Belgium' 'Canada' 'Czech Republic' 'Denmark'
 'Estonia' 'France' 'Germany' 'Greece' 'Hungary' 'Iceland' 'Ireland'
 'Italy' 'Latvia' 'Luxembourg' 'Netherlands' 'New Zealand' 'Norway'
 'Poland' 'Portugal' 'Slovak Republic' 'Slovenia' 'Spain' 'Sweden'
 'Switzerland' 'United Kingdom' 'United States' 'OECD-28 average'
 'Bulgaria' 'Croatia' 'Cyprus' 'Lithuania' 'Malta' 'Romania' 'EU average'
 'Eurozone average']


In [1322]:
df_22_1.drop_duplicates(inplace = True)
df_22_1.dropna(inplace=True)

df_info = pd.DataFrame({
    'datatypes': df_22_1.dtypes,
    'null_count': df_22_1.isnull().sum(),
    'unique_count': df_22_1.nunique()
})
print(df_info)

                                                   datatypes  null_count  \
country                                               object           0   
20+_all_total_living_with_a_partner(%)               float64           0   
20+_all_married_or_in_a_civil_or_registered_par...   float64           0   
20+_all_cohabiting_living_with_a_partner(%)          float64           0   
20+_all_not_living_with_a_partner(%)                 float64           0   
20/34_total_living_with_a_partner(%)                 float64           0   
20/34_married_or_in_a_civil_or_registered_partn...   float64           0   
20/34_cohabiting_living_with_a_partner(%)            float64           0   
not_living_with_a_partner_total(%)                   float64           0   
living_with_at_least_one_parent(%)                   float64           0   

                                                    unique_count  
country                                                       34  
20+_all_total_living_with_a_p

In [1323]:
df_22_1.sample(8)

Unnamed: 0,country,20+_all_total_living_with_a_partner(%),20+_all_married_or_in_a_civil_or_registered_partnership_living_with_a_partner(%),20+_all_cohabiting_living_with_a_partner(%),20+_all_not_living_with_a_partner(%),20/34_total_living_with_a_partner(%),20/34_married_or_in_a_civil_or_registered_partnership_living_with_a_partner(%),20/34_cohabiting_living_with_a_partner(%),not_living_with_a_partner_total(%),living_with_at_least_one_parent(%)
22,Slovenia,52.2,44.66,7.54,47.8,24.41,14.64,9.77,75.59,52.6
14,Latvia,47.87,38.55,9.32,52.13,33.13,20.71,12.41,66.87,41.78
6,Estonia,53.93,37.3,16.64,46.07,45.31,17.81,27.5,54.69,26.46
27,United States,59.5,52.4,7.1,40.5,41.9,29.75,12.15,58.11,29.68
7,France,64.14,49.41,14.72,35.86,50.42,21.89,28.53,49.58,22.08
35,EU average,59.27,50.93,8.35,40.73,39.27,24.86,14.42,60.49,37.15
24,Sweden,62.84,43.63,19.21,37.16,46.98,17.57,29.41,53.02,21.9
5,Denmark,64.15,50.02,14.12,35.85,50.54,21.86,28.68,49.46,10.67


In [1324]:
#df_22_1.to_csv('../data/Cleaned/cleaned_hauseholds_by_partnership_status_oecd.csv', index=False)

In [1325]:
#df_22_1.to_sql('hauseholds_by_partnership_status_oecd', engine, if_exists='replace', index= False)

In [1326]:
df_22_2 = pd.read_csv('../data/Raw/OECD/SF3_3_B_ by level of educational attainment_S2.csv')
#level_of_educational_attainment
df_22_2

Unnamed: 0,Country,Low_Education_Total_living_with_a_partner(%),Low_educationMarried or in a civil or registered partnership_living_with_a_partner(%),Low_education_Cohabiting_living_with_a_partner(%),Not living with a partner_Low_education(%),Medium education_Total_Living with a partner(%),Medium education_Married or in a civil or registered partnership_Living with a partner(%),Medium education_Cohabiting_Living with a partner(%),Not living with a partner_Medium education(%),High education_Total_Living with a partner(%),High education_Married or in a civil or registered partnership_Living with a partner(%),High education_Cohabiting_Living with a partner(%),Not living with a partner_High education(%)
0,Austria,5681,5049,632,4319,5927,4873,1054,,6003,4838,1165,3997
1,Belgium,6228,5611,617,3772,6079,4980,1099,,6709,5658,1051,3291
2,Czech Republic,4081,3655,426,5919,5399,4787,612,4601.0,5729,5026,703,4271
3,Estonia,4217,2639,1578,5783,5441,3661,1779,4559.0,6014,4445,1569,3986
4,France,6112,5193,918,3888,6568,4917,1651,3432.0,6558,4660,1898,3442
5,Germany,5446,4879,567,4554,6238,5313,925,3762.0,6889,5916,974,3111
6,Greece,6381,6288,93,3619,5700,5488,212,4300.0,5833,5570,263,4167
7,Hungary,5033,4038,995,4967,5794,4678,1115,4206.0,5956,5102,855,4044
8,Iceland,5186,4102,1084,4814,5831,4657,1174,4169.0,6972,5453,1519,3028
9,Latvia,3627,2592,1035,6373,4932,3954,978,5068.0,5291,4539,752,4709


In [1327]:
df_info = pd.DataFrame({
    'datatypes': df_22_2.dtypes,
    'null_count': df_22_2.isnull().sum(),
    'unique_count': df_22_2.nunique()
})
print(df_info)

                                                   datatypes  null_count  \
Country                                               object           0   
Low_Education_Total_living_with_a_partner(%)          object           0   
Low_educationMarried or in a civil or registere...    object           0   
Low_education_Cohabiting_living_with_a_partner(%)     object           0   
Not living with a partner_Low_education(%)            object           0   
Medium education_Total_Living with a partner(%)       object           0   
Medium education_Married or in a civil or regis...    object           0   
Medium education_Cohabiting_Living with a partn...    object           0   
Not living with a partner_Medium education(%)         object           2   
High education_Total_Living with a partner(%)         object           0   
High education_Married or in a civil or registe...    object           0   
High education_Cohabiting_Living with a partner(%)    object           0   
Not living w

In [1328]:
df_22_2.columns = df_22_2.columns.str.lower() \
                .str.replace(' ', '_') \


df_22_2.head()

Unnamed: 0,country,low_education_total_living_with_a_partner(%),low_educationmarried_or_in_a_civil_or_registered_partnership_living_with_a_partner(%),low_education_cohabiting_living_with_a_partner(%),not_living_with_a_partner_low_education(%),medium_education_total_living_with_a_partner(%),medium_education_married_or_in_a_civil_or_registered_partnership_living_with_a_partner(%),medium_education_cohabiting_living_with_a_partner(%),not_living_with_a_partner_medium_education(%),high_education_total_living_with_a_partner(%),high_education_married_or_in_a_civil_or_registered_partnership_living_with_a_partner(%),high_education_cohabiting_living_with_a_partner(%),not_living_with_a_partner_high_education(%)
0,Austria,5681,5049,632,4319,5927,4873,1054,,6003,4838,1165,3997
1,Belgium,6228,5611,617,3772,6079,4980,1099,,6709,5658,1051,3291
2,Czech Republic,4081,3655,426,5919,5399,4787,612,4601.0,5729,5026,703,4271
3,Estonia,4217,2639,1578,5783,5441,3661,1779,4559.0,6014,4445,1569,3986
4,France,6112,5193,918,3888,6568,4917,1651,3432.0,6558,4660,1898,3442


In [1329]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_22_2.columns if c != "country"]

df_22_2[num_cols] = (
    df_22_2[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [1330]:
df_22_2["country"] = df_22_2["country"].str.replace(r"\s*\(.*?\)", "", regex=True)
print(df_22_2["country"].unique())

['Austria' 'Belgium' 'Czech Republic' 'Estonia' 'France' 'Germany'
 'Greece' 'Hungary' 'Iceland' 'Latvia' 'Luxembourg' 'Norway' 'Poland'
 'Portugal' 'Slovenia' 'Spain' 'Sweden' 'United Kingdom' 'OECD-19 average'
 'Bulgaria' 'Croatia' 'Cyprus' 'Lithuania' 'Malta' 'Romania' 'EU average'
 'Eurozone average']


In [1331]:
df_22_2.drop_duplicates(inplace=True)
df_22_2.dropna(inplace=True)


df_info = pd.DataFrame({
    'datatypes': df_22_2.dtypes,
    'null_count': df_22_2.isnull().sum(),
    'unique_count': df_22_2.nunique()
})
print(df_info)

                                                   datatypes  null_count  \
country                                               object           0   
low_education_total_living_with_a_partner(%)         float64           0   
low_educationmarried_or_in_a_civil_or_registere...   float64           0   
low_education_cohabiting_living_with_a_partner(%)    float64           0   
not_living_with_a_partner_low_education(%)           float64           0   
medium_education_total_living_with_a_partner(%)      float64           0   
medium_education_married_or_in_a_civil_or_regis...   float64           0   
medium_education_cohabiting_living_with_a_partn...   float64           0   
not_living_with_a_partner_medium_education(%)        float64           0   
high_education_total_living_with_a_partner(%)        float64           0   
high_education_married_or_in_a_civil_or_registe...   float64           0   
high_education_cohabiting_living_with_a_partner(%)   float64           0   
not_living_w

In [1332]:
#df_22_2.to_csv('../data/Cleaned/cleaned_level_of_educational_attainment_oecd.csv', index=False)

In [1333]:
#df_22_2.to_sql('level_of_educational_attainment_oecd',engine, if_exists='replace', index= False)

In [1334]:
df_6666 = pd.read_csv('../data/Raw/OECD/OECD_df_famliy_selected.csv')
df_6666

Unnamed: 0,STRUCTURE,STRUCTURE_ID,STRUCTURE_NAME,ACTION,COU,Country,SEX,Sex,IND,Indicator,...,OBS_VALUE,Observation Value,OBS_STATUS,Observation Status,UNIT_MEASURE,Unit of Measures,UNIT_MULT,Multiplier,BASE_PER,Base reference period
0,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,LVA,Latvia,TOTAL,Total,FAM3,Share of births outside of marriage (% of all ...,...,39.5,,A,,PC,Percentage,0,Units,,
1,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,GRC,Greece,TOTAL,Total,FAM3,Share of births outside of marriage (% of all ...,...,11.1,,A,,PC,Percentage,0,Units,,
2,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,CHL,Chile,TOTAL,Total,FAM3,Share of births outside of marriage (% of all ...,...,74.8,,A,,PC,Percentage,0,Units,,
3,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,NLD,Netherlands,TOTAL,Total,FAM3,Share of births outside of marriage (% of all ...,...,51.9,,A,,PC,Percentage,0,Units,,
4,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,LTU,Lithuania,TOTAL,Total,FAM3,Share of births outside of marriage (% of all ...,...,26.4,,A,,PC,Percentage,0,Units,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
500,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,COL,Colombia,TOTAL,Total,FAM2,Mean age of women at childbirth,...,30.4,,A,,YR,Years,0,Units,,
501,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,OAVG,OECD - Average,TOTAL,Total,FAM2,Mean age of women at childbirth,...,30.5,,A,,YR,Years,0,Units,,
502,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,OAVG,OECD - Average,TOTAL,Total,FAM2,Mean age of women at childbirth,...,30.6,,A,,YR,Years,0,Units,,
503,DATAFLOW,OECD:DF_FAMILY(1.0),Family Database,I,OAVG,OECD - Average,TOTAL,Total,FAM2,Mean age of women at childbirth,...,30.7,,A,,YR,Years,0,Units,,
