In [1]:
import pandas as pd
import os, re
from pathlib import Path
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from sqlalchemy import create_engine, types
from sqlalchemy import text 
from openpyxl import load_workbook
from pathlib import Path

In [2]:
from dotenv import dotenv_values

config = dotenv_values()

# define variables for the login
pg_user = config['POSTGRES_USER']  # align the key label with your .env file !
pg_host = config['POSTGRES_HOST']
pg_port = config['POSTGRES_PORT']
pg_db = config['POSTGRES_DB']
pg_schema = config['POSTGRES_SCHEMA']
pg_pass = config['POSTGRES_PASS']

In [3]:
url = f'postgresql://{pg_user}:{pg_pass}@{pg_host}:{pg_port}/{pg_db}'
engine = create_engine(url, echo=False)

In [4]:
my_schema = 'team_5' # update it to your schema

with engine.begin() as conn: 
    result = conn.execute(text(f'SET search_path TO {my_schema};'))

In [5]:
df_1= pd.read_csv('../data/Raw/World_Marriage_Dataset.csv')

In [6]:
df_1.drop(columns=["Sr.No."], inplace=True)

In [7]:
df_1.columns = df_1.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [8]:
df_1.rename(columns={
    "agegroup": "age_group",
    "maritalstatus": "marital_status",
    "dataprocess": "data_process",
    "Data Collection (Start Year)": "data_collection_start_year",
    "Data Collection (End Year)": "data_collection_end_year",
    "Data Source": "data_source",
    "Country": "country",
    "Sex": "sex"
}, inplace=True)

In [9]:
df_1.drop_duplicates(inplace=True)
df_1.dropna(inplace=True)

df_1['data_collection_start_year'] = df_1['data_collection_start_year'].astype(str).str.replace(',', '').astype(int)
df_1['data_collection_end_year'] = df_1['data_collection_end_year'].astype(str).str.replace(',', '').astype(int)

In [10]:
df_info = pd.DataFrame({
    'datatypes': df_1.dtypes,
    'null_count': df_1.isnull().sum(),
    'unique_count': df_1.nunique()
})
print(df_info)

                           datatypes  null_count  unique_count
country                       object           0           235
age_group                     object           0            63
sex                           object           0             2
marital_status                object           0            35
data_process                  object           0             6
data_collection_start_year     int32           0            62
data_collection_end_year       int32           0            60
data_source                   object           0            15


In [11]:
#df_1.to_csv("cleaned_world_marriage.csv", index=False)

In [12]:
#df_1.to_sql('world_marriage', engine, if_exists='replace', index=False)

In [13]:
s_1 = ('../data/Cleaned/cleaned_world_marriage.csv')

In [14]:
AGE_COL = "age_group"

# 0) Ensure s_1 is a DataFrame
if isinstance(s_1, (str, Path)):
    s_1 = pd.read_csv(s_1)
elif isinstance(s_1, tuple) and len(s_1) == 1 and isinstance(s_1[0], (str, Path)):
    s_1 = pd.read_csv(s_1[0])

# 1) Normalize age labels; keep [+75]; map any 65+ to [65-69]

def norm_age(x):
    if pd.isna(x): return x
    x = str(x).replace("–","-").replace("—","-").replace("to","-").replace("_","-")
    x = re.sub(r"[()]", "", x)
    x = re.sub(r"\s+", "", x)
    if re.search(r"\[\+75\]|\[75\+\]|75\+|\+75", x):   # preserve 75+
        return "[+75]"
    if re.fullmatch(r"\+?65\+?|\[?\+65\]?|\[?65\+\]?", x, flags=re.I):  # merge 65+
        return "[65-69]"
    m = re.match(r"^\[?(\d{1,3})-(\d{1,3})\]?$", x)   # standard ranges
    if m:
        a, b = map(int, m.groups())
        return f"[{a}-{b}]"
    m = re.search(r"(\d{1,3})", x)                    # fallback: first number
    return f"[{m.group(1)}]" if m else x

s_1[AGE_COL] = s_1[AGE_COL].astype(str).map(norm_age)

# 2) Keep top-14 age buckets by frequency (delete all others)
top14 = s_1[AGE_COL].value_counts(dropna=False).nlargest(14).index.tolist()
s_1 = s_1[s_1[AGE_COL].isin(top14)].copy()

# 3) Natural ordering (put [+75] last)
def start_num(lbl):
    return 10**9 if lbl == "[+75]" else int(re.search(r"\d+", str(lbl)).group())
cats = sorted(top14, key=start_num)
s_1[AGE_COL] = pd.Categorical(s_1[AGE_COL], categories=cats, ordered=True)
s_1 = s_1.sort_values(AGE_COL).reset_index(drop=True)

print("Kept age buckets (14):", list(s_1[AGE_COL].cat.categories))

Kept age buckets (14): ['[10-14]', '[15-19]', '[20-24]', '[25-29]', '[30-34]', '[35-39]', '[40-44]', '[45-49]', '[50-54]', '[55-59]', '[60-64]', '[65-69]', '[70-74]', '[+75]']


In [15]:
df_info = pd.DataFrame({
    'datatypes': s_1.dtypes,
    'null_count': s_1.isnull().sum(),
    'unique_count': s_1.nunique()
})
print(df_info)

                           datatypes  null_count  unique_count
country                       object           0           235
age_group                   category           0            14
sex                           object           0             2
marital_status                object           0            35
data_process                  object           0             6
data_collection_start_year     int64           0            62
data_collection_end_year       int64           0            60
data_source                   object           0            15


In [16]:
#s_1.to_csv('../data/Prep/prep_world_marriage.csv', index= False)

In [17]:
#s_1.to_sql('prep_world_marriage', engine, if_exists='replace', index=False)

In [18]:
df_2 = pd.read_csv('../data/Raw/age-at-marriage-women.csv')

In [19]:
df_2.columns = df_2.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [20]:
df_2 = df_2.drop(columns=['1005564annotations'])

df_2.rename(columns={
    "entity": "country",   
}, inplace=True)

In [21]:
df_2.drop_duplicates(inplace=True)
df_2.dropna(inplace=True)


In [22]:
df_2['year'] = df_2['year'].astype(str).str.replace(',', '').astype(int)

In [23]:
df_info = pd.DataFrame({
    'datatypes': df_2.dtypes,
    'null_count': df_2.isnull().sum(),
    'unique_count': df_2.nunique()
})
print(df_info)

                                    datatypes  null_count  unique_count
country                                object           0            41
code                                   object           0            41
year                                    int32           0            32
mean_age_of_women_at_first_marriage   float64           0           179


In [24]:
#df_2.to_csv("cleaned_age_at_marriage_women.csv", index=False)

In [25]:
#df_2.to_sql('age_at_marriage_women', engine, if_exists='replace', index=False)

In [26]:
df_3= pd.read_csv('../data/Raw/marriage-rate-per-1000-inhabitants.csv')

In [27]:
df_3.columns = df_3.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [28]:
df_3.rename(columns={
    "entity": "country",   
}, inplace=True)

In [29]:
df_3['year'] = df_3['year'].astype(str).str.replace(',', '').astype(int)

In [30]:
df_3.drop_duplicates(inplace=True)
df_3.dropna(inplace=True)


In [31]:
df_info = pd.DataFrame({
    'datatypes': df_3.dtypes,
    'null_count': df_3.isnull().sum(),
    'unique_count': df_3.nunique()
})
print(df_info)

                                              datatypes  null_count  \
country                                          object           0   
code                                             object           0   
year                                              int32           0   
crude_marriage_rate_marriages_per_1000_people   float64           0   

                                               unique_count  
country                                                  45  
code                                                     45  
year                                                    127  
crude_marriage_rate_marriages_per_1000_people           109  


In [32]:
#df_3.to_csv("cleaned_marriage-rate-per-1000-inhabitants.csv", index=False)

In [33]:
#df_3.to_sql('married_rate_per_1000', engine, if_exists='replace', index=False)

In [34]:
df_4= pd.read_csv('../data/Raw/marriage-rates-in-1990-vs-2020.csv')

In [35]:
df_4.columns = df_4.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [36]:
df_4 = df_4.drop(columns=['worldregionsaccordingtoowid'])

df_4.rename(columns={
    "crudemarriageratemarriagesper1000people": "crude_marriage_rate",
    "crudemarriageratemarriagesper1000people1": "crude_marriage_rate_people1",
    "year1": "year_1",
    "entity": "country"
}, inplace=True)

In [37]:
df_4.drop_duplicates(inplace=True)
df_4.dropna(inplace=True)

In [38]:
df_4['year_1'] = pd.to_numeric(df_4['year_1'], errors='coerce').astype('Int64')

In [39]:
df_info = pd.DataFrame({
    'datatypes': df_4.dtypes,
    'null_count': df_4.isnull().sum(),
    'unique_count': df_4.nunique()
})
print(df_info)

                            datatypes  null_count  unique_count
country                        object           0            38
code                           object           0            38
year                            int64           0            61
crude_marriage_rate           float64           0           101
crude_marriage_rate_people1   float64           0            28
year_1                          Int64           0             1


In [40]:
#df_4.to_csv("cleaned_marriage-rates-in-1990-vs-2020.csv", index=False)

In [41]:
#df_4.to_sql('marriage_rates_in_1990_vs_2020', engine, if_exists='replace', index=False)

In [42]:
df_5 = pd.read_csv('../data/Raw/share-of-births-outside-marriage.csv')

In [43]:
df_5.columns = df_5.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [44]:

df_5.rename(columns={
    "shareofbirthsoutsideofmarriageofallbirths": "share_of_births_outside_of_marriage",
    "entity": "country"
}, inplace=True)

df_5.drop_duplicates(inplace=True)
df_5.dropna(inplace=True)

In [45]:
df_info = pd.DataFrame({
    'datatypes': df_5.dtypes,
    'null_count': df_5.isnull().sum(),
    'unique_count': df_5.nunique()
})
print(df_info)

                                    datatypes  null_count  unique_count
country                                object           0            42
code                                   object           0            42
year                                    int64           0            62
share_of_births_outside_of_marriage   float64           0           610


In [46]:
#df_5.to_csv("cleaned_share-of-births-outside-marriage.csv", index=False)

In [47]:
#df_5.to_sql('share_of_births_outside_marriage', engine, if_exists='replace', index=False)

In [48]:
df_6 = pd.read_csv('../data/Raw/share-of-men-in-england-and-wales-who-have-ever-married-by-age.csv')
df_6

Unnamed: 0,Entity,Code,Year,Proportions of men or women who had ever married by a certain age for 1900 birth cohort,Proportions of men or women who had ever married by a certain age for 1920 birth cohort,Proportions of men or women who had ever married by a certain age for 1940 birth cohort,Proportions of men or women who had ever married by a certain age for 1960 birth cohort,Proportions of men or women who had ever married by a certain age for 1970 birth cohort,Proportions of men or women who had ever married by a certain age for 1980 birth cohort,Proportions of men or women who had ever married by a certain age for 1990 birth cohort,Proportions of men or women who had ever married by a certain age for 2000 birth cohort
0,Men,,17,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0
1,Men,,18,0.1,0.1,0.4,0.6,0.1,0.0,0.0,0.0
2,Men,,19,0.8,0.6,2.0,2.5,0.7,0.3,0.1,0.0
3,Men,,20,2.4,2.2,6.0,6.2,1.9,0.7,0.3,0.1
4,Men,,21,6.1,7.4,13.6,11.9,3.9,1.4,0.6,0.2
...,...,...,...,...,...,...,...,...,...,...,...
63,Women,,46,84.5,91.6,95.5,86.9,75.0,,,
64,Women,,47,84.8,91.7,95.6,87.0,75.4,,,
65,Women,,48,85.0,91.8,95.6,87.2,75.7,,,
66,Women,,49,85.2,91.9,95.7,87.3,76.0,,,


In [49]:
df_6.columns = df_6.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

df_6.drop_duplicates(inplace=True)

df_6.head()

Unnamed: 0,entity,code,year,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort
0,Men,,17,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0
1,Men,,18,0.1,0.1,0.4,0.6,0.1,0.0,0.0,0.0
2,Men,,19,0.8,0.6,2.0,2.5,0.7,0.3,0.1,0.0
3,Men,,20,2.4,2.2,6.0,6.2,1.9,0.7,0.3,0.1
4,Men,,21,6.1,7.4,13.6,11.9,3.9,1.4,0.6,0.2


In [50]:
df_6 = df_6.drop(columns=['code','proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort'])

df_6.rename(columns={
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort": "1900_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort": "1920_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort": "1940_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort": "1960_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort": "1970_birthcohort",
    "entity": "sex"
}, inplace=True)
df_6

Unnamed: 0,sex,year,1900_birthcohort,1920_birthcohort,1940_birthcohort,1960_birthcohort,1970_birthcohort
0,Men,17,0.0,0.0,0.0,0.1,0.0
1,Men,18,0.1,0.1,0.4,0.6,0.1
2,Men,19,0.8,0.6,2.0,2.5,0.7
3,Men,20,2.4,2.2,6.0,6.2,1.9
4,Men,21,6.1,7.4,13.6,11.9,3.9
...,...,...,...,...,...,...,...
63,Women,46,84.5,91.6,95.5,86.9,75.0
64,Women,47,84.8,91.7,95.6,87.0,75.4
65,Women,48,85.0,91.8,95.6,87.2,75.7
66,Women,49,85.2,91.9,95.7,87.3,76.0


In [51]:
df_6.dropna(inplace=True)
df_6.describe

<bound method NDFrame.describe of       sex  year  1900_birthcohort  1920_birthcohort  1940_birthcohort  \
0     Men    17               0.0               0.0               0.0   
1     Men    18               0.1               0.1               0.4   
2     Men    19               0.8               0.6               2.0   
3     Men    20               2.4               2.2               6.0   
4     Men    21               6.1               7.4              13.6   
..    ...   ...               ...               ...               ...   
63  Women    46              84.5              91.6              95.5   
64  Women    47              84.8              91.7              95.6   
65  Women    48              85.0              91.8              95.6   
66  Women    49              85.2              91.9              95.7   
67  Women    50              85.4              92.0              95.7   

    1960_birthcohort  1970_birthcohort  
0                0.1               0.0  
1      

In [52]:
df_info = pd.DataFrame({
    'datatypes': df_6.dtypes,
    'null_count': df_6.isnull().sum(),
    'unique_count': df_6.nunique()
})
print(df_info)

                 datatypes  null_count  unique_count
sex                 object           0             2
year                 int64           0            34
1900_birthcohort   float64           0            66
1920_birthcohort   float64           0            61
1940_birthcohort   float64           0            62
1960_birthcohort   float64           0            67
1970_birthcohort   float64           0            65


In [53]:
df_6.sample(12)

Unnamed: 0,sex,year,1900_birthcohort,1920_birthcohort,1940_birthcohort,1960_birthcohort,1970_birthcohort
26,Men,43,91.1,90.8,91.7,79.5,66.7
34,Women,17,0.1,0.3,1.0,1.3,0.4
65,Women,48,85.0,91.8,95.6,87.2,75.7
11,Men,28,62.7,66.3,77.7,56.8,33.1
24,Men,41,90.2,90.3,91.4,78.6,65.0
3,Men,20,2.4,2.2,6.0,6.2,1.9
6,Men,23,21.4,26.8,38.1,26.2,10.5
30,Men,47,92.3,91.6,92.2,81.0,69.4
5,Men,22,13.5,16.8,25.8,18.8,6.8
2,Men,19,0.8,0.6,2.0,2.5,0.7


In [54]:
#df_6.to_csv("cleaned_share-of-men-in-england-and-wales-who-have-ever-married-by-age.csv", index=False)

In [55]:
#df_6.to_sql('men_in_england_and_wales_married_by_age', engine, if_exists='replace', index=False)

In [56]:
df_7 = pd.read_csv('../data/Raw/share-of-births-outside-marriage.csv')

In [57]:
df_7.columns = df_7.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [58]:
df_7.rename(columns={
    "shareofsingleparenthouseholds": "share_of_single_parent_households",
    "entity": "country"
}, inplace=True)

df_7.drop_duplicates(inplace=True)
df_7.dropna(inplace=True)
df_7.sample(5)

Unnamed: 0,country,code,year,shareofbirthsoutsideofmarriageofallbirths
1147,Latvia,LVA,2018,39.5
238,Canada,CAN,1971,9.0
361,Croatia,HRV,1997,7.3
885,Iceland,ISL,1981,41.2
367,Croatia,HRV,2003,10.1


In [59]:
df_info = pd.DataFrame({
    'datatypes': df_7.dtypes,
    'null_count': df_7.isnull().sum(),
    'unique_count': df_7.nunique()
})
print(df_info)

                                          datatypes  null_count  unique_count
country                                      object           0            42
code                                         object           0            42
year                                          int64           0            62
shareofbirthsoutsideofmarriageofallbirths   float64           0           610


In [60]:
#df_7.to_csv("cleaned_share-of-single-parent-households.csv", index=False)

In [61]:
#df_7.to_sql('single_parent_households', engine, if_exists='replace', index=False)

In [62]:
df_8 = pd.read_csv('../data/Raw/share-of-women-in-england-and-wales-who-have-ever-married-by-age.csv')

In [63]:
df_8.columns = df_8.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [64]:
df_8['code'] = df_8['code'].fillna('GBR')
df_8.sample(5)

Unnamed: 0,entity,code,year,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort
12,Men,GBR,29,68.3,72.0,81.0,60.6,37.4,21.5,16.4,
31,Men,GBR,48,92.5,91.7,92.3,81.3,69.9,,,
38,Women,GBR,21,14.6,26.1,42.2,31.5,12.7,4.8,1.7,0.6
52,Women,GBR,35,79.0,88.3,93.8,82.8,66.7,54.2,,
57,Women,GBR,40,82.4,90.5,95.0,85.3,72.0,61.6,,


In [65]:
df_8 = df_8.drop(columns=['code','proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort'])

df_8.rename(columns={
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort": "1900_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort": "1920_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort": "1940_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort": "1960_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort": "1970_birthcohort",
    "entity": "sex"
}, inplace=True)

df_8.drop_duplicates(inplace=True)
df_8.dropna(inplace=True)
df_8.sample(5)

Unnamed: 0,sex,year,1900_birthcohort,1920_birthcohort,1940_birthcohort,1960_birthcohort,1970_birthcohort
66,Women,49,85.2,91.9,95.7,87.3,76.0
62,Women,45,84.2,91.4,95.4,86.7,74.5
41,Women,24,40.8,56.1,75.5,55.1,29.7
32,Men,49,92.7,91.8,92.3,81.5,70.3
31,Men,48,92.5,91.7,92.3,81.3,69.9


In [66]:
df_info = pd.DataFrame({
    'datatypes': df_8.dtypes,
    'null_count': df_8.isnull().sum(),
    'unique_count': df_8.nunique()
})
print(df_info)

                 datatypes  null_count  unique_count
sex                 object           0             2
year                 int64           0            34
1900_birthcohort   float64           0            66
1920_birthcohort   float64           0            61
1940_birthcohort   float64           0            62
1960_birthcohort   float64           0            67
1970_birthcohort   float64           0            65


In [67]:
#df_8.to_csv("cleaned_share-of-women-in-england-and-wales-who-have-ever-married-by-age.csv", index=False)

In [68]:
#df_8.to_sql('women_in_england_and_wales_married_by_age', engine, if_exists='replace', index=False)

In [69]:
#pip install openpyxl pywin32

In [70]:
df_excel_1 = pd.read_excel('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx')

In [71]:
#all_sheets = pd.read_excel('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx', sheet_name=None)

In [72]:
xls_1 = pd.ExcelFile('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx')
print(xls_1.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'MARITAL_STATUS_BY_AGE', 'CURRENTLY MARRIED', 'EVER_MARRIED', 'SMAM']


In [73]:
excel_1 = '../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx'

# Output directory (make sure it exists)
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)

# List of sheets you want to extract
sheets_to_extract = ['MARITAL_STATUS_BY_AGE', 'CURRENTLY MARRIED', 'EVER_MARRIED', 'SMAM']

In [74]:
"""for sheet in sheets_to_extract:
    # Read just this sheet into a DataFrame
    df_excel_1 = pd.read_excel(excel_1, sheet_name=sheet)
    
    # Optional: Clean the filename (replace spaces with underscores, etc.)
    csv_name = sheet.replace(' ', '_').lower() + '.csv'
    csv_path = os.path.join(output_dir, csv_name)
    
    # Save the DataFrame as CSV
    df_excel_1.to_csv(csv_path, index=False)
    print(f"Saved: {csv_path}")
"""

'for sheet in sheets_to_extract:\n    # Read just this sheet into a DataFrame\n    df_excel_1 = pd.read_excel(excel_1, sheet_name=sheet)\n    \n    # Optional: Clean the filename (replace spaces with underscores, etc.)\n    csv_name = sheet.replace(\' \', \'_\').lower() + \'.csv\'\n    csv_path = os.path.join(output_dir, csv_name)\n    \n    # Save the DataFrame as CSV\n    df_excel_1.to_csv(csv_path, index=False)\n    print(f"Saved: {csv_path}")\n'

In [75]:
xls_2 = pd.ExcelFile('../data/Raw/undesa_pd_2019_world_fertility_dataset.xlsx')
print(xls_2.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'FERTILITY INDICATORS']


In [76]:
excel_2 = '../data/Raw/undesa_pd_2019_world_fertility_dataset.xlsx'
sheet_name = 'FERTILITY INDICATORS'
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)

df_excel_2 = pd.read_excel(excel_2, sheet_name=sheet_name)


In [77]:
"""csv_name = sheet_name.replace(' ', '_').lower() + '.csv'
csv_path = os.path.join(output_dir, csv_name)
df_excel_2.to_csv(csv_path, index=False)
print(f"Saved: {csv_path}")
"""

'csv_name = sheet_name.replace(\' \', \'_\').lower() + \'.csv\'\ncsv_path = os.path.join(output_dir, csv_name)\ndf_excel_2.to_csv(csv_path, index=False)\nprint(f"Saved: {csv_path}")\n'

In [78]:
xls_3 = pd.ExcelFile('../data/Raw/undesa_pd_ds_1970-2030_fp_rev-2024_rev.xlsx')
print(xls_3.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'Countries', 'Regions']


In [79]:
excel_3 = '../data/Raw/undesa_pd_ds_1970-2030_fp_rev-2024_rev.xlsx'
sheets_to_extract = ['Countries', 'Regions']
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)


In [80]:
"""
for sheet in sheets_to_extract:
    df = pd.read_excel(excel_3, sheet_name=sheet)
    csv_name = sheet.replace(' ', '_').lower() + '.csv'
    csv_path = os.path.join(output_dir, csv_name)
    df.to_csv(csv_path, index=False)
    print(f"Saved: {csv_path}")

"""

'\nfor sheet in sheets_to_extract:\n    df = pd.read_excel(excel_3, sheet_name=sheet)\n    csv_name = sheet.replace(\' \', \'_\').lower() + \'.csv\'\n    csv_path = os.path.join(output_dir, csv_name)\n    df.to_csv(csv_path, index=False)\n    print(f"Saved: {csv_path}")\n\n'

In [81]:
df_9 = pd.read_csv('../data/Raw/unpopulation_dataportal_20250728095844.csv')
df_9.sample(5)

Unnamed: 0,IndicatorId,IndicatorName,IndicatorShortName,Source,SourceYear,Author,LocationId,Location,Iso2,Iso3,...,AgeStart,AgeEnd,Age,CategoryId,Category,EstimateTypeId,EstimateType,EstimateMethodId,EstimateMethod,Value
6670,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,231,Ethiopia,ET,ETH,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,67.39
23216,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,798,Tuvalu,TV,TUV,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,67.26
11050,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,388,Jamaica,JM,JAM,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,37.71
24963,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,887,Yemen,YE,YEM,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,63.92
15694,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,533,Aruba,AW,ABW,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,49.9


In [82]:
df_9.columns = df_9.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_9.sample(5)

Unnamed: 0,indicatorid,indicatorname,indicatorshortname,source,sourceyear,author,locationid,location,iso2,iso3,...,agestart,ageend,age,categoryid,category,estimatetypeid,estimatetype,estimatemethodid,estimatemethod,value
6177,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,214,Dominican Republic,DO,DOM,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,57.81
15125,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,516,Namibia,,NAM,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,55.99
8066,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,270,Gambia,GM,GMB,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,82.87
21308,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,732,Western Sahara,EH,ESH,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,3,Projection,78.51
11798,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,412,Kosovo (under UNSC res. 1244),XK,XKX,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,54.48


In [83]:
df_9 = df_9.drop(columns=['indicatorid','indicatorshortname',
    'source',
    'author', 'locationid', 'iso2','estimatetypeid','category','categoryid','agestart','ageend','author','ageid', 'estimatetype','variantid','sexid','timeid'])

df_9.rename(columns={
    "sourceyear": "year",
    "location": "country",
    "estimatemethodid": "estimate_method",
    "iso3": "code",
}, inplace=True)



In [84]:
df_9.drop_duplicates(inplace=True)
df_9.dropna(inplace = True)

df_9

Unnamed: 0,indicatorname,year,country,code,time,variant,sex,age,estimate_method,estimatemethod,value
0,Currently married (Percent),2024,Afghanistan,AFG,1970,Median,Female,15-49,2,Interpolation,80.94
2,Currently married (Percent),2024,Afghanistan,AFG,1971,Median,Female,15-49,2,Interpolation,80.90
4,Currently married (Percent),2024,Afghanistan,AFG,1972,Median,Female,15-49,2,Interpolation,80.87
6,Currently married (Percent),2024,Afghanistan,AFG,1973,Median,Female,15-49,2,Interpolation,80.84
8,Currently married (Percent),2024,Afghanistan,AFG,1974,Median,Female,15-49,2,Interpolation,80.53
...,...,...,...,...,...,...,...,...,...,...,...
25078,Currently married (Percent),2024,Zambia,ZMB,2021,Median,Female,15-49,3,Projection,54.31
25080,Currently married (Percent),2024,Zambia,ZMB,2022,Median,Female,15-49,3,Projection,53.82
25082,Currently married (Percent),2024,Zambia,ZMB,2023,Median,Female,15-49,3,Projection,53.35
25084,Currently married (Percent),2024,Zambia,ZMB,2024,Median,Female,15-49,3,Projection,52.91


In [85]:
df_info = pd.DataFrame({
    'datatypes': df_9.dtypes,
    'null_count': df_9.isnull().sum(),
    'unique_count': df_9.nunique()
})
print(df_info)

                datatypes  null_count  unique_count
indicatorname      object           0             1
year                int64           0             1
country            object           0           224
code               object           0           224
time                int64           0            56
variant            object           0             1
sex                object           0             1
age                object           0             1
estimate_method     int64           0             2
estimatemethod     object           0             2
value             float64           0          3867


In [86]:
#df_9.to_csv("cleaned_unpopulation_dataportal.csv", index=False)

In [87]:
#df_9.to_sql('unpopulation_dataportal', engine, if_exists='replace', index=False)

In [88]:
df_10 = pd.read_csv('../data/processed/countries_un.csv',  header=5, low_memory=False)

In [89]:
df_10.columns = (
    df_10.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
)
df_10.sample(10)

Unnamed: 0,countryorarea,isocode,indicator,year,agegroup,percentage,number,dataprocess
33908,Benin,204,Married or in-union women,1996,35-39,93.454545,148.10396,Estimate
66245,Jordan,400,Married or in-union women,1988,40-44,85.525882,45.231646,Estimate
37276,El Salvador,222,Married or in-union women,2012,35-39,69.43147,149.466167,Estimate
8440,Armenia,51,Married or in-union women,1972,15-19,13.073333,18.06264,Estimate
104866,Guinea-Bissau,624,Married or in-union women,2037,25-29,60.710334,74.395353,Projection
138047,United Kingdom,826,Married or in-union women,1972,15-49,71.833902,8943.516578,Estimate
7293,Bahrain,48,Married or in-union women,1990,40-44,84.2,6.819358,Estimate
135700,Uganda,800,Married or in-union women,2003,35-39,78.413043,480.59472,Estimate
121837,Spain,724,Married or in-union women,1971,40-44,84.230976,993.546473,Estimate
137343,The former Yugoslav Republic of Macedonia,807,Married or in-union women,2046,15-49,61.249653,183.404429,Projection


In [90]:
df_10.rename(columns={
    "dataprocess": "data_process",
    "countryorarea": "country"
}, inplace=True)

df_10.drop_duplicates(inplace=True)
df_10.sample(5)

Unnamed: 0,country,isocode,indicator,year,agegroup,percentage,number,data_process
57093,"China, Hong Kong SAR",344,Married or in-union women,1978,40-44,89.21732,101.377641,Estimate
26745,Colombia,170,Married or in-union women,1992,20-24,44.7575,732.02055,Estimate
107599,Romania,642,Married or in-union women,1973,15-49,73.641761,4007.759518,Estimate
106372,Qatar,634,Married or in-union women,1982,35-39,86.24,5.30376,Estimate
20724,Cameroon,120,Married or in-union women,2049,35-39,70.848395,1189.971768,Projection


In [91]:
for col in ['percentage', 'number']:
    if col in df_10.columns:
        df_10[col] = (
            df_10[col]
            .astype(str)
            .str.replace(',', '.', regex=False)
            .str.extract(r'([-+]?[0-9]*\.?[0-9]+)', expand=False)
            .astype(float)
            .round(2)
        )

In [92]:
unnamed_cols = [col for col in df_10.columns if 'unnamed' in col.lower()]
df_10.drop(columns=unnamed_cols, inplace=True)

In [93]:
df_10.dropna(inplace=True)

In [94]:
df_info = pd.DataFrame({
    'datatypes': df_10.dtypes,
    'null_count': df_10.isnull().sum(),
    'unique_count': df_10.nunique()
})
print(df_info)

             datatypes  null_count  unique_count
country         object           0           225
isocode          int64           0           225
indicator       object           0             1
year             int64           0            81
agegroup        object           0             8
percentage     float64           0          9667
number         float64           0         65394
data_process    object           0             2


In [95]:
df_10.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145800 entries, 0 to 145799
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   country       145800 non-null  object 
 1   isocode       145800 non-null  int64  
 2   indicator     145800 non-null  object 
 3   year          145800 non-null  int64  
 4   agegroup      145800 non-null  object 
 5   percentage    145800 non-null  float64
 6   number        145800 non-null  float64
 7   data_process  145800 non-null  object 
dtypes: float64(2), int64(2), object(4)
memory usage: 8.9+ MB


In [96]:
#df_10.to_csv("../data/Cleaned/cleaned_countries_1970_2025_un.csv", index=False)

In [97]:
#df_10.to_sql('countries_1970_2025_un', engine, if_exists='replace', index=False)

In [98]:
df_11 = pd.read_csv('../data/processed/currently_married_un.csv',  header=2, low_memory=False)

In [99]:
df_11.sample(8)

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,AgeGroup,AgeStart,AgeEnd,DataValue,DataProcess,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Data,Note on Country and Population
32728,Myanmar,104,1983,1983,Men,[20-24],20,24,37.38,Census,1983 Census,256,Myanmar 1983 Census,UNSD,,,Excluding persons from areas restricted by sec...
38930,Puerto Rico,630,2010,2010,Men,[50-54],50,54,56.27,Census,2010 Census,4755,Puerto Rico 2010 Census,UNSD,,,Including armed forces stationed in the area.
45349,Spain,724,1978,1978,Men,[30-34],30,34,81.28,Estimate,1978 Estimate,2222,Spain 1978 Estimate,UNSD,,,
48608,Tajikistan,762,2010,2010,Men,[35-39],35,39,94.39,Census,2010 Census,4801,Tajikistan 2010 Census,UNSD,1.0,,
17493,Ghana,288,1998,1999,Men,[35-39],35,39,90.7,Survey,1998 DHS,1751,Ghana 1998 Demographic and Health Survey,DHS_STATcompiler,1.0,,
32797,Myanmar,104,2000,2000,Men,[75+],75,999,56.11,Survey,2000 MICS_HH,4442,Myanmar 2000 Multiple Indicator Cluster Survey,MICS_HH,1.0,,
38894,Puerto Rico,630,1990,1990,Men,[35-39],35,39,75.23,Census,1990 Census,391,Puerto Rico 1990 Census,UNSD,1.0,,Including military personnel.
35519,Norway,578,1972,1972,Men,[10-14],10,14,0.0,Estimate,1972 Estimate,2180,Norway 1972 Estimate,UNSD,,,


In [100]:
df_11.columns = (
    df_11.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
    )
df_11.sample(8)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,agegroup,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteondata,noteoncountryandpopulation
11877,Denmark,208,2011,2011,Men,[10-14],10,14,0.0,Estimate,2011 Estimate,2081,Denmark 2011 Estimate,UNSD,,Based on data compiled from registers.,Excluding Faeroe Islands and Greenland shown s...
35538,Norway,578,1972,1972,Women,[35-39],35,39,88.42,Estimate,1972 Estimate,2180,Norway 1972 Estimate,UNSD,,,
12477,Dominican Republic,214,2007,2007,Men,[20-24],20,24,22.1,Survey,2007 DHS,49,Dominican Republic 2007 Demographic and Health...,DHS_HH,1.0,,
21928,Iceland,352,2006,2006,Women,[60-64],60,64,70.66,Estimate,2006 Estimate,2121,Iceland 2006 Estimate,UNSD,1.0,,
16662,Gabon,266,2012,2012,Women,[25-29],25,29,65.7,Survey,2012 DHS,5054,Gabon 2012 Demographic and Health Survey,DHS_STATcompiler,1.0,,
30983,Mauritania,478,1988,1988,Men,[25-29],25,29,43.47,Census,1988 Census,355,Mauritania 1988 Census,US Census Bureau,,,
27348,Latvia,428,2011,2011,Men,[15-19],15,19,0.61,Census,2011 Census,4829,Latvia 2011 Census,Eurostat,1.0,Estimates computed based on data on marital st...,
45401,Spain,724,1986,1986,Men,[10-14],10,14,0.02,Estimate,1986 Estimate,2222,Spain 1986 Estimate,UNSD,,,


In [101]:
df_11 = df_11.drop(columns = ['datacataloglongname', 'datacatalogid', 'yearstart' , 'yearend', 'noteondata', 'noteoncountryandpopulation', 'including_consensual_unions'])

df_11.rename(columns={
    "agestart": "age_start",
    "countryorarea": "country",
    "datasource": "data_source",
    "datavalue" : "data_value"
}, inplace=True)

df_11.sample(10)

Unnamed: 0,country,isocode,sex,agegroup,age_start,ageend,data_value,dataprocess,datacatalogshortname,data_source
49640,Trinidad and Tobago,780,Women,[65+],65,999,32.97,Census,1980 Census,UNSD
43949,Slovakia,703,Women,[35-39],35,39,64.28,Estimate,2013 Estimate,UNSD
43549,Singapore,702,Women,[45-49],45,49,78.85,Census,2000 Census,UNSD
29496,Luxembourg,442,Women,[45-49],45,49,81.83,Census,1981 Census,UNSD
11215,Democratic Republic of the Congo,180,Women,[55-59],55,59,59.6,Survey,2007 DHS,DHS_HH
38066,Philippines,608,Women,[30-34],30,34,85.06,Census,1980 Census,UNSD
29425,Luxembourg,442,Men,[20-24],20,24,22.88,Census,1970 Census,UNSD
8470,"China, Taiwan Province of China",158,Women,[40-44],40,44,93.07,Census,1980 Census,US Census Bureau
24752,Italy,380,Women,[25-29],25,29,45.26,Estimate,1999 Estimate,UNSD
1619,Austria,40,Women,[50-54],50,54,67.99,Census,1971 Census,UNSD


In [102]:
df_11.drop_duplicates(inplace=True)
df_11.dropna(inplace=True)

In [103]:
df_info = pd.DataFrame({
    'datatypes': df_11.dtypes,
    'null_count': df_11.isnull().sum(),
    'unique_count': df_11.nunique()
})
print(df_info)

                     datatypes  null_count  unique_count
country                 object           0           233
isocode                  int64           0           230
sex                     object           0             2
agegroup                object           0            23
age_start                int64           0            17
ageend                   int64           0            15
data_value             float64           0          9213
dataprocess             object           0             6
datacatalogshortname    object           0           412
data_source             object           0            15


In [104]:
#df_11.to_csv("cleaned_currently_married_un.csv", index=False)

In [105]:
#df_11.to_sql('currently_married_un', engine, if_exists='replace', index=False)

In [106]:
df_12 = pd.read_csv('../data/processed/ever_married_un.csv', header= 2, low_memory = False)
df_12.head()

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,AgeGroup,AgeStart,AgeEnd,DataValue,DataProcess,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Data,Note on Country and Population
0,Afghanistan,4,1972,1974,Men,[15-19],15,19,7.7,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
1,Afghanistan,4,1972,1974,Men,[20-24],20,24,32.6,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
2,Afghanistan,4,1972,1974,Men,[25-29],25,29,61.4,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
3,Afghanistan,4,1972,1974,Men,[30-34],30,34,83.0,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
4,Afghanistan,4,1972,1974,Men,[35-39],35,39,91.2,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,


In [107]:
df_12.columns = (
    df_12.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
    )
df_12.sample(8)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,agegroup,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteondata,noteoncountryandpopulation
33400,Monaco,492,1982,1982,Women,[25-29],25,29,60.5,Census,1982 Census,2451,Monaco 1982 Census,UNSD,,,
18364,Ghana,288,1970,1970,Men,[50+],50,999,97.4,Census,1970 Census,90,Ghana 1970 Census,INED,,,
38140,Norway,578,1993,1993,Women,[75+],75,999,87.25,Estimate,1993 Estimate,2180,Norway 1993 Estimate,UNSD,,,
5581,Burundi,108,2002,2002,Men,[40-44],40,44,94.55,Survey,2002 Survey,2919,Burundi 2002 Enquête Socio-démographique et de...,National statistics,,,
17014,France,250,1990,1990,Men,[30-34],30,34,65.81,Census,1990 Census,1211,France 1990 Census,UNSD,,,Excluding diplomatic personnel outside the cou...
4661,British Virgin Islands,92,1980,1980,Men,[40-44],40,44,80.26,Census,1980 Census,2303,British Virgin Islands 1980 Census,UNSD,,,
20731,Haiti,332,1982,1982,Women,[50-54],50,54,91.24,Census,1982 Census,1024,Haiti 1982 Census,UNSD,1.0,,
46678,Slovenia,705,2016,2016,Men,[55-59],55,59,79.87,Estimate,2016 Estimate,2218,Slovenia 2016 Estimate,UNSD,,,


In [108]:
df_12 = df_12.drop(columns = ['yearstart', 'yearend', 'datacatalogshortname', 'datacatalogid', 'datacataloglongname', 'including_consensual_unions', 'noteondata', 'noteoncountryandpopulation'])

df_12.rename(columns={
    "agestart": "age_start",
    "ageend": "age_end",
    "countryorarea": "country"
}, inplace=True)
df_12.sample(8)

Unnamed: 0,country,isocode,sex,agegroup,age_start,age_end,datavalue,dataprocess,datasource
43088,Samoa,882,Men,[60-64],60,64,93.06,Census,US Census Bureau
2219,Azerbaijan,31,Men,[65-69],65,69,99.33,Estimate,UNSD
15613,Finland,246,Women,[55-59],55,59,89.44,Estimate,UNSD
1023,Aruba,533,Women,[60-64],60,64,83.92,Census,UNSD
9851,Costa Rica,188,Men,[35-39],35,39,84.28,Census,UNSD
20403,Guinea-Bissau,624,Women,[65-69],65,69,88.89,Census,INED
18228,Germany,276,Men,[35-39],35,39,58.85,Estimate,UNSD
54570,Viet Nam,704,Women,[75+],75,999,100.0,Survey,DHS_HH


In [109]:
df_12.dropna(inplace=True)

df_info = pd.DataFrame({
    'datatypes': df_12.dtypes,
    'null_count': df_12.isnull().sum(),
    'unique_count': df_12.nunique()
})
print(df_info)

            datatypes  null_count  unique_count
country        object           0           233
isocode         int64           0           230
sex            object           0             2
agegroup       object           0            23
age_start       int64           0            17
age_end         int64           0            15
datavalue     float64           0          8396
dataprocess    object           0             6
datasource     object           0            15


In [110]:
#df_12.to_csv("cleaned_ever_married_un.csv", index=False)

In [111]:
#df_12.to_sql('ever_married_un', engine, if_exists= 'replace', index= False)

In [112]:
df_13 = pd.read_csv('../data/processed/fertility_indicators_un.csv', header=6, low_memory=False)
df_13.head()

Unnamed: 0,Country or Area,Country or Area Code,Age Group,Indicator,Date,Value,Series,DataType,Data Source Type,Survey Programme,Data Source Inventory ID,Data Source Name,Data Source Name (short),Data Source Start Year,Data Source End Year,Reference,Reference Year
0,Afghanistan,4,[Total],TFR,1964.977051,7.966653,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
1,Afghanistan,4,[Total],TFR,1965.977051,8.212275,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
2,Afghanistan,4,[Total],TFR,1966.977051,8.317603,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
3,Afghanistan,4,[Total],TFR,1967.977051,8.225812,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
4,Afghanistan,4,[Total],TFR,1968.977051,8.068459,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012


In [113]:
df_13.columns = (df_13.columns
        .str.lower()
        .str.strip()
        .str.replace(' ', '')
        .str.replace('(', '')
        .str.replace(')', '')
        .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
        )

df_13.sample(6)

Unnamed: 0,countryorarea,countryorareacode,agegroup,indicator,date,value,series,datatype,datasourcetype,surveyprogramme,datasourceinventoryid,datasourcename,datasourcenameshort,datasourcestartyear,datasourceendyear,reference,referenceyear
42508,Lesotho,426,[45-49],ASFR4549,2015.79589,14.4232,2016 Census: Key Findings,Recent births,Census,Census,7012,Lesotho 2016 Census,2016 Census,2016,2016,Lesotho 2016 Census: Summary Key Findings,2018
25869,France,250,[Total],TFR,1986.5,1.8311,"Register,Fertility data (Adjusted),HFC-STAT,44...",Fertility data (adjusted),Register,VR,449,Vital Registration,Register,1986,1986,Age-specific fertility rates by age of the mot...,2013
2642,Armenia,51,[35-39],ASFR3539,2003.333008,15.0,"2005 DHS,Direct,DHS,1843-16-39167",Direct,Survey,DHS,1843,Armenia 2005 Demographic and Health Survey,2005 DHS,2005,2005,DHS Statcompiler,2012
51894,New Zealand,554,[25-29],ASFR2529,1964.50274,229.94,"NSO 2019, Official Estimate",Direct,Register,VR,607,Vital Registration,Register,1964,1964,Stats NZ Infoshare platform,2019
28869,Ghana,288,[25-29],ASFR2529,1966.206365,298.3242,"1979-1980 WFS,Birth Histories,FBH analysis 201...",Birth histories,Survey,WFS,658,Ghana 1979-1980 World Fertility Survey,1979-1980 WFS,1979,1980,Fertility rates from full birth histories anal...,2018
77357,Viet Nam,704,[20-24],ASFR2024,1985.374023,223.0,"2002 DHS,Direct,DHS,1743-16-39167",Direct,Survey,DHS,1743,Viet Nam 2002 Demographic and Health Survey,2002 DHS,2002,2002,DHS Statcompiler,2012


In [114]:
df_13 = df_13.drop(columns=['countryorareacode','indicator','datasourceinventoryid','surveyprogramme','series','datasourcename','reference','referenceyear'])

df_13 = df_13.rename(columns={
    "agegroup": "age_group",
    "countryorarea": "country",
    "datatype": "data_type"
})

In [115]:
df_13.head()

Unnamed: 0,country,age_group,date,value,data_type,datasourcetype,datasourcenameshort,datasourcestartyear,datasourceendyear
0,Afghanistan,[Total],1964.977051,7.966653,Reverse survival method,Census,1979 Census,1979,1979
1,Afghanistan,[Total],1965.977051,8.212275,Reverse survival method,Census,1979 Census,1979,1979
2,Afghanistan,[Total],1966.977051,8.317603,Reverse survival method,Census,1979 Census,1979,1979
3,Afghanistan,[Total],1967.977051,8.225812,Reverse survival method,Census,1979 Census,1979,1979
4,Afghanistan,[Total],1968.977051,8.068459,Reverse survival method,Census,1979 Census,1979,1979


In [116]:
df_13['date'] = df_13['date'].astype(int)
df_13['value'] = df_13['value'].round(2)
df_13.sample(12)

Unnamed: 0,country,age_group,date,value,data_type,datasourcetype,datasourcenameshort,datasourcestartyear,datasourceendyear
24327,Ethiopia,[40-44],1992,130.0,Direct,Survey,2000 DHS,2000,2000
65721,Spain,[45-49],1986,0.69,Official estimates,Estimate,Estimates,1986,1986
15012,China,[20-24],2005,115.16,Fertility data (adjusted),Estimate,Estimates,2005,2005
32236,Honduras,[Total],1993,4.97,Extrapolated from Truncated Birth Histories,Survey,2001 ENESF,2001,2001
13410,Chile,[Total],1991,27.44,Computed rate from DYB,Register,Register,1991,1991
8892,Bosnia and Herzegovina,[40-44],1953,75.05,Fertility data (adjusted),Estimate,Estimates,1953,1953
79528,Zimbabwe,[40-44],1998,56.66,Extrapolated from Truncated Birth Histories,Survey,2015 DHS,2015,2015
2163,Armenia,[Total],1959,4.73,Direct,Register,Register,1959,1959
47663,Mongolia,[25-29],2007,146.5,Direct,Register,Register,2007,2007
75064,United States of America,[Total],2000,27.4,Direct,Register,Register,2000,2000


In [117]:
df_13.drop_duplicates(inplace=True)
df_13.dropna(inplace=True)

df_info = pd.DataFrame({
    'datatypes': df_13.dtypes,
    'null_count': df_13.isnull().sum(),
    'unique_count': df_13.nunique()
})
print(df_info)

                    datatypes  null_count  unique_count
country                object           0           201
age_group              object           0             8
date                    int32           0            69
value                 float64           0         18752
data_type              object           0            30
datasourcetype         object           0             7
datasourcenameshort    object           0           539
datasourcestartyear     int64           0            69
datasourceendyear      object           0            70


In [118]:
#df_13.to_csv("../data/Cleaned/cleaned_fertility_indicators_un.csv", index=False)

In [119]:
#df_13.to_sql('fertility_indicators_un',engine, if_exists='replace', index=False)

In [120]:
df_14 = pd.read_csv('../data/processed/marital_status_by_age_un.csv', header= 2, low_memory=False)
df_14.head()

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,MaritalStatus,Non-standard_AgeGroups,Series_contains_Non-standard_AgeGroups,AgeGroup,AgeStart,...,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Age groups,Note on Marital Status,Note on Data,Note on Country and Population,Note Other
0,Afghanistan,4,1972,1974,Men,Divorced,,,[15-19],15,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
1,Afghanistan,4,1972,1974,Men,Divorced,,,[20-24],20,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
2,Afghanistan,4,1972,1974,Men,Divorced,,,[25-29],25,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
3,Afghanistan,4,1972,1974,Men,Divorced,,,[30-34],30,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
4,Afghanistan,4,1972,1974,Men,Divorced,,,[35-39],35,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,


In [121]:
df_14.columns= (df_14.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '' , regex=True)  
    )
df_14.sample(5)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,maritalstatus,nonstandard_agegroups,series_contains_nonstandard_agegroups,agegroup,agestart,...,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteonagegroups,noteonmaritalstatus,noteondata,noteoncountryandpopulation,noteother
206632,Saint Helena,654,1987,1987,Men,Widowed,,1.0,[40-44],40,...,1987 Census,2529,Saint Helena 1987 Census,US Census Bureau,,,,,Includes Ascension.,
160876,Monaco,492,1982,1982,Men,Single,,,[40-44],40,...,1982 Census,2451,Monaco 1982 Census,UNSD,,,,,,
169047,Netherlands,528,1970,1970,Women,Single,,,[10-14],10,...,1970 Estimate,2170,Netherlands 1970 Estimate,UNSD,,Data pertains to age group 0-14.,,,,
229112,Sri Lanka,144,2001,2001,Women,Separated,,,[60-64],60,...,2001 Census,242,Sri Lanka 2001 Census,UNSD,,,,,Excluding persons from areas restricted by sec...,
7063,Australia,36,1994,1994,Women,Single,,,[35-39],35,...,1994 Estimate,2037,Australia 1994 Estimate,UNSD,,,,,,


In [122]:
df_14 = df_14.drop(columns=['datacataloglongname', 'noteondata', 'noteoncountryandpopulation','noteonagegroups', 'noteother',
                             'including_consensual_unions','isocode', 'datacatalogid', 'noteonmaritalstatus', 'series_contains_nonstandard_agegroups','nonstandard_agegroups'])

df_14.rename(columns={
    "countryorarea": "country",
    "agegroup": "age_group",
    "maritalstatus": "marital_status",
    "yearstart": "year_start",
    "yearend": "year_end",
    }, inplace =True
    )

df_14.sample(10)

Unnamed: 0,country,year_start,year_end,sex,marital_status,age_group,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datasource
15153,Belarus,1999,1999,Women,Widowed,[30-34],30,34,1.66,Census,1999 Census,UNSD
230181,State of Palestine,2017,2017,Women,Widowed,[35-39],35,39,1.25,Census,2017 Census,UNSD
164983,Mozambique,2011,2011,Women,Married,[25-29],25,29,78.1,Survey,2011 DHS,DHS_HH
249682,Turkey,1993,1993,Women,Never married,[30-34],30,34,4.25,Survey,1993 DHS,DHS_HH
205905,Rwanda,2007,2008,Women,Widowed,[50-54],50,54,36.5,Survey,2007-2008 DHS Interim,DHS_HH
21216,Bosnia and Herzegovina,2004,2004,Men,Single,[15-24],15,24,94.63,Survey,2004 HBS,National statistics
52924,Côte d'Ivoire,2011,2012,Women,Not living together,[15-19],15,19,0.7,Survey,2011-2012 DHS-MICS,DHS_STATcompiler
27327,Burundi,1970,1971,Men,Widowed,[45-49],45,49,1.07,Survey,1970-1971 DS,National statistics
31906,Canada,1977,1977,Men,Widowed,[65-69],65,69,7.01,Estimate,1977 Estimate,UNSD
92719,Greece,2001,2001,Women,Single,[45-49],45,49,5.95,Census,2001 Census,UNSD


In [123]:
df_14.drop_duplicates(inplace=True)
df_14.dropna(inplace=True)

df_info = pd.DataFrame({
    'datatypes': df_14.dtypes,
    'null_count': df_14.isnull().sum(),
    'unique_count': df_14.nunique()
})
print(df_info)

                     datatypes  null_count  unique_count
country                 object           0           235
year_start               int64           0            62
year_end                 int64           0            60
sex                     object           0             2
marital_status          object           0            35
age_group               object           0            63
agestart                 int64           0            21
ageend                   int64           0            20
datavalue              float64           0          9994
dataprocess             object           0             6
datacatalogshortname    object           0           443
datasource              object           0            15


In [124]:
#df_14.to_csv("cleaned_marital_status_by_age_un.csv", index=False)

In [125]:
#df_14.to_sql('marital_status_by_age_un', engine, if_exists='replace', index=False)

In [126]:
df_15 = pd.read_csv('../data/processed/regions_un.csv', header=5, low_memory= False)
df_15.head(10)

Unnamed: 0,Region and subregion,ISO code,Regional Classification,Indicator,Year,AgeGroup,Percentage,Number,DataProcess
0,World,900,M49,Married or in-union women,1970,15-19,22.576683,71867.82,Estimate
1,World,900,M49,Married or in-union women,1970,20-24,63.802057,162860.4,Estimate
2,World,900,M49,Married or in-union women,1970,25-29,87.174827,182681.1,Estimate
3,World,900,M49,Married or in-union women,1970,30-34,90.825027,179121.4,Estimate
4,World,900,M49,Married or in-union women,1970,35-39,90.284386,161526.3,Estimate
5,World,900,M49,Married or in-union women,1970,40-44,86.483531,139334.4,Estimate
6,World,900,M49,Married or in-union women,1970,45-49,82.680237,116088.4,Estimate
7,World,900,M49,Married or in-union women,1970,15-49,69.379111,1013480.0,Estimate
8,World,900,M49,Married or in-union women,1971,15-19,22.630416,74127.62,Estimate
9,World,900,M49,Married or in-union women,1971,20-24,63.613178,170087.3,Estimate


In [127]:
df_15.columns = (df_15.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(','')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
    )
df_15.sample(6)

Unnamed: 0,regionandsubregion,isocode,regionalclassification,indicator,year,agegroup,percentage,number,dataprocess
3890,Europe and Northern America,513,SDG,Married or in-union women,1970,25-29,79.792763,22469.244975,Estimate
16817,Caribbean,915,SDG-M49,Married or in-union women,2047,20-24,23.924101,688.176299,Projection
9034,Asia,935,M49,Married or in-union women,2046,25-29,63.759515,211582.644293,Projection
473,World,900,M49,Married or in-union women,2029,20-24,41.016957,243031.287937,Projection
27605,High-income countries,1503,Income group,Married or in-union women,2018,40-44,73.014719,34558.70635,Estimate
2099,Central and Southern Asia,62,SDG,Married or in-union women,1989,30-34,93.226716,76106.389598,Estimate


In [128]:
df_15 = df_15.drop(columns=['regionalclassification'])

df_15.rename(columns={
    "regionandsubregion": "region",
    "isocode": "iso_code",
    "agegroup": "age_group",
    "dataprocess": "process"
}, inplace=True)

df_15.sample(10)

Unnamed: 0,region,iso_code,indicator,year,age_group,percentage,number,process
1834,Northern Africa and Western Asia,747,Married or in-union women,2037,25-29,66.759022,35532.036402,Projection
21965,Micronesia,954,Married or in-union women,2042,40-44,71.685618,22.512868,Projection
9893,Eastern Asia,906,Married or in-union women,1991,40-44,94.520462,73161.700898,Estimate
25718,Low-income countries,1500,Married or in-union women,2025,45-49,79.297595,23954.017559,Projection
14889,Southern Europe,925,Married or in-union women,2049,20-24,5.618355,163.250096,Projection
13756,Northern Europe,924,Married or in-union women,1988,35-39,83.173417,2653.587153,Estimate
3724,Oceania excluding Australia and New Zealand,543,Married or in-union women,2030,35-39,80.162676,836.771682,Projection
15040,Western Europe,926,Married or in-union women,1987,15-19,4.220437,266.508219,Estimate
14594,Southern Europe,925,Married or in-union women,2012,25-29,44.744318,2039.927894,Estimate
9895,Eastern Asia,906,Married or in-union women,1991,15-49,67.164284,484559.165956,Estimate


In [129]:
print(df_15['number'] % 1 != 0)

0        True
1        True
2        True
3        True
4        True
         ... 
28507    True
28508    True
28509    True
28510    True
28511    True
Name: number, Length: 28512, dtype: bool


In [130]:
df_15['percentage'] = df_15['percentage'].round(2)
df_15['number'] = df_15['number'].astype(int)
df_15.head(10)

Unnamed: 0,region,iso_code,indicator,year,age_group,percentage,number,process
0,World,900,Married or in-union women,1970,15-19,22.58,71867,Estimate
1,World,900,Married or in-union women,1970,20-24,63.8,162860,Estimate
2,World,900,Married or in-union women,1970,25-29,87.17,182681,Estimate
3,World,900,Married or in-union women,1970,30-34,90.83,179121,Estimate
4,World,900,Married or in-union women,1970,35-39,90.28,161526,Estimate
5,World,900,Married or in-union women,1970,40-44,86.48,139334,Estimate
6,World,900,Married or in-union women,1970,45-49,82.68,116088,Estimate
7,World,900,Married or in-union women,1970,15-49,69.38,1013479,Estimate
8,World,900,Married or in-union women,1971,15-19,22.63,74127,Estimate
9,World,900,Married or in-union women,1971,20-24,63.61,170087,Estimate


In [131]:
df_15.dropna(inplace=True)
df_15.dropna(inplace=True)

df_info = pd.DataFrame({
    'datatypes': df_15.dtypes,
    'null_count': df_15.isnull().sum(),
    'unique_count': df_15.nunique()
})
print(df_info)

           datatypes  null_count  unique_count
region        object           0            43
iso_code       int64           0            44
indicator     object           0             1
year           int64           0            81
age_group     object           0             8
percentage   float64           0          7796
number         int32           0         20311
process       object           0             2


In [132]:
#df_15.to_csv('cleaned_regions_un.csv', index=False)



In [133]:
#df_15.to_sql('regions_un', engine, if_exists='replace',index=False)

In [134]:
df_16_1 = pd.read_csv('../data/Raw/OECD/SF_1_1_Family_size_and_composition_S1.csv')
df_16_1
#Data for Chart SF1.1.A. Average size of households by household type, 2024a
# avg_size_all	avg_size_couple_with_children	avg_size_single_parent_with_children		

Unnamed: 0,Country,All households,Couple households with children,Single parent households with children
0,Mexico,356,408.0,276.0
1,Costa Rica,346,437.0,344.0
2,Türkiye,320,410.0,280.0
3,Israel,319,465.0,286.0
4,Columbia,310,,
5,Slovak Republic,310,380.0,250.0
6,Chile,280,,
7,Iceland,270,412.0,261.0
8,New Zealand,261,388.0,267.0
9,Greece,260,380.0,250.0


In [135]:
df_16_1.columns = df_16_1.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [136]:
df_16_1.rename(columns={
        "All households": "avg_size_all",
        "Couple with children": "avg_size_couple_with_children",
        "Single parent with children": "avg_size_single_parent_with_children"
}, inplace=True)

In [137]:
df_16_1.drop_duplicates(inplace=True)
df_16_1.dropna(inplace=True)

In [138]:
for col in df_16_1.columns:
    if col != 'country':
        # Replace commas with dots if necessary, remove non-numeric chars, convert to float
        df_16_1[col] = (
            df_16_1[col]
            .astype(str)  # ensure string for replace
            .str.replace(',', '.', regex=False)  # decimal commas to dots
            .str.replace(r'[^\d\.\-]', '', regex=True)  # remove non-numeric chars except dot and minus
            .replace('', None)  # empty to NaN
            .astype(float)  # convert to float
        )

# Check updated dtypes
print(df_16_1.dtypes)

country                                    object
all_households                            float64
couple_households_with_children           float64
single_parent_households_with_children    float64
dtype: object


In [139]:
info_16_1 = pd.DataFrame({
    'dtype': df_16_1.dtypes,
    'null_count': df_16_1.isnull().sum(),
    'unique_count': df_16_1.nunique()
})
print(info_16_1)

                                          dtype  null_count  unique_count
country                                  object           0            39
all_households                          float64           0            19
couple_households_with_children         float64           0            16
single_parent_households_with_children  float64           0            15


In [140]:
if "year" not in df_16_1.columns:
    df_16_1["year"] = 2024
df_16_1['source'] = 'OECD'
df_16_1.sample(10)

Unnamed: 0,country,all_households,couple_households_with_children,single_parent_households_with_children,year,source
35,France,2.1,3.9,2.6,2024,OECD
19,Portugal,2.4,3.5,2.4,2024,OECD
32,Belgium,2.2,3.9,2.6,2024,OECD
0,Mexico,3.56,4.08,2.76,2024,OECD
15,Malta,2.5,3.7,2.5,2024,OECD
5,Slovak Republic,3.1,3.8,2.5,2024,OECD
30,Japan,2.21,3.85,2.73,2024,OECD
29,Korea,2.21,3.55,2.34,2024,OECD
3,Israel,3.19,4.65,2.86,2024,OECD
43,Estonia,1.8,3.8,2.6,2024,OECD


In [141]:
df_16_1.to_csv('../data/Cleaned/cleaned_average_size_of_households_type_2024_oecd.csv', index=False)

In [142]:
df_16_1.to_sql('average_size_of_households_type_2024_oecd', engine, if_exists = 'replace', index= False)

39

In [143]:
df_16_2 = pd.read_csv('../data/Raw/OECD/SF_1_1_Family_size_and_composition_S2.csv', header=1)
df_16_2.head(10)
#Table SF1.1.A. Types of household, 2021a
# share_couple_total	share_couple_with_children	share_couple_without_children	share_single_parent_total	share_single_mother	share_single_father	share_single_person	share_other_types						

Unnamed: 0,Country,Total,With children,Without children,Total.1,Single mother households,Single father households,Single person households,Other households types
0,Australia,5593,2990,2602,1037,,,2512,858
1,Austria,4893,2113,2780,563,478,085,3834,711
2,Belgium,5222,2398,2824,742,608,135,3550,486
3,Canada,5092,2530,2562,872,,,2935,1102
4,Chile,..,..,..,..,..,..,..,..
5,Columbia,..,..,..,..,..,..,..,..
6,Costa Rica,5244,3815,1429,1055,949,106,1127,2574
7,Czechia,4703,2170,2532,715,611,104,3915,667
8,Denmark,4860,2041,2819,631,511,119,3757,752
9,Estonia,4620,2546,2073,683,609,074,3699,998


In [144]:
# Normalize column names (lowercase, underscores, remove () and non-ascii)
df_16_2.columns = (
    df_16_2.columns.str.strip().str.lower()
    .str.replace(' ', '_')
    .str.replace('[()%]', '', regex=True)
    .str.replace('[^0-9a-z_]', '', regex=True)
)

In [145]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_16_2.columns if c != "country"]

df_16_2[num_cols] = (
    df_16_2[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)
df_16_2.sample(10)

Unnamed: 0,country,total,with_children,without_children,total1,single_mother_households,single_father_households,single_person_households,other_households_types
13,Greece,52.14,24.03,28.11,4.66,3.82,0.84,32.35,10.85
22,Lithuania,50.09,23.74,26.35,7.18,6.68,0.5,35.16,7.58
28,Poland,48.32,23.92,24.4,5.97,5.15,0.82,23.44,22.27
11,France,49.73,22.19,27.54,7.68,6.23,1.45,37.78,4.81
18,Italy,46.7,20.91,25.8,7.27,5.65,1.63,36.64,9.38
31,Slovenia,45.41,20.97,24.44,6.93,5.57,1.36,34.0,13.66
6,Costa Rica,52.44,38.15,14.29,10.55,9.49,1.06,11.27,25.74
21,Latvia,27.8,12.21,15.6,13.44,11.21,2.23,41.08,17.68
35,Türkiye,54.38,40.84,13.54,10.06,7.75,2.31,18.88,16.68
26,New Zealand,57.33,29.25,28.07,10.39,,,22.79,9.49


In [146]:
df_16_2.drop(columns=["total", "total1"], errors="ignore", inplace=True)

In [147]:
df_16_2.drop_duplicates(inplace=True)
df_16_2.dropna(inplace=True)


In [148]:
df_16_2['unit'] = '%'
df_16_2['source'] = 'OECD'

In [149]:
if "year" not in df_16_2.columns:
    df_16_2["year"] = 2021
df_16_2.sample(10)

Unnamed: 0,country,with_children,without_children,single_mother_households,single_father_households,single_person_households,other_households_types,unit,source,year
37,United States,19.85,33.34,5.21,1.58,27.61,12.41,%,OECD,2021
20,Korea,26.25,17.23,6.85,2.28,35.47,11.93,%,OECD,2021
27,Norway,23.0,24.05,4.77,1.55,42.14,4.51,%,OECD,2021
33,Sweden,22.49,26.78,4.91,1.76,39.24,4.82,%,OECD,2021
40,Croatia,24.78,26.73,4.39,1.04,27.8,15.27,%,OECD,2021
30,Slovak Republic,16.99,20.16,5.39,0.84,31.4,25.21,%,OECD,2021
25,Netherlands,23.01,30.59,5.0,1.09,38.5,1.8,%,OECD,2021
16,Ireland,29.45,23.58,6.12,0.8,23.14,16.91,%,OECD,2021
31,Slovenia,20.97,24.44,5.57,1.36,34.0,13.66,%,OECD,2021
9,Estonia,25.46,20.73,6.09,0.74,36.99,9.98,%,OECD,2021


In [150]:
info_16_2 = pd.DataFrame({
    "dtype": df_16_2.dtypes,
    "null_count": df_16_2.isna().sum(),
    "unique_count": df_16_2.nunique()
})
print(info_16_2)
print(df_16_2.dtypes)

                            dtype  null_count  unique_count
country                    object           0            36
with_children             float64           0            35
without_children          float64           0            36
single_mother_households  float64           0            32
single_father_households  float64           0            31
single_person_households  float64           0            35
other_households_types    float64           0            36
unit                       object           0             1
source                     object           0             1
year                        int64           0             1
country                      object
with_children               float64
without_children            float64
single_mother_households    float64
single_father_households    float64
single_person_households    float64
other_households_types      float64
unit                         object
source                       object
year            

In [151]:
#df_16_2.to_csv('../data/Cleaned/cleaned_types_of_household_2021_oecd.csv', index = False)

In [152]:
#df_16_2.to_sql('types_of_household_2021_oecd', engine, if_exists = 'replace', index= False)

In [153]:
df_16_3 = pd.read_csv('../data/Raw/OECD/SF_1_1_Family_size_and_composition_S3.csv', header=1)
df_16_3
#Table SF1.1.B. Households by number of children, 2024
# share_hh_0_children	share_hh_1_child	share_hh_2_children	share_hh_3plus_children		

Unnamed: 0,country,0 children,1 child,2 children,3 or more children,Children under 6
0,Australia,..,..,..,..,..
1,Austria,7778,1052,857,312,944
2,Belgium,7397,1176,1015,411,1040
3,Canada,..,..,..,..,..
4,Chile,..,..,..,..,..
5,Columbia,..,..,..,..,..
6,Costa Rica,3029,2308,2461,2202,2630
7,Czechia,7195,1385,1156,264,1229
8,Denmark,7778,1054,894,274,815
9,Estonia,7576,1253,873,298,985


In [154]:
# Normalize column names (lowercase, underscores, remove () and non-ascii)
df_16_3.columns = (
    df_16_3.columns.str.strip().str.lower()
    .str.replace(' ', '_')
    .str.replace('[^0-9a-z_]', '', regex=True)
)

In [155]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_16_3.columns if c != "country"]

df_16_3[num_cols] = (
    df_16_3[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [156]:
df_16_3.drop_duplicates(inplace=True)
df_16_3.dropna(inplace=True)

In [157]:
if "year" not in df_16_3.columns:
    df_16_3["year"] = 2024
df_16_3.sample(10)

Unnamed: 0,country,0_children,1_child,2_children,3_or_more_children,children_under_6,year
29,Portugal,74.35,15.87,8.18,1.6,9.85,2024
36,United Kingdom,72.06,12.1,11.31,4.53,12.73,2024
12,Germany,79.86,9.91,7.72,2.51,8.57,2024
2,Belgium,73.97,11.76,10.15,4.11,10.4,2024
6,Costa Rica,30.29,23.08,24.61,22.02,26.3,2024
33,Sweden,74.84,10.77,9.83,4.56,9.95,2024
25,Netherlands,78.65,8.78,9.27,3.3,8.79,2024
44,EU average,75.1,12.28,9.46,3.15,9.9,2024
27,Norway,76.87,10.53,9.14,3.47,8.88,2024
43,Romania,72.46,14.29,9.24,4.02,9.64,2024


In [158]:
df_16_3.rename(columns={
    "0_children": "households_0_children",
    "1_child": "households_1_child",
    "2_children": "households_2_children",
    "3_or_more_children": "households_3_or_more_children"
}, inplace=True)

In [159]:
df_16_3["unit"] = "%"
df_16_3["source"] = "OECD"
df_16_3.sample(10)

Unnamed: 0,country,households_0_children,households_1_child,households_2_children,households_3_or_more_children,children_under_6,year,unit,source
2,Belgium,73.97,11.76,10.15,4.11,10.4,2024,%,OECD
9,Estonia,75.76,12.53,8.73,2.98,9.85,2024,%,OECD
14,Hungary,74.98,13.16,8.71,3.16,10.41,2024,%,OECD
23,Luxembourg,73.0,12.49,12.07,2.41,11.54,2024,%,OECD
21,Latvia,74.8,14.05,8.32,2.83,10.07,2024,%,OECD
19,Japan,81.94,8.78,7.17,2.11,7.58,2024,%,OECD
16,Ireland,69.02,12.42,12.18,6.38,11.81,2024,%,OECD
43,Romania,72.46,14.29,9.24,4.02,9.64,2024,%,OECD
22,Lithuania,80.44,11.06,7.0,1.51,8.12,2024,%,OECD
10,Finland,81.98,7.89,6.99,3.14,7.14,2024,%,OECD


In [160]:
info_16_3 = pd.DataFrame({
    "dtype": df_16_3.dtypes,
    "null_count": df_16_3.isna().sum(),
    "unique_count": df_16_3.nunique()
})
print(info_16_3)
print(df_16_3.dtypes)

                                 dtype  null_count  unique_count
country                         object           0            33
households_0_children          float64           0            32
households_1_child             float64           0            32
households_2_children          float64           0            33
households_3_or_more_children  float64           0            31
children_under_6               float64           0            31
year                             int64           0             1
unit                            object           0             1
source                          object           0             1
country                           object
households_0_children            float64
households_1_child               float64
households_2_children            float64
households_3_or_more_children    float64
children_under_6                 float64
year                               int64
unit                              object
source                

In [161]:
#df_16_3.to_csv('../data/Cleaned/cleaned_households_by_number_of_children_2024_oecd.csv', index=False)

In [162]:
#df_16_3.to_sql('households_by_number_of_children_2024_oecd', engine, index= False)

In [163]:
df_17_1 = pd.read_csv('../data/Raw/OECD/SF_2_1_Total_Fertility_rates_S1.csv')
#total_fertility_rates_from_1960_oecd
df_17_1.head()

Unnamed: 0,Country,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Australia,345,355,343,334,315,297,289,285,289,...,179,179,179,174,174,167,159,170,163,150
1,Austria,269,278,280,282,279,270,266,262,258,...,146,149,153,152,148,146,144,148,141,132
2,Belgium,254,263,259,268,271,261,252,241,231,...,174,170,168,165,162,160,155,160,153,147
3,Canada,390,384,376,367,350,315,281,260,245,...,161,160,159,155,151,147,141,144,133,126
4,Chile,470,466,460,454,446,436,426,414,403,...,177,174,169,156,154,143,131,118,126,117


In [164]:
df_info = pd.DataFrame({
    'dtype': df_17_1.dtypes,
    'null_count': df_17_1.isnull().sum(),
    'unique_count': df_17_1.nunique()
})
print(df_info)

          dtype  null_count  unique_count
Country  object           0            49
1960     object           0            47
1961     object           0            47
1962     object           0            47
1963     object           0            46
...         ...         ...           ...
2019     object           0            37
2020     object           0            39
2021     object           0            40
2022     object           0            34
2023     object           0            35

[65 rows x 3 columns]


In [165]:
df_17_1.columns = df_17_1.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [166]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_17_1.columns if c != "country"]

df_17_1[num_cols] = (
    df_17_1[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [167]:
df_17_1.drop_duplicates(inplace=True)
df_17_1.dropna(inplace=True)

In [168]:
# Identify id and year columns
id_cols = ["country"]
year_cols = [c for c in df_17_1.columns if re.fullmatch(r"\d{4}", str(c))]

# Wide → Long
df_17_1 = df_17_1.melt(
    id_vars=id_cols,
    value_vars=year_cols,
    var_name="year",
    value_name="value"
)

# Fix types
df_17_1["year"] = df_17_1["year"].astype(int)
df_17_1["value"] = pd.to_numeric(df_17_1["value"], errors="coerce")
df_17_1 = df_17_1.dropna(subset=["value"])

# Add metadata
df_17_1["indicator"] = "total_fertility_rate"
df_17_1["source"] = "OECD"

# Final column order
df_17_1 = df_17_1[["country","year","indicator","value","source"]]

df_17_1.sample(10)

Unnamed: 0,country,year,indicator,value,source
2289,United States,2006,total_fertility_rate,2.11,OECD
2234,Slovak Republic,2005,total_fertility_rate,1.25,OECD
2006,Bulgaria,2000,total_fertility_rate,1.26,OECD
1979,Korea,2000,total_fertility_rate,1.48,OECD
1606,China,1992,total_fertility_rate,1.78,OECD
1846,Switzerland,1997,total_fertility_rate,1.48,OECD
479,China,1969,total_fertility_rate,6.18,OECD
403,France,1968,total_fertility_rate,2.59,OECD
1685,Korea,1994,total_fertility_rate,1.66,OECD
404,Germany,1968,total_fertility_rate,2.36,OECD


In [169]:
df_info = pd.DataFrame({
    'dtype': df_17_1.dtypes,
    'null_count': df_17_1.isnull().sum(),
    'unique_count': df_17_1.nunique()
})
print(df_info)

             dtype  null_count  unique_count
country     object           0            49
year         int32           0            64
indicator   object           0             1
value      float64           0           490
source      object           0             1


In [170]:
#df_17_1.to_csv('../data/Cleaned/cleaned_total_fertility_rates_oecd.csv', index=False)

In [171]:
#df_17_1.to_sql('total_fertility_rates_oecd', engine, if_exists='replace', index=False)

In [172]:
df_17_2 = pd.read_csv('../data/Raw/OECD/SF_2_1_Fertility_rates_Births_by_birth_order_S2.csv')
df_17_2

Unnamed: 0,Country,Birth order,1987,1988,1989,1990,1991,1992,1993,1994,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Austria,First birth,476,478,467,462,465,461,452,445,...,480,483,473,475,471,472,477,476,484,481
1,Austria,Second birth,337,337,343,349,345,348,358,364,...,355,353,356,353,353,351,353,355,349,351
2,Austria,Third birth or higher,188,185,190,189,190,191,189,191,...,165,164,171,172,176,177,170,169,167,168
3,Belgium,First birth,468,469,473,473,481,472,469,472,...,423,435,441,436,429,426,450,440,447,455
4,Belgium,Second birth,330,329,327,328,323,328,335,330,...,351,348,345,346,345,347,342,351,343,341
5,Belgium,Third birth or higher,202,202,199,199,196,200,196,198,...,226,218,214,219,226,226,208,209,209,204
6,Czechia,First birth,467,466,474,478,501,498,485,477,...,474,481,487,487,480,478,476,464,463,463
7,Czechia,Second birth,377,376,374,372,355,358,368,369,...,375,373,367,366,372,376,376,390,386,391
8,Czechia,Third birth or higher,156,158,152,150,144,144,148,154,...,151,147,146,147,147,146,148,146,15,146
9,Estonia,First birth,435,435,440,462,495,503,496,496,...,419,423,408,402,367,388,380,372,398,397


In [173]:
df_info = pd.DataFrame({
    'dtype': df_17_2.dtypes,
    'null_count': df_17_2.isnull().sum(),
    'unique_count': df_17_2.nunique()
})
print(df_info)

              dtype  null_count  unique_count
Country      object           0            17
Birth order  object           0             3
1987         object           0            48
1988         object           0            49
1989         object           0            48
1990         object           0            44
1991         object           0            48
1992         object           0            46
1993         object           0            47
1994         object           0            47
1995         object           0            48
1996         object           0            47
1997         object           0            49
1998         object           0            50
1999         object           0            49
2000         object           0            48
2001         object           0            50
2002         object           0            47
2003         object           0            50
2004         object           0            49
2005         object           0   

In [174]:
df_17_2.columns = df_17_2.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_17_2.head()

Unnamed: 0,country,birth_order,1987,1988,1989,1990,1991,1992,1993,1994,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Austria,First birth,476,478,467,462,465,461,452,445,...,480,483,473,475,471,472,477,476,484,481
1,Austria,Second birth,337,337,343,349,345,348,358,364,...,355,353,356,353,353,351,353,355,349,351
2,Austria,Third birth or higher,188,185,190,189,190,191,189,191,...,165,164,171,172,176,177,170,169,167,168
3,Belgium,First birth,468,469,473,473,481,472,469,472,...,423,435,441,436,429,426,450,440,447,455
4,Belgium,Second birth,330,329,327,328,323,328,335,330,...,351,348,345,346,345,347,342,351,343,341


In [175]:
# 0) Detect year columns first (keep as-is)
year_cols = [c for c in df_17_2.columns.astype(str) if re.fullmatch(r"\d{4}", c)]

# 1) Normalize only non-year columns
df_17_2.columns = [
    c.strip().lower().replace(" ", "_").replace("(", "").replace(")", "")
    if c not in year_cols else c
    for c in df_17_2.columns.astype(str)
]

# 2) Wide → Long
df_17_2 = df_17_2.melt(
    id_vars=[c for c in ["country", "birth_order"] if c in df_17_2.columns],
    value_vars=year_cols,
    var_name="year",
    value_name="value"
)

# 3) Types
df_17_2["year"] = df_17_2["year"].astype(int)
df_17_2["value"] = pd.to_numeric(df_17_2["value"].astype(str).str.replace(",", ".", regex=False),
                                 errors="coerce")
df_17_2 = df_17_2.dropna(subset=["value"]).drop_duplicates()

# 4) Mart fields + order
df_17_2["indicator"] = "births_by_birth_order"
df_17_2["unit"] = "%"
df_17_2["source"] = "OECD"
df_17_2 = df_17_2[["country", "year", "indicator", "birth_order", "unit", "value", "source"]]


In [176]:
df_17_2

Unnamed: 0,country,year,indicator,birth_order,unit,value,source
0,Austria,1987,births_by_birth_order,First birth,%,47.6,OECD
1,Austria,1987,births_by_birth_order,Second birth,%,33.7,OECD
2,Austria,1987,births_by_birth_order,Third birth or higher,%,18.8,OECD
3,Belgium,1987,births_by_birth_order,First birth,%,46.8,OECD
4,Belgium,1987,births_by_birth_order,Second birth,%,33.0,OECD
...,...,...,...,...,...,...,...
1882,Switzerland,2023,births_by_birth_order,Second birth,%,37.1,OECD
1883,Switzerland,2023,births_by_birth_order,Third birth or higher,%,14.4,OECD
1884,Romania,2023,births_by_birth_order,First birth,%,49.9,OECD
1885,Romania,2023,births_by_birth_order,Second birth,%,30.8,OECD


In [177]:
df_info = pd.DataFrame({
    'dtype': df_17_2.dtypes,
    'null_count': df_17_2.isnull().sum(),
    'unique_count': df_17_2.nunique()
})
print(df_info)

               dtype  null_count  unique_count
country       object           0            17
year           int32           0            37
indicator     object           0             1
birth_order   object           0             3
unit          object           0             1
value        float64           0           422
source        object           0             1


In [178]:
#df_17_2.to_csv('../data/Cleaned/cleaned_births_by_birth_order_oecd.csv', index=False)

In [179]:
#df_17_2.to_sql('births_by_birth_order_oecd', engine, if_exists='replace', index=False)

In [180]:
df_18 = pd.read_csv('../data/Raw/OECD/sf1_2_wide_from_df18.csv')
df_18

Unnamed: 0,country,year,Living with two parents,Living with a single parent,Other
0,Australia,2003,80.1,19.5,0.5
1,Australia,2006,81.5,18.0,0.5
2,Australia,2009,82.0,17.6,0.4
3,Australia,2012,81.3,18.0,0.6
4,Austria,2003,81.2,16.8,2.0
...,...,...,...,...,...
470,United States,2014,68.7,27.5,3.8
471,United States,2015,69.2,26.8,3.9
472,United States,2016,68.7,27.4,3.8
473,United States,2017,68.9,27.1,4.0


In [181]:
for col in df_18.select_dtypes(include=['object']).columns:
    df_18[col] = df_18[col].astype(str).str.strip()

# 2) Define placeholders representing missing data in OECD exports
placeholders = ['..', '...', '.', ' .', '…', 'Na', 'nan', 'None']

# 3) Replace placeholders with NaN directly in df_18
df_18.replace(placeholders, pd.NA, inplace=True)

In [182]:
# 1) Ensure 'year' is integer
df_18["year"] = pd.to_numeric(df_18["year"], errors="coerce").astype("Int64")

# 2) Convert all non-key columns to numeric and round(2)
for col in df_18.columns:
    if col not in ["country", "year"]:
        df_18[col] = pd.to_numeric(df_18[col], errors="coerce").round(2)

In [183]:
# 1) Drop rows with missing key fields
df_18.dropna(subset=["country", "year"], inplace=True)

# 2) Drop duplicate country-year rows, keep the first
df_18.drop_duplicates(subset=["country", "year"], keep="first", inplace=True)

# 3) Drop rows where all value columns are NaN
value_cols = [c for c in df_18.columns if c not in ["country", "year"]]
df_18.dropna(subset=value_cols, how="all", inplace=True)

# 4) Sort and reset index
df_18.sort_values(["country", "year"], inplace=True)
df_18.reset_index(drop=True, inplace=True)


In [184]:
df_18

Unnamed: 0,country,year,Living with two parents,Living with a single parent,Other
0,Australia,2003,80.1,19.5,0.5
1,Australia,2006,81.5,18.0,0.5
2,Australia,2009,82.0,17.6,0.4
3,Australia,2012,81.3,18.0,0.6
4,Austria,2003,81.2,16.8,2.0
...,...,...,...,...,...
470,United States,2014,68.7,27.5,3.8
471,United States,2015,69.2,26.8,3.9
472,United States,2016,68.7,27.4,3.8
473,United States,2017,68.9,27.1,4.0


In [185]:
df_18['Other'] = pd.to_numeric(df_18['Other'], errors='coerce')

In [186]:
print(repr(df_18.loc[df_18['Other'].notnull(), 'Other'].unique()))

array([0.5, 0.4, 0.6, 2. , 1. , 1.9, 0.3, 0.1, 0.8, 0.7, 8.7, 3.5, 2.5,
       2.1, 2.4, 2.6, 6.7, 5.1, 1.4, 1.2, 1.7, 1.5, 3.4, 2.9, 2.3, 3. ,
       4.2, 2.8, 1.3, 9. , 0.2, 0.9, 1.1, 4.5, 4.7, 1.6, 3.8, 3.6, 3.3,
       2.2, 0. , 1.8, 2.7, 3.2, 3.9, 4.1, 4.4, 3.7, 4. , 4.3])


In [187]:
df_18['Other'] = pd.to_numeric(df_18['Other'], errors='coerce')

df_18.dropna(inplace=True, subset=['Other'])

df_18.isnull().sum()

country                        0
year                           0
Living with two parents        0
Living with a single parent    0
Other                          0
dtype: int64

In [188]:
df_18['unit'] = '%'
df_18['source'] = 'OECD'

df_18.sample(10)

Unnamed: 0,country,year,Living with two parents,Living with a single parent,Other,unit,source
229,Italy,2010,88.0,11.4,0.5,%,OECD
46,Bulgaria,2015,81.4,16.5,2.1,%,OECD
196,Iceland,2005,82.7,16.1,1.2,%,OECD
123,Finland,2004,85.1,14.0,0.9,%,OECD
445,Türkiye,2013,91.2,5.5,3.3,%,OECD
389,Slovenia,2010,86.4,13.0,0.6,%,OECD
332,Poland,2007,86.7,12.5,0.8,%,OECD
120,Estonia,2016,84.0,15.5,0.5,%,OECD
317,Norway,2006,78.5,19.9,1.5,%,OECD
292,Malta,2013,80.6,18.6,0.9,%,OECD


In [189]:

df_info = pd.DataFrame({
    'dtype': df_18.dtypes,
    'null_count': df_18.isnull().sum(),
    'unique_count': df_18.nunique()
})
print(df_info)

                               dtype  null_count  unique_count
country                       object           0            38
year                           Int64           0            18
Living with two parents      float64           0           211
Living with a single parent  float64           0           203
Other                        float64           0            50
unit                          object           0             1
source                        object           0             1


In [190]:
#df_18.to_csv('../data/Cleaned/cleaned_household_children_oecd.csv', index=False)

In [191]:
#df_18.to_sql('household_children_oecd', engine, if_exists= 'replace', index= False)

In [192]:
df_19_1 =pd.read_csv('../data/Raw/OECD/SF_2_3_Age_mothers_childbirth_mean_age_birth_S1.csv')
#age_of_mothers_at_childbirth
df_19_1

Unnamed: 0,Country,1963,1964,1965,1966,1967,1968,1969,1970,1971,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Australia,275,275,274,273,273,272,272,271,269,...,301,301,302,303,305,306,307,308,309,311
1,Austria,274,274,273,271,270,268,268,267,267,...,302,303,304,306,306,307,309,310,310,312
2,Belgium,278,277,276,275,274,273,272,272,270,...,300,302,303,304,305,306,307,308,308,310
3,Canada,278,279,278,277,275,273,273,272,270,...,303,304,305,306,307,309,310,312,313,314
4,Chile,292,291,291,290,288,287,286,284,282,...,281,283,285,288,291,294,296,299,301,..
5,Czech Republic,257,258,255,252,250,249,248,248,249,...,298,299,299,300,300,300,301,302,302,304
6,Costa Rica,293,293,293,293,292,291,289,287,285,...,265,267,268,271,272,274,276,279,284,287
7,Denmark,273,268,268,266,265,265,266,267,267,...,307,308,309,310,310,311,312,313,314,316
8,Estonia,276,274,273,273,271,269,269,267,267,...,296,295,296,299,302,304,305,306,307,310
9,Finland,281,280,280,278,277,275,274,271,269,...,304,305,305,306,308,309,310,311,312,314


In [193]:
df_info = pd.DataFrame({
    'dtype': df_19_1.dtypes,
    'null_count': df_19_1.isnull().sum(),
    'unique_count': df_19_1.nunique()
})
print(df_info)

          dtype  null_count  unique_count
Country  object           0            26
1963     object           0            19
1964     object           0            22
1965     object           0            22
1966     object           0            22
1967     object           0            22
1968     object           0            20
1969     object           0            21
1970     object           0            19
1971     object           0            19
1972     object           0            20
1973     object           0            20
1974     object           0            24
1975     object           0            21
1976     object           0            22
1977     object           0            20
1978     object           0            22
1979     object           0            23
1980     object           0            22
1981     object           0            20
1982     object           0            18
1983     object           0            20
1984     object           0       

In [194]:
df_19_1.columns = df_19_1.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [195]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_19_1.columns if c != "country"]

df_19_1[num_cols] = (
    df_19_1[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [196]:
df_19_1.drop_duplicates(inplace=True)
df_19_1.dropna(inplace=True)

In [197]:
# --- Identify id columns and year columns
id_cols = [c for c in ['country','age','age_group','indicator'] if c in df_19_1.columns]
year_cols = [c for c in df_19_1.columns if re.fullmatch(r'\d{4}', c)]

# --- Wide → Long
df_19_1 = df_19_1.melt(
    id_vars=id_cols,
    value_vars=year_cols,
    var_name='year',
    value_name='value'
)

# --- Fix types
df_19_1['year'] = df_19_1['year'].astype(int)
df_19_1['age'] = pd.to_numeric(df_19_1['value'], errors='coerce')
df_19_1 = df_19_1.dropna(subset=['value'])

# --- Add unit & indicator if missing
if 'indicator' not in df_19_1.columns:
    df_19_1['indicator'] = 'fertility_by_age'
df_19_1['unit'] = 'age'
df_19_1['source'] = 'OECD'
df_19_1.sample(10)

Unnamed: 0,country,year,value,age,indicator,unit,source
157,Canada,1970,27.2,27.2,fertility_by_age,age,OECD
418,Australia,1982,27.3,27.3,fertility_by_age,age,OECD
1228,Slovak Republic,2018,28.8,28.8,fertility_by_age,age,OECD
1035,Austria,2010,29.8,29.8,fertility_by_age,age,OECD
1160,Norway,2015,30.7,30.7,fertility_by_age,age,OECD
147,New Zealand,1969,26.8,26.8,fertility_by_age,age,OECD
51,Finland,1965,28.0,28.0,fertility_by_age,age,OECD
40,Slovak Republic,1964,26.9,26.9,fertility_by_age,age,OECD
665,Denmark,1993,28.9,28.9,fertility_by_age,age,OECD
112,Belgium,1968,27.3,27.3,fertility_by_age,age,OECD


In [198]:
df_info = pd.DataFrame({
    'dtype': df_19_1.dtypes,
    'null_count': df_19_1.isnull().sum(),
    'unique_count': df_19_1.nunique()
})
print(df_info)

             dtype  null_count  unique_count
country     object           0            22
year         int32           0            59
value      float64           0            90
age        float64           0            90
indicator   object           0             1
unit        object           0             1
source      object           0             1


In [199]:
#df_19_1.to_csv('../data/Cleaned/age_of_mothers_at_childbirth_oecd.csv', index=False)

In [200]:
#df_19_1.to_sql('age_of_mothers_at_childbirth_oecd', engine, if_exists='replace', index=False)

In [201]:
df_19_2 = pd.read_csv('../data/Raw/OECD/SF_2_3_Age_mothers_childbirth_fertility_by_age_1960_S2.csv')
#fertility_per_1000_from 1960
df_19_2.head()

Unnamed: 0,Country,Age group,1960,1961,1962,1963,1964,1965,1966,1967,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Australia,15-19,443,474,447,459,470,475,489,484,...,161,146,129,120,105,103,95,88,79,71
1,Australia,20-24,2201,2258,2160,2082,1905,1793,1731,1708,...,532,513,474,473,447,431,428,401,377,388
2,Australia,25-29,2163,2212,2167,2112,1981,1885,1839,1850,...,1026,991,948,934,922,897,893,843,803,867
3,Australia,30-34,1275,1311,1277,1239,1191,1101,1051,1028,...,1269,1248,1204,1217,1236,1191,1201,1156,1114,1206
4,Australia,35-39,623,634,614,597,584,530,506,478,...,715,709,692,698,720,713,716,693,663,709


In [202]:
df_info = pd.DataFrame({
    'dtype': df_19_2.dtypes,
    'null_count': df_19_2.isnull().sum(),
    'unique_count': df_19_2.nunique()
})
print(df_info)

            dtype  null_count  unique_count
Country    object           0            21
Age group  object           0             7
1960       object           0           136
1961       object           0           140
1962       object           0           140
...           ...         ...           ...
2017       object           0           124
2018       object           0           128
2019       object           0           126
2020       object           0           121
2021       object           7           119

[64 rows x 3 columns]


In [203]:
df_19_2.columns = df_19_2.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_19_2.head()

Unnamed: 0,country,age_group,1960,1961,1962,1963,1964,1965,1966,1967,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Australia,15-19,443,474,447,459,470,475,489,484,...,161,146,129,120,105,103,95,88,79,71
1,Australia,20-24,2201,2258,2160,2082,1905,1793,1731,1708,...,532,513,474,473,447,431,428,401,377,388
2,Australia,25-29,2163,2212,2167,2112,1981,1885,1839,1850,...,1026,991,948,934,922,897,893,843,803,867
3,Australia,30-34,1275,1311,1277,1239,1191,1101,1051,1028,...,1269,1248,1204,1217,1236,1191,1201,1156,1114,1206
4,Australia,35-39,623,634,614,597,584,530,506,478,...,715,709,692,698,720,713,716,693,663,709


In [204]:
# --- Ensure "country" and "age_group" are strings
df_19_2["country"] = df_19_2["country"].astype(str).str.strip().str.title()
df_19_2["age_group"] = df_19_2["age_group"].astype(str).str.strip()

# --- Identify numeric columns: everything except country and birth order ---
num_cols = [c for c in df_19_2.columns if c not in ["country", "age_group"]]
# --- Robust cleaning -> convert to float ---
df_19_2[num_cols] = (
    df_19_2[num_cols].astype(str)
    .replace({"\xa0": "", "\u202f": "", ",": "."}, regex=True)
    .replace(r"[^\d\.\-]", "", regex=True)
    .replace(r"\.\.+", ".", regex=True)
    .replace(r"^\.$|^\s*$", np.nan, regex=True)
    .apply(pd.to_numeric, errors="coerce")
)
df_19_2[num_cols] = df_19_2[num_cols].round(2)

In [205]:
df_19_2.drop_duplicates(inplace=True)
df_19_2.dropna(inplace = True)

In [206]:
df_info = pd.DataFrame({
    'dtype': df_19_2.dtypes,
    'null_count': df_19_2.isnull().sum(),
    'unique_count': df_19_2.nunique()
})
print(df_info)

             dtype  null_count  unique_count
country     object           0            19
age_group   object           0             7
1960       float64           0           124
1961       float64           0           126
1962       float64           0           126
...            ...         ...           ...
2017       float64           0           118
2018       float64           0           121
2019       float64           0           120
2020       float64           0           115
2021       float64           0           118

[64 rows x 3 columns]


In [207]:
# --- Identify id columns and year columns
id_cols = [c for c in ['country','age','age_group','indicator'] if c in df_19_2.columns]
year_cols = [c for c in df_19_2.columns if re.fullmatch(r'\d{4}', c)]

# --- Wide → Long
df_19_2 = df_19_2.melt(
    id_vars=id_cols,
    value_vars=year_cols,
    var_name='year',
    value_name='value'
)

# --- Fix types
df_19_2['year'] = df_19_2['year'].astype(int)
df_19_2['value'] = pd.to_numeric(df_19_2['value'], errors='coerce')
df_19_2 = df_19_2.dropna(subset=['value'])

# --- Add unit & indicator if missing
if 'indicator' not in df_19_2.columns:
    df_19_2['indicator'] = 'fertility_by_age'
df_19_2['unit'] = '‰'
df_19_2['source'] = 'OECD'

In [208]:
df_19_2

Unnamed: 0,country,age_group,year,value,indicator,unit,source
0,Australia,15-19,1960,44.3,fertility_by_age,‰,OECD
1,Australia,20-24,1960,220.1,fertility_by_age,‰,OECD
2,Australia,25-29,1960,216.3,fertility_by_age,‰,OECD
3,Australia,30-34,1960,127.5,fertility_by_age,‰,OECD
4,Australia,35-39,1960,62.3,fertility_by_age,‰,OECD
...,...,...,...,...,...,...,...
8241,Bulgaria,25-29,2021,91.7,fertility_by_age,‰,OECD
8242,Bulgaria,30-34,2021,73.6,fertility_by_age,‰,OECD
8243,Bulgaria,35-39,2021,34.8,fertility_by_age,‰,OECD
8244,Bulgaria,40-44,2021,7.8,fertility_by_age,‰,OECD


In [209]:
df_19_2.to_csv('../data/Cleaned/cleaned_fertility_per_1000_from_1960_oecd.csv', index=False)

In [210]:
df_19_2.to_sql('fertility_per_1000_from_1960_oecd', engine, if_exists='replace', index=False)

246

In [211]:
df_19_3 = pd.read_csv('../data/Raw/OECD/SF_2_3_Age_mothers_childbirth_fertility_by_age_2000_S3.csv')
#fertility_per_1000_from_2000
df_19_3

Unnamed: 0,Country,Age group,2000,2001,2002,2003,2004,2005,2006,2007,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,OECD-Average,15-19,226,220,211,205,203,201,200,205,...,179,168,162,152,144,135,126,117,102,95
1,OECD-Average,20-24,717,693,668,655,647,632,629,630,...,564,538,533,519,504,488,470,450,420,405
2,OECD-Average,25-29,1079,1050,1031,1035,1034,1023,1026,1034,...,994,965,969,961,949,928,907,884,855,869
3,OECD-Average,30-34,881,872,886,911,934,946,976,1000,...,1036,1019,1040,1049,1053,1041,1033,1017,996,1036
4,OECD-Average,35-39,381,386,395,406,422,435,456,477,...,531,534,551,563,571,570,574,575,559,587
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296,Romania,25-29,782,770,786,820,848,908,923,930,...,918,883,944,989,1001,1090,1083,1091,1094,1109
297,Romania,30-34,388,381,388,388,416,475,511,542,...,666,648,715,754,785,866,859,864,871,875
298,Romania,35-39,134,138,152,194,232,251,257,249,...,273,274,299,321,330,368,367,383,406,411
299,Romania,40-44,31,31,30,29,31,31,28,31,...,49,48,56,61,68,73,78,80,85,82


In [212]:
df_info = pd.DataFrame({
    'dtype': df_19_3.dtypes,
    'null_count': df_19_3.isnull().sum(),
    'unique_count': df_19_3.nunique()
})
print(df_info)

            dtype  null_count  unique_count
Country    object           0            43
Age group  object           0             7
2000       object           0           233
2001       object           0           248
2002       object           0           240
2003       object           0           239
2004       object           0           245
2005       object           0           240
2006       object           0           239
2007       object           0           242
2008       object           0           252
2009       object           0           251
2010       object           0           239
2011       object           0           235
2012       object           0           242
2013       object           0           234
2014       object           0           238
2015       object           0           237
2016       object           0           248
2017       object           0           236
2018       object           0           245
2019       object           0   

In [213]:
df_19_3.columns = df_19_3.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_19_3.head()

Unnamed: 0,country,age_group,2000,2001,2002,2003,2004,2005,2006,2007,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,OECD-Average,15-19,226,220,211,205,203,201,200,205,...,179,168,162,152,144,135,126,117,102,95
1,OECD-Average,20-24,717,693,668,655,647,632,629,630,...,564,538,533,519,504,488,470,450,420,405
2,OECD-Average,25-29,1079,1050,1031,1035,1034,1023,1026,1034,...,994,965,969,961,949,928,907,884,855,869
3,OECD-Average,30-34,881,872,886,911,934,946,976,1000,...,1036,1019,1040,1049,1053,1041,1033,1017,996,1036
4,OECD-Average,35-39,381,386,395,406,422,435,456,477,...,531,534,551,563,571,570,574,575,559,587


In [214]:
# --- Ensure "country" and "age_group" are strings
df_19_3["country"] = df_19_3["country"].astype(str).str.strip().str.title()
df_19_3["age_group"] = df_19_3["age_group"].astype(str).str.strip()

# --- Identify numeric columns: everything except country and birth order ---
num_cols = [c for c in df_19_3.columns if c not in ["country", "age_group"]]
# --- Robust cleaning -> convert to float ---
df_19_3[num_cols] = (
    df_19_3[num_cols].astype(str)
    .replace({"\xa0": "", "\u202f": "", ",": "."}, regex=True)
    .replace(r"[^\d\.\-]", "", regex=True)
    .replace(r"\.\.+", ".", regex=True)
    .replace(r"^\.$|^\s*$", np.nan, regex=True)
    .apply(pd.to_numeric, errors="coerce")
)
df_19_3[num_cols] = df_19_3[num_cols].round(2)

In [215]:
df_19_3.drop_duplicates(inplace=True)
df_19_3.dropna(inplace=True)

In [216]:
# --- Identify id columns and year columns
id_cols = [c for c in ['country','age','age_group','indicator'] if c in df_19_3.columns]
year_cols = [c for c in df_19_3.columns if re.fullmatch(r'\d{4}', c)]

# --- Wide → Long
df_19_3 = df_19_3.melt(
    id_vars=id_cols,
    value_vars=year_cols,
    var_name='year',
    value_name='value'
)

# --- Fix types
df_19_3['year'] = df_19_3['year'].astype(int)
df_19_3['value'] = pd.to_numeric(df_19_3['value'], errors='coerce')
df_19_3 = df_19_3.dropna(subset=['value'])

# --- Add unit & indicator if missing
if 'indicator' not in df_19_3.columns:
    df_19_3['indicator'] = 'fertility_by_age'
df_19_3['unit'] = '‰'
df_19_3['source'] = 'OECD'

In [217]:
df_19_3

Unnamed: 0,country,age_group,year,value,indicator,unit,source
0,Oecd-Average,15-19,2000,22.6,fertility_by_age,‰,OECD
1,Oecd-Average,20-24,2000,71.7,fertility_by_age,‰,OECD
2,Oecd-Average,25-29,2000,107.9,fertility_by_age,‰,OECD
3,Oecd-Average,30-34,2000,88.1,fertility_by_age,‰,OECD
4,Oecd-Average,35-39,2000,38.1,fertility_by_age,‰,OECD
...,...,...,...,...,...,...,...
6287,Romania,25-29,2021,110.9,fertility_by_age,‰,OECD
6288,Romania,30-34,2021,87.5,fertility_by_age,‰,OECD
6289,Romania,35-39,2021,41.1,fertility_by_age,‰,OECD
6290,Romania,40-44,2021,8.2,fertility_by_age,‰,OECD


In [218]:
#Check again
df_info = pd.DataFrame({
    'dtype': df_19_3.dtypes,
    'null_count': df_19_3.isnull().sum(),
    'unique_count': df_19_3.nunique()
})
print(df_info)

             dtype  null_count  unique_count
country     object           0            41
age_group   object           0             7
year         int32           0            22
value      float64           0          1337
indicator   object           0             1
unit        object           0             1
source      object           0             1


In [219]:
df_19_3.to_csv('../data/Cleaned/cleaned_fertility_per_1000_from_2000_oecd.csv',index=False)

In [220]:
df_19_3.to_sql('fertility_per_1000_from_2000_oecd',engine, if_exists='replace', index=False)

292

In [221]:
df_20= pd.read_csv('../data/Raw/OECD/SF_2_4_Share_births_outside_marriage_1960.csv')
#(%)share_of_births_outside_of_marriage
df_20

Unnamed: 0,Country,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,Austria,130,126,120,116,113,112,114,115,120,...,404,415,414,417,421,422,420,413,406,412
1,Belgium,21,20,21,22,23,24,25,25,27,...,470,477,495,494,480,490,528,524,..,..
2,Czech Republic,49,46,45,47,48,50,53,53,54,...,418,434,450,467,478,486,490,485,482,485
3,Denmark,78,80,83,89,93,95,102,111,111,...,490,506,515,525,538,540,542,542,541,542
4,Finland,40,41,40,42,44,46,48,51,53,...,409,415,421,428,443,449,448,446,454,461
5,Germany,76,71,66,61,59,58,57,58,61,...,339,345,348,350,350,355,347,339,333,331
6,Greece,12,12,12,12,11,11,10,10,11,...,74,76,70,82,88,94,103,111,124,138
7,Hungary,55,55,54,53,52,52,51,50,50,...,423,445,456,473,479,467,447,439,387,304
8,Iceland,253,253,245,251,267,269,284,300,305,...,650,669,..,..,..,696,712,705,694,..
9,Ireland,16,16,18,18,20,22,23,25,26,...,339,351,353,363,366,367,376,379,384,..


In [222]:
df_info = pd.DataFrame({
    'dtype': df_20.dtypes,
    'null_count': df_20.isnull().sum(),
    'unique_count': df_20.nunique()
})
print(df_info)

          dtype  null_count  unique_count
Country  object           0            26
1960     object           0            26
1961     object           0            24
1962     object           0            24
1963     object           0            24
...         ...         ...           ...
2016     object           0            24
2017     object           0            26
2018     object           0            25
2019     object           0            25
2020     object           0            24

[62 rows x 3 columns]


In [223]:
df_20.columns = df_20.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [224]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_20.columns if c != "country"]

df_20[num_cols] = (
    df_20[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [225]:
df_20.drop_duplicates(inplace=True)
df_20.dropna(inplace=True)



In [226]:

# Identify columns
id_cols = [c for c in ['country','category','sex'] if c in df_20.columns]
year_cols = [c for c in df_20.columns if re.fullmatch(r'\d{4}', c)]

# Melt wide → long
df_20 = df_20.melt(id_vars=id_cols, value_vars=year_cols,
                       var_name='year', value_name='value')

# Fix dtypes
df_20['year'] = df_20['year'].astype(int)
df_20['value'] = pd.to_numeric(df_20['value'], errors='coerce')
df_20 = df_20.dropna(subset=['value'])

# Add required columns
if 'category' not in df_20.columns: df_20['category'] = 'total'
if 'sex' not in df_20.columns:      df_20['sex'] = 'all'
df_20['indicator'] = 'mean_age_first_marriage'
df_20['unit'] = '%'
df_20['source'] = 'OECD'

df_20.sample(10)

Unnamed: 0,country,year,value,category,sex,indicator,unit,source
127,Sweden,1965,13.8,total,all,mean_age_first_marriage,%,OECD
1250,Switzerland,2016,24.2,total,all,mean_age_first_marriage,%,OECD
713,Luxembourg,1992,12.7,total,all,mean_age_first_marriage,%,OECD
163,Luxembourg,1967,3.5,total,all,mean_age_first_marriage,%,OECD
1270,Spain,2017,46.8,total,all,mean_age_first_marriage,%,OECD
508,Denmark,1983,40.6,total,all,mean_age_first_marriage,%,OECD
914,Norway,2001,49.7,total,all,mean_age_first_marriage,%,OECD
645,Italy,1989,6.1,total,all,mean_age_first_marriage,%,OECD
37,Slovenia,1961,10.0,total,all,mean_age_first_marriage,%,OECD
412,Spain,1978,2.5,total,all,mean_age_first_marriage,%,OECD


In [227]:


df_info = pd.DataFrame({
    'dtype': df_20.dtypes,
    'null_count': df_20.isnull().sum(),
    'unique_count': df_20.nunique()
})
print(df_info)

             dtype  null_count  unique_count
country     object           0            22
year         int32           0            61
value      float64           0           488
category    object           0             1
sex         object           0             1
indicator   object           0             1
unit        object           0             1
source      object           0             1


In [228]:
#df_20.to_csv('../data/Cleaned/cleaned_share_of_births_outside_of_marriage_oecd.csv', index=False)

In [229]:
#df_20.to_sql('share_of_births_outside_of_marriage_oecd',engine, if_exists='replace', index=False)

In [230]:
df_21_1= pd.read_csv('../data/Raw/OECD/SF_3_1_Marriage_divorce_rate_mean_age_first_marriage_S1.csv')
#mean_age_first_marriage
df_21_1

Unnamed: 0,Country,Gender,1990,1991,1992,1993,1994,1995,1996,1997,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,Australia,Male,265,267,269,270,272,273,276,278,...,297,298,299,300,301,303,304,307,307,306
1,Australia,Female,243,245,247,248,251,253,257,259,...,280,281,283,284,285,287,288,292,293,292
2,Czechia,Male,243,243,245,247,251,255,259,265,...,310,312,313,314,316,317,318,319,320,324
3,Czechia,Female,216,216,219,221,224,228,231,236,...,281,283,285,287,288,290,291,292,294,297
4,Denmark,Male,305,306,310,314,318,319,325,322,...,338,343,344,344,343,347,348,349,351,353
5,Denmark,Female,278,280,283,288,292,292,299,301,...,314,318,319,319,319,322,324,325,328,330
6,Greece,Male,290,293,296,297,299,301,302,306,...,327,328,329,330,332,332,333,334,337,338
7,Greece,Female,249,252,255,255,258,260,263,266,...,294,295,297,299,301,301,303,303,307,307
8,Japan,Male,284,284,284,284,285,285,285,285,...,307,308,309,311,311,311,311,311,312,310
9,Japan,Female,259,259,260,261,262,263,264,266,...,290,292,293,294,294,294,294,294,296,294


In [231]:
df_info = pd.DataFrame({
    'datatypes': df_21_1.dtypes,
    'null_count': df_21_1.isnull().sum(),
    'unique_count': df_21_1.nunique()
})
print(df_info)

        datatypes  null_count  unique_count
Country    object           0            10
Gender     object           0             2
1990       object           0            17
1991       object           0            18
1992       object           0            18
1993       object           0            19
1994       object           0            16
1995       object           0            18
1996       object           0            19
1997       object           0            17
1998       object           0            14
1999       object           0            19
2000       object           0            18
2001       object           0            18
2002       object           0            19
2003       object           0            19
2004       object           0            16
2005       object           0            18
2006       object           0            18
2007       object           0            19
2008       object           0            18
2009       object           0   

In [232]:
df_21_1.columns = df_21_1.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [233]:
# --- Ensure "country" and "gender" are strings
df_21_1["country"] = df_21_1["country"].astype(str).str.strip().str.title()
df_21_1["gender"] = df_21_1["gender"].astype(str).str.strip()

# --- Identify numeric columns: everything except country and birth order ---
num_cols = [c for c in df_21_1.columns if c not in ["country", "gender"]]
# --- Robust cleaning -> convert to float ---
df_21_1[num_cols] = (
    df_21_1[num_cols].astype(str)
    .replace({"\xa0": "", "\u202f": "", ",": "."}, regex=True)
    .replace(r"[^\d\.\-]", "", regex=True)
    .replace(r"\.\.+", ".", regex=True)
    .replace(r"^\.$|^\s*$", np.nan, regex=True)
    .apply(pd.to_numeric, errors="coerce")
)
df_21_1[num_cols] = df_21_1[num_cols].round(2)

In [234]:
# 1) Gender -> sex
df_21_1.columns = df_21_1.columns.astype(str).str.strip()
df_21_1.rename(columns={'Gender':'sex','gender':'sex'}, inplace=True)
df_21_1['sex'] = df_21_1['sex'].astype(str).str.strip().str.lower()

# 2) Year columns
year_cols = [c for c in df_21_1.columns if re.fullmatch(r'\d{4}', str(c))]

# 3) Wide -> Long (keep sex)
df_21_1 = df_21_1.melt(
    id_vars=[c for c in ['country','sex'] if c in df_21_1.columns],
    value_vars=year_cols,
    var_name='year',
    value_name='value'
)

# 4) Types
df_21_1['year']  = df_21_1['year'].astype(int)
df_21_1['value'] = pd.to_numeric(df_21_1['value'], errors='coerce')
df_21_1.dropna(subset=['value'], inplace=True)

# 5) Metadata
df_21_1['indicator'] = 'divorce_rates_per_1000'
df_21_1['unit']      = '‰'
df_21_1['source']    = 'OECD'


# 7) Final columns (only those that exist)
final_cols = [c for c in ['country','year','indicator','sex','unit','value','source'] if c in df_21_1.columns]
df_21_1 = df_21_1[final_cols]
df_21_1.sample(10)

Unnamed: 0,country,year,indicator,sex,unit,value,source
74,Slovenia,1993,divorce_rates_per_1000,male,‰,27.8,OECD
375,Slovenia,2008,divorce_rates_per_1000,female,‰,28.6,OECD
407,Greece,2010,divorce_rates_per_1000,female,‰,29.3,OECD
318,United States,2005,divorce_rates_per_1000,male,‰,27.1,OECD
550,Korea,2017,divorce_rates_per_1000,male,‰,32.9,OECD
447,Greece,2012,divorce_rates_per_1000,female,‰,29.5,OECD
142,Czechia,1997,divorce_rates_per_1000,male,‰,26.5,OECD
346,Greece,2007,divorce_rates_per_1000,male,‰,32.4,OECD
517,United Kingdom,2015,divorce_rates_per_1000,female,‰,31.2,OECD
436,United Kingdom,2011,divorce_rates_per_1000,male,‰,32.2,OECD


In [235]:
df_21_1.drop_duplicates(inplace=True)
df_21_1.dropna(inplace=True)

df_info = pd.DataFrame({
    'datatypes': df_21_1.dtypes,
    'null_count': df_21_1.isnull().sum(),
    'unique_count': df_21_1.nunique()
})
print(df_info)

          datatypes  null_count  unique_count
country      object           0            10
year          int32           0            31
indicator    object           0             1
sex          object           0             2
unit         object           0             1
value       float64           0           114
source       object           0             1


In [236]:
df_21_1.to_csv('../data/Cleaned/cleaned_mean_age_first_marriage_oecd.csv',index=False)

In [237]:
df_21_1.to_sql('mean_age_first_marriage_oecd', engine, if_exists='replace', index= False)

618

In [238]:
df_21_2 = pd.read_csv('../data/Raw/OECD/SF_3_1_Marriage_divorce_rates_S2.csv')
#divorce_rates_per_1000_oecd
df_21_2

Unnamed: 0,Country,1970,1971,1972,1973,1974,1975,1976,1977,1978,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,Austria,14,13,13,13,14,14,15,15,16,...,19,19,19,18,18,18.0,18.0,17,16,15
1,Belgium,07,7,8,9,10,11,13,13,14,...,22,22,22,21,20,20.0,20.0,18,19,17
2,Czechia,22,24,23,25,25,26,25,25,26,...,27,25,25,24,24,23.0,23.0,20,20,19
3,Denmark,19,27,26,25,26,26,26,26,26,...,34,34,29,30,26,26.0,18.0,27,22,21
4,Estonia,32,32,33,32,33,34,36,39,38,...,25,24,26,25,25,24.0,21.0,19,,19
5,Finland,13,16,18,19,21,20,21,21,22,...,25,25,25,25,24,24.0,24.0,24,22,20
6,Germany,13,14,15,16,18,19,20,15,10,...,21,21,20,20,19,18.0,18.0,17,17,16
7,Greece,04,4,4,5,4,4,4,5,5,...,15,13,14,10,18,,,,,
8,Hungary,22,23,23,24,23,25,26,26,27,...,20,20,21,20,19,17.0,18.0,15,19,18
9,Italy,..,3,6,3,3,2,2,2,2,...,9,9,14,16,15,15.0,14.0,11,14,14


In [239]:
df_info = pd.DataFrame({
    'datatypes': df_21_2.dtypes,
    'null_count': df_21_2.isnull().sum(),
    'unique_count': df_21_2.nunique()
})
print(df_info)

        datatypes  null_count  unique_count
Country    object           0            28
1970       object           0            18
1971       object           0            19
1972       object           0            19
1973       object           0            18
1974       object           0            18
1975       object           0            19
1976       object           0            18
1977       object           0            18
1978       object           0            18
1979       object           0            15
1980       object           0            18
1981       object           0            20
1982       object           0            22
1983       object           0            24
1984       object           0            20
1985       object           0            19
1986       object           0            20
1987       object           0            20
1988       object           0            20
1989       object           0            19
1990       object           0   

In [240]:
df_21_2.columns = df_21_2.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [241]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_21_2.columns if c != "country"]

df_21_2[num_cols] = (
    df_21_2[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [242]:
df_21_2.drop_duplicates(inplace=True)
df_21_2.dropna(inplace=True)

df_info = pd.DataFrame({
    'datatypes': df_21_2.dtypes,
    'null_count': df_21_2.isnull().sum(),
    'unique_count': df_21_2.nunique()
})
print(df_info)

        datatypes  null_count  unique_count
country    object           0            23
1970      float64           0            15
1971      float64           0            17
1972      float64           0            15
1973      float64           0            14
1974      float64           0            15
1975      float64           0            16
1976      float64           0            14
1977      float64           0            13
1978      float64           0            15
1979      float64           0            12
1980      float64           0            14
1981      float64           0            17
1982      float64           0            17
1983      float64           0            19
1984      float64           0            16
1985      float64           0            15
1986      float64           0            16
1987      float64           0            16
1988      float64           0            15
1989      float64           0            15
1990      float64           0   

In [243]:
# Identify columns
id_cols = [c for c in ['country','category','sex'] if c in df_21_2.columns]
year_cols = [c for c in df_21_2.columns if re.fullmatch(r'\d{4}', c)]

# Melt wide → long
df_21_2 = df_21_2.melt(id_vars=id_cols, value_vars=year_cols,
                       var_name='year', value_name='value')

# Fix dtypes
df_21_2['year'] = df_21_2['year'].astype(int)
df_21_2['value'] = pd.to_numeric(df_21_2['value'], errors='coerce')
df_21_2 = df_21_2.dropna(subset=['value'])

# Add required columns
if 'sex' not in df_21_2.columns:      df_21_2['sex'] = 'all'
df_21_2['indicator'] = 'divorce_rates_per_1000'
df_21_2['unit'] = '‰'
df_21_2['source'] = 'OECD'
df_21_2

Unnamed: 0,country,year,value,sex,indicator,unit,source
0,Austria,1970,1.4,all,divorce_rates_per_1000,‰,OECD
1,Belgium,1970,0.7,all,divorce_rates_per_1000,‰,OECD
2,Czechia,1970,2.2,all,divorce_rates_per_1000,‰,OECD
3,Denmark,1970,1.9,all,divorce_rates_per_1000,‰,OECD
4,Finland,1970,1.3,all,divorce_rates_per_1000,‰,OECD
...,...,...,...,...,...,...,...
1214,Sweden,2022,2.1,all,divorce_rates_per_1000,‰,OECD
1215,Switzerland,2022,1.8,all,divorce_rates_per_1000,‰,OECD
1216,Bulgaria,2022,1.4,all,divorce_rates_per_1000,‰,OECD
1217,Croatia,2022,1.2,all,divorce_rates_per_1000,‰,OECD


In [244]:
#df_21_2.to_csv('../data/Cleaned/cleaned_divorce_rates_per_1000_oecd.csv', index=False)

In [245]:
#df_21_2.to_sql('divorce_rates_per_1000_oecd',engine, if_exists= 'replace' , index=False)

In [246]:
df_21_3= pd.read_csv('../data/Raw/OECD/SF_3_1_Marriage_divorce_rates_prev_marital_status_S3.csv')
#share_of_previous_marital_status
df_21_3.sample

<bound method NDFrame.sample of             Country Previous marital status  2000  2001  2002  2003  2004  \
0         Australia    Single never married  75,9  76,1  75,5  75,6  76,2   
1         Australia                Divorced  22,0  21,8  22,4  22,3  21,8   
2         Australia                 Widowed   2,1   2,1   2,1   2,1   1,9   
3           Austria    Single never married  76,6  74,7  74,1  73,7  72,9   
4           Austria                Divorced  22,2  24,2  24,7  25,2  25,9   
5           Austria                 Widowed   1,2   1,1   1,2   1,1   1,2   
6           Czechia    Single never married  74,9  74,5  74,3  74,0  73,9   
7           Czechia                Divorced  23,7  24,2  24,4  24,7  24,7   
8           Czechia                 Widowed   1,4   1,3   1,3   1,3   1,4   
9           Denmark    Single never married  75,9  76,0  76,2  76,4  76,0   
10          Denmark                Divorced  22,0  21,9  21,8  21,7  22,1   
11          Denmark                 Widowed 

In [247]:
df_info = pd.DataFrame({
    'datatypes': df_21_3.dtypes,
    'null_count': df_21_3.isnull().sum(),
    'unique_count': df_21_3.nunique()
})
print(df_info)

                        datatypes  null_count  unique_count
Country                    object           0            20
Previous marital status    object           0             3
2000                       object           0            47
2001                       object           0            51
2002                       object           0            56
2003                       object           0            50
2004                       object           0            50
2005                       object           0            52
2006                       object           0            49
2008                       object           0            47
2009                       object           0            50
2010                       object           0            49
2011                       object           0            49
2012                       object           0            53
2013                       object           0            49
2014                       object       

In [248]:
df_21_3.columns = df_21_3.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)

df_21_3.head()

Unnamed: 0,country,previous_marital_status,2000,2001,2002,2003,2004,2005,2006,2008,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Australia,Single never married,759,761,755,756,762,769,773,782,...,796,797,800,805,805,801,803,801,803,807
1,Australia,Divorced,220,218,224,223,218,213,209,202,...,190,188,186,182,181,185,183,185,183,180
2,Australia,Widowed,21,21,21,21,19,18,18,17,...,15,15,14,13,14,14,14,14,13,13
3,Austria,Single never married,766,747,741,737,729,731,739,748,...,755,757,767,771,775,777,781,781,782,780
4,Austria,Divorced,222,242,247,252,259,257,249,242,...,235,234,223,220,215,215,209,210,210,216


In [249]:
# --- Ensure id cols are strings
df_21_3["country"] = df_21_3["country"].astype(str).str.strip().str.title()
df_21_3["previous_marital_status"] = df_21_3["previous_marital_status"].astype(str).str.strip()

# --- Identify numeric columns
id_cols = {"country", "previous_marital_status"}
num_cols = [c for c in df_21_3.columns if c not in id_cols]

# --- Clean numeric cols
for c in num_cols:
    s = df_21_3[c].astype(str)

    # Normalize spaces & decimal separator
    s = (s.replace({"\xa0": "", "\u202f": ""}, regex=True)   # remove no-break/narrow spaces
           .str.replace(",", ".", regex=False))              # comma → dot

    # Keep only digits, dot, minus; collapse multiple dots
    s = (s.str.replace(r"[^\d\.\-]", "", regex=True)
           .str.replace(r"\.\.+", ".", regex=True)
           .str.replace(r"^\.$|^\s*$", "", regex=True))      # lone dot/empty → ""

    # Convert to numeric ("" → NaN)
    df_21_3[c] = pd.to_numeric(s, errors="coerce")

# --- Round numeric cols
df_21_3[num_cols] = df_21_3[num_cols].round(2)

# --- Round after numeric conversion
df_21_3[num_cols] = df_21_3[num_cols].round(2)

In [250]:
df_21_3.head()

Unnamed: 0,country,previous_marital_status,2000,2001,2002,2003,2004,2005,2006,2008,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Australia,Single never married,75.9,76.1,75.5,75.6,76.2,76.9,77.3,78.2,...,79.6,79.7,80.0,80.5,80.5,80.1,80.3,80.1,80.3,80.7
1,Australia,Divorced,22.0,21.8,22.4,22.3,21.8,21.3,20.9,20.2,...,19.0,18.8,18.6,18.2,18.1,18.5,18.3,18.5,18.3,18.0
2,Australia,Widowed,2.1,2.1,2.1,2.1,1.9,1.8,1.8,1.7,...,1.5,1.5,1.4,1.3,1.4,1.4,1.4,1.4,1.3,1.3
3,Austria,Single never married,76.6,74.7,74.1,73.7,72.9,73.1,73.9,74.8,...,75.5,75.7,76.7,77.1,77.5,77.7,78.1,78.1,78.2,78.0
4,Austria,Divorced,22.2,24.2,24.7,25.2,25.9,25.7,24.9,24.2,...,23.5,23.4,22.3,22.0,21.5,21.5,20.9,21.0,21.0,21.6


In [251]:
df_21_3.drop_duplicates(inplace=True)
df_21_3.dropna(inplace=True)

df_info = pd.DataFrame({
    'datatypes': df_21_3.dtypes,
    'null_count': df_21_3.isnull().sum(),
    'unique_count': df_21_3.nunique()
})
print(df_info)

                        datatypes  null_count  unique_count
country                    object           0            20
previous_marital_status    object           0             3
2000                      float64           0            47
2001                      float64           0            51
2002                      float64           0            56
2003                      float64           0            50
2004                      float64           0            50
2005                      float64           0            52
2006                      float64           0            49
2008                      float64           0            47
2009                      float64           0            50
2010                      float64           0            49
2011                      float64           0            49
2012                      float64           0            53
2013                      float64           0            49
2014                      float64       

In [252]:
df_21_3.sample(10)

Unnamed: 0,country,previous_marital_status,2000,2001,2002,2003,2004,2005,2006,2008,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
28,New Zealand,Divorced,23.9,23.5,23.5,23.5,23.4,23.3,22.6,21.3,...,19.9,19.8,20.1,19.8,19.6,19.7,19.0,18.5,18.3,17.8
24,Luxembourg,Single never married,76.3,75.3,76.1,74.0,75.6,74.8,74.5,75.9,...,76.2,77.9,76.8,74.9,75.6,75.3,74.3,75.5,76.9,77.3
9,Denmark,Single never married,75.9,76.0,76.2,76.4,76.0,75.6,75.6,76.3,...,77.2,76.0,75.0,76.2,76.1,76.9,76.4,77.1,77.6,78.3
27,New Zealand,Single never married,73.6,73.5,73.8,73.9,73.9,74.1,75.0,76.4,...,77.9,78.0,77.7,77.8,78.1,78.2,79.0,79.4,79.5,80.0
21,Lithuania,Single never married,79.1,78.8,78.9,79.3,78.1,77.9,77.5,77.3,...,79.3,78.2,78.8,78.4,77.8,78.1,76.8,76.4,75.7,74.8
31,Poland,Divorced,7.3,7.1,7.2,7.3,7.1,7.6,8.3,1.7,...,9.1,9.6,9.9,10.9,11.0,11.5,12.0,12.5,13.0,13.9
29,New Zealand,Widowed,2.5,2.9,2.7,2.5,2.7,2.6,2.3,2.4,...,2.2,2.2,2.1,2.4,2.3,2.1,2.0,2.0,2.1,2.1
22,Lithuania,Divorced,18.6,19.1,19.2,18.7,20.0,20.2,20.5,21.0,...,19.1,20.2,19.6,20.0,20.7,20.5,21.6,22.0,22.7,23.6
48,Switzerland,Single never married,78.0,77.9,77.9,77.4,77.2,77.1,77.1,77.3,...,78.1,78.0,79.0,79.1,79.7,79.7,80.3,80.4,80.9,80.9
52,Bulgaria,Divorced,11.4,10.9,10.2,10.9,11.2,11.7,11.3,10.6,...,11.1,11.0,11.1,11.2,11.0,10.6,10.6,11.7,11.6,12.2


In [253]:
year_cols = [c for c in df_21_3.columns if re.fullmatch(r"\d{4}", str(c))]

# Wide → Long
df_21_3 = df_21_3.melt(
    id_vars=["country", "previous_marital_status"],
    value_vars=year_cols,
    var_name="year",
    value_name="value"
)

df_21_3["year"] = df_21_3["year"].astype(int)
df_21_3["value"] = pd.to_numeric(df_21_3["value"], errors="coerce")
df_21_3 = df_21_3.dropna(subset=["value"])

# Standardize
df_21_3 = df_21_3.rename(columns={"previous_marital_status": "category"})
df_21_3["sex"] = "all"
df_21_3["indicator"] = "share_of_previous_marital_status"
df_21_3["unit"] = "%"
df_21_3["source"] = "OECD"

# Final column order
df_21_3 = df_21_3[["country","year","indicator","category","sex","unit","value","source"]]

df_21_3.sample(15)

Unnamed: 0,country,year,indicator,category,sex,unit,value,source
969,Denmark,2017,share_of_previous_marital_status,Single never married,all,%,77.1,OECD
696,Slovak Republic,2012,share_of_previous_marital_status,Single never married,all,%,86.2,OECD
1031,Denmark,2018,share_of_previous_marital_status,Widowed,all,%,1.4,OECD
907,Czechia,2016,share_of_previous_marital_status,Divorced,all,%,22.6,OECD
771,Bulgaria,2013,share_of_previous_marital_status,Single never married,all,%,88.1,OECD
534,Croatia,2009,share_of_previous_marital_status,Single never married,all,%,91.6,OECD
31,Poland,2000,share_of_previous_marital_status,Divorced,all,%,7.3,OECD
1111,Poland,2019,share_of_previous_marital_status,Divorced,all,%,13.9,OECD
556,Greece,2010,share_of_previous_marital_status,Divorced,all,%,11.1,OECD
389,New Zealand,2006,share_of_previous_marital_status,Widowed,all,%,2.3,OECD


In [254]:
#df_21_3.to_csv('../data/Cleaned/cleaned_share_of_previous_marital_status_oecd.csv', index=False)

In [255]:
#df_21_3.to_sql('share_of_previous_marital_status_oecd', engine, if_exists= 'replace', index =  False)

In [256]:
df_22_1 = pd.read_csv('../data/Raw/OECD/SF3_3_A_in_private_households_by_partnership_status_S1.csv')
#hauseholds_by_partnership_status_oecd
df_22_1

Unnamed: 0,Country,20+_All_Total_Living_with_a_partner(%),20+_All_Married or in a civil or registered partnership_living_with_a_partner(%),20+_All_Cohabiting_living_with_a_partner(%),20+_All_Not living with a partner(%),20/34_Total_living_with_a_partner(%),20/34_Married or in a civil or registered partnership_living_with_a_partner(%),20/34_Cohabiting_living_with_a_partner(%),Not living with a partner_Total(%),Living with at least one parent(%)
0,Australia (c),6379,5359,1020,3621,4706,2941,1765,5294,..
1,Austria,5880,4910,970,4120,3911,2215,1697,6089,3382
2,Belgium,6215,5351,864,3785,4528,2933,1594,5472,3134
3,Canada (d),6689,5446,1243,3311,5534,3355,2179,4466,..
4,Czech Republic,5117,4539,579,4883,3078,2132,946,6922,3620
5,Denmark,6415,5002,1412,3585,5054,2186,2868,4946,1067
6,Estonia,5393,3730,1664,4607,4531,1781,2750,5469,2646
7,France,6414,4941,1472,3586,5042,2189,2853,4958,2208
8,Germany,6261,5391,869,3739,3953,2215,1739,5974,2754
9,Greece,6023,5852,171,3977,3313,2924,390,6687,4543


In [257]:
df_info = pd.DataFrame({
    'datatypes': df_22_1.dtypes,
    'null_count': df_22_1.isnull().sum(),
    'unique_count': df_22_1.nunique()
})
print(df_info)

                                                   datatypes  null_count  \
Country                                               object           0   
20+_All_Total_Living_with_a_partner(%)                object           0   
20+_All_Married or in a civil or registered par...    object           0   
20+_All_Cohabiting_living_with_a_partner(%)           object           0   
20+_All_Not living with a partner(%)                  object           0   
20/34_Total_living_with_a_partner(%)                  object           0   
20/34_Married or in a civil or registered partn...    object           0   
20/34_Cohabiting_living_with_a_partner(%)             object           0   
Not living with a partner_Total(%)                    object           0   
Living with at least one parent(%)                    object           0   

                                                    unique_count  
Country                                                       37  
20+_All_Total_Living_with_a_p

In [258]:
df_22_1.columns = df_22_1.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)


df_22_1.head()

Unnamed: 0,country,20_all_total_living_with_a_partner,20_all_married_or_in_a_civil_or_registered_partnership_living_with_a_partner,20_all_cohabiting_living_with_a_partner,20_all_not_living_with_a_partner,2034_total_living_with_a_partner,2034_married_or_in_a_civil_or_registered_partnership_living_with_a_partner,2034_cohabiting_living_with_a_partner,not_living_with_a_partner_total,living_with_at_least_one_parent
0,Australia (c),6379,5359,1020,3621,4706,2941,1765,5294,..
1,Austria,5880,4910,970,4120,3911,2215,1697,6089,3382
2,Belgium,6215,5351,864,3785,4528,2933,1594,5472,3134
3,Canada (d),6689,5446,1243,3311,5534,3355,2179,4466,..
4,Czech Republic,5117,4539,579,4883,3078,2132,946,6922,3620


In [259]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_22_1.columns if c != "country"]

df_22_1[num_cols] = (
    df_22_1[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [260]:
df_22_1["country"] = df_22_1["country"].str.replace(r"\s*\(.*?\)", "", regex=True)
print(df_22_1["country"].unique())

['Australia' 'Austria' 'Belgium' 'Canada' 'Czech Republic' 'Denmark'
 'Estonia' 'France' 'Germany' 'Greece' 'Hungary' 'Iceland' 'Ireland'
 'Italy' 'Latvia' 'Luxembourg' 'Netherlands' 'New Zealand' 'Norway'
 'Poland' 'Portugal' 'Slovak Republic' 'Slovenia' 'Spain' 'Sweden'
 'Switzerland' 'United Kingdom' 'United States' 'OECD-28 average'
 'Bulgaria' 'Croatia' 'Cyprus' 'Lithuania' 'Malta' 'Romania' 'EU average'
 'Eurozone average']


In [261]:
df_22_1.drop_duplicates(inplace = True)
df_22_1.dropna(inplace=True)

if "year" not in df_22_1.columns :
    df_22_1["year"] = 2021
df_22_1["unit"] = "%"
df_22_1["source"] = "OECD"  
df_22_1.sample(8)

Unnamed: 0,country,20_all_total_living_with_a_partner,20_all_married_or_in_a_civil_or_registered_partnership_living_with_a_partner,20_all_cohabiting_living_with_a_partner,20_all_not_living_with_a_partner,2034_total_living_with_a_partner,2034_married_or_in_a_civil_or_registered_partnership_living_with_a_partner,2034_cohabiting_living_with_a_partner,not_living_with_a_partner_total,living_with_at_least_one_parent,year,unit,source
18,Norway,60.96,46.07,14.89,39.04,41.99,18.94,23.05,58.01,25.54,2021,%,OECD
9,Greece,60.23,58.52,1.71,39.77,33.13,29.24,3.9,66.87,45.43,2021,%,OECD
19,Poland,57.72,55.6,2.12,42.28,37.6,34.2,3.4,62.4,45.65,2021,%,OECD
4,Czech Republic,51.17,45.39,5.79,48.83,30.78,21.32,9.46,69.22,36.2,2021,%,OECD
11,Iceland,59.15,46.78,12.36,40.85,36.52,17.21,19.31,63.48,29.44,2021,%,OECD
28,OECD-28 average,59.81,49.84,9.97,40.19,40.34,23.45,16.89,59.34,34.65,2021,%,OECD
5,Denmark,64.15,50.02,14.12,35.85,50.54,21.86,28.68,49.46,10.67,2021,%,OECD
32,Lithuania,56.17,49.86,6.31,43.83,39.75,31.03,8.73,60.25,38.66,2021,%,OECD


In [262]:
df_info = pd.DataFrame({
    'datatypes': df_22_1.dtypes,
    'null_count': df_22_1.isnull().sum(),
    'unique_count': df_22_1.nunique()
})
print(df_info)

                                                   datatypes  null_count  \
country                                               object           0   
20_all_total_living_with_a_partner                   float64           0   
20_all_married_or_in_a_civil_or_registered_part...   float64           0   
20_all_cohabiting_living_with_a_partner              float64           0   
20_all_not_living_with_a_partner                     float64           0   
2034_total_living_with_a_partner                     float64           0   
2034_married_or_in_a_civil_or_registered_partne...   float64           0   
2034_cohabiting_living_with_a_partner                float64           0   
not_living_with_a_partner_total                      float64           0   
living_with_at_least_one_parent                      float64           0   
year                                                   int64           0   
unit                                                  object           0   
source      

In [263]:
#df_22_1.to_csv('../data/Cleaned/cleaned_households_by_partnership_status_oecd.csv', index=False)

In [264]:
#df_22_1.to_sql('households_by_partnership_status_oecd', engine, if_exists='replace', index= False)

In [265]:
df_22_2 = pd.read_csv('../data/Raw/OECD/SF3_3_B_ by level of educational attainment_S2.csv')
#level_of_educational_attainment
df_22_2

Unnamed: 0,Country,Low_Education_Total_living_with_a_partner(%),Low_educationMarried or in a civil or registered partnership_living_with_a_partner(%),Low_education_Cohabiting_living_with_a_partner(%),Not living with a partner_Low_education(%),Medium education_Total_Living with a partner(%),Medium education_Married or in a civil or registered partnership_Living with a partner(%),Medium education_Cohabiting_Living with a partner(%),Not living with a partner_Medium education(%),High education_Total_Living with a partner(%),High education_Married or in a civil or registered partnership_Living with a partner(%),High education_Cohabiting_Living with a partner(%),Not living with a partner_High education(%)
0,Austria,5681,5049,632,4319,5927,4873,1054,,6003,4838,1165,3997
1,Belgium,6228,5611,617,3772,6079,4980,1099,,6709,5658,1051,3291
2,Czech Republic,4081,3655,426,5919,5399,4787,612,4601.0,5729,5026,703,4271
3,Estonia,4217,2639,1578,5783,5441,3661,1779,4559.0,6014,4445,1569,3986
4,France,6112,5193,918,3888,6568,4917,1651,3432.0,6558,4660,1898,3442
5,Germany,5446,4879,567,4554,6238,5313,925,3762.0,6889,5916,974,3111
6,Greece,6381,6288,93,3619,5700,5488,212,4300.0,5833,5570,263,4167
7,Hungary,5033,4038,995,4967,5794,4678,1115,4206.0,5956,5102,855,4044
8,Iceland,5186,4102,1084,4814,5831,4657,1174,4169.0,6972,5453,1519,3028
9,Latvia,3627,2592,1035,6373,4932,3954,978,5068.0,5291,4539,752,4709


In [266]:
df_info = pd.DataFrame({
    'datatypes': df_22_2.dtypes,
    'null_count': df_22_2.isnull().sum(),
    'unique_count': df_22_2.nunique()
})
print(df_info)

                                                   datatypes  null_count  \
Country                                               object           0   
Low_Education_Total_living_with_a_partner(%)          object           0   
Low_educationMarried or in a civil or registere...    object           0   
Low_education_Cohabiting_living_with_a_partner(%)     object           0   
Not living with a partner_Low_education(%)            object           0   
Medium education_Total_Living with a partner(%)       object           0   
Medium education_Married or in a civil or regis...    object           0   
Medium education_Cohabiting_Living with a partn...    object           0   
Not living with a partner_Medium education(%)         object           2   
High education_Total_Living with a partner(%)         object           0   
High education_Married or in a civil or registe...    object           0   
High education_Cohabiting_Living with a partner(%)    object           0   
Not living w

In [267]:
df_22_2.columns = df_22_2.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)


df_22_2.head()

Unnamed: 0,country,low_education_total_living_with_a_partner,low_educationmarried_or_in_a_civil_or_registered_partnership_living_with_a_partner,low_education_cohabiting_living_with_a_partner,not_living_with_a_partner_low_education,medium_education_total_living_with_a_partner,medium_education_married_or_in_a_civil_or_registered_partnership_living_with_a_partner,medium_education_cohabiting_living_with_a_partner,not_living_with_a_partner_medium_education,high_education_total_living_with_a_partner,high_education_married_or_in_a_civil_or_registered_partnership_living_with_a_partner,high_education_cohabiting_living_with_a_partner,not_living_with_a_partner_high_education
0,Austria,5681,5049,632,4319,5927,4873,1054,,6003,4838,1165,3997
1,Belgium,6228,5611,617,3772,6079,4980,1099,,6709,5658,1051,3291
2,Czech Republic,4081,3655,426,5919,5399,4787,612,4601.0,5729,5026,703,4271
3,Estonia,4217,2639,1578,5783,5441,3661,1779,4559.0,6014,4445,1569,3986
4,France,6112,5193,918,3888,6568,4917,1651,3432.0,6558,4660,1898,3442


In [268]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_22_2.columns if c != "country"]

df_22_2[num_cols] = (
    df_22_2[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)

In [269]:
df_22_2["country"] = df_22_2["country"].str.replace(r"\s*\(.*?\)", "", regex=True)

print(df_22_2["country"].unique())

['Austria' 'Belgium' 'Czech Republic' 'Estonia' 'France' 'Germany'
 'Greece' 'Hungary' 'Iceland' 'Latvia' 'Luxembourg' 'Norway' 'Poland'
 'Portugal' 'Slovenia' 'Spain' 'Sweden' 'United Kingdom' 'OECD-19 average'
 'Bulgaria' 'Croatia' 'Cyprus' 'Lithuania' 'Malta' 'Romania' 'EU average'
 'Eurozone average']


In [270]:
df_22_2.drop_duplicates(inplace=True)
df_22_2.dropna(inplace=True)


In [271]:
if "year" not in df_22_1.columns :
    df_22_2["year"] = 2021
df_22_2["unit"] = "%"
df_22_2["source"] = "OECD"    

df_22_2.sample(10)

Unnamed: 0,country,low_education_total_living_with_a_partner,low_educationmarried_or_in_a_civil_or_registered_partnership_living_with_a_partner,low_education_cohabiting_living_with_a_partner,not_living_with_a_partner_low_education,medium_education_total_living_with_a_partner,medium_education_married_or_in_a_civil_or_registered_partnership_living_with_a_partner,medium_education_cohabiting_living_with_a_partner,not_living_with_a_partner_medium_education,high_education_total_living_with_a_partner,high_education_married_or_in_a_civil_or_registered_partnership_living_with_a_partner,high_education_cohabiting_living_with_a_partner,not_living_with_a_partner_high_education,unit,source
8,Iceland,51.86,41.02,10.84,48.14,58.31,46.57,11.74,41.69,69.72,54.53,15.19,30.28,%,OECD
23,Malta,65.14,63.3,1.84,34.86,54.84,51.17,3.67,45.16,57.23,53.14,4.09,42.77,%,OECD
12,Poland,48.72,47.04,1.68,51.28,61.65,59.51,2.14,38.35,58.52,55.81,2.71,41.48,%,OECD
20,Croatia,56.18,53.83,2.36,43.82,61.86,58.87,2.99,38.14,60.58,57.21,3.38,39.42,%,OECD
5,Germany,54.46,48.79,5.67,45.54,62.38,53.13,9.25,37.62,68.89,59.16,9.74,31.11,%,OECD
6,Greece,63.81,62.88,0.93,36.19,57.0,54.88,2.12,43.0,58.33,55.7,2.63,41.67,%,OECD
25,EU average,56.0,49.31,6.69,44.0,58.98,49.91,9.07,41.02,61.48,52.42,9.06,38.52,%,OECD
11,Norway,52.46,39.63,12.83,47.54,62.17,47.18,14.99,37.83,67.62,50.94,16.68,32.38,%,OECD
19,Bulgaria,58.16,45.7,12.46,41.84,59.72,51.6,8.12,40.28,63.75,55.52,8.23,36.25,%,OECD
15,Spain,63.17,56.31,6.86,36.83,58.8,47.83,10.97,41.2,59.82,48.37,11.45,40.18,%,OECD


In [272]:
df_info = pd.DataFrame({
    'datatypes': df_22_2.dtypes,
    'null_count': df_22_2.isnull().sum(),
    'unique_count': df_22_2.nunique()
})
print(df_info)

                                                   datatypes  null_count  \
country                                               object           0   
low_education_total_living_with_a_partner            float64           0   
low_educationmarried_or_in_a_civil_or_registere...   float64           0   
low_education_cohabiting_living_with_a_partner       float64           0   
not_living_with_a_partner_low_education              float64           0   
medium_education_total_living_with_a_partner         float64           0   
medium_education_married_or_in_a_civil_or_regis...   float64           0   
medium_education_cohabiting_living_with_a_partner    float64           0   
not_living_with_a_partner_medium_education           float64           0   
high_education_total_living_with_a_partner           float64           0   
high_education_married_or_in_a_civil_or_registe...   float64           0   
high_education_cohabiting_living_with_a_partner      float64           0   
not_living_w

In [273]:
#df_22_2.to_csv('../data/Cleaned/cleaned_level_of_educational_attainment_oecd.csv', index=False)

In [274]:
#df_22_2.to_sql('level_of_educational_attainment_oecd',engine, if_exists='replace', index= False)

In [275]:
df_23_1 = pd.read_csv('../data/Raw/OECD/SF_1_3_Living_arrangements_of_children_by_income_status.csv')

df_23_1.head()

Unnamed: 0,country,Married_living with two parents,Cohabiting_living with two parents,Mother_living with one parent,Father_living with one parent,Other,Married_living with two parents.1,Cohabiting_living with two parents.1,Mother_living with one parent.1,Father_living with one parent.1,Other.1
0,Austria,6712,347,2710,169,62,7093,1368,1320,144,75
1,Belgium,2824,1559,4727,646,245,5817,2164,1509,432,78
2,Czech Republic,2954,2958,3756,238,94,7182,1785,884,91,58
3,Estonia,4198,2625,3019,33,125,5447,3195,1226,91,41
4,Finland,3293,1330,3892,1261,224,7011,1791,989,171,37


In [276]:
df_23_1.columns = df_23_1.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)


df_23_1.head()

Unnamed: 0,country,married_living_with_two_parents,cohabiting_living_with_two_parents,mother_living_with_one_parent,father_living_with_one_parent,other,married_living_with_two_parents1,cohabiting_living_with_two_parents1,mother_living_with_one_parent1,father_living_with_one_parent1,other1
0,Austria,6712,347,2710,169,62,7093,1368,1320,144,75
1,Belgium,2824,1559,4727,646,245,5817,2164,1509,432,78
2,Czech Republic,2954,2958,3756,238,94,7182,1785,884,91,58
3,Estonia,4198,2625,3019,33,125,5447,3195,1226,91,41
4,Finland,3293,1330,3892,1261,224,7011,1791,989,171,37


In [277]:
df_info = pd.DataFrame({
    'datatypes': df_23_1.dtypes,
    'null_count': df_23_1.isnull().sum(),
    'unique_count': df_23_1.nunique()
})
print(df_info)

                                    datatypes  null_count  unique_count
country                                object           0            31
married_living_with_two_parents        object           0            31
cohabiting_living_with_two_parents     object           0            31
mother_living_with_one_parent          object           0            31
father_living_with_one_parent          object           0            29
other                                  object           0            30
married_living_with_two_parents1       object           0            30
cohabiting_living_with_two_parents1    object           0            31
mother_living_with_one_parent1         object           0            31
father_living_with_one_parent1         object           0            30
other1                                 object           0            29


In [278]:
# 3) Drop empty rows and 'OECD average'
df_23_1 = df_23_1.dropna(subset=["country"])
df_23_1 = df_23_1[~df_23_1["country"].str.contains("OECD", case=False, na=False)]

# 4) Melt to long
df_23_1 = df_23_1.melt(id_vars="country",
                       var_name="arr_src",
                       value_name="value")

# 5) Build indicator from duplicate-column suffix (… and ….1)
df_23_1["indicator"] = df_23_1["arr_src"].str.endswith(".1").map(
    {True: "not_in_poverty", False: "living_in_poverty"}
)

# 6) Clean living_arrangement names (strip trailing '.1')
df_23_1["living_arrangement"] = df_23_1["arr_src"].str.replace(r"\.1$", "", regex=True)
df_23_1 = df_23_1.drop(columns="arr_src")

# 7) Convert numbers to float (handle decimal commas/spaces)
df_23_1["value"] = (
    df_23_1["value"].astype(str)
    .str.replace("\xa0", "", regex=False)
    .str.replace("\u202f", "", regex=False)
    .str.replace(",", ".", regex=False)
)
df_23_1["value"] = pd.to_numeric(df_23_1["value"], errors="coerce").astype("float64")

# 8) Final tidy ordering
df_23_1 = df_23_1[["country", "indicator", "living_arrangement", "value"]].reset_index(drop=True)

df_23_1.sample(15)
# tidy.head()

Unnamed: 0,country,indicator,living_arrangement,value
109,Slovenia,living_in_poverty,father_living_with_one_parent,6.1
34,Finland,living_in_poverty,cohabiting_living_with_two_parents,13.3
148,Romania,living_in_poverty,other,2.84
74,Netherlands,living_in_poverty,mother_living_with_one_parent,33.68
41,Latvia,living_in_poverty,cohabiting_living_with_two_parents,17.18
6,Greece,living_in_poverty,married_living_with_two_parents,85.64
183,Estonia,living_in_poverty,cohabiting_living_with_two_parents1,31.95
85,Croatia,living_in_poverty,mother_living_with_one_parent,13.86
114,Bulgaria,living_in_poverty,father_living_with_one_parent,4.56
261,Sweden,living_in_poverty,father_living_with_one_parent1,5.95


In [279]:
df_23_1.drop_duplicates(inplace = True)
df_23_1.dropna(inplace = True)

df_23_1["unit"] = "%"
df_23_1["source"] = "OECD"
df_23_1["indicator"] = "children_by_income_status"
df_23_1["year"] = '2016'
df_23_1.sample(12)

Unnamed: 0,country,indicator,living_arrangement,value,unit,source,year
244,Finland,children_by_income_status,father_living_with_one_parent1,1.71,%,OECD,2016
64,Finland,children_by_income_status,mother_living_with_one_parent,38.92,%,OECD,2016
230,Spain,children_by_income_status,mother_living_with_one_parent1,11.73,%,OECD,2016
235,Croatia,children_by_income_status,mother_living_with_one_parent1,6.05,%,OECD,2016
70,Italy,children_by_income_status,mother_living_with_one_parent,18.47,%,OECD,2016
112,Switzerland,children_by_income_status,father_living_with_one_parent,1.11,%,OECD,2016
156,Greece,children_by_income_status,married_living_with_two_parents1,92.23,%,OECD,2016
113,United Kingdom,children_by_income_status,father_living_with_one_parent,5.59,%,OECD,2016
286,Poland,children_by_income_status,other1,0.61,%,OECD,2016
214,Finland,children_by_income_status,mother_living_with_one_parent1,9.89,%,OECD,2016


In [280]:
df_info = pd.DataFrame({
    'datatypes': df_23_1.dtypes,
    'null_count': df_23_1.isnull().sum(),
    'unique_count': df_23_1.nunique()
})
print(df_info)

                   datatypes  null_count  unique_count
country               object           0            30
indicator             object           0             1
living_arrangement    object           0            10
value                float64           0           282
unit                  object           0             1
source                object           0             1
year                  object           0             1


In [None]:
#df_23_1.to_csv('../data/Cleaned/cleaned_children_by_income_status_oecd.csv', index=False)

In [None]:
#df_23_1.to_sql('children_by_income_status_oecd',engine, if_exists='replace', index=False)

300

In [283]:
df_23_2 = pd.read_csv('../data/Raw/OECD/SF_1_3_Living_arrangements_of_children_by_mothers_level_of_education.csv')
df_23_2.sample(10)

Unnamed: 0,country,Married,Cohabiting,Mother,Married.1,Cohabiting.1,Mother.1,Married.2,Cohabiting.2,Mother.2
10,Italy,8460,633,906,7797,875,1327,8170,856,974
2,Czech Republic,4137,3764,2099,6866,1888,1247,8004,1310,685
17,Portugal,6161,1550,2289,6750,1631,1619,7834,1120,1046
28,Cyprus,7427,958,1615,8409,298,1293,9158,175,667
3,Estonia,3665,4616,1719,5128,3084,1788,5941,2864,1195
5,France,5608,1990,2401,5001,2915,2084,6101,2765,1134
26,Bulgaria,3377,4920,1702,7031,1586,1383,8124,925,951
6,Greece,9225,148,627,9173,21,806,9362,69,569
31,EU average,6175,1823,2001,7049,1358,1594,7944,1077,980
4,Finland,5344,2343,2313,6239,2257,1504,7702,1531,767


In [284]:
df_info = pd.DataFrame({
    'datatypes': df_23_2.dtypes,
    'null_count': df_23_2.isnull().sum(),
    'unique_count': df_23_2.nunique()
})
print(df_info)

             datatypes  null_count  unique_count
country         object           0            32
Married         object           0            32
Cohabiting      object           0            32
Mother          object           0            32
Married.1       object           0            32
Cohabiting.1    object           0            32
Mother.1        object           0            32
Married.2       object           0            32
Cohabiting.2    object           0            32
Mother.2        object           0            31


In [285]:
df_23_2.columns = df_23_2.columns.str.lower() \
                .str.replace(' ', '_') \
                .str.replace('(', '') \
                .str.replace(')', '') \
                .str.replace('[^0-9a-zA-Z_]', '', regex=True)


df_23_2.head()

Unnamed: 0,country,married,cohabiting,mother,married1,cohabiting1,mother1,married2,cohabiting2,mother2
0,Austria,7509,683,1808,7138,1419,1444,7476,1380,1144
1,Belgium,4873,1987,3139,5517,2082,2401,6422,2375,1202
2,Czech Republic,4137,3764,2099,6866,1888,1247,8004,1310,685
3,Estonia,3665,4616,1719,5128,3084,1788,5941,2864,1195
4,Finland,5344,2343,2313,6239,2257,1504,7702,1531,767


In [286]:
# Convert all but 'country' to float (robust + compact)
num_cols = [c for c in df_23_2.columns if c != "country"]

df_23_2[num_cols] = (
    df_23_2[num_cols].astype(str)
    .replace({'\xa0': '', '\u202f': '', ',': '.'}, regex=True)   # spaces & decimal comma
    .replace(r'[^\d\.\-]', '', regex=True)                       # keep digits/dot/minus
    .replace(r'\.\.+', '.', regex=True)                          # collapse multi-dots
    .replace(r'^\.$|^\s*$', np.nan, regex=True)                  # lone dot/empty -> NaN
    .apply(pd.to_numeric, errors="coerce")
)
df_23_2.head()

Unnamed: 0,country,married,cohabiting,mother,married1,cohabiting1,mother1,married2,cohabiting2,mother2
0,Austria,75.09,6.83,18.08,71.38,14.19,14.44,74.76,13.8,11.44
1,Belgium,48.73,19.87,31.39,55.17,20.82,24.01,64.22,23.75,12.02
2,Czech Republic,41.37,37.64,20.99,68.66,18.88,12.47,80.04,13.1,6.85
3,Estonia,36.65,46.16,17.19,51.28,30.84,17.88,59.41,28.64,11.95
4,Finland,53.44,23.43,23.13,62.39,22.57,15.04,77.02,15.31,7.67


In [287]:
# --- 1) Wide -> Long
df_23_2 = df_23_2.melt(id_vars="country", var_name="col", value_name="value")

# Suffix -> education: .1/1 = medium, .2/2 = high, yoksa low
suf = df_23_2["col"].str.extract(r"(\.?\d)$")[0].fillna("")
df_23_2["mothers_education"] = suf.map({
    "1": "medium", ".1": "medium",
    "2": "high",   ".2": "high"
}).fillna("low")

# living_arrangement: sondaki .1/.2/1/2'yi at, isimleri normalize et
df_23_2["living_arrangement"] = (
    df_23_2["col"].str.replace(r"(\.?\d)$", "", regex=True)  # drop suffix
                      .str.strip().str.lower()
                      .map({"married":"married", "cohabiting":"cohabiting", "mother":"mother"})
)

# (opsiyonel) Üst kategori istersen:
df_23_2["category"] = df_23_2["living_arrangement"].map({"married":"two_parents","cohabiting":"two_parents","mother":"one_parent"})

# Son seçim (category istemiyorsan bu satırı kullan)
df_23_2 = df_23_2[["country", "mothers_education", "living_arrangement", "value"]].reset_index(drop=True)

df_23_2.sample(15)


Unnamed: 0,country,mothers_education,living_arrangement,value
100,Finland,medium,married,62.39
266,Italy,high,mother,9.74
6,Greece,low,married,92.25
186,Bulgaria,medium,mother,13.83
203,Latvia,high,married,66.42
95,EU average,low,mother,20.01
264,Iceland,high,mother,13.04
124,Cyprus,medium,married,84.09
55,United Kingdom,low,cohabiting,20.9
143,Norway,medium,cohabiting,25.79


In [288]:
df_23_2.drop_duplicates(inplace=True)
df_23_2.dropna(inplace=True)
df_23_2["unit"] = "%"
df_23_2["source"] = "OECD"
df_23_2["indicator"] = "children_by_mothers_level_of_education"
df_23_2["year"] = '2016'
df_23_2.sample(12)

Unnamed: 0,country,mothers_education,living_arrangement,value,unit,source,indicator,year
153,OECD average,medium,cohabiting,15.26,%,OECD,children_by_mothers_level_of_education,2016
79,Norway,low,mother,37.92,%,OECD,children_by_mothers_level_of_education,2016
43,Latvia,low,cohabiting,34.61,%,OECD,children_by_mothers_level_of_education,2016
146,Slovak Republic,medium,cohabiting,6.17,%,OECD,children_by_mothers_level_of_education,2016
115,Slovenia,medium,married,51.56,%,OECD,children_by_mothers_level_of_education,2016
243,Slovenia,high,cohabiting,29.3,%,OECD,children_by_mothers_level_of_education,2016
200,Iceland,high,married,63.7,%,OECD,children_by_mothers_level_of_education,2016
139,Latvia,medium,cohabiting,19.53,%,OECD,children_by_mothers_level_of_education,2016
190,Romania,medium,mother,7.28,%,OECD,children_by_mothers_level_of_education,2016
75,Latvia,low,mother,38.47,%,OECD,children_by_mothers_level_of_education,2016


In [289]:
df_info = pd.DataFrame({
    'datatypes': df_23_2.dtypes,
    'null_count': df_23_2.isnull().sum(),
    'unique_count': df_23_2.nunique()
})
print(df_info)

                   datatypes  null_count  unique_count
country               object           0            32
mothers_education     object           0             3
living_arrangement    object           0             3
value                float64           0           283
unit                  object           0             1
source                object           0             1
indicator             object           0             1
year                  object           0             1


In [None]:
#df_23_2.to_csv('../data/Cleaned/cleaned_children_by_mothers_level_of_education_oecd.csv', index=False)

In [None]:
#df_23_2.to_sql('children_by_mothers_level_of_education_oecd', engine, if_exists='replace', index=False)

288