In [456]:
import pandas as pd
import os
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from sqlalchemy import create_engine, types
from sqlalchemy import text 

In [457]:
from dotenv import dotenv_values

config = dotenv_values()

# define variables for the login
pg_user = config['POSTGRES_USER']  # align the key label with your .env file !
pg_host = config['POSTGRES_HOST']
pg_port = config['POSTGRES_PORT']
pg_db = config['POSTGRES_DB']
pg_schema = config['POSTGRES_SCHEMA']
pg_pass = config['POSTGRES_PASS']

In [458]:
url = f'postgresql://{pg_user}:{pg_pass}@{pg_host}:{pg_port}/{pg_db}'
engine = create_engine(url, echo=False)

In [459]:
my_schema = 'team_5' # update it to your schema

with engine.begin() as conn: 
    result = conn.execute(text(f'SET search_path TO {my_schema};'))

In [460]:
df_1= pd.read_csv('../data/Raw/World_Marriage_Dataset.csv')

In [461]:
df_1.drop(columns=["Sr.No."], inplace=True)

In [462]:
df_1.columns = df_1.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [463]:
df_1.rename(columns={
    "agegroup": "age_group",
    "maritalstatus": "marital_status",
    "dataprocess": "data_process",
    "Data Collection (Start Year)": "data_collection_start_year",
    "Data Collection (End Year)": "data_collection_end_year",
    "Data Source": "data_source",
    "Country": "country",
    "Sex": "sex"
}, inplace=True)

In [464]:
df_1.drop_duplicates(inplace=True)

df_1['data_collection_start_year'] = df_1['data_collection_start_year'].astype(str).str.replace(',', '').astype(int)
df_1['data_collection_end_year'] = df_1['data_collection_end_year'].astype(str).str.replace(',', '').astype(int)

In [465]:
df_1.isnull().sum()

country                       0
age_group                     0
sex                           0
marital_status                0
data_process                  0
data_collection_start_year    0
data_collection_end_year      0
data_source                   0
dtype: int64

In [466]:
#df_1.to_csv("cleaned_world_marriage.csv", index=False)

In [467]:
#df_1.to_sql('world_marriage', engine, if_exists='replace', index=False)

In [468]:
df_2 = pd.read_csv('../data/Raw/age-at-marriage-women.csv')

In [469]:
df_2.columns = df_2.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [470]:
df_2 = df_2.drop(columns=['1005564annotations'])

df_2.rename(columns={
    "entity": "country",   
}, inplace=True)

In [471]:
df_2.drop_duplicates(inplace=True)


In [472]:
df_2['year'] = df_2['year'].astype(str).str.replace(',', '').astype(int)

In [473]:
df_2.isnull().sum()

country                                0
code                                   0
year                                   0
mean_age_of_women_at_first_marriage    0
dtype: int64

In [474]:
#df_2.to_csv("cleaned_age_at_marriage_women.csv", index=False)

In [475]:
#df_2.to_sql('age_at_marriage_women', engine, if_exists='replace', index=False)

In [476]:
df_3= pd.read_csv('../data/Raw/marriage-rate-per-1000-inhabitants.csv')

In [477]:
df_3.columns = df_3.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [478]:
df_3.rename(columns={
    "entity": "country",   
}, inplace=True)

In [479]:
df_3['year'] = df_3['year'].astype(str).str.replace(',', '').astype(int)

In [480]:
df_3.drop_duplicates(inplace=True)


In [481]:
df_3.isnull().sum()

country                                          0
code                                             0
year                                             0
crude_marriage_rate_marriages_per_1000_people    0
dtype: int64

In [482]:
#df_3.to_csv("cleaned_marriage-rate-per-1000-inhabitants.csv", index=False)

In [483]:
#df_3.to_sql('married_rate_per_1000', engine, if_exists='replace', index=False)

In [484]:
df_4= pd.read_csv('../data/Raw/marriage-rates-in-1990-vs-2020.csv')

In [485]:
df_4.columns = df_4.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [486]:
df_4 = df_4.drop(columns=['worldregionsaccordingtoowid'])

df_4.rename(columns={
    "crudemarriageratemarriagesper1000people": "crude_marriage_rate",
    "crudemarriageratemarriagesper1000people1": "crude_marriage_rate_people1",
    "year1": "year_1",
    "entity": "country"
}, inplace=True)

In [487]:
df_4.drop_duplicates(inplace=True)
df_4.dropna(inplace=True)

In [488]:
df_4['year_1'] = pd.to_numeric(df_4['year_1'], errors='coerce').astype('Int64')

In [489]:
df_4.isnull().sum()

country                        0
code                           0
year                           0
crude_marriage_rate            0
crude_marriage_rate_people1    0
year_1                         0
dtype: int64

In [490]:
#df_4.to_csv("cleaned_marriage-rates-in-1990-vs-2020.csv", index=False)

In [491]:
#df_4.to_sql('marriage_rates_in_1990_vs_2020', engine, if_exists='replace', index=False)

In [492]:
df_5 = pd.read_csv('../data/Raw/share-of-births-outside-marriage.csv')

In [493]:
df_5.columns = df_5.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [494]:

df_5.rename(columns={
    "shareofbirthsoutsideofmarriageofallbirths": "share_of_births_outside_of_marriage",
    "entity": "country"
}, inplace=True)

df_5.drop_duplicates(inplace=True)

In [495]:
df_5.isnull().sum()

country                                0
code                                   0
year                                   0
share_of_births_outside_of_marriage    0
dtype: int64

In [496]:
#df_5.to_csv("cleaned_share-of-births-outside-marriage.csv", index=False)

In [497]:
#df_5.to_sql('share_of_births_outside_marriage', engine, if_exists='replace', index=False)

In [498]:
df_6 = pd.read_csv('../data/Raw/share-of-men-in-england-and-wales-who-have-ever-married-by-age.csv')

In [499]:
df_6.columns = df_6.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

df_6.drop_duplicates(inplace=True)
df_6.sample(5)

Unnamed: 0,entity,code,year,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort
61,Women,,44,84.0,91.3,95.4,86.5,74.1,,,
11,Men,,28,62.7,66.3,77.7,56.8,33.1,17.8,12.9,
5,Men,,22,13.5,16.8,25.8,18.8,6.8,2.5,1.1,0.4
62,Women,,45,84.2,91.4,95.4,86.7,74.5,,,
41,Women,,24,40.8,56.1,75.5,55.1,29.7,13.4,6.8,


In [500]:
df_6 = df_6.drop(columns=['code','proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort'])

df_6.rename(columns={
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort": "1900_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort": "1920_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort": "1940_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort": "1960_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort": "1970_birthcohort",
    "entity": "sex"
}, inplace=True)

In [501]:
df_6.isnull().sum()

sex                 0
year                0
1900_birthcohort    0
1920_birthcohort    0
1940_birthcohort    0
1960_birthcohort    0
1970_birthcohort    0
dtype: int64

In [502]:
#df_6.to_csv("cleaned_share-of-men-in-england-and-wales-who-have-ever-married-by-age.csv", index=False)

In [503]:
#df_6.to_sql('men_in_england_and_wales_married_by_age', engine, if_exists='replace', index=False)

In [504]:
df_7 = pd.read_csv('../data/Raw/share-of-births-outside-marriage.csv')

In [505]:
df_7.columns = df_7.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [506]:
df_7.rename(columns={
    "shareofsingleparenthouseholds": "share_of_single_parent_households",
    "entity": "country"
}, inplace=True)

df_7.drop_duplicates(inplace=True)
df_7.sample(5)

Unnamed: 0,country,code,year,shareofbirthsoutsideofmarriageofallbirths
1528,Norway,NOR,2013,55.2
343,Croatia,HRV,1979,5.1
88,Austria,AUT,2002,33.8
1294,Malta,MLT,1992,2.3
1202,Luxembourg,LUX,1961,3.4


In [507]:
df_7.isnull().sum()

country                                      0
code                                         0
year                                         0
shareofbirthsoutsideofmarriageofallbirths    0
dtype: int64

In [508]:
#df_7.to_csv("cleaned_share-of-single-parent-households.csv", index=False)

In [509]:
#df_7.to_sql('single_parent_households', engine, if_exists='replace', index=False)

In [510]:
df_8 = pd.read_csv('../data/Raw/share-of-women-in-england-and-wales-who-have-ever-married-by-age.csv')

In [511]:
df_8.columns = df_8.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [512]:
df_8['code'] = df_8['code'].fillna('GBR')
df_8.sample(5)

Unnamed: 0,entity,code,year,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort
66,Women,GBR,49,85.2,91.9,95.7,87.3,76.0,,,
41,Women,GBR,24,40.8,56.1,75.5,55.1,29.7,13.4,6.8,
15,Men,GBR,32,79.3,82.2,86.5,68.8,48.2,33.5,25.1,
62,Women,GBR,45,84.2,91.4,95.4,86.7,74.5,,,
14,Men,GBR,31,76.5,79.7,85.1,66.6,45.1,29.6,21.3,


In [513]:
df_8 = df_8.drop(columns=['code','proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort'])

df_8.rename(columns={
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort": "1900_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort": "1920_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort": "1940_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort": "1960_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort": "1970_birthcohort",
    "entity": "sex"
}, inplace=True)

df_8.drop_duplicates(inplace=True)
df_8.sample(5)

Unnamed: 0,sex,year,1900_birthcohort,1920_birthcohort,1940_birthcohort,1960_birthcohort,1970_birthcohort
14,Men,31,76.5,79.7,85.1,66.6,45.1
24,Men,41,90.2,90.3,91.4,78.6,65.0
62,Women,45,84.2,91.4,95.4,86.7,74.5
65,Women,48,85.0,91.8,95.6,87.2,75.7
10,Men,27,55.8,59.2,73.5,52.2,28.7


In [514]:
df_8.isnull().sum()

sex                 0
year                0
1900_birthcohort    0
1920_birthcohort    0
1940_birthcohort    0
1960_birthcohort    0
1970_birthcohort    0
dtype: int64

In [515]:
#df_8.to_csv("cleaned_share-of-women-in-england-and-wales-who-have-ever-married-by-age.csv", index=False)

In [516]:
#df_8.to_sql('women_in_england_and_wales_married_by_age', engine, if_exists='replace', index=False)

In [517]:
#!pip install openpyxl

In [518]:
df_excel_1 = pd.read_excel('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx')

In [519]:
#all_sheets = pd.read_excel('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx', sheet_name=None)

In [520]:
xls_1 = pd.ExcelFile('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx')
print(xls_1.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'MARITAL_STATUS_BY_AGE', 'CURRENTLY MARRIED', 'EVER_MARRIED', 'SMAM']


In [521]:
excel_1 = '../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx'

# Output directory (make sure it exists)
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)

# List of sheets you want to extract
sheets_to_extract = ['MARITAL_STATUS_BY_AGE', 'CURRENTLY MARRIED', 'EVER_MARRIED', 'SMAM']

In [522]:
"""for sheet in sheets_to_extract:
    # Read just this sheet into a DataFrame
    df_excel_1 = pd.read_excel(excel_1, sheet_name=sheet)
    
    # Optional: Clean the filename (replace spaces with underscores, etc.)
    csv_name = sheet.replace(' ', '_').lower() + '.csv'
    csv_path = os.path.join(output_dir, csv_name)
    
    # Save the DataFrame as CSV
    df_excel_1.to_csv(csv_path, index=False)
    print(f"Saved: {csv_path}")
"""

'for sheet in sheets_to_extract:\n    # Read just this sheet into a DataFrame\n    df_excel_1 = pd.read_excel(excel_1, sheet_name=sheet)\n    \n    # Optional: Clean the filename (replace spaces with underscores, etc.)\n    csv_name = sheet.replace(\' \', \'_\').lower() + \'.csv\'\n    csv_path = os.path.join(output_dir, csv_name)\n    \n    # Save the DataFrame as CSV\n    df_excel_1.to_csv(csv_path, index=False)\n    print(f"Saved: {csv_path}")\n'

In [523]:
xls_2 = pd.ExcelFile('../data/Raw/undesa_pd_2019_world_fertility_dataset.xlsx')
print(xls_2.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'FERTILITY INDICATORS']


In [524]:
excel_2 = '../data/Raw/undesa_pd_2019_world_fertility_dataset.xlsx'
sheet_name = 'FERTILITY INDICATORS'
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)

df_excel_2 = pd.read_excel(excel_2, sheet_name=sheet_name)


In [525]:
"""csv_name = sheet_name.replace(' ', '_').lower() + '.csv'
csv_path = os.path.join(output_dir, csv_name)
df_excel_2.to_csv(csv_path, index=False)
print(f"Saved: {csv_path}")
"""

'csv_name = sheet_name.replace(\' \', \'_\').lower() + \'.csv\'\ncsv_path = os.path.join(output_dir, csv_name)\ndf_excel_2.to_csv(csv_path, index=False)\nprint(f"Saved: {csv_path}")\n'

In [526]:
xls_3 = pd.ExcelFile('../data/Raw/undesa_pd_ds_1970-2030_fp_rev-2024_rev.xlsx')
print(xls_3.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'Countries', 'Regions']


In [527]:
excel_3 = '../data/Raw/undesa_pd_ds_1970-2030_fp_rev-2024_rev.xlsx'
sheets_to_extract = ['Countries', 'Regions']
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)


In [528]:
"""
for sheet in sheets_to_extract:
    df = pd.read_excel(excel_3, sheet_name=sheet)
    csv_name = sheet.replace(' ', '_').lower() + '.csv'
    csv_path = os.path.join(output_dir, csv_name)
    df.to_csv(csv_path, index=False)
    print(f"Saved: {csv_path}")

"""

'\nfor sheet in sheets_to_extract:\n    df = pd.read_excel(excel_3, sheet_name=sheet)\n    csv_name = sheet.replace(\' \', \'_\').lower() + \'.csv\'\n    csv_path = os.path.join(output_dir, csv_name)\n    df.to_csv(csv_path, index=False)\n    print(f"Saved: {csv_path}")\n\n'

In [529]:
df_9 = pd.read_csv('../data/Raw/unpopulation_dataportal_20250728095844.csv')
df_9.sample(5)

Unnamed: 0,IndicatorId,IndicatorName,IndicatorShortName,Source,SourceYear,Author,LocationId,Location,Iso2,Iso3,...,AgeStart,AgeEnd,Age,CategoryId,Category,EstimateTypeId,EstimateType,EstimateMethodId,EstimateMethod,Value
22805,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,788,Tunisia,TN,TUN,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,51.8
4888,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,175,Mayotte,YT,MYT,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,54.45
16300,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,562,Niger,NE,NER,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,86.27
11833,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,412,Kosovo (under UNSC res. 1244),XK,XKX,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,55.07
20791,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,710,South Africa,ZA,ZAF,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,36.15


In [530]:
df_9.columns = df_9.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_9.sample(5)

Unnamed: 0,indicatorid,indicatorname,indicatorshortname,source,sourceyear,author,locationid,location,iso2,iso3,...,agestart,ageend,age,categoryid,category,estimatetypeid,estimatetype,estimatemethodid,estimatemethod,value
12732,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,434,Libya,LY,LBY,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,45.81
22640,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,784,United Arab Emirates,AE,ARE,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,78.69
3739,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,132,Cabo Verde,CV,CPV,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,43.56
22798,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,788,Tunisia,TN,TUN,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,52.25
10284,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,360,Indonesia,ID,IDN,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,70.99


In [531]:
df_9 = df_9.drop(columns=['indicatorid','indicatorshortname',
    'source',
    'author', 'locationid', 'iso2','estimatetypeid','category','categoryid','agestart','ageend','author','ageid', 'estimatetype','variantid','sexid','timeid'])

df_9.rename(columns={
    "sourceyear": "year",
    "location": "country",
    "estimatemethodid": "estimate_method",
    "iso3": "code",
}, inplace=True)



In [532]:
df_9.drop_duplicates(inplace=True)

In [533]:
df_9

Unnamed: 0,indicatorname,year,country,code,time,variant,sex,age,estimate_method,estimatemethod,value
0,Currently married (Percent),2024,Afghanistan,AFG,1970,Median,Female,15-49,2,Interpolation,80.94
2,Currently married (Percent),2024,Afghanistan,AFG,1971,Median,Female,15-49,2,Interpolation,80.90
4,Currently married (Percent),2024,Afghanistan,AFG,1972,Median,Female,15-49,2,Interpolation,80.87
6,Currently married (Percent),2024,Afghanistan,AFG,1973,Median,Female,15-49,2,Interpolation,80.84
8,Currently married (Percent),2024,Afghanistan,AFG,1974,Median,Female,15-49,2,Interpolation,80.53
...,...,...,...,...,...,...,...,...,...,...,...
25078,Currently married (Percent),2024,Zambia,ZMB,2021,Median,Female,15-49,3,Projection,54.31
25080,Currently married (Percent),2024,Zambia,ZMB,2022,Median,Female,15-49,3,Projection,53.82
25082,Currently married (Percent),2024,Zambia,ZMB,2023,Median,Female,15-49,3,Projection,53.35
25084,Currently married (Percent),2024,Zambia,ZMB,2024,Median,Female,15-49,3,Projection,52.91


In [534]:
df_9.isnull().sum()

indicatorname      0
year               0
country            0
code               0
time               0
variant            0
sex                0
age                0
estimate_method    0
estimatemethod     0
value              0
dtype: int64

In [535]:
#df_9.to_csv("cleaned_unpopulation_dataportal.csv", index=False)

In [536]:
#df_9.to_sql('unpopulation_dataportal', engine, if_exists='replace', index=False)

In [537]:
df_10 = pd.read_csv('../data/processed/countries_un.csv',  header=5, low_memory=False)

In [538]:
df_10.columns = (
    df_10.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
)
df_10.sample(10)

Unnamed: 0,countryorarea,isocode,indicator,year,agegroup,percentage,number,dataprocess
90236,Netherlands,528,Married or in-union women,1990,35-39,85.53439,482.728727,Estimate
41607,Fiji,242,Married or in-union women,1986,15-49,62.89151,119.027843,Estimate
89021,Nauru,520,Married or in-union women,2000,40-44,76.3,0.257894,Estimate
6714,Bahamas,44,Married or in-union women,1999,25-29,43.611613,6.254996,Estimate
35775,Dominican Republic,214,Married or in-union women,1986,15-49,58.059295,939.899577,Estimate
109846,Saint Kitts and Nevis,659,Married or in-union women,2011,45-49,72.287087,1.181171,Projection
61284,Iraq,368,Married or in-union women,2016,35-39,85.726667,973.257418,Estimate
15424,Solomon Islands,90,Married or in-union women,2035,15-19,7.492839,3.71761,Projection
76524,"China, Macao SAR",446,Married or in-union women,1977,35-39,89.183554,3.287306,Estimate
12,Afghanistan,4,Married or in-union women,1971,35-39,93.4,273.831988,Estimate


In [539]:
df_10.rename(columns={
    "dataprocess": "data_process",
}, inplace=True)

df_10.drop_duplicates(inplace=True)
df_10.sample(5)

Unnamed: 0,countryorarea,isocode,indicator,year,agegroup,percentage,number,data_process
88247,Namibia,516,Married or in-union women,1984,15-49,47.135924,110.806659,Estimate
110831,Saint Lucia,662,Married or in-union women,1972,15-49,54.10493,11.500544,Estimate
44029,French Guiana,254,Married or in-union women,2046,40-44,59.841416,7.949633,Projection
106479,Qatar,634,Married or in-union women,1995,15-49,65.998483,60.719265,Estimate
18276,Burundi,108,Married or in-union women,1986,35-39,81.578684,99.413008,Estimate


In [540]:
for col in ['percentage', 'number']:
    if col in df_10.columns:
        df_10[col] = (
            df_10[col]
            .astype(str)
            .str.replace(',', '.', regex=False)
            .str.extract(r'([-+]?[0-9]*\.?[0-9]+)', expand=False)
            .astype(float)
            .round(2)
        )

In [541]:
unnamed_cols = [col for col in df_10.columns if 'unnamed' in col.lower()]
df_10.drop(columns=unnamed_cols, inplace=True)

In [542]:
df_10.dropna(inplace=True)

In [543]:
df_10.isnull().sum()

countryorarea    0
isocode          0
indicator        0
year             0
agegroup         0
percentage       0
number           0
data_process     0
dtype: int64

In [544]:
df_10.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145800 entries, 0 to 145799
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   countryorarea  145800 non-null  object 
 1   isocode        145800 non-null  int64  
 2   indicator      145800 non-null  object 
 3   year           145800 non-null  int64  
 4   agegroup       145800 non-null  object 
 5   percentage     145800 non-null  float64
 6   number         145800 non-null  float64
 7   data_process   145800 non-null  object 
dtypes: float64(2), int64(2), object(4)
memory usage: 8.9+ MB


In [545]:
#df_10.to_csv("cleaned_countries_1970_2025_un.csv", index=False)

In [546]:
#df_10.to_sql('countries_1970_2025_un', engine, if_exists='replace', index=False)

In [547]:
df_11 = pd.read_csv('../data/processed/currently_married_un.csv',  header=2, low_memory=False)

In [548]:
df_11.sample(8)

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,AgeGroup,AgeStart,AgeEnd,DataValue,DataProcess,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Data,Note on Country and Population
49439,Tonga,776,1976,1976,Women,[35-39],35,39,86.38,Census,1976 Census,84,Tonga 1976 Census,UNSD,1.0,,
53565,Zimbabwe,716,2009,2009,Men,[65-69],65,69,88.49,Survey,2009 MICS_HH,5044,Zimbabwe 2009 Multiple Indicator Cluster Survey,MICS_HH,,,
29490,Luxembourg,442,1981,1981,Women,[15-19],15,19,4.2,Census,1981 Census,61,Luxembourg 1981 Census,UNSD,,,
47849,Switzerland,756,1994,1994,Men,[40-44],40,44,77.76,Estimate,1994 Estimate,2228,Switzerland 1994 Estimate,UNSD,,,
53213,Zambia,894,2000,2000,Women,[55+],55,999,41.24,Census,2000 Census,324,Zambia 2000 Census,UNSD,1.0,,
34694,Nicaragua,558,1971,1971,Men,[35-39],35,39,84.12,Census,1971 Census,1135,Nicaragua 1971 Census,UNSD,1.0,,
20155,Hungary,348,1983,1983,Men,[65-69],65,69,83.38,Estimate,1983 Estimate,2120,Hungary 1983 Estimate,UNSD,,,
12222,Dominica,212,1991,1991,Men,[55-59],55,59,60.85,Census,1991 Census,2344,Dominica 1991 Census,US Census Bureau,,,


In [549]:
df_11.columns = (
    df_11.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
    )
df_11.sample(8)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,agegroup,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteondata,noteoncountryandpopulation
11586,Denmark,208,2001,2001,Men,[60-64],60,64,75.41,Estimate,2001 Estimate,2081,Denmark 2001 Estimate,UNSD,,,Excluding Faeroe Islands and Greenland shown s...
34097,Netherlands,528,2008,2008,Men,[15-19],15,19,0.02,Estimate,2008 Estimate,2170,Netherlands 2008 Estimate,UNSD,,,
10485,Czechia,203,1993,1993,Men,[30-34],30,34,75.62,Estimate,1993 Estimate,2079,Czech Republic 1993 Estimate,UNSD,,,
34956,Niger,562,1992,1992,Men,[20-24],20,24,27.7,Survey,1992 DHS,1731,Niger 1992 Demographic and Health Survey,DHS_HH,,,
1676,Austria,40,1973,1973,Women,[55-59],55,59,59.15,Estimate,1973 Estimate,2038,Austria 1973 Estimate,UNSD,,,
23652,Ireland,372,1991,1991,Men,[75+],75,999,49.82,Census,1991 Census,1080,Ireland 1991 Census,UNSD,1.0,,
41538,San Marino,674,1982,1982,Men,[45-49],45,49,92.25,Estimate,1982 Estimate,2208,San Marino 1982 Estimate,UNSD,,,
27969,Liberia,430,1969,1970,Men,[40-44],40,44,83.71,Survey,1969-1970 PGS,714,Liberia 1969-1970 Population Growth Survey,INED,,,


In [550]:
df_11 = df_11.drop(columns = ['datacataloglongname', 'datacatalogid', 'yearstart' , 'yearend', 'noteondata', 'noteoncountryandpopulation', 'including_consensual_unions'])

df_11.rename(columns={
    "agestart": "age_start",
    "countryorarea": "country",
    "datasource": "data_source",
    "datavalue" : "data_value"
}, inplace=True)

df_11.sample(10)

Unnamed: 0,country,isocode,sex,agegroup,age_start,ageend,data_value,dataprocess,datacatalogshortname,data_source
12624,Ecuador,218,Women,[40-44],40,44,78.44,Census,1974 Census,UNSD
32950,Namibia,516,Women,[45-49],45,49,62.08,Census,2001 Census,UNSD
18346,Greenland,304,Women,[60-64],60,64,53.76,Estimate,2011 Estimate,UNSD
42332,Saudi Arabia,682,Men,[60-64],60,64,96.2,Survey,2007 DS,UNSD
51884,United States Virgin Islands,850,Men,[65+],65,999,66.25,Census,1990 Census,US Census Bureau
38811,Portugal,620,Men,[25-29],25,29,20.25,Census,2011 Census,UNSD
5638,Burundi,108,Men,[25-29],25,29,75.2,Survey,2010 DHS,DHS_HH
9641,Costa Rica,188,Men,[50-54],50,54,71.2,Estimate,2016 Estimate,UNSD
9103,Congo,178,Men,[75+],75,999,79.7,Survey,2011-2012 DHS,DHS_HH
12386,Dominican Republic,214,Women,[40-44],40,44,73.66,Survey,2000 MICS,MICS


In [551]:
df_11.drop_duplicates(inplace=True)

In [552]:
df_11.isnull().sum()

country                 0
isocode                 0
sex                     0
agegroup                0
age_start               0
ageend                  0
data_value              0
dataprocess             0
datacatalogshortname    0
data_source             0
dtype: int64

In [553]:
#df_11.to_csv("cleaned_currently_married_un.csv", index=False)

In [554]:
#df_11.to_sql('currently_married_un', engine, if_exists='replace', index=False)

In [555]:
df_12 = pd.read_csv('../data/processed/ever_married_un.csv', header= 2, low_memory = False)
df_12.head()

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,AgeGroup,AgeStart,AgeEnd,DataValue,DataProcess,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Data,Note on Country and Population
0,Afghanistan,4,1972,1974,Men,[15-19],15,19,7.7,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
1,Afghanistan,4,1972,1974,Men,[20-24],20,24,32.6,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
2,Afghanistan,4,1972,1974,Men,[25-29],25,29,61.4,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
3,Afghanistan,4,1972,1974,Men,[30-34],30,34,83.0,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
4,Afghanistan,4,1972,1974,Men,[35-39],35,39,91.2,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,


In [556]:
df_12.columns = (
    df_12.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
    )
df_12.sample(8)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,agegroup,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteondata,noteoncountryandpopulation
19547,Greenland,304,1999,1999,Women,[75+],75,999,89.35,Estimate,1999 Estimate,2109,Greenland 1999 Estimate,UNSD,,,
48948,Sweden,752,1991,1991,Women,[20-24],20,24,11.16,Estimate,1991 Estimate,2227,Sweden 1991 Estimate,UNSD,,,
29769,Liberia,430,2008,2008,Men,[75+],75,999,90.08,Census,2008 Census,2425,Liberia 2008 Census,UNSD,1.0,,
23974,Indonesia,360,1990,1990,Men,[30-34],30,34,90.62,Census,1990 Census,1263,Indonesia 1990 Census,UNSD,,,
42175,Russian Federation,643,1989,1989,Men,[55-59],55,59,98.3,Census,1989 Census,1291,Russian Federation 1989 Census,UNSD,1.0,,
42884,Saint Vincent and the Grenadines,670,1991,1991,Women,[15-19],15,19,0.77,Census,1991 Census,377,Saint Vincent and the Grenadines 1991 Census,UNSD,,,
24136,Indonesia,360,2002,2003,Men,[25-29],25,29,65.6,Survey,2002-2003 DHS,1689,Indonesia 2002-2003 Demographic and Health Survey,DHS_HH,,,
52406,Turkey,792,2013,2014,Women,[50-54],50,54,97.6,Survey,2013 NDHS,5557,Turkey 2013 Demographic and Health Survey,DHS_HH,,,


In [557]:
df_12 = df_12.drop(columns = ['yearstart', 'yearend', 'datacatalogshortname', 'datacatalogid', 'datacataloglongname', 'including_consensual_unions', 'noteondata', 'noteoncountryandpopulation'])

df_12.rename(columns={
    "agestart": "age_start",
    "ageend": "age_end",
    "countryorarea": "country"
}, inplace=True)
df_12.sample(8)

Unnamed: 0,country,isocode,sex,agegroup,age_start,age_end,datavalue,dataprocess,datasource
26036,Israel,376,Men,[20-24],20,24,11.39,Estimate,UNSD
2373,Azerbaijan,31,Women,[65-69],65,69,97.92,Estimate,UNSD
1276,Australia,36,Men,[25-29],25,29,45.6,Estimate,UNSD
40104,Philippines,608,Women,[35-39],35,39,86.54,Census,IPUMS
55423,Zambia,894,Men,[70-74],70,74,99.0,Survey,DHS_HH
22363,Hungary,348,Women,[40-44],40,44,79.93,Estimate,UNSD
39638,Peru,604,Women,[20-24],20,24,55.49,Census,UNSD
41058,Qatar,634,Women,[15-19],15,19,3.57,Census,UNSD


In [558]:
df_12.dropna(inplace=True)

In [559]:
df_12.isnull().sum()

country        0
isocode        0
sex            0
agegroup       0
age_start      0
age_end        0
datavalue      0
dataprocess    0
datasource     0
dtype: int64

In [560]:
#df_12.to_csv("cleaned_ever_married_un.csv", index=False)

In [561]:
#df_12.to_sql('ever_married_un', engine, if_exists= 'replace', index= False)

In [562]:
df_13 = pd.read_csv('../data/processed/fertility_indicators_un.csv', header=6, low_memory=False)
df_13.head()

Unnamed: 0,Country or Area,Country or Area Code,Age Group,Indicator,Date,Value,Series,DataType,Data Source Type,Survey Programme,Data Source Inventory ID,Data Source Name,Data Source Name (short),Data Source Start Year,Data Source End Year,Reference,Reference Year
0,Afghanistan,4,[Total],TFR,1964.977051,7.966653,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
1,Afghanistan,4,[Total],TFR,1965.977051,8.212275,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
2,Afghanistan,4,[Total],TFR,1966.977051,8.317603,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
3,Afghanistan,4,[Total],TFR,1967.977051,8.225812,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
4,Afghanistan,4,[Total],TFR,1968.977051,8.068459,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012


In [563]:
df_13.columns = (df_13.columns
        .str.lower()
        .str.strip()
        .str.replace(' ', '')
        .str.replace('(', '')
        .str.replace(')', '')
        .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
        )

df_13.sample(6)

Unnamed: 0,countryorarea,countryorareacode,agegroup,indicator,date,value,series,datatype,datasourcetype,surveyprogramme,datasourceinventoryid,datasourcename,datasourcenameshort,datasourcestartyear,datasourceendyear,reference,referenceyear
65270,South Africa,710,[Total],TFR,2014.170044,2.746496,"2016 DHS,Birth Histories,FBH analysis 2018,579...",Birth histories,Survey,DHS,5797,South Africa 2016 Demographic and Health Surve...,2016 DHS,2016,2016.0,Fertility rates from full birth histories anal...,2018.0
5696,Bangladesh,50,[45-49],ASFR4549,1987.5,10.0,"SVRS,Computed rate from reported ASFR,Report",Computed rate from reported ASFR,SRS,SVRS,767,Bangladesh Sample Vital Registration System,SVRS,1980,,Bangladesh (1994). Statistical Yearbook of Ban...,1995.0
21473,Dominican Republic,214,[25-29],ASFR2529,1984.33374,201.0,"1996 DHS,Direct,DHS,1685-16-39167",Direct,Survey,DHS,1685,Dominican Republic 1996 Demographic and Health...,1996 DHS,1996,1996.0,DHS Statcompiler,2012.0
46244,Mauritania,478,[40-44],ASFR4044,1962.454064,74.72189,"1964-1965 PS,Feeney modified P/F Ratio method,...",P/F Ratio method (Feeney),Survey,Survey,3027,Mauritania 1964-1965 Population Survey,1964-1965 PS,1964,1965.0,United Nations Population Division,2009.0
72710,Turkmenistan,795,[35-39],ASFR3539,2005.5,43.23848,"Register, Direct, TransMonee database",Direct,Register,VR,571,Vital Registration,Register,2005,2005.0,TransMonee database,
28776,Germany,276,[25-29],ASFR2529,2017.5,85.53,Eurostat.20190531,Official estimates,Estimate,Estimate,2244,All sources of estimates,Estimates,2017,2017.0,"Eurostat Statistics, Fertility rates by age [d...",2019.0


In [564]:
df_13 = df_13.drop(columns=['countryorareacode','indicator','datasourceinventoryid','surveyprogramme','series','datasourcename','reference','referenceyear'])

df_13.replace({
    "agegroup": "age_group",
    "countryorarea": "country",
    "datatype": "data_type",
})

Unnamed: 0,countryorarea,agegroup,date,value,datatype,datasourcetype,datasourcenameshort,datasourcestartyear,datasourceendyear
0,Afghanistan,[Total],1964.977051,7.966653,Reverse survival method,Census,1979 Census,1979,1979
1,Afghanistan,[Total],1965.977051,8.212275,Reverse survival method,Census,1979 Census,1979,1979
2,Afghanistan,[Total],1966.977051,8.317603,Reverse survival method,Census,1979 Census,1979,1979
3,Afghanistan,[Total],1967.977051,8.225812,Reverse survival method,Census,1979 Census,1979,1979
4,Afghanistan,[Total],1968.977051,8.068459,Reverse survival method,Census,1979 Census,1979,1979
...,...,...,...,...,...,...,...,...,...
79769,Zimbabwe,[35-39],2017.135616,83.877220,Recent births,Survey,2017 ICDS,2017,2017
79770,Zimbabwe,[40-44],2017.135616,43.317440,Recent births,Survey,2017 ICDS,2017,2017
79771,Zimbabwe,[45-49],2017.135616,6.958529,Recent births,Survey,2017 ICDS,2017,2017
79772,Zimbabwe,[Total],2017.135616,28.388350,Recent births,Survey,2017 ICDS,2017,2017


In [572]:
df_13['date'] = df_13['date'].astype(int)
df_13['value'] = df_13['value'].round(2)
df_13.head(12)

Unnamed: 0,countryorarea,agegroup,date,value,datatype,datasourcetype,datasourcenameshort,datasourcestartyear,datasourceendyear
0,Afghanistan,[Total],1965,8.0,Reverse survival method,Census,1979 Census,1979,1979
1,Afghanistan,[Total],1966,8.0,Reverse survival method,Census,1979 Census,1979,1979
2,Afghanistan,[Total],1967,8.0,Reverse survival method,Census,1979 Census,1979,1979
3,Afghanistan,[Total],1968,8.0,Reverse survival method,Census,1979 Census,1979,1979
4,Afghanistan,[Total],1969,8.0,Reverse survival method,Census,1979 Census,1979,1979
5,Afghanistan,[Total],1970,8.0,Reverse survival method,Census,1979 Census,1979,1979
6,Afghanistan,[Total],1971,8.0,Reverse survival method,Census,1979 Census,1979,1979
7,Afghanistan,[Total],1972,8.0,Reverse survival method,Census,1979 Census,1979,1979
8,Afghanistan,[Total],1973,8.0,Reverse survival method,Census,1979 Census,1979,1979
9,Afghanistan,[15-19],1974,122.0,Recent births,Survey,1972-1974 NDFGS,1972,1974


In [None]:
#df_13.to_csv("cleaned_fertility_indicators.csv", index=False)

In [None]:
#df_13.to_sql('fertility_indicators_un',engine, if_exists='replace', index=False)

774

In [575]:
df_14 = pd.read_csv('../data/processed/marital_status_by_age_un.csv', header= 2, low_memory=False)
df_14.head(10)

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,MaritalStatus,Non-standard_AgeGroups,Series_contains_Non-standard_AgeGroups,AgeGroup,AgeStart,...,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Age groups,Note on Marital Status,Note on Data,Note on Country and Population,Note Other
0,Afghanistan,4,1972,1974,Men,Divorced,,,[15-19],15,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
1,Afghanistan,4,1972,1974,Men,Divorced,,,[20-24],20,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
2,Afghanistan,4,1972,1974,Men,Divorced,,,[25-29],25,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
3,Afghanistan,4,1972,1974,Men,Divorced,,,[30-34],30,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
4,Afghanistan,4,1972,1974,Men,Divorced,,,[35-39],35,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
5,Afghanistan,4,1972,1974,Men,Divorced,,,[40-44],40,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
6,Afghanistan,4,1972,1974,Men,Divorced,,,[45-49],45,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
7,Afghanistan,4,1972,1974,Men,Divorced,,,[50-54],50,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
8,Afghanistan,4,1972,1974,Men,Divorced,,,[55-59],55,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
9,Afghanistan,4,1972,1974,Men,Divorced,,,[60-64],60,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
