In [134]:
import pandas as pd
import os
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from sqlalchemy import create_engine, types
from sqlalchemy import text 

In [135]:
from dotenv import dotenv_values

config = dotenv_values()

# define variables for the login
pg_user = config['POSTGRES_USER']  # align the key label with your .env file !
pg_host = config['POSTGRES_HOST']
pg_port = config['POSTGRES_PORT']
pg_db = config['POSTGRES_DB']
pg_schema = config['POSTGRES_SCHEMA']
pg_pass = config['POSTGRES_PASS']

In [136]:
url = f'postgresql://{pg_user}:{pg_pass}@{pg_host}:{pg_port}/{pg_db}'
engine = create_engine(url, echo=False)

In [137]:
my_schema = 'team_5' # update it to your schema

with engine.begin() as conn: 
    result = conn.execute(text(f'SET search_path TO {my_schema};'))

In [138]:
df_1= pd.read_csv('../data/Raw/World_Marriage_Dataset.csv')

In [139]:
df_1.drop(columns=["Sr.No."], inplace=True)

In [140]:
df_1.columns = df_1.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [141]:
df_1.rename(columns={
    "agegroup": "age_group",
    "maritalstatus": "marital_status",
    "dataprocess": "data_process",
    "Data Collection (Start Year)": "data_collection_start_year",
    "Data Collection (End Year)": "data_collection_end_year",
    "Data Source": "data_source",
    "Country": "country",
    "Sex": "sex"
}, inplace=True)

In [142]:
df_1.drop_duplicates(inplace=True)

df_1['data_collection_start_year'] = df_1['data_collection_start_year'].astype(str).str.replace(',', '').astype(int)
df_1['data_collection_end_year'] = df_1['data_collection_end_year'].astype(str).str.replace(',', '').astype(int)

In [143]:
df_1.isnull().sum()

country                       0
age_group                     0
sex                           0
marital_status                0
data_process                  0
data_collection_start_year    0
data_collection_end_year      0
data_source                   0
dtype: int64

In [144]:
#df_1.to_csv("cleaned_world_marriage.csv", index=False)

In [145]:
#df_1.to_sql('world_marriage', engine, if_exists='replace', index=False)

In [146]:
df_2 = pd.read_csv('../data/Raw/age-at-marriage-women.csv')

In [147]:
df_2.columns = df_2.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [148]:
df_2 = df_2.drop(columns=['1005564annotations'])

df_2.rename(columns={
    "entity": "country",   
}, inplace=True)

In [149]:
df_2.drop_duplicates(inplace=True)


In [150]:
df_2['year'] = df_2['year'].astype(str).str.replace(',', '').astype(int)

In [151]:
df_2.isnull().sum()

country                                0
code                                   0
year                                   0
mean_age_of_women_at_first_marriage    0
dtype: int64

In [152]:
#df_2.to_csv("cleaned_age_at_marriage_women.csv", index=False)

In [153]:
#df_2.to_sql('age_at_marriage_women', engine, if_exists='replace', index=False)

In [154]:
df_3= pd.read_csv('../data/Raw/marriage-rate-per-1000-inhabitants.csv')

In [155]:
df_3.columns = df_3.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [156]:
df_3.rename(columns={
    "entity": "country",   
}, inplace=True)

In [157]:
df_3['year'] = df_3['year'].astype(str).str.replace(',', '').astype(int)

In [158]:
df_3.drop_duplicates(inplace=True)


In [159]:
df_3.isnull().sum()

country                                          0
code                                             0
year                                             0
crude_marriage_rate_marriages_per_1000_people    0
dtype: int64

In [160]:
#df_3.to_csv("cleaned_marriage-rate-per-1000-inhabitants.csv", index=False)

In [161]:
#df_3.to_sql('married_rate_per_1000', engine, if_exists='replace', index=False)

In [162]:
df_4= pd.read_csv('../data/Raw/marriage-rates-in-1990-vs-2020.csv')

In [163]:
df_4.columns = df_4.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [164]:
df_4 = df_4.drop(columns=['worldregionsaccordingtoowid'])

df_4.rename(columns={
    "crudemarriageratemarriagesper1000people": "crude_marriage_rate",
    "crudemarriageratemarriagesper1000people1": "crude_marriage_rate_people1",
    "year1": "year_1",
    "entity": "country"
}, inplace=True)

In [165]:
df_4.drop_duplicates(inplace=True)
df_4.dropna(inplace=True)

In [166]:
df_4['year_1'] = pd.to_numeric(df_4['year_1'], errors='coerce').astype('Int64')

In [167]:
df_4.isnull().sum()

country                        0
code                           0
year                           0
crude_marriage_rate            0
crude_marriage_rate_people1    0
year_1                         0
dtype: int64

In [168]:
#df_4.to_csv("cleaned_marriage-rates-in-1990-vs-2020.csv", index=False)

In [169]:
#df_4.to_sql('marriage_rates_in_1990_vs_2020', engine, if_exists='replace', index=False)

In [170]:
df_5 = pd.read_csv('../data/Raw/share-of-births-outside-marriage.csv')

In [171]:
df_5.columns = df_5.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [172]:

df_5.rename(columns={
    "shareofbirthsoutsideofmarriageofallbirths": "share_of_births_outside_of_marriage",
    "entity": "country"
}, inplace=True)

df_5.drop_duplicates(inplace=True)

In [173]:
df_5.isnull().sum()

country                                0
code                                   0
year                                   0
share_of_births_outside_of_marriage    0
dtype: int64

In [174]:
#df_5.to_csv("cleaned_share-of-births-outside-marriage.csv", index=False)

In [175]:
#df_5.to_sql('share_of_births_outside_marriage', engine, if_exists='replace', index=False)

In [176]:
df_6 = pd.read_csv('../data/Raw/share-of-men-in-england-and-wales-who-have-ever-married-by-age.csv')

In [177]:
df_6.columns = df_6.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

df_6.drop_duplicates(inplace=True)
df_6.sample(5)

Unnamed: 0,entity,code,year,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort
55,Women,,38,81.2,89.8,94.6,84.5,70.5,59.3,,
35,Women,,18,0.4,1.6,4.6,4.6,1.3,0.4,0.1,0.0
43,Women,,26,55.1,68.3,84.2,65.2,40.1,21.2,12.9,
27,Men,,44,91.5,91.1,91.8,80.0,67.5,,,
62,Women,,45,84.2,91.4,95.4,86.7,74.5,,,


In [178]:
df_6 = df_6.drop(columns=['code','proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort'])

df_6.rename(columns={
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort": "1900_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort": "1920_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort": "1940_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort": "1960_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort": "1970_birthcohort",
    "entity": "sex"
}, inplace=True)

In [179]:
df_6.isnull().sum()

sex                 0
year                0
1900_birthcohort    0
1920_birthcohort    0
1940_birthcohort    0
1960_birthcohort    0
1970_birthcohort    0
dtype: int64

In [180]:
#df_6.to_csv("cleaned_share-of-men-in-england-and-wales-who-have-ever-married-by-age.csv", index=False)

In [181]:
#df_6.to_sql('men_in_england_and_wales_married_by_age', engine, if_exists='replace', index=False)

In [182]:
df_7 = pd.read_csv('../data/Raw/share-of-births-outside-marriage.csv')

In [183]:
df_7.columns = df_7.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [184]:
df_7.rename(columns={
    "shareofsingleparenthouseholds": "share_of_single_parent_households",
    "entity": "country"
}, inplace=True)

df_7.drop_duplicates(inplace=True)
df_7.sample(5)

Unnamed: 0,country,code,year,shareofbirthsoutsideofmarriageofallbirths
2012,Turkey,TUR,2016,2.9
295,Costa Rica,CRI,1992,40.6
1194,Lithuania,LTU,2014,29.0
750,Greece,GRC,1968,1.1
2118,United States,USA,2000,33.2


In [185]:
df_7.isnull().sum()

country                                      0
code                                         0
year                                         0
shareofbirthsoutsideofmarriageofallbirths    0
dtype: int64

In [186]:
#df_7.to_csv("cleaned_share-of-single-parent-households.csv", index=False)

In [187]:
#df_7.to_sql('single_parent_households', engine, if_exists='replace', index=False)

In [188]:
df_8 = pd.read_csv('../data/Raw/share-of-women-in-england-and-wales-who-have-ever-married-by-age.csv')

In [189]:
df_8.columns = df_8.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [190]:
df_8['code'] = df_8['code'].fillna('GBR')
df_8.sample(5)

Unnamed: 0,entity,code,year,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort
48,Women,GBR,31,73.9,84.5,91.7,78.4,58.7,42.0,30.2,
46,Women,GBR,29,68.7,80.4,89.8,74.5,52.4,33.6,24.8,
10,Men,GBR,27,55.8,59.2,73.5,52.2,28.7,14.4,9.7,
28,Men,GBR,45,91.8,91.3,92.0,80.5,68.2,,,
22,Men,GBR,39,88.8,89.5,90.8,77.4,62.6,52.5,,


In [191]:
df_8 = df_8.drop(columns=['code','proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort'])

df_8.rename(columns={
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort": "1900_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort": "1920_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort": "1940_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort": "1960_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort": "1970_birthcohort",
    "entity": "sex"
}, inplace=True)

df_8.drop_duplicates(inplace=True)
df_8.sample(5)

Unnamed: 0,sex,year,1900_birthcohort,1920_birthcohort,1940_birthcohort,1960_birthcohort,1970_birthcohort
40,Women,23,32.7,49.5,68.2,48.4,24.0
16,Men,33,81.5,84.1,87.6,70.8,51.1
47,Women,30,71.6,82.7,90.9,76.7,55.8
11,Men,28,62.7,66.3,77.7,56.8,33.1
62,Women,45,84.2,91.4,95.4,86.7,74.5


In [192]:
df_8.isnull().sum()

sex                 0
year                0
1900_birthcohort    0
1920_birthcohort    0
1940_birthcohort    0
1960_birthcohort    0
1970_birthcohort    0
dtype: int64

In [193]:
#df_8.to_csv("cleaned_share-of-women-in-england-and-wales-who-have-ever-married-by-age.csv", index=False)

In [194]:
#df_8.to_sql('women_in_england_and_wales_married_by_age', engine, if_exists='replace', index=False)

In [195]:
#!pip install openpyxl

In [196]:
df_excel_1 = pd.read_excel('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx')

In [197]:
#all_sheets = pd.read_excel('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx', sheet_name=None)

In [198]:
xls_1 = pd.ExcelFile('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx')
print(xls_1.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'MARITAL_STATUS_BY_AGE', 'CURRENTLY MARRIED', 'EVER_MARRIED', 'SMAM']


In [199]:
excel_1 = '../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx'

# Output directory (make sure it exists)
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)

# List of sheets you want to extract
sheets_to_extract = ['MARITAL_STATUS_BY_AGE', 'CURRENTLY MARRIED', 'EVER_MARRIED', 'SMAM']

In [200]:
"""for sheet in sheets_to_extract:
    # Read just this sheet into a DataFrame
    df_excel_1 = pd.read_excel(excel_1, sheet_name=sheet)
    
    # Optional: Clean the filename (replace spaces with underscores, etc.)
    csv_name = sheet.replace(' ', '_').lower() + '.csv'
    csv_path = os.path.join(output_dir, csv_name)
    
    # Save the DataFrame as CSV
    df_excel_1.to_csv(csv_path, index=False)
    print(f"Saved: {csv_path}")
"""

'for sheet in sheets_to_extract:\n    # Read just this sheet into a DataFrame\n    df_excel_1 = pd.read_excel(excel_1, sheet_name=sheet)\n    \n    # Optional: Clean the filename (replace spaces with underscores, etc.)\n    csv_name = sheet.replace(\' \', \'_\').lower() + \'.csv\'\n    csv_path = os.path.join(output_dir, csv_name)\n    \n    # Save the DataFrame as CSV\n    df_excel_1.to_csv(csv_path, index=False)\n    print(f"Saved: {csv_path}")\n'

In [201]:
xls_2 = pd.ExcelFile('../data/Raw/undesa_pd_2019_world_fertility_dataset.xlsx')
print(xls_2.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'FERTILITY INDICATORS']


In [202]:
excel_2 = '../data/Raw/undesa_pd_2019_world_fertility_dataset.xlsx'
sheet_name = 'FERTILITY INDICATORS'
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)

df_excel_2 = pd.read_excel(excel_2, sheet_name=sheet_name)


In [203]:
"""csv_name = sheet_name.replace(' ', '_').lower() + '.csv'
csv_path = os.path.join(output_dir, csv_name)
df_excel_2.to_csv(csv_path, index=False)
print(f"Saved: {csv_path}")
"""

'csv_name = sheet_name.replace(\' \', \'_\').lower() + \'.csv\'\ncsv_path = os.path.join(output_dir, csv_name)\ndf_excel_2.to_csv(csv_path, index=False)\nprint(f"Saved: {csv_path}")\n'

In [204]:
xls_3 = pd.ExcelFile('../data/Raw/undesa_pd_ds_1970-2030_fp_rev-2024_rev.xlsx')
print(xls_3.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'Countries', 'Regions']


In [205]:
excel_3 = '../data/Raw/undesa_pd_ds_1970-2030_fp_rev-2024_rev.xlsx'
sheets_to_extract = ['Countries', 'Regions']
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)


In [206]:
"""
for sheet in sheets_to_extract:
    df = pd.read_excel(excel_3, sheet_name=sheet)
    csv_name = sheet.replace(' ', '_').lower() + '.csv'
    csv_path = os.path.join(output_dir, csv_name)
    df.to_csv(csv_path, index=False)
    print(f"Saved: {csv_path}")

"""

'\nfor sheet in sheets_to_extract:\n    df = pd.read_excel(excel_3, sheet_name=sheet)\n    csv_name = sheet.replace(\' \', \'_\').lower() + \'.csv\'\n    csv_path = os.path.join(output_dir, csv_name)\n    df.to_csv(csv_path, index=False)\n    print(f"Saved: {csv_path}")\n\n'

In [207]:
df_9 = pd.read_csv('../data/Raw/unpopulation_dataportal_20250728095844.csv')
df_9.sample(5)

Unnamed: 0,IndicatorId,IndicatorName,IndicatorShortName,Source,SourceYear,Author,LocationId,Location,Iso2,Iso3,...,AgeStart,AgeEnd,Age,CategoryId,Category,EstimateTypeId,EstimateType,EstimateMethodId,EstimateMethod,Value
5300,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,188,Costa Rica,CR,CRI,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,52.62
21041,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,724,Spain,ES,ESP,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,53.31
16988,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,585,Palau,PW,PLW,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,52.78
23234,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,798,Tuvalu,TV,TUV,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,71.35
23133,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,796,Turks and Caicos Islands,TC,TCA,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,58.5


In [208]:
df_9.columns = df_9.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_9.sample(5)

Unnamed: 0,indicatorid,indicatorname,indicatorshortname,source,sourceyear,author,locationid,location,iso2,iso3,...,agestart,ageend,age,categoryid,category,estimatetypeid,estimatetype,estimatemethodid,estimatemethod,value
9571,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,332,Haiti,HT,HTI,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,59.3
17213,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,591,Panama,PA,PAN,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,56.47
15857,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,540,New Caledonia,NC,NCL,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,40.06
24262,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,854,Burkina Faso,BF,BFA,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,77.37
24065,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,840,United States of America,US,USA,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,52.86


In [209]:
df_9 = df_9.drop(columns=['indicatorid','indicatorshortname',
    'source',
    'author', 'locationid', 'iso2','estimatetypeid','category','categoryid','agestart','ageend','author','ageid', 'estimatetype','variantid','sexid','timeid'])

df_9.rename(columns={
    "sourceyear": "year",
    "location": "country",
    "estimatemethodid": "estimate_method",
    "iso3": "code",
}, inplace=True)



In [210]:
df_9.drop_duplicates(inplace=True)

In [211]:
df_9

Unnamed: 0,indicatorname,year,country,code,time,variant,sex,age,estimate_method,estimatemethod,value
0,Currently married (Percent),2024,Afghanistan,AFG,1970,Median,Female,15-49,2,Interpolation,80.94
2,Currently married (Percent),2024,Afghanistan,AFG,1971,Median,Female,15-49,2,Interpolation,80.90
4,Currently married (Percent),2024,Afghanistan,AFG,1972,Median,Female,15-49,2,Interpolation,80.87
6,Currently married (Percent),2024,Afghanistan,AFG,1973,Median,Female,15-49,2,Interpolation,80.84
8,Currently married (Percent),2024,Afghanistan,AFG,1974,Median,Female,15-49,2,Interpolation,80.53
...,...,...,...,...,...,...,...,...,...,...,...
25078,Currently married (Percent),2024,Zambia,ZMB,2021,Median,Female,15-49,3,Projection,54.31
25080,Currently married (Percent),2024,Zambia,ZMB,2022,Median,Female,15-49,3,Projection,53.82
25082,Currently married (Percent),2024,Zambia,ZMB,2023,Median,Female,15-49,3,Projection,53.35
25084,Currently married (Percent),2024,Zambia,ZMB,2024,Median,Female,15-49,3,Projection,52.91


In [212]:
df_9.isnull().sum()

indicatorname      0
year               0
country            0
code               0
time               0
variant            0
sex                0
age                0
estimate_method    0
estimatemethod     0
value              0
dtype: int64

In [213]:
#df_9.to_csv("cleaned_unpopulation_dataportal.csv", index=False)

In [214]:
#df_9.to_sql('unpopulation_dataportal', engine, if_exists='replace', index=False)

In [215]:
df_10 = pd.read_csv('../data/processed/countries_un.csv',  header=5, low_memory=False)

In [216]:
df_10.columns = (
    df_10.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
)
df_10.sample(10)

Unnamed: 0,countryorarea,isocode,indicator,year,agegroup,percentage,number,dataprocess
65719,Kazakhstan,398,Married or in-union women,2003,15-49,62.730178,2845.879343,Estimate
118353,Slovakia,703,Married or in-union women,2022,20-24,17.506245,23.47955,Estimate
26276,"China, Taiwan Province of China",158,Married or in-union women,2014,35-39,72.893693,712.526011,Estimate
58947,Iceland,352,Married or in-union women,2048,30-34,63.982751,7.650417,Projection
93597,New Zealand,554,Married or in-union women,2005,40-44,76.83442,128.091046,Estimate
18144,Burundi,108,Married or in-union women,1970,15-19,18.64,36.765256,Estimate
36809,Ecuador,218,Married or in-union women,2035,20-24,24.490336,186.077454,Projection
97200,Micronesia (Fed. States of),583,Married or in-union women,1970,15-19,8.9,0.278614,Estimate
55719,Guyana,328,Married or in-union women,2049,15-49,56.734667,132.627213,Projection
80630,Malta,470,Married or in-union women,2004,45-49,65.233334,9.941886,Estimate


In [217]:
df_10.rename(columns={
    "dataprocess": "data_process",
}, inplace=True)

df_10.drop_duplicates(inplace=True)
df_10.sample(5)

Unnamed: 0,countryorarea,isocode,indicator,year,agegroup,percentage,number,data_process
80487,Malta,470,Married or in-union women,1986,15-49,54.253541,49.883689,Estimate
12315,Bosnia and Herzegovina,70,Married or in-union women,1970,30-34,82.08,127.502251,Estimate
10820,Bermuda,60,Married or in-union women,2026,35-39,63.628443,1.19367,Projection
13734,Brazil,76,Married or in-union women,1985,45-49,75.915,2057.362926,Estimate
36793,Ecuador,218,Married or in-union women,2033,20-24,24.840286,194.312892,Projection


In [218]:
for col in ['percentage', 'number']:
    if col in df_10.columns:
        df_10[col] = (
            df_10[col]
            .astype(str)
            .str.replace(',', '.', regex=False)
            .str.extract(r'([-+]?[0-9]*\.?[0-9]+)', expand=False)
            .astype(float)
            .round(2)
        )

In [219]:
unnamed_cols = [col for col in df_10.columns if 'unnamed' in col.lower()]
df_10.drop(columns=unnamed_cols, inplace=True)

In [220]:
df_10.dropna(inplace=True)

In [221]:
df_10.isnull().sum()

countryorarea    0
isocode          0
indicator        0
year             0
agegroup         0
percentage       0
number           0
data_process     0
dtype: int64

In [222]:
df_10.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145800 entries, 0 to 145799
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   countryorarea  145800 non-null  object 
 1   isocode        145800 non-null  int64  
 2   indicator      145800 non-null  object 
 3   year           145800 non-null  int64  
 4   agegroup       145800 non-null  object 
 5   percentage     145800 non-null  float64
 6   number         145800 non-null  float64
 7   data_process   145800 non-null  object 
dtypes: float64(2), int64(2), object(4)
memory usage: 8.9+ MB


In [223]:
#df_10.to_csv("cleaned_countries_1970_2025_un.csv", index=False)

In [224]:
#df_10.to_sql('countries_1970_2025_un', engine, if_exists='replace', index=False)

In [225]:
df_11 = pd.read_csv('../data/processed/currently_married_un.csv',  header=2, low_memory=False)

In [226]:
df_11.sample(8)

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,AgeGroup,AgeStart,AgeEnd,DataValue,DataProcess,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Data,Note on Country and Population
16475,French Guiana,254,2006,2006,Men,[60-64],60,64,49.16,Census,2006 Census,2362,French Guiana 2006 Census,UNSD,,,
1504,Australia,36,2006,2006,Women,[15-19],15,19,0.37,Estimate,2006 Estimate,2037,Australia 2006 Estimate,UNSD,,,
37248,Panama,591,1980,1980,Women,[25-29],25,29,70.29,Census,1980 Census,291,Panama 1980 Census,UNSD,1.0,,Excluding the former Canal Zone and tribal Ind...
2882,Barbados,52,1970,1970,Men,[25-29],25,29,22.49,Census,1970 Census,95,Barbados 1970 Census,US Census Bureau,,,
16572,French Polynesia,258,2012,2012,Men,[30-34],30,34,22.55,Census,2012 Census,5688,French Polynesia 2012 Census,National statistics,,,
45751,State of Palestine,275,2007,2007,Men,[35-39],35,39,96.05,Census,2007 Census,2364,Occupied Palestinian Territory 2007 Census,UNSD,,Data have not been adjusted for underenumeration.,
34374,Netherlands Antilles,530,1981,1981,Men,[20-24],20,24,8.15,Census,1981 Census,1348,Netherlands Antilles 1981 Census,US Census Bureau,,,
11744,Denmark,208,2007,2007,Men,[35-39],35,39,54.05,Estimate,2007 Estimate,2081,Denmark 2007 Estimate,UNSD,,Based on data compiled from registers.,Excluding Faeroe Islands and Greenland shown s...


In [227]:
df_11.columns = (
    df_11.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
    )
df_11.sample(8)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,agegroup,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteondata,noteoncountryandpopulation
37868,Peru,604,2010,2010,Women,[30-34],30,34,73.9,Survey,2010 DHS,2516,Peru 2010 Demographic and Health Survey (Conti...,DHS_STATcompiler,1.0,,
36280,Norway,578,2001,2001,Men,[45-49],45,49,75.13,Census,2001 Census,1248,Norway 2001 Census,UNSD,1.0,Based on data compiled from registers.,Including residents temporarily outside the co...
25613,Jordan,400,1979,1979,Women,[25-29],25,29,84.74,Census,1979 Census,964,Jordan 1979 Census,UNSD,,,Excluding data for Jordanian territory under o...
9887,Côte d'Ivoire,384,2000,2000,Women,[70-74],70,74,25.48,Survey,2000 MICS_HH,4430,Côte d'Ivoire 2000 Multiple Indicator Cluster ...,MICS_HH,1.0,,
42694,Senegal,686,2010,2011,Men,[35-39],35,39,80.9,Survey,2010-2011 DHS-MICS,5274,Senegal 2010-2011 Enquête Démographique et de...,DHS_HH,,,
21549,Iceland,352,1987,1987,Men,[45-49],45,49,75.46,Estimate,1987 Estimate,2121,Iceland 1987 Estimate,UNSD,,,
19389,Haiti,332,1971,1971,Women,[40-44],40,44,76.79,Census,1971 Census,1656,Haiti 1971 Census,UNSD,1.0,,
49213,Togo,768,2013,2014,Men,[40-44],40,44,90.3,Survey,2013-2014 DHS-MICS,5071,Togo 2013-2014 Demographic and Health Survey a...,DHS_HH,,,


In [228]:
df_11 = df_11.drop(columns = ['datacataloglongname', 'datacatalogid', 'yearstart' , 'yearend', 'noteondata', 'noteoncountryandpopulation', 'including_consensual_unions'])

df_11.rename(columns={
    "agestart": "age_start",
    "countryorarea": "country",
    "datasource": "data_source",
    "datavalue" : "data_value"
}, inplace=True)

df_11.sample(10)

Unnamed: 0,country,isocode,sex,agegroup,age_start,ageend,data_value,dataprocess,datacatalogshortname,data_source
26422,Kiribati,296,Men,[60-64],60,64,80.2,Census,1995 Census,National statistics
35904,Norway,578,Women,[45-49],45,49,79.92,Estimate,1986 Estimate,UNSD
2222,Azerbaijan,31,Women,[10-14],10,14,0.0,Estimate,2013 Estimate,UNSD
18075,Greenland,304,Men,[10-14],10,14,0.0,Estimate,1990 Estimate,UNSD
49754,Trinidad and Tobago,780,Women,[30-34],30,34,47.32,Census,2011 Census,UNSD
35903,Norway,578,Women,[40-44],40,44,79.92,Estimate,1986 Estimate,UNSD
48721,Thailand,764,Women,[45-49],45,49,80.24,Census,1970 Census,UNSD
37940,Peru,604,Women,[35-39],35,39,75.7,Survey,2012 DHS,DHS_HH
9568,Costa Rica,188,Women,[35-39],35,39,65.48,Estimate,2013 Estimate,UNSD
13878,Faeroe Islands,234,Women,[20-24],20,24,18.6,Estimate,1992 Estimate,UNSD


In [229]:
df_11.drop_duplicates(inplace=True)

In [230]:
df_11.isnull().sum()

country                 0
isocode                 0
sex                     0
agegroup                0
age_start               0
ageend                  0
data_value              0
dataprocess             0
datacatalogshortname    0
data_source             0
dtype: int64

In [231]:
#df_11.to_csv("cleaned_currently_married_un.csv", index=False)

In [232]:
#df_11.to_sql('currently_married_un', engine, if_exists='replace', index=False)

In [233]:
df_12 = pd.read_csv('../data/processed/ever_married_un.csv', header= 2, low_memory = False)
df_12.head()

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,AgeGroup,AgeStart,AgeEnd,DataValue,DataProcess,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Data,Note on Country and Population
0,Afghanistan,4,1972,1974,Men,[15-19],15,19,7.7,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
1,Afghanistan,4,1972,1974,Men,[20-24],20,24,32.6,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
2,Afghanistan,4,1972,1974,Men,[25-29],25,29,61.4,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
3,Afghanistan,4,1972,1974,Men,[30-34],30,34,83.0,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
4,Afghanistan,4,1972,1974,Men,[35-39],35,39,91.2,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,


In [234]:
df_12.columns = (
    df_12.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
    )
df_12.sample(8)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,agegroup,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteondata,noteoncountryandpopulation
47390,Spain,724,1978,1978,Men,[65-69],65,69,92.7,Estimate,1978 Estimate,2222,Spain 1978 Estimate,UNSD,,,
51838,Tunisia,788,1980,1980,Men,[75+],75,999,95.13,Survey,1980,5183,Tunisia 1980 Enquête Population-Emploi,INED,,,
10348,Côte d'Ivoire,384,2000,2000,Women,[15-19],15,19,22.44,Survey,2000 MICS,4430,Côte d'Ivoire 2000 Multiple Indicator Cluster ...,MICS,,,
26017,Israel,376,2011,2011,Men,[65-69],65,69,97.24,Estimate,2011 Estimate,2127,Israel 2011 Estimate,UNSD,,,Including data for East Jerusalem and Israeli ...
46816,Solomon Islands,90,1986,1986,Men,[50-54],50,54,92.86,Census,1986 Census,971,Solomon Islands 1986 Census,US Census Bureau,,,
45316,Sierra Leone,694,2004,2004,Men,[40-44],40,44,91.61,Census,2004 Census,1163,Sierra Leone 2004 Census,UNSD,1.0,,
54914,Wallis and Futuna Islands,876,2013,2013,Men,[15-19],15,19,0.33,Census,2013 Census,6047,Wallis and Futuna 2013 Census,National statistics,,,
32514,Martinique,474,1982,1982,Men,[20-24],20,24,2.17,Census,1982 Census,1517,Martinique 1982 Census,UNSD,,,


In [235]:
df_12 = df_12.drop(columns = ['yearstart', 'yearend', 'datacatalogshortname', 'datacatalogid', 'datacataloglongname', 'including_consensual_unions', 'noteondata', 'noteoncountryandpopulation'])

df_12.rename(columns={
    "agestart": "age_start",
    "ageend": "age_end",
    "countryorarea": "country"
}, inplace=True)
df_12.sample(8)

Unnamed: 0,country,isocode,sex,agegroup,age_start,age_end,datavalue,dataprocess,datasource
19646,Greenland,304,Women,[35-39],35,39,59.38,Estimate,UNSD
40942,Puerto Rico,630,Women,[25-29],25,29,83.87,Survey,National statistics
7511,Cayman Islands,136,Women,[55-59],55,59,87.92,Census,UNSD
31832,Maldives,462,Women,[60-64],60,64,99.53,Census,UNSD
10265,Côte d'Ivoire,384,Men,[30-34],30,34,73.29,Census,UNSD
50486,Syrian Arab Republic,760,Women,[50-54],50,54,98.55,Survey,National statistics
23159,Iceland,352,Men,[40-44],40,44,84.27,Estimate,UNSD
27163,Jordan,400,Women,[25-29],25,29,87.5,Survey,National statistics


In [236]:
df_12.dropna(inplace=True)

In [237]:
df_12.isnull().sum()

country        0
isocode        0
sex            0
agegroup       0
age_start      0
age_end        0
datavalue      0
dataprocess    0
datasource     0
dtype: int64

In [238]:
#df_12.to_csv("cleaned_ever_married_un.csv", index=False)

In [239]:
#df_12.to_sql('ever_married_un', engine, if_exists= 'replace', index= False)

In [240]:
df_13 = pd.read_csv('../data/processed/fertility_indicators_un.csv', header=6, low_memory=False)
df_13.head()

Unnamed: 0,Country or Area,Country or Area Code,Age Group,Indicator,Date,Value,Series,DataType,Data Source Type,Survey Programme,Data Source Inventory ID,Data Source Name,Data Source Name (short),Data Source Start Year,Data Source End Year,Reference,Reference Year
0,Afghanistan,4,[Total],TFR,1964.977051,7.966653,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
1,Afghanistan,4,[Total],TFR,1965.977051,8.212275,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
2,Afghanistan,4,[Total],TFR,1966.977051,8.317603,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
3,Afghanistan,4,[Total],TFR,1967.977051,8.225812,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012
4,Afghanistan,4,[Total],TFR,1968.977051,8.068459,"1979 Census,Reverse survival methods,Computed",Reverse survival method,Census,Census,280,Afghanistan 1979 Census,1979 Census,1979,1979,United Nations Population Division,2012


In [241]:
df_13.columns = (df_13.columns
        .str.lower()
        .str.strip()
        .str.replace(' ', '')
        .str.replace('(', '')
        .str.replace(')', '')
        .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
        )

df_13.sample(6)

Unnamed: 0,countryorarea,countryorareacode,agegroup,indicator,date,value,series,datatype,datasourcetype,surveyprogramme,datasourceinventoryid,datasourcename,datasourcenameshort,datasourcestartyear,datasourceendyear,reference,referenceyear
22743,Egypt,818,[20-24],ASFR2024,1997.872192,204.0,"2005 DHS,Direct,DHS,1806-16-39167",Direct,Survey,DHS,1806,Egypt 2005 Demographic and Health Survey,2005 DHS,2005,2005,DHS Statcompiler,2012
21474,Dominican Republic,214,[30-34],ASFR3034,1984.33374,150.0,"1996 DHS,Direct,DHS,1685-16-39167",Direct,Survey,DHS,1685,Dominican Republic 1996 Demographic and Health...,1996 DHS,1996,1996,DHS Statcompiler,2012
74108,United Kingdom,826,[25-29],ASFR2529,2002.5,90.758,"Estimates,Fertility data (Adjusted),HFC-ODE,22...",Fertility data (adjusted),Estimate,Estimate,2246,All sources of estimates,Estimates,2002,2002,European Demographic Observatory (ODE). Data c...,2011
30921,Guinea,324,[15-19],ASFR1519,2003.0,147.4686,"2016 MICS,Birth Histories,FBH analysis 2018,56...",Birth histories,Survey,MICS,5676,Guinea 2016 Multiple Indicator Cluster Survey,2016 MICS,2016,2016,Fertility rates from full birth histories anal...,2018
16006,"China, Taiwan Province of China",158,[Total],TFR,2003.5,1.235005,NSO.2015,Direct,Register,VR,1,Vital Registration,Register,2003,2003,"Dept. of Household Registration Affairs, MOI. ...",2015
76216,Uzbekistan,860,[45-49],ASFR4549,1992.5,2.643995,Transmonee.2014,Direct,Register,VR,555,Vital Registration,Register,1992,1992,TransMonee database,2014


In [242]:
df_13 = df_13.drop(columns=['countryorareacode','indicator','datasourceinventoryid','surveyprogramme','series','datasourcename','reference','referenceyear'])

df_13.replace({
    "agegroup": "age_group",
    "countryorarea": "country",
    "datatype": "data_type",
},inplace=True)

In [243]:
df_13['date'] = df_13['date'].astype(int)
df_13['value'] = df_13['value'].round(2)
df_13.sample(12)

Unnamed: 0,countryorarea,agegroup,date,value,datatype,datasourcetype,datasourcenameshort,datasourcestartyear,datasourceendyear
6751,Belarus,[20-24],1983,174.6,Fertility data (adjusted),Estimate,Estimates,1983,1983
72381,Turkmenistan,[15-19],1983,21.4,Direct,Register,Register,1983,1983
52310,New Zealand,[35-39],2010,70.67,Direct,Register,Register,2010,2010
29067,Ghana,[40-44],1981,155.65,Extrapolated from Truncated Birth Histories,Survey,1993 DHS,1993,1994
13618,China,[15-19],1950,90.08,Birth histories,Survey,1982 One-per-Thousand FS,1982,1982
57479,Puerto Rico,[35-39],1983,34.37,Computed rate from DYB,Register,Register,1983,1983
31622,Guyana,[20-24],2013,147.97,Birth histories,Survey,2014 MICS,2014,2014
23844,Estonia,[45-49],2009,0.33,Official estimates,Estimate,Estimates,2009,2009
28792,Germany,[20-24],2018,32.6,Direct,Register,Register,2018,2018
61527,Senegal,[15-19],1990,135.0,Direct,Survey,1992-1993 DHS,1992,1993


In [244]:
#df_13.to_csv("cleaned_fertility_indicators.csv", index=False)

In [245]:
#df_13.to_sql('fertility_indicators_un',engine, if_exists='replace', index=False)

In [246]:
df_14 = pd.read_csv('../data/processed/marital_status_by_age_un.csv', header= 2, low_memory=False)
df_14.head()

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,MaritalStatus,Non-standard_AgeGroups,Series_contains_Non-standard_AgeGroups,AgeGroup,AgeStart,...,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Age groups,Note on Marital Status,Note on Data,Note on Country and Population,Note Other
0,Afghanistan,4,1972,1974,Men,Divorced,,,[15-19],15,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
1,Afghanistan,4,1972,1974,Men,Divorced,,,[20-24],20,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
2,Afghanistan,4,1972,1974,Men,Divorced,,,[25-29],25,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
3,Afghanistan,4,1972,1974,Men,Divorced,,,[30-34],30,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,
4,Afghanistan,4,1972,1974,Men,Divorced,,,[35-39],35,...,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,,,,


In [247]:
df_14.columns= (df_14.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '' , regex=True)  
    )
df_14.sample(5)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,maritalstatus,nonstandard_agegroups,series_contains_nonstandard_agegroups,agegroup,agestart,...,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteonagegroups,noteonmaritalstatus,noteondata,noteoncountryandpopulation,noteother
87555,Georgia,268,2006,2006,Men,Consensual union,,1.0,[35-39],35,...,2006 GGS,5433,Georgia 2006 Generations and Gender Survey,GGS,,,,,,
64933,Dominica,212,1970,1970,Women,Divorced or Separated,,,[45-49],45,...,1970 Census,2342,Dominica 1970 Census,US Census Bureau,,,,,,
50125,Costa Rica,188,2007,2007,Men,Single,,,[20-24],20,...,2007 Estimate,2075,Costa Rica 2007 Estimate,UNSD,,,,,,
137669,Lao People's Dem. Republic,418,2017,2017,Men,Divorced,,,[20-24],20,...,2017 DHS-MICS Special,7348,Lao People's Democratic Republic 2017 Social I...,MICS,,,,,,
61979,Denmark,208,1994,1994,Women,Consensual union,,,[45-49],45,...,1994 Estimate,2081,Denmark 1994 Estimate,UNSD,,,,,Excluding Faeroe Islands and Greenland shown s...,


In [248]:
df_14 = df_14.drop(columns=['datacataloglongname', 'noteondata', 'noteoncountryandpopulation','noteonagegroups', 'noteother',
                             'including_consensual_unions','isocode', 'datacatalogid', 'noteonmaritalstatus', 'series_contains_nonstandard_agegroups','nonstandard_agegroups'])

df_14.rename(columns={
    "countryorarea": "country",
    "agegroup": "age_group",
    "maritalstatus": "marital_status",
    "yearstart": "year_start",
    "yearend": "year_end",
    }, inplace =True
    )

df_14.sample(10)

Unnamed: 0,country,year_start,year_end,sex,marital_status,age_group,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datasource
260162,United States of America,1989,1989,Men,Single,[65-74],65,74,4.9,Estimate,1989 Estimate,UNSD
87852,Germany,1987,1987,Men,Married,[70-74],70,74,81.36,Census,1987 Census,IPUMS
139898,Latvia,2016,2016,Women,Single,[30-34],30,34,39.97,Estimate,2016 Estimate,UNSD
248393,Tunisia,1980,1980,Women,Divorced,[15-19],15,19,0.08,Survey,1980,INED
91545,Ghana,2014,2014,Men,Living together,[30-34],30,34,20.5,Survey,2014 DHS,DHS_STATcompiler
89932,Ghana,1960,1960,Men,Widowed,[65+],65,999,11.64,Census,1960 Census,INED
85583,French Polynesia,1977,1977,Women,Divorced or Separated,[40-44],40,44,2.4,Census,1977 Census,US Census Bureau
196475,Poland,2002,2002,Men,Single,[20-24],20,24,87.74,Census,2002 Census,UNSD
47762,Congo,2011,2012,Women,Widowed,[45-49],45,49,9.8,Survey,2011-2012 DHS,DHS_STATcompiler
119316,Iran (Islamic Republic of),2016,2016,Men,Married,[35-39],35,39,88.77,Census,2016 Census,UNSD


In [249]:
df_14.drop_duplicates(inplace=True)
df_14.isnull().sum()

country                 0
year_start              0
year_end                0
sex                     0
marital_status          0
age_group               0
agestart                0
ageend                  0
datavalue               0
dataprocess             0
datacatalogshortname    0
datasource              0
dtype: int64

In [250]:
#df_14.to_csv("cleaned_marital_status_by_age_un.csv", index=False)

In [251]:
#df_14.to_sql('marital_status_by_age_un', engine, if_exists='replace', index=False)

In [252]:
df_15 = pd.read_csv('../data/processed/regions_un.csv', header=5, low_memory= False)
df_15.head(10)

Unnamed: 0,Region and subregion,ISO code,Regional Classification,Indicator,Year,AgeGroup,Percentage,Number,DataProcess
0,World,900,M49,Married or in-union women,1970,15-19,22.576683,71867.82,Estimate
1,World,900,M49,Married or in-union women,1970,20-24,63.802057,162860.4,Estimate
2,World,900,M49,Married or in-union women,1970,25-29,87.174827,182681.1,Estimate
3,World,900,M49,Married or in-union women,1970,30-34,90.825027,179121.4,Estimate
4,World,900,M49,Married or in-union women,1970,35-39,90.284386,161526.3,Estimate
5,World,900,M49,Married or in-union women,1970,40-44,86.483531,139334.4,Estimate
6,World,900,M49,Married or in-union women,1970,45-49,82.680237,116088.4,Estimate
7,World,900,M49,Married or in-union women,1970,15-49,69.379111,1013480.0,Estimate
8,World,900,M49,Married or in-union women,1971,15-19,22.630416,74127.62,Estimate
9,World,900,M49,Married or in-union women,1971,20-24,63.613178,170087.3,Estimate


In [253]:
df_15.columns = (df_15.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(','')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
    )
df_15.sample(6)

Unnamed: 0,regionandsubregion,isocode,regionalclassification,indicator,year,agegroup,percentage,number,dataprocess
9938,Eastern Asia,906,SDG-M49,Married or in-union women,1997,25-29,82.495304,116412.035541,Estimate
15787,Latin America and the Caribbean,904,SDG-M49,Married or in-union women,1999,30-34,75.701558,29657.295157,Estimate
13376,Eastern Europe,923,SDG-M49,Married or in-union women,2022,15-19,3.161273,226.565913,Estimate
21728,Micronesia,954,SDG-M49,Married or in-union women,2013,15-19,8.684367,4.26854,Estimate
18780,Northern America,905,SDG-M49,Married or in-union women,2049,35-39,66.107684,8926.66893,Projection
13480,Eastern Europe,923,SDG-M49,Married or in-union women,2035,15-19,2.178727,167.260981,Projection


In [254]:
df_15 = df_15.drop(columns=['regionalclassification'])

df_15.rename(columns={
    "regionandsubregion": "region",
    "isocode": "iso_code",
    "agegroup": "age_group",
    "dataprocess": "process"
}, inplace=True)

df_15.sample(10)

Unnamed: 0,region,iso_code,indicator,year,age_group,percentage,number,process
11629,Western Asia,922,Married or in-union women,2046,40-44,75.572153,19408.67,Projection
23560,Developing countries,902,Married or in-union women,1999,15-19,18.171333,43046.35,Estimate
5616,Eastern Africa,910,Married or in-union women,2024,15-19,18.232317,10171.47,Estimate
5070,Africa,903,Married or in-union women,2036,45-49,73.580028,65891.88,Projection
24261,Other developing countries,934,Married or in-union women,2005,40-44,88.5015,130051.1,Estimate
24955,Least developed countries,941,Married or in-union women,2011,30-34,86.369417,24980.18,Estimate
18832,Oceania,909,Married or in-union women,1975,15-19,9.205688,108.9346,Estimate
9455,Central Asia,5500,Married or in-union women,2017,15-49,67.925864,25691.38,Estimate
8205,Western Africa,914,Married or in-union women,2023,40-44,85.623708,18043.67,Estimate
23943,Developing countries,902,Married or in-union women,2046,15-49,62.086088,1189734.0,Projection


In [255]:
df_15.dropna(inplace=True)
df_15.isnull().sum()

region        0
iso_code      0
indicator     0
year          0
age_group     0
percentage    0
number        0
process       0
dtype: int64

In [256]:
print(df_15['number'] % 1 != 0)

0        True
1        True
2        True
3        True
4        True
         ... 
28507    True
28508    True
28509    True
28510    True
28511    True
Name: number, Length: 28512, dtype: bool


In [257]:
df_15['percentage'] = df_15['percentage'].round(2)
df_15['number'] = df_15['number'].astype(int)
df_15.head(10)

Unnamed: 0,region,iso_code,indicator,year,age_group,percentage,number,process
0,World,900,Married or in-union women,1970,15-19,22.58,71867,Estimate
1,World,900,Married or in-union women,1970,20-24,63.8,162860,Estimate
2,World,900,Married or in-union women,1970,25-29,87.17,182681,Estimate
3,World,900,Married or in-union women,1970,30-34,90.83,179121,Estimate
4,World,900,Married or in-union women,1970,35-39,90.28,161526,Estimate
5,World,900,Married or in-union women,1970,40-44,86.48,139334,Estimate
6,World,900,Married or in-union women,1970,45-49,82.68,116088,Estimate
7,World,900,Married or in-union women,1970,15-49,69.38,1013479,Estimate
8,World,900,Married or in-union women,1971,15-19,22.63,74127,Estimate
9,World,900,Married or in-union women,1971,20-24,63.61,170087,Estimate


In [258]:
#df_15.to_csv('cleaned_regions_un.csv', index=False)



In [259]:
#df_15.to_sql('regions_un', engine, if_exists='replace',index=False)

In [260]:
df_16 = pd.read_csv('../data/processed/smam_un.csv', header= 2, low_memory= False)
df_16.head(10)

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,DataValue,DataProcess,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Note on Data,Note on Country and Population
0,Afghanistan,4,1972,1974,Men,26.0,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,
1,Afghanistan,4,1972,1974,Women,18.1,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,
2,Afghanistan,4,1979,1979,Men,25.3,Census,1979 Census,280,Afghanistan 1979 Census,UNSD,Data have not been adjusted for underenumeration.,Excluding nomad population.
3,Afghanistan,4,1979,1979,Women,17.8,Census,1979 Census,280,Afghanistan 1979 Census,UNSD,Data have not been adjusted for underenumeration.,Excluding nomad population.
4,Afghanistan,4,2010,2010,Women,21.5,Survey,2010 DHS Special,5045,Afghanistan 2010 Mortality Survey,National statistics,,Excluding areas in the South zone.
5,Afghanistan,4,2010,2011,Women,21.2,Survey,2010-2011 MICS,4983,Afghanistan 2010-2011 Multiple Indicator Clust...,MICS,,
6,Afghanistan,4,2015,2016,Men,24.3,Survey,2015 DHS,5689,Afghanistan 2015 Demographic and Health Survey,DHS_HH,,
7,Afghanistan,4,2015,2016,Men,24.7,Survey,2015 DHS,5689,Afghanistan 2015 Demographic and Health Survey,DHS_STATcompiler,,
8,Afghanistan,4,2015,2016,Women,21.5,Survey,2015 DHS,5689,Afghanistan 2015 Demographic and Health Survey,DHS_STATcompiler,,
9,Afghanistan,4,2015,2016,Women,21.3,Survey,2015 DHS,5689,Afghanistan 2015 Demographic and Health Survey,DHS_HH,,


In [261]:
df_17 = pd.read_csv('../data/Raw/export-2025-07-29T15_11_06.755Z.csv')
df_17

ParserError: Error tokenizing data. C error: Expected 1 fields in line 4, saw 3


In [None]:
df_18 = pd.read_csv('../data/Raw/export-2025-07-29T15_11_06.950Z.csv')
df_18

In [None]:
df_20 = pd.read_csv('../data/Raw/export-2025-08-07T12_25_12.019Z.csv')
df_20