In [218]:
import pandas as pd
import os
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from sqlalchemy import create_engine, types
from sqlalchemy import text 

In [219]:
from dotenv import dotenv_values

config = dotenv_values()

# define variables for the login
pg_user = config['POSTGRES_USER']  # align the key label with your .env file !
pg_host = config['POSTGRES_HOST']
pg_port = config['POSTGRES_PORT']
pg_db = config['POSTGRES_DB']
pg_schema = config['POSTGRES_SCHEMA']
pg_pass = config['POSTGRES_PASS']

In [220]:
url = f'postgresql://{pg_user}:{pg_pass}@{pg_host}:{pg_port}/{pg_db}'
engine = create_engine(url, echo=False)

In [221]:
my_schema = 'team_5' # update it to your schema

with engine.begin() as conn: 
    result = conn.execute(text(f'SET search_path TO {my_schema};'))

In [222]:
df_1= pd.read_csv('../data/Raw/World_Marriage_Dataset.csv')

In [223]:
df_1.drop(columns=["Sr.No."], inplace=True)

In [224]:
df_1.columns = df_1.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [225]:
df_1.rename(columns={
    "agegroup": "age_group",
    "maritalstatus": "marital_status",
    "dataprocess": "data_process",
    "Data Collection (Start Year)": "data_collection_start_year",
    "Data Collection (End Year)": "data_collection_end_year",
    "Data Source": "data_source",
    "Country": "country",
    "Sex": "sex"
}, inplace=True)

In [226]:
df_1.drop_duplicates(inplace=True)

df_1['data_collection_start_year'] = df_1['data_collection_start_year'].astype(str).str.replace(',', '').astype(int)
df_1['data_collection_end_year'] = df_1['data_collection_end_year'].astype(str).str.replace(',', '').astype(int)

In [227]:
df_1.isnull().sum()

country                       0
age_group                     0
sex                           0
marital_status                0
data_process                  0
data_collection_start_year    0
data_collection_end_year      0
data_source                   0
dtype: int64

In [228]:
#df_1.to_csv("cleaned_world_marriage.csv", index=False)

In [229]:
#df_1.to_sql('world_marriage', engine, if_exists='replace', index=False)

In [230]:
df_2 = pd.read_csv('../data/Raw/age-at-marriage-women.csv')

In [231]:
df_2.columns = df_2.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [232]:
df_2 = df_2.drop(columns=['1005564annotations'])

df_2.rename(columns={
    "entity": "country",   
}, inplace=True)

In [233]:
df_2.drop_duplicates(inplace=True)


In [234]:
df_2['year'] = df_2['year'].astype(str).str.replace(',', '').astype(int)

In [235]:
df_2.isnull().sum()

country                                0
code                                   0
year                                   0
mean_age_of_women_at_first_marriage    0
dtype: int64

In [236]:
#df_2.to_csv("cleaned_age_at_marriage_women.csv", index=False)

In [237]:
#df_2.to_sql('age_at_marriage_women', engine, if_exists='replace', index=False)

In [238]:
df_3= pd.read_csv('../data/Raw/marriage-rate-per-1000-inhabitants.csv')

In [239]:
df_3.columns = df_3.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [240]:
df_3.rename(columns={
    "entity": "country",   
}, inplace=True)

In [241]:
df_3['year'] = df_3['year'].astype(str).str.replace(',', '').astype(int)

In [242]:
df_3.drop_duplicates(inplace=True)


In [243]:
df_3.isnull().sum()

country                                          0
code                                             0
year                                             0
crude_marriage_rate_marriages_per_1000_people    0
dtype: int64

In [244]:
#df_3.to_csv("cleaned_marriage-rate-per-1000-inhabitants.csv", index=False)

In [245]:
#df_3.to_sql('married_rate_per_1000', engine, if_exists='replace', index=False)

In [246]:
df_4= pd.read_csv('../data/Raw/marriage-rates-in-1990-vs-2020.csv')

In [247]:
df_4.columns = df_4.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [248]:
df_4 = df_4.drop(columns=['worldregionsaccordingtoowid'])

df_4.rename(columns={
    "crudemarriageratemarriagesper1000people": "crude_marriage_rate",
    "crudemarriageratemarriagesper1000people1": "crude_marriage_rate_people1",
    "year1": "year_1",
    "entity": "country"
}, inplace=True)

In [249]:
df_4.drop_duplicates(inplace=True)
df_4.dropna(inplace=True)

In [250]:
df_4['year_1'] = pd.to_numeric(df_4['year_1'], errors='coerce').astype('Int64')

In [251]:
df_4.isnull().sum()

country                        0
code                           0
year                           0
crude_marriage_rate            0
crude_marriage_rate_people1    0
year_1                         0
dtype: int64

In [252]:
#df_4.to_csv("cleaned_marriage-rates-in-1990-vs-2020.csv", index=False)

In [253]:
#df_4.to_sql('marriage_rates_in_1990_vs_2020', engine, if_exists='replace', index=False)

In [254]:
df_5 = pd.read_csv('../data/Raw/share-of-births-outside-marriage.csv')

In [255]:
df_5.columns = df_5.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [256]:

df_5.rename(columns={
    "shareofbirthsoutsideofmarriageofallbirths": "share_of_births_outside_of_marriage",
    "entity": "country"
}, inplace=True)

df_5.drop_duplicates(inplace=True)

In [257]:
df_5.isnull().sum()

country                                0
code                                   0
year                                   0
share_of_births_outside_of_marriage    0
dtype: int64

In [258]:
#df_5.to_csv("cleaned_share-of-births-outside-marriage.csv", index=False)

In [259]:
#df_5.to_sql('share_of_births_outside_marriage', engine, if_exists='replace', index=False)

In [260]:
df_6 = pd.read_csv('../data/Raw/share-of-men-in-england-and-wales-who-have-ever-married-by-age.csv')

In [261]:
df_6.columns = df_6.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

df_6.drop_duplicates(inplace=True)
df_6.sample(5)

Unnamed: 0,entity,code,year,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort
64,Women,,47,84.8,91.7,95.6,87.0,75.4,,,
13,Men,,30,72.9,76.4,83.3,63.9,41.4,25.5,20.1,
8,Men,,25,39.6,41.6,59.7,40.6,19.4,8.5,5.0,
31,Men,,48,92.5,91.7,92.3,81.3,69.9,,,
39,Women,,22,24.3,39.3,57.5,40.6,18.2,7.1,2.9,0.9


In [262]:
df_6 = df_6.drop(columns=['code','proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort'])

df_6.rename(columns={
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort": "1900_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort": "1920_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort": "1940_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort": "1960_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort": "1970_birthcohort",
    "entity": "sex"
}, inplace=True)

In [263]:
df_6.isnull().sum()

sex                 0
year                0
1900_birthcohort    0
1920_birthcohort    0
1940_birthcohort    0
1960_birthcohort    0
1970_birthcohort    0
dtype: int64

In [264]:
#df_6.to_csv("cleaned_share-of-men-in-england-and-wales-who-have-ever-married-by-age.csv", index=False)

In [265]:
#df_6.to_sql('men_in_england_and_wales_married_by_age', engine, if_exists='replace', index=False)

In [266]:
df_7 = pd.read_csv('../data/Raw/share-of-births-outside-marriage.csv')

In [267]:
df_7.columns = df_7.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [268]:
df_7.rename(columns={
    "shareofsingleparenthouseholds": "share_of_single_parent_households",
    "entity": "country"
}, inplace=True)

df_7.drop_duplicates(inplace=True)
df_7.sample(5)

Unnamed: 0,country,code,year,shareofbirthsoutsideofmarriageofallbirths
613,Finland,FIN,1976,10.9
825,Hungary,HUN,1982,7.7
759,Greece,GRC,1977,1.3
1935,Sweden,SWE,2011,54.3
1156,Lithuania,LTU,1976,6.1


In [269]:
df_7.isnull().sum()

country                                      0
code                                         0
year                                         0
shareofbirthsoutsideofmarriageofallbirths    0
dtype: int64

In [270]:
#df_7.to_csv("cleaned_share-of-single-parent-households.csv", index=False)

In [271]:
#df_7.to_sql('single_parent_households', engine, if_exists='replace', index=False)

In [272]:
df_8 = pd.read_csv('../data/Raw/share-of-women-in-england-and-wales-who-have-ever-married-by-age.csv')

In [273]:
df_8.columns = df_8.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [274]:
df_8['code'] = df_8['code'].fillna('GBR')
df_8.sample(5)

Unnamed: 0,entity,code,year,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort
39,Women,GBR,22,24.3,39.3,57.5,40.6,18.2,7.1,2.9,0.9
35,Women,GBR,18,0.4,1.6,4.6,4.6,1.3,0.4,0.1,0.0
17,Men,GBR,34,83.2,85.5,88.5,72.4,53.7,40.9,,
2,Men,GBR,19,0.8,0.6,2.0,2.5,0.7,0.3,0.1,0.0
54,Women,GBR,37,80.6,89.4,94.4,84.1,69.4,57.8,,


In [275]:
df_8 = df_8.drop(columns=['code','proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort'])

df_8.rename(columns={
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort": "1900_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort": "1920_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort": "1940_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort": "1960_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort": "1970_birthcohort",
    "entity": "sex"
}, inplace=True)

df_8.drop_duplicates(inplace=True)
df_8.sample(5)

Unnamed: 0,sex,year,1900_birthcohort,1920_birthcohort,1940_birthcohort,1960_birthcohort,1970_birthcohort
46,Women,29,68.7,80.4,89.8,74.5,52.4
55,Women,38,81.2,89.8,94.6,84.5,70.5
23,Men,40,89.6,89.9,91.1,78.0,64.0
67,Women,50,85.4,92.0,95.7,87.5,76.3
22,Men,39,88.8,89.5,90.8,77.4,62.6


In [276]:
df_8.isnull().sum()

sex                 0
year                0
1900_birthcohort    0
1920_birthcohort    0
1940_birthcohort    0
1960_birthcohort    0
1970_birthcohort    0
dtype: int64

In [277]:
#df_8.to_csv("cleaned_share-of-women-in-england-and-wales-who-have-ever-married-by-age.csv", index=False)

In [278]:
#df_8.to_sql('women_in_england_and_wales_married_by_age', engine, if_exists='replace', index=False)

In [279]:
#!pip install openpyxl

In [280]:
df_excel_1 = pd.read_excel('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx')

In [281]:
#all_sheets = pd.read_excel('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx', sheet_name=None)

In [282]:
xls_1 = pd.ExcelFile('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx')
print(xls_1.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'MARITAL_STATUS_BY_AGE', 'CURRENTLY MARRIED', 'EVER_MARRIED', 'SMAM']


In [283]:
excel_1 = '../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx'

# Output directory (make sure it exists)
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)

# List of sheets you want to extract
sheets_to_extract = ['MARITAL_STATUS_BY_AGE', 'CURRENTLY MARRIED', 'EVER_MARRIED', 'SMAM']

In [284]:
"""for sheet in sheets_to_extract:
    # Read just this sheet into a DataFrame
    df_excel_1 = pd.read_excel(excel_1, sheet_name=sheet)
    
    # Optional: Clean the filename (replace spaces with underscores, etc.)
    csv_name = sheet.replace(' ', '_').lower() + '.csv'
    csv_path = os.path.join(output_dir, csv_name)
    
    # Save the DataFrame as CSV
    df_excel_1.to_csv(csv_path, index=False)
    print(f"Saved: {csv_path}")
"""

'for sheet in sheets_to_extract:\n    # Read just this sheet into a DataFrame\n    df_excel_1 = pd.read_excel(excel_1, sheet_name=sheet)\n    \n    # Optional: Clean the filename (replace spaces with underscores, etc.)\n    csv_name = sheet.replace(\' \', \'_\').lower() + \'.csv\'\n    csv_path = os.path.join(output_dir, csv_name)\n    \n    # Save the DataFrame as CSV\n    df_excel_1.to_csv(csv_path, index=False)\n    print(f"Saved: {csv_path}")\n'

In [285]:
xls_2 = pd.ExcelFile('../data/Raw/undesa_pd_2019_world_fertility_dataset.xlsx')
print(xls_2.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'FERTILITY INDICATORS']


In [286]:
excel_2 = '../data/Raw/undesa_pd_2019_world_fertility_dataset.xlsx'
sheet_name = 'FERTILITY INDICATORS'
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)

df_excel_2 = pd.read_excel(excel_2, sheet_name=sheet_name)


In [287]:
"""csv_name = sheet_name.replace(' ', '_').lower() + '.csv'
csv_path = os.path.join(output_dir, csv_name)
df_excel_2.to_csv(csv_path, index=False)
print(f"Saved: {csv_path}")
"""

'csv_name = sheet_name.replace(\' \', \'_\').lower() + \'.csv\'\ncsv_path = os.path.join(output_dir, csv_name)\ndf_excel_2.to_csv(csv_path, index=False)\nprint(f"Saved: {csv_path}")\n'

In [288]:
xls_3 = pd.ExcelFile('../data/Raw/undesa_pd_ds_1970-2030_fp_rev-2024_rev.xlsx')
print(xls_3.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'Countries', 'Regions']


In [289]:
excel_3 = '../data/Raw/undesa_pd_ds_1970-2030_fp_rev-2024_rev.xlsx'
sheets_to_extract = ['Countries', 'Regions']
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)


In [290]:
"""
for sheet in sheets_to_extract:
    df = pd.read_excel(excel_3, sheet_name=sheet)
    csv_name = sheet.replace(' ', '_').lower() + '.csv'
    csv_path = os.path.join(output_dir, csv_name)
    df.to_csv(csv_path, index=False)
    print(f"Saved: {csv_path}")

"""

'\nfor sheet in sheets_to_extract:\n    df = pd.read_excel(excel_3, sheet_name=sheet)\n    csv_name = sheet.replace(\' \', \'_\').lower() + \'.csv\'\n    csv_path = os.path.join(output_dir, csv_name)\n    df.to_csv(csv_path, index=False)\n    print(f"Saved: {csv_path}")\n\n'

In [291]:
df_9 = pd.read_csv('../data/Raw/unpopulation_dataportal_20250728095844.csv')
df_9.sample(5)

Unnamed: 0,IndicatorId,IndicatorName,IndicatorShortName,Source,SourceYear,Author,LocationId,Location,Iso2,Iso3,...,AgeStart,AgeEnd,Age,CategoryId,Category,EstimateTypeId,EstimateType,EstimateMethodId,EstimateMethod,Value
8794,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,300,Greece,GR,GRC,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,56.76
16864,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,584,Marshall Islands,MH,MHL,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,67.04
19915,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,688,Serbia,RS,SRB,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,58.41
19526,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,678,Sao Tome and Principe,ST,STP,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,55.63
14819,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,504,Morocco,MA,MAR,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,64.94


In [292]:
df_9.columns = df_9.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_9.sample(5)

Unnamed: 0,indicatorid,indicatorname,indicatorshortname,source,sourceyear,author,locationid,location,iso2,iso3,...,agestart,ageend,age,categoryid,category,estimatetypeid,estimatetype,estimatemethodid,estimatemethod,value
10005,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,352,Iceland,IS,ISL,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,59.08
23411,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,804,Ukraine,UA,UKR,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,70.79
17779,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,616,Poland,PL,POL,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,57.03
2269,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,72,Botswana,BW,BWA,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,41.58
283,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,12,Algeria,DZ,DZA,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,55.84


In [293]:
df_9 = df_9.drop(columns=['indicatorid','indicatorshortname',
    'source',
    'author', 'locationid', 'iso2','estimatetypeid','category','categoryid','agestart','ageend','author','ageid', 'estimatetype','variantid','sexid','timeid'])

df_9.rename(columns={
    "sourceyear": "year",
    "location": "country",
    "estimatemethodid": "estimate_method",
    "iso3": "code",
}, inplace=True)



In [294]:
df_9.drop_duplicates(inplace=True)

In [295]:
df_9

Unnamed: 0,indicatorname,year,country,code,time,variant,sex,age,estimate_method,estimatemethod,value
0,Currently married (Percent),2024,Afghanistan,AFG,1970,Median,Female,15-49,2,Interpolation,80.94
2,Currently married (Percent),2024,Afghanistan,AFG,1971,Median,Female,15-49,2,Interpolation,80.90
4,Currently married (Percent),2024,Afghanistan,AFG,1972,Median,Female,15-49,2,Interpolation,80.87
6,Currently married (Percent),2024,Afghanistan,AFG,1973,Median,Female,15-49,2,Interpolation,80.84
8,Currently married (Percent),2024,Afghanistan,AFG,1974,Median,Female,15-49,2,Interpolation,80.53
...,...,...,...,...,...,...,...,...,...,...,...
25078,Currently married (Percent),2024,Zambia,ZMB,2021,Median,Female,15-49,3,Projection,54.31
25080,Currently married (Percent),2024,Zambia,ZMB,2022,Median,Female,15-49,3,Projection,53.82
25082,Currently married (Percent),2024,Zambia,ZMB,2023,Median,Female,15-49,3,Projection,53.35
25084,Currently married (Percent),2024,Zambia,ZMB,2024,Median,Female,15-49,3,Projection,52.91


In [296]:
df_9.isnull().sum()

indicatorname      0
year               0
country            0
code               0
time               0
variant            0
sex                0
age                0
estimate_method    0
estimatemethod     0
value              0
dtype: int64

In [297]:
#df_9.to_csv("cleaned_unpopulation_dataportal.csv", index=False)

In [298]:
#df_9.to_sql('unpopulation_dataportal', engine, if_exists='replace', index=False)

In [299]:
df_10 = pd.read_csv('../data/processed/countries_un.csv',  header=5, low_memory=False)

In [300]:
df_10.columns = (
    df_10.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
)
df_10.sample(10)

Unnamed: 0,countryorarea,isocode,indicator,year,agegroup,percentage,number,dataprocess
81193,Martinique,474,Married or in-union women,1994,20-24,23.318182,3.266877,Estimate
56325,Haiti,332,Married or in-union women,2044,40-44,77.966077,389.686537,Projection
106095,Puerto Rico,630,Married or in-union women,2028,15-49,39.023023,268.436252,Projection
24142,Chad,148,Married or in-union women,1990,45-49,72.9,65.343551,Estimate
109973,Saint Kitts and Nevis,659,Married or in-union women,2027,40-44,73.227646,1.482494,Projection
35241,Dominica,212,Married or in-union women,2001,20-24,38.2,0.845557,Estimate
72441,Lesotho,426,Married or in-union women,2034,20-24,26.577549,34.039462,Projection
122108,Spain,724,Married or in-union women,2005,35-39,79.018049,1415.666422,Estimate
135233,Tuvalu,798,Married or in-union women,2026,20-24,41.320619,0.140284,Projection
52311,Grenada,308,Married or in-union women,2028,15-49,42.004899,12.975103,Projection


In [301]:
df_10.rename(columns={
    "dataprocess": "data_process",
}, inplace=True)

df_10.drop_duplicates(inplace=True)
df_10.sample(5)

Unnamed: 0,countryorarea,isocode,indicator,year,agegroup,percentage,number,data_process
142912,Venezuela (Bolivarian Republic of),862,Married or in-union women,2014,15-19,15.454732,209.349728,Projection
59378,India,356,Married or in-union women,2021,25-29,87.047707,50695.979799,Projection
59224,India,356,Married or in-union women,2002,15-19,27.250857,14813.601505,Estimate
14873,Belize,84,Married or in-union women,2047,20-24,34.901841,5.783584,Projection
67222,Kenya,404,Married or in-union women,2029,45-49,68.456433,939.980755,Projection


In [302]:
for col in ['percentage', 'number']:
    if col in df_10.columns:
        df_10[col] = (
            df_10[col]
            .astype(str)
            .str.replace(',', '.', regex=False)
            .str.extract(r'([-+]?[0-9]*\.?[0-9]+)', expand=False)
            .astype(float)
            .round(2)
        )

In [303]:
unnamed_cols = [col for col in df_10.columns if 'unnamed' in col.lower()]
df_10.drop(columns=unnamed_cols, inplace=True)

In [304]:
df_10.dropna(inplace=True)

In [305]:
df_10.isnull().sum()

countryorarea    0
isocode          0
indicator        0
year             0
agegroup         0
percentage       0
number           0
data_process     0
dtype: int64

In [306]:
df_10.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145800 entries, 0 to 145799
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   countryorarea  145800 non-null  object 
 1   isocode        145800 non-null  int64  
 2   indicator      145800 non-null  object 
 3   year           145800 non-null  int64  
 4   agegroup       145800 non-null  object 
 5   percentage     145800 non-null  float64
 6   number         145800 non-null  float64
 7   data_process   145800 non-null  object 
dtypes: float64(2), int64(2), object(4)
memory usage: 8.9+ MB


In [307]:
#df_10.to_csv("cleaned_countries_1970_2025_un.csv", index=False)

In [308]:
#df_10.to_sql('countries_1970_2025_un', engine, if_exists='replace', index=False)

In [309]:
df_11 = pd.read_csv('../data/processed/currently_married_un.csv',  header=2, low_memory=False)

In [310]:
df_11.sample(8)

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,AgeGroup,AgeStart,AgeEnd,DataValue,DataProcess,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Data,Note on Country and Population
8088,"China, Hong Kong SAR",344,1981,1981,Women,[50-54],50,54,83.63,Census,1981 Census,1548,"China, Hong Kong (SAR) 1981 Census",UNSD,1.0,,
27839,Lesotho,426,2006,2006,Men,[55-59],55,59,80.7,Census,2006 Census,1425,Lesotho 2006 Census,UNSD,1.0,,
48132,Switzerland,756,2008,2008,Men,[55-59],55,59,75.09,Estimate,2008 Estimate,2228,Switzerland 2008 Estimate,UNSD,,,
48024,Switzerland,756,2003,2003,Men,[75+],75,999,70.34,Estimate,2003 Estimate,2228,Switzerland 2003 Estimate,UNSD,,,
12642,Ecuador,218,1982,1982,Women,[10-14],10,14,1.17,Census,1982 Census,1440,Ecuador 1982 Census,UNSD,1.0,Data have not been adjusted for underenumeration.,Excluding nomadic Indian tribes.
29843,Malawi,454,1970,1972,Women,[50+],50,999,58.9,Survey,1970-1972 PCS,1847,Malawi 1970-1972 Population Change Survey,INED,,,
44282,Slovenia,705,2005,2005,Men,[65-69],65,69,80.01,Estimate,2005 Estimate,2218,Slovenia 2005 Estimate,UNSD,,,Excluding citizens temporarily residing abroad.
7305,Chad,148,2014,2014,Women,[65-69],65,69,31.13,Survey,2014 DHS,5564,Chad 2014 Demographic and Health Survey,DHS_HH,1.0,,


In [311]:
df_11.columns = (
    df_11.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
    )
df_11.sample(8)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,agegroup,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteondata,noteoncountryandpopulation
13007,Egypt,818,2005,2005,Women,[15-19],15,19,12.2,Survey,2005 DHS,1806,Egypt 2005 Demographic and Health Survey,DHS_HH,,,
7375,Channel Islands,830,1991,1991,Men,[20-24],20,24,9.63,Census,1991 Census,983,Channel Islands 1991 Census,US Census Bureau,,,Data pertain to resident population of Jersey ...
23769,Ireland,372,2002,2002,Women,[30-34],30,34,55.55,Census,2002 Census,1542,Ireland 2002 Census,UNSD,,,
948,Armenia,51,2015,2016,Men,[65-69],65,69,87.7,Survey,2015 DHS,5806,Armenia 2015-16 Demographic and Health Survey,DHS_HH,,,
30438,Mali,466,1987,1987,Men,[75+],75,999,87.13,Census,1987 Census,1324,Mali 1987 Census,UNSD,1.0,,
18291,Greenland,304,2009,2009,Women,[65-69],65,69,48.16,Estimate,2009 Estimate,2109,Greenland 2009 Estimate,UNSD,,Based on data compiled from registers.,
36591,Norway,578,2012,2012,Men,[10-14],10,14,0.0,Estimate,2012 Estimate,2180,Norway 2012 Estimate,UNSD,,,Including residents temporarily outside the co...
17276,Germany,276,2013,2013,Women,[50-54],50,54,65.93,Estimate,2013 Estimate,2102,Germany 2013 Estimate,UNSD,,,


In [312]:
df_11 = df_11.drop(columns = ['datacataloglongname', 'datacatalogid', 'yearstart' , 'yearend', 'noteondata', 'noteoncountryandpopulation', 'including_consensual_unions'])

df_11.rename(columns={
    "agestart": "age_start",
    "countryorarea": "country",
    "datasource": "data_source",
    "datavalue" : "data_value"
}, inplace=True)

df_11.sample(10)

Unnamed: 0,country,isocode,sex,agegroup,age_start,ageend,data_value,dataprocess,datacatalogshortname,data_source
45083,South Africa,710,Women,[40-44],40,44,59.75,Census,2011 Census,UNSD
40428,Rwanda,646,Men,[40-44],40,44,93.0,Survey,2007-2008 DHS Interim,DHS_HH
1233,Australia,36,Women,[20-24],20,24,25.71,Estimate,1989 Estimate,UNSD
43844,Slovakia,703,Women,[60-64],60,64,62.97,Estimate,2009 Estimate,UNSD
40841,Saint Vincent and the Grenadines,670,Men,[25-29],25,29,9.95,Census,1991 Census,UNSD
27149,Latvia,428,Women,[55-59],55,59,62.19,Estimate,2000 Estimate,UNSD
31823,Mongolia,496,Men,[45-49],45,49,90.0,Census,1979 Census,National statistics
47546,Sweden,752,Men,[65-69],65,69,60.43,Estimate,2018 Estimate,UNSD
38542,Poland,616,Men,[65-69],65,69,78.36,Estimate,1998 Estimate,UNSD
26999,Lao People's Dem. Republic,418,Men,[20-24],20,24,40.69,Survey,2017 DHS-MICS Special,MICS


In [313]:
df_11.drop_duplicates(inplace=True)

In [314]:
df_11.isnull().sum()

country                 0
isocode                 0
sex                     0
agegroup                0
age_start               0
ageend                  0
data_value              0
dataprocess             0
datacatalogshortname    0
data_source             0
dtype: int64

In [315]:
#df_11.to_csv("cleaned_currently_married_un.csv", index=False)

In [316]:
#df_11.to_sql('currently_married_un', engine, if_exists='replace', index=False)

In [317]:
df_12 = pd.read_csv('../data/processed/ever_married_un.csv', header= 2, low_memory = False)
df_12.head()

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,AgeGroup,AgeStart,AgeEnd,DataValue,DataProcess,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Data,Note on Country and Population
0,Afghanistan,4,1972,1974,Men,[15-19],15,19,7.7,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
1,Afghanistan,4,1972,1974,Men,[20-24],20,24,32.6,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
2,Afghanistan,4,1972,1974,Men,[25-29],25,29,61.4,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
3,Afghanistan,4,1972,1974,Men,[30-34],30,34,83.0,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
4,Afghanistan,4,1972,1974,Men,[35-39],35,39,91.2,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,


In [318]:
df_12.columns = (
    df_12.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
    )
df_12.sample(8)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,agegroup,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteondata,noteoncountryandpopulation
24971,Ireland,372,1991,1991,Women,[75+],75,999,78.0,Census,1991 Census,1080,Ireland 1991 Census,UNSD,1.0,,
18638,Ghana,288,2014,2014,Women,[45-49],45,49,99.0,Survey,2014 DHS,5778,Ghana 2014 Demographic and Health Survey,DHS_STATcompiler,1.0,,
26069,Israel,376,2013,2013,Men,[45-49],45,49,90.79,Estimate,2013 Estimate,2127,Israel 2013 Estimate,UNSD,,,
15989,Finland,246,2005,2005,Women,[15-19],15,19,0.53,Estimate,2005 Estimate,2093,Finland 2005 Estimate,UNSD,1.0,,
25607,Israel,376,1987,1987,Men,[50-54],50,54,97.2,Estimate,1987 Estimate,2127,Israel 1987 Estimate,UNSD,,,Including data for East Jerusalem and Israeli ...
37409,Niue,570,2006,2006,Men,[35-39],35,39,73.47,Census,2006 Census,2489,Niue 2006 Census,National statistics,1.0,,
9914,Costa Rica,188,2006,2006,Women,[30-34],30,34,84.58,Estimate,2006 Estimate,2075,Costa Rica 2006 Estimate,UNSD,1.0,,
52014,Turkey,792,1975,1975,Men,[35-39],35,39,95.02,Census,1975 Census,1406,Turkey 1975 Census,UNSD,,,


In [319]:
df_12 = df_12.drop(columns = ['yearstart', 'yearend', 'datacatalogshortname', 'datacatalogid', 'datacataloglongname', 'including_consensual_unions', 'noteondata', 'noteoncountryandpopulation'])

df_12.rename(columns={
    "agestart": "age_start",
    "ageend": "age_end",
    "countryorarea": "country"
}, inplace=True)
df_12.sample(8)

Unnamed: 0,country,isocode,sex,agegroup,age_start,age_end,datavalue,dataprocess,datasource
35058,Nepal,524,Women,[70-74],70,74,98.8,Survey,DHS_HH
1814,Austria,40,Women,[45-49],45,49,90.76,Estimate,UNSD
28845,Latvia,428,Men,[50-54],50,54,90.81,Estimate,UNSD
1010,Aruba,533,Men,[65-69],65,69,93.36,Census,UNSD
28145,Kuwait,414,Women,[55-59],55,59,97.52,Census,UNSD
24082,Indonesia,360,Men,[45-49],45,49,98.52,Census,UNSD
40627,Poland,616,Women,[35-39],35,39,91.45,Census,UNSD
27680,Kenya,404,Women,[35-39],35,39,96.5,Census,INED


In [None]:
#df_12.to_csv("cleaned_ever_married_un.csv", index=False)

In [None]:
#df_12.to_sql('ever_married_un', engine, if_exists= 'replace', index= False)

826