In [194]:
import pandas as pd
import os
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from sqlalchemy import create_engine, types
from sqlalchemy import text 

In [195]:
from dotenv import dotenv_values

config = dotenv_values()

# define variables for the login
pg_user = config['POSTGRES_USER']  # align the key label with your .env file !
pg_host = config['POSTGRES_HOST']
pg_port = config['POSTGRES_PORT']
pg_db = config['POSTGRES_DB']
pg_schema = config['POSTGRES_SCHEMA']
pg_pass = config['POSTGRES_PASS']

In [196]:
url = f'postgresql://{pg_user}:{pg_pass}@{pg_host}:{pg_port}/{pg_db}'
engine = create_engine(url, echo=False)

In [197]:
my_schema = 'team_5' # update it to your schema

with engine.begin() as conn: 
    result = conn.execute(text(f'SET search_path TO {my_schema};'))

In [198]:
df_1= pd.read_csv('../data/Raw/World_Marriage_Dataset.csv')

In [199]:
df_1.drop(columns=["Sr.No."], inplace=True)

In [200]:
df_1.columns = df_1.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [201]:
df_1.rename(columns={
    "agegroup": "age_group",
    "maritalstatus": "marital_status",
    "dataprocess": "data_process",
    "Data Collection (Start Year)": "data_collection_start_year",
    "Data Collection (End Year)": "data_collection_end_year",
    "Data Source": "data_source",
    "Country": "country",
    "Sex": "sex"
}, inplace=True)

In [202]:
df_1.drop_duplicates(inplace=True)

df_1['data_collection_start_year'] = df_1['data_collection_start_year'].astype(str).str.replace(',', '').astype(int)
df_1['data_collection_end_year'] = df_1['data_collection_end_year'].astype(str).str.replace(',', '').astype(int)

In [203]:
df_1.isnull().sum()

country                       0
age_group                     0
sex                           0
marital_status                0
data_process                  0
data_collection_start_year    0
data_collection_end_year      0
data_source                   0
dtype: int64

In [204]:
#df_1.to_csv("cleaned_world_marriage.csv", index=False)

In [205]:
#df_1.to_sql('world_marriage', engine, if_exists='replace', index=False)

In [206]:
df_2 = pd.read_csv('../data/Raw/age-at-marriage-women.csv')

In [207]:
df_2.columns = df_2.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [208]:
df_2 = df_2.drop(columns=['1005564annotations'])

df_2.rename(columns={
    "entity": "country",   
}, inplace=True)

In [209]:
df_2.drop_duplicates(inplace=True)


In [210]:
df_2['year'] = df_2['year'].astype(str).str.replace(',', '').astype(int)

In [211]:
df_2.isnull().sum()

country                                0
code                                   0
year                                   0
mean_age_of_women_at_first_marriage    0
dtype: int64

In [212]:
#df_2.to_csv("cleaned_age_at_marriage_women.csv", index=False)

In [213]:
#df_2.to_sql('age_at_marriage_women', engine, if_exists='replace', index=False)

In [214]:
df_3= pd.read_csv('../data/Raw/marriage-rate-per-1000-inhabitants.csv')

In [215]:
df_3.columns = df_3.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [216]:
df_3.rename(columns={
    "entity": "country",   
}, inplace=True)

In [217]:
df_3['year'] = df_3['year'].astype(str).str.replace(',', '').astype(int)

In [218]:
df_3.drop_duplicates(inplace=True)


In [219]:
df_3.isnull().sum()

country                                          0
code                                             0
year                                             0
crude_marriage_rate_marriages_per_1000_people    0
dtype: int64

In [220]:
#df_3.to_csv("cleaned_marriage-rate-per-1000-inhabitants.csv", index=False)

In [221]:
#df_3.to_sql('married_rate_per_1000', engine, if_exists='replace', index=False)

In [222]:
df_4= pd.read_csv('../data/Raw/marriage-rates-in-1990-vs-2020.csv')

In [223]:
df_4.columns = df_4.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [224]:
df_4 = df_4.drop(columns=['worldregionsaccordingtoowid'])

df_4.rename(columns={
    "crudemarriageratemarriagesper1000people": "crude_marriage_rate",
    "crudemarriageratemarriagesper1000people1": "crude_marriage_rate_people1",
    "year1": "year_1",
    "entity": "country"
}, inplace=True)

In [225]:
df_4.drop_duplicates(inplace=True)
df_4.dropna(inplace=True)

In [226]:
df_4['year_1'] = pd.to_numeric(df_4['year_1'], errors='coerce').astype('Int64')

In [227]:
df_4.isnull().sum()

country                        0
code                           0
year                           0
crude_marriage_rate            0
crude_marriage_rate_people1    0
year_1                         0
dtype: int64

In [228]:
#df_4.to_csv("cleaned_marriage-rates-in-1990-vs-2020.csv", index=False)

In [229]:
#df_4.to_sql('marriage_rates_in_1990_vs_2020', engine, if_exists='replace', index=False)

In [230]:
df_5 = pd.read_csv('../data/Raw/share-of-births-outside-marriage.csv')

In [231]:
df_5.columns = df_5.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [232]:

df_5.rename(columns={
    "shareofbirthsoutsideofmarriageofallbirths": "share_of_births_outside_of_marriage",
    "entity": "country"
}, inplace=True)

df_5.drop_duplicates(inplace=True)

In [233]:
df_5.isnull().sum()

country                                0
code                                   0
year                                   0
share_of_births_outside_of_marriage    0
dtype: int64

In [234]:
#df_5.to_csv("cleaned_share-of-births-outside-marriage.csv", index=False)

In [235]:
#df_5.to_sql('share_of_births_outside_marriage', engine, if_exists='replace', index=False)

In [236]:
df_6 = pd.read_csv('../data/Raw/share-of-men-in-england-and-wales-who-have-ever-married-by-age.csv')

In [237]:
df_6.columns = df_6.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

df_6.drop_duplicates(inplace=True)
df_6.sample(5)

Unnamed: 0,entity,code,year,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort
38,Women,,21,14.6,26.1,42.2,31.5,12.7,4.8,1.7,0.6
24,Men,,41,90.2,90.3,91.4,78.6,65.0,54.4,,
50,Women,,33,76.8,86.8,93.0,81.1,63.1,49.0,38.7,
31,Men,,48,92.5,91.7,92.3,81.3,69.9,,,
27,Men,,44,91.5,91.1,91.8,80.0,67.5,,,


In [238]:
df_6 = df_6.drop(columns=['code','proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort'])

df_6.rename(columns={
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort": "1900_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort": "1920_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort": "1940_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort": "1960_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort": "1970_birthcohort",
    "entity": "sex"
}, inplace=True)

In [239]:
df_6.isnull().sum()

sex                 0
year                0
1900_birthcohort    0
1920_birthcohort    0
1940_birthcohort    0
1960_birthcohort    0
1970_birthcohort    0
dtype: int64

In [240]:
#df_6.to_csv("cleaned_share-of-men-in-england-and-wales-who-have-ever-married-by-age.csv", index=False)

In [241]:
#df_6.to_sql('men_in_england_and_wales_married_by_age', engine, if_exists='replace', index=False)

In [242]:
df_7 = pd.read_csv('../data/Raw/share-of-births-outside-marriage.csv')

In [243]:
df_7.columns = df_7.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [244]:
df_7.rename(columns={
    "shareofsingleparenthouseholds": "share_of_single_parent_households",
    "entity": "country"
}, inplace=True)

df_7.drop_duplicates(inplace=True)
df_7.sample(5)

Unnamed: 0,country,code,year,shareofbirthsoutsideofmarriageofallbirths
530,Denmark,DNK,1985,43.0
1269,Malta,MLT,1967,1.3
573,Estonia,EST,1996,48.1
375,Croatia,HRV,2011,14.0
2006,Turkey,TUR,2010,2.6


In [245]:
df_7.isnull().sum()

country                                      0
code                                         0
year                                         0
shareofbirthsoutsideofmarriageofallbirths    0
dtype: int64

In [246]:
#df_7.to_csv("cleaned_share-of-single-parent-households.csv", index=False)

In [247]:
#df_7.to_sql('single_parent_households', engine, if_exists='replace', index=False)

In [248]:
df_8 = pd.read_csv('../data/Raw/share-of-women-in-england-and-wales-who-have-ever-married-by-age.csv')

In [249]:
df_8.columns = df_8.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [250]:
df_8['code'] = df_8['code'].fillna('GBR')
df_8.sample(5)

Unnamed: 0,entity,code,year,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort,proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort
49,Women,GBR,32,75.5,85.8,92.4,79.9,61.0,45.8,34.2,
14,Men,GBR,31,76.5,79.7,85.1,66.6,45.1,29.6,21.3,
53,Women,GBR,36,79.9,88.9,94.2,83.5,68.2,56.2,,
22,Men,GBR,39,88.8,89.5,90.8,77.4,62.6,52.5,,
37,Women,GBR,20,6.8,13.3,27.0,21.5,7.6,2.8,0.9,0.4


In [251]:
df_8 = df_8.drop(columns=['code','proportionsofmenorwomenwhohadevermarriedbyacertainagefor1980birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor1990birthcohort',
    'proportionsofmenorwomenwhohadevermarriedbyacertainagefor2000birthcohort'])

df_8.rename(columns={
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1900birthcohort": "1900_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1920birthcohort": "1920_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1940birthcohort": "1940_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1960birthcohort": "1960_birthcohort",
    "proportionsofmenorwomenwhohadevermarriedbyacertainagefor1970birthcohort": "1970_birthcohort",
    "entity": "sex"
}, inplace=True)

df_8.drop_duplicates(inplace=True)
df_8.sample(5)

Unnamed: 0,sex,year,1900_birthcohort,1920_birthcohort,1940_birthcohort,1960_birthcohort,1970_birthcohort
32,Men,49,92.7,91.8,92.3,81.5,70.3
8,Men,25,39.6,41.6,59.7,40.6,19.4
63,Women,46,84.5,91.6,95.5,86.9,75.0
58,Women,41,82.9,90.7,95.1,85.7,72.7
20,Men,37,87.1,88.4,90.1,75.8,59.8


In [252]:
df_8.isnull().sum()

sex                 0
year                0
1900_birthcohort    0
1920_birthcohort    0
1940_birthcohort    0
1960_birthcohort    0
1970_birthcohort    0
dtype: int64

In [253]:
#df_8.to_csv("cleaned_share-of-women-in-england-and-wales-who-have-ever-married-by-age.csv", index=False)

In [254]:
#df_8.to_sql('women_in_england_and_wales_married_by_age', engine, if_exists='replace', index=False)

In [255]:
#!pip install openpyxl

In [256]:
df_excel_1 = pd.read_excel('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx')

In [257]:
#all_sheets = pd.read_excel('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx', sheet_name=None)

In [258]:
xls_1 = pd.ExcelFile('../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx')
print(xls_1.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'MARITAL_STATUS_BY_AGE', 'CURRENTLY MARRIED', 'EVER_MARRIED', 'SMAM']


In [259]:
excel_1 = '../data/Raw/undesa_pd_2019_wmd_marital_status.xlsx'

# Output directory (make sure it exists)
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)

# List of sheets you want to extract
sheets_to_extract = ['MARITAL_STATUS_BY_AGE', 'CURRENTLY MARRIED', 'EVER_MARRIED', 'SMAM']

In [260]:
"""for sheet in sheets_to_extract:
    # Read just this sheet into a DataFrame
    df_excel_1 = pd.read_excel(excel_1, sheet_name=sheet)
    
    # Optional: Clean the filename (replace spaces with underscores, etc.)
    csv_name = sheet.replace(' ', '_').lower() + '.csv'
    csv_path = os.path.join(output_dir, csv_name)
    
    # Save the DataFrame as CSV
    df_excel_1.to_csv(csv_path, index=False)
    print(f"Saved: {csv_path}")
"""

'for sheet in sheets_to_extract:\n    # Read just this sheet into a DataFrame\n    df_excel_1 = pd.read_excel(excel_1, sheet_name=sheet)\n    \n    # Optional: Clean the filename (replace spaces with underscores, etc.)\n    csv_name = sheet.replace(\' \', \'_\').lower() + \'.csv\'\n    csv_path = os.path.join(output_dir, csv_name)\n    \n    # Save the DataFrame as CSV\n    df_excel_1.to_csv(csv_path, index=False)\n    print(f"Saved: {csv_path}")\n'

In [261]:
xls_2 = pd.ExcelFile('../data/Raw/undesa_pd_2019_world_fertility_dataset.xlsx')
print(xls_2.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'FERTILITY INDICATORS']


In [262]:
excel_2 = '../data/Raw/undesa_pd_2019_world_fertility_dataset.xlsx'
sheet_name = 'FERTILITY INDICATORS'
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)

df_excel_2 = pd.read_excel(excel_2, sheet_name=sheet_name)


In [263]:
"""csv_name = sheet_name.replace(' ', '_').lower() + '.csv'
csv_path = os.path.join(output_dir, csv_name)
df_excel_2.to_csv(csv_path, index=False)
print(f"Saved: {csv_path}")
"""

'csv_name = sheet_name.replace(\' \', \'_\').lower() + \'.csv\'\ncsv_path = os.path.join(output_dir, csv_name)\ndf_excel_2.to_csv(csv_path, index=False)\nprint(f"Saved: {csv_path}")\n'

In [264]:
xls_3 = pd.ExcelFile('../data/Raw/undesa_pd_ds_1970-2030_fp_rev-2024_rev.xlsx')
print(xls_3.sheet_names)

['INFORMATION NOTE', 'Database Field Descriptions', 'Countries', 'Regions']


In [265]:
excel_3 = '../data/Raw/undesa_pd_ds_1970-2030_fp_rev-2024_rev.xlsx'
sheets_to_extract = ['Countries', 'Regions']
output_dir = '../data/processed/'
os.makedirs(output_dir, exist_ok=True)


In [266]:
"""
for sheet in sheets_to_extract:
    df = pd.read_excel(excel_3, sheet_name=sheet)
    csv_name = sheet.replace(' ', '_').lower() + '.csv'
    csv_path = os.path.join(output_dir, csv_name)
    df.to_csv(csv_path, index=False)
    print(f"Saved: {csv_path}")

"""

'\nfor sheet in sheets_to_extract:\n    df = pd.read_excel(excel_3, sheet_name=sheet)\n    csv_name = sheet.replace(\' \', \'_\').lower() + \'.csv\'\n    csv_path = os.path.join(output_dir, csv_name)\n    df.to_csv(csv_path, index=False)\n    print(f"Saved: {csv_path}")\n\n'

In [267]:
df_9 = pd.read_csv('../data/Raw/unpopulation_dataportal_20250728095844.csv')
df_9.sample(5)

Unnamed: 0,IndicatorId,IndicatorName,IndicatorShortName,Source,SourceYear,Author,LocationId,Location,Iso2,Iso3,...,AgeStart,AgeEnd,Age,CategoryId,Category,EstimateTypeId,EstimateType,EstimateMethodId,EstimateMethod,Value
8629,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,296,Kiribati,KI,KIR,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,61.69
2663,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,90,Solomon Islands,SB,SLB,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,65.39
1047,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,40,Austria,AT,AUT,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,57.69
8740,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,300,Greece,GR,GRC,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,66.2
3368,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,116,Cambodia,KH,KHM,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,63.93


In [268]:
df_9.columns = df_9.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_9.sample(5)

Unnamed: 0,indicatorid,indicatorname,indicatorshortname,source,sourceyear,author,locationid,location,iso2,iso3,...,agestart,ageend,age,categoryid,category,estimatetypeid,estimatetype,estimatemethodid,estimatemethod,value
5957,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,208,Denmark,DK,DNK,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,66.75
3537,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,120,Cameroon,CM,CMR,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,68.1
6556,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,226,Equatorial Guinea,GQ,GNQ,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,62.55
24110,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,850,United States Virgin Islands,VI,VIR,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,38.32
11294,42,Currently married (Percent),Currently married (Percent),Estimates and Projections of Women of Reproduc...,2024,United Nations Population Division,398,Kazakhstan,KZ,KAZ,...,15,49,15-49,100,Married or in a union women,1,Model-based Estimates,2,Interpolation,65.14


In [269]:
df_9 = df_9.drop(columns=['indicatorid','indicatorshortname',
    'source',
    'author', 'locationid', 'iso2','estimatetypeid','category','categoryid','agestart','ageend','author','ageid', 'estimatetype','variantid','sexid','timeid'])

df_9.rename(columns={
    "sourceyear": "year",
    "location": "country",
    "estimatemethodid": "estimate_method",
    "iso3": "code",
}, inplace=True)



In [270]:
df_9.drop_duplicates(inplace=True)

In [271]:
df_9

Unnamed: 0,indicatorname,year,country,code,time,variant,sex,age,estimate_method,estimatemethod,value
0,Currently married (Percent),2024,Afghanistan,AFG,1970,Median,Female,15-49,2,Interpolation,80.94
2,Currently married (Percent),2024,Afghanistan,AFG,1971,Median,Female,15-49,2,Interpolation,80.90
4,Currently married (Percent),2024,Afghanistan,AFG,1972,Median,Female,15-49,2,Interpolation,80.87
6,Currently married (Percent),2024,Afghanistan,AFG,1973,Median,Female,15-49,2,Interpolation,80.84
8,Currently married (Percent),2024,Afghanistan,AFG,1974,Median,Female,15-49,2,Interpolation,80.53
...,...,...,...,...,...,...,...,...,...,...,...
25078,Currently married (Percent),2024,Zambia,ZMB,2021,Median,Female,15-49,3,Projection,54.31
25080,Currently married (Percent),2024,Zambia,ZMB,2022,Median,Female,15-49,3,Projection,53.82
25082,Currently married (Percent),2024,Zambia,ZMB,2023,Median,Female,15-49,3,Projection,53.35
25084,Currently married (Percent),2024,Zambia,ZMB,2024,Median,Female,15-49,3,Projection,52.91


In [272]:
df_9.isnull().sum()

indicatorname      0
year               0
country            0
code               0
time               0
variant            0
sex                0
age                0
estimate_method    0
estimatemethod     0
value              0
dtype: int64

In [273]:
#df_9.to_csv("cleaned_unpopulation_dataportal.csv", index=False)

In [274]:
#df_9.to_sql('unpopulation_dataportal', engine, if_exists='replace', index=False)

In [275]:
df_10 = pd.read_csv('../data/processed/countries_un.csv',  header=5, low_memory=False)

In [276]:
df_10.columns = (
    df_10.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
)
df_10.sample(10)

Unnamed: 0,countryorarea,isocode,indicator,year,agegroup,percentage,number,dataprocess
119039,Viet Nam,704,Married or in-union women,2026,15-49,71.601491,18642.076758,Projection
131185,Trinidad and Tobago,780,Married or in-union women,2006,20-24,28.071429,19.384585,Estimate
19463,Cambodia,116,Married or in-union women,1972,15-49,64.699791,971.806712,Estimate
15462,Solomon Islands,90,Married or in-union women,2039,45-49,84.574524,22.243523,Projection
86081,Montserrat,500,Married or in-union women,2038,20-24,21.350813,0.026262,Projection
120088,Somalia,706,Married or in-union women,1996,15-19,21.661183,84.861585,Estimate
19497,Cambodia,116,Married or in-union women,1977,20-24,63.42275,184.658191,Estimate
55190,Guyana,328,Married or in-union women,1983,45-49,70.433333,8.984124,Estimate
116961,Sierra Leone,694,Married or in-union women,2010,20-24,65.40397,182.135013,Estimate
12307,Bolivia (Plurinational State of),68,Married or in-union women,2050,30-34,61.333116,360.939256,Projection


In [277]:
df_10.rename(columns={
    "dataprocess": "data_process",
}, inplace=True)

df_10.drop_duplicates(inplace=True)
df_10.sample(5)

Unnamed: 0,countryorarea,isocode,indicator,year,agegroup,percentage,number,data_process
138547,United Kingdom,826,Married or in-union women,2035,30-34,56.158114,1223.560344,Projection
39881,Estonia,233,Married or in-union women,2014,20-24,32.681,12.961121,Estimate
59149,India,356,Married or in-union women,1992,40-44,87.348,18676.1733,Estimate
113689,Sao Tome and Principe,678,Married or in-union women,2006,20-24,52.899798,4.493573,Estimate
142873,Venezuela (Bolivarian Republic of),862,Married or in-union women,2009,20-24,41.59,536.808369,Estimate


In [278]:
for col in ['percentage', 'number']:
    if col in df_10.columns:
        df_10[col] = (
            df_10[col]
            .astype(str)
            .str.replace(',', '.', regex=False)
            .str.extract(r'([-+]?[0-9]*\.?[0-9]+)', expand=False)
            .astype(float)
            .round(2)
        )

In [279]:
unnamed_cols = [col for col in df_10.columns if 'unnamed' in col.lower()]
df_10.drop(columns=unnamed_cols, inplace=True)

In [280]:
df_10.dropna(inplace=True)

In [281]:
df_10.isnull().sum()

countryorarea    0
isocode          0
indicator        0
year             0
agegroup         0
percentage       0
number           0
data_process     0
dtype: int64

In [282]:
df_10.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145800 entries, 0 to 145799
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   countryorarea  145800 non-null  object 
 1   isocode        145800 non-null  int64  
 2   indicator      145800 non-null  object 
 3   year           145800 non-null  int64  
 4   agegroup       145800 non-null  object 
 5   percentage     145800 non-null  float64
 6   number         145800 non-null  float64
 7   data_process   145800 non-null  object 
dtypes: float64(2), int64(2), object(4)
memory usage: 8.9+ MB


In [283]:
#df_10.to_csv("cleaned_countries_1970_2025_un.csv", index=False)

In [284]:
#df_10.to_sql('countries_1970_2025_un', engine, if_exists='replace', index=False)

In [306]:
df_11 = pd.read_csv('../data/processed/currently_married_un.csv',  header=2, low_memory=False)

In [307]:
df_11

Unnamed: 0,Country or area,ISO code,YearStart,YearEnd,Sex,AgeGroup,AgeStart,AgeEnd,DataValue,DataProcess,DataCatalog ShortName,DataCatalog ID,DataCatalog LongName,Data Source,Including_consensual_unions,Note on Data,Note on Country and Population
0,Afghanistan,4,1972,1974,Men,[15-19],15,19,7.6,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
1,Afghanistan,4,1972,1974,Men,[20-24],20,24,31.9,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
2,Afghanistan,4,1972,1974,Men,[25-29],25,29,59.1,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
3,Afghanistan,4,1972,1974,Men,[30-34],30,34,78.9,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
4,Afghanistan,4,1972,1974,Men,[35-39],35,39,87.2,Survey,1972-1974 NDFGS,160,Afghanistan 1972-1974 National Demographic and...,National statistics,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53716,Zimbabwe,716,2017,2017,Women,[55-59],55,59,54.4,Survey,2017 ICDS,7670,Zimbabwe 2017 Inter-Censal Demographic Survey,National statistics,,,
53717,Zimbabwe,716,2017,2017,Women,[60-64],60,64,49.8,Survey,2017 ICDS,7670,Zimbabwe 2017 Inter-Censal Demographic Survey,National statistics,,,
53718,Zimbabwe,716,2017,2017,Women,[65-69],65,69,39.7,Survey,2017 ICDS,7670,Zimbabwe 2017 Inter-Censal Demographic Survey,National statistics,,,
53719,Zimbabwe,716,2017,2017,Women,[70-74],70,74,27.2,Survey,2017 ICDS,7670,Zimbabwe 2017 Inter-Censal Demographic Survey,National statistics,,,


In [308]:
df_11.columns = (
    df_11.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '')
    .str.replace('(', '')
    .str.replace(')', '')
    .str.replace('[^0-9a-zA-Z_]','' ,regex= True)
    )
df_11.sample(8)

Unnamed: 0,countryorarea,isocode,yearstart,yearend,sex,agegroup,agestart,ageend,datavalue,dataprocess,datacatalogshortname,datacatalogid,datacataloglongname,datasource,including_consensual_unions,noteondata,noteoncountryandpopulation
27366,Latvia,428,2011,2011,Women,[40-44],40,44,67.96,Census,2011 Census,4829,Latvia 2011 Census,Eurostat,1.0,Estimates computed based on data on marital st...,
11352,Denmark,208,1971,1971,Women,[75+],75,999,20.02,Estimate,1971 Estimate,2081,Denmark 1971 Estimate,UNSD,1.0,,Excluding Faeroe Islands and Greenland shown s...
23207,Iraq,368,1977,1977,Men,[50-54],50,54,92.86,Census,1977 Census,1496,Iraq 1977 Census,UNSD,,,
50328,Turkey,792,2013,2014,Women,[55-59],55,59,78.71,Survey,2013 NDHS,5557,Turkey 2013 Demographic and Health Survey,DHS_HH,,,
7193,Chad,148,1996,1997,Women,[25-29],25,29,92.6,Survey,1996-1997 DHS,1675,Chad 1996-1997 Demographic and Health Survey,DHS_STATcompiler,1.0,,
22892,Indonesia,360,2005,2005,Women,[65-69],65,69,45.32,Survey,2005 SUPAS,9,Indonesia 2005 Intercensal Population Survey,UNSD,,Based on the results of an intercensal survey.,"Excluding Province Nanggroe Aceh Darussalam, R..."
47517,Sweden,752,2017,2017,Men,[60-64],60,64,56.54,Estimate,2017 Estimate,2227,Sweden 2017 Estimate,UNSD,1.0,,
1557,Australia,36,2016,2016,Women,[20-24],20,24,7.29,Census,2016 Census,7141,Australia 2016 Census,National statistics,,,


In [None]:
df_11 = df_11.drop(columns = ['datacataloglongname', 'datacatalogid', 'yearstart' , 'yearend', 'noteondata', 'noteoncountryandpopulation'])

df_11.rename(columns={
    "agestart": "age_start",
    "countryorarea": "country",
    "datasource": "data_source",
    "datavalue" : "data_value"
}, inplace=True)

df_11.sample(10)

Unnamed: 0,country,isocode,sex,agegroup,age_start,ageend,datavalue,dataprocess,datacatalogshortname,data_source,including_consensual_unions
19196,Guyana,328,Women,[15-19],15,19,11.61,Census,1980 Census,UNSD,
45728,State of Palestine,275,Women,[30-34],30,34,84.4,Survey,2000 HS,National statistics,
15181,Finland,246,Men,[50-54],50,54,56.63,Census,2010 Census,UNSD,1.0
11971,Denmark,208,Men,[60-64],60,64,68.28,Estimate,2014 Estimate,UNSD,
13445,Estonia,233,Women,[35-39],35,39,66.09,Census,2011-2012 Census,UNSD,1.0
14535,Finland,246,Women,[70-74],70,74,31.77,Estimate,1981 Estimate,UNSD,
52273,Venezuela (Bolivarian Republic of),862,Women,[20-24],20,24,49.08,Census,1981 Census,UNSD,1.0
49698,Trinidad and Tobago,780,Women,[25-29],25,29,52.86,Survey,2000 MICS,MICS,
48699,TFYR Macedonia,807,Women,[40-44],40,44,91.81,Survey,2011 MICS,MICS,1.0
19866,Hungary,348,Men,[20-24],20,24,32.47,Estimate,1974 Estimate,UNSD,


In [None]:
df_9 = df_9.drop(columns=['indicatorid','indicatorshortname',
    'source',
    'author', 'locationid', 'iso2','estimatetypeid','category','categoryid','agestart','ageend','author','ageid', 'estimatetype','variantid','sexid','timeid'])

df_9.rename(columns={
    "sourceyear": "year",
    "location": "country",
    "estimatemethodid": "estimate_method",
    "iso3": "code",
}, inplace=True)
