In [68]:
from urllib.parse import quote

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sqlalchemy import create_engine, types
from sqlalchemy import text # to be able to pass string

In [69]:
from dotenv import dotenv_values

# Settings returned by dotenv_values(), looked up by key below.
config = dotenv_values()

# Login variables for the database, unpacked in one statement.
# Align the key labels with your .env file!
pg_user, pg_host, pg_port, pg_db, pg_schema, pg_pass = (
    config[env_key]
    for env_key in (
        'POSTGRES_USER',
        'POSTGRES_HOST',
        'POSTGRES_PORT',
        'POSTGRES_DB',
        'POSTGRES_SCHEMA',
        'POSTGRES_PASS',
    )
)

In [70]:
# Build the connection URL. The user name and password are percent-encoded so
# that characters such as "@", ":", "/" or "%" inside the credentials cannot be
# mistaken for URL delimiters when the string is parsed.
url = (
    f'postgresql://{quote(pg_user, safe="")}:{quote(pg_pass, safe="")}'
    f'@{pg_host}:{pg_port}/{pg_db}'
)


In [71]:
# Create the SQLAlchemy engine for the URL built above; echo=False keeps SQL
# statement logging switched off.
engine = create_engine(url, echo=False)

In [72]:
# Schema that the search path is pointed at -- update it to your schema.
my_schema = 'team_5'

# Issue the SET statement inside a transaction block opened from the engine.
with engine.begin() as conn:
    set_path_sql = text(f'SET search_path TO {my_schema};')
    result = conn.execute(set_path_sql)

In [30]:
# Load the raw World Marriage dataset from the project's data folder.
df_1 = pd.read_csv('../data/Raw/World_Marriage_Dataset.csv')

In [31]:
# Eyeball ten randomly chosen rows of the raw frame.
df_1.sample(n=10)

Unnamed: 0,Sr.No.,Country,AgeGroup,Sex,MaritalStatus,DataProcess,Data Collection (Start Year),Data Collection (End Year),Data Source
192547,192548,Peru,[70-74],Man,Widowed,Survey,2009,2009,DHS_HH
131980,131981,Jordan,[30-34],Man,Not living together,Survey,2009,2009,DHS_HH
194740,194741,Philippines,[75+],Woman,Divorced,Census,2010,2010,UNSD
10912,10913,Azerbaijan,[15-19],Woman,Never married,Survey,2006,2006,DHS_STATcompiler
266996,266997,Wallis and Futuna Islands,[30-34],Woman,Single,Census,1983,1983,US Census Bureau
225603,225604,Solomon Islands,[60-64],Woman,Separated,Census,2009,2009,National statistics
114920,114921,Iceland,[75+],Woman,Married,Estimate,2016,2016,UNSD
14857,14858,Belarus,[30-34],Woman,Divorced or Separated,Census,1979,1979,National statistics
202894,202895,Romania,[50-54],Man,Widowed,Estimate,2003,2003,UNSD
47673,47674,Congo,[45-49],Man,Divorced,Survey,2011,2012,DHS_STATcompiler


In [32]:
# Remove the serial-number column. Re-assigning (instead of inplace=True) and
# errors="ignore" keep this cell re-runnable: executing it a second time no
# longer raises KeyError because the column is already gone.
df_1 = df_1.drop(columns=["Sr.No."], errors="ignore")

In [33]:
# Show the frame after the "Sr.No." column was dropped in the previous cell.
df_1

Unnamed: 0,Country,AgeGroup,Sex,MaritalStatus,DataProcess,Data Collection (Start Year),Data Collection (End Year),Data Source
0,Afghanistan,[15-19],Man,Divorced,Survey,1972,1974,National statistics
1,Afghanistan,[20-24],Man,Divorced,Survey,1972,1974,National statistics
2,Afghanistan,[25-29],Man,Divorced,Survey,1972,1974,National statistics
3,Afghanistan,[30-34],Man,Divorced,Survey,1972,1974,National statistics
4,Afghanistan,[35-39],Man,Divorced,Survey,1972,1974,National statistics
...,...,...,...,...,...,...,...,...
271599,Zimbabwe,[55-59],Woman,Widowed,Survey,2017,2017,National statistics
271600,Zimbabwe,[60-64],Woman,Widowed,Survey,2017,2017,National statistics
271601,Zimbabwe,[65-69],Woman,Widowed,Survey,2017,2017,National statistics
271602,Zimbabwe,[70-74],Woman,Widowed,Survey,2017,2017,National statistics


In [34]:
# Normalise the column labels to snake_case: lower-case them, turn spaces into
# underscores, then delete every character that is not a letter, digit or
# underscore. The final regex already removes "(" and ")", so no separate
# parenthesis step is needed; regex= is spelled out on each call because the
# default of str.replace differs between pandas versions.
df_1.columns = (
    df_1.columns.str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
)

In [35]:
# Insert underscores into the run-together labels left by the snake_case
# normalisation in the previous cell. Only the already-normalised labels are
# mapped: the original headers (e.g. "Data Collection (Start Year)", "Country")
# no longer exist at this point, so mapping them would be a silent no-op.
df_1 = df_1.rename(columns={
    "agegroup": "age_group",
    "maritalstatus": "marital_status",
    "dataprocess": "data_process",
})

In [36]:
# Spot-check five random rows to confirm the new column labels.
df_1.sample(n=5)

Unnamed: 0,country,age_group,sex,marital_status,data_process,data_collection_start_year,data_collection_end_year,data_source
202521,Romania,[70-74],Man,Single,Estimate,1997,1997,UNSD
140182,Lebanon,[30-34],Woman,Single,Survey,1997,1997,National statistics
256986,United Republic of Tanzania,[45-49],Man,Single,Census,2002,2002,National statistics
21427,Botswana,[35-39],Woman,Divorced,Census,1971,1971,UNSD
168779,Nepal,[35-39],Man,Divorced,Survey,2016,2017,DHS_HH


In [37]:
# Discard exact duplicate rows, then count the missing values in each column.
df_1 = df_1.drop_duplicates()
missing_per_column = df_1.isna().sum()
print(missing_per_column)

country                       0
age_group                     0
sex                           0
marital_status                0
data_process                  0
data_collection_start_year    0
data_collection_end_year      0
data_source                   0
dtype: int64


In [38]:
# Plain-text preview of the first five rows.
print(df_1.head(n=5))

       country age_group  sex marital_status data_process  \
0  Afghanistan   [15-19]  Man       Divorced       Survey   
1  Afghanistan   [20-24]  Man       Divorced       Survey   
2  Afghanistan   [25-29]  Man       Divorced       Survey   
3  Afghanistan   [30-34]  Man       Divorced       Survey   
4  Afghanistan   [35-39]  Man       Divorced       Survey   

   data_collection_start_year  data_collection_end_year          data_source  
0                        1972                      1974  National statistics  
1                        1972                      1974  National statistics  
2                        1972                      1974  National statistics  
3                        1972                      1974  National statistics  
4                        1972                      1974  National statistics  


In [None]:
# Store both year columns as 32-bit integers. Values are routed through str so
# that any thousands separator (e.g. "1,972") can be stripped first. The width
# is pinned to int32 because a bare `int` resolves to a platform-dependent
# size, which would make the dtype (and the SQL column type derived from it)
# differ between machines.
year_columns = ['data_collection_start_year', 'data_collection_end_year']
for year_col in year_columns:
    df_1[year_col] = (
        df_1[year_col]
        .astype(str)
        .str.replace(',', '', regex=False)
        .astype('int32')
    )
print(df_1[year_columns].dtypes)

data_collection_start_year    int32
data_collection_end_year      int32
dtype: object


In [40]:
# Write the cleaned frame to Postgres, replacing any existing table of the same
# name. The target schema is passed explicitly so the destination does not
# depend on which pooled connection happens to carry the session-level
# `SET search_path` issued earlier.
df_1.to_sql('world_marriage', engine, schema=my_schema, if_exists='replace', index=False)

417

In [53]:
# NOTE(review): df_2 is first assigned further down in this notebook (the
# read_csv of age-at-marriage-women.csv), so on a fresh top-to-bottom run this
# cell raises NameError. It needs to run after df_2 has been loaded and cleaned.
# Drop exact duplicate rows in place, then print the missing-value count per column.
df_2.drop_duplicates(inplace=True)
print(df_2.isnull().sum())

country                                0
code                                   0
year                                   0
mean_age_of_women_at_first_marriage    0
dtype: int64


In [22]:
# Print the dtype of every df_1 column to confirm the year columns are integers.
print(df_1.dtypes)

country                       object
age_group                     object
sex                           object
marital_status                object
data_process                  object
data_collection_start_year     int32
data_collection_end_year       int32
data_source                   object
dtype: object


In [None]:
#df_1.to_csv("cleaned_world_marriage.csv", index=False)

In [42]:
# Load the raw "age at marriage (women)" dataset and peek at five random rows.
df_2 = pd.read_csv(filepath_or_buffer='../data/Raw/age-at-marriage-women.csv')
df_2.sample(n=5)

Unnamed: 0,Entity,Code,Year,Mean age of women at first marriage,1005564-annotations
79,Bulgaria,BGR,1990,21.5,
208,Czechia,CZE,2013,28.5,
938,Sweden,SWE,2018,34.0,
989,United Kingdom,GBR,1994,26.51,
305,France,FRA,2000,28.4,


In [43]:
# Normalise the column labels to snake_case: lower-case them, turn spaces into
# underscores, then delete every character that is not a letter, digit or
# underscore. The final regex already removes "(" and ")", so no separate
# parenthesis step is needed; regex= is spelled out on each call because the
# default of str.replace differs between pandas versions.
df_2.columns = (
    df_2.columns.str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
)

In [44]:
# Show df_2 with its normalised column labels.
df_2

Unnamed: 0,entity,code,year,mean_age_of_women_at_first_marriage,1005564annotations
0,Australia,AUS,1990,24.3,median
1,Australia,AUS,1991,24.5,median
2,Australia,AUS,1992,24.7,median
3,Australia,AUS,1993,24.8,median
4,Australia,AUS,1994,25.1,median
...,...,...,...,...,...
1042,United States,USA,2017,27.4,median
1043,United States,USA,2018,27.8,median
1044,United States,USA,2019,28.0,median
1045,United States,USA,2020,28.1,median


In [45]:
# Drop the annotations column; errors='ignore' keeps the cell re-runnable
# (a second execution would otherwise raise KeyError).
df_2 = df_2.drop(columns=['1005564annotations'], errors='ignore')

# Label the entity column "country", matching the other datasets in this notebook.
df_2 = df_2.rename(columns={"entity": "country"})

# Optional: Save cleaned version
df_2.to_csv("cleaned_age_at_marriage_women.csv", index=False)

In [46]:
# Show df_2 after the annotations column was dropped and "entity" renamed to "country".
df_2

Unnamed: 0,country,code,year,mean_age_of_women_at_first_marriage
0,Australia,AUS,1990,24.3
1,Australia,AUS,1991,24.5
2,Australia,AUS,1992,24.7
3,Australia,AUS,1993,24.8
4,Australia,AUS,1994,25.1
...,...,...,...,...
1042,United States,USA,2017,27.4
1043,United States,USA,2018,27.8
1044,United States,USA,2019,28.0
1045,United States,USA,2020,28.1


In [49]:
# Store the year as a 32-bit integer. Values are routed through str so that any
# thousands separator can be stripped first. The width is pinned to int32
# because a bare `int` resolves to a platform-dependent size.
df_2['year'] = (
    df_2['year']
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype('int32')
)
print(df_2['year'].dtypes)

int32


In [57]:
# Inspect the dtype of every df_2 column after the year conversion.
df_2.dtypes

country                                 object
code                                    object
year                                     int32
mean_age_of_women_at_first_marriage    float64
dtype: object

In [None]:
#df_2.to_csv("cleaned_age_at_marriage_women.csv", index=False)

In [None]:
#df_2.to_sql('age_at_marriage_women', engine, if_exists='replace', index=False)

47

In [58]:
# Load the raw crude-marriage-rate dataset and peek at five random rows.
df_3 = pd.read_csv(filepath_or_buffer='../data/Raw/marriage-rate-per-1000-inhabitants.csv')
df_3.sample(n=5)

Unnamed: 0,Entity,Code,Year,"Crude marriage rate (marriages per 1,000 people)"
524,Czechia,CZE,1991,7.0
1580,New Zealand,NZL,1978,7.1
1769,Portugal,PRT,1984,7.0
376,Croatia,HRV,1960,8.9
1929,Slovakia,SVK,2018,5.7


In [60]:
# Normalise the column labels to snake_case: lower-case them, turn spaces into
# underscores, then delete every character that is not a letter, digit or
# underscore. The final regex already removes "(" and ")", so no separate
# parenthesis step is needed; regex= is spelled out on each call because the
# default of str.replace differs between pandas versions.
df_3.columns = (
    df_3.columns.str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
)
df_3.sample(5)

Unnamed: 0,entity,code,year,crude_marriage_rate_marriages_per_1000_people
772,Germany,DEU,1970,7.4
571,Denmark,DNK,1977,6.3
1132,Italy,ITA,1989,5.7
1298,Lithuania,LTU,1974,8.8
1169,Japan,JPN,1967,9.6


In [61]:
# Rename long column
df_3.rename(columns={
    "entity": "country",   
}, inplace=True)

In [62]:
# Store df_3's year as a 32-bit integer. This cell sits in the df_3 pipeline
# but previously re-converted df_2['year'] (a copy-paste slip), leaving
# df_3['year'] untouched. Values are routed through str so that any thousands
# separator can be stripped; the width is pinned because a bare `int` resolves
# to a platform-dependent size.
df_3['year'] = (
    df_3['year']
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype('int32')
)
print(df_3['year'].dtypes)

int32


In [63]:
# Discard exact duplicate rows, then count the missing values in each column.
df_3 = df_3.drop_duplicates()
missing_per_column = df_3.isna().sum()
print(missing_per_column)

country                                          0
code                                             0
year                                             0
crude_marriage_rate_marriages_per_1000_people    0
dtype: int64


In [65]:
# Persist the cleaned marriage-rate table as CSV, leaving out the row index.
df_3.to_csv(path_or_buf="cleaned_marriage-rate-per-1000-inhabitants.csv", index=False)

In [73]:
# Write the cleaned frame to Postgres, replacing any existing table of the same
# name. The target schema is passed explicitly so the destination does not
# depend on which pooled connection happens to carry the session-level
# `SET search_path` issued earlier.
df_3.to_sql('married_rate_per_1000', engine, schema=my_schema, if_exists='replace', index=False)

427