In [3]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from sqlalchemy import create_engine, types
from sqlalchemy import text # to be able to pass string

In [4]:
from dotenv import dotenv_values

config = dotenv_values()

# define variables for the login
pg_user = config['POSTGRES_USER']  # align the key label with your .env file !
pg_host = config['POSTGRES_HOST']
pg_port = config['POSTGRES_PORT']
pg_db = config['POSTGRES_DB']
pg_schema = config['POSTGRES_SCHEMA']
pg_pass = config['POSTGRES_PASS']

In [5]:
url = f'postgresql://{pg_user}:{pg_pass}@{pg_host}:{pg_port}/{pg_db}'


In [6]:
engine = create_engine(url, echo=False)

In [8]:
my_schema = 'team_5' # update it to your schema

with engine.begin() as conn: 
    result = conn.execute(text(f'SET search_path TO {my_schema};'))

In [9]:
df_1= pd.read_csv('../data/Raw/World_Marriage_Dataset.csv')

In [10]:
df_1.sample(10)

Unnamed: 0,Sr.No.,Country,AgeGroup,Sex,MaritalStatus,DataProcess,Data Collection (Start Year),Data Collection (End Year),Data Source
228396,228397,Spain,[35-39],Man,Divorced,Census,2001,2001,UNSD
223693,223694,Slovenia,[70-74],Man,Single,Estimate,2010,2010,UNSD
169297,169298,Netherlands,[40-44],Man,Single,Estimate,1972,1972,UNSD
7372,7373,Australia,[35-39],Man,Widowed,Estimate,1999,1999,UNSD
2953,2954,Angola,[60-64],Woman,Separated,Census,2014,2014,UNSD
96377,96378,Greenland,[30-34],Woman,Married,Estimate,2015,2015,UNSD
201025,201026,Republic of Moldova,[40-44],Woman,Widowed,Survey,2005,2005,DHS_STATcompiler
134078,134079,Kenya,[40-44],Woman,Never married,Survey,2003,2003,DHS_STATcompiler
51426,51427,Costa Rica,[15-19],Woman,Single,Estimate,2016,2016,UNSD
74514,74515,Faeroe Islands,[20-24],Man,Single,Estimate,2013,2013,UNSD


In [11]:
df_1.drop(columns=["Sr.No."], inplace=True)

In [33]:
df_1

Unnamed: 0,Country,AgeGroup,Sex,MaritalStatus,DataProcess,Data Collection (Start Year),Data Collection (End Year),Data Source
0,Afghanistan,[15-19],Man,Divorced,Survey,1972,1974,National statistics
1,Afghanistan,[20-24],Man,Divorced,Survey,1972,1974,National statistics
2,Afghanistan,[25-29],Man,Divorced,Survey,1972,1974,National statistics
3,Afghanistan,[30-34],Man,Divorced,Survey,1972,1974,National statistics
4,Afghanistan,[35-39],Man,Divorced,Survey,1972,1974,National statistics
...,...,...,...,...,...,...,...,...
271599,Zimbabwe,[55-59],Woman,Widowed,Survey,2017,2017,National statistics
271600,Zimbabwe,[60-64],Woman,Widowed,Survey,2017,2017,National statistics
271601,Zimbabwe,[65-69],Woman,Widowed,Survey,2017,2017,National statistics
271602,Zimbabwe,[70-74],Woman,Widowed,Survey,2017,2017,National statistics


In [12]:
df_1.columns = df_1.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [13]:
df_1.rename(columns={
    "agegroup": "age_group",
    "maritalstatus": "marital_status",
    "dataprocess": "data_process",
    "Data Collection (Start Year)": "data_collection_start_year",
    "Data Collection (End Year)": "data_collection_end_year",
    "Data Source": "data_source",
    "Country": "country",
    "Sex": "sex"
}, inplace=True)

In [14]:
df_1.sample(5)

Unnamed: 0,country,age_group,sex,marital_status,data_process,data_collection_start_year,data_collection_end_year,data_source
142983,Liberia,[65-69],Man,Divorced,Census,2008,2008,UNSD
190465,Paraguay,[60-64],Man,Consensual union,Census,1972,1972,UNSD
22144,Botswana,[75+],Woman,Divorced,Census,2011,2011,UNSD
142190,Liberia,[30-34],Woman,Divorced,Survey,1969,1970,INED
232390,Sweden,[40-44],Woman,Widowed,Census,1970,1970,UNSD


In [15]:
df_1.drop_duplicates(inplace=True)
print(df_1.isnull().sum())

country                       0
age_group                     0
sex                           0
marital_status                0
data_process                  0
data_collection_start_year    0
data_collection_end_year      0
data_source                   0
dtype: int64


In [16]:
print(df_1.head())

       country age_group  sex marital_status data_process  \
0  Afghanistan   [15-19]  Man       Divorced       Survey   
1  Afghanistan   [20-24]  Man       Divorced       Survey   
2  Afghanistan   [25-29]  Man       Divorced       Survey   
3  Afghanistan   [30-34]  Man       Divorced       Survey   
4  Afghanistan   [35-39]  Man       Divorced       Survey   

   data_collection_start_year  data_collection_end_year          data_source  
0                        1972                      1974  National statistics  
1                        1972                      1974  National statistics  
2                        1972                      1974  National statistics  
3                        1972                      1974  National statistics  
4                        1972                      1974  National statistics  


In [17]:
df_1['data_collection_start_year'] = df_1['data_collection_start_year'].astype(str).str.replace(',', '').astype(int)
df_1['data_collection_end_year'] = df_1['data_collection_end_year'].astype(str).str.replace(',', '').astype(int)
print(df_1[['data_collection_start_year', 'data_collection_end_year']].dtypes)

data_collection_start_year    int32
data_collection_end_year      int32
dtype: object


In [None]:
#df_1.to_sql('world_marriage', engine, if_exists='replace', index=False)

417

In [20]:
print(df_1.dtypes)

country                       object
age_group                     object
sex                           object
marital_status                object
data_process                  object
data_collection_start_year     int32
data_collection_end_year       int32
data_source                   object
dtype: object


In [None]:
#df_1.to_csv("cleaned_world_marriage.csv", index=False)

In [28]:
df_2 = pd.read_csv('../data/Raw/age-at-marriage-women.csv')
df_2.sample(5)

Unnamed: 0,Entity,Code,Year,Mean age of women at first marriage,1005564-annotations
322,France,FRA,2018,33.1,
503,Japan,JPN,1999,26.8,
315,France,FRA,2010,30.7,
170,Cyprus,CYP,1992,25.4,
447,Israel,ISR,2002,24.2,


In [29]:
df_2.columns = df_2.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [30]:
df_2

Unnamed: 0,entity,code,year,mean_age_of_women_at_first_marriage,1005564annotations
0,Australia,AUS,1990,24.3,median
1,Australia,AUS,1991,24.5,median
2,Australia,AUS,1992,24.7,median
3,Australia,AUS,1993,24.8,median
4,Australia,AUS,1994,25.1,median
...,...,...,...,...,...
1042,United States,USA,2017,27.4,median
1043,United States,USA,2018,27.8,median
1044,United States,USA,2019,28.0,median
1045,United States,USA,2020,28.1,median


In [31]:
df_2 = df_2.drop(columns=['1005564annotations'])

df_2.rename(columns={
    "entity": "country",   
}, inplace=True)


In [32]:
df_2.drop_duplicates(inplace=True)
print(df_2.isnull().sum())

country                                0
code                                   0
year                                   0
mean_age_of_women_at_first_marriage    0
dtype: int64


In [33]:
df_2

Unnamed: 0,country,code,year,mean_age_of_women_at_first_marriage
0,Australia,AUS,1990,24.3
1,Australia,AUS,1991,24.5
2,Australia,AUS,1992,24.7
3,Australia,AUS,1993,24.8
4,Australia,AUS,1994,25.1
...,...,...,...,...
1042,United States,USA,2017,27.4
1043,United States,USA,2018,27.8
1044,United States,USA,2019,28.0
1045,United States,USA,2020,28.1


In [34]:
df_2['year'] = df_2['year'].astype(str).str.replace(',', '').astype(int)
print(df_2['year'].dtypes)

int32


In [35]:
df_2.dtypes

country                                 object
code                                    object
year                                     int32
mean_age_of_women_at_first_marriage    float64
dtype: object

In [None]:
#df_2.to_csv("cleaned_age_at_marriage_women.csv", index=False)

In [None]:
#df_2.to_sql('age_at_marriage_women', engine, if_exists='replace', index=False)

47

In [36]:
df_3= pd.read_csv('../data/Raw/marriage-rate-per-1000-inhabitants.csv')
df_3.sample(5)

Unnamed: 0,Entity,Code,Year,"Crude marriage rate (marriages per 1,000 people)"
232,Bulgaria,BGR,2019,4.2
485,Cyprus,CYP,2012,6.7
2124,Sweden,SWE,1979,4.5
2225,Turkey,TUR,1980,8.2
1645,Norway,NOR,1982,5.3


In [37]:
df_3.columns = df_3.columns.str.lower() \
    .str.replace(' ', '_') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_3.sample(5)

Unnamed: 0,entity,code,year,crude_marriage_rate_marriages_per_1000_people
2362,United States,USA,1955,9.3
1174,Japan,JPN,1972,10.4
1186,Japan,JPN,1984,6.2
702,Finland,FIN,1986,5.2
1692,Poland,POL,1968,8.0


In [38]:
# Rename long column
df_3.rename(columns={
    "entity": "country",   
}, inplace=True)

In [39]:
df_3['year'] = df_3['year'].astype(str).str.replace(',', '').astype(int)
print(df_3['year'].dtypes)

int32


In [40]:
df_3.drop_duplicates(inplace=True)
print(df_3.isnull().sum())

country                                          0
code                                             0
year                                             0
crude_marriage_rate_marriages_per_1000_people    0
dtype: int64


In [None]:
#df_3.to_csv("cleaned_marriage-rate-per-1000-inhabitants.csv", index=False)

In [None]:
#df_3.to_sql('married_rate_per_1000', engine, if_exists='replace', index=False)

427

In [93]:
df_4= pd.read_csv('../data/Raw/marriage-rates-in-1990-vs-2020.csv')
df_4.sample(6)

Unnamed: 0,Entity,Code,Year,"Crude marriage rate (marriages per 1,000 people)",World regions according to OWID,"Crude marriage rate (marriages per 1,000 people).1",Year.1
576,Czechia,CZE,1985,7.8,,8.8,1990.0
1882,Poland,POL,1964,7.4,,6.7,1990.0
740,Estonia,EST,2010,3.8,,7.5,1990.0
30,Australia,AUS,1988,7.1,,6.9,1990.0
2677,United States,USA,2017,6.9,,,
2294,Spain,ESP,1981,5.4,,5.7,1990.0


In [94]:
df_4.columns = df_4.columns.str.lower() \
    .str.replace(' ', '') \
    .str.replace('(', '') \
    .str.replace(')', '') \
    .str.replace('[^0-9a-zA-Z_]', '', regex=True)
df_4.sample(6)

Unnamed: 0,entity,code,year,crudemarriageratemarriagesper1000people,worldregionsaccordingtoowid,crudemarriageratemarriagesper1000people1,year1
412,Costa Rica,CRI,2005,6.1,,7.5,1990.0
2242,South Korea,KOR,1992,9.6,,9.3,1990.0
1836,Norway,NOR,1992,4.5,,5.2,1990.0
937,Greece,GRC,1978,7.7,,5.8,1990.0
2497,United Kingdom,GBR,1960,7.5,,6.6,1990.0
747,Estonia,EST,2017,4.9,,7.5,1990.0


In [95]:
df_4 = df_4.drop(columns=['worldregionsaccordingtoowid'])

df_4.rename(columns={
    "crudemarriageratemarriagesper1000people": "crude_marriage_rate",
    "crudemarriageratemarriagesper1000people1": "crude_marriage_rate_people1",
    "year1": "year_1"
}, inplace=True)



In [96]:
df_4.drop_duplicates(inplace=True)
print(df_4.isnull().sum())

entity                           0
code                            14
year                             0
crude_marriage_rate            271
crude_marriage_rate_people1    449
year_1                         449
dtype: int64


In [97]:
df_4.dropna(inplace=True)

In [98]:
df_4.sample(6)

Unnamed: 0,entity,code,year,crude_marriage_rate,crude_marriage_rate_people1,year_1
245,Bulgaria,BGR,1998,4.3,6.9,1990.0
405,Costa Rica,CRI,1998,6.7,7.5,1990.0
1296,Japan,JPN,1974,9.1,5.9,1990.0
2046,Romania,ROU,2000,6.1,8.3,1990.0
897,Germany,DEU,2005,4.7,6.5,1990.0
70,Austria,AUT,1966,7.6,5.9,1990.0


In [99]:
df_4['year_1'] = pd.to_numeric(df_4['year_1'], errors='coerce').astype('Int64')
print(df_4['year'].dtypes)

int64


In [100]:
df_4

Unnamed: 0,entity,code,year,crude_marriage_rate,crude_marriage_rate_people1,year_1
21,Australia,AUS,1970,9.3,6.9,1990
22,Australia,AUS,1980,7.4,6.9,1990
23,Australia,AUS,1981,7.6,6.9,1990
24,Australia,AUS,1982,7.7,6.9,1990
25,Australia,AUS,1983,7.5,6.9,1990
...,...,...,...,...,...,...
2549,United Kingdom,GBR,2015,4.2,6.6,1990
2550,United Kingdom,GBR,2016,4.3,6.6,1990
2551,United Kingdom,GBR,2017,4.1,6.6,1990
2552,United Kingdom,GBR,2018,4.0,6.6,1990


In [None]:
#df_4.to_csv("cleaned_marriage-rates-in-1990-vs-2020.csv", index=False)

In [None]:
#df_4.to_sql('marriage_rates_in_1990_vs_2020', engine, if_exists='replace', index=False)

211