# Alive & Thrive
## Viet Nam Data: Women Files - Preprocessing
## Prepared by Aaron Wise; aaron@a3di.dev
### Version: 25 May 2022

In [1]:
# %load std_imports
from pathlib import Path

import numpy as np
import pandas as pd

import json
import pyreadstat

from std_utils import (
    read_spss_file,
    run_quality_assurance,
    generate_HHID,
    generate_MEMID,
    subset_edu_save,
    add_total_year,
    merge_hh_hl_data,
    subset_hl_df,
    standardize_col_names,
    standardize_col_values,
    create_elderly_hoh,
    save_merge,
    export_analyzed_data
)

# Notebook-wide pandas display settings: render up to 1500 rows, and remove the
# column limit (max_columns=None) so wide survey frames are shown in full.
pd.set_option("display.max_rows", 1500)
pd.set_option("display.max_columns", None)


In [2]:
# %load women_imports.py
from women_analysis import (
    generate_str_replace_dict,
    create_anc_4_visits,
    create_anc_3_components,
    create_inst_delivery,
    create_caesarean_del,
    create_pnc_mother,
    create_low_bw,
    create_early_bf,
    create_mother_edu,
    subset_women_file
)

from aw_analytics import mean_wt, output_mean_table


### --- 2021 ---

In [3]:
# Run configuration: country code and recode are passed to every helper call
# in the cells that follow.
country, recode = 'VNM', 'women'

# >>> Survey round processed by the cells below <<<
year = '2021'

In [4]:
# Load, check, merge and subset the women file for the configured
# country / year / recode. The steps run in a fixed order and each one works on
# the `df` left by the previous step, so do not reorder them.

# Read file (SPSS source for this country / year / recode)
df = read_spss_file(country, year, recode)

# Create HHID to facilitate merge of HH and HL data.
# NOTE(review): the return value is not captured, so this relies on the helper
# updating `df` in place -- confirm in std_utils.
generate_HHID(df, country, year, recode)

# Add Total, Year (return value likewise not captured; presumably in place)
add_total_year(df, year)

# Run quality assurance (return value not captured; presumably reports on, and
# possibly modifies, `df` in place -- confirm in std_utils)
run_quality_assurance(df)

# Merge in HH and HL data; `df` is rebound to the returned frame
df = merge_hh_hl_data(df, country, year)

# Subset women file; `df` is rebound to the returned subset
df = subset_women_file(df, country, year)

The file -- wm_2021.sav -- has the following shape: Rows: 11294; Columns: 458
HHID is NOT unique
Drop columns if all values are NaN...
Updated -- Rows: 11294; Columns: 450
Checking if any rows are duplicates...
The are no duplicate rows
The number of mothers with a birth in the past two years is: 1566


#### Create Indicators

In [5]:
# --- Create Mother edu [mother_edu] --- #
# `df` is rebound to whatever create_mother_edu returns; the bracketed name is
# presumably the column it adds -- confirm in women_analysis.
df = create_mother_edu(df, country, year, recode)

In [6]:
# Build the maternal / newborn indicators one at a time. Every helper receives
# the current `df` and its return value is assigned back to `df`, so each step
# feeds the next. Bracketed names are presumably the columns added -- confirm
# in women_analysis.

# --- ANC 4+ visits [anc_4_visits] --- #
df = create_anc_4_visits(df, country, year)

# --- ANC components [anc_3_components] --- #
df = create_anc_3_components(df, country, year)

# --- Institutional delivery [inst_delivery] --- #
df = create_inst_delivery(df, country, year)

# --- Caesarean Delivery [caesarean_del] --- #
df = create_caesarean_del(df, country, year)

# --- Post-natal Health Check (mother) [pnc_mother] --- #
df = create_pnc_mother(df, country, year)

# --- Low birthweight [low_bw] --- #
# NOTE(review): the recorded output of this cell prints agg_value_prop_dict with
# nan entries and a warning on `numerator / denominator` -- presumably raised
# here for a category with a zero denominator; confirm inside create_low_bw.
df = create_low_bw(df, country, year)

# --- Early Initiation BF [early_bf] --- #
df = create_early_bf(df, country, year)

agg_value_prop_dict is: 
 {'AVERAGE': 0.0183049147442327, 'LARGER THAN AVERAGE': 0.0, 'SMALLER THAN AVERAGE': 0.2652671755725191, 'NO RESPONSE': nan, 'VERY LARGE': 0.034482758620689655, 'VERY SMALL': 0.775, 'DK': 0.0}


  agg_value_prop_dict[agg_value] = numerator / denominator


#### Export working variables

In [7]:
# Hand the processed frame to the shared export helper for this
# country / year / recode (destination and format are defined in the helper,
# not shown in this notebook).
export_analyzed_data(df, country, year, recode)

### --- 2014 ---

In [8]:
# Run configuration: country code and recode are passed to every helper call
# in the cells that follow.
country, recode = 'VNM', 'women'

# >>> Survey round processed by the cells below <<<
year = '2014'

In [9]:
# Load, check, merge and subset the women file for the configured
# country / year / recode. The steps run in a fixed order and each one works on
# the `df` left by the previous step, so do not reorder them.

# Read file (SPSS source for this country / year / recode)
df = read_spss_file(country, year, recode)

# Create HHID to facilitate merge of HH and HL data.
# NOTE(review): the return value is not captured, so this relies on the helper
# updating `df` in place -- confirm in std_utils.
generate_HHID(df, country, year, recode)

# Add Total, Year (return value likewise not captured; presumably in place)
add_total_year(df, year)

# Run quality assurance (return value not captured; presumably reports on, and
# possibly modifies, `df` in place -- confirm in std_utils)
run_quality_assurance(df)

# Merge in HH and HL data; `df` is rebound to the returned frame
df = merge_hh_hl_data(df, country, year)

# Subset women file; `df` is rebound to the returned subset
df = subset_women_file(df, country, year)

The file -- wm_2014.sav -- has the following shape: Rows: 10190; Columns: 258
HHID is NOT unique
Drop columns if all values are NaN...
Updated -- Rows: 10190; Columns: 260
Checking if any rows are duplicates...
The are no duplicate rows
The number of mothers with a birth in the past two years is: 1484


In [10]:
# --- Create Mother edu [mother_edu] --- #
# `df` is rebound to whatever create_mother_edu returns; the bracketed name is
# presumably the column it adds -- confirm in women_analysis.
df = create_mother_edu(df, country, year, recode)

In [11]:
# Build the maternal / newborn indicators one at a time. Every helper receives
# the current `df` and its return value is assigned back to `df`, so each step
# feeds the next. Bracketed names are presumably the columns added -- confirm
# in women_analysis.

# --- ANC 4+ visits [anc_4_visits] --- #
df = create_anc_4_visits(df, country, year)

# --- ANC components [anc_3_components] --- #
df = create_anc_3_components(df, country, year)

# --- Institutional delivery [inst_delivery] --- #
df = create_inst_delivery(df, country, year)

# --- Caesarean Delivery [caesarean_del] --- #
df = create_caesarean_del(df, country, year)

# --- Post-natal Health Check (mother) [pnc_mother] --- #
df = create_pnc_mother(df, country, year)

# --- Low birthweight [low_bw] --- #
# NOTE(review): the recorded output of this cell prints agg_value_prop_dict with
# nan entries and a warning on `numerator / denominator` -- presumably raised
# here for a category with a zero denominator; confirm inside create_low_bw.
df = create_low_bw(df, country, year)

# --- Early Initiation BF [early_bf] --- #
df = create_early_bf(df, country, year)

agg_value_prop_dict is: 
 {'Larger than average': 0.0, 'Average': 0.01977401129943503, 'Very small': 0.75, 'Smaller than average': 0.3850806451612903, 'Missing': nan, 'Very large': 0.0, 'DK': nan}


  agg_value_prop_dict[agg_value] = numerator / denominator


#### Export working variables

In [12]:
# Hand the processed frame to the shared export helper for this
# country / year / recode (destination and format are defined in the helper,
# not shown in this notebook).
export_analyzed_data(df, country, year, recode)

### --- 2011 ---

In [13]:
# Run configuration: country code and recode are passed to every helper call
# in the cells that follow.
country, recode = 'VNM', 'women'

# >>> Survey round processed by the cells below <<<
year = '2011'

In [14]:
# Load, check, merge and subset the women file for the configured
# country / year / recode. The steps run in a fixed order and each one works on
# the `df` left by the previous step, so do not reorder them.

# Read file (SPSS source for this country / year / recode)
df = read_spss_file(country, year, recode)

# Create HHID to facilitate merge of HH and HL data.
# NOTE(review): the return value is not captured, so this relies on the helper
# updating `df` in place -- confirm in std_utils.
generate_HHID(df, country, year, recode)

# Add Total, Year (return value likewise not captured; presumably in place)
add_total_year(df, year)

# Run quality assurance (return value not captured; presumably reports on, and
# possibly modifies, `df` in place -- confirm in std_utils)
run_quality_assurance(df)

# Merge in HH and HL data; `df` is rebound to the returned frame
df = merge_hh_hl_data(df, country, year)

# Subset women file; `df` is rebound to the returned subset
df = subset_women_file(df, country, year)

The file -- wm_2011.sav -- has the following shape: Rows: 12115; Columns: 235
HHID is NOT unique
Drop columns if all values are NaN...
Updated -- Rows: 12115; Columns: 238
Checking if any rows are duplicates...
The are no duplicate rows
The number of mothers with a birth in the past two years is: 1363


#### Create Indicators

In [15]:
# --- Create Mother edu [mother_edu] --- #
# `df` is rebound to whatever create_mother_edu returns; the bracketed name is
# presumably the column it adds -- confirm in women_analysis.
df = create_mother_edu(df, country, year, recode)

In [16]:
# Build the maternal / newborn indicators one at a time. Every helper receives
# the current `df` and its return value is assigned back to `df`, so each step
# feeds the next. Bracketed names are presumably the columns added -- confirm
# in women_analysis.

# --- ANC 4+ visits [anc_4_visits] --- #
df = create_anc_4_visits(df, country, year)

# --- ANC components [anc_3_components] --- #
df = create_anc_3_components(df, country, year)

# --- Institutional delivery [inst_delivery] --- #
df = create_inst_delivery(df, country, year)

# --- Caesarean Delivery [caesarean_del] --- #
df = create_caesarean_del(df, country, year)

# --- Post-natal Health Check (mother) [pnc_mother] --- #
# Disabled for this round: the call is left commented out, so pnc_mother is not
# produced here. The reason is not recorded in the notebook -- presumably the
# required survey variables are unavailable; TODO confirm.
# df = create_pnc_mother(df, country, year)

# --- Low birthweight [low_bw] --- #
df = create_low_bw(df, country, year)

# --- Early Initiation BF [early_bf] --- #
df = create_early_bf(df, country, year)

agg_value_prop_dict is: 
 {'Average': 0.019211065573770492, 'Smaller than average': 0.3813131313131313, 'Missing': 1.0, 'Larger than average': 0.0, 'Very small': 0.7333333333333333, 'Very large': 0.0, 'DK': 0.0}


#### Export working variables

In [17]:
# Hand the processed frame to the shared export helper for this
# country / year / recode (destination and format are defined in the helper,
# not shown in this notebook).
export_analyzed_data(df, country, year, recode)

### --- 2006 ---

In [18]:
# Run configuration: country code and recode are passed to every helper call
# in the cells that follow.
country, recode = 'VNM', 'women'

# >>> Survey round processed by the cells below <<<
year = '2006'

In [19]:
# Load, check, merge and subset the women file for the configured
# country / year / recode. The steps run in a fixed order and each one works on
# the `df` left by the previous step, so do not reorder them.

# Read file (SPSS source for this country / year / recode)
df = read_spss_file(country, year, recode)

# Create HHID to facilitate merge of HH and HL data.
# NOTE(review): the return value is not captured, so this relies on the helper
# updating `df` in place -- confirm in std_utils.
generate_HHID(df, country, year, recode)

# Add Total, Year (return value likewise not captured; presumably in place)
add_total_year(df, year)

# Run quality assurance (return value not captured; presumably reports on, and
# possibly modifies, `df` in place -- confirm in std_utils)
run_quality_assurance(df)

# Merge in HH and HL data; `df` is rebound to the returned frame
df = merge_hh_hl_data(df, country, year)

# Subset women file; `df` is rebound to the returned subset
df = subset_women_file(df, country, year)

The file -- wm_2006.sav -- has the following shape: Rows: 9471; Columns: 252
HHID is NOT unique
Drop columns if all values are NaN...
Updated -- Rows: 9471; Columns: 255
Checking if any rows are duplicates...
The are no duplicate rows
The number of mothers with a birth in the past two years is: 1023


#### Create Indicators

In [20]:
# --- Create Mother edu [mother_edu] --- #
# `df` is rebound to whatever create_mother_edu returns; the bracketed name is
# presumably the column it adds -- confirm in women_analysis.
df = create_mother_edu(df, country, year, recode)

In [21]:
# Build the maternal / newborn indicators for this round. Every active helper
# receives the current `df` and its return value is assigned back to `df`.
# Calls left commented out are NOT run for this round, so those indicators are
# not produced here; the reason is not recorded in the notebook -- presumably
# the required survey variables are unavailable; TODO confirm.

# --- ANC 4+ visits [anc_4_visits] --- # (disabled for this round)
# df = create_anc_4_visits(df, country, year)

# --- ANC components [anc_3_components] --- #
df = create_anc_3_components(df, country, year)

# --- Institutional delivery [inst_delivery] --- #
df = create_inst_delivery(df, country, year)

# --- Caesarean Delivery [caesarean_del] --- # (disabled for this round)
# df = create_caesarean_del(df, country, year)

# --- Post-natal Health Check (mother) [pnc_mother] --- # (disabled for this round)
# df = create_pnc_mother(df, country, year)

# --- Low birthweight [low_bw] --- #
# NOTE(review): the recorded output of this cell prints agg_value_prop_dict with
# nan entries and a warning on `numerator / denominator` -- presumably raised
# here for a category with a zero denominator; confirm inside create_low_bw.
df = create_low_bw(df, country, year)

# --- Early Initiation BF [early_bf] --- #
df = create_early_bf(df, country, year)

agg_value_prop_dict is: 
 {'Smaller than average': 0.30319148936170215, 'Average': 0.029783393501805054, 'Larger than average': 0.0035460992907801418, 'Very large': 0.08333333333333333, 'Very small': 0.5666666666666667, 'nan': nan, 'DK': 0.125}


  agg_value_prop_dict[agg_value] = numerator / denominator


#### Export working variables

In [22]:
# Hand the processed frame to the shared export helper for this
# country / year / recode (destination and format are defined in the helper,
# not shown in this notebook).
export_analyzed_data(df, country, year, recode)

### --- 2000 ---

In [23]:
# Run configuration: country code and recode are passed to every helper call
# in the cells that follow.
country, recode = 'VNM', 'women'

# >>> Survey round processed by the cells below <<<
year = '2000'

In [24]:
# Load, check, merge and subset the women file for the configured
# country / year / recode. The steps run in a fixed order and each one works on
# the `df` left by the previous step, so do not reorder them.

# Read file (SPSS source for this country / year / recode)
df = read_spss_file(country, year, recode)

# Create HHID to facilitate merge of HH and HL data.
# NOTE(review): the return value is not captured, so this relies on the helper
# updating `df` in place -- confirm in std_utils.
generate_HHID(df, country, year, recode)

# Add Total, Year (return value likewise not captured; presumably in place)
add_total_year(df, year)

# Run quality assurance (return value not captured; presumably reports on, and
# possibly modifies, `df` in place -- confirm in std_utils)
run_quality_assurance(df)

# Merge in HH and HL data; `df` is rebound to the returned frame
df = merge_hh_hl_data(df, country, year)

# Subset women file; `df` is rebound to the returned subset
df = subset_women_file(df, country, year)

The file -- wm_2000.sav -- has the following shape: Rows: 9252; Columns: 182
HHID is NOT unique
Drop columns if all values are NaN...
Updated -- Rows: 9252; Columns: 186
Checking if any rows are duplicates...
The are no duplicate rows
The number of mothers with a birth in the past two years is: 550


#### Create Indicators

In [25]:
# --- Create Mother edu [mother_edu] --- #
# `df` is rebound to whatever create_mother_edu returns; the bracketed name is
# presumably the column it adds -- confirm in women_analysis.
df = create_mother_edu(df, country, year, recode)

In [26]:
# Indicators for this round: only low birthweight is computed. Every other call
# is left commented out and is NOT run, so those indicators are not produced
# here; the reason is not recorded in the notebook -- presumably the required
# survey variables are unavailable; TODO confirm.

# --- ANC 4+ visits [anc_4_visits] --- # (disabled for this round)
# df = create_anc_4_visits(df, country, year)

# --- ANC components [anc_3_components] --- # (disabled for this round)
# df = create_anc_3_components(df, country, year)

# --- Institutional delivery [inst_delivery] --- # (disabled for this round)
# df = create_inst_delivery(df, country, year)

# --- Caesarean Delivery [caesarean_del] --- # (disabled for this round)
# df = create_caesarean_del(df, country, year)

# --- Post-natal Health Check (mother) [pnc_mother] --- # (disabled for this round)
# df = create_pnc_mother(df, country, year)

# --- Low birthweight [low_bw] --- #
# This is the only active call in the cell, so the agg_value_prop_dict printout
# and the `numerator / denominator` warning in the recorded output come from it.
# NOTE(review): the nan entries look like categories with a zero denominator --
# confirm inside create_low_bw.
df = create_low_bw(df, country, year)

# --- Early Initiation BF [early_bf] --- # (disabled for this round)
# df = create_early_bf(df, country, year)

agg_value_prop_dict is: 
 {'Average': 0.01683937823834197, 'Very large': 0.0, 'Larger than average': 0.0125, 'Smaller than average': 0.40625, 'Very small': 1.0, "Don't know": nan, 'nan': nan}


  agg_value_prop_dict[agg_value] = numerator / denominator


#### Export working variables

In [27]:
# Hand the processed frame to the shared export helper for this
# country / year / recode (destination and format are defined in the helper,
# not shown in this notebook).
export_analyzed_data(df, country, year, recode)