# Alive & Thrive
## Viet Nam Data: Child Files - Preprocessing
## Prepared by Aaron Wise; aaron@a3di.dev
### Version: 28 May 2022

In [9]:
# %load std_imports.py
from pathlib import Path

import numpy as np
import pandas as pd

import json
import pyreadstat

from utils import create_HHID, read_file, run_quality_assurance, add_HHID, merge_hh_hl_data
from aw_analytics import mean_wt, output_mean_table

# Notebook display limits: show up to 1500 rows and do not cap the column count
pd.options.display.max_rows = 1500
pd.options.display.max_columns = None

### --- 2021 ---

In [10]:
# -- Load the child recode -- #

## Parameters identifying the dataset
country, recode = 'VNM', 'children'

# -------------------------------------------------------------------
year = '2021'
# -------------------------------------------------------------------

df = read_file(country, recode, year)

# --- Quality-assurance checks --- #
run_quality_assurance(df)

# Add the HHID variable that facilitates the merge of HH and HL data
add_HHID(df, country, year, recode)

# Merge in the head-of-household information (Age_HoH, Sex_HoH and Eth_HoH)
df = merge_hh_hl_data(df, country, year)

Rows: 4404; Columns: 537
Drop columns if all values are NaN...
Updated -- Rows: 4404; Columns: 533
Checking if any rows are duplicates...
The are no duplicate rows
HHID is NOT unique


#### Create Age Indicators ####

In [11]:
# CAGE appears to be created by CSPro and is therefore not reproducible from the raw dataset
cage = df['CAGE']

# --- Age 0-5 months [ch_age_05] --- #
df['ch_age_05'] = np.where(cage <= 5, 1, 0)

# --- Remaining age bands: flag column -> inclusive (min, max) age in months --- #
age_bands = {
    'ch_age_68': (6, 8),       # Age 6-8 months
    'ch_age_923': (9, 23),     # Age 9-23 months
    'ch_age_1223': (12, 23),   # Age 12-23 months
    'ch_age_623': (6, 23),     # Age 6-23 months
}
for flag_col, (min_months, max_months) in age_bands.items():
    # 1 when CAGE falls inside the band, 0 otherwise (a missing CAGE gives 0)
    df[flag_col] = np.where((cage >= min_months) & (cage <= max_months), 1, 0)


#### Create Outcome Indicators

In [None]:
# --- Exclusive BF [excl_bf] --- #

## Flag children with 'NO' recorded for every liquid item (BD7*)
liquid_items = ['BD7A', 'BD7B1', 'BD7B2', 'BD7C', 'BD7D', 'BD7E', 'BD7X']
no_liquids = df[liquid_items].eq('NO').all(axis=1)
df['bd7_none'] = np.where(no_liquids, 1, 0)

## Flag children with 'NO' recorded for every solid item (BD8*)
solid_items = [
    'BD8A', 'BD8B', 'BD8C', 'BD8D', 'BD8E', 'BD8F', 'BD8G', 'BD8H',
    'BD8I', 'BD8J', 'BD8K', 'BD8L', 'BD8M', 'BD8N', 'BD8X',
]
no_solids = df[solid_items].eq('NO').all(axis=1)
df['bd8_none'] = np.where(no_solids, 1, 0)

# Create indicator: 100 when BD3 is 'YES' and both "none" flags are set, else 0
is_excl_bf = (df['BD3'] == 'YES') & (df['bd7_none'] == 1) & (df['bd8_none'] == 1)
df['excl_bf'] = np.where(is_excl_bf, 100, 0)

# Only defined for the 0-5 month band; everyone else becomes missing
df['excl_bf'] = np.where(df['ch_age_05'] != 0, df['excl_bf'], np.nan)

In [None]:
# --- Continued Breastfeeding 12-23 mos [cont_1223_bf] --- #
# 100 where BD3 is 'YES', otherwise 0; missing for children outside the 12-23 month band
bf_score = np.where(df['BD3'] == 'YES', 100, 0)
df['cont_1223_bf'] = np.where(df['ch_age_1223'] != 0, bf_score, np.nan)

In [None]:
# --- Minimum Dietary Diversity [mdd_ch] --- #

# Generate food groups: flag column -> questionnaire items that count towards it.
# A group flag is 1 when ANY of its items is 'YES', otherwise 0.
food_group_items = {
    'breastmilk': ['BD3'],
    'grains': ['BD8B', 'BD8C', 'BD8E'],
    'legumes': ['BD8M'],
    'dairy': ['BD7D', 'BD7E', 'BD8A', 'BD8N'],
    'flesh': ['BD8I', 'BD8J', 'BD8L'],
    'eggs': ['BD8K'],
    'vitaminA': ['BD8D', 'BD8F', 'BD8G'],
    'other': ['BD8H'],
}
for group_col, items in food_group_items.items():
    df[group_col] = np.where(df[items].eq('YES').any(axis=1), 1, 0)

# Create indicator
food_groups = list(food_group_items)

# Vectorized row sum of the 0/1 group flags (replaces the row-by-row apply(sum, axis=1));
# 100 when at least 5 groups were consumed, otherwise 0
n_food_groups = df[food_groups].sum(axis=1)
df['mdd_ch'] = np.where(n_food_groups >= 5, 100, 0)

# Only defined for the 6-23 month band; everyone else becomes missing
df['mdd_ch'] = np.where(df['ch_age_623'] == 0, np.nan, df['mdd_ch'])

In [None]:
# --- Minimum Meal Frequency [mmf_ch] --- #

## Convert BD7D1, BD7E1, and BD9 categorical values to numeric feed counts (floats)
num_times_dict = {'ONE': 1.0, 'TWO': 2.0, 'THREE': 3.0, 'FOUR': 4.0, 'FIVE': 5.0, 'SIX': 6.0, 'SEVEN OR MORE': 7.0,
'NaN': np.nan, 'DK': np.nan, 'NO RESPONSE': np.nan}

# Count column -> source item. Values that map to NaN (DK, NO RESPONSE, missing,
# or any label not in the mapping) are counted as 0 feeds.
feed_count_sources = {
    'solid_semi_soft_times': 'BD9',
    'formula_times': 'BD7D1',
    'other_milk_times': 'BD7E1',
}
for times_col, source_col in feed_count_sources.items():
    df[times_col] = df[source_col].map(num_times_dict).astype(float).fillna(0)

## BREASTFED: Age 6-8 months with 2 soft, semi-soft, solid feeds
df['mmf_bf_68'] = np.where((df['BD3'] == 'YES') & (df['solid_semi_soft_times'] >= 2), 1, 0)
df['mmf_bf_68'] = np.where(df['ch_age_68'] == 0, 0, df['mmf_bf_68'])

## BREASTFED: Age 9-23 months with 3 soft, semi-soft, solid feeds
df['mmf_bf_923'] = np.where((df['BD3'] == 'YES') & (df['solid_semi_soft_times'] >= 3), 1, 0)
df['mmf_bf_923'] = np.where(df['ch_age_923'] == 0, 0, df['mmf_bf_923'])

## NON-BREASTFED: Age 6-23 months with 4 soft, semi-soft, solid or milk feeds (** At least 1 is semi, soft, solid)
solid_semi_soft_milk_times = ['formula_times', 'other_milk_times', 'solid_semi_soft_times']
# Vectorized row sum (replaces the row-by-row apply(sum, axis=1)); the count
# columns hold no NaN after fillna(0), so the totals are unchanged
total_feeds = df[solid_semi_soft_milk_times].sum(axis=1)
df['solid_semi_soft_milk_times'] = np.where(total_feeds >= 4, 1, 0)

df['mmf_nbf_623'] = np.where((df['breastmilk'] == 0) & (df['solid_semi_soft_milk_times'] == 1) & (df['solid_semi_soft_times'] >= 1), 1, 0)
df['mmf_nbf_623'] = np.where(df['ch_age_623'] == 0, 0, df['mmf_nbf_623'])

# Create indicator: 100 when any of the three criteria is met, otherwise 0;
# missing for children outside the 6-23 month band
mmf_cols = ['mmf_bf_68', 'mmf_bf_923', 'mmf_nbf_623']
df['mmf_ch'] = np.where(df[mmf_cols].eq(1).any(axis=1), 100, 0)
df['mmf_ch'] = np.where(df['ch_age_623'] == 0, np.nan, df['mmf_ch'])

In [None]:
# --- Minimum Acceptable Diet [mad_ch] --- #

# Shared requirement for both groups: mdd_ch and mmf_ch are both 100
meets_mdd = df['mdd_ch'] == 100
meets_mmf = df['mmf_ch'] == 100

## BREASTFED: Age 6-23 months w/ mdd and mmf
df['mad_bf_623'] = np.where((df['breastmilk'] == 1) & meets_mdd & meets_mmf, 1, 0)

## NON-BREASTFED: Age 6-23 months w/ mdd and mmf and >= 2 milk feeds
milk_times = ['formula_times', 'other_milk_times']
n_milk_feeds = df[milk_times].sum(axis=1)
df['milk_feeds_2'] = np.where(n_milk_feeds >= 2, 1, 0)

df['mad_nbf_623'] = np.where(
    (df['breastmilk'] == 0) & meets_mdd & meets_mmf & (df['milk_feeds_2'] == 1),
    1,
    0,
)

# Create indicator: 100 when either group criterion is met, otherwise 0
meets_mad = (df['mad_bf_623'] == 1) | (df['mad_nbf_623'] == 1)
df['mad_ch'] = np.where(meets_mad, 100, 0)

# Only defined for the 6-23 month band; everyone else becomes missing
df['mad_ch'] = np.where(df['ch_age_623'] != 0, df['mad_ch'], np.nan)

#### Create Equity Stratifiers

In [None]:
# --- Child Sex --- #
# Any HL4 value other than 'FEMALE' (including missing) is labelled 'Male'
df['female'] = np.where(df['HL4'] == 'FEMALE', 'Female', 'Male')

# --- Residence --- #
# Any HH6 value other than 'RURAL' (including missing) is labelled 'urban'
df['residence'] = np.where(df['HH6'] == 'RURAL', 'rural', 'urban')

# --- Location --- #

## Combine region value with city value: start from HH7 and override it with
## the city name where HH7A is Ha Noi or Ho Chi Minh city
df['region_comb'] = df['HH7']
df['region_comb'] = np.where(df['HH7A'] == 'Ha Noi', 'Ha Noi', df['region_comb'])
df['region_comb'] = np.where(df['HH7A'] == 'Ho Chi Minh city', 'Ho Chi Minh city', df['region_comb'])

# --- Mother's education --- #

## Create categories (missing melevel is grouped with 'Pre-primary or none')
conditions = [
    ((df['melevel'].isnull()) | (df['melevel'] == 'Pre-primary or none')),
    (df['melevel'] == 'Primary'),
    ((df['melevel'] == 'Lower Secondary') | (df['melevel'] == 'Upper Secondary') |
     (df['melevel'] == 'Vocational High school')),
    (df['melevel'] == 'University/College/Higher')
]
choices = ['None', 'Primary', 'Secondary', 'Higher']
# Pass an explicit string default: np.select's implicit default is the integer 0,
# which has no common dtype with the string choices (newer NumPy raises TypeError).
# '0' is the placeholder older NumPy produced for melevel labels matching no
# condition, so existing outputs are unchanged.
df['mother_edu'] = np.select(conditions, choices, default='0')

# --- Wealth quintile --- #
df['wealth_q'] = df['windex5']

# --- Ethnicity of HoH --- #
df['ethn_hoh'] = df['ethnicity']

# --- Female HoH --- #
# Any HHSEX value other than 'Female' (including missing) is labelled 'Male'
df['sex_HoH'] = np.where(df['HHSEX'] == 'Female', 'Female', 'Male')

# --- Elderly HoH --- #
# A missing HHAGE fails the >= 60 comparison and is labelled 'No, < 60'
df['elderly_HoH'] = np.where(df['HHAGE'] >= 60, 'Yes, 60+', 'No, < 60')

# --- Disability --- #
## NA in VNM

#### Subset and export working dataset

In [None]:
# Identify and select columns for working dataset: everything from 'Total'
# through the final column. The slice runs to the end (not `:-1`) so the last
# derived column is kept rather than silently dropped.
working_var_idx = df.columns.get_loc('Total')
working_var_cols = df.columns[working_var_idx:].to_list()

out_df = df[working_var_cols]

# Generate out_filepath: "<recode>_subset.csv" inside the 'clean' folder of datadir
# NOTE(review): datadir is not defined anywhere in the visible notebook — confirm
# it is set before this cell runs, otherwise this raises NameError
out_file = f"{recode.split('.')[0]}_subset.csv"
out_filepath = datadir.joinpath('clean').joinpath(out_file)

# Save as csv (without the index column)
out_df.to_csv(out_filepath, index=False)