In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import pickle
from PopSynthesis.Methods.IPSF.const import (
    processed_dir,
    data_dir,
    PP_ATTS,
    HH_TAG,
    POOL_SIZE,
    NOT_INCLUDED_IN_BN_LEARN,
)
from PopSynthesis.Methods.IPSF.CSP.operations.convert_seeds import pair_states_dict, convert_seeds_by_ordered_pairs
from PopSynthesis.Methods.IPSF.utils.pool_utils import create_pool
from PopSynthesis.DataProcessor.utils.seed.pp.process_relationships import MIN_PARENT_CHILD_GAP
from PopSynthesis.Methods.IPSF.CSP.operations.extra_filters import filter_mismatch_hhsz, filter_paired_pool_agegr

In [3]:
# Load the inputs.
# Household marginals are read with a two-row (two-level) column header.
hh_marg = pd.read_csv(data_dir / "hh_marginals_ipu.csv", header=[0, 1])

# Household and person seed samples, without their "sample_geog" column.
hh_seed = (
    pd.read_csv(data_dir / "hh_sample_ipu.csv")
    .drop(columns=["sample_geog"])
)
pp_seed = (
    pd.read_csv(data_dir / "pp_sample_ipu.csv")
    .drop(columns=["sample_geog"])
)

# Previously pickled state objects for household / person attributes.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
hh_states_path = processed_dir / "dict_hh_states.pickle"
with open(hh_states_path, "rb") as hh_states_file:
    hh_att_state = pickle.load(hh_states_file)

pp_states_path = processed_dir / "dict_pp_states.pickle"
with open(pp_states_path, "rb") as pp_states_file:
    pp_att_state = pickle.load(pp_states_file)

In [4]:
# Drop the first column whose top-level header is "sample_geog" from the
# household marginals. The drop is guarded so that re-running this cell
# (after the column is already gone) is a no-op instead of an IndexError.
geog_cols = hh_marg.columns[hh_marg.columns.get_level_values(0) == "sample_geog"]
if len(geog_cols) > 0:
    # geog_cols[0] is a full column label (a tuple); drop treats it as one label.
    hh_marg = hh_marg.drop(columns=geog_cols[0])

# vars
rela_col = "relationship"  # person-seed column holding the relationship label
id_col = "serialno"  # id column passed to the seed conversion
main_rela = "Main"  # relationship label of the main person
hh_tag = HH_TAG  # tag naming the household side in pair names

In [5]:
# Layers of (root, sampled) relationship pairs, listed root-first.
# The last layer is iterated again later for the grandchild / grandparent
# age-gap filter.
ordered_pairs = [
    # layer 0: household -> main person
    [("HH", "Main")],
    # layer 1: main person -> every direct relationship
    [("Main", rela) for rela in ("Spouse", "Child", "Parent", "Sibling", "Others")],
    # layer 2: relationships rooted at a non-main person
    [("Child", "Grandchild"), ("Parent", "Grandparent")],
]

In [6]:
# Convert the household / person seeds into paired seed tables, following
# the layers in ordered_pairs. The result is indexed by "<root>-<sampled>"
# pair names (e.g. "Parent-Grandparent").
seed_pairs = convert_seeds_by_ordered_pairs(
    ordered_pairs,
    hh_seed,
    pp_seed,
    id_col,
    rela_col,
    main_rela,
)

In [7]:
# Apply the minimum parent-child age gap to the two grand-relationship seed
# tables. Each entry is (pair name, younger age column, older age column).
for pair_key, younger_col, older_col in (
    ("Parent-Grandparent", "age_Parent", "age_Grandparent"),
    ("Child-Grandchild", "age_Grandchild", "age_Child"),
):
    seed_pairs[pair_key] = filter_paired_pool_agegr(
        pool=seed_pairs[pair_key],
        agegr_col_younger=younger_col,
        agegr_col_older=older_col,
        min_gap=MIN_PARENT_CHILD_GAP,
    )

In [8]:
# process seed (legacy, disabled: this variant pairs every relationship directly with Main)
# seed_pairs = convert_seeds_to_pairs(hh_seed, pp_seed, id_col, rela_col, main_rela)

# # Filter out pairs with an invalid age-group gap for these 4 special relationships
# for rela in ["Grandchild", "Child", "Parent", "Grandparent"]:
#     min_gap = MIN_GRANDPARENT_GRANDCHILD_GAP if rela in ["Grandchild", "Grandparent"] else MIN_PARENT_CHILD_GAP
#     main_age_col = f"age_{main_rela}"
#     rela_age_col = f"age_{rela}"
#     older_age_col = main_age_col if rela in ["Grandchild", "Child"] else rela_age_col
#     younger_age_col = main_age_col if rela in ["Parent", "Grandparent"] else rela_age_col
#     assert younger_age_col != older_age_col
#     pools_ref[f"{main_rela}-{rela}"] = filter_paired_pool_agegr(pool=pools_ref[f"{main_rela}-{rela}"], agegr_col_younger=younger_age_col, agegr_col_older=older_age_col, min_gap=min_gap)

In [9]:
# create pools
# Build one pool per "<name1>-<name2>" seed table. Per the cell output,
# create_pool prints "Learn BN" and "Doing the sampling" for every pair.
pools_ref = {}
for pair_name, pair_seed in seed_pairs.items():
    name1, name2 = pair_name.split("-")
    # Use the same household check everywhere (the original mixed hh_tag
    # with a hard-coded "HH" literal).
    is_hh_pair = name1 == hh_tag
    ori_states_1 = hh_att_state if is_hh_pair else pp_att_state
    ori_states_2 = pp_att_state  # because the second one always people
    processed_states_ref = pair_states_dict(ori_states_1, ori_states_2, name1, name2)
    # we only need matching columns that we wish to process for BN
    # this excludes relationship and ids
    missing_cols = set(processed_states_ref.keys()) - set(pair_seed.columns)
    assert not missing_cols, (
        f"{pair_name}: seed is missing expected columns {sorted(missing_cols)}"
    )
    to_filter_col = list(processed_states_ref.keys())
    if is_hh_pair:
        # special case to add the rela cols: one column per relationship value
        # found in the person seed (presumably per-household counts -- confirm)
        to_filter_col += list(pp_seed[rela_col].unique())
    filtered_seed = pair_seed[to_filter_col]
    pools_ref[pair_name] = create_pool(filtered_seed, state_names=processed_states_ref, pool_sz=POOL_SIZE)

Learn BN


  0%|          | 0/1000000 [00:00<?, ?it/s]

Doing the sampling


  0%|          | 0/18 [00:00<?, ?it/s]

Learn BN


  0%|          | 0/1000000 [00:00<?, ?it/s]

Doing the sampling


  0%|          | 0/10 [00:00<?, ?it/s]

Learn BN


  0%|          | 0/1000000 [00:00<?, ?it/s]

Doing the sampling


  0%|          | 0/10 [00:00<?, ?it/s]

Learn BN


  0%|          | 0/1000000 [00:00<?, ?it/s]

Doing the sampling


  0%|          | 0/10 [00:00<?, ?it/s]

Learn BN


  0%|          | 0/1000000 [00:00<?, ?it/s]

Doing the sampling


  0%|          | 0/10 [00:00<?, ?it/s]

Learn BN


  0%|          | 0/1000000 [00:00<?, ?it/s]

Doing the sampling


  0%|          | 0/10 [00:00<?, ?it/s]

Learn BN


  0%|          | 0/1000000 [00:00<?, ?it/s]

Doing the sampling


  0%|          | 0/10 [00:00<?, ?it/s]

Learn BN


  0%|          | 0/1000000 [00:00<?, ?it/s]

Doing the sampling


  0%|          | 0/10 [00:00<?, ?it/s]

In [10]:
# Person attributes that are not excluded from BN learning. The list comes
# from a set difference, so its order carries no meaning.
pp_atts = list(
    set(PP_ATTS) - set(NOT_INCLUDED_IN_BN_LEARN)
)
# Last "-"-separated part (the sampled side) of every pool name.
all_rela = [pair_key.rsplit("-", 1)[-1] for pair_key in pools_ref]

In [11]:
# HH-Main pool: suffix the person attribute columns with the main
# relationship name, then filter out mismatches by hhsize.
hh_main_key = f"{hh_tag}-{main_rela}"
rename_main = {att: f"{att}_{main_rela}" for att in pp_atts}
hh_main_pool = pools_ref[hh_main_key].rename(columns=rename_main)
pools_ref[hh_main_key] = filter_mismatch_hhsz(hh_main_pool, "hhsize", all_rela)

In [12]:
# Pools sampled from Main: enforce the minimum parent-child age gap on the
# Main-Child and Main-Parent pools.
for rela in ("Child", "Parent"):
    pair_key = f"{main_rela}-{rela}"
    min_gap = MIN_PARENT_CHILD_GAP
    main_age_col, rela_age_col = f"age_{main_rela}", f"age_{rela}"
    if rela == "Child":
        # Main is the parent, so Main is the older side.
        older_age_col, younger_age_col = main_age_col, rela_age_col
    else:
        # rela == "Parent": Main is the child, so Main is the younger side.
        older_age_col, younger_age_col = rela_age_col, main_age_col
    assert younger_age_col != older_age_col
    pools_ref[pair_key] = filter_paired_pool_agegr(
        pool=pools_ref[pair_key],
        agegr_col_younger=younger_age_col,
        agegr_col_older=older_age_col,
        min_gap=min_gap,
    )

In [13]:
# Enforce the minimum parent-child age gap on the pools of the last layer
# of ordered_pairs (Child-Grandchild and Parent-Grandparent).
for root_rela, sample_rela in ordered_pairs[-1]:
    root_age_col = f"age_{root_rela}"
    sample_age_col = f"age_{sample_rela}"
    # Grandchild is sampled from its parent (root is older); Grandparent is
    # sampled from its child (root is younger).
    older_age_col = root_age_col if sample_rela == "Grandchild" else sample_age_col
    younger_age_col = root_age_col if sample_rela == "Grandparent" else sample_age_col
    # Same guard as the Main-Child / Main-Parent cell: for any other sampled
    # relationship both sides would collapse to the same column and the
    # filter would compare a column against itself.
    assert younger_age_col != older_age_col, (
        f"Cannot tell older from younger for pair {root_rela}-{sample_rela}"
    )
    min_gap = MIN_PARENT_CHILD_GAP
    pools_ref[f"{root_rela}-{sample_rela}"] = filter_paired_pool_agegr(pool=pools_ref[f"{root_rela}-{sample_rela}"], agegr_col_younger=younger_age_col, agegr_col_older=older_age_col, min_gap=min_gap)

In [14]:
# Write the pools dict to disk with the highest pickle protocol available.
pools_out_path = processed_dir / "dict_pool_pairs_by_layers.pickle"
with open(pools_out_path, "wb") as pools_file:
    pickle.dump(pools_ref, pools_file, protocol=pickle.HIGHEST_PROTOCOL)