# Combine all existing pools to check for shared combinations

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import pickle
from PopSynthesis.Methods.IPSF.const import (
    HH_TAG,
    HH_ATTS,
    PP_ATTS,
    NOT_INCLUDED_IN_BN_LEARN,
    processed_dir
)

In [3]:
# Load the household pool from CSV and the pickled mapping of paired pools;
# the mapping is indexed elsewhere by keys such as "HH-Main" and "Main-<rela>".
hh_pool = pd.read_csv(processed_dir / "HH_pool.csv")
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so this pickle must come from a trusted source (e.g. produced by this project).
with open(processed_dir / "dict_pool_pairs_by_layers.pickle", "rb") as handle:
    pools_ref = pickle.load(handle)

In [4]:
# Collect the distinct suffixes that follow the last "-" of every pool key
# (for a key without "-", the whole key is used).
all_rela = {pool_key.rpartition("-")[2] for pool_key in pools_ref}
all_rela

{'Child',
 'Grandchild',
 'Grandparent',
 'Main',
 'Others',
 'Parent',
 'Sibling',
 'Spouse'}

In [5]:
# Store the household pool in the same mapping as the paired pools, under HH_TAG.
pools_ref[HH_TAG] = hh_pool

In [6]:
# Person and household attribute lists, each with the entries of
# NOT_INCLUDED_IN_BN_LEARN filtered out (original order preserved).
pp_atts, hh_atts = (
    [att for att in source_atts if att not in NOT_INCLUDED_IN_BN_LEARN]
    for source_atts in (PP_ATTS, HH_ATTS)
)

In [7]:
def cross_check_between_2_pools(pool1, pool2, considered_atts):
    """Keep only the rows of two pools whose key combination occurs in both.

    Args:
        pool1: DataFrame containing every column named in ``considered_atts``.
        pool2: DataFrame containing every column named in ``considered_atts``.
        considered_atts: List of column names that together form the key
            compared between the two pools.

    Returns:
        A ``(result_pool1, result_pool2)`` tuple. Each result holds the rows
        of the corresponding input whose key combination also appears in the
        other input, kept in their original relative order, with the key
        columns placed first and a fresh ``RangeIndex``. Both results are
        empty when the pools share no combination.

    Raises:
        KeyError: If either pool lacks one of the ``considered_atts`` columns.
    """
    # Explicit check instead of ``assert`` so it still runs under ``python -O``.
    required = set(considered_atts)
    for label, pool in (("pool1", pool1), ("pool2", pool2)):
        missing = required - set(pool.columns)
        if missing:
            raise KeyError(
                f"{label} is missing columns: {sorted(missing, key=str)}"
            )

    converted_pool1 = pool1.set_index(considered_atts)
    converted_pool2 = pool2.set_index(considered_atts)

    # Boolean membership masks keep every input row in place, so the output
    # order is deterministic (it no longer depends on set iteration order)
    # and an empty intersection simply yields empty frames.
    shared_in_pool1 = converted_pool1.index.isin(converted_pool2.index)
    shared_in_pool2 = converted_pool2.index.isin(converted_pool1.index)

    result_pool1 = converted_pool1.loc[shared_in_pool1].reset_index()
    result_pool2 = converted_pool2.loc[shared_in_pool2].reset_index()
    return result_pool1, result_pool2

In [8]:
# Process HH
# Restrict the household pool and the HH-Main pool to the household
# attribute combinations found in both, then store both results back.
HH_pool, HH_main_pool = cross_check_between_2_pools(
    pool1=pools_ref[HH_TAG],
    pool2=pools_ref["HH-Main"],
    considered_atts=hh_atts,
)
pools_ref.update({HH_TAG: HH_pool, "HH-Main": HH_main_pool})

In [9]:
# Process Main
# For every relationship other than "Main": take the HH-Main rows whose
# column for that relationship is > 0, cross-check them against the
# "Main-<rela>" pool on the Main attribute columns, store the filtered
# "Main-<rela>" pool back, and collect the filtered HH-Main rows.
# NOTE(review): a household with several relationship types is appended once
# per relationship, and rows whose relationship columns are all 0 are never
# appended - confirm both effects are intended.
store_main = []
# The key columns do not depend on the relationship, so build them once.
considered_atts = [f"{x}_Main" for x in pp_atts]
# all_rela is a set of strings, whose iteration order may change between
# interpreter runs; sorting keeps the row order of the concatenated HH-Main
# pool reproducible.
for rela in sorted(all_rela):
    if rela == "Main":
        continue
    pool_main = pools_ref["HH-Main"][pools_ref["HH-Main"][rela] > 0]
    pool_rela = pools_ref[f"Main-{rela}"]
    result_main, result_rela = cross_check_between_2_pools(
        pool1=pool_main,
        pool2=pool_rela,
        considered_atts=considered_atts,
    )
    store_main.append(result_main)
    pools_ref[f"Main-{rela}"] = result_rela
pools_ref["HH-Main"] = pd.concat(store_main)

In [10]:
# to_consider_tags = [HH_TAG, "Main"]
# for tag in to_consider_tags:
#     considered_atts = hh_atts if tag == HH_TAG else [f"{x}_{tag}" for x in pp_atts]
#     related_pool_names = [x for x in pools_ref.keys() if tag in x]
#     possible_comb = []
#     for pool_name in related_pool_names:
#         print(f"Processing {pool_name} for {tag}")
#         pool = pools_ref[pool_name].copy(deep=True)
#         assert set(considered_atts) <= set(pool.columns)
#         converted_pool = pool.set_index(considered_atts)
#         possible_comb.append(set(converted_pool.index))
#     possible_comb = set.intersection(*possible_comb)
#     # update the pool, removing not matched combinations
#     for pool_name in related_pool_names:
#         pool = pools_ref[pool_name].copy(deep=True)
#         converted_pool = pool.set_index(considered_atts)
#         pools_ref[pool_name] = converted_pool.loc[list(possible_comb)].reset_index()

In [11]:
# Persist the cross-checked pools to a new pickle file (the input pickle is
# not overwritten), using the highest pickle protocol available.
with open(processed_dir / "dict_pool_pairs_check_HH_main.pickle", "wb") as handle:
    pickle.dump(pools_ref, handle, protocol=pickle.HIGHEST_PROTOCOL)