# Combine all existing pools to check for shared combinations

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import pickle
from PopSynthesis.Methods.IPSF.const import (
    HH_TAG,
    HH_ATTS,
    PP_ATTS,
    NOT_INCLUDED_IN_BN_LEARN,
    processed_dir
)

In [3]:
# The household pool is stored as CSV; the other pools are stored together
# in a single pickled mapping of pool name -> DataFrame.
hh_pool = pd.read_csv(processed_dir / "HH_pool.csv")

# NOTE: pickle.load can execute arbitrary code while deserialising, so this
# file must only ever be one produced by this project's own pipeline.
pool_pairs_path = processed_dir / "dict_pool_pairs_by_layers.pickle"
with open(pool_pairs_path, "rb") as pickle_file:
    pools_ref = pickle.load(pickle_file)

In [4]:
# Add the household pool to the same mapping as the pools loaded from the
# pickle, keyed by HH_TAG, so all pools can be processed uniformly.
pools_ref[HH_TAG] = hh_pool

In [5]:
# The label of a pool is the last "-"-separated component of its name
# (a name without "-" is its own label).
all_rela = {pool_name.rsplit("-", 1)[-1] for pool_name in pools_ref}
all_rela

{'Child',
 'Grandchild',
 'Grandparent',
 'HH',
 'Main',
 'Others',
 'Parent',
 'Sibling',
 'Spouse'}

In [6]:
# Drop every attribute listed in NOT_INCLUDED_IN_BN_LEARN from the household
# and person attribute lists; the original ordering of each list is kept.
hh_atts = [att for att in HH_ATTS if att not in NOT_INCLUDED_IN_BN_LEARN]
pp_atts = [att for att in PP_ATTS if att not in NOT_INCLUDED_IN_BN_LEARN]

In [7]:
# Tags that drive the shared-combination filtering: for each tag, every pool
# whose name contains it is restricted to the combinations common to all of them.
to_consider_tags = [HH_TAG, "Main"]

In [10]:
# For every tag, restrict all pools that involve that tag to the attribute
# combinations that occur in *every* one of those pools.
#
# NOTE(review): a pool that belongs to two tags (e.g. "HH-Main") is filtered
# again under the later tag, which can remove combinations that the earlier
# tag's pools still contain. Confirm whether a second pass is required.
for tag in to_consider_tags:
    # Household pools use the plain attribute names; for any other tag the
    # attribute columns carry the tag as a suffix ("<att>_<tag>").
    considered_atts = hh_atts if tag == HH_TAG else [f"{x}_{tag}" for x in pp_atts]

    # Match the tag against whole "-"-separated components of the pool name
    # instead of a raw substring, so a tag can never pick up a pool whose
    # name merely happens to contain it.
    related_pool_names = [x for x in pools_ref if tag in x.split("-")]
    if not related_pool_names:
        raise ValueError(f"No pool found for tag {tag!r}")

    # Collect, per pool, the set of attribute combinations present in it.
    possible_comb = []
    for pool_name in related_pool_names:
        print(f"Processing {pool_name} for {tag}")
        pool = pools_ref[pool_name]
        missing_atts = set(considered_atts) - set(pool.columns)
        assert not missing_atts, f"{pool_name} is missing columns: {sorted(missing_atts)}"
        # set_index already returns a new frame, so no defensive copy is needed.
        possible_comb.append(set(pool.set_index(considered_atts).index))
    possible_comb = set.intersection(*possible_comb)

    # update the pool, removing not matched combinations
    shared_comb = list(possible_comb)
    for pool_name in related_pool_names:
        pool = pools_ref[pool_name]
        if shared_comb:
            # A boolean mask keeps the rows in their original order. Selecting
            # with .loc[list(set)] would order them by set iteration, which
            # varies between runs because string hashing is randomised.
            key_index = pool.set_index(considered_atts).index
            kept = pool.loc[key_index.isin(shared_comb)]
        else:
            # Nothing in common: keep the columns, drop every row.
            kept = pool.iloc[:0]
        # Same column layout as set_index(...).reset_index(): key attributes
        # first, remaining columns afterwards in their original order.
        other_cols = [col for col in pool.columns if col not in considered_atts]
        pools_ref[pool_name] = kept[list(considered_atts) + other_cols].reset_index(drop=True)

Processing HH-Main for HH
Processing HH for HH
Processing HH-Main for Main
Processing Main-Spouse for Main
Processing Main-Child for Main
Processing Main-Parent for Main
Processing Main-Sibling for Main
Processing Main-Others for Main
Processing Main-Grandchild for Main
Processing Main-Grandparent for Main


In [11]:
# Display the pool stored under the "HH" key after the filtering step.
pools_ref["HH"]

Unnamed: 0,dwelltype,owndwell,hhinc,totalvehs,hhsize
0,Other,Something Else,3500-3999,0,4
1,Other,Something Else,3500-3999,0,4
2,Other,Something Else,3500-3999,0,4
3,Terrace/Townhouse,Being Purchased,500-649,0,5
4,Terrace/Townhouse,Being Rented,650-799,2,4
...,...,...,...,...,...
4997023,Terrace/Townhouse,Being Purchased,400-499,2,1
4997024,Terrace/Townhouse,Being Purchased,400-499,2,1
4997025,Terrace/Townhouse,Being Purchased,400-499,2,1
4997026,Flat or Apartment,Something Else,3000-3499,3,2


In [12]:
# Write the whole (filtered) pools mapping to disk as a single pickle.
checked_pools_path = processed_dir / "dict_pool_pairs_check_HH_main.pickle"
with open(checked_pools_path, "wb") as out_file:
    pickle.dump(pools_ref, out_file, protocol=pickle.HIGHEST_PROTOCOL)