# Combine all existing pools to check for shared combinations

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import pickle
from PopSynthesis.Methods.IPSF.const import (
    HH_TAG,
    HH_ATTS,
    PP_ATTS,
    NOT_INCLUDED_IN_BN_LEARN,
    processed_dir
)

## Setup

In [3]:
# Get the people pools
# Get the household pools
# Get the paired people pools

hh_pool = pd.read_csv(processed_dir / "HH_pool.csv")
pp_pool = pd.read_csv(processed_dir / "PP_pool.csv")
with open(processed_dir / "dict_pool_pairs_by_layers.pickle", "rb") as handle:
    pools_ref = pickle.load(handle)

In [4]:
all_rela = set(x.split("-")[-1] for x in pools_ref.keys())
all_rela

{'Child',
 'Grandchild',
 'Grandparent',
 'Main',
 'Others',
 'Parent',
 'Sibling',
 'Spouse'}

In [5]:
pools_ref[HH_TAG] = hh_pool

In [6]:
pp_atts = [x for x in PP_ATTS if x not in NOT_INCLUDED_IN_BN_LEARN]
hh_atts = [x for x in HH_ATTS if x not in NOT_INCLUDED_IN_BN_LEARN]

## Filter the people-related pools

In [7]:
# Then make sure all people in pool exist in the rela pools and vice versa
# DO this by detect all possible combinations in pools and intersect
# DO this by decouple the paired rela pools to have possible people combs
# If del in people pool is ez but if del for others, we must del all 

def combine_all_pp_comb(pools_ref):
    possible_combs = []
    for pair_name, paired_pool in pools_ref.items():
        print(pair_name)
        if HH_TAG in pair_name:
            continue
        root_rela, sample_rela = pair_name.split("-")
        root_atts = [f"{x}_{root_rela}" for x in pp_atts]
        sample_atts = [f"{x}_{sample_rela}" for x in pp_atts]
        root_filtered_combs = set(paired_pool.set_index(root_atts).index)
        sample_filtered_combs = set(paired_pool.set_index(sample_atts).index)
        possible_combs.append(root_filtered_combs | sample_filtered_combs)
    return set.union(*possible_combs)

possible_pp_comb_in_pool_refs = combine_all_pp_comb(pools_ref)
len(possible_pp_comb_in_pool_refs)

HH-Main
Main-Spouse
Main-Child
Main-Parent
Main-Sibling
Main-Others
Main-Grandchild
Main-Grandparent
Child-Grandchild
Parent-Grandparent
HH


1056

In [8]:
possible_combs_in_pool = set(pp_pool.set_index(pp_atts).index)
len(possible_combs_in_pool)

886

In [9]:
possible_pp_combs = possible_pp_comb_in_pool_refs & possible_combs_in_pool
len(possible_pp_combs)

886

In [10]:
# Updating the pools
pp_pool = pp_pool.set_index(pp_atts).loc[list(possible_pp_combs)].reset_index()
print(pp_pool.shape)

# NO LONGER DO THIS, AS WE UPDATE THE POOL ON THE WAY LATER
# for pair_name, paired_pool in pools_ref.items():
#     if HH_TAG in pair_name:
#         continue
#     root_rela, sample_rela = pair_name.split("-")
#     root_atts = [f"{x}_{root_rela}" for x in pp_atts]
#     sample_atts = [f"{x}_{sample_rela}" for x in pp_atts]
#     root_filtered_combs = set(paired_pool.set_index(root_atts).index)
#     new_pool = paired_pool.set_index(root_atts).loc[list(possible_pp_combs & root_filtered_combs)].reset_index()
#     sample_filtered_combs = set(new_pool.set_index(sample_atts).index)
#     new_pool = new_pool.set_index(sample_atts).loc[list(possible_pp_combs & sample_filtered_combs)].reset_index()
#     pools_ref[pair_name] = new_pool
#     print(f"Finish {pair_name} with shape {new_pool.shape}")

(5000000, 5)


## Filter pools rela by rela

In [11]:
def cross_check_between_2_pools(pool1, pool2, considered_atts):
    assert set(considered_atts) <= set(pool1.columns)
    assert set(considered_atts) <= set(pool2.columns)
    converted_pool1 = pool1.set_index(considered_atts)
    converted_pool2 = pool2.set_index(considered_atts)
    possible_comb = set(converted_pool1.index) & set(converted_pool2.index)
    result_pool1 = converted_pool1.loc[list(possible_comb)].reset_index()
    result_pool2 = converted_pool2.loc[list(possible_comb)].reset_index()
    return result_pool1, result_pool2

In [22]:
pools_ref[""]

dict_keys(['HH-Main', 'Main-Spouse', 'Main-Child', 'Main-Parent', 'Main-Sibling', 'Main-Others', 'Main-Grandchild', 'Main-Grandparent', 'Child-Grandchild', 'Parent-Grandparent', 'HH'])

In [28]:
for pair_name, paired_pool in pools_ref.items(): # The order matters here
    print(f"Processing {pair_name}")
    if pair_name == HH_TAG:
        continue

    root_rela, sample_rela = pair_name.split("-")
    prev_pools = [x for x in pools_ref.keys() if x.split("-")[-1] == root_rela]
    assert len(prev_pools) == 1
    prev_pool = pools_ref[prev_pools[0]]

    other_pool = pd.DataFrame()
    if sample_rela in prev_pool.columns:
        # This happens for the case of relationships
        prev_pool = prev_pool[prev_pool[sample_rela] > 0]
        other_pool = prev_pool[prev_pool[sample_rela] == 0]
        print(other_pool)

    # So we now handling two pools only
    to_consider_atts = hh_atts if root_rela==HH_TAG else [f"{x}_{root_rela}" for x in pp_atts]
    updated_prev_pool, updated_curr_pool = cross_check_between_2_pools(prev_pool, paired_pool, to_consider_atts)

    if not other_pool.empty:
        updated_prev_pool = pd.concat([updated_prev_pool, other_pool], axis=0)
        print(updated_prev_pool)

    # print(f"Finish {pair_name} with shape {new_pool.shape}")

Processing HH-Main
Processing Main-Spouse
Empty DataFrame
Columns: [age_Main, sex_Main, persinc_Main, nolicence_Main, anywork_Main, dwelltype, hhinc, hhsize, totalvehs, owndwell, Main, Spouse, Child, Parent, Sibling, Others, Grandchild, Grandparent]
Index: []
Processing Main-Child
Empty DataFrame
Columns: [age_Main, sex_Main, persinc_Main, nolicence_Main, anywork_Main, dwelltype, hhinc, hhsize, totalvehs, owndwell, Main, Spouse, Child, Parent, Sibling, Others, Grandchild, Grandparent]
Index: []
Processing Main-Parent
Empty DataFrame
Columns: [age_Main, sex_Main, persinc_Main, nolicence_Main, anywork_Main, dwelltype, hhinc, hhsize, totalvehs, owndwell, Main, Spouse, Child, Parent, Sibling, Others, Grandchild, Grandparent]
Index: []
Processing Main-Sibling
Empty DataFrame
Columns: [age_Main, sex_Main, persinc_Main, nolicence_Main, anywork_Main, dwelltype, hhinc, hhsize, totalvehs, owndwell, Main, Spouse, Child, Parent, Sibling, Others, Grandchild, Grandparent]
Index: []


KeyboardInterrupt: 

In [None]:
temp_pools_hold

In [12]:
# Then do rela by rela (HH-Main then Main others and next layers)
# Make sure for a given HH/Main/root rela it will exist to have a sample rec

# Redo as we are doing it by pairs

# to_consider_tags = ["Main", "Child", "Parent", HH_TAG] # Possible root rela, HH last
# for tag in to_consider_tags:
#     considered_atts = hh_atts if tag == HH_TAG else [f"{x}_{tag}" for x in pp_atts]
#     related_pool_names = [x for x in pools_ref.keys() if tag in x]
#     possible_comb = [] if tag == HH_TAG else [possible_pp_combs]
#     for pool_name in related_pool_names:
#         print(f"Processing {pool_name} for {tag}")
#         pool = pools_ref[pool_name].copy(deep=True)
#         assert set(considered_atts) <= set(pool.columns)
#         converted_pool = pool.set_index(considered_atts)
#         possible_comb.append(set(converted_pool.index))
#     possible_comb = set.intersection(*possible_comb)
#     # update the pool, removing not matched combinations
#     for pool_name in related_pool_names:
#         pool = pools_ref[pool_name].copy(deep=True)
#         converted_pool = pool.set_index(considered_atts)
#         pools_ref[pool_name] = converted_pool.loc[list(possible_comb)].reset_index()


Processing HH-Main for Main
Processing Main-Spouse for Main
Processing Main-Child for Main
Processing Main-Parent for Main
Processing Main-Sibling for Main
Processing Main-Others for Main
Processing Main-Grandchild for Main
Processing Main-Grandparent for Main


KeyboardInterrupt: 

In [13]:
pools_ref["PP"] = pp_pool

In [14]:
for name, pool in pools_ref.items():
    print(f"{name}: {len(pool)}")

HH-Main: 2296016
Main-Spouse: 2980645
Main-Child: 954948
Main-Parent: 1431578
Main-Sibling: 1104735
Main-Others: 2060394
Main-Grandchild: 3177627
Main-Grandparent: 180154
Child-Grandchild: 4399739
Parent-Grandparent: 450807
HH: 4995294
PP: 5000000


## Cross-check with seed data to ensure it can conver all

In [15]:
# We can double check again with actual seed data to make sure it is correct (it can cover all data)
# NOTE: think about the neg inc, it should exist across all and it should be quite rare


## Reweight to match the known distributions

In [16]:
# Then reweight to match with the seed data, maybe use IPF but we do need to maintain the past distributions.

## Condense the pools

In [17]:
pools_ref["HH"][pools_ref["HH"]["hhinc"]=="Negative income"]

Unnamed: 0,dwelltype,owndwell,hhinc,totalvehs,hhsize
358074,Separate House,Fully Owned,Negative income,1,3
1344868,Separate House,Fully Owned,Negative income,2,2
3213636,Separate House,Being Purchased,Negative income,4+,8+
3213637,Separate House,Being Purchased,Negative income,4+,8+
3426338,Separate House,Being Purchased,Negative income,3,5
3761236,Separate House,Being Purchased,Negative income,3,7
3761237,Separate House,Being Purchased,Negative income,3,7
4754651,Terrace/Townhouse,Being Purchased,Negative income,2,3


In [18]:
pools_ref["HH-Main"][pools_ref["HH-Main"]["hhinc"]=="Negative income"]

Unnamed: 0,dwelltype,owndwell,hhinc,totalvehs,hhsize,age_Main,sex_Main,persinc_Main,nolicence_Main,anywork_Main,Main,Spouse,Child,Parent,Sibling,Others,Grandchild,Grandparent
166493,Separate House,Fully Owned,Negative income,1,3,60-69,F,$1000-1249 p.w.,Some Licence,N,1,1,1,0,0,0,0,0
623149,Separate House,Fully Owned,Negative income,2,2,50-59,F,$400-599 p.w.,Some Licence,Y,1,0,1,0,0,0,0,0
1483550,Separate House,Being Purchased,Negative income,4+,8+,40-49,F,$300-399 p.w.,Some Licence,N,1,0,5,0,0,7,1,0
1583217,Separate House,Being Purchased,Negative income,3,5,40-49,M,$800-999 p.w.,Some Licence,Y,1,1,3,0,0,0,0,0
1738240,Separate House,Being Purchased,Negative income,3,7,40-49,M,Negative Income,Some Licence,Y,1,1,5,0,0,0,0,0
2188906,Terrace/Townhouse,Being Purchased,Negative income,2,3,50-59,M,$1-199 p.w.,Some Licence,N,1,1,1,0,0,0,0,0


## Output

In [19]:
# with open(processed_dir / "dict_pool_pairs_check_all_condensed.pickle", "wb") as handle:
#     pickle.dump(pools_ref, handle, protocol=pickle.HIGHEST_PROTOCOL)

If we condensed them all, we later can pack them out?
We will have the sample_col, weights