# Create households from a model learnt from samples, ensuring each household type is consistent with its household size

- Single person, must be hhsz=1
- Couple with Child, must be hhsz>=3
- Everything else must be hhsz>=2

We will sample more households than needed, remove the invalid ones, and then sample the target number from the remaining valid ones

In [15]:
from PopSynthesis.Methods.CSP.run.rela_const import EXPECTED_RELATIONSHIPS
from PopSynthesis.Methods.CSP.const import DATA_FOLDER
import pandas as pd
from PopSynthesis.Methods.BN.utils.learn_BN import learn_struct_BN_score, learn_para_BN
from pgmpy.sampling import BayesianModelSampling
from pgmpy.factors.discrete import State

In [20]:
# Load the household samples from the data folder.
hh_samples_w_hhtype = pd.read_csv(DATA_FOLDER / 'hh_samples_w_hhtype_n_counts.csv')

# Names of the per-relationship count columns (e.g. "n_Spouse").
n_rela_cols = [f"n_{rela}" for rela in EXPECTED_RELATIONSHIPS]

# Strip "HH_" from every column name except the relationship-count columns,
# which are passed through unchanged.
hh_samples_w_hhtype = hh_samples_w_hhtype.rename(
    columns=lambda col: col if col in n_rela_cols else col.replace("HH_", "")
)
hh_samples_w_hhtype

Unnamed: 0,dwelltype,hhinc,totalvehs,owndwell,hhsize,hh_type,n_Spouse,n_Child,n_Parent,n_Grandparent,n_Grandchild,n_Sibling,n_Others,n_Main
0,Separate House,1000-1249,2,Fully Owned,4,Couple with Child,1,2,0,0,0,0,0,1
1,Separate House,1500-1749,3,Being Purchased,4,Complicated Family,0,0,2,0,0,1,0,1
2,Separate House,1000-1249,2,Being Purchased,4,Couple with Child,1,2,0,0,0,0,0,1
3,Separate House,1750-1999,3,Fully Owned,3,Couple with Child,1,1,0,0,0,0,0,1
4,Separate House,1750-1999,1,Being Purchased,3,Single Parent,0,2,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29237,Separate House,Nil income,3,Being Purchased,2,Couple Only,1,0,0,0,0,0,0,1
29238,Separate House,500-649,2,Fully Owned,2,Couple Only,1,0,0,0,0,0,0,1
29239,Separate House,650-799,2,Fully Owned,2,Couple Only,1,0,0,0,0,0,0,1
29240,Separate House,400-499,1,Fully Owned,1,Single Person,0,0,0,0,0,0,0,1


In [21]:
# Target number of households; used as the sample size for the final draw.
aim_n_hh = 2_420_222

In [22]:
# Fit a Bayesian network to the household samples. Judging by the helper
# names, the inner call learns the structure and the outer call learns the
# parameters -- TODO confirm against PopSynthesis.Methods.BN.utils.learn_BN.
model = learn_para_BN(
    learn_struct_BN_score(hh_samples_w_hhtype),
    hh_samples_w_hhtype,
)
sampler = BayesianModelSampling(model)

# Draw far more rows than aim_n_hh so that enough remain after the
# consistency filters applied in the following cells.
pool = sampler.forward_sample(size=10_000_000)

  0%|          | 0/1000000 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]



In [25]:
# Count how many sampled rows fall into each household type.
pool["hh_type"].value_counts()

hh_type
Couple with Child     3127872
Couple Only           2733910
Single Person         2242827
Group Household        685533
Complicated Family     636222
Single Parent          573636
Name: count, dtype: int64

In [26]:
def filter_by_hhtype(row):
    """
    Tell whether a row's household size is allowed for its household type.

    The row must provide 'hhsize' (an integer-like value or the string "8+",
    which is treated as 8) and 'hh_type'.

    Rules:
      - 'Single Person' needs a size of exactly 1.
      - 'Couple with Child' needs a size of at least 3.
      - Every other type needs a size of at least 2.

    Returns True when the row satisfies the rule for its type, else False.
    """
    raw_size = row['hhsize']
    # "8+" is the open-ended top category; use its lower bound.
    size = 8 if raw_size == "8+" else int(raw_size)

    kind = row['hh_type']
    if kind == 'Single Person':
        return size == 1
    if kind == 'Couple with Child':
        return size >= 3
    return size >= 2

# Flag each sampled row: True when its household size is allowed for its
# household type (row-wise apply, so this is slow on a large pool).
pool["check"] = pool.apply(filter_by_hhtype, axis=1)

In [27]:
# Keep only the flagged rows and drop the helper column.
pool_new = pool[pool["check"]].drop(columns=["check"])

In [30]:
# Household-size distribution of the remaining "Couple with Child" rows.
pool_new[pool_new["hh_type"]=="Couple with Child"]["hhsize"].value_counts()

hhsize
4     1474414
3     1081310
5      454035
6       95017
7       16417
8+       6618
Name: count, dtype: int64

In [36]:
def filter_by_hhsz(row):
    """
    Tell whether a row's relationship total agrees with its household size.

    The row must provide 'rela_tot' and 'hhsize'. A size of "8+" accepts any
    total of 8 or more; any other size must equal the total exactly.
    """
    member_total = row["rela_tot"]
    declared_size = row['hhsize']
    return (
        member_total >= 8
        if declared_size == "8+"
        else member_total == int(declared_size)
    )


# Row-wise sum of the per-relationship count columns.
pool_new["rela_tot"] = pool_new[n_rela_cols].sum(axis=1)
# Flag rows whose relationship total agrees with the declared household size.
pool_new["check"] = pool_new.apply(filter_by_hhsz, axis=1)

In [37]:
# Keep the rows that passed the size check and drop both helper columns.
pool_new2 = pool_new[pool_new["check"]].drop(columns=["check", "rela_tot"])

In [40]:
# Number of rows left after the household-size consistency filter.
len(pool_new2)

9046349

In [41]:
# Single Person rows should already be correct at this point; every other household type still needs to be checked
def check_rela_match(row):
    """
    Tell whether a row's relationship counts agree with its household type.

    The row must provide 'hh_type', 'n_Main' and the counts 'n_Spouse',
    'n_Child', 'n_Parent', 'n_Grandchild', 'n_Grandparent', 'n_Sibling'
    and 'n_Others'.

    The household type implied by the counts is:
      - any 'Others' present              -> "Group Household"
      - nobody besides the main person    -> "Single Person"
      - spouse and child(ren) only        -> "Couple with Child"
      - spouse only                       -> "Couple Only"
      - child(ren) only                   -> "Single Parent"
      - anything else                     -> "Complicated Family"

    Returns True when 'hh_type' equals the implied type.
    Raises AssertionError when 'n_Main' is not 1.
    """
    assert row["n_Main"] == 1, "Main person should be 1"
    declared_type = row['hh_type']

    # Read every count up front so a missing column fails immediately.
    spouses = row['n_Spouse']
    children = row['n_Child']
    parents = row['n_Parent']
    grandchildren = row['n_Grandchild']
    grandparents = row['n_Grandparent']
    siblings = row['n_Sibling']
    others = row['n_Others']

    # Any unrelated member makes this a group household, whatever else is set.
    if others > 0:
        return declared_type == "Group Household"

    # True when the household has no members beyond main, spouse and children.
    nuclear_only = all(
        count == 0
        for count in (parents, grandchildren, grandparents, siblings, others)
    )

    if not nuclear_only:
        implied_type = "Complicated Family"
    elif spouses == 0 and children == 0:
        implied_type = "Single Person"
    elif spouses >= 1 and children >= 1:
        implied_type = "Couple with Child"
    elif spouses >= 1 and children == 0:
        implied_type = "Couple Only"
    elif spouses == 0 and children >= 1:
        implied_type = "Single Parent"
    else:
        implied_type = "Complicated Family"

    return declared_type == implied_type

# Flag rows whose relationship counts agree with their household type.
pool_new2["check"] = pool_new2.apply(check_rela_match, axis=1)

In [42]:
# Keep only the flagged rows and drop the helper column.
pool_new3 = pool_new2[pool_new2["check"]].drop(columns=["check"])

In [43]:
# Display the pool that passed all three filters.
pool_new3

Unnamed: 0,dwelltype,hhinc,totalvehs,owndwell,hhsize,hh_type,n_Spouse,n_Child,n_Parent,n_Grandparent,n_Grandchild,n_Sibling,n_Others,n_Main
0,Separate House,1000-1249,0,Fully Owned,1,Single Person,0,0,0,0,0,0,0,1
1,Separate House,1000-1249,2,Fully Owned,2,Couple Only,1,0,0,0,0,0,0,1
2,Terrace/Townhouse,1250-1499,1,Being Rented,2,Couple Only,1,0,0,0,0,0,0,1
3,Separate House,150-299,2,Being Rented,2,Single Parent,0,1,0,0,0,0,0,1
4,Separate House,2000-2499,1,Fully Owned,2,Couple Only,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9999994,Other,5000-5999,1,Fully Owned,2,Couple Only,1,0,0,0,0,0,0,1
9999995,Separate House,2000-2499,3,Being Rented,5,Couple with Child,1,3,0,0,0,0,0,1
9999996,Separate House,4500-4999,2,Being Purchased,3,Couple with Child,1,1,0,0,0,0,0,1
9999997,Flat or Apartment,1-149,1,Being Rented,1,Single Person,0,0,0,0,0,0,0,1


In [44]:
# Draw aim_n_hh distinct rows from the filtered pool (replace=False), so the
# pool must hold at least aim_n_hh rows.
# NOTE(review): no random_state is passed, so the selected subset will
# presumably differ between runs unless NumPy's global seed is set elsewhere.
samples = pool_new3.sample(aim_n_hh, replace=False)
samples

Unnamed: 0,dwelltype,hhinc,totalvehs,owndwell,hhsize,hh_type,n_Spouse,n_Child,n_Parent,n_Grandparent,n_Grandchild,n_Sibling,n_Others,n_Main
7187872,Separate House,300-399,2,Being Purchased,2,Couple Only,1,0,0,0,0,0,0,1
9411208,Separate House,3500-3999,2,Being Purchased,2,Couple Only,1,0,0,0,0,0,0,1
1142371,Flat or Apartment,2500-2999,0,Fully Owned,1,Single Person,0,0,0,0,0,0,0,1
14862,Separate House,1500-1749,1,Fully Owned,2,Single Parent,0,1,0,0,0,0,0,1
9230485,Separate House,150-299,1,Fully Owned,2,Couple Only,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9828189,Separate House,500-649,1,Being Rented,1,Single Person,0,0,0,0,0,0,0,1
9379194,Terrace/Townhouse,800-999,1,Fully Owned,1,Single Person,0,0,0,0,0,0,0,1
8320957,Separate House,1750-1999,2,Fully Owned,2,Couple Only,1,0,0,0,0,0,0,1
556953,Separate House,3000-3499,3,Being Rented,3,Couple with Child,1,1,0,0,0,0,0,1


In [45]:
# Write the sampled households to the data folder; index=False leaves out the
# row labels carried over from the pool.
samples.to_csv(DATA_FOLDER / 'hh_pureBN_hhtype_filter.csv', index=False)