# Create households from a model learnt from samples, ensuring that each household matches its household size

- Single person, must be hhsz=1
- Couple with Child, must be hhsz>=3
- Everything else must be hhsz>=2

We will oversample from the model, remove the rows that violate these rules, and then sample again from the remaining valid rows.

In [1]:
from PopSynthesis.Methods.CSP.run.rela_const import EXPECTED_RELATIONSHIPS
from PopSynthesis.Methods.CSP.const import DATA_FOLDER
import pandas as pd
from PopSynthesis.Methods.BN.utils.learn_BN import learn_struct_BN_score, learn_para_BN
from pgmpy.sampling import BayesianModelSampling
from pgmpy.factors.discrete import State

In [2]:
# Per-relationship count columns ("n_<relationship>"); these keep their names.
n_rela_cols = [f"n_{rela}" for rela in EXPECTED_RELATIONSHIPS]
# Load the household samples and strip "HH_" from every column name that is
# not one of the relationship-count columns.
hh_samples_w_hhtype = pd.read_csv(
    DATA_FOLDER / 'hh_samples_w_hhtype_n_counts.csv'
).rename(
    columns=lambda col: col if col in n_rela_cols else col.replace("HH_", "")
)
hh_samples_w_hhtype

Unnamed: 0,dwelltype,hhinc,totalvehs,owndwell,hhsize,hh_type,n_Spouse,n_Child,n_Parent,n_Grandparent,n_Grandchild,n_Sibling,n_Others,n_Main
0,Separate House,1000-1249,2,Fully Owned,4,Couple with Child,1,2,0,0,0,0,0,1
1,Separate House,1500-1749,3,Being Purchased,4,Complicated Family,0,0,2,0,0,1,0,1
2,Separate House,1000-1249,2,Being Purchased,4,Couple with Child,1,2,0,0,0,0,0,1
3,Separate House,1750-1999,3,Fully Owned,3,Couple with Child,1,1,0,0,0,0,0,1
4,Separate House,1750-1999,1,Being Purchased,3,Single Parent,0,2,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29237,Separate House,Nil income,3,Being Purchased,2,Couple Only,1,0,0,0,0,0,0,1
29238,Separate House,500-649,2,Fully Owned,2,Couple Only,1,0,0,0,0,0,0,1
29239,Separate House,650-799,2,Fully Owned,2,Couple Only,1,0,0,0,0,0,0,1
29240,Separate House,400-499,1,Fully Owned,1,Single Person,0,0,0,0,0,0,0,1


In [3]:
# Target number of households to generate; the per-hh_type sample sizes
# are derived from this total.
aim_n_hh = 2_420_222

In [4]:
# Fit a Bayesian network to the household samples in two steps using the
# project helpers: first the structure, then the parameters.
# NOTE(review): presumably a score-based structure search followed by CPD
# estimation — confirm in PopSynthesis.Methods.BN.utils.learn_BN.
model = learn_struct_BN_score(hh_samples_w_hhtype)
model = learn_para_BN(model, hh_samples_w_hhtype)
sampler = BayesianModelSampling(model)
# Deliberately draw far more rows than aim_n_hh: inconsistent rows are
# discarded by the filters that follow, and the final households are then
# resampled from whatever survives.
pool = sampler.forward_sample(size=10000000)

  0%|          | 0/1000000 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]



In [5]:
pool["hh_type"].value_counts()

hh_type
Couple with Child     3127739
Couple Only           2732025
Single Person         2244036
Group Household        684999
Complicated Family     636007
Single Parent          575194
Name: count, dtype: int64

In [6]:
def filter_by_hhtype(row):
    """
    Tell whether a row's household size is compatible with its household type.

    The row must expose ``'hhsize'`` (an integer-like value, or the string
    ``"8+"`` which is treated as 8) and ``'hh_type'``.

    Rules:
      * ``'Single Person'``     -> size must be exactly 1
      * ``'Couple with Child'`` -> size must be at least 3
      * any other type          -> size must be at least 2

    Returns True when the rule for the row's type holds, False otherwise.
    """
    raw_size = row['hhsize']
    # "8+" is an open-ended bucket; its lower bound is enough for these checks.
    size = 8 if raw_size == "8+" else int(raw_size)
    kind = row['hh_type']

    if kind == 'Single Person':
        return size == 1
    if kind == 'Couple with Child':
        return size >= 3
    return size >= 2

# Flag every sampled row whose hhsize is consistent with its hh_type
# (row-wise apply, so each row is passed to filter_by_hhtype individually).
pool["check"] = pool.apply(filter_by_hhtype, axis=1)

In [7]:
# Keep only the rows that passed the hh_type/hhsize check, then drop the helper flag.
pool_new = pool[pool["check"]].drop(columns=["check"])

In [8]:
pool_new[pool_new["hh_type"]=="Couple with Child"]["hhsize"].value_counts()

hhsize
4     1474170
3     1082096
5      453309
6       94986
7       16457
8+       6656
Name: count, dtype: int64

In [9]:
def filter_by_hhsz(row):
    """
    Tell whether a row's relationship-count total agrees with its household size.

    The row must expose ``"rela_tot"`` (the summed relationship counts) and
    ``'hhsize'``. For the open-ended ``"8+"`` bucket the total only has to be
    at least 8; for any other size it must equal ``int(hhsize)`` exactly.
    """
    n_members = row["rela_tot"]
    declared = row['hhsize']
    return n_members >= 8 if declared == "8+" else n_members == int(declared)


# Household size implied by the relationship counts: the row-wise sum of all
# n_<relationship> columns.
# NOTE(review): this only equals the head count if n_rela_cols includes
# n_Main — confirm against EXPECTED_RELATIONSHIPS.
pool_new["rela_tot"] = pool_new[n_rela_cols].sum(axis=1)
# Flag rows whose summed relationship counts agree with the declared hhsize.
pool_new["check"] = pool_new.apply(filter_by_hhsz, axis=1)

In [10]:
# Keep the rows whose counts match hhsize and drop both helper columns.
pool_new2 = pool_new[pool_new["check"]].drop(columns=["check", "rela_tot"])

In [11]:
len(pool_new2)

9047721

In [12]:
# By now Single Person rows must be correct; every other household type still needs to be checked
def check_rela_match(row):
    """
    Check that the relationship counts of a row agree with its household type.

    The row must expose ``'hh_type'`` and the count fields ``n_Main``,
    ``n_Spouse``, ``n_Child``, ``n_Parent``, ``n_Grandchild``,
    ``n_Grandparent``, ``n_Sibling`` and ``n_Others``.

    The counts determine the one household type that is accepted:
      * any "Others" present                      -> "Group Household"
      * nobody besides the main person            -> "Single Person"
      * spouse(s) and child(ren) only             -> "Couple with Child"
      * spouse(s) only                            -> "Couple Only"
      * child(ren) only                           -> "Single Parent"
      * every remaining combination               -> "Complicated Family"

    Returns True when ``hh_type`` equals the accepted type.
    Raises AssertionError when ``n_Main`` is not 1.
    """
    assert row["n_Main"] == 1, "Main person should be 1"
    declared_type = row['hh_type']
    spouses = row['n_Spouse']
    children = row['n_Child']
    parents = row['n_Parent']
    grandchildren = row['n_Grandchild']
    grandparents = row['n_Grandparent']
    siblings = row['n_Sibling']
    others = row['n_Others']

    # The presence of unrelated members decides the type on its own.
    if others > 0:
        return declared_type == "Group Household"

    # Shared precondition of all the nuclear-family shapes below.
    nuclear_only = (
        parents == 0
        and grandchildren == 0
        and grandparents == 0
        and siblings == 0
        and others == 0
    )
    if nuclear_only:
        if spouses == 0 and children == 0:
            return declared_type == "Single Person"
        if spouses >= 1 and children >= 1:
            return declared_type == "Couple with Child"
        if spouses >= 1 and children == 0:
            return declared_type == "Couple Only"
        if spouses == 0 and children >= 1:
            return declared_type == "Single Parent"

    # Anything that did not match a shape above counts as a complicated family.
    return declared_type == "Complicated Family"

# Flag rows whose relationship counts are consistent with their hh_type.
pool_new2["check"] = pool_new2.apply(check_rela_match, axis=1)

In [13]:
# Final pool of valid households: rows that passed the relationship check, without the helper flag.
pool_new3 = pool_new2[pool_new2["check"]].drop(columns=["check"])

In [14]:
pool_new3

Unnamed: 0,dwelltype,hhinc,totalvehs,owndwell,hhsize,hh_type,n_Spouse,n_Child,n_Parent,n_Grandparent,n_Grandchild,n_Sibling,n_Others,n_Main
0,Other,1750-1999,3,Being Rented,4,Couple with Child,1,2,0,0,0,0,0,1
2,Other,2000-2499,1,Being Rented,5,Couple with Child,1,3,0,0,0,0,0,1
3,Separate House,3000-3499,3,Fully Owned,2,Couple Only,1,0,0,0,0,0,0,1
4,Separate House,1500-1749,0,Being Purchased,3,Couple with Child,1,1,0,0,0,0,0,1
5,Separate House,650-799,1,Fully Owned,1,Single Person,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9999995,Other,2500-2999,1,Being Rented,2,Couple Only,1,0,0,0,0,0,0,1
9999996,Flat or Apartment,800-999,0,Being Rented,1,Single Person,0,0,0,0,0,0,0,1
9999997,Flat or Apartment,1-149,1,Being Rented,1,Single Person,0,0,0,0,0,0,0,1
9999998,Separate House,1000-1249,1,Something Else,1,Single Person,0,0,0,0,0,0,0,1


## Sampling to create results
To force the result to match the hh_type distribution, compute the target count of each hh_type from the hh_type marginal; this should drive the hhsize and relationship counts as well.

In [15]:
# Target number of households per hh_type: scale the observed hh_type shares
# (from the input samples) up to aim_n_hh.
raw_n_hhtype = hh_samples_w_hhtype["hh_type"].value_counts(normalize=True) * aim_n_hh
# Rounding each type independently can leave the grand total a few households
# above or below aim_n_hh. Use largest-remainder allocation instead: floor
# every count, then hand the leftover households to the types with the
# largest fractional parts, so the counts always sum to aim_n_hh.
expected_n_hhtype = raw_n_hhtype.astype(int)  # values are non-negative, so this floors
n_leftover = aim_n_hh - int(expected_n_hhtype.sum())
if n_leftover > 0:
    top_up = (raw_n_hhtype - expected_n_hhtype).nlargest(n_leftover).index
    expected_n_hhtype.loc[top_up] += 1

In [16]:
# For each household type, draw its target number of rows (with replacement)
# from the matching slice of the filtered pool, then stack all draws into one
# frame with a fresh 0..n-1 index.
results = [
    pool_new3[pool_new3["hh_type"] == hhtype].sample(n=n_hh, replace=True)
    for hhtype, n_hh in expected_n_hhtype.items()
]
samples = pd.concat(results, ignore_index=True)

In [17]:
# samples = pool_new3.sample(aim_n_hh, replace=False)
# samples

In [19]:
# Give every sampled household a sequential, 1-based serial number.
samples["serialno"] = range(1, len(samples) + 1)
samples

Unnamed: 0,dwelltype,hhinc,totalvehs,owndwell,hhsize,hh_type,n_Spouse,n_Child,n_Parent,n_Grandparent,n_Grandchild,n_Sibling,n_Others,n_Main,serialno
0,Separate House,2000-2499,3,Being Rented,6,Couple with Child,1,4,0,0,0,0,0,1,1
1,Terrace/Townhouse,5000-5999,1,Being Purchased,5,Couple with Child,1,3,0,0,0,0,0,1,2
2,Separate House,1-149,2,Fully Owned,4,Couple with Child,1,2,0,0,0,0,0,1,3
3,Separate House,2500-2999,2,Being Purchased,5,Couple with Child,1,3,0,0,0,0,0,1,4
4,Separate House,1250-1499,3,Being Rented,5,Couple with Child,1,3,0,0,0,0,0,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2420216,Separate House,1500-1749,3,Being Purchased,3,Single Parent,0,2,0,0,0,0,0,1,2420217
2420217,Separate House,800-999,1,Being Rented,4,Single Parent,0,3,0,0,0,0,0,1,2420218
2420218,Flat or Apartment,1000-1249,1,Being Rented,2,Single Parent,0,1,0,0,0,0,0,1,2420219
2420219,Separate House,1500-1749,2,Fully Owned,4,Single Parent,0,3,0,0,0,0,0,1,2420220


In [20]:
# Write the synthesised households to the data folder; index=False leaves the
# positional DataFrame index out of the file.
samples.to_csv(DATA_FOLDER / 'hh_pureBN_hhtype_filter_constrained.csv', index=False)