# Create households from a model learnt from samples, ensuring that each household type is consistent with its household size

- Single person, must be hhsz=1
- Couple with Child, must be hhsz>=3
- Everything else must be hhsz>=2

We will oversample from the model, remove the rows that violate these rules, and then draw the final sample from the remaining valid rows.

In [1]:
from PopSynthesis.Methods.CSP.run.rela_const import EXPECTED_RELATIONSHIPS
from PopSynthesis.Methods.CSP.const import DATA_FOLDER
import pandas as pd
from PopSynthesis.Methods.BN.utils.learn_BN import learn_struct_BN_score, learn_para_BN
from pgmpy.sampling import BayesianModelSampling
from pgmpy.factors.discrete import State

In [2]:
# Read the household samples and discard the two columns that are not used
# as model variables.
hh_samples_w_hhtype = (
    pd.read_csv(DATA_FOLDER / 'hh_samples_w_hhtype.csv')
    .drop(columns=["serialno", "sample_geog"])
)

In [3]:
# Number of rows to draw for the final output (used as the sample size below).
# NOTE(review): the name suggests a count of persons, but it is used as the
# number of household rows to sample - confirm the intended unit.
aim_n_pp = 2_420_222

In [4]:
# Build the model from the household samples with the two project helpers
# (presumably structure learning followed by parameter learning - confirm
# against PopSynthesis.Methods.BN.utils.learn_BN), then draw a large pool of
# synthetic rows from it by forward sampling.
model = learn_para_BN(
    learn_struct_BN_score(hh_samples_w_hhtype),
    hh_samples_w_hhtype,
)
sampler = BayesianModelSampling(model)
pool = sampler.forward_sample(size=10_000_000)

  0%|          | 0/1000000 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]



In [5]:
# Show how many sampled rows fall into each household type.
pool.loc[:, "hh_type"].value_counts()

hh_type
Couple with Child     3124670
Couple Only           2736601
Single Person         2243922
Group Household        683881
Complicated Family     634879
Single Parent          576047
Name: count, dtype: int64

In [6]:
def filter_by_hhtype(row):
    """
    Tell whether a row's household size is allowed for its household type.

    The row is indexed by 'hhsize' and 'hh_type'. The size label "8+" is
    treated as 8; any other size value is converted with int().

    Returns True when:
      - the type is 'Single Person' and the size is exactly 1,
      - the type is 'Couple with Child' and the size is at least 3,
      - the type is anything else and the size is at least 2.
    Returns False otherwise.
    """
    raw_size = row['hhsize']
    size = 8 if raw_size == "8+" else int(raw_size)
    kind = row['hh_type']

    # Each household type has exactly one admissible size range.
    if kind == 'Single Person':
        return size == 1
    if kind == 'Couple with Child':
        return size >= 3
    return size >= 2

# Flag every row of the pool whose household size is compatible with its
# household type. This applies the same rules as filter_by_hhtype, but on
# whole columns: a row-wise DataFrame.apply(axis=1) makes one Python call
# (and builds one Series) per row, which is very slow on a pool this large.
# Sizes are parsed exactly as in filter_by_hhtype: "8+" counts as 8 and
# everything else goes through int().
_hhsz = pool["hhsize"].map(lambda v: 8 if v == "8+" else int(v)).astype(int)
_is_single = pool["hh_type"] == "Single Person"
_is_couple_child = pool["hh_type"] == "Couple with Child"
pool["check"] = (
    (_is_single & (_hhsz == 1))
    | (_is_couple_child & (_hhsz >= 3))
    | (~_is_single & ~_is_couple_child & (_hhsz >= 2))
)

In [7]:
# Keep only the rows flagged as valid, then remove the helper flag column.
pool_new = pool.loc[pool["check"]].drop(columns="check")

In [8]:
# Display the filtered pool (rows where "check" was True, flag column dropped).
pool_new

Unnamed: 0,dwelltype,owndwell,hhinc,totalvehs,hhsize,hh_type
0,Separate House,Fully Owned,500-649,1,2,Couple Only
1,Separate House,Being Rented,800-999,1,2,Couple Only
2,Separate House,Fully Owned,1750-1999,1,2,Couple Only
3,Separate House,Being Rented,500-649,2,2,Couple Only
4,Separate House,Being Rented,1250-1499,2,2,Complicated Family
...,...,...,...,...,...,...
9999995,Separate House,Being Purchased,5000-5999,2,5,Complicated Family
9999996,Separate House,Fully Owned,3000-3499,2,1,Single Person
9999997,Flat or Apartment,Being Rented,1750-1999,1,1,Single Person
9999998,Separate House,Being Purchased,1750-1999,1,1,Single Person


In [13]:
# Draw aim_n_pp rows from the filtered pool without replacement (the default
# for DataFrame.sample), then display the result.
samples = pool_new.sample(n=aim_n_pp)
samples

Unnamed: 0,dwelltype,owndwell,hhinc,totalvehs,hhsize,hh_type
7626927,Flat or Apartment,Being Rented,3000-3499,1,5,Couple with Child
1740922,Terrace/Townhouse,Being Rented,2500-2999,2,6,Group Household
4392632,Separate House,Being Purchased,2500-2999,1,4,Couple with Child
3033444,Flat or Apartment,Being Rented,800-999,0,1,Single Person
8118621,Separate House,Fully Owned,3500-3999,1,6,Complicated Family
...,...,...,...,...,...,...
5066421,Separate House,Being Purchased,1250-1499,2,3,Couple with Child
5919361,Other,Being Rented,650-799,2,2,Single Parent
4156894,Separate House,Fully Owned,Nil income,1,1,Single Person
8568214,Separate House,Being Purchased,4000-4499,3,4,Complicated Family


In [14]:
# Write the final sample to the data folder, without the index column.
samples.to_csv(
    path_or_buf=DATA_FOLDER / 'hh_pureBN_hhtype_filter.csv',
    index=False,
)