In [16]:
import pandas as pd
from PopSynthesis.DataProcessor.utils.general_utils import find_file
import numpy as np
from PopSynthesis.DataProcessor.utils.const_files import (
    hh_seed_file,
    pp_seed_file,
    raw_data_dir,
    processed_data_dir,
    output_dir,
)

In [17]:
# Resolve the person-level seed file with the project's find_file helper
# (its lookup semantics are not shown in this notebook) and load it as a DataFrame.
pp_file = find_file(base_path=raw_data_dir, filename=pp_seed_file)
pp_df = pd.read_csv(pp_file)

In [18]:
# Frequency of each relationship label across all person records.
pp_df["relationship"].value_counts()

relationship
Self              30504
Child             24201
Spouse            19715
Unrelated          1816
Other relative     1169
Grandchild         1051
Sibling             413
Other               109
Name: count, dtype: int64

In [19]:
def convert_income(income_str):
    """Convert an income bracket label into a single integer code.

    The label is matched by substring, in this order:

    * contains ``"Negative"`` -> ``-1``
    * contains ``"Missing"``  -> ``-2``
    * contains ``"Zero"``     -> ``0``
    * contains ``"-"`` (a range such as ``"$400-$599"``) -> the lower bound
    * contains ``"+"`` (open-ended top bracket) -> ``2000``

    Args:
        income_str: The bracket label as a string.

    Returns:
        int: The numeric code described above.

    Raises:
        ValueError: If the label matches none of the known patterns, or the
            lower bound of a range cannot be parsed as an integer.
    """
    if "Negative" in income_str:
        return -1
    if "Missing" in income_str:
        return -2
    if "Zero" in income_str:
        return 0
    if "-" in income_str:
        lower_bound = income_str.split("-")[0]
        # Drop the currency symbol and any thousands separators so that labels
        # such as "$1,000-$1,249" parse instead of crashing int().
        return int(lower_bound.replace("$", "").replace(",", "").strip())
    if "+" in income_str:
        # NOTE(review): the top bracket is coded as a fixed 2000 rather than
        # parsed from the label -- presumably the bracket's lower bound; confirm.
        return 2000
    # Name the offending label so bad rows can be located in the data.
    raise ValueError(f"Unrecognised income label: {income_str!r}")
# Map every income bracket label in `persinc` to its integer code.
pp_df["converted_income"] = pp_df["persinc"].map(convert_income)

In [22]:
# Pack each person's id, age, income code and relationship into one list
# so a household's members can later be gathered into a single cell.
pp_df["id_combine"] = pp_df.apply(
    lambda person: [
        person[col]
        for col in ("persid", "age", "converted_income", "relationship")
    ],
    axis=1,
)

In [23]:
# Gather every member's record per household, so households where "Self"
# is not the oldest member can be checked.
gb_pid = pp_df.groupby("hhid")["id_combine"].apply(list)
gb_pid

hhid
Y12H0000101    [[Y12H0000101P01, 50, 1000, Self], [Y12H000010...
Y12H0000102    [[Y12H0000102P01, 57, 1, Spouse], [Y12H0000102...
Y12H0000103    [[Y12H0000103P01, 47, 400, Self], [Y12H0000103...
Y12H0000104    [[Y12H0000104P01, 56, 1250, Self], [Y12H000010...
Y12H0000107    [[Y12H0000107P01, 41, 1500, Self], [Y12H000010...
                                     ...                        
Y20H4001024    [[Y20H4001024P01, 56, 0, Self], [Y20H4001024P0...
Y20H4001028    [[Y20H4001028P01, 64, 0, Self], [Y20H4001028P0...
Y20H4001029    [[Y20H4001029P01, 73, 300, Self], [Y20H4001029...
Y20H4001036                    [[Y20H4001036P01, 75, 400, Self]]
Y20H4001039                    [[Y20H4001039P01, 76, 600, Self]]
Name: id_combine, Length: 30803, dtype: object

In [24]:
def idx_max_val_return(ls):
    """Return the index of the largest element of ``ls``.

    Ties resolve to the earliest position. An empty iterable yields ``None``.
    """
    best_pos, best = None, None
    for pos, candidate in enumerate(ls):
        # Skip anything that does not strictly beat the current best; while no
        # best has been recorded yet, the candidate is always accepted.
        if best is not None and not (candidate > best):
            continue
        best_pos, best = pos, candidate
    return best_pos

def find_idx_value(ls, find_val):
    """Return the index of the first element equal to ``find_val``, else ``None``."""
    matches = (pos for pos, item in enumerate(ls) if item == find_val)
    return next(matches, None)

In [43]:
def process_info_each_house(r, child_min_age_gap=16, grandchild_min_age_gap=33):
    """Summarise the relationship structure of one household.

    The first member of the household is always treated as "Self", regardless
    of the relationship label recorded for them.

    Args:
        r: A row (or mapping) whose ``"id_combine"`` entry is a non-empty list
            of ``[persid, age, income, relationship]`` records, one per member.
        child_min_age_gap: A "Child" whose age gap to the first member is below
            this value marks the household as a special case.
        grandchild_min_age_gap: Same check for "Grandchild" members.

    Returns:
        tuple: ``(oldest_rela, highest_income_rela, num_pp, num_self,
        num_spouse, num_unrelated, num_other, special_case)`` where the first
        two are the relationship labels of the oldest and the highest-income
        member (earliest member wins ties), the ``num_*`` values are member
        counts, and ``special_case`` is ``"Child"``, ``"GrandChild"`` or
        ``"Nothing"``.
    """
    members = r["id_combine"]

    # Split the records column-wise with zip instead of np.array(...).T: numpy
    # coerces these mixed str/number records to strings, so a float-typed age
    # such as 50.0 would become "50.0" and fail integer parsing.
    _, raw_ages, raw_incomes, raw_relas = zip(*members)
    ages = [int(age) for age in raw_ages]
    incomes = [int(income) for income in raw_incomes]
    relas = list(raw_relas)
    relas[0] = "Self"  # the first listed member is the reference person

    # Flag children / grandchildren whose age gap to the reference person is
    # below the threshold. When several members match, the last match wins.
    self_age = ages[0]
    special_case = "Nothing"
    for rela, age in zip(relas, ages):
        age_gap = self_age - age
        if rela == "Child" and age_gap < child_min_age_gap:
            special_case = "Child"
        elif rela == "Grandchild" and age_gap < grandchild_min_age_gap:
            special_case = "GrandChild"

    # list.index(max(...)) picks the earliest member holding the maximum.
    the_oldest_rela = relas[ages.index(max(ages))]
    the_highest_income_rela = relas[incomes.index(max(incomes))]

    num_pp = len(relas)
    num_self = relas.count("Self")
    num_spouse = relas.count("Spouse")
    num_unrelated = relas.count("Unrelated")
    num_other = relas.count("Other")
    return (
        the_oldest_rela,
        the_highest_income_rela,
        num_pp,
        num_self,
        num_spouse,
        num_unrelated,
        num_other,
        special_case,
    )
# One row per household: keep the grouped member records and add the
# summary columns computed by process_info_each_house.
pid_df = gb_pid.to_frame()
corr_cols = [
    "oldest",
    "highest_inc",
    "num_pp",
    "num_self",
    "num_spouse",
    "num_unrelated",
    "num_other",
    "special_case",
]
pid_df[corr_cols] = pid_df.apply(
    process_info_each_house, axis=1, result_type="expand"
)

In [44]:
# Display the per-household summary table (the rendered output is truncated).
pid_df

Unnamed: 0_level_0,id_combine,oldest,highest_inc,num_pp,num_self,num_spouse,num_unrelated,num_other,special_case
hhid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Y12H0000101,"[[Y12H0000101P01, 50, 1000, Self], [Y12H000010...",Self,Self,4,1,1,0,0,Nothing
Y12H0000102,"[[Y12H0000102P01, 57, 1, Spouse], [Y12H0000102...",Self,Child,4,1,1,0,0,Nothing
Y12H0000103,"[[Y12H0000103P01, 47, 400, Self], [Y12H0000103...",Self,Self,4,1,1,0,0,Nothing
Y12H0000104,"[[Y12H0000104P01, 56, 1250, Self], [Y12H000010...",Self,Spouse,3,1,1,0,0,Nothing
Y12H0000107,"[[Y12H0000107P01, 41, 1500, Self], [Y12H000010...",Self,Self,3,1,0,0,0,Nothing
...,...,...,...,...,...,...,...,...,...
Y20H4001024,"[[Y20H4001024P01, 56, 0, Self], [Y20H4001024P0...",Self,Self,2,1,1,0,0,Nothing
Y20H4001028,"[[Y20H4001028P01, 64, 0, Self], [Y20H4001028P0...",Self,Spouse,2,1,1,0,0,Nothing
Y20H4001029,"[[Y20H4001029P01, 73, 300, Self], [Y20H4001029...",Self,Self,2,1,1,0,0,Nothing
Y20H4001036,"[[Y20H4001036P01, 75, 400, Self]]",Self,Self,1,1,0,0,0,Nothing


In [45]:
# Households flagged with a Child / GrandChild special case, and how many there are.
a = pid_df.loc[pid_df["special_case"].ne("Nothing")]
len(a)

15

In [36]:
# Write the per-household summary to CSV for manual inspection.
# NOTE(review): the path is relative to the kernel's working directory, and the
# imported `output_dir` constant is not used here -- confirm this is intended.
pid_df.to_csv("./check_rela.csv")