In [26]:
import os,sys
sys.path.insert(1, '../')
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patheffects as pe
import seaborn as sns
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from scipy import sparse
from tqdm import tqdm
import pdb

from poisson_utils import get_data, get_tree, acs_filter, transform_features

In [27]:
# Configuration, data loading and tree training for the census education example.
#
# Year passed to get_data; get_tree below is called with the preceding year.
curr_year = 2022
# When True, features are passed through transform_features before use.
transform = False
# Feature codes to use
feature_names =['PINCP','AGEP', 'PUBCOV','PRIVCOV','MAR','SCH','DIS','ESP','CIT','MIG','MIL','ANC','NATIVITY','DEAR','DEYE','DREM','SEX','RAC1P']
# Per-feature type flags, one per entry of feature_names ("q" for the codes listed
# here, "c" for everything else). Derived from feature_names rather than typed out
# by hand so the two sequences can never differ in length or order.
# NOTE(review): presumably "q" = quantitative and "c" = categorical -- confirm
# against poisson_utils.transform_features.
quantitative_features = ('PINCP', 'AGEP')
ft = np.array(["q" if name in quantitative_features else "c" for name in feature_names])
regression_feature_name = "SEX"
outcome_name = "SCHL"
features, outcome = get_data(curr_year, feature_names, outcome_name, regression_feature_name, acs_filter=acs_filter)
# Regression covariate pulled out of the feature frame.
reg_feat = features[regression_feature_name]
if transform:
    # enc is handed to get_tree below together with the transform flag.
    features, enc = transform_features(features, ft)
else:
    enc = None

np.random.seed(0) # Fix seed for tree
tree = get_tree(curr_year-1, feature_names, ft, outcome_name, regression_feature_name, enc=enc, transform=transform, acs_filter=acs_filter)
np.random.seed(0) # Fix seed for evaluation
# Evaluate the tree on the curr_year features.
predicted_outcome = tree.predict(xgb.DMatrix(features))

Training tree without transformation.
[0]	eval-error:-15.06485	eval-mae:1717.70648	train-error:-15.11779	train-mae:1717.65486
[1]	eval-error:-15.06485	eval-mae:1634.47306	train-error:-15.11779	train-mae:1634.42283
[2]	eval-error:-15.06485	eval-mae:1555.27195	train-error:-15.11779	train-mae:1555.22301
[3]	eval-error:-15.06485	eval-mae:1479.90788	train-error:-15.11779	train-mae:1479.86021
[4]	eval-error:-15.06485	eval-mae:1408.19453	train-error:-15.11779	train-mae:1408.14829
[5]	eval-error:-15.06485	eval-mae:1339.95612	train-error:-15.11779	train-mae:1339.91067




[6]	eval-error:-15.06485	eval-mae:1275.02331	train-error:-15.11779	train-mae:1274.97878
[7]	eval-error:-15.06485	eval-mae:1213.23606	train-error:-15.11779	train-mae:1213.19257
[8]	eval-error:-15.06485	eval-mae:1154.44221	train-error:-15.11779	train-mae:1154.39961
[9]	eval-error:-15.06485	eval-mae:1098.49634	train-error:-15.11779	train-mae:1098.45498
[10]	eval-error:-15.06485	eval-mae:1045.26075	train-error:-15.11779	train-mae:1045.22061
[11]	eval-error:-15.06485	eval-mae:994.60457	train-error:-15.11779	train-mae:994.56528
[12]	eval-error:-15.06485	eval-mae:946.40235	train-error:-15.11779	train-mae:946.36393
[13]	eval-error:-15.06485	eval-mae:900.53518	train-error:-15.11779	train-mae:900.49779
[14]	eval-error:-15.06485	eval-mae:856.89024	train-error:-15.11779	train-mae:856.85366
[15]	eval-error:-15.06485	eval-mae:815.35994	train-error:-15.11779	train-mae:815.32393
[16]	eval-error:-15.06485	eval-mae:775.84175	train-error:-15.11779	train-mae:775.80616
[17]	eval-error:-15.06485	eval-mae:73

In [28]:
# Print the variance of the outcome next to the variance of the prediction residuals.
residuals = outcome - predicted_outcome
print(outcome.var(), residuals.var())

36.11279613920831 11.472895853371755


In [29]:
# Design matrix: the four selected feature columns followed by a column of ones.
design_columns = ['SEX', 'PINCP', 'RAC1P', 'NATIVITY']
X = np.hstack((features[design_columns].to_numpy(), np.ones((len(features), 1))))

# Rescale columns 1 (PINCP) and 2 (RAC1P) so that each has the same Euclidean
# norm as column 0 (SEX). The norms are taken once up front; rescaling column 1
# does not change the norms of columns 0 or 2, so the values match a
# recomputation before each step.
column_norms = np.linalg.norm(X, axis=0)
for col in (1, 2):
    X[:, col] = X[:, col] * (column_norms[0] / column_norms[col])

# Tree predictions and true outcomes as arrays.
Yhat = predicted_outcome
Y = outcome.to_numpy()

In [30]:
# Write the design matrix, predictions and outcomes into one .npz archive,
# creating the output directory first if it does not exist yet.
os.makedirs('./data', exist_ok=True)
np.savez('./data/census_education.npz', X=X, Yhat=Yhat, Y=Y)