In [1]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind, kruskal, f_oneway
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols
from sklearn.ensemble import RandomForestRegressor
import sys,os

In [2]:
'''
Data location: 'https://www.fhfa.gov/DataTools/Downloads/Pages/Public-Use-Databases.aspx'
'''
# Relative directory that holds the input data for this notebook.
data_path = '../data/'

In [3]:
# Load the raw frame from the shared input directory (`data_path`) instead of
# repeating the literal path. Columns are addressed by integer label below.
# NOTE: unpickling can execute arbitrary code -- only load files that you
# produced yourself or obtained from a trusted source.
fm = pd.read_pickle(os.path.join(data_path, 'fm_2020_all.gz'))

## Manually label columns

In [4]:
# Identifier fields: raw column number -> readable name (columns 1 and 2).
idx_cols = dict(enumerate(('e_flag', 'id'), start=1))

In [5]:
# Geography fields: raw column number -> readable name.
geo_cols = dict([
    # location codes
    (3, 'state_code'),
    (4, 'msa_code'),
    (5, 'cn_code'),   # county
    (6, 'ct_code'),   # census tract
    # area flags (all boolean)
    (58, 'rural_ct'),
    (59, 'lower_miss_delta_ct'),
    (60, 'middle_app_cn'),
    (61, 'persistent_poverty_cn'),
    (62, 'conc_pov_area'),
    (63, 'high_opp_area'),
    (64, 'qoz_ct'),
])

In [6]:
# Race / ethnicity fields: raw column number -> readable name.
# Column 7 is a tract-level float; columns 19-24 hold five race values plus
# ethnicity with the "b" prefix, and columns 25-30 repeat the same six fields
# with the "cb" (co-borrower) prefix.
race_cols = {7: 'ct_pct_minority'}
race_cols.update(enumerate(
    [
        f'{who}_{field}'
        for who in ('b', 'cb')
        for field in ('race_1st_val', 'race_2nd_val', 'race_3rd_val',
                      'race_4th_val', 'race_5th_val', 'ethnicity')
    ],
    start=19,
))

In [7]:
# Age / gender fields (plus `ltv_at_orig`): raw column number -> readable name.
age_gender_cols = {
    # columns 31-34: gender and age group (ordinal), "b" then "cb"
    **dict(enumerate(['b_gender', 'cb_gender', 'b_age_gr', 'cb_age_gr'], start=31)),
    # columns 40-42: the two `*_gt_62` flags (bool) and `ltv_at_orig` (float)
    **dict(enumerate(['b_gt_62', 'cb_gt_62', 'ltv_at_orig'], start=40)),
}

In [8]:
# Income fields: raw columns 8-14 -> readable name.
# Entries marked "X" are among the columns excluded when the modelling frame
# is built.
income_cols = dict(enumerate(
    [
        'ct_med_income',               # 8  int
        'msa_med_income',              # 9  int
        'ct_med_inc/msa_med_inc',      # 10 X
        'b_income',                    # 11 int
        'msa_med_income_2',            # 12 X
        'b_income_ratio',              # 13 float
        'acquired_unpaid_balance_gr',  # 14 int
    ],
    start=8,
))

In [9]:
# Loan fields: raw column number -> readable name, in three contiguous runs
# (15-18, 35-39, 43-57). Trailing comments give the column number and the
# feature type noted by the author (cat / ord / num / bool, X = not used).
loan_cols = {
    **dict(enumerate([
        'loan_purpose',             # 15 cat
        'fed_guar',                 # 16 cat
        'n_borrowers',              # 17 ord
        'ft_home_buyer',            # 18 bool
    ], start=15)),
    **dict(enumerate([
        'occ_code',                 # 35 cat
        'rate_spread',              # 36 num
        'hoepa_status',             # 37 cat
        'property_type',            # 38 cat
        'lien_status',              # 39 X
    ], start=35)),
    **dict(enumerate([
        'mortgage_note_origin',     # 43 cat
        'mortgage_term_mo',         # 44 num
        'n_units',                  # 45 ord
        'ir_orig',                  # 46 label column, binarize
        'note_amt_gr',              # 47 num
        'preapproval',              # 48 cat
        'app_chan',                 # 49 cat
        'aus_name',                 # 50 cat
        'b_cs_model',               # 51 cat
        'cb_cs_model',              # 52 X
        'debt_to_income_ratio_gr',  # 53 ord
        'discount_pts',             # 54 num
        'intro_rate_period',        # 55 X
        'land_ownership',           # 56 cat
        'property_value_gr',        # 57 num
    ], start=43)),
}

In [10]:
# One lookup table for every labelled column. Groups are merged in the listed
# order, so a later group would win if a column number were ever repeated.
all_cols = {
    col_num: col_name
    for group in (idx_cols, geo_cols, age_gender_cols, race_cols, income_cols, loan_cols)
    for col_num, col_name in group.items()
}

In [11]:
# Features passed to the ordinal encoder (order of this list is the output order).
ordinal_cols = ['b_age_gr', 'n_borrowers', 'n_units', 'debt_to_income_ratio_gr']

In [12]:
# Features passed to the one-hot encoder (order of this list is preserved).
cat_cols = [
    'loan_purpose', 'fed_guar', 'occ_code', 'hoepa_status',
    'property_type', 'mortgage_note_origin', 'preapproval', 'aus_name',
    'b_cs_model', 'land_ownership', 'app_chan',
]

In [13]:
# Features passed to the min-max scaler (order of this list is preserved).
cont_cols = [
    'ct_pct_minority', 'ct_med_income', 'msa_med_income',
    'b_income', 'b_income_ratio', 'acquired_unpaid_balance_gr',
    'rate_spread', 'mortgage_term_mo', 'note_amt_gr',
    'discount_pts', 'property_value_gr', 'ltv_at_orig',
]

### Race

In [14]:
# Show the distinct codes present in each of the five race-value columns
# (19-23). The pieces are collected first and printed in a single call so the
# text layout is the same as listing every column by hand.
race_summary = []
for race_col in range(19, 24):
    race_summary += [race_col, np.unique(fm.loc[:, race_col]), race_cols[race_col], '\n']
print(*race_summary)

19 [1 2 3 4 5 6 7 9] b_race_1st_val 
 20 [1 2 3 4 9] b_race_2nd_val 
 21 [1 2 3 9] b_race_3rd_val 
 22 [1 2 9] b_race_4th_val 
 23 [1 9] b_race_5th_val 



In [15]:
# 1 where column 19 (b_race_1st_val) is 5 and column 24 (b_ethnicity) is 2.
# NOTE(review): presumably the codes for "White" and "not Hispanic" -- confirm
# against the data dictionary.
r_white_non_hisp = (fm[19].eq(5) & fm[24].eq(2)).astype(int)

In [16]:
# 1 where any of the first three race-value columns (19, 20, 21) equals 3.
# NOTE(review): presumably 3 is the code for "Black" -- confirm against the
# data dictionary. Columns 22 and 23 are not inspected.
r_black = fm[[19, 20, 21]].eq(3).any(axis=1).astype(int)

### Gender

In [17]:
# 1 where column 31 (b_gender) is coded 1, else 0.
gender_m = fm[31].eq(1).astype(int)

In [18]:
# 1 where column 31 (b_gender) is coded 2, else 0.
gender_f = fm[31].eq(2).astype(int)

### Apply

In [19]:
# Raw columns left out of the modelling frame:
#   1-6    id and location-code columns
#   19-30  raw race / ethnicity values (indicator columns are derived separately)
#   10, 12, 31, 39, 40, 55  individually excluded columns
dropped_cols = set(range(1, 7)) | set(range(19, 31)) | {55, 12, 31, 39, 40, 10}
# Use a sorted *list*: recent pandas versions reject a set as a .loc indexer,
# and a list makes the resulting column order explicit and reproducible.
new_cols = sorted(set(fm.columns) - dropped_cols)
verbose_cols = [all_cols[c] for c in new_cols]
fm_new = fm.loc[:, new_cols]
fm_new.columns = verbose_cols
## remove co-borrower info
# Match on the 'cb_' prefix rather than the substring 'cb' anywhere in the name.
fm_new = fm_new.loc[:, [v for v in verbose_cols if not v.startswith('cb_')]]

In [20]:
# Append the four derived 0/1 indicator columns (aligned on the row index).
fm_new = fm_new.assign(
    r_white_non_hisp=r_white_non_hisp,
    r_black=r_black,
    gender_m=gender_m,
    gender_f=gender_f,
)

In [21]:
# Keep only rows where at least one race indicator AND at least one gender
# indicator is set.
fm_new_cut = fm_new.loc[
    fm_new[['r_white_non_hisp', 'r_black']].eq(1).any(axis=1)
    & fm_new[['gender_m', 'gender_f']].eq(1).any(axis=1),
    :,
]

In [23]:
# fm_new_cut = fm_new_cut.apply(lambda r: )

In [24]:
# Fraction of rows that survive the race/gender filter.
fm_new_cut.shape[0] / fm_new.shape[0]

0.6689997002649856

In [54]:
# Share of filtered rows with r_white_non_hisp == 1 (mean of a 0/1 column).
fm_new_cut['r_white_non_hisp'].mean()

0.9521132247444342

In [55]:
# Share of filtered rows with r_black == 1 (mean of a 0/1 column).
# The two race shares sum to slightly more than 1, so a few rows carry both flags.
fm_new_cut['r_black'].mean()

0.04941704728580577

In [56]:
# Share of filtered rows with gender_m == 1 (mean of a 0/1 column).
fm_new_cut['gender_m'].mean()

0.6664785495741363

In [57]:
# Share of filtered rows with gender_f == 1 (mean of a 0/1 column).
fm_new_cut['gender_f'].mean()

0.3335214504258637

In [25]:
# Number of columns in the relabelled frame, including the derived indicators.
fm_new.shape[1]

40

In [26]:
from sklearn.preprocessing import LabelBinarizer, StandardScaler, OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.compose import make_column_selector, make_column_transformer, ColumnTransformer
from sklearn.pipeline import Pipeline

In [27]:
# One-hot encoder for the nominal features; categories not seen during fit are
# encoded as all zeros instead of raising.
oh = OneHotEncoder(handle_unknown='ignore')
# OrdinalEncoder only accepts handle_unknown='error' or 'use_encoded_value'
# ('ignore' is a OneHotEncoder option). Unseen categories are mapped to -1,
# which cannot collide with a fitted code (those start at 0).
oc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
# Rescale continuous features to the [0, 1] range seen at fit time.
mm = MinMaxScaler()

In [28]:
# Route each feature group to its encoder/scaler. Columns not named in any
# group are passed through unchanged and appended after the transformed ones.
ct = ColumnTransformer(
    transformers=[
        ('oh', oh, cat_cols),      # one-hot: nominal features
        ('oc', oc, ordinal_cols),  # ordinal codes: ordered features
        ('mm', mm, cont_cols),     # min-max scaling: continuous features
    ],
    remainder='passthrough',
)

In [29]:
# `X` is another name for the filtered frame (plain assignment, no copy).
X = fm_new_cut
# Fit the column transformer on X and transform it in the same call.
X_ct = ct.fit_transform(X)

In [30]:
# (rows, transformed feature columns) of the encoded matrix.
X_ct.shape

(3249749, 63)

In [31]:
# Recover column labels for the transformed matrix. ColumnTransformer fits
# clones of the transformers it is given, so read the one-hot names from the
# fitted encoder stored on `ct` rather than re-fitting `oh` on the full frame.
fitted_oh = ct.named_transformers_['oh']
if hasattr(fitted_oh, 'get_feature_names_out'):
    oh_cols = fitted_oh.get_feature_names_out(cat_cols)
else:
    # Older scikit-learn releases only provide get_feature_names.
    oh_cols = fitted_oh.get_feature_names(cat_cols)
# Label order mirrors the transformer order in `ct`: one-hot, ordinal, scaled,
# then the passthrough remainder in its original column order.
ct_cols = list(oh_cols) + ordinal_cols + cont_cols + \
    list(X.drop(cat_cols + ordinal_cols + cont_cols, axis=1).columns)
# Fail here, not later, if the labels and the matrix ever get out of step.
assert len(ct_cols) == X_ct.shape[1], 'column labels do not match transformed matrix width'

In [32]:
# (number of categorical features, number of one-hot columns they expand to)
tuple(len(cols) for cols in (cat_cols, oh_cols))

(11, 34)

In [33]:
# Columns not claimed by any transformer, i.e. the passthrough remainder.
X.drop(columns=cat_cols + ordinal_cols + cont_cols).columns.tolist()

['ft_home_buyer',
 'ir_orig',
 'rural_ct',
 'lower_miss_delta_ct',
 'middle_app_cn',
 'persistent_poverty_cn',
 'conc_pov_area',
 'high_opp_area',
 'qoz_ct',
 'r_white_non_hisp',
 'r_black',
 'gender_m',
 'gender_f']

In [34]:
# Number of column labels; should equal the second dimension of X_ct.shape.
len(ct_cols)

63

In [36]:
# Wrap the transformed matrix in a labelled DataFrame (fresh default row index).
X_ct_f = pd.DataFrame(X_ct, columns=ct_cols)

In [37]:
# Binarize ft_home_buyer: 1 where the value is exactly 1, otherwise 0.
# A vectorized comparison replaces the per-row Python lambda, which is slow on
# a frame with millions of rows; the resulting values are the same.
X_ct_f['ft_home_buyer'] = X_ct_f['ft_home_buyer'].eq(1).astype('int64')

In [38]:
# Drop one indicator from each pair, keeping r_white_non_hisp and gender_m.
# NOTE(review): presumably done to avoid redundant complementary columns --
# confirm. Re-running this cell raises KeyError because the columns are gone.
X_ct_f = X_ct_f.drop(columns=['r_black', 'gender_f'])

In [39]:
# (rows, columns) after removing the two indicator columns.
X_ct_f.shape

(3249749, 61)

### Interest rate division threshold
 - Binarize the target at the median origination interest rate, so that the two classes are of roughly equal size

In [43]:
# (median, mean) of ir_orig; the median is the candidate split point.
X_ct_f['ir_orig'].median(), X_ct_f['ir_orig'].mean()

(3.12, 3.210376302883245)

In [44]:
# Binary target: 0 for ir_orig at or below the median, 1 above it.
# The threshold is computed from the data instead of being hard-coded, and the
# outer bin edges are open-ended so that a value outside a fixed range is
# still labelled rather than silently turned into NaN.
ir_threshold = X_ct_f.loc[:, 'ir_orig'].median()
y_ct = pd.cut(
    X_ct_f.loc[:, 'ir_orig'],
    bins=[-np.inf, ir_threshold, np.inf],
    labels=[0, 1],
)

### Test train split

In [45]:
from sklearn.model_selection import train_test_split

In [46]:
# Remove the column the target was derived from so it cannot leak into the
# features. Run this only after y_ct has been built: re-running the earlier
# cells that read 'ir_orig' will fail once it is dropped.
X_ct_f = X_ct_f.drop(columns='ir_orig')

In [51]:
# 80/20 shuffle split; the fixed random_state makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_ct_f,
    y_ct,
    test_size=0.2,
    random_state=23,
)

In [52]:
# Shapes of the training features and labels; the row counts should match.
X_train.shape, y_train.shape

((2599799, 60), (2599799,))

In [58]:
# Output directory for the train/test pickles.
# NOTE(review): the input data is read from '../data/' while this path is
# under './data/' -- confirm that the different base directory is intended.
save_data_loc = './data/fm2020/'
# Create the directory up front so the to_pickle calls that follow do not fail
# on a fresh checkout; exist_ok makes re-running this cell harmless.
os.makedirs(save_data_loc, exist_ok=True)

In [59]:
# Persist the training features as a pickle under save_data_loc.
X_train.to_pickle(f'{save_data_loc}X_train.gz')

In [60]:
# Persist the training labels as a pickle under save_data_loc.
y_train.to_pickle(f'{save_data_loc}y_train.gz')

In [61]:
# Persist the held-out features as a pickle under save_data_loc.
X_test.to_pickle(f'{save_data_loc}X_test.gz')

In [62]:
# Persist the held-out labels as a pickle under save_data_loc.
y_test.to_pickle(f'{save_data_loc}y_test.gz')