In [14]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind, kruskal, f_oneway
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols
from sklearn.ensemble import RandomForestRegressor
import sys,os

In [16]:
'''
Data location: 'https://www.fhfa.gov/DataTools/Downloads/Pages/Public-Use-Databases.aspx'
'''
# Base location for all inputs/outputs of this notebook.
# NOTE(review): later cells use this both as a directory (os.listdir(data_path),
# data_path + filename) and as the file handed to import_housing_data_as_list,
# which open()s it. Both cannot hold for the same value -- confirm the raw file's path.
data_path = '../data/'

In [4]:
###
# Read dataset
# Unfortunately the file uses arbitrary whitespace, so pandas read_csv didn't work well;
# maybe could use np.loadtxt instead.
# Starting out with a 10% sample for quickness.
###
def import_housing_data_as_list(data_path,sampling=False,window=None):
    """Read a whitespace-delimited text file into a NumPy array of string tokens.

    Each line is split on arbitrary runs of whitespace (``str.split()``), so
    every row becomes a list of string fields.

    Parameters
    ----------
    data_path : str
        Path of the text file to read.
    sampling : bool, default False
        If True, keep only every 10th line (line indices 0, 10, 20, ...).
        Takes precedence over ``window``.
    window : sequence of two ints, optional
        ``[first, last]`` zero-based line indices, both inclusive. Reading
        stops as soon as ``last`` has been passed. ``None`` or an empty
        sequence means "no window".

    Returns
    -------
    numpy.ndarray
        ``np.array`` built from the list of token lists (2-D when every kept
        line has the same number of fields).
    """
    use_window = window is not None and len(window) > 0
    out = []
    # `with` guarantees the handle is closed even if parsing raises, and
    # iterating the handle streams lines instead of loading the whole file
    # up front (which matters when only a small window is requested).
    with open(data_path,'r') as f:
        for i,line in enumerate(f):
            if sampling:
                if (i % 10) == 0:
                    out.append(line.split())
            elif use_window:
                if i > window[1]:
                    break
                if i >= window[0]:
                    out.append(line.split())
            else:
                out.append(line.split())
    return np.array(out)

In [5]:
# Peek at the first 100 lines (indices 0-99, inclusive) to inspect the parsed layout.
# NOTE(review): data_path is assigned a directory in the config cell, while the
# loader open()s its argument as a file -- confirm which path this ran against.
housing_list_segment = import_housing_data_as_list(data_path, window=[0, 99])

In [6]:
# Quick check of the parsed segment's dimensions (rows, columns).
housing_list_segment.shape

(100, 64)

In [12]:
# h_df_raw.to_parquet('../data/fm_data_raw.parquet',engine='pyarrow')

In [15]:
# h_df = pd.read_parquet('../data/fm_data_raw.parquet',engine='pyarrow')

In [16]:
# Row count of the currently loaded segment, with thousands separators.
# NOTE(review): the text says "full dataset", but housing_list_segment only holds
# the window loaded earlier -- confirm the wording or the variable.
'There are {:,} people in the full dataset'.format(housing_list_segment.shape[0])

'There are 100 people in the full dataset'

In [7]:
# Scratch: start offsets 0, 500k, ..., 4.5M in steps of 500k.
# Presumably the candidate values for `idx` in the chunk loop below, which
# currently iterates over [0] only -- TODO confirm.
np.arange(0,5e6,5e5)

array([      0.,  500000., 1000000., 1500000., 2000000., 2500000.,
       3000000., 3500000., 4000000., 4500000.])

In [8]:
# Column labels run 1..64; subtracting 1 turns them into 0-based array positions.
cols = list(range(1,65))
float_cols = [7,10,13,36,42,46,53]
# Build the integer-column list in ascending order explicitly rather than
# relying on set iteration order, so the saved column order is deterministic.
int_cols = [col for col in cols if col not in float_cols]
float_idx = np.subtract(float_cols,1)
int_idx = np.subtract(int_cols,1)
for idx in [0]:
    # NOTE: the loader treats both window bounds as inclusive, so each chunk
    # holds up to 500,001 lines and its last line is also the first line of the
    # next window. The stitching cell drops the last row of every chunk.
    wdw = [int(idx),int(idx+5e5)]
    print('loading {}-{}...'.format(wdw[0],wdw[1]))
    housing_list_segment = import_housing_data_as_list(data_path,sampling=False,window=wdw)
    print('converting...')
    # Convert all columns of one dtype in a single vectorised step instead of
    # looping column by column and transposing afterwards.
    float_block = housing_list_segment[:,float_idx].astype('float64')
    # Integers are parsed via float64 first, as in the original per-column code
    # (presumably some integer fields are not plain int literals -- TODO confirm).
    int_block = housing_list_segment[:,int_idx].astype('float64').astype('int32')
    h_df = pd.concat([
        pd.DataFrame(columns=int_cols,data=int_block),
        pd.DataFrame(columns=float_cols,data=float_block)],axis=1)
    h_df.to_pickle('../data/fm_{}-{}.gz'.format(wdw[0],wdw[1]),compression='gzip')
    print('saved {}-{}...'.format(wdw[0],wdw[1]))

loading 0-500000...
converting...
saved 0-500000...


In [9]:
# Load the pickle for window 0-500000 (the file name the conversion loop writes for idx=0).
hf = pd.read_pickle('../data/fm_0-500000.gz')

In [11]:
# Size of hf as reported by sys.getsizeof, converted from bytes to MiB.
sys.getsizeof(hf)/1024**2

135.42216110229492

In [55]:
# Stitch the per-window pickles found in data_path into one DataFrame.
# - The combined output 'fm_2020_all.gz' is written to the same folder by a
#   later cell, so it is excluded here; otherwise a re-run would load it as
#   just another chunk and duplicate every row.
# - sorted() makes the load order reproducible (os.listdir order is arbitrary).
#   The order is lexicographic by file name, not numeric by window start.
chunk_files = sorted(
    f for f in os.listdir(data_path)
    if '.gz' in f and f != 'fm_2020_all.gz'
)
chunks = []
n_rows = 0
for f in chunk_files:
    hf_tmp = pd.read_pickle(data_path+f)
    # Drop the last row of each chunk (the windows were saved with inclusive
    # bounds, so it repeats the first row of the following window).
    # NOTE(review): this also drops the last row of the final chunk, which has
    # no following window -- confirm that is intended.
    chunk = hf_tmp.iloc[:-1,:]
    chunks.append(chunk)
    n_rows += len(chunk)
    print((n_rows, chunk.shape[1]))
# Concatenate once at the end: DataFrame.append was removed in pandas 2.0 and
# re-copies the whole accumulated frame on every iteration.
hf = pd.concat(chunks) if chunks else pd.DataFrame()

(500000, 64)
(1000000, 64)
(1500000, 64)
(2000000, 64)
(2500000, 64)
(3000000, 64)
(3500000, 64)
(4000000, 64)
(4357624, 64)
(4857624, 64)


In [58]:
# Write the combined frame to a single gzip-compressed pickle.
hf.to_pickle('../data/fm_2020_all.gz',compression='gzip')

In [59]:
# Reload the combined frame from its pickle (compression is inferred from the .gz suffix).
hf = pd.read_pickle('../data/fm_2020_all.gz')

### Perform binning

In [52]:
# Half-point bin edges 1.5, 2.0, ..., 7.0 for column 46
# (presumably the interest rate, given the 'ir_bin' name used next -- TODO confirm).
bins = np.arange(1.5, 7.5, 0.5)
# labels=False returns each row's integer bin position instead of an Interval.
# Intervals are right-closed (pd.cut default); include_lowest also closes the
# first interval on the left. Values outside [1.5, 7.0] come back as NaN.
c = pd.cut(hf[46], bins=bins, include_lowest=True, labels=False)

In [53]:
# Translate each row's bin position into that bin's left edge (e.g. 3.00 falls
# in (2.5, 3.0] and is labelled 2.5).
# pd.cut yields NaN for values outside the bin range, which also turns the codes
# into floats. Indexing `bins` with those raised IndexError in the per-row
# lambda, so out-of-range rows are kept as NaN instead, and the lookup is done
# in one vectorised step rather than one Python call per row.
bin_codes = c.to_numpy(dtype='float64')
in_range = ~np.isnan(bin_codes)
left_edges = np.full(bin_codes.shape, np.nan)
left_edges[in_range] = bins[bin_codes[in_range].astype(int)]
# `c` was derived from hf's own column, so positional assignment lines up row for row.
hf['ir_bin'] = left_edges

In [54]:
# Show the assigned bin label next to the raw column-46 value to eyeball the binning.
hf[['ir_bin',46]]

Unnamed: 0,ir_bin,46
0,2.5,3.00
1,3.5,3.62
2,3.0,3.12
3,2.5,2.99
4,3.5,3.99
...,...,...
499995,3.5,3.75
499996,2.5,2.75
499997,2.0,2.37
499998,4.5,4.75


In [None]:
# Overwrite the combined pickle with the current hf (which, in document order,
# now carries the ir_bin column). gzip is spelled out to match the earlier save;
# it is what the default 'infer' picks for a .gz suffix anyway.
hf.to_pickle('../data/fm_2020_all.gz', compression='gzip')