Based on my EDA of how the Census variables (e.g. income, rent, bedrooms) are distributed within each Census block group, I will collapse the School, Housing, Bedroom, Value, Rent, and Income bucket counts into a single value per row: the mode, i.e. the bucket with the highest count.

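I can then convert the modal bucket for Bedroom, Value, Rent, and Income into a numerical value (roughly the midpoint of the bucket's range, or a fixed value for the open-ended top bucket).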

For Poverty and Race, I will transform the counts into proportions of the corresponding total (fractions between 0 and 1).

In [1]:
from __future__ import division
import pandas as pd
from IPython.display import display
import re

In [2]:
# Load the input dataset from HDF5.
df = pd.read_hdf('../data/data_w_descs_and_census.h5')
# Show (rows, columns) as a baseline before any transformation.
df.shape

(905650, 155)

In [3]:
# old_df = df.copy()

## Transforming first set of categories

In [4]:
def get_bedroom(txt):
    """Convert a bedroom bucket label into a bedroom count.

    The count is read from the final character of ``txt``. A trailing
    ``'+'`` marks the open-ended top bucket and is mapped to 6 (a value
    picked arbitrarily in preference to 5).
    """
    last_char = txt[-1]
    return 6 if last_char == '+' else int(last_char)

In [5]:
def get_rent(txt):
    """Convert a rent bucket label into a representative amount.

    Labels ending in ``'+'`` (the open-ended top bucket) map to 3750.
    Any other label must contain ``rent_<low>_<high>``; the result is the
    truncated mean of the two bounds, plus one.
    """
    if txt[-1] == '+':
        return 3750

    bounds = re.search(r'rent_(\d+)_(\d+)', txt)
    low, high = (int(group) for group in bounds.groups())
    return int((high + low) / 2) + 1

In [6]:
def get_income(txt):
    """Convert an income bucket label into a representative amount.

    A label whose last character is ``'+'`` is the open-ended top bucket
    and yields 250000. Otherwise the label must contain
    ``income_<low>_<high>`` and the truncated mean of the two bounds, plus
    one, is returned.
    """
    is_top_bucket = txt[-1] == '+'
    if is_top_bucket:
        return 250000

    found = re.search(r'income_(\d+)_(\d+)', txt)
    lower, upper = map(int, found.groups())
    return int((upper + lower) / 2) + 1

In [7]:
def get_value(txt):
    """Convert a home-value bucket label into a representative amount.

    A label ending in ``'+'`` (the open-ended top bucket) yields the float
    ``2.5e6``. Any other label must contain ``value_<low>_<high>`` and
    yields an int: the truncated mean of the two bounds, plus one.
    """
    if txt[-1] == '+':
        return 2.5e6

    parsed = re.search(r'value_(\d+)_(\d+)', txt)
    endpoints = [int(part) for part in parsed.groups()]
    low, high = endpoints
    return int((high + low) / 2) + 1

In [8]:
def transform_categ_mode(df, category):
    """Collapse one category's bucket columns into a single modal column.

    Every column whose name starts with ``category + '_'`` is removed from
    the result and replaced by one column named ``category``. Its value is
    derived from the name of the bucket column holding the row's largest
    value (``idxmax`` across the bucket columns). The category's total
    column and, for income, ``income_per_capita`` are not candidates.

    * ``bedroom``, ``rent``, ``income``, ``value``: converted to a number
      by ``get_bedroom`` / ``get_rent`` / ``get_income`` / ``get_value``.
    * ``school``: the bucket name with ``'school_'`` removed.
    * ``housing``: the bucket name with the leading ``'housing_'`` removed.

    Args:
        df: DataFrame containing the category's columns. It is not modified.
        category: One of 'bedroom', 'school', 'rent', 'income', 'value',
            'housing'.

    Returns:
        A new DataFrame without the ``<category>_*`` columns and with the
        added ``category`` column.

    Raises:
        ValueError: If ``category`` is 'race' or 'poverty' (those are turned
            into proportions elsewhere), or is not a supported category.
    """
    if category in ('race', 'poverty'):
        raise ValueError('not gonna transform those categs')

    total_col_by_category = {
        'bedroom': 'bedroom_total_ppl',
        'school': 'school_total',
        'rent': 'rent_total',
        'income': 'income_total',
        'value': 'value_total',
        'housing': 'housing_total',
    }
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if category not in total_col_by_category:
        raise ValueError('unsupported category: %r' % (category,))

    prefix = category + '_'
    category_cols = [col for col in df.columns if col.startswith(prefix)]

    # Columns that share the prefix but are not buckets, so they must not be
    # able to win the idxmax.
    non_bucket_cols = {total_col_by_category[category]}
    if category == 'income':
        # Super-hacky, but this is a one-time process.
        # NOTE: 'income_per_capita' matches the 'income_' prefix, so it is
        # also removed from the returned frame along with the buckets.
        non_bucket_cols.add('income_per_capita')
    bucket_cols = [col for col in category_cols if col not in non_bucket_cols]

    # Name of the bucket column with the largest value in each row.
    modal_bucket = df[bucket_cols].idxmax(axis=1)
    new_df = df.drop(category_cols, axis=1)

    if category == 'bedroom':
        new_df[category] = modal_bucket.map(get_bedroom)
    elif category == 'school':
        new_df[category] = modal_bucket.map(lambda txt: txt.replace('school_', ''))
    elif category == 'rent':
        new_df[category] = modal_bucket.map(get_rent)
    elif category == 'income':
        new_df[category] = modal_bucket.map(get_income)
    elif category == 'value':
        new_df[category] = modal_bucket.map(get_value)
    elif category == 'housing':
        new_df[category] = modal_bucket.map(lambda txt: txt[len('housing_'):])

    return new_df

## Now we start the transformations

In [9]:
# Shape before any transformation, for comparison with the steps below.
df.shape

(905650, 155)

In [10]:
# Drop rows with no 'housing_own' value, then collapse the school_* columns
# into a single 'school' column holding the modal bucket name.
# NOTE(review): presumably the dropped rows are the ones lacking Census data,
# for which there would be no modal bucket to pick -- confirm.
df1 = transform_categ_mode(df.dropna(subset=['housing_own']), 'school')
df1.shape

(905205, 131)

In [11]:
# Collapse the housing_* columns into a single 'housing' column (modal bucket name).
df2 = transform_categ_mode(df1, 'housing')
df2.shape

(905205, 129)

In [12]:
# Collapse the bedroom_* columns into a single numeric 'bedroom' column.
df3 = transform_categ_mode(df2, 'bedroom')
df3.shape

(905205, 123)

In [13]:
# Collapse the value_* columns into a single numeric 'value' column.
df4 = transform_categ_mode(df3, 'value')
df4.shape

(905205, 97)

In [14]:
# Collapse the rent_* columns into a single numeric 'rent' column.
df5 = transform_categ_mode(df4, 'rent')
df5.shape

(905205, 73)

In [15]:
# Collapse the income_* columns into a single numeric 'income' column.
# Every column starting with 'income_' is removed, including 'income_per_capita'.
df6 = transform_categ_mode(df5, 'income')
df6.shape

(905205, 56)

## Now to make percentages for Poverty and Race

In [16]:
def transform_poverty_race(df, category):
    """Replace a category's count columns with proportions of its total.

    Each column whose name starts with ``category + '_'`` (other than the
    category's total column) is divided, row by row, by the total column.
    The proportions overwrite the count columns under the same names, and
    the total column is dropped from the result.

    The division is done by column name, so the result does not depend on
    the order in which the category's columns appear in ``df``.

    Args:
        df: DataFrame containing the category's count columns and its total
            column ('poverty_total_pop' or 'race_total'). It is not modified.
        category: Either 'race' or 'poverty'.

    Returns:
        A new DataFrame with the count columns converted to fractions of the
        total and the total column removed.

    Raises:
        ValueError: If ``category`` is neither 'race' nor 'poverty'.
    """
    total_col_by_category = {
        'poverty': 'poverty_total_pop',
        'race': 'race_total',
    }
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if category not in total_col_by_category:
        raise ValueError('unsupported category: %r' % (category,))

    total_col_name = total_col_by_category[category]
    prefix = category + '_'
    count_cols = [
        col for col in df.columns
        if col.startswith(prefix) and col != total_col_name
    ]

    # Work on a copy so the caller's frame is left untouched.
    result = df.copy()
    if count_cols:
        # axis=0 aligns the totals with the rows, dividing every count column
        # of a row by that row's total.
        result[count_cols] = result[count_cols].div(result[total_col_name], axis=0)

    return result.drop(total_col_name, axis=1)

In [17]:
# Convert the race_* counts into fractions of 'race_total' and drop the total column.
df7 = transform_poverty_race(df6, 'race')
df7.shape

(905205, 55)

In [18]:
# Convert the poverty_* counts into fractions of 'poverty_total_pop' and drop the total column.
df8 = transform_poverty_race(df7, 'poverty')
df8.shape

(905205, 54)

## Let's look at one row

In [19]:
# Take the single row at position 100, transpose it so the columns read
# vertically, and show only the entries from 'tract_and_block_group' onward.
df8.iloc[100:101].T.loc['tract_and_block_group':]

Unnamed: 0,101
tract_and_block_group,0406001
race_white,0.728709
race_black,0.114135
race_asian,0.0561896
race_hispanic,0.0412643
race_other,0.0597015
poverty_pop_below_poverty_level,0.0443038
poverty_pop_w_public_assistance,0.0147679
poverty_pop_w_food_stamps,0.0590717
poverty_pop_w_ssi,0.0147679


## Let's save

In [20]:
# Remove the listed columns before saving.
# NOTE(review): these look like identifier / free-text / photo fields -- confirm
# nothing downstream needs them.
df9 = df8.drop(['case_enquiry_id', 'description', 'specific_location', 'title', 'Geocoded_Location', 'ClosedPhoto'], axis=1)

In [21]:
# Persist the transformed frame as a pickle.
df9.to_pickle('../data/data_w_transformed_census.pkl')

In [22]:
# NOTE(review): this lists 'data_w_descs_and_transformed_census.pkl', but the
# cell above writes 'data_w_transformed_census.pkl' -- confirm which file is the
# intended output; the listing may be from an earlier run.
!ls -lh ../data/data_w_descs_and_transformed_census.pkl

-rw-rw-r-- 1 ubuntu ubuntu 539M Feb  1 00:12 ../data/data_w_descs_and_transformed_census.pkl
