As per my EDA into the distribution of Census variables (i.e. income, rent, bedrooms) by Census block group, I will transform School, Bedroom, Value, Rent, and Income into the mode for each row. 

I can further transform Bedroom, Value, Rent, and Income to numerical values.

For Poverty and Race, I will transform the counts into percentages.

In [46]:
from __future__ import division
import pandas as pd
from IPython.display import display
import re

In [2]:
df = pd.read_hdf('../data/data_w_descs_and_census.h5')
df.shape

(905650, 155)

## Transforming first set of categories

In [47]:
def get_bedroom(txt):
    if txt[-1] == '+':
        return 6 # I chose this arbitrarily over 5
    else:
        return int(txt[-1])

In [61]:
def get_rent(txt):
    if txt[-1] == '+':
        return 3750
    else:
        regex = re.compile(r'rent_(\d+)_(\d+)')
        match = regex.search(txt)
        low, high = [int(i) for i in match.groups()]
        return int((high + low) / 2) + 1

In [75]:
def get_income(txt):
    if txt[-1] == '+':
        return 250000
    else:
        regex = re.compile(r'income_(\d+)_(\d+)')
        match = regex.search(txt)
        low, high = [int(i) for i in match.groups()]
        return int((high + low) / 2) + 1        

In [79]:
def get_value(txt):
    if txt[-1] == '+':
        return 2.5e6
    else:
        regex = re.compile(r'value_(\d+)_(\d+)')
        match = regex.search(txt)
        low, high = [int(i) for i in match.groups()]
        return int((high + low) / 2) + 1        

In [160]:
def transform_categ_mode(df, category):
    if category in ('race', 'poverty'):
        raise Exception('not gonna transform those categs')
    
    d = {
        'bedroom': 'bedroom_total_ppl', 
        'school': 'school_total', 
        'rent': 'rent_total', 
        'income': 'income_total', 
        'value': 'value_total',
        'housing': 'housing_total'
    }
    assert category in d.keys()
    
    category_total_col = d[category]
    category_cols = [col for col in df.columns if category + '_' == col[:len(category) + 1]]
    categ_df = df[category_cols]
    
    if category == 'income': # super-hacky, but this is a one-time process
        categ_df = categ_df.drop('income_per_capita', axis=1)
    
    max_categ_df = categ_df[[col for col in categ_df.columns if col != category_total_col]].idxmax(axis=1)
#     return max_categ_df
    new_df = df.drop(category_cols, axis=1)    
    
    if category == 'bedroom':
        new_df[category] = max_categ_df.map(lambda txt: get_bedroom(txt))
    elif category == 'school':
        new_df[category] = max_categ_df.map(lambda txt: txt.replace('school_', ''))
    elif category == 'rent':
        new_df[category] = max_categ_df.map(lambda txt: get_rent(txt))
    elif category == 'income':
        new_df[category] = max_categ_df.map(lambda txt: get_income(txt))
    elif category == 'value':
        new_df[category] = max_categ_df.map(lambda txt: get_value(txt)) 
    elif category == 'housing':
        new_df[category] = max_categ_df.map(lambda txt: txt[8:])
        
    return new_df

In [149]:
aa = transform_categ_mode(df, 'school')
aa[aa.isnull()]

74        NaN
4613      NaN
6762      NaN
7090      NaN
7571      NaN
12551     NaN
13108     NaN
13649     NaN
15904     NaN
19404     NaN
19450     NaN
23139     NaN
25490     NaN
26075     NaN
26323     NaN
27623     NaN
30191     NaN
30332     NaN
31413     NaN
39351     NaN
40833     NaN
42803     NaN
44950     NaN
45563     NaN
45961     NaN
48578     NaN
50865     NaN
52597     NaN
52738     NaN
55016     NaN
         ... 
848661    NaN
850133    NaN
851090    NaN
852498    NaN
855450    NaN
860543    NaN
862434    NaN
865641    NaN
870390    NaN
870936    NaN
871095    NaN
872808    NaN
875063    NaN
875168    NaN
875513    NaN
877632    NaN
882243    NaN
882398    NaN
883555    NaN
884408    NaN
886198    NaN
886922    NaN
888377    NaN
890710    NaN
891027    NaN
892138    NaN
892665    NaN
893215    NaN
899810    NaN
902884    NaN
dtype: object

In [165]:
aa.isnull().sum()

445

In [166]:
df.isnull().sum()

CASE_ENQUIRY_ID                        0
OPEN_DT                                0
TARGET_DT                         197308
CLOSED_DT                          70634
OnTime_Status                        662
CASE_STATUS                            0
CLOSURE_REASON                     70425
CASE_TITLE                           484
SUBJECT                                0
REASON                                 0
TYPE                                   0
QUEUE                                  0
Department                             0
SubmittedPhoto                    770245
ClosedPhoto                       831941
Location                           39262
fire_district                      42516
pwd_district                       40649
city_council_district              39437
police_district                    40423
neighborhood                       40090
neighborhood_services_district     39439
ward                               39442
precinct                           42192
land_usage      

In [152]:
df[aa.isnull()].Property_Type.drop_duplicates()

74       Intersection
19450         Address
Name: Property_Type, dtype: object

In [154]:
df.iloc[19450:19451].T.loc['LATITUDE':]

Unnamed: 0,19450
LATITUDE,42.3536
LONGITUDE,-71.1737
Source,Citizens Connect App
Geocoded_Location,"(42.3536, -71.1737)"
case_enquiry_id,1.01001e+11
description,Dead squirrel
specific_location,\n Location: \n\t\t\t\tRoadway\n\t\t\t
title,"Dead Animal Pick-up at 733 Washington St, Brig..."
tract_and_block_group,3731001
bedroom_total_ppl,


In [134]:
category = 'school'
df.iloc[74][[col for col in df.columns if category + '_' == col[:len(category) + 1]]]

school_total                        NaN
school_0_none                       NaN
school_1_preschool                  NaN
school_2_kindergarden               NaN
school_3_1st_grade                  NaN
school_4_2nd_grade                  NaN
school_5_3rd_grade                  NaN
school_6_4th_grade                  NaN
school_7_5th_grade                  NaN
school_8_6th_grade                  NaN
school_9_7th_grade                  NaN
school_10_8th_grade                 NaN
school_11_9th_grade                 NaN
school_12_10th_grade                NaN
school_13_11th_grade                NaN
school_14_12th_grade_no_diploma     NaN
school_15_hs_diploma                NaN
school_16_ged                       NaN
school_17_less_than_1_yr            NaN
school_18_some_college_no_degree    NaN
school_19_associates                NaN
school_20_bachelors                 NaN
school_21_masters                   NaN
school_22_professional_school       NaN
school_23_doctorate                 NaN


In [164]:
transform_categ_mode(df, 'rent').rent.head()

TypeError: 'float' object has no attribute '__getitem__'

In [125]:
for categ in ['school', 'housing', 'bedroom', 'value', 'rent', 'income']:
    df = transform_categ_mode(df, categ)
    
df.shape

AttributeError: 'float' object has no attribute 'replace'

## Now to make percentages for Poverty and Race

In [111]:
def transform_poverty_race(df, category):
    assert category in ('race', 'poverty')
    d = {
        'poverty': 'poverty_total_pop', 
        'race': 'race_total'
    } 
    
    total_col_name = d[category]
    new_df = df[[col for col in df.columns if category + '_' == col[:len(category) + 1]]]
    df_transformed_values = zip(*new_df.apply(
        lambda row: [row[list(new_df.columns).index(col)] / row[total_col_name] for col in new_df.columns if col != total_col_name],
        axis=1)
    )    
    
    if category == 'poverty':
        new_df['poverty_pop_below_poverty_level'], \
            new_df['poverty_pop_w_public_assistance'], \
            new_df['poverty_pop_w_food_stamps'], \
            new_df['poverty_pop_w_ssi'] = df_transformed_values
    elif category == 'race':
        new_df['race_white'], \
            new_df['race_black'], \
            new_df['race_asian'], \
            new_df['race_hispanic'], \
            new_df['race_other'] = df_transformed_values
    
    return new_df.drop(total_col_name, axis=1)

In [None]:
for categ in ('race', 'poverty'):
    df = transform_poverty_race(df, categ)
    
df.shape

In [None]:
df.head(1).T