# Feature engineering
Modify survey responses to be interpretable

In [36]:
# import packages
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [37]:
# Read the filtered, renamed survey responses produced upstream
survey_path = '../data/intermediate/01_filtered_renamed_survey_data.csv'
survey_data = pd.read_csv(survey_path)

## Variable transformations
Demographic variables are handled separately, in the survey balancing notebook, so they are not transformed here.

In [38]:
# Start the feature table with the respondent id only. 'id' is kept as a
# regular column for now; it becomes the index just before the join with the
# coded moving reasons.
transformed_variables = pd.DataFrame({'id': survey_data['id']})

In [39]:
# Years lived in Somerville. This one is complicated because the survey did not coerce responses to a single data type, so the answers arrive as free text.

def format_years(v):
    """Map a response to 'how long have you lived here?' onto whole years.

    Parameters
    ----------
    v : str
        A single response. Matching is exact and case-sensitive, so the
        caller should lower-case the text and strip leading qualifiers first.

    Returns
    -------
    int, float or None
        Whole years, truncated toward zero ('3.5 years' -> 3,
        '9 months' -> 0). ``np.nan`` for responses the lookup table marks as
        unanswerable. ``None`` for anything that is not recognized.
    """

    # A well-formed non-negative decimal: "3", "3.5", "3." or ".5". A bare
    # character class of digits and dots would also accept "." or "1.2.3",
    # which makes float() raise; those now fall through as unrecognized.
    num = r'(\d+(?:\.\d*)?|\.\d+)'

    def whole(text):
        """Truncate a numeric string to a whole number of years."""
        return int(float(text))

    # Anchored patterns paired with the conversion applied to their match.
    # The patterns are mutually exclusive, so order does not affect results.
    rules = [
        # just a number, assume it's years
        ('^' + num + '$', lambda m: whole(m.group(1))),
        # "3.5 years"
        ('^' + num + ' years$', lambda m: whole(m.group(1))),
        # "30+ years"
        ('^' + num + r'\+ years$', lambda m: whole(m.group(1))),
        # "9 months"
        ('^' + num + ' months$', lambda m: int(float(m.group(1)) / 12)),
        # "3 weeks"
        ('^' + num + ' weeks$', lambda m: 0),
        # "6-8 years": the upper bound of the range is used
        ('^' + num + '-' + num + ' years$', lambda m: whole(m.group(2))),
        # "1 year, 9 months": only the years are kept
        ('^' + num + ' year, ' + num + ' months$', lambda m: whole(m.group(1))),
        # "3 days"
        ('^' + num + ' days$', lambda m: 0),
    ]

    for pattern, convert in rules:
        match = re.match(pattern, v)
        if match is not None:
            return convert(match)

    # One-off responses that no pattern covers, coded by hand.
    years_map = {'1 year': 1,
                 '1 month': 0,
                 'no_answer': np.nan,
                 '1 month, two years before from 2010-2012': 2,
                 '4  months': 0,
                 '5 years + 10 years before': 15,
                 '`17 years': 17,
                 'less than 1 year': 0,
                 '11years': 11,
                 '3 generations': np.nan,
                 '1 year thist ime; 6 years total': 6,
                 '1 year 2 months': 1,
                 'too long': np.nan,
                 '50 +': 50,
                 ' years': np.nan,
                 '16+': 16,
                 '14 years; 11 years now': 14,
                 '1 year 3 months': 1,
                 'less than 10 months': 0,
                 '30+': 30,
                 # truncated to whole years, like "8.5 years" above
                 '8.5 yesrs': 8,
                 'life long resident': 50,
                 '1 year this time; 6 years total': 6
                 }

    # Unrecognized responses yield None.
    return years_map.get(v)

def _strip_qualifiers(response, qualifiers=("almost ", "about ", "over ")):
    """Remove leading qualifier words from a response, checking each in turn."""
    for qualifier in qualifiers:
        if response.startswith(qualifier):
            # Slice by the qualifier's own length. A fixed offset of 7 only
            # fits "almost "; for "about " and "over " it also removed the
            # first characters of the number ("over 20 years" -> "0 years").
            response = response[len(qualifier):]
    return response


transformed_variables['f01_how_long_lived_here'] = (
    survey_data['d10_how_long_lived_here']
    # Make sure everything is a string
    .astype(str)
    # For some reason '-' was coded as '999'. We need to undo that.
    .str.replace("999", '-', regex=False)
    # Lowercase
    .str.lower()
    # remove qualifiers
    .apply(_strip_qualifiers)
    # apply formatting function
    .apply(format_years)
)



In [40]:
# Housing status

def format_housing_status(v):
    """Collapse a housing status response to 'Rent', 'Own' or 'Other'."""
    recognized = ('Rent', 'Own')
    return v if v in recognized else 'Other'

# Reduce housing status to three levels: Rent / Own / Other
transformed_variables['f02_housing_status'] = (
    survey_data['d06_housing_status'].apply(format_housing_status)
)

In [41]:
# Is the city moving in the right direction?

city_direction_map = {'Right direction': 'right',
                      'Wrong track': 'wrong',
                      'Not sure': 'unsure',}

# Responses that are missing or not in the map become 'no_answer'. The fill is
# chained before assignment: an in-place fillna on a selected column is chained
# assignment, which pandas warns about and which does not update the frame
# under Copy-on-Write.
transformed_variables['f03_city_direction'] = (
    survey_data['q05_city_direction']
    .map(city_direction_map)
    .fillna('no_answer')
)


In [42]:
# Language. Free-response field, so each indicator is a substring search on the
# lower-cased answer; most less common languages are grouped under "other".
# NOTE(review): several search terms are misspelled ('contonese',
# 'vietnnamese', 'slovac', 'gujrati'), presumably to match responses verbatim.
# The correctly spelled forms are not searched for; confirm against the data.

language = survey_data['d03_language'].str.lower()

# (output column, search terms); terms are OR-ed into one regex per column
language_groups = [
    ('f04_english', ['english']),
    ('f04_spanish', ['spanish']),
    ('f04_portuguese', ['portuguese']),
    ('f04_chinese', ['mandarin', 'chinese', 'contonese']),
    ('f04_other', ['arabic', 'greek', 'french', 'punjabi', 'amharic',
                   'gujrati', 'nepali', 'tigrinya', 'polish', 'filipino',
                   'italian', 'japanese', 'russian', 'haitian creole', 'kreole', 'romanian',
                   'slovac', 'vietnnamese', 'esperanto', 'hebrew', 'bulgarian', 'latin',
                   'persian', 'romanian']),
]

for column, terms in language_groups:
    mentions = language.str.contains('|'.join(terms))
    # multiplying by 1 turns True/False into 1/0
    transformed_variables[column] = pd.to_numeric(mentions * 1)


In [43]:
# Yes/No questions (children, plan to move, is student) become 1/0 indicators;
# any other value is left missing by the map.
yes_no = {'Yes': 1, 'No': 0}

binary_columns = [
    ('f05_num_children', 'd05_num_children'),
    ('f06_plan_to_move', 'd07_plan_to_move'),
    ('f07_is_student', 'd09_is_student'),
]

for target, source in binary_columns:
    transformed_variables[target] = pd.to_numeric(survey_data[source].map(yes_no))

In [44]:
# Transportation.
# A response can name several modes at once, so each mode gets its own
# indicator column per question, built from a substring search.

transport_questions = [
    ('f08', 'd11_transportation_mode'),
    ('f09', 'd12_transportation_month'),
]
transport_modes = [
    ('car', 'Car'),
    ('walk', 'Walk'),
    ('bike', 'Bike'),
    ('public', 'Public'),
]

for prefix, source in transport_questions:
    responses = survey_data[source]
    for suffix, keyword in transport_modes:
        # multiplying by 1 turns True/False into 1/0
        transformed_variables[prefix + '_' + suffix] = pd.to_numeric(
            responses.str.contains(keyword) * 1)


## Code reasons for moving

In [45]:
# Preview the raw plan-to-move answers; rows answering "Yes" are the ones
# exported for coding in the next cell.
survey_data['d07_plan_to_move']

0        No
1        No
2        No
3       Yes
4        No
       ... 
1491     No
1492    Yes
1493    Yes
1494     No
1495     No
Name: d07_plan_to_move, Length: 1496, dtype: object

In [46]:
# Write out the id and free-text reason for every respondent who plans to move
plans_to_move = survey_data['d07_plan_to_move'] == "Yes"
survey_data.loc[plans_to_move, ['id', 'move_why']].to_csv(
    '../data/intermediate/move_why.csv', index=False)

At this point I open this .csv and manually code the responses, saving the result as `move_why_coded.csv`. Each response is assigned up to two reasons for leaving; any further reasons are ignored. The categories and sub-categories are:

* Cost
  * Housing
  * Taxes
  * General
* Family / Relationship
* Job / School
* School System
* Political
* Atmosphere
    * Crowds
    * Community
    * Cleanliness
    * Development
    * Traffic / Bikes
* Other
   * Housing
   * [blank]

In [47]:
# Load the coded reasons for moving and key them by respondent id
moving_data = pd.read_csv('../data/intermediate/move_why_coded.csv')
moving_data = moving_data.set_index('id')

In [48]:
# Inspect the coded reasons: the original text plus up to two
# category/sub-category pairs per respondent.
moving_data

Unnamed: 0_level_0,move_why,cat1,subcat1,cat2,subcat2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
7394,Family reasons,Family / Relationship,,,
7399,Pursuing higher education,Job / school,,,
7405,Easier commute to work,Job / school,,,
7411,To get closer to family,Family / Relationship,,,
7416,Way too expensive,Cost,General,,
...,...,...,...,...,...
8872,Closer to work but I love Somerville <3,Job / school,,,
8874,"My job is term limited, but I don't want anoth...",Job / school,,Cost,Housing
8877,Too expensive / neighborhood being sold to yup...,Cost,General,Atmosphere,Community
8883,House is being sold and can't find something c...,Other,Housing,,


In [49]:
# The free-text reason is no longer needed once coded; keep only the coded
# columns and give them feature-style names ahead of the join.

moving_data = (
    moving_data
    .drop(columns=['move_why'])
    .rename(columns={'cat1': 'f10_move_cat1',
                     'subcat1': 'f10_move_subcat1',
                     'cat2': 'f10_move_cat2',
                     'subcat2': 'f10_move_subcat2'})
)

In [50]:
# Index the feature table by respondent id so it lines up with moving_data
transformed_variables = transformed_variables.set_index('id')

In [51]:
# Left join on the id index keeps every respondent; the move columns stay
# empty for ids that are not in moving_data. 'id' then returns to a column.
with_move_reasons = transformed_variables.join(moving_data, how='left')
transformed_variables = with_move_reasons.reset_index()

In [53]:
# Save the engineered features, without the positional index
output_path = '../data/intermediate/04_transformed_variables.csv'
transformed_variables.to_csv(output_path, index=False)