# Code and analyze people who are planning to move


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re


In [40]:
# Load the processed, weighted survey responses.
SURVEY_DATA_PATH = '../data/processed/weighted_survey_data.csv'
data = pd.read_csv(SURVEY_DATA_PATH)

# NOTE: the variable-metadata load is disabled, so `variables` is not
# created by this cell.
# variables = pd.read_csv('../data/processed/data_2019_vars.csv')

# Preview the first five rows.
data.head()

Unnamed: 0,year,q01_happy,q02_satisfied_general,q03_satisfied_somerville,q04_satisfied_neighborhood,q06a_city_services,q06b_cost_housing,q06c_quality_schools,q06d_trust_police,q06e_sidewalks,...,d08_hhi_buckets,d11_car,d11_walk,d11_bike,d11_public,d12_car,d12_walk,d12_bike,d12_public,weight
0,2019,10.0,10.0,10.0,10.0,5.0,4.0,4.0,5.0,4.0,...,3.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,13.925208
1,2019,7.0,7.0,9.0,8.0,4.0,2.0,,4.0,3.0,...,2.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,62.491567
2,2019,9.0,9.0,1.0,1.0,4.0,1.0,4.0,2.0,1.0,...,4.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,29.726222
3,2019,10.0,9.0,9.0,8.0,5.0,5.0,5.0,5.0,3.0,...,5.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,55.520842
4,2019,8.0,8.0,7.0,7.0,5.0,3.0,4.0,5.0,1.0,...,5.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,25.358064


In [41]:
# Write the free-text `move_why` answers of respondents with
# d07_plan_to_move == 1.0 to a CSV (index included) for manual coding.
data.loc[data['d07_plan_to_move'].eq(1.0), ['move_why']].to_csv('../data/processed/move_why.csv')

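At this point I open this .csv and code the responses by hand. Each response is assigned up to two reasons for leaving; any further reasons are ignored. The main categories and sub-categories are listed below (the coded file also contains a few labels not listed here, e.g. `No Answer` and additional Atmosphere sub-categories such as Noise, Space, Nature and Construction):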

* Cost
  * Housing
  * Taxes
  * General
* Family / Relationship
* Job / school
* School System
* Political
* Atmosphere
    * Crowds
    * Community
    * Cleanliness
    * Development
    * Traffic / bikes
* Other
   * Housing
   * [blank]

In [61]:
# Read the hand-coded reasons and use the `id` column as the row index.
coded_data = pd.read_csv('../data/processed/move_why_coded.csv', index_col='id')

In [62]:
# Display the coded responses: free text plus up to two (category, sub-category) pairs.
coded_data

Unnamed: 0_level_0,move_why,cat1,subcat1,cat2,subcat2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3,Family reasons,Family / Relationship,,,
8,Pursuing higher education,Job / school,,,
14,Easier commute to work,Job / school,,,
20,To get closer to family,Family / Relationship,,,
25,Way too expensive,Cost,General,,
...,...,...,...,...,...
1481,Closer to work but I love Somerville <3,Job / school,,,
1483,"My job is term limited, but I don't want anoth...",Job / school,,Cost,Housing
1486,Too expensive / neighborhood being sold to yup...,Cost,General,Atmosphere,Community
1492,House is being sold and can't find something c...,Other,Housing,,


In [85]:
# Attach the hand-coded reasons to the survey rows. `join` aligns on the
# index, so each coded row lands on the survey row whose index equals its
# `id`; survey rows without a coded response get NaN in the coded columns.
# The raw `move_why` column is dropped first because `coded_data` carries
# its own copy.
data_moving = data.drop('move_why', axis=1).join(coded_data, how='left')

# Flag respondents whose first or second coded reason is Cost / Housing.
# Vectorised comparisons replace the row-wise `apply`: comparisons against
# NaN evaluate to False, so uncoded rows get 0.0, and the result is a
# float Series even when the frame is empty.
first_reason_is_housing_cost = (data_moving['cat1'] == 'Cost') & (data_moving['subcat1'] == 'Housing')
second_reason_is_housing_cost = (data_moving['cat2'] == 'Cost') & (data_moving['subcat2'] == 'Housing')
data_moving['mentions_housing_cost'] = (
    first_reason_is_housing_cost | second_reason_is_housing_cost
).astype(float)

Export for future processing

In [None]:
# Export the joined frame for future processing.
# The previous target, '../data/processed/.csv', had an extension but no
# file stem, which would create a hidden dotfile; give the export a real name.
data_moving.to_csv('../data/processed/data_moving.csv')

How many people are planning on moving?

In [86]:
# Weighted share of respondents by move intention.
# `weight` is selected before aggregating so only that column is summed,
# rather than summing every column (including free-text ones) and then
# discarding all but one.
# The denominator is the total weight of all rows, while groupby drops rows
# with a missing d07_plan_to_move, so the shares need not add up to 1.
data_moving.groupby('d07_plan_to_move')[['weight']].sum() / data_moving['weight'].sum()

Unnamed: 0_level_0,weight
d07_plan_to_move,Unnamed: 1_level_1
0.0,0.582288
1.0,0.35754


About 35.8% of respondents (weighted) report planning to move out of Somerville in the next year.

What reasons do people give for moving?

In [87]:
# Keep only the respondents who answered that they plan to move (d07_plan_to_move == 1.0).
movers_only = data_moving.loc[data_moving['d07_plan_to_move'].eq(1.0)]

In [88]:
# Weighted share of movers by first-reason category and sub-category.
# `weight` is selected before aggregating so only that column is summed.
# groupby drops rows where cat1 or subcat1 is missing, but the denominator
# is the weight of all movers, so these shares cover only responses that
# were given a sub-category.
movers_only.groupby(['cat1', 'subcat1'])[['weight']].sum() / movers_only['weight'].sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,weight
cat1,subcat1,Unnamed: 2_level_1
Atmosphere,Cleanliness,0.007733
Atmosphere,Community,0.001966
Atmosphere,Construction,0.004955
Atmosphere,Crowds,0.023439
Atmosphere,Development,0.001064
Atmosphere,Nature,0.008151
Atmosphere,Noise,0.005859
Atmosphere,Other,0.020887
Atmosphere,Space,0.00975
Atmosphere,Traffic / bikes,0.016993


In [89]:
# Weighted share of movers by first-reason category.
# `weight` is selected before aggregating so only that column is summed.
# Rows with a missing cat1 are dropped by groupby but still count in the
# denominator (total weight of all movers).
movers_only.groupby(['cat1'])[['weight']].sum() / movers_only['weight'].sum()

Unnamed: 0_level_0,weight
cat1,Unnamed: 1_level_1
Atmosphere,0.101334
Cost,0.420771
Family / Relationship,0.028335
Job / school,0.240705
No Answer,0.071292
Other,0.126603
Political,0.007896
School System,0.003065


How many people mention housing cost specifically in their first two reasons?

In [91]:
# Weighted share of movers who do (1.0) / do not (0.0) cite housing cost in
# either of their two coded reasons.
# `weight` is selected before aggregating so only that column is summed.
movers_only.groupby(['mentions_housing_cost'])[['weight']].sum() / movers_only['weight'].sum()

Unnamed: 0_level_0,weight
mentions_housing_cost,Unnamed: 1_level_1
0.0,0.684266
1.0,0.315734


In [93]:
# List the columns available on the movers-only frame.
movers_only.columns

Index(['year', 'q01_happy', 'q02_satisfied_general',
       'q03_satisfied_somerville', 'q04_satisfied_neighborhood',
       'q06a_city_services', 'q06b_cost_housing', 'q06c_quality_schools',
       'q06d_trust_police', 'q06e_sidewalks', 'q06f_events',
       'q09_safe_at_night', 'q11_beauty', 'q10_parks', 'd01_gender', 'd02_age',
       'd04_race', 'd05_num_children', 'd06_housing_status',
       'd07_plan_to_move', 'd10_how_long_lived_here', 'd08_hhi',
       'd09_is_student', 'ward', 'q05_city_direction',
       'q07_safe_crossing_street', 'q08_convenient', 'q12_housing_condition',
       'd03_english', 'd03_spanish', 'd03_portuguese', 'd03_chinese',
       'd03_other', 'd04_ethnicity', 'd04_race_white', 'd04_race_aa',
       'd04_race_asian', 'd04_race_other', 'race_hooks', 'd08_hhi_buckets',
       'd11_car', 'd11_walk', 'd11_bike', 'd11_public', 'd12_car', 'd12_walk',
       'd12_bike', 'd12_public', 'weight', 'move_why', 'cat1', 'subcat1',
       'cat2', 'subcat2', 'mentions_hou

In [94]:
# Weighted share of movers by housing-cost mention and household-income bracket.
# `weight` is selected before aggregating so only that column is summed.
# Each cell is a share of ALL movers' weight (not a within-bracket rate);
# rows with a missing d08_hhi are dropped by groupby.
movers_only.groupby(['mentions_housing_cost', 'd08_hhi'])[['weight']].sum() / movers_only['weight'].sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,weight
mentions_housing_cost,d08_hhi,Unnamed: 2_level_1
0.0,"$10,000 to $24,999",0.063565
0.0,"$100,000 to $149,999",0.137156
0.0,"$150,000 to 200,000",0.067002
0.0,"$200,000 or more",0.069796
0.0,"$25,000 to $49,999",0.082501
0.0,"$50,000 to 74,999",0.104719
0.0,"$75,000 to $99,999",0.081794
0.0,"Less than $10,000",0.044428
1.0,"$10,000 to $24,999",0.012463
1.0,"$100,000 to $149,999",0.078716


In [3]:
# define target and base features

# NOTE(review): `variables` is not defined in any cell visible in this
# notebook (its read_csv line in the load cell is commented out) — confirm
# where it is loaded before running this cell on a fresh kernel.
# Names of the columns flagged as model features (is_feature == 1).
features = variables.loc[variables['is_feature'] == 1, 'var'].values

target = 'd07_plan_to_move'

# Drop respondents who did not answer the target question.
# `subset` is passed as a list because older pandas releases reject a bare
# string, and `data` is rebound instead of mutated in place so that
# re-running the cell is harmless.
data = data.dropna(subset=[target])

In [15]:
# Count the non-null values in each column, per value of the target.
data.groupby(target).count()

Unnamed: 0_level_0,q01_happy,q02_satisfied_general,q03_satisfied_somerville,q04_satisfied_neighborhood,q06a_city_services,q06b_cost_housing,q06c_quality_schools,q06d_trust_police,q06e_sidewalks,q06f_events,...,d01_gender_Male,d01_gender_No Answer,d01_gender_Non-binary,d06_housing_status_Other,d06_housing_status_Own,d06_housing_status_Rent,q05_city_direction_no_answer,q05_city_direction_right,q05_city_direction_unsure,q05_city_direction_wrong
d07_plan_to_move,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,952,951,955,955,950,948,770,936,955,941,...,967,967,967,967,967,967,967,967,967,967
1.0,425,424,426,426,428,428,313,412,427,419,...,430,430,430,430,430,430,430,430,430,430


In [3]:
# test-train split



In [10]:
# Count the non-null values in each column, per value of d07_plan_to_move.
data.groupby('d07_plan_to_move').count()

Unnamed: 0_level_0,id,q01_happy,q02_satisfied_general,q03_satisfied_somerville,q04_satisfied_neighborhood,q06a_city_services,q06b_cost_housing,q06c_quality_schools,q06d_trust_police,q06e_sidewalks,...,d01_gender_Male,d01_gender_No Answer,d01_gender_Non-binary,d06_housing_status_Other,d06_housing_status_Own,d06_housing_status_Rent,q05_city_direction_no_answer,q05_city_direction_right,q05_city_direction_unsure,q05_city_direction_wrong
d07_plan_to_move,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,967,952,951,955,955,950,948,770,936,955,...,967,967,967,967,967,967,967,967,967,967
1.0,430,425,424,426,426,428,428,313,412,427,...,430,430,430,430,430,430,430,430,430,430


## some ideas about housing cost

What's the survey breakdown of opinion on housing cost depending on [ward, demo, HHI, rent/own, intent to leave]?

Distribution of people who are leaving vs not

Look at intent to leave: reasons, breakdown

