#AmeniDC
## See the cost of amenities in the District of Columbia

In [4]:
%matplotlib inline
import matplotlib
import seaborn as sns
matplotlib.rcParams['savefig.dpi'] = 2 * matplotlib.rcParams['savefig.dpi']
import matplotlib.pylab as plt

In [5]:
import simplejson as json
import requests
from requests import Request, Session
from bs4 import BeautifulSoup
import re

from urlparse import urljoin
from collections import namedtuple
import sqlite3
from pyspark.sql import SQLContext

import numpy as np
import pandas as pd

#import geojson
#from geojson import Feature, Point, FeatureCollection
import pprint
pp = pprint.PrettyPrinter(indent=0)

In [6]:
def grid_score_plotter(cv,param):
  cv_accuracy = pd.DataFrame.from_records(
      [(score.parameters[param],
        score.mean_validation_score)
       for score in cv.grid_scores_],
  columns=[param, 'accuracy'])

  plt.plot(cv_accuracy[param], cv_accuracy.accuracy)
  plt.xlabel(param)
  plt.ylabel('accuracy')
  plt.show()

#OpenData.DC API

Get the property sale points from http://opendata.dc.gov/datasets/2acc75ccdd954267acecb8713b2b800a_28
and store as a SQL database pandas DataFrame

In [115]:
from ediblepickle import checkpoint


@checkpoint(key='prop_sales_json.csv', work_dir='data', refresh=False)
def request_records():
  url = 'http://opendata.dc.gov/datasets/2acc75ccdd954267acecb8713b2b800a_28.geojson'
  with requests.Session() as s:
    resp = s.get(url)
    try:
        resp.raise_for_status()
    except requests.exceptions.HTTPError as e:
        print "And you get an HTTPError:", e.message
  return resp.json()


def zip_parser(x):
  z = x.split()[-1][:5]
  try:
    return int(z)
  except:
    return np.nan
  
  
def build_df(resp_json):
  keep_cols = [u'ADDRESS1',u'ADDRESS2',u'ANNUALTAX',u'ASSESSMENT',
               u'BASEBUILD',u'BASELAND',u'CITYSTZIP',
               u'HIGHNUMBER',u'HSTDCODE',u'LANDAREA',u'LOT',
               u'LOWNUMBER',u'NBHD',
               u'NEWIMPR',u'NEWLAND',u'NEWTOTAL',
               u'OLDIMPR',u'OLDLAND',u'OLDTOTAL',u'OWNOCCT',
               u'PREMISEADD',u'PROPTYPE',u'QDRNTNAME',u'SALEDATE',
               u'SALEPRICE',u'SALETYPE',u'SQUARE',u'SSL',u'STREETCODE',u'STREETNAME',
               u'SUBNBHD',u'coordinates']
    
  row_dicts = []
  for feats in resp_json['features']:
    try:
      # get coordinates
      row = dict(zip(['longitude','latitude'],feats['geometry']['coordinates']))
    except:
      continue
    attrs = feats['properties'] # get select property attributes
    row.update(dict(( (k,attrs.get(k,'')) for k in keep_cols )))
    row_dicts.append(row)
    
  df = pd.DataFrame(row_dicts)
  print df.columns

  df['lat_lng'] = df.loc[:,('latitude','longitude')].apply(
      lambda row: str(row[0])+','+str(row[1]),axis=1)
  df['zipcode'] = df.CITYSTZIP.map(lambda r: zip_parser(r))

  df.to_hdf('./data/df_cleaned_wo_amen.hd5','df')
  

if False:
  resp_json = request_records()
  print resp_json['features'][0]
if True:
  build_df(resp_json)

Index([   u'ADDRESS1',    u'ADDRESS2',   u'ANNUALTAX',  u'ASSESSMENT',
         u'BASEBUILD',    u'BASELAND',   u'CITYSTZIP',  u'HIGHNUMBER',
          u'HSTDCODE',    u'LANDAREA',         u'LOT',   u'LOWNUMBER',
              u'NBHD',     u'NEWIMPR',     u'NEWLAND',    u'NEWTOTAL',
           u'OLDIMPR',     u'OLDLAND',    u'OLDTOTAL',     u'OWNOCCT',
        u'PREMISEADD',    u'PROPTYPE',   u'QDRNTNAME',    u'SALEDATE',
         u'SALEPRICE',    u'SALETYPE',      u'SQUARE',         u'SSL',
        u'STREETCODE',  u'STREETNAME',     u'SUBNBHD', u'coordinates',
          u'latitude',   u'longitude'],
      dtype='object')


# Google Maps Places API

Here, property sale locations are queried for nearby amenities. API requests were made once offline and pickled. Data for each lat_lng pair were stored as dict. 

Create a df column showing amenity_price_rating numbers for the top 20 hits within 1000m of the property

In [116]:
if True:
  df = pd.read_hdf('./data/df_cleaned_wo_amen.hd5','df')


In [98]:
if True:
  df = df.ix[0:10,:].copy()
  df.reset_index(inplace=True)

In [121]:
import simplejson as json
from requests import Request, Session
import pandas as pd
from requests_futures.sessions import FuturesSession
from collections import namedtuple

# Read in all API keys
with open("../secrets/google_secrets.json.nogit") as fh: 
  secrets = json.loads(fh.read())
google_api_key = secrets['server_api_key']

# Define amenity types of interest
AMENITY_TYPES = ['bakery','bar','cafe','grocery_or_supermarket',
  'movie_theater','park','pharmacy','restaurant','school','spa','subway_station']

Place = namedtuple('Place',['name','lat_lng','rating','price_level'])

def google_places_parser(sess,resp):
  places_list = []
  
  for r in resp.json()['results']:
    name = r.get('name')
    lat_lng = str(r['geometry']['location']['lat']) + ',' + str(r['geometry']['location']['lng'])
    rating = r.get('rating')
    price_level = r.get('price_level')
    places_list.append(Place(name,lat_lng,rating,price_level))
  
  resp.data = {'next_page_token':resp.json().get('next_page_token',''),
               'places':places_list}

def get_amenities(query_str, dfs):
  ''' Given an amenity-type query string and an nd-array-like
  lat_lng_series, return a list of list of results'''

  base_url = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"
  
  urls = []
  s = Session()
  for ll,npt in dfs.itertuples(index=False):
    if npt == '':
      search_payload = {"key":google_api_key,
                    "radius":1000,
                    "types":query_str,
                    "location":ll}
    else:
      '''Including a page token forces Google to ignore all other
      search parameters'''
      search_payload = {"key":google_api_key,
                        'pagetoken':npt}
    req = Request('GET', base_url, params=search_payload)
    urls.append(s.prepare_request(req).url)

  session = FuturesSession(max_workers=7)
  futures = [session.get(url, background_callback=google_places_parser) for url in urls]
  
  return ([f.result().data['places'] for f in futures], 
          [f.result().data['next_page_token'] for f in futures])

In [119]:
import time

if True:
  amenity = AMENITY_TYPES[0]
  df['page_tokens'] = ''
  df['latest_query'] = ''
  if False:
    df.loc[:,amenity] = df.apply(lambda _: list(),axis=1)
  get_more = np.ones(len(df.index),np.bool)
  
  i = 0
  while i<3 and get_more.any():
    i += 1
    lq, pt = get_amenities(amenity, df.loc[get_more,('lat_lng','page_tokens')])
    df.loc[get_more,'latest_query'] = lq
    df.loc[get_more,'page_tokens'] = pt
    df.loc[get_more,amenity] = df.loc[get_more,amenity] + df.loc[get_more,"latest_query"]
    get_more = df.loc[:,"page_tokens"] != ''
    time.sleep(5) # delay because next page is not immediately available

https://maps.googleapis.com/maps/api/place/nearbysearch/json?radius=1000&key=AIzaSyB5L9HXY7Lk837K90L9hOv5jl0Im2Km0Ls&types=bakery&location=38.9252905465%2C-76.9797563101
https://maps.googleapis.com/maps/api/place/nearbysearch/json?radius=1000&key=AIzaSyB5L9HXY7Lk837K90L9hOv5jl0Im2Km0Ls&types=bakery&location=38.9272915525%2C-76.9819163682
https://maps.googleapis.com/maps/api/place/nearbysearch/json?radius=1000&key=AIzaSyB5L9HXY7Lk837K90L9hOv5jl0Im2Km0Ls&types=bakery&location=38.9276412641%2C-76.9804175452
https://maps.googleapis.com/maps/api/place/nearbysearch/json?radius=1000&key=AIzaSyB5L9HXY7Lk837K90L9hOv5jl0Im2Km0Ls&types=bakery&location=38.9279023461%2C-76.9801360213
https://maps.googleapis.com/maps/api/place/nearbysearch/json?radius=1000&key=AIzaSyB5L9HXY7Lk837K90L9hOv5jl0Im2Km0Ls&types=bakery&location=38.928642486%2C-76.980557665
https://maps.googleapis.com/maps/api/place/nearbysearch/json?radius=1000&key=AIzaSyB5L9HXY7Lk837K90L9hOv5jl0Im2Km0Ls&types=bakery&location=38.9261475597

KeyError: 'the label [bakery] is not in the [columns]'

In [125]:
df

Unnamed: 0,ADDRESS1,ADDRESS2,ANNUALTAX,ASSESSMENT,BASEBUILD,BASELAND,CITYSTZIP,HIGHNUMBER,HSTDCODE,LANDAREA,...,STREETCODE,STREETNAME,SUBNBHD,coordinates,latitude,longitude,lat_lng,zipcode,page_tokens,latest_query
0,1719 FRANKLIN ST NE,,1841.86,326230,53722,49026,"WASHINGTON, DC 20018-2033",,1,5249,...,3809,FRANKLIN ST,C,,38.925291,-76.979756,"38.9252905465,-76.9797563101",20018,,"[(Zelfiwu Inc, 38.921522,-76.971777, None, Non..."
1,2814 18TH ST NE,,2200.98,378260,90097,47683,"WASHINGTON, DC 20018-2402",,1,4967,...,0180,18TH ST,C,,38.926065,-76.979315,"38.9260645262,-76.9793153944",20018,,"[(Zelfiwu Inc, 38.921522,-76.971777, None, Non..."
2,1603 RHODE ISLAND AVE NE,,1973.90,119630,0,23925,"WASHINGTON, DC 20018-1841",,,3190,...,7546,RHODE ISLAND AV,C,,38.926148,-76.981453,"38.9261475597,-76.9814528368",20018,,"[(Zelfiwu Inc, 38.921522,-76.971777, None, Non..."
3,1506 GIRARD ST NE,,1673.22,267050,80086,42710,"WASHINGTON, DC 20018-1832",,1,6801,...,1761,BRENTWOOD RD,D,,38.927292,-76.981916,"38.9272915525,-76.9819163682",20018,,"[(Zelfiwu Inc, 38.921522,-76.971777, None, Non..."
4,16843 HARBOUR TOWN DR,,302.26,35560,0,17115,"SILVER SPRING, MD 20905-8012",,,3500,...,0170,17TH ST,D,,38.927641,-76.980418,"38.9276412641,-76.9804175452",20905,,"[(Zelfiwu Inc, 38.921522,-76.971777, None, Non..."
5,1715 HAMLIN ST NE,,2680.48,385550,102675,32325,"WASHINGTON, DC 20018-1837",,1,4310,...,4303,HAMLIN ST,D,,38.927902,-76.980136,"38.9279023461,-76.9801360213",20018,,"[(Zelfiwu Inc, 38.921522,-76.971777, None, Non..."
6,3005 17TH ST NE,,2834.32,403650,121241,23707,"WASHINGTON, DC 20018-3815",,1,2429,...,0170,17TH ST,D,,38.928642,-76.980558,"38.928642486,-76.980557665",20018,,"[(Zelfiwu Inc, 38.921522,-76.971777, None, Non..."
7,1618 HAMLIN ST NE,,2038.80,310060,70411,43377,"WASHINGTON, DC 20018-1836",,1,4918,...,4303,HAMLIN ST,D,,38.928357,-76.981612,"38.9283568824,-76.981612486",20018,,"[(Zelfiwu Inc, 38.921522,-76.971777, None, Non..."
8,8001 14TH ST NW,,1920.42,349830,70137,42120,"WASHINGTON, DC 20012-1207",,1,4500,...,0180,18TH ST,D,,38.929495,-76.979653,"38.9294945336,-76.9796527375",20012,,"[(Zelfiwu Inc, 38.921522,-76.971777, None, Non..."
9,8001 14TH ST NW,,582.76,68560,0,33000,"WASHINGTON, DC 20012-1207",,,4000,...,0180,18TH ST,D,,38.929378,-76.979644,"38.9293781736,-76.9796444993",20012,,"[(Zelfiwu Inc, 38.921522,-76.971777, None, Non..."


In [120]:
if True:
  df.to_hdf('./data/df_w_amen.hd5','df')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block2_values] [items->[u'ADDRESS1', u'ADDRESS2', u'CITYSTZIP', u'HIGHNUMBER', u'HSTDCODE', u'LOT', u'LOWNUMBER', u'NBHD', u'PREMISEADD', u'PROPTYPE', u'QDRNTNAME', u'SALEDATE', u'SALETYPE', u'SQUARE', u'SSL', u'STREETCODE', u'STREETNAME', u'SUBNBHD', u'coordinates', 'lat_lng', 'page_tokens', 'latest_query']]



In [93]:
i

1

In [None]:
np.log10(df.SALEPRICE).hist(bins=40)

### Find count and mean of each amenity ###

In [None]:
def amen_mean(row,rating_or_price):
  denom = np.float64(sum(True for r in row if r[rating_or_price]))
  if denom == 0:
    return np.nan
  else:
    return sum(r[rating_or_price] for r in row if r[rating_or_price])/denom

if True:
  for amen in AMENITY_TYPES:
    df[amen+'_count'] = df[amen].map(lambda row: len(row))

    for i in xrange(2):
      if i==0:
        r_p = 'rating'
        place_ind = 2
      elif i==1:
        r_p = 'price_level'
        place_ind = 3
      else:
        print 'What are you doing?'
        continue
      df[amen+'_mean_'+r_p] = df[amen].map(lambda row: amen_mean(row,place_ind))
  
  df.to_hdf('./data/df_w_amen.hd5','df')


In [None]:
len(df.index)

In [None]:
df.park_count.hist(bins=20)

###When places have the same name, use the haversine formula to determine if they refer to the same place.

In [None]:
haversine(-77.020269,38.894629,-77.0325204,38.9039343)

In [None]:
from math import radians, cos, sin, asin, sqrt
def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    km = 6367 * c
    return km

#Do some modeling

In [None]:
df = pd.read_hdf('./data/df_w_amen.hd5','df')

In [None]:
df.park_count.hist(bins=20)

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class ColumnSelectTransformer(BaseEstimator, TransformerMixin):
  """
  Select columns of data from nd array
  """
  def __init__(self, columns):
    ''' columns must be list of strings '''
    if type(columns) is list or type(columns) is str:
      self.columns = columns
    
  def fit(self, X, y):
    return self

  def transform(self, X):
    ''' Assume X is pandas dataframe'''
    return X[self.columns]
    
class ShellTransformer(BaseEstimator, TransformerMixin):
  '''Pass the fitted fitted_model into the init function
  and predict during the transform step'''
  def __init__(self,fitted_model):
    self.fitted_model = fitted_model
    pass
  
  def fit(self,X,y=None):
    return self

  def transform(self,X):
    '''Here, X is a pandas DataFrame'''
    return self.fitted_model.predict(X)
 

In [None]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn import linear_model
from sklearn import cross_validation, grid_search
import numpy as np


amen_pipe = Pipeline([
    ('select', ColumnSelectTransformer('amenities_1000')),
    ('vec', DictVectorizer(sparse=True)),
    ('ridge', linear_model.Ridge()),
])

amen_model = grid_search.GridSearchCV( amen_pipe,
                param_grid={'ridge__alpha':np.logspace(0.7,1.5,5)},
                cv=cross_validation.ShuffleSplit(len(df.index), n_iter=20, 
                    test_size=0.2, random_state=42) )

amen_model.fit(df,df['SALEPRICE'])
grid_score_plotter(amen_model,'ridge__alpha')

## Build a predictive model using only latitude and longitude using K-nearest neighbors ##

In [None]:
from sklearn import cross_validation, grid_search, neighbors

param_grid = {"n_neighbors": range(1,6)}
lat_lng_model = grid_search.GridSearchCV( neighbors.KNeighborsRegressor(),
                param_grid=param_grid,
                cv=cross_validation.ShuffleSplit(len(df.index), n_iter=20, 
                    test_size=0.2,) )

lat_lng_model.fit(df[['latitude','longitude']],df['SALEPRICE'])
for key in param_grid.keys():
  grid_score_plotter(lat_lng_model,key)

In [None]:
cross_valiation.ShuffleSplit?

## Build a predictive model using only latitude and longitude using random forest regressor ##

In [None]:
from sklearn import ensemble

param_grid = {"min_samples_leaf": range(1,10), 'min_samples_split': range(2,10,2)}
ll_rf_model = grid_search.GridSearchCV(ensemble.RandomForestRegressor(n_jobs=-1),
                param_grid=param_grid,
                cv=cross_validation.ShuffleSplit(len(df.index), n_iter=20, 
                    test_size=0.2,) )

ll_rf_model.fit(df[['latitude','longitude']],df['SALEPRICE'])
for key in param_grid.keys():
  grid_score_plotter(ll_rf_model,key)

In [None]:
ll_rf_model.grid_scores_[0][1]

In [None]:
from mpl_toolkits.mplot3d import Axes3D

xs = []
ys = []
zs = []
for gs in ll_rf_model.grid_scores_:
  xs.append(gs[0]['min_samples_leaf'])
  ys.append(gs[0]['min_samples_split'])
  zs.append(gs[1])

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

ax.scatter(xs, ys, zs, zdir='z', s=20, depthshade=True)



In [None]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn import linear_model
from sklearn import cross_validation, grid_search
import numpy as np



```python
import simplejson as json
from requests import Request, Session
import pandas as pd
from multiprocessing import Pool
from collections import namedtuple
from itertools import izip, repeat

# Read in all API keys
with open("../secrets/google_secrets.json.nogit") as fh: 
  secrets = json.loads(fh.read())
google_api_key = secrets['server_api_key']

# Define amenity types of interest
AMENITY_TYPES = ['bakery','bar','cafe','grocery_or_supermarket',
  'movie_theater','park','pharmacy','restaurant','school','spa','subway_station']

Place = namedtuple('Place',['name','lat_lng','rating','price_level'])

def google_places_parser((query_str,lat_lng_str)):
  ''' Search through up three pages of up to 20 results each'''
  search_url = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"
    
  search_payload = {"key":google_api_key,
                    "radius":1000,
                    "types":query_str}
  
  url = search_url+'?location='+lat_lng_str  

  places_list = []
  rcnt = 0
  while True and rcnt<3:
    with requests.Session() as s:
      resp = s.get(url,params=search_payload)
    rcnt +=1
      
    for r in resp.json()['results']:
      name = r.get('name')
      lat_lng = str(r['geometry']['location']['lat']) + ',' + str(r['geometry']['location']['lng'])
      rating = r.get('rating')
      price_level = r.get('price_level')
      places_list.append(Place(name,lat_lng,rating,price_level))
    
    npt = resp.json().get('next_page_token','')
    if npt == '':
      break
    else:
      search_payload.update({'page_token':npt})
    
  return places_list


def get_amenities(query_str, lat_lng_series):
  ''' Given an amenity-type query string and an nd-array-like
  lat_lng_series, return a list of list of results'''

  p = Pool(7)
  
  place_lists = p.map(google_places_parser, izip(repeat(query_str),lat_lng_series))
  return place_lists

```