## Final Prototyping for PLUTO only buildings

In [1]:
import pandas as pd
import sqlalchemy as db
from sqlalchemy import create_engine, text
import numpy as np
from sqlalchemy.orm import sessionmaker
import geopandas as gpd
#import geodatasets

import pickle as pkl

from datetime import datetime

In [2]:
with open('data/in_pw.pkl', 'rb') as f:
    in_pw = pkl.load(f)

with open('data/from_pluto.pkl', 'rb') as f:
    pw_from_pluto = pkl.load(f)

with open('data/manually_created.pkl', 'rb') as f:
    pw_manually_created = pkl.load(f)

In [3]:
'''from_pluto_post2017 = from_pluto[(from_pluto['last_acris_sale'] >= datetime.fromisoformat('2017-01-01')) | \
                           (from_pluto['last_listing'] >= datetime.fromisoformat('2017-01-01')) | \
                            (from_pluto['last_mlsli_listing'] >= datetime.fromisoformat('2017-01-01'))]'''

"from_pluto_post2017 = from_pluto[(from_pluto['last_acris_sale'] >= datetime.fromisoformat('2017-01-01')) |                            (from_pluto['last_listing'] >= datetime.fromisoformat('2017-01-01')) |                             (from_pluto['last_mlsli_listing'] >= datetime.fromisoformat('2017-01-01'))]"

In [4]:
in_pw.groupby(['source'])['id'].nunique()

source
manual_future_building      49
manually entered            15
manually_entered           409
pluto_merge                  1
pluto_split                  3
pluto_v09_1                  1
pluto_v13_1                  1
pluto_v13_2                  2
pluto_v14_1                  1
pluto_v14_2                  1
pluto_v15_1                  1
pluto_v16_2                  3
pluto_v17_1               3094
Name: id, dtype: int64

In [5]:
no_listings = in_pw[(in_pw['num_acris'] + in_pw['num_mlsli'] + in_pw['num_other_listings'] + in_pw['num_rls']) == 0]
no_listings.groupby('source')['id'].nunique()

source
manual_future_building       3
manually entered             2
pluto_split                  1
pluto_v14_1                  1
pluto_v17_1               1116
Name: id, dtype: int64

In [6]:
has_listings = in_pw[~in_pw['id'].isin(no_listings['id'])]
has_listings.groupby('source')['id'].nunique()

source
manual_future_building      46
manually entered            13
manually_entered           409
pluto_merge                  1
pluto_split                  2
pluto_v09_1                  1
pluto_v13_1                  1
pluto_v13_2                  2
pluto_v14_2                  1
pluto_v15_1                  1
pluto_v16_2                  3
pluto_v17_1               1978
Name: id, dtype: int64

In [None]:
print(in_pw.shape, no_listings.shape)

In [7]:
pw_manually_created.groupby('source')['id'].nunique()

source
manual_future_building     46
manually entered           13
manually_entered          409
Name: id, dtype: int64

In [8]:
for_mlsli = pw_manually_created[(pw_manually_created['num_mlsli'] > 0) & (pw_manually_created['num_rls'] == 0)]
for_rls = pw_manually_created[(pw_manually_created['num_mlsli'] == 0) & (pw_manually_created['num_rls'] > 0)]
for_other = pw_manually_created[~(pw_manually_created['id'].isin(for_mlsli['id'])) & ~(pw_manually_created['id'].isin(for_rls['id']))]
print(for_mlsli.shape, for_rls.shape, for_other.shape)

(410, 105) (32, 105) (26, 105)


## Address Matching

In [9]:
address_dict = {
  "FIRST": "1ST",
  "SECOND": "2ND",
  "THIRD": "3RD",
  "FOURTH": "4TH",
  "FIFTH": "5TH",
  "SIXTH": "6TH",
  "SEVENTH": "7TH",
  "EIGHTH": "8TH",
  "NINTH": "9TH",
  "TENTH": "10TH",
  "ELEVENTH": "11TH",
  "TWELFTH": "12TH",
  "THIRTEENTH": "13TH",
  "FOURTEENTH": "14TH",
  "FIFTEENTH": "15TH",
  "SIXTEENTH": "16TH",
  "SEVENTEENTH": "17TH",
  "EIGHTEENTH": "18TH",
  "NINETEENTH": "19TH",
  "TWENTIETH": "20TH",
  "TWENTY-FIRST": "21ST",
  "TWENTY-SECOND": "22ND",
  "TWENTY-THIRD": "23RD",
  "TWENTY-FOURTH": "24TH",
  "TWENTY-FIFTH": "25TH",
  "TWENTY-SIXTH": "26TH",
  "TWENTY-SEVENTH": "27TH",
  "TWENTY-EIGHTH": "28TH",
  "TWENTY-NINTH": "29TH",
  "THIRTIETH": "30TH",
  "THIRTY-FIRST": "31ST",
  "THIRTY-SECOND": "32ND",
  "THIRTY-THIRD": "33RD",
  "THIRTY-FOURTH": "34TH",
  "THIRTY-FIFTH": "35TH",
  "THIRTY-SIXTH": "36TH",
  "THIRTY-SEVENTH": "37TH",
  "THIRTY-EIGHTH": "38TH",
  "THIRTY-NINTH": "39TH",
  "FORTIETH": "40TH",
  "FORTY-FIRST": "41ST",
  "FORTY-SECOND": "42ND",
  "FORTY-THIRD": "43RD",
  "FORTY-FOURTH": "44TH",
  "FORTY-FIFTH": "45TH",
  "FORTY-SIXTH": "46TH",
  "FORTY-SEVENTH": "47TH",
  "FORTY-EIGHTH": "48TH",
  "FORTY-NINTH": "49TH",
  "FIFTIETH": "50TH",
  "FIFTY-FIRST": "51ST",
  "FIFTY-SECOND": "52ND",
  "FIFTY-THIRD": "53RD",
  "FIFTY-FOURTH": "54TH",
  "FIFTY-FIFTH": "55TH",
  "FIFTY-SIXTH": "56TH",
  "FIFTY-SEVENTH": "57TH",
  "FIFTY-EIGHTH": "58TH",
  "FIFTY-NINTH": "59TH",
  "SIXTIETH": "60TH",
  "SIXTY-FIRST": "61ST",
  "SIXTY-SECOND": "62ND",
  "SIXTY-THIRD": "63RD",
"SIXTY-FOURTH": "64TH",
"SIXTY-FIFTH": "65TH",
"SIXTY-SIXTH": "66TH",
"SIXTY-SEVENTH": "67TH",
"SIXTY-EIGHTH": "68TH",
"SIXTY-NINTH": "69TH",
"SEVENTIETH": "70TH",
"SEVENTY-FIRST": "71ST",
"SEVENTY-SECOND": "72ND",
"SEVENTY-THIRD": "73RD",
"SEVENTY-FOURTH": "74TH",
"SEVENTY-FIFTH": "75TH",
"SEVENTY-SIXTH": "76TH",
"SEVENTY-SEVENTH": "77TH",
"SEVENTY-EIGHTH": "78TH",
"SEVENTY-NINTH": "79TH",
"EIGHTIETH": "80TH",
"EIGHTY-FIRST": "81ST",
"EIGHTY-SECOND": "82ND",
"EIGHTY-THIRD": "83RD",
"EIGHTY-FOURTH": "84TH",
"EIGHTY-FIFTH": "85TH",
"EIGHTY-SIXTH": "86TH",
"EIGHTY-SEVENTH": "87TH",
"EIGHTY-EIGHTH": "88TH",
"EIGHTY-NINTH": "89TH",
"NINETIETH": "90TH",
"NINETY-FIRST": "91ST",
"NINETY-SECOND": "92ND",
"NINETY-THIRD": "93RD",
"NINETY-FOURTH": "94TH",
"NINETY-FIFTH": "95TH",
"NINETY-SIXTH": "96TH",
"NINETY-SEVENTH": "97TH",
"NINETY-EIGHTH": "98TH",
"NINETY-NINTH": "99TH",
"ONE HUNDREDTH": "100TH"
}

In [10]:
with open('data/resbuildings.pkl', 'rb') as f:
    resbuildings = pkl.load(f)

In [11]:
resbuildings['address'] = [x.upper() for x in resbuildings['address']]
pw_manually_created['display_address'] = [x.upper() for x in pw_manually_created['display_address']]

In [12]:
import re
resbuildings['address'] = [re.sub(r"(?<=\d)(ST|ND|RD|TH)\b", '', x) for x in resbuildings['address']]
resbuildings['address'] = [re.sub('-', '', x) for x in resbuildings['address']]
pw_manually_created['display_address'] = [re.sub(r"(?<=\d)(ST|ND|RD|TH)\b", '', x) for x in pw_manually_created['display_address']]

In [13]:
resbuildings['address']

295               1752 72 STREET
1372            5203 CENTER BLVD
1652              8936 43 AVENUE
1663       2624 GERRITSEN AVENUE
1970          3136 MICKLE AVENUE
                   ...          
857394          174 DUANE STREET
857445          25 MONROE STREET
857717      1044 OLMSTEAD AVENUE
858326    926 SOUTHERN BOULEVARD
858842         100 GOTHAM AVENUE
Name: address, Length: 2286, dtype: object

In [14]:
address_match = pw_manually_created.merge(resbuildings, how='inner', left_on='display_address', right_on='address')
address_match[['bbl', 'source_id', 'source', 'address', 'display_address', 'zip', 'zipcode']]

Unnamed: 0,bbl,source_id,source,address,display_address,zip,zipcode
0,3016728000.0,3016720000.0,manual_future_building,269 MALCOLM X BOULEVARD,269 MALCOLM X BOULEVARD,11233,11233.0
1,1007658000.0,1007658000.0,manual_future_building,246 WEST 16 STREET,246 WEST 16 STREET,10011,10011.0
2,3076208000.0,3076201000.0,manual_future_building,1673 OCEAN AVENUE,1673 OCEAN AVENUE,11230,11230.0
3,3014818000.0,3014810000.0,manual_future_building,832 MONROE STREET,832 MONROE STREET,11221,11221.0
4,3056328000.0,3056320000.0,manual_future_building,4814 11 AVENUE,4814 11 AVENUE,11219,11219.0
5,3058550000.0,3006490000.0,manual_future_building,6734 5 AVENUE,6734 5 AVENUE,11220,11220.0
6,3032048000.0,3032048000.0,manual_future_building,900 WILLOUGHBY AVENUE,900 WILLOUGHBY AVENUE,11221,11221.0
7,3025128000.0,3025120000.0,manual_future_building,153 GREEN STREET,153 GREEN STREET,11222,11222.0
8,3019858000.0,3019858000.0,manual_future_building,25 MONROE STREET,25 MONROE STREET,11238,11238.0
9,3057208000.0,3057200000.0,manual_future_building,1449 61 STREET,1449 61 STREET,11219,11219.0


In [15]:
address_match = address_match[pd.to_numeric(address_match['zip']) == address_match['zipcode']]
address_match.shape

(19, 197)

In [16]:
address_match.to_csv('data/final_address_match.csv')

In [17]:
resbuildings_no_match = resbuildings[~resbuildings['bbl'].isin(address_match['bbl'])]
resbuildings_no_match.shape

(2267, 92)

## Dealing with `resbuildings_no_match`

In [18]:
# getting preso geometries
from shapely.geometry import shape
import json
'''preso = has_listings[has_listings.geometry_json.notnull()]
preso_gdf = gpd.GeoDataFrame(preso)
# Transform geojson to shapes
preso_gdf['geometry'] = [shape(json.loads(x)) for x in preso_gdf['geometry_json']]
#for i,r in gdf.iterrows():
 #   gdf.at[i, 'geometry_json'] = shape(json.loads(gdf['geometry_json'][i]))
    
# Set geometry for geodf
preso_gdf = preso_gdf.set_geometry('geometry')
preso_gdf.set_crs(crs="EPSG:4326", inplace=True)
'''
# TODO need to change this to all preso
with open('data/preso_gdf.pkl', 'rb') as f:
    preso_gdf = pkl.load(f)
print(has_listings.shape, preso_gdf.shape)


preso_gdf.drop(columns=['created_at',
                            'updated_at',
                            'new_dev_start_date',
                            'new_dev_end_date',
                            'land_lease_expiration',
                            'last_rls_listing',
                            'last_mlsli_listing',
                            'last_listing',
                            'last_acris_created'], inplace=True)

(2458, 105) (744425, 105)


In [19]:
# getting pluto geometries
try:
    with open('data/gdf.pkl', 'rb') as f:
        pluto_gdf = pkl.load(f)
except FileNotFoundError:
    shapefile_path = 'data/nyc_mappluto_23v1_1_shp/MapPLUTO.shp'
    pluto_gdf = gpd.read_file(shapefile_path)
    # formatting for geospatial join
    pluto_gdf.to_crs('epsg:4326', inplace=True)
    with open('data/gdf.pkl', 'wb') as f:
        pkl.dump(pluto_gdf, f)

In [20]:
resbuildings_no_match = pluto_gdf[pluto_gdf['BBL'].isin(resbuildings_no_match['bbl'])]

In [21]:
resbuildings_no_match.shape

(2267, 95)

In [22]:
tst = ['within', 'intersects', None, 'covers', 'overlaps', 'crosses', 'touches', 'contains_properly', 'contains']
for t in tst:
    sp1 = gpd.sjoin(preso_gdf, resbuildings_no_match, op=t)
    print(t, sp1.BBL.nunique())
#sp1 = gpd.sjoin(preso_gdf, resbuildings_no_match, op='covered_by')

within 32
intersects 1979
None 2255
covers 152
overlaps 1883
crosses 0
touches 0
contains_properly 152
contains 152


In [23]:
sp1 = gpd.sjoin(preso_gdf, resbuildings_no_match, op='intersects')

In [24]:
#sp1_over = sp1
#sp1_inter = gpd.sjoin(preso_gdf, resbuildings_no_match, op='intersects')
#inter_only = sp1_inter[~sp1_inter.BBL.isin(sp1_over.BBL)]

In [25]:
#resbuildings_no_match[resbuildings_no_match['BBL'].isin(inter_only.BBL)].to_file('data/inter_only')


In [26]:
sp1[['BBL', 'source_id', 'source', 'Address', 'display_address', 'zip', 'ZipCode', 'YearBuilt']]

Unnamed: 0,BBL,source_id,source,Address,display_address,zip,ZipCode,YearBuilt
50,1.002250e+09,1002250004.0,pluto_v17_1,465 GREENWICH STREET,123 Watts Street,10013,10013,1910
93955,1.002250e+09,1002257501.0,pluto_v17_1,465 GREENWICH STREET,463-469 Greenwich Street,10013,10013,1910
327716,1.002250e+09,1002257502.0,pluto_v17_1,465 GREENWICH STREET,16 Desbrosses Street,10013,10013,1910
696281,1.002250e+09,1002250005.0,pluto_v09_1,465 GREENWICH STREET,16 DESBROSSES STREET,10013,10013,1910
80,1.001418e+09,1001410028.0,pluto_v17_1,174 DUANE STREET,172 Duane Street,10013,10013,1910
...,...,...,...,...,...,...,...,...
732998,4.157110e+09,4157110020.0,pluto_v17_1,11-34 MCBRIDE STREET,11-30 MC BRIDE STREET,11691,11691,2019
733723,4.117630e+09,4117630006.0,pluto_v17_1,125-09 133RD AVENUE,125-09 133 AVENUE,11420,11420,2022
736187,4.063700e+09,4063700105.0,pluto_v17_1,134 60 AVENUE,134-22 60 AVENUE,11355,11355,2017
748092,2.056550e+09,2056550019.0,pluto_v17_1,3548 MC OWEN AVENUE,3544 McOwen Avenue,10475,10475,0


In [27]:
dupes = sp1.groupby('BBL')['id'].nunique()
dupes = dupes[dupes > 1]

In [28]:
dupes

BBL
1.000168e+09    3
1.000638e+09    6
1.000908e+09    5
1.001320e+09    3
1.001418e+09    3
               ..
4.161560e+09    2
4.161560e+09    2
4.161668e+09    6
4.161668e+09    5
4.161808e+09    6
Name: id, Length: 1398, dtype: int64

In [33]:
resbuildings_no_match[resbuildings_no_match['BBL'].isin(sp1.BBL)].to_file('data/geospatial_found')
resbuildings_no_match[resbuildings_no_match['BBL'].isin(dupes.index)].to_file('data/geospatial_dupes')
not_found = resbuildings_no_match[~resbuildings_no_match['BBL'].isin(sp1.BBL)]
not_found.to_file('data/geospatial_not_found')

  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,


In [34]:
dupes.shape

(1398,)

In [35]:
sp1.shape

(5626, 191)

In [36]:
sp1[sp1['in_search'] == True].BBL.nunique()

1837

In [37]:
sp1[sp1['in_search'] != True].BBL.nunique()

641

In [41]:
sp1.groupby('in_search')['BBL'].nunique()

in_search
False     641
True     1837
Name: BBL, dtype: int64

In [39]:
not_found.shape

(288, 95)

In [42]:
resbuildings_no_match.BBL.nunique()

2267

In [45]:
x = sp1.groupby('BBL')['id'].nunique()[sp1.groupby('BBL')['id'].nunique() == 1]

In [46]:
x

BBL
1.001370e+09    1
1.001370e+09    1
1.002240e+09    1
1.006020e+09    1
1.006020e+09    1
               ..
4.161560e+09    1
4.161560e+09    1
4.161560e+09    1
4.161560e+09    1
4.161560e+09    1
Name: id, Length: 581, dtype: int64

In [47]:
sp1.groupby('BBL')['id'].nunique()[sp1.groupby('BBL')['id'].nunique() != 1]

BBL
1.000168e+09    3
1.000638e+09    6
1.000908e+09    5
1.001320e+09    3
1.001418e+09    3
               ..
4.161560e+09    2
4.161560e+09    2
4.161668e+09    6
4.161668e+09    5
4.161808e+09    6
Name: id, Length: 1398, dtype: int64