In [36]:
import pandas as pd
import sqlalchemy as db
from sqlalchemy import create_engine, text
import numpy as np
from sqlalchemy.orm import sessionmaker
import geopandas as gpd
#import geodatasets

import pickle as pkl

from datetime import datetime

In [37]:
# Load the buildings frame (`preso`) from a local pickle cache when it exists;
# otherwise run the SQL below through `pull_data` and write the cache.
# NOTE: pickle.load can execute arbitrary code from the file — only use caches
# that were produced by this notebook.
try:
    with open('data/buildings_final2.pkl', 'rb') as f:
        preso = pkl.load(f)
    print(preso.shape)
except FileNotFoundError:
    # Cache miss. The query selects every building linked to geography_id 1278 and
    # left-joins four per-building aggregates: listings from other sources (l),
    # rls/real_plus listings (rls), mlsli listings (mlsli) and historicals (a).
    # The f-prefix is not needed here: the string contains no placeholders.
    query_string = f'''
        select 
            b.*
            , rls.*
            , mlsli.*
            , l.*
            , a.*
            , st_asgeojson(b.geometry) as geometry_json
        from buildings b
        ---join with geographies
        inner join building_geographies bg
            on b.id = bg.building_id
        left join (
            select
                building_id
                , count(*) as num_other_listings
                , max(created_at) as last_listing
            from listings
            where not(
                    lower(source) like '%mlsli%' 
                    or lower(source) like '%rls%' 
                    or source = 'real_plus')
            group by building_id
        ) l on l.building_id = b.id
        left join (
            select
                building_id
                , count(distinct(id)) as num_rls
                , max(created_at) as last_rls_listing
            from listings
            where --not(lower(source) like '%mlsli%')
                lower(source) like '%rls%' or source = 'real_plus'
            group by building_id
        ) rls on rls.building_id = b.id
        left join (
            select
                building_id
                , count(distinct(id)) as num_mlsli
                , max(created_at) as last_mlsli_listing
            from listings
            where lower(source) like '%mlsli%'
            group by building_id
        ) mlsli on mlsli.building_id = b.id
        left join (
            select 
                building_id
                , count(distinct(acris_document_id)) as num_acris
                , max(created_at) as last_acris_created
                , max(sale_date) as last_acris_sale
            from historicals
            group by building_id
            order by count(distinct(acris_document_id)) desc
        ) a on b.id = a.building_id
        where bg.geography_id = 1278
        '''
    # NOTE(review): `pull_data` is not defined or imported in the visible cells;
    # on a fresh kernel without the pickle this call raises NameError — confirm
    # where it is supposed to come from.
    preso = pull_data(query_string)
    # Persist the result so later runs take the fast path above.
    with open('data/buildings_final2.pkl', 'wb') as f:
        pkl.dump(preso, f)

(749222, 105)


In [38]:
# Load the PLUTO frame (`pluto`) from a local pickle cache; on a cache miss,
# read the MapPLUTO shapefile, restrict `pluto` to BBLs present in it, and
# write the cache.
# NOTE: pickle.load can execute arbitrary code from the file — only use caches
# that were produced by this notebook.
try:
    with open('data/pluto_map.pkl', 'rb') as f:
        pluto = pkl.load(f)
except FileNotFoundError:
    shapefile_path = 'data/nyc_mappluto_23v1_1_shp/MapPLUTO.shp'
    gdf = gpd.read_file(shapefile_path)
    # NOTE(review): `pluto` is read on the right-hand side before anything in the
    # visible cells assigns it, so on a fresh kernel with no cache this line
    # raises NameError. The unfiltered PLUTO table must be loaded first — the
    # original loading step appears to be missing from the notebook.
    pluto = pluto[pluto['bbl'].isin(gdf['BBL'].to_list())]
    with open('data/pluto_map.pkl', 'wb') as f:
        pkl.dump(pluto, f)
    # Print GeoDataFrame information
    print(gdf.head())

In [39]:
# Coerce the join key to a numeric dtype so it can be compared with PLUTO's `bbl`,
# then keep only buildings whose source_id has a PLUTO match (inner join).
preso['source_id'] = pd.to_numeric(preso['source_id'])
matched_buildings = pd.merge(
    preso,
    pluto,
    how='inner',
    left_on='source_id',
    right_on='bbl',
)
print(matched_buildings.shape)

(730482, 197)


In [40]:
# Spot check: show display_address next to address for the merged row with this BBL.
matched_buildings[matched_buildings['bbl'] == 3055167510][['display_address', 'address']]

Unnamed: 0,display_address,address
184213,1563 61ST STREET,1563 61ST STREET


In [41]:
# Building ids whose borough is 'BK'.
# NOTE(review): the `mn_` prefix does not match the 'BK' filter; the name is kept
# because a later cell filters update_df with `mn_ids`.
mn_ids = matched_buildings[matched_buildings['borough'] == 'BK']['id']

In [42]:
def get_percent_change(old_value, new_value):
    """Return the relative change from ``old_value`` to ``new_value`` as a non-negative fraction.

    Args:
        old_value: Baseline value; anything ``float()`` accepts (number or numeric string).
        new_value: Updated value; anything ``float()`` accepts.

    Returns:
        ``abs(old - new) / abs(old)``. When the baseline is zero the ratio is
        undefined: 1 is returned if the value actually changed, 0 if both
        values are zero (nothing changed).

    Raises:
        ValueError, TypeError: propagated from ``float()`` for non-numeric input.
    """
    old = float(old_value)
    new = float(new_value)
    delta = abs(old - new)
    try:
        # abs() on the denominator keeps the result non-negative for a negative
        # baseline, so threshold checks such as ``>= 0.05`` behave as intended.
        return delta / abs(old)
    except ZeroDivisionError:
        # Baseline of zero: treat any real change as a 100% change, but do not
        # report a change when old and new are both zero.
        return 1 if delta else 0
def clean_updates(field_name, old_value, new_value):
    match field_name:
        case 'building_area':
            return get_percent_change(old_value, new_value) >= 0.05
        case 'year_built':
            return float(new_value) - float(old_value) >= 5
        case 'lot_area':
            return get_percent_change(old_value, new_value) >= 0.1
        case 'lot_depth':
            return get_percent_change(old_value, new_value) >= 0.1
        case 'lot_front':
            return get_percent_change(old_value, new_value) >= 0.1
        case 'num_units':
            return old_value == 0
        case _:
            return True

In [43]:
# Load the per-field update records and count distinct building ids per field.
# NOTE(review): the recorded output echoes the read_csv line, which looks like a
# truncated pandas warning (possibly mixed dtypes in a column) — confirm, since
# later comparisons on old_value/new_value depend on how values were parsed.
update_df = pd.read_csv('data/update_df.csv')
update_df.groupby('field_name')['id'].nunique()

  update_df = pd.read_csv('data/update_df.csv')


field_name
building_area      27564
building_class     31948
building_depth     15464
building_front     11180
geometry_x        730063
landmark_x          1261
lot_area          390865
lot_depth           4288
lot_front           6750
num_stories        19526
num_units          14903
year_built         19026
zip                 2779
Name: id, dtype: int64

In [44]:
# Evaluate clean_updates once per row (field, old, new) to build a keep-mask,
# drop the rejected rows, then recount distinct ids per field.
keep = list(
    map(
        clean_updates,
        update_df['field_name'],
        update_df['old_value'],
        update_df['new_value'],
    )
)
update_df = update_df[keep]
update_df.groupby('field_name')['id'].nunique()

field_name
building_area      24666
building_class     31948
building_depth     15464
building_front     11180
geometry_x        730063
landmark_x          1261
lot_area            8510
lot_depth           2971
lot_front           5920
num_stories        19526
num_units           3426
year_built          9417
zip                 2779
Name: id, dtype: int64

In [45]:
# First letters of building_class treated as in-scope (presumably residential
# classes, going by the variable names — confirm).
res_prefixes = ['A', 'R', 'B', 'C', 'D', 'L', 'S']

# Ids whose building_class update starts with one of the prefixes on either side.
update_class = update_df[update_df['field_name'] == 'building_class']
old_is_res = update_class['old_value'].str[0].isin(res_prefixes)
new_is_res = update_class['new_value'].str[0].isin(res_prefixes)
resbuildings = set(update_class[old_is_res | new_is_res]['id'])

# Ids whose current building_class in preso starts with one of the prefixes.
res_ids = set(preso[preso['building_class'].str[0].isin(res_prefixes)]['id'])

# Updated ids that fall in either set.
all_res_ids = set(update_df[update_df['id'].isin(resbuildings | res_ids)]['id'])

In [46]:
# Restrict updates to the building ids collected in mn_ids, then recount per field.
in_selected_borough = update_df['id'].isin(mn_ids)
update_df = update_df[in_selected_borough]
update_df.groupby('field_name')['id'].nunique()

field_name
building_area      10574
building_class     11688
building_depth      6825
building_front      4479
geometry_x        275448
landmark_x           199
lot_area            3195
lot_depth           1263
lot_front           2393
num_stories         4468
num_units           1489
year_built          3965
zip                  712
Name: id, dtype: int64

In [47]:
# Restrict updates to the ids collected in all_res_ids, then recount per field.
in_res_set = update_df['id'].isin(all_res_ids)
update_df = update_df[in_res_set]
update_df.groupby('field_name')['id'].nunique()

field_name
building_area       8650
building_class      8447
building_depth      5692
building_front      3602
geometry_x        246793
landmark_x            90
lot_area            2010
lot_depth            875
lot_front           1757
num_stories         3841
num_units           1344
year_built          3444
zip                  222
Name: id, dtype: int64

In [48]:
# Keep only buildings that have at least one update other than geometry_x
# (their geometry_x rows are retained too), then recount per field.
non_geo_rows = update_df['field_name'] != 'geometry_x'
not_geo = set(update_df.loc[non_geo_rows, 'id'])
update_df = update_df[update_df['id'].isin(not_geo)]
update_df.groupby('field_name')['id'].nunique()

field_name
building_area      8650
building_class     8447
building_depth     5692
building_front     3602
geometry_x        14807
landmark_x           90
lot_area           2010
lot_depth           875
lot_front          1757
num_stories        3841
num_units          1344
year_built         3444
zip                 222
Name: id, dtype: int64

In [49]:
# Inspect the remaining geometry_x update rows.
update_df[update_df['field_name'] == 'geometry_x']

Unnamed: 0.1,Unnamed: 0,id,field_name,old_value,new_value,in_search
546800,546800,43239,geometry_x,MULTIPOLYGON (((-73.964026121154 40.6108355417...,"{""type"":""MultiPolygon"",""coordinates"":[[[[-73.9...",True
546806,546806,43300,geometry_x,MULTIPOLYGON (((-73.8823456253682 40.674005424...,"{""type"":""MultiPolygon"",""coordinates"":[[[[-73.8...",True
546818,546818,43467,geometry_x,MULTIPOLYGON (((-73.9447584562312 40.676927717...,"{""type"":""MultiPolygon"",""coordinates"":[[[[-73.9...",True
546855,546855,43989,geometry_x,MULTIPOLYGON (((-74.0354179170091 40.623609404...,"{""type"":""MultiPolygon"",""coordinates"":[[[[-74.0...",True
546864,546864,44146,geometry_x,MULTIPOLYGON (((-74.026673163054 40.6127037959...,"{""type"":""MultiPolygon"",""coordinates"":[[[[-74.0...",True
...,...,...,...,...,...,...
1274198,1274198,853673,geometry_x,MULTIPOLYGON (((-73.9809912275208 40.634233818...,"{""type"":""MultiPolygon"",""coordinates"":[[[[-73.9...",True
1274200,1274200,854066,geometry_x,MULTIPOLYGON (((-73.9530369432236 40.714819492...,"{""type"":""MultiPolygon"",""coordinates"":[[[[-73.9...",True
1274204,1274204,854197,geometry_x,MULTIPOLYGON (((-73.9605574121217 40.686194539...,"{""type"":""MultiPolygon"",""coordinates"":[[[[-73.9...",True
1274206,1274206,854419,geometry_x,MULTIPOLYGON (((-73.9508904829116 40.701667735...,"{""type"":""MultiPolygon"",""coordinates"":[[[[-73.9...",True


In [50]:
# Drop geometry_x and num_units updates, then count distinct building ids for
# every (field, old value, new value) combination.
excluded_fields = ['geometry_x', 'num_units']
update_df = update_df[~update_df['field_name'].isin(excluded_fields)]
updates_mapped = (
    update_df
    .groupby(['field_name', 'old_value', 'new_value'])['id']
    .nunique()
)

In [51]:
# Display the per-(field, old, new) id counts.
updates_mapped

field_name     old_value  new_value
building_area  0.0        10023        1
                          10029        1
                          10030        1
                          10233        1
                          102534       1
                                      ..
zip            11235.0    11234.0      1
               11237.0    11221.0      1
               11238.0    11215.0      1
                          11216.0      1
               11249.0    11201.0      1
Name: id, Length: 18070, dtype: int64

In [52]:
# Write the (field, old, new) -> id-count mapping to CSV.
# The commented-out drops below are disabled; geometry_x rows were already
# removed from update_df before updates_mapped was built.
# updates_mapped.drop('geometry_x', level=0, axis=0, inplace=True)
# 
# updates_mapped.drop('lot_area', level=0, axis=0, inplace=True)
updates_mapped.to_csv('data/updates_mapping_bk.csv')

In [53]:
# NOTE(review): this repeats the geometry_x / num_units filter already applied
# when updates_mapped was built; when the cells run in order it is a no-op.
update_df = update_df[(update_df['field_name'] != 'geometry_x') & (update_df['field_name'] != 'num_units')]

In [54]:
# Distinct building ids that still have at least one update row.
updated_ids = set(update_df['id'].tolist())

In [55]:
def get_updated_fields(id):
    """Return the field names updated for one building.

    Reads the module-level ``update_df`` frame.

    Args:
        id: Building id to look up in ``update_df['id']``.

    Returns:
        List of ``field_name`` values for rows whose ``id`` matches, in frame
        order; an empty list when nothing matches.
    """
    rows_for_building = update_df['id'] == id
    return update_df.loc[rows_for_building, 'field_name'].tolist()

In [56]:
# Keep only buildings that still have at least one update. The explicit .copy()
# makes the filtered frame independent, so adding a column below does not
# trigger SettingWithCopyWarning.
matched_buildings = matched_buildings[matched_buildings['id'].isin(updated_ids)].copy()

# Collect the updated field names per building in a single grouped pass instead
# of rescanning update_df once per building. Row order within each id is
# preserved, so each list matches what a per-id filter of update_df would give.
fields_by_id = update_df.groupby('id')['field_name'].agg(list)

# Fall back to an empty list for an id with no update rows.
updated_fields = [fields_by_id.get(x, []) for x in matched_buildings['id']]
matched_buildings['updated_fields'] = updated_fields

In [57]:
# List the merged frame's columns (now including updated_fields).
matched_buildings.columns

Index(['id', 'city', 'state', 'zip', 'google_map_params',
       'building_bike_storage', 'building_doorman', 'building_elevator',
       'building_gym', 'building_laundry',
       ...
       'appdate', 'plutomapid', 'firm07_flag', 'pfirm15_flag', 'version',
       'dcpedited', 'latitude', 'longitude', 'notes', 'updated_fields'],
      dtype='object', length=198)

In [58]:
# Source column -> export header, in output order. Each pair puts the building
# table's value next to the PLUTO value for the same attribute.
export_columns = {
    'id': 'Building_ID',
    'display_address': 'PW Address',
    'address': 'PLUTO Address',
    'bbl': 'Source_ID',
    'updated_fields': 'Change_List',
    'year_built': 'PW_Year_Built',
    'yearbuilt': 'PLUTO_Year_Built',
    'in_search': 'in_search',
    'num_stories': 'PW_floors',
    'numfloors': 'PLUTO_floors',
    'num_units': 'PW_Units',
    'unitsres': 'PLUTO_units',
    'building_class': 'PW_building_class',
    'bldgclass': 'PLUTO_building_class',
    'lot_area': 'PW_lot_area',
    'lotarea': 'PLUTO_lot_area',
    'building_area': 'PW_building_area',
    'bldgarea': 'PLUTO_building_area',
}
to_export = matched_buildings[list(export_columns)].rename(columns=export_columns)
to_export.to_csv('data/update_bk_deep_dive.csv')

In [59]:
# Row / column count of the exported frame.
to_export.shape

(14806, 18)

In [60]:
# Spot check: unit and area columns of the PLUTO row with this BBL.
pluto[pluto['bbl'] == 1000237501][['bbl', 'address', 'unitsres', 'unitstotal', 'resarea']]

Unnamed: 0,bbl,address,unitsres,unitstotal,resarea
702664,1000238000.0,1 WALL STREET,1.0,2.0,874513.0


In [61]:
# Spot check by address. str.contains is a substring match, so this also
# returns addresses such as '1326 East 51st Street'.
preso[preso['display_address'].str.contains('326 East 51st')][['id', 'city', 'in_search', 'display_address', 'source_id']]

Unnamed: 0,id,city,in_search,display_address,source_id
339163,186341,Brooklyn,True,1326 East 51st Street,3077760000.0


In [62]:
# Replace missing addresses with a placeholder so the .str.contains mask below
# holds only booleans (a NaN in the mask cannot be used for row selection).
# Assign the result back instead of calling fillna(inplace=True) on the selected
# column: in-place methods on a column selection are deprecated chained
# assignment and do not update the frame under pandas Copy-on-Write.
pluto['address'] = pluto['address'].fillna('na')
pluto[pluto['address'].str.contains('326 EAST 51 STREET')][['borough', 'bbl', 'address']]

Unnamed: 0,borough,bbl,address
500873,MN,1013430000.0,326 EAST 51 STREET
690319,BK,3077760000.0,1326 EAST 51 STREET


In [63]:
# Spot check: merged rows whose display_address contains '39 Worth'.
matched_buildings[matched_buildings['display_address'].str.contains('39 Worth')]

Unnamed: 0,id,city,state,zip,google_map_params,building_bike_storage,building_doorman,building_elevator,building_gym,building_laundry,...,appdate,plutomapid,firm07_flag,pfirm15_flag,version,dcpedited,latitude,longitude,notes,updated_fields


In [64]:
# source_id of the building whose display_address contains '39 Worth'.
# .item() raises ValueError unless exactly one row matches.
bbl = preso[preso['display_address'].str.contains('39 Worth')]['source_id'].item()

In [65]:
# Display the source_id found above.
bbl

1001760011.0

In [66]:
# PLUTO's bbl for the address containing '39 WORTH', for comparison with the
# building table's source_id. .item() raises ValueError unless exactly one row matches.
pluto[pluto['address'].str.contains('39 WORTH')]['bbl'].item()

1001760010.0

In [67]:
# Which building in preso carries the BBL that PLUTO assigns to 39 Worth?
preso[preso['source_id'] == 1001760010][['display_address']]

Unnamed: 0,display_address
140919,41 Worth Street


In [68]:
# Re-check this BBL in matched_buildings, which by now has been narrowed to
# buildings with surviving updates.
matched_buildings[matched_buildings['bbl'] == 3055167510]

Unnamed: 0,id,city,state,zip,google_map_params,building_bike_storage,building_doorman,building_elevator,building_gym,building_laundry,...,appdate,plutomapid,firm07_flag,pfirm15_flag,version,dcpedited,latitude,longitude,notes,updated_fields


In [69]:
# PLUTO address recorded for the same BBL.
pluto[pluto['bbl'] == 3055167510]['address']

242878    1563 61ST STREET
Name: address, dtype: object