# Updating Perchwell's `buildings` dataset with PLUTO
The purpose of this notebook is to lay out the final process for ingesting the latest version of PLUTO (in this case v23_1). Once the process is finalized, this notebook will be transformed into a script that can be used in a DAG.

In [1]:
import pandas as pd
import sqlalchemy as db
from sqlalchemy import create_engine, text
import numpy as np
from sqlalchemy.orm import sessionmaker
import geopandas as gpd
#import geodatasets

import pickle as pkl

from datetime import datetime

## Getting Perchwell's `Buildings` table

In [2]:
%load_ext sql

In [3]:
%sql postgresql://teleport:@localhost:2023/perchwell

'Connected: teleport@perchwell'

In [4]:
def pull_data(sql_query:str, file_out="pw.csv", port_num=2023):
    """
    For a given sql_query, this function will connect to Perchwell's database, 
    pull the corresponding data, save it to a csv in the file specified 
    (default is pw.csv), and returns that data in the form of a Pandas
    DataFrame.

    Args:
        sql_query (str): query whose results you wish to download
        file_out (str, optional): filepath for the output file. Defaults to 
            "pw.csv".
        port_num (int, optional): Port number for Teleport Connection. Defaults
            to 2023.

    Returns:
        pd.DataFrame: DataFrame containing results of sql_query 
    """    
    pw_df = []
    prev_row_count = -1
    current_row = 0
    # Create the SQLAlchemy engine
    engine = create_engine(f"postgresql://teleport:@localhost:{port_num}/perchwell")

    # in order to get around the 300k row limit
    while prev_row_count <= current_row:
        current_row = len(pw_df)
        if prev_row_count == current_row:
            print(f"final row count: {current_row}")
            break
        # Define the SQL query string, but keep last line
        query_string = f"""{sql_query}
            -- NOTE: DO NOT DELETE BELOW THIS
            LIMIT 100000 OFFSET {current_row};
            """
        try:
            # Execute the query
            with engine.connect() as conn:
                result = conn.execute(text(query_string))

                # Process the query result
                for row in result:
                    # Access the row data
                    pw_df.append(row)
        except:
            # in case of timeout
            engine = create_engine(f"postgresql://teleport:@localhost:{port_num}/perchwell")
            with engine.connect() as conn:
                result = conn.execute(text(query_string))
                for row in result:
                    pw_df.append(row)
        print(f"{len(pw_df)} rows appended")
        prev_row_count = current_row
    pw_df = pd.DataFrame(pw_df)
    #pw_df.to_csv(file_out)
    return pw_df

In [5]:
try:
    with open('data/buildings_final2.pkl', 'rb') as f:
        preso = pkl.load(f)
    print(preso.shape)
except FileNotFoundError:
    query_string = f'''
        select 
            b.*
            , rls.*
            , mlsli.*
            , l.*
            , a.*
            , st_asgeojson(b.geometry) as geometry_json
        from buildings b
        ---join with geographies
        inner join building_geographies bg
            on b.id = bg.building_id
        left join (
            select
                building_id
                , count(*) as num_other_listings
                , max(created_at) as last_listing
            from listings
            where not(
                    lower(source) like '%mlsli%' 
                    or lower(source) like '%rls%' 
                    or source = 'real_plus')
            group by building_id
        ) l on l.building_id = b.id
        left join (
            select
                building_id
                , count(distinct(id)) as num_rls
                , max(created_at) as last_rls_listing
            from listings
            where --not(lower(source) like '%mlsli%')
                lower(source) like '%rls%' or source = 'real_plus'
            group by building_id
        ) rls on rls.building_id = b.id
        left join (
            select
                building_id
                , count(distinct(id)) as num_mlsli
                , max(created_at) as last_mlsli_listing
            from listings
            where lower(source) like '%mlsli%'
            group by building_id
        ) mlsli on mlsli.building_id = b.id
        left join (
            select 
                building_id
                , count(distinct(acris_document_id)) as num_acris
                , max(created_at) as last_acris_created
                , max(sale_date) as last_acris_sale
            from historicals
            group by building_id
            order by count(distinct(acris_document_id)) desc
        ) a on b.id = a.building_id
        where bg.geography_id = 1278
        '''
    preso = pull_data(query_string)
    with open('data/buildings_final2.pkl', 'wb') as f:
        pkl.dump(preso, f)

(749222, 105)


In [6]:
from shapely.geometry import shape
import json
preso = preso[preso.geometry_json.notnull()]
# Transform normal df to geopandas df
preso_gdf = gpd.GeoDataFrame(preso)
# Transform geojson to shapes
preso_gdf['geometry'] = [shape(json.loads(x)) for x in preso_gdf['geometry_json']]
#for i,r in gdf.iterrows():
 #   gdf.at[i, 'geometry_json'] = shape(json.loads(gdf['geometry_json'][i]))
    
# Set geometry for geodf
preso_gdf = preso_gdf.set_geometry('geometry')
preso_gdf.set_crs(crs="EPSG:4326", inplace=True)
with open('data/preso_gdf.pkl', 'wb') as f:
    pkl.dump(preso_gdf, f)

In [7]:
preso.shape

(744425, 105)

## Reading the PLUTO files

In [8]:
pluto = pd.read_csv("data/pluto_23v1_1.csv")
pluto.shape

  pluto = pd.read_csv("data/pluto_23v1_1.csv")


(859068, 92)

In [9]:
try:
    with open('data/pluto_map.pkl', 'rb') as f:
        pluto = pkl.load(f)
except FileNotFoundError:
    shapefile_path = 'data/nyc_mappluto_23v1_1_shp/MapPLUTO.shp'
    gdf = gpd.read_file(shapefile_path)
    pluto = pluto[pluto['bbl'].isin(gdf['BBL'].to_list())]
    with open('data/pluto_map.pkl', 'wb') as f:
        pkl.dump(pluto, f)
    # Print GeoDataFrame information
    print(gdf.head())

In [10]:
pluto.shape

(856903, 92)

## Identifying Buildings That Need Updating
The next step is to match the buildings in PW's `buildings` table to PLUTO using the BBL. Once this is complete, we identify which building and fields need to be updated.

In [11]:
mapped_fields = [('zip', 'zipcode'),
                 ('year_built', 'yearbuilt'),
                 #('school_district_code', 'schooldist'),
                 ('num_stories', 'numfloors'), 
                 ('num_units', 'unitsres'), 
                 ('lot_area', 'lotarea'), 
                 ('lot_front', 'lotfront'), 
                 ('lot_depth', 'lotdepth'),
                 ('building_class', 'bldgclass'),
                 ('building_front', 'bldgfront'),
                 ('building_depth', 'bldgdepth'),
                 ('building_area', 'bldgarea'),
                 ('landmark_x', 'landmark_y')]

In [12]:
preso['source_id'] = pd.to_numeric(preso['source_id'])
matched_buildings = preso.merge(pluto, how='inner', left_on='source_id', right_on='bbl')
print(matched_buildings.shape)

(730044, 197)


In [13]:
# map mismatched fields
matched_buildings['zip'] = pd.to_numeric(matched_buildings['zip'])
matched_buildings['numfloors'] = np.ceil(matched_buildings['numfloors'])

# map landmark field
matched_buildings['landmark_y'] = [True if type(x) == str else False for x in matched_buildings['landmark_y']]
matched_buildings['landmark_x'] = [x if x else 'f' for x in matched_buildings['landmark_x']]
matched_buildings['landmark_x'] = [False if (x.lower() in ('f', 'false') or x.isspace()) else True for x in matched_buildings['landmark_x']]

In [14]:
# TODO: is this the right format for the update df?
update_df = {'id': [], 'field_name': [], 'old_value': [], 'new_value': [], 'in_search': []}
for fnames in mapped_fields:
    print(fnames)
    wrong = matched_buildings[matched_buildings[f'{fnames[0]}'] != matched_buildings[f'{fnames[1]}']]
    wrong = wrong[(wrong[f'{fnames[1]}'] != 0)]
    wrong = wrong[~(wrong[f'{fnames[1]}'].isna())]
    if fnames[0] in ('num_stories', 'year_built'):
        # new value should be greater for both year_built and num_stories
        wrong = wrong[wrong[f'{fnames[0]}'] < wrong[f'{fnames[1]}']]
    update_df['id'] += list(wrong['id'])
    update_df['old_value'] += list(wrong[f'{fnames[0]}'])
    update_df['new_value'] += list(wrong[f'{fnames[1]}'])
    update_df['in_search'] += list(wrong['in_search'])
    update_df['field_name'] += [f'{fnames[0]}'] * len(wrong['id'])

update_df = pd.DataFrame(update_df)
update_df['id'].nunique()

('zip', 'zipcode')
('year_built', 'yearbuilt')
('num_stories', 'numfloors')
('num_units', 'unitsres')
('lot_area', 'lotarea')
('lot_front', 'lotfront')
('lot_depth', 'lotdepth')
('building_class', 'bldgclass')
('building_front', 'bldgfront')
('building_depth', 'bldgdepth')
('building_area', 'bldgarea')
('landmark_x', 'landmark_y')


419965

In [15]:
# checking to see if rules worked, should be same numbers printed
print(update_df.shape, update_df['id'].nunique())
update_df = update_df[(update_df['new_value'] != 0)]
update_df =  update_df[~(update_df['new_value'].isna())]
print(update_df.shape, update_df['id'].nunique())

(545452, 5) 419965
(545452, 5) 419965


In [16]:
# checking to see if rules worked, should be same numbers being printed
print(update_df[update_df['field_name'] == 'num_stories'].shape)
print(update_df[(update_df['field_name'] == 'num_stories') & (update_df['old_value'] < update_df['new_value'])].shape)

(19524, 5)
(19524, 5)


In [17]:
update_df.head()

Unnamed: 0,id,field_name,old_value,new_value,in_search
0,538,zip,10005,10038.0,True
1,1757,zip,10038,10007.0,False
2,5051,zip,10001,10123.0,True
3,10005,zip,10020,10036.0,True
4,11722,zip,10007,10038.0,False


## Splitting PW Only Buildings into Buckets
Now it is time to deal with the buildings that are only in Perchwell's `buildings` table. These tables will be split into separate categories based on [the rules laid out in the Whimsical.](https://whimsical.com/building-matchup-XDNFzadmfuQ5c6SAous4SY@7YNFXnKbYxTaFmKVEDgBL)

In [18]:
in_pluto = pluto[~pluto['bbl'].isin(matched_buildings['bbl'])]
in_pw = preso[~(preso['id'].isin(matched_buildings['id']))]
print(in_pw.shape, in_pluto.shape)

(14381, 105) (130936, 92)


In [19]:
already_hidden = in_pw[in_pw['in_search'] != True]
print(in_pw.shape)
in_pw = in_pw[in_pw['in_search'] == True]
print(in_pw.shape)

(14381, 105)
(4562, 105)


In [20]:
keep = in_pw[~((in_pw['source'].str.contains('manual')) | (in_pw['source'].str.contains('pluto')))]
in_pw = in_pw[(in_pw['source'].str.contains('manual')) | (in_pw['source'].str.contains('pluto'))]
in_pw.shape

(3581, 105)

In [21]:
keep.shape

(981, 105)

In [22]:
keep.groupby('source')['id'].nunique()

source
                  1
pitney_bowes    980
Name: id, dtype: int64

In [23]:
in_pw['num_rls'].fillna(0, inplace=True)
in_pw['num_mlsli'].fillna(0, inplace=True)
in_pw['num_other_listings'].fillna(0, inplace=True)
in_pw['num_acris'].fillna(0, inplace=True)

In [24]:
no_listings = in_pw[(in_pw['num_acris'] + in_pw['num_mlsli'] + in_pw['num_other_listings'] + in_pw['num_rls']) == 0]

In [25]:
no_listings.shape

(1123, 105)

In [26]:
has_listings = in_pw[~in_pw['id'].isin(no_listings['id'])]
has_listings.shape

(2458, 105)

In [27]:
from_pluto = has_listings[has_listings['source'].str.contains('pluto')]
manually_created = has_listings[has_listings['source'].str.contains('manual')]

In [28]:
print(from_pluto.shape)
print(manually_created.shape)

(1990, 105)
(468, 105)


In [29]:
with open('data/in_pw.pkl', 'wb') as f:
    pkl.dump(in_pw, f)

with open('data/from_pluto.pkl', 'wb') as f:
    pkl.dump(from_pluto, f)

with open('data/manually_created.pkl', 'wb') as f:
    pkl.dump(manually_created, f)

In [30]:
from_pluto['last_acris_sale'] = pd.to_datetime(from_pluto['last_acris_sale'],format='%Y-%m-%d', errors='coerce')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from_pluto['last_acris_sale'] = pd.to_datetime(from_pluto['last_acris_sale'],format='%Y-%m-%d', errors='coerce')


In [31]:
from_pluto_post2017 = from_pluto[(from_pluto['last_acris_sale'] >= datetime.fromisoformat('2017-01-01')) | \
                           (from_pluto['last_listing'] >= datetime.fromisoformat('2017-01-01')) | \
                            (from_pluto['last_mlsli_listing'] >= datetime.fromisoformat('2017-01-01'))]

In [32]:
from_pluto_post2017.shape

(1527, 105)

In [33]:
from_pluto.shape

(1990, 105)

In [34]:
from_pluto_post2017['last_listing']

88       2019-05-15 03:58:52.735464
206      2019-05-15 02:58:22.756099
437      2019-05-15 03:51:56.164466
554      2019-05-15 05:39:45.907035
588      2019-05-15 02:55:15.236699
                    ...            
744973                          NaT
745676   2020-08-11 05:06:02.485419
747120   2020-08-11 06:41:09.204408
748111   2020-08-11 05:52:52.716649
748822   2019-05-15 04:34:29.052688
Name: last_listing, Length: 1527, dtype: datetime64[ns]

In [35]:
from_pluto_pre2017 = from_pluto[~(from_pluto['id'].isin(from_pluto_post2017['id']))]
from_pluto_pre2017.shape

(463, 105)

In [36]:
zips_bx = [10463, 10471,10466, 10469, 10470, 10475,10458, 10467, 10468,10461, 10462, 10464, 10465, 10472, 10473,10453, 10457, 10460 ,10451, 10452, 10456, 10454, 10455, 10459, 10474]
zips_bk = [11211, 11222, 11201, 11205, 11215, 11217, 11231, 11213, 11212, 11216, 11233, 11238, 11207, 11208, 11220, 11232, 11204, 11218, 11219, 11230, 11203, 11210, 11225, 11226, 11234, 11236, 11239, 11209, 11214, 11228, 11223, 11224, 11229, 11235, 11206, 11221, 11237]
zips_mn = [10031, 10032, 10033, 10034, 10040,10026, 10027, 10030, 10037, 10039,10029, 10035,10023, 10024, 10025,10021, 10028, 10044, 10128,10001, 10011, 10018, 10019, 10020, 10036,10010, 10016, 10017, 10022,10012, 10013, 10014,10002, 10003, 10009, 10004, 10005, 10006, 10007, 10038, 10280]
zips_qn = [11101, 11102, 11103, 11104, 11105, 11106,11368, 11369, 11370, 11372, 11373, 11377, 11378,11354, 11355, 11356, 11357, 11358, 11359, 11360,11361, 11362, 11363, 11364,11374, 11375, 11379, 11385,11365, 11366, 11367,11414, 11415, 11416, 11417, 11418, 11419, 11420, 11421,11412, 11423, 11432, 11433, 11434, 11435, 11436,11004, 11005, 11411, 11413, 11422, 11426, 11427, 11428, 11429,11691, 11692, 11693, 11694, 11695, 11697]
zips = zips_bk + zips_bx + zips_mn + zips_qn

In [37]:
print(manually_created.shape)
# manually_created_outside = manually_created[~manually_created['zip'].astype(int).isin(zips)]
# manually_created = manually_created[manually_created['zip'].astype(int).isin(zips)]
# print(manually_created.shape, manually_created_outside.shape)

(468, 105)


In [38]:
for_mlsli = manually_created[(manually_created['num_mlsli'] > 0) & (manually_created['num_rls'] == 0)]
for_rls = manually_created[(manually_created['num_mlsli'] == 0) & (manually_created['num_rls'] > 0)]
for_other = manually_created[~(manually_created['id'].isin(for_mlsli['id'])) & ~(manually_created['id'].isin(for_rls['id']))]

In [39]:
print(for_mlsli.shape, for_rls.shape, for_other.shape)

(410, 105) (32, 105) (26, 105)


In [40]:
# TODO: address matching between manually created buildings and PLUTO

## Saving out the categories
Need to save:
  - Update Building
    - update_df
  - Manual Review:
    - for_mlsli
    - for_other
    - for_rls
    - from_pluto_post2017
  - Deprecate:
    - from_pluto_pre2017
    - no_listings

In [41]:
preso_gdf['source_id'] = pd.to_numeric(preso_gdf['source_id'], errors='coerce')
for_mlsli_gdf = preso_gdf[preso_gdf['id'].isin(for_mlsli['id'])]
for_mlsli_gdf[['id', 'geometry', 'display_address', 'zip', 'source_id']].to_file('data/for_mlsli')
for_other_gdf = preso_gdf[preso_gdf['id'].isin(for_other['id'])]
for_other_gdf[['id', 'geometry', 'display_address', 'zip', 'source_id']].to_file('data/for_other')
for_rls_gdf = preso_gdf[preso_gdf['id'].isin(for_rls['id'])]
for_rls_gdf[['id', 'geometry', 'display_address', 'zip', 'source_id']].to_file('data/for_rls')
from_pluto_post2017_gdf = preso_gdf[preso_gdf['id'].isin(from_pluto_post2017['id'])]
from_pluto_post2017_gdf[['id', 'geometry', 'display_address', 'zip', 'source_id']].to_file('data/from_pluto_post2017')

  for_mlsli_gdf[['id', 'geometry', 'display_address', 'zip', 'source_id']].to_file('data/for_mlsli')
  for_other_gdf[['id', 'geometry', 'display_address', 'zip', 'source_id']].to_file('data/for_other')
  for_rls_gdf[['id', 'geometry', 'display_address', 'zip', 'source_id']].to_file('data/for_rls')
  from_pluto_post2017_gdf[['id', 'geometry', 'display_address', 'zip', 'source_id']].to_file('data/from_pluto_post2017')


In [42]:
update_df.to_csv('data/final_buildings_update.csv')

In [43]:
manual_review = pd.concat([for_mlsli, for_other, for_rls, from_pluto_post2017])
manual_review.to_csv('data/final_manual_review.csv')

In [44]:
dep = pd.concat([from_pluto_pre2017, no_listings])
dep.to_csv('data/final_deprecate.csv')

## Dealing with PLUTO only buildings


In [45]:
in_pluto.shape

(130936, 92)

In [46]:
in_pluto['bldgclass'].fillna('NaN', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  in_pluto['bldgclass'].fillna('NaN', inplace=True)


In [47]:
# filtering based on PW boundaries and building class
in_pluto = in_pluto[(in_pluto['borough'] != 'SI') & (in_pluto['longitude'] < -73.74376451)]
resbuildings = in_pluto[in_pluto['resarea'] > 0]
print(in_pluto.shape, resbuildings.shape)

(3791, 92) (2286, 92)


In [48]:
with open('data/resbuildings.pkl', 'wb') as f:
    pkl.dump(resbuildings, f)

In [49]:
dup_check = pluto.groupby(['address', 'zipcode'])['bbl'].nunique()

In [50]:
dup_check[dup_check >= 2]

address         zipcode
                10026.0    2
                10312.0    4
                10314.0    2
                10453.0    2
                10454.0    2
                          ..
WEST 8 STREET   11204.0    3
                11224.0    3
WEST 87 STREET  10024.0    2
WEST 9 STREET   11231.0    3
WEST 97 STREET  10025.0    2
Name: bbl, Length: 5421, dtype: int64

In [51]:
# TODO: is this right?? we should also be getting the buildings that have been 
# matched but were duplicates right?
dupes = pd.DataFrame(columns=resbuildings.columns)
for address, zipcode in dup_check[dup_check >= 2].index:
    _ = resbuildings[(resbuildings['address'] == address) & (resbuildings['zipcode'] == zipcode)]
    dupes = pd.concat([dupes, _])
print(dupes.shape)

(153, 92)


In [52]:
no_dupes = resbuildings[~resbuildings['bbl'].isin(dupes['bbl'])]

In [53]:
no_dupes.shape

(2133, 92)

In [51]:
try:
    with open('data/gdf.pkl', 'rb') as f:
        gdf = pkl.load(f)
except FileNotFoundError:
    shapefile_path = 'data/nyc_mappluto_23v1_1_shp/MapPLUTO.shp'
    gdf = gpd.read_file(shapefile_path)
    # formatting for geospatial join
    gdf.to_crs('epsg:4326', inplace=True)
    with open('data/gdf.pkl', 'wb') as f:
        pkl.dump(gdf, f)

In [52]:
gdf.head()

Unnamed: 0,Borough,Block,Lot,CD,BCT2020,BCTCB2020,CT2010,CB2010,SchoolDist,Council,...,FIRM07_FLA,PFIRM15_FL,Version,DCPEdited,Latitude,Longitude,Notes,Shape_Leng,Shape_Area,geometry
0,MN,1,10,101,1000500,10005000003,5,1000,2,1,...,1.0,1.0,23v1.1,,40.688766,-74.018682,,0.0,7478663.0,"POLYGON ((-74.01208 40.69205, -74.01215 40.692..."
1,MN,1,101,101,1000100,10001001001,1,1001,2,1,...,,1.0,23v1.1,,40.68992,-74.045337,,0.0,501897.3,"MULTIPOLYGON (((-74.04263 40.69004, -74.04257 ..."
2,BK,457,23,306,3007700,30077001003,77,1003,15,39,...,,,23v1.1,,40.678323,-73.991928,,0.0,1596.315,"POLYGON ((-73.99182 40.67841, -73.99198 40.678..."
3,MN,1,201,101,1000100,10001001000,1,1000,2,0,...,,1.0,23v1.1,,40.698188,-74.041329,,0.0,1148539.0,"POLYGON ((-74.03823 40.69836, -74.03868 40.698..."
4,MN,2,1,101,1000900,10009001022,9,1025,2,1,...,1.0,1.0,23v1.1,t,40.700369,-74.012911,,0.0,100825.0,"POLYGON ((-74.01313 40.69977, -74.01313 40.699..."


In [53]:
gdf.columns

Index(['Borough', 'Block', 'Lot', 'CD', 'BCT2020', 'BCTCB2020', 'CT2010',
       'CB2010', 'SchoolDist', 'Council', 'ZipCode', 'FireComp', 'PolicePrct',
       'HealthCent', 'HealthArea', 'Sanitboro', 'SanitDistr', 'SanitSub',
       'Address', 'ZoneDist1', 'ZoneDist2', 'ZoneDist3', 'ZoneDist4',
       'Overlay1', 'Overlay2', 'SPDist1', 'SPDist2', 'SPDist3', 'LtdHeight',
       'SplitZone', 'BldgClass', 'LandUse', 'Easements', 'OwnerType',
       'OwnerName', 'LotArea', 'BldgArea', 'ComArea', 'ResArea', 'OfficeArea',
       'RetailArea', 'GarageArea', 'StrgeArea', 'FactryArea', 'OtherArea',
       'AreaSource', 'NumBldgs', 'NumFloors', 'UnitsRes', 'UnitsTotal',
       'LotFront', 'LotDepth', 'BldgFront', 'BldgDepth', 'Ext', 'ProxCode',
       'IrrLotCode', 'LotType', 'BsmtCode', 'AssessLand', 'AssessTot',
       'ExemptTot', 'YearBuilt', 'YearAlter1', 'YearAlter2', 'HistDist',
       'Landmark', 'BuiltFAR', 'ResidFAR', 'CommFAR', 'FacilFAR', 'BoroCode',
       'BBL', 'CondoNo', 'Tract2

In [57]:
no_dupes_gdf = gdf[gdf['BBL'].isin(no_dupes['bbl'])]
print(no_dupes.shape, no_dupes_gdf.shape)

(2133, 92) (2133, 95)


In [58]:
for c in preso.columns:
    if 'lat' in c:
        print(c)

centroid_latitude


In [59]:
preso.dropna(subset=['centroid_latitude', 'centroid_longitude'])[['centroid_latitude', 'centroid_longitude']]

Unnamed: 0,centroid_latitude,centroid_longitude
0,40.705330,-74.002719
1,40.705284,-74.014269
2,40.703212,-74.006179
3,40.705775,-74.007242
4,40.703809,-74.012756
...,...,...
749213,40.780273,-73.780304
749214,40.695855,-73.799332
749215,40.703458,-73.811952
749216,40.821019,-73.821910


In [60]:
preso.centroid_latitude

0         40.705330
1         40.705284
2         40.703212
3         40.705775
4         40.703809
            ...    
749213    40.780273
749214    40.695855
749215    40.703458
749216    40.821019
749218    40.633818
Name: centroid_latitude, Length: 744425, dtype: object

In [54]:
preso_geo = preso.dropna(subset=['centroid_latitude', 'centroid_longitude'])
geometry = gpd.points_from_xy(preso_geo.centroid_longitude, preso_geo.centroid_latitude)
preso_geo = preso_geo.drop(['centroid_latitude', 'centroid_longitude'], axis=1)
preso_geo = gpd.GeoDataFrame(preso_geo, crs="EPSG:4326", geometry=geometry)
geometry

<GeometryArray>
[<POINT (-74.003 40.705)>, <POINT (-74.014 40.705)>, <POINT (-74.006 40.703)>,
 <POINT (-74.007 40.706)>, <POINT (-74.013 40.704)>, <POINT (-74.012 40.703)>,
 <POINT (-74.013 40.703)>, <POINT (-74.007 40.708)>, <POINT (-74.012 40.703)>,
 <POINT (-74.013 40.705)>,
 ...
 <POINT (-73.816 40.742)>, <POINT (-73.824 40.763)>, <POINT (-73.778 40.739)>,
  <POINT (-74.012 40.71)>, <POINT (-73.986 40.688)>,   <POINT (-73.78 40.78)>,
 <POINT (-73.799 40.696)>, <POINT (-73.812 40.703)>, <POINT (-73.822 40.821)>,
 <POINT (-73.948 40.634)>]
Length: 744368, dtype: geometry

In [62]:
sp1 = gpd.sjoin(preso_geo, no_dupes_gdf, how = 'inner', op = 'within')

In [63]:
sp1.shape

(1446, 198)

In [64]:
no_dupes_gdf['BBL'].nunique()

2133

In [65]:
no_dupes_gdf.shape

(2133, 95)

In [66]:
sp1['BBL'].nunique()

947

In [67]:
sp1['num_rls'].fillna(0, inplace=True)
sp1['num_mlsli'].fillna(0, inplace=True)
sp1['num_other_listings'].fillna(0, inplace=True)
sp1['num_acris'].fillna(0, inplace=True)

In [68]:
sp1_no_listings = sp1[(sp1['num_acris'] + sp1['num_mlsli'] + sp1['num_other_listings'] + sp1['num_rls']) == 0]
sp1_no_listings.shape

(525, 198)

In [69]:
sp1_no_listings.groupby(['BBL'])['id'].nunique()

BBL
1.000908e+09    1
1.001320e+09    1
1.003728e+09    1
1.004078e+09    1
1.004208e+09    1
               ..
4.158538e+09    2
4.159100e+09    1
4.160620e+09    1
4.161668e+09    1
4.161808e+09    5
Name: id, Length: 284, dtype: int64

In [70]:
sp1_has_listings = sp1[~sp1['BBL'].isin(sp1_no_listings['BBL'])]
sp1_has_listings.shape

(759, 198)

In [71]:
sp1_has_listings.groupby('BBL')['id'].nunique()

BBL
1.000168e+09    1
1.000638e+09    3
1.001418e+09    1
1.002240e+09    1
1.002250e+09    1
               ..
4.160640e+09    1
4.161180e+09    1
4.161260e+09    1
4.161560e+09    1
4.161668e+09    4
Name: id, Length: 663, dtype: int64

In [72]:
sp1.head()

Unnamed: 0,id,city,state,zip,google_map_params,building_bike_storage,building_doorman,building_elevator,building_gym,building_laundry,...,PLUTOMapID,FIRM07_FLA,PFIRM15_FL,Version,DCPEdited,Latitude,Longitude,Notes,Shape_Leng,Shape_Area
959,15991,New York,NY,10028,,,,,,,...,1,,,23v1.1,,40.774952,-73.955392,,0.0,2595.442358
1267,20873,New York,NY,10021,,,True,True,,,...,1,,,23v1.1,,40.771223,-73.96485,,0.0,4788.678065
1920,31185,New York,NY,10027,,,,,,,...,1,,,23v1.1,,40.807039,-73.94448,,0.0,12869.472509
423896,41136,New York,NY,10027,,,,,,,...,1,,,23v1.1,,40.807039,-73.94448,,0.0,12869.472509
564289,35281,New York,NY,10027,,,,,,,...,1,,,23v1.1,,40.807039,-73.94448,,0.0,12869.472509


In [73]:
sp1_matched = sp1[sp1['id'].isin(matched_buildings['id'])]

In [74]:
sp1_matched.shape

(344, 198)

In [75]:
sp1_unmatched = sp1[~sp1['id'].isin(matched_buildings['id'])]

In [76]:
sp1_matched.to_csv('data/sp1_matched.csv')
sp1_unmatched.to_csv('data/sp1_unmatched.csv')

In [77]:
sp1_unmatched.shape

(1102, 198)

In [78]:
sp1_unmatched_mn = sp1_unmatched[sp1_unmatched['Borough'] == 'MN']
sp1_unmatched_mn.shape

(142, 198)

In [79]:
sp1_unmatched_not_mn = sp1_unmatched[sp1_unmatched['Borough'] != 'MN']
sp1_unmatched_not_mn.shape

(960, 198)

## Saving out the categories
Need to save:
  - Update Building
    - update_df
  - Manual Review:
    - for_mlsli
    - for_other
    - for_rls
    - from_pluto_post2017
  - Deprecate:
    - from_pluto_pre2017
    - no_listings

In [80]:
no_dupes_gdf.columns

Index(['Borough', 'Block', 'Lot', 'CD', 'BCT2020', 'BCTCB2020', 'CT2010',
       'CB2010', 'SchoolDist', 'Council', 'ZipCode', 'FireComp', 'PolicePrct',
       'HealthCent', 'HealthArea', 'Sanitboro', 'SanitDistr', 'SanitSub',
       'Address', 'ZoneDist1', 'ZoneDist2', 'ZoneDist3', 'ZoneDist4',
       'Overlay1', 'Overlay2', 'SPDist1', 'SPDist2', 'SPDist3', 'LtdHeight',
       'SplitZone', 'BldgClass', 'LandUse', 'Easements', 'OwnerType',
       'OwnerName', 'LotArea', 'BldgArea', 'ComArea', 'ResArea', 'OfficeArea',
       'RetailArea', 'GarageArea', 'StrgeArea', 'FactryArea', 'OtherArea',
       'AreaSource', 'NumBldgs', 'NumFloors', 'UnitsRes', 'UnitsTotal',
       'LotFront', 'LotDepth', 'BldgFront', 'BldgDepth', 'Ext', 'ProxCode',
       'IrrLotCode', 'LotType', 'BsmtCode', 'AssessLand', 'AssessTot',
       'ExemptTot', 'YearBuilt', 'YearAlter1', 'YearAlter2', 'HistDist',
       'Landmark', 'BuiltFAR', 'ResidFAR', 'CommFAR', 'FacilFAR', 'BoroCode',
       'BBL', 'CondoNo', 'Tract2

In [81]:
not_in_pw = no_dupes_gdf[~no_dupes_gdf['BBL'].isin(sp1['BBL'])]

In [82]:
print(sp1['BBL'].nunique(), sp1_no_listings['BBL'].nunique(), sp1_has_listings['BBL'].nunique())
print(sp1.shape, sp1_no_listings.shape, sp1_has_listings.shape)

947 284 663
(1446, 198) (525, 198) (759, 198)


In [83]:
no_dupes_gdf.shape

(2133, 95)

In [84]:
not_in_pw['BBL'].nunique()

1186

In [85]:
not_in_pw[not_in_pw['Borough'] == 'MN'].shape

(19, 95)

In [86]:
not_in_pw[not_in_pw['Borough'] != 'MN'].shape

(1167, 95)

In [87]:
dupes_matched = dupes[dupes['bbl'].isin(preso['source_id'])] 
dupes_matched_post17 = dupes_matched[dupes_matched['yearbuilt'] >= 2017] #new construction
dupes_matched_pre17 = dupes_matched[dupes_matched['yearbuilt'] < 2017] #update other building

dupes_unmatched = dupes[~dupes['bbl'].isin(matched_buildings['bbl'])] # manual review

In [88]:
dupes_matched.shape

(0, 92)

In [89]:
dupes['bbl']

559567    4.115550e+09
640674    4.115550e+09
797527    4.115550e+09
218958    2.044700e+09
758524    3.035180e+09
              ...     
515701    3.074350e+09
341624    2.048580e+09
333020    2.049020e+09
550447    3.048880e+09
748775    3.024360e+09
Name: bbl, Length: 153, dtype: float64

In [90]:
preso.source_id

0         1.000730e+09
1         1.000130e+09
2         1.000360e+09
3         1.000398e+09
4         1.000100e+09
              ...     
749213    4.058720e+09
749214    4.101220e+09
749215    4.096830e+09
749216    2.055790e+09
749218             NaN
Name: source_id, Length: 744425, dtype: float64

In [91]:
dupes_unmatched.bbl

559567    4.115550e+09
640674    4.115550e+09
797527    4.115550e+09
218958    2.044700e+09
758524    3.035180e+09
              ...     
515701    3.074350e+09
341624    2.048580e+09
333020    2.049020e+09
550447    3.048880e+09
748775    3.024360e+09
Name: bbl, Length: 153, dtype: float64

In [92]:
merge_check = sp1_has_listings.groupby('BBL')['id'].nunique()
sp1_has_listings[sp1_has_listings['BBL'].isin(merge_check[merge_check > 1].index)]['id'].nunique()

134

In [93]:
split_check = sp1_has_listings.groupby('id')['BBL'].nunique()
sp1_has_listings[sp1_has_listings['id'].isin(split_check[split_check > 1].index)]['BBL'].nunique()

0

In [94]:
split_check > 1

id
115        False
1896       False
2566       False
2646       False
2828       False
           ...  
1826711    False
1826906    False
5108856    False
5108860    False
7069212    False
Name: BBL, Length: 759, dtype: bool

In [95]:
sp1_has_listings = sp1_has_listings.drop(['index_right'], axis=1)
sp2 = gpd.sjoin(preso_gdf, sp1_has_listings, how = 'inner', op = 'intersects')

In [96]:
for x in preso_gdf.columns:
    if 'index' in x:
        print('preso', x)
for x in sp1_has_listings.columns:
    if 'index' in x:
        print(x)

In [97]:
gdf.columns

Index(['Borough', 'Block', 'Lot', 'CD', 'BCT2020', 'BCTCB2020', 'CT2010',
       'CB2010', 'SchoolDist', 'Council', 'ZipCode', 'FireComp', 'PolicePrct',
       'HealthCent', 'HealthArea', 'Sanitboro', 'SanitDistr', 'SanitSub',
       'Address', 'ZoneDist1', 'ZoneDist2', 'ZoneDist3', 'ZoneDist4',
       'Overlay1', 'Overlay2', 'SPDist1', 'SPDist2', 'SPDist3', 'LtdHeight',
       'SplitZone', 'BldgClass', 'LandUse', 'Easements', 'OwnerType',
       'OwnerName', 'LotArea', 'BldgArea', 'ComArea', 'ResArea', 'OfficeArea',
       'RetailArea', 'GarageArea', 'StrgeArea', 'FactryArea', 'OtherArea',
       'AreaSource', 'NumBldgs', 'NumFloors', 'UnitsRes', 'UnitsTotal',
       'LotFront', 'LotDepth', 'BldgFront', 'BldgDepth', 'Ext', 'ProxCode',
       'IrrLotCode', 'LotType', 'BsmtCode', 'AssessLand', 'AssessTot',
       'ExemptTot', 'YearBuilt', 'YearAlter1', 'YearAlter2', 'HistDist',
       'Landmark', 'BuiltFAR', 'ResidFAR', 'CommFAR', 'FacilFAR', 'BoroCode',
       'BBL', 'CondoNo', 'Tract2

In [98]:
preso_gdf.columns

Index(['id', 'city', 'state', 'zip', 'google_map_params',
       'building_bike_storage', 'building_doorman', 'building_elevator',
       'building_gym', 'building_laundry',
       ...
       'num_mlsli', 'last_mlsli_listing', 'building_id', 'num_other_listings',
       'last_listing', 'building_id', 'num_acris', 'last_acris_created',
       'last_acris_sale', 'geometry_json'],
      dtype='object', length=105)

In [99]:
sp2.head()

Unnamed: 0,id_left,city_left,state_left,zip_left,google_map_params_left,building_bike_storage_left,building_doorman_left,building_elevator_left,building_gym_left,building_laundry_left,...,PLUTOMapID,FIRM07_FLA,PFIRM15_FL,Version,DCPEdited,Latitude,Longitude,Notes,Shape_Leng,Shape_Area
959,15991,New York,NY,10028,,,,,,,...,1,,,23v1.1,,40.774952,-73.955392,,0.0,2595.442358
1267,20873,New York,NY,10021,,,True,True,,,...,1,,,23v1.1,,40.771223,-73.96485,,0.0,4788.678065
2300,36726,New York,NY,10032,,,,,,,...,1,,,23v1.1,,40.836486,-73.942308,,0.0,4042.880381
2810,45331,Brooklyn,NY,11210,,,,,,,...,1,,,23v1.1,,40.632673,-73.944412,,0.0,2194.139191
2979,48096,Brooklyn,NY,11216,,,,,,,...,1,,,23v1.1,,40.683001,-73.944154,,0.0,8178.308193


In [100]:
sp2['id_left'].nunique()

789

In [101]:
sp2['source_id_left'].nunique()

726

In [102]:
sp2['BBL'].nunique()

657

In [103]:
sp2_grouped = sp2.groupby('BBL')['id_left'].nunique()

In [104]:
sp2_grouped[sp2_grouped > 1]

BBL
1.000638e+09    3
1.003468e+09    2
1.003468e+09    2
1.004268e+09    2
1.006938e+09    2
               ..
4.123900e+09    2
4.124580e+09    2
4.157930e+09    2
4.160050e+09    3
4.161668e+09    4
Name: id_left, Length: 72, dtype: int64

In [105]:
sp2_grouped[sp2_grouped == 1]

BBL
1.000168e+09    1
1.001418e+09    1
1.002240e+09    1
1.002250e+09    1
1.002760e+09    1
               ..
4.160620e+09    1
4.160640e+09    1
4.161180e+09    1
4.161260e+09    1
4.161560e+09    1
Name: id_left, Length: 585, dtype: int64

In [106]:
sp2_grouped2 = sp2.groupby('id_left')['BBL'].nunique()

In [107]:
sp2_grouped2[sp2_grouped2 > 1]

id_left
124930    2
220639    3
627858    2
629054    2
650816    2
654401    2
661534    2
Name: BBL, dtype: int64

In [108]:
sp2_grouped2[sp2_grouped2 == 1]

id_left
115        1
1896       1
2566       1
2646       1
2828       1
          ..
1826711    1
1826906    1
5108856    1
5108860    1
7069212    1
Name: BBL, Length: 782, dtype: int64

In [109]:
sp1_has_listings.shape

(759, 197)

In [110]:
sp2_grouped

BBL
1.000168e+09    1
1.000638e+09    3
1.001418e+09    1
1.002240e+09    1
1.002250e+09    1
               ..
4.160640e+09    1
4.161180e+09    1
4.161260e+09    1
4.161560e+09    1
4.161668e+09    4
Name: id_left, Length: 657, dtype: int64

In [111]:
sp2_grouped[sp2_grouped == 1]

BBL
1.000168e+09    1
1.001418e+09    1
1.002240e+09    1
1.002250e+09    1
1.002760e+09    1
               ..
4.160620e+09    1
4.160640e+09    1
4.161180e+09    1
4.161260e+09    1
4.161560e+09    1
Name: id_left, Length: 585, dtype: int64

In [112]:
sp2_grouped2[sp2_grouped2 > 1]

id_left
124930    2
220639    3
627858    2
629054    2
650816    2
654401    2
661534    2
Name: BBL, dtype: int64

In [113]:
update_buildings_pluto = sp2_grouped[sp2_grouped == 1]

In [114]:
sp2_update_buildings = sp2[sp2['BBL'].isin(update_buildings_pluto.index)]
sp2_update_buildings.head()

Unnamed: 0,id_left,city_left,state_left,zip_left,google_map_params_left,building_bike_storage_left,building_doorman_left,building_elevator_left,building_gym_left,building_laundry_left,...,PLUTOMapID,FIRM07_FLA,PFIRM15_FL,Version,DCPEdited,Latitude,Longitude,Notes,Shape_Leng,Shape_Area
959,15991,New York,NY,10028,,,,,,,...,1,,,23v1.1,,40.774952,-73.955392,,0.0,2595.442358
1267,20873,New York,NY,10021,,,True,True,,,...,1,,,23v1.1,,40.771223,-73.96485,,0.0,4788.678065
2300,36726,New York,NY,10032,,,,,,,...,1,,,23v1.1,,40.836486,-73.942308,,0.0,4042.880381
2810,45331,Brooklyn,NY,11210,,,,,,,...,1,,,23v1.1,,40.632673,-73.944412,,0.0,2194.139191
5289,85000,Brooklyn,NY,11220,,,,,,,...,1,,,23v1.1,,40.635227,-74.008607,,0.0,2537.012498


In [115]:
cols_to_drop = [x for x in sp2.columns if '_right' in x]
sp2.drop(columns=cols_to_drop, inplace=True)
cols_update = [x.replace('_left', '') if '_left' in x else x for x in sp2.columns]
sp2.columns = cols_update
sp2_update_buildings = sp2[sp2['BBL'].isin(update_buildings_pluto.index)]
sp2_update_buildings.head()

Unnamed: 0,id,city,state,zip,google_map_params,building_bike_storage,building_doorman,building_elevator,building_gym,building_laundry,...,PLUTOMapID,FIRM07_FLA,PFIRM15_FL,Version,DCPEdited,Latitude,Longitude,Notes,Shape_Leng,Shape_Area
959,15991,New York,NY,10028,,,,,,,...,1,,,23v1.1,,40.774952,-73.955392,,0.0,2595.442358
1267,20873,New York,NY,10021,,,True,True,,,...,1,,,23v1.1,,40.771223,-73.96485,,0.0,4788.678065
2300,36726,New York,NY,10032,,,,,,,...,1,,,23v1.1,,40.836486,-73.942308,,0.0,4042.880381
2810,45331,Brooklyn,NY,11210,,,,,,,...,1,,,23v1.1,,40.632673,-73.944412,,0.0,2194.139191
5289,85000,Brooklyn,NY,11220,,,,,,,...,1,,,23v1.1,,40.635227,-74.008607,,0.0,2537.012498


In [116]:
def update_buildings(matched_buildings, mapped_fields):
    update_df = {'id': [], 'field_name': [], 'old_value': [],
             'new_value': [], 'in_search': []}
    for fnames in mapped_fields:
        wrong = matched_buildings[matched_buildings[f'{fnames[0]}'] != \
                                matched_buildings[f'{fnames[1]}']]
        wrong = wrong[(wrong[f'{fnames[1]}'] != 0)]
        wrong = wrong[~(wrong[f'{fnames[1]}'].isna())]
        if fnames[0] in ('num_stories', 'year_built'):
            # new value should be greater for both year_built and num_stories
            wrong = wrong[wrong[f'{fnames[0]}'] < wrong[f'{fnames[1]}']]
        update_df['id'] += list(wrong['id'])
        update_df['old_value'] += list(wrong[f'{fnames[0]}'])
        update_df['new_value'] += list(wrong[f'{fnames[1]}'])
        update_df['in_search'] += list(wrong['in_search'])
        update_df['field_name'] += [f'{fnames[0]}'] * len(wrong['id'])
    
    update_df = pd.DataFrame(update_df)
    return update_df

In [117]:
mapped_fields = [('zip', 'ZipCode'),
                 ('year_built', 'YearBuilt'),
                 #('school_district_code', 'schooldist'),
                 ('num_stories', 'NumFloors'), 
                 ('num_units', 'UnitsRes'), 
                 ('lot_area', 'LotArea'), 
                 ('lot_front', 'LotFront'), 
                 ('lot_depth', 'LotDepth'),
                 ('building_class', 'BldgClass'),
                 ('building_front', 'BldgFront'),
                 ('building_depth', 'BldgDepth'),
                 ('building_area', 'BldgArea'),
                 ('landmark', 'Landmark')]

# map mismatched fields
sp2_update_buildings['zip'] = pd.to_numeric(sp2_update_buildings['zip'])
sp2_update_buildings['NumFloors'] = np.ceil(sp2_update_buildings['NumFloors'])

# map landmark field
sp2_update_buildings['Landmark'] = [True if type(x) == str else False for x in sp2_update_buildings['Landmark']]
sp2_update_buildings['landmark'] = [x if x else 'f' for x in sp2_update_buildings['landmark']]
sp2_update_buildings['landmark'] = [False if (x.lower() in ('f', 'false') or x.isspace()) else True for x in sp2_update_buildings['landmark']]

update_buildings2 = update_buildings(sp2_update_buildings, mapped_fields)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super(GeoDataFrame, self).__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super(GeoDataFrame, self).__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super(GeoDataFrame, self).__setitem__(key, value)
A value is trying to be set on a copy of a slice from a

In [118]:
update_buildings2['id'].nunique()

578

In [119]:
for c in sp2.columns:
    print(c)

id
city
state
zip
google_map_params
building_bike_storage
building_doorman
building_elevator
building_gym
building_laundry
building_pets
building_pool
building_prewar
building_rooftop
building_storage
building_style_code
county_code
school_district_code
created_at
updated_at
neighborhood_id
source_id
display_address
display_name
building_garage
garage_type_code
year_built
benefit_construction_completed
num_stories
num_units
tax_class
centroid_latitude
centroid_longitude
tax_benefit_code
tax_benefit_start_date
tax_exemption_end_date
tax_abatement_end_date
j51_crc_perc
building_class
landmark
building_front
building_depth
tranche
new_development
in_search
architect
developer
management_firm_id
building_style
conversion_notes
account_executive_name
account_executive_phone
service_level
pet_policy
sublet_policy
front_desk_phone
building_type
source
building_complex_id
apportionment_date
lot_front
lot_depth
built_far
corner_lot
building_area
lot_area
zoning_primary
zoning_secondary
geometry

In [120]:
matched_buildings.shape

(730044, 197)

In [121]:
sp2_grouped[sp2_grouped == 1]

BBL
1.000168e+09    1
1.001418e+09    1
1.002240e+09    1
1.002250e+09    1
1.002760e+09    1
               ..
4.160620e+09    1
4.160640e+09    1
4.161180e+09    1
4.161260e+09    1
4.161560e+09    1
Name: id_left, Length: 585, dtype: int64

In [122]:
sp2[sp2['BBL'].isin(sp2_grouped[sp2_grouped == 1].index)].groupby('source')['id'].nunique()

source
manual_future_building          65
manually_entered_backlog_22      2
pluto_v15_1                      1
pluto_v17_1                    517
Name: id, dtype: int64

In [123]:
with open('data/buildings_final2.pkl', 'rb') as f:
    preso = pkl.load(f)
preso.groupby('source')['id'].nunique()

KeyboardInterrupt: 

In [None]:
preso_geo.groupby('source')['id'].nunique()

source
manual_future_buildin               1
manual_future_building           1192
manual_future_buildings             1
manual_future_listing               2
manually entered                   76
manually_entered                  414
manually_entered_backlog_22        44
manually_entered_future             2
pitney_bowes                      979
pluto_merge                         1
pluto_split                        35
pluto_v09_1                      2141
pluto_v10_1                      1182
pluto_v10_2                       798
pluto_v11_1                       544
pluto_v11_2                       565
pluto_v12_1                       515
pluto_v12_2                       427
pluto_v13_1                       480
pluto_v13_2                       509
pluto_v14_1                       767
pluto_v14_2                      1031
pluto_v15_1                      1351
pluto_v16_1                       277
pluto_v16_2                      2171
pluto_v17_1                    728861
Name:

In [None]:
gs_match = sp2[sp2['BBL'].isin(sp2_grouped[sp2_grouped == 1].index)]
gs_preso_gdf = preso_gdf[preso_gdf['id'].isin(gs_match['id'])]
gs_preso_gdf.drop(columns=['created_at',
                            'updated_at',
                            'new_dev_start_date',
                            'new_dev_end_date',
                            'land_lease_expiration',
                            'last_rls_listing',
                            'last_mlsli_listing',
                            'last_listing',
                            'last_acris_created'], inplace=True)
gs_preso_gdf['source_id'] = pd.to_numeric(gs_preso_gdf['source_id'], errors='coerce')
gs_preso_gdf[['id', 'geometry', 'display_address', 'zip', 'source_id']].to_file('data/gs_one_one')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gs_preso_gdf.drop(columns=['created_at',
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super(GeoDataFrame, self).__setitem__(key, value)
  pd.Int64Index,
  gs_preso_gdf[['id', 'geometry', 'display_address', 'zip', 'source_id']].to_file('data/gs_one_one')


In [None]:
gs_match = sp2[sp2['BBL'].isin(sp2_grouped[sp2_grouped > 1].index)]
gs_preso_gdf = preso_gdf[preso_gdf['id'].isin(gs_match['id'])]
gs_preso_gdf.drop(columns=['created_at',
                            'updated_at',
                            'new_dev_start_date',
                            'new_dev_end_date',
                            'land_lease_expiration',
                            'last_rls_listing',
                            'last_mlsli_listing',
                            'last_listing',
                            'last_acris_created'], inplace=True)
gs_preso_gdf['source_id'] = pd.to_numeric(gs_preso_gdf['source_id'], errors='coerce')
gs_preso_gdf[['id', 'geometry', 'display_address', 'zip', 'source_id']].to_file('data/gs_one_many')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gs_preso_gdf.drop(columns=['created_at',
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super(GeoDataFrame, self).__setitem__(key, value)
  pd.Int64Index,
  gs_preso_gdf[['id', 'geometry', 'display_address', 'zip', 'source_id']].to_file('data/gs_one_many')


In [None]:
gs_match = sp2[sp2['id'].isin(sp2_grouped2[sp2_grouped2 > 1].index)]
gs_preso_gdf = preso_gdf[preso_gdf['id'].isin(gs_match['id'])]
gs_preso_gdf.drop(columns=['created_at',
                            'updated_at',
                            'new_dev_start_date',
                            'new_dev_end_date',
                            'land_lease_expiration',
                            'last_rls_listing',
                            'last_mlsli_listing',
                            'last_listing',
                            'last_acris_created'], inplace=True)
gs_preso_gdf['source_id'] = pd.to_numeric(gs_preso_gdf['source_id'], errors='coerce')
gs_preso_gdf[['id', 'geometry', 'display_address', 'zip', 'source_id']].to_file('data/gs_many_one')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gs_preso_gdf.drop(columns=['created_at',
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super(GeoDataFrame, self).__setitem__(key, value)
  pd.Int64Index,
  gs_preso_gdf[['id', 'geometry', 'display_address', 'zip', 'source_id']].to_file('data/gs_many_one')


In [None]:
for c in gs_preso_gdf.columns:
    if 'last' in c:
        print(c)

years_last_altered
last_acris_sale


In [None]:
from pandas.api.types import is_datetime64_any_dtype as is_datetime

[column for column in preso_gdf.columns if is_datetime(preso_gdf[column])]

['created_at',
 'updated_at',
 'new_dev_start_date',
 'new_dev_end_date',
 'land_lease_expiration',
 'last_rls_listing',
 'last_mlsli_listing',
 'last_listing',
 'last_acris_created']

In [None]:
preso_gdf.columns

Index(['id', 'city', 'state', 'zip', 'google_map_params',
       'building_bike_storage', 'building_doorman', 'building_elevator',
       'building_gym', 'building_laundry',
       ...
       'num_mlsli', 'last_mlsli_listing', 'building_id', 'num_other_listings',
       'last_listing', 'building_id', 'num_acris', 'last_acris_created',
       'last_acris_sale', 'geometry_json'],
      dtype='object', length=105)

In [None]:
preso_gdf['geometry'] = [x.geoms[0] for x in preso_gdf['geometry']]

In [None]:
matched_buildings.shape

(730044, 197)

In [None]:
matched_buildings[matched_buildings['id'].isin(matched_buildings.groupby('id')['source_id'].nunique() > 1)]

Unnamed: 0,id,city,state,zip,google_map_params,building_bike_storage,building_doorman,building_elevator,building_gym,building_laundry,...,appbbl,appdate,plutomapid,firm07_flag,pfirm15_flag,version,dcpedited,latitude,longitude,notes


In [None]:
print(matched_buildings.shape, matched_buildings.id.nunique(), matched_buildings.bbl.nunique())

(730044, 197) 730044 725967


In [None]:
dup_bbls = matched_buildings.groupby('bbl')['id'].nunique()[matched_buildings.groupby('bbl')['id'].nunique() > 1]

In [None]:
matched_buildings[matched_buildings.bbl.isin(dup_bbls.index)].groupby(['in_search'])['id'].nunique()

in_search
False    4497
True     3105
Name: id, dtype: int64

In [None]:
_ = matched_buildings[(matched_buildings.bbl.isin(dup_bbls.index)) & (matched_buildings['in_search'] == True)]
print(_.shape, _['id'].nunique(), _['bbl'].nunique())

(3105, 197) 3105 3059


In [None]:
matched_buildings[matched_buildings.bbl.isin(matched_buildings.groupby('bbl')['id'].nunique()[matched_buildings.groupby('bbl')['id'].nunique() > 1].index)][['id', 'display_address', 'source_id']]

Unnamed: 0,id,display_address,source_id
87,1345,345 West 58th St,1.010490e+09
88,42939,30 West 60th Street,1.010490e+09
277,4413,30 East 30th Street,1.008590e+09
278,661058,30 EAST 30 STREET,1.008590e+09
296,4713,545 WEST 37TH STREET,1.007090e+09
...,...,...,...
723663,662057,28-13 ASTORIA BOULEVARD,4.008390e+09
729725,831929,2681 MARION AVENUE,2.032870e+09
729726,860737,2681 Marion Avenue,2.032870e+09
730021,855928,1993 Arthur Avenue,2.030680e+09


In [None]:
not_in_pw.shape

(1186, 95)

In [None]:
not_in_pw.to_file('data/not_in_pw')

  pd.Int64Index,


In [None]:
preso_gdf[['id', 'display_address', 'source_id', 'geometry']].to_file('data/preso_gdf')

  pd.Int64Index,
  preso_gdf[['id', 'display_address', 'source_id', 'geometry']].to_file('data/preso_gdf')


KeyboardInterrupt: 

In [None]:
no_dupes_gdf.to_file('data/no_dupes')

  pd.Int64Index,


In [11]:
update_df = pd.read_csv('data/update_df.csv')
update_df['id'] = pd.to_numeric(update_df['id'], 'coerce')

  update_df = pd.read_csv('data/update_df.csv')


In [12]:
update_df[update_df['id'] == 636458]

Unnamed: 0.1,Unnamed: 0,id,field_name,old_value,new_value,in_search
53478,53478,636458,num_units,802.0,19201.0,True
371232,371232,636458,lot_area,1800.0,38000,True
452828,452828,636458,lot_front,18.0,20.5,True
484129,484129,636458,building_class,V1,RM,True
499306,499306,636458,building_front,0.0,190.0,True
513906,513906,636458,building_depth,0.0,200.0,True
539262,539262,636458,building_area,0.0,707634,True
1132712,1132712,636458,geometry_x,MULTIPOLYGON (((-73.944240962493 40.7479289436...,POLYGON ((-73.94420534847346 40.74763782275416...,True


In [13]:
matched_buildings[matched_buildings['id'] == 636458]

Unnamed: 0,id,city,state,zip,google_map_params,building_bike_storage,building_doorman,building_elevator,building_gym,building_laundry,...,appbbl,appdate,plutomapid,firm07_flag,pfirm15_flag,version,dcpedited,latitude,longitude,notes
588521,636458,Queens,NY,11101,,,,,,,...,,,1,,,23v1.1,,40.747961,-73.944457,


In [28]:
preso[preso['id'] == 514129]['source']

594114    pluto_v17_1
Name: source, dtype: object

In [14]:
pluto[pluto['bbl'] == 4004377502]

Unnamed: 0,borough,block,lot,cd,bct2020,bctcb2020,ct2010,cb2010,schooldist,council,...,appbbl,appdate,plutomapid,firm07_flag,pfirm15_flag,version,dcpedited,latitude,longitude,notes
692294,QN,437,7502,402.0,4001903.0,40019030000.0,19.0,1059.0,30.0,26.0,...,,,1,,,23v1.1,,40.747961,-73.944457,


In [27]:
pluto['address'].fillna('na', inplace=True)
pluto[pluto['address'].str.contains('11 44 DRIVE')][['address', 'bbl']]

Unnamed: 0,address,bbl
65639,9-11 44 DRIVE,4004500000.0
413917,11-11 44 DRIVE,4004470000.0


In [18]:
gdf[gdf['BBL'] == 4004377502]['geometry'].to_json()

'{"type": "FeatureCollection", "features": [{"id": "428653", "type": "Feature", "properties": {}, "geometry": {"type": "Polygon", "coordinates": [[[-73.94420534847346, 40.747637822754164], [-73.94421008999788, 40.7476245812946], [-73.9442740531848, 40.74763698074852], [-73.94434051551112, 40.747649865020755], [-73.94440116869998, 40.74766162254412], [-73.94447048533753, 40.747675060953874], [-73.94456094254194, 40.74769259601946], [-73.94489668008083, 40.74775768124105], [-73.94486061047967, 40.74785842412541], [-73.94479696974952, 40.748036172165904], [-73.94470239876253, 40.74830031052474], [-73.94436666452401, 40.74823520859924], [-73.9440169470451, 40.74816739462963], [-73.94410764390622, 40.74790191854325], [-73.94419747648031, 40.74765101935653], [-73.9442155250827, 40.74763979522918], [-73.94420534847346, 40.747637822754164]]]}, "bbox": [-73.94489668008083, 40.7476245812946, -73.9440169470451, 40.74830031052474]}], "bbox": [-73.94489668008083, 40.7476245812946, -73.9440169470451

In [19]:
gdf[gdf['BBL'] == 4004377502]['geometry']

428653    POLYGON ((-73.94421 40.74764, -73.94421 40.747...
Name: geometry, dtype: geometry

In [8]:
from shapely.geometry.multipolygon import MultiPolygon
from shapely import to_geojson
m = MultiPolygon([gdf[gdf['BBL'] == 4004377502]['geometry'].item()])
to_geojson(m)

'{"type":"MultiPolygon","coordinates":[[[[-73.94420534847346,40.747637822754164],[-73.94421008999788,40.7476245812946],[-73.9442740531848,40.74763698074852],[-73.94434051551112,40.747649865020755],[-73.94440116869998,40.74766162254412],[-73.94447048533753,40.747675060953874],[-73.94456094254194,40.74769259601946],[-73.94489668008083,40.74775768124105],[-73.94486061047967,40.74785842412541],[-73.94479696974952,40.748036172165904],[-73.94470239876253,40.74830031052474],[-73.94436666452401,40.74823520859924],[-73.9440169470451,40.74816739462963],[-73.94410764390622,40.74790191854325],[-73.94419747648031,40.74765101935653],[-73.9442155250827,40.74763979522918],[-73.94420534847346,40.747637822754164]]]]}'

In [15]:
def convert_geos(geo):
    try:
        return to_geojson(MultiPolygon([geo]))
    except ValueError:
        return to_geojson(geo)
[convert_geos(x) for x in gdf['geometry']]

['{"type":"MultiPolygon","coordinates":[[[[-74.01208461672105,40.692051314720516],[-74.01214566414592,40.69203822219325],[-74.01220121297158,40.69204419488755],[-74.01232466003289,40.69209116083113],[-74.01235945131845,40.69210738615592],[-74.0123897516398,40.69212275742716],[-74.01240883072332,40.692133858707955],[-74.0124301561227,40.69215521212171],[-74.01244811749179,40.69219364494922],[-74.01245766028099,40.69221286264119],[-74.01254013665607,40.69219619980666],[-74.01255921308709,40.692192778527854],[-74.01262527212305,40.692179572351044],[-74.01282852345757,40.6921389393151],[-74.01322351230438,40.69205860411361],[-74.01324483280364,40.69205433146317],[-74.01328522976273,40.69204664016842],[-74.01330318329525,40.692042367051904],[-74.01331440493563,40.692037240009846],[-74.01332674728117,40.692027844518186],[-74.01333235455567,40.692019301630594],[-74.0133351589944,40.692009904460946],[-74.01333347343666,40.69199965595879],[-74.01332561499622,40.69198513515223],[-74.013315513688

In [13]:
gdf['geometry']

0         POLYGON ((-74.01208 40.69205, -74.01215 40.692...
1         MULTIPOLYGON (((-74.04263 40.69004, -74.04257 ...
2         POLYGON ((-73.99182 40.67841, -73.99198 40.678...
3         POLYGON ((-74.03823 40.69836, -74.03868 40.698...
4         POLYGON ((-74.01313 40.69977, -74.01313 40.699...
                                ...                        
856898    POLYGON ((-74.24997 40.50916, -74.25033 40.509...
856899    POLYGON ((-74.24983 40.50903, -74.25064 40.508...
856900    POLYGON ((-74.24971 40.50891, -74.25009 40.508...
856901    POLYGON ((-74.24960 40.50880, -74.24984 40.508...
856902    POLYGON ((-74.24977 40.50860, -74.24984 40.508...
Name: geometry, Length: 856903, dtype: geometry

In [58]:
address_match = pd.read_csv('data/address_match.csv')

AttributeError: module 'geopandas' has no attribute 'read_csv'

In [56]:
address_match.head()

Unnamed: 0.1,Unnamed: 0,id,city,state,zip,google_map_params,building_bike_storage,building_doorman,building_elevator,building_gym,...,firm07_fla,pfirm15_fl,version,dcpedited,latitude,longitude,notes,shape_leng,shape_area,geometry_y
0,0,243063,Brooklyn,NY,11219,,,,,,...,,,23v1.1,,40.638096,-73.996509,,0.0,1831.452923,POLYGON ((-73.99634628603779 40.63803417679382...
1,1,61336,Brooklyn,NY,11220,,,,,,...,,,23v1.1,,40.634768,-74.020807,,0.0,1911.676971,POLYGON ((-74.02066406291931 40.63467951208545...
2,2,172582,Brooklyn,NY,11219,,,,,,...,,,23v1.1,,40.62628,-73.995414,,0.0,2530.695092,POLYGON ((-73.99526068571095 40.62637314294976...
3,3,24140,New York,NY,10037,,,,,,...,,,23v1.1,,40.815583,-73.938999,,0.0,2710.23594,"POLYGON ((-73.9388667161788 40.81568616935353,..."
4,4,166571,Brooklyn,NY,11219,,,,,,...,,,23v1.1,,40.63618,-73.992981,,0.0,2165.427046,POLYGON ((-73.99312431675497 40.63609267150114...


In [59]:
type(address_match)

pandas.core.frame.DataFrame

In [60]:
address_match['geometry_x']

0     MULTIPOLYGON (((-73.9963462906372 40.638034514...
1     MULTIPOLYGON (((-74.0206425867218 40.634732934...
2     MULTIPOLYGON (((-73.9953841743279 40.626448025...
3     MULTIPOLYGON (((-73.9388668750637 40.815686663...
4     MULTIPOLYGON (((-73.9928374422694 40.636271764...
5     MULTIPOLYGON (((-73.9381641354613 40.692514656...
6     MULTIPOLYGON (((-73.9286953136072 40.683188550...
7     MULTIPOLYGON (((-74.0007578329469 40.740656429...
8     MULTIPOLYGON (((-73.9549934555418 40.621393211...
9     MULTIPOLYGON (((-73.9227425020167 40.688294165...
10    MULTIPOLYGON (((-73.932285556198 40.6961601477...
11    MULTIPOLYGON (((-73.9553888021747 40.733981691...
12    MULTIPOLYGON (((-73.9580715148739 40.684701934...
13    MULTIPOLYGON (((-73.9915502454587 40.682726104...
14    MULTIPOLYGON (((-73.9851425540873 40.671581194...
15    MULTIPOLYGON (((-73.9553229565855 40.714957858...
16    MULTIPOLYGON (((-73.9545564402107 40.667302180...
17    MULTIPOLYGON (((-73.9572890377662 40.66909