# Choosing Which Fields to Update
This notebook walks through the logic for choosing which fields to update in the PW `buildings` table. The rules were derived in the `pluto_preso_investigation.ipynb` notebook and are detailed in the Notion page.

In [1]:
import pandas as pd
import sqlalchemy as db
from sqlalchemy import create_engine, text
import numpy as np
from sqlalchemy.orm import sessionmaker

import pickle as pkl

In [2]:
# Load the PLUTO CSV. low_memory=False makes pandas infer each column's dtype
# from the whole file instead of chunk by chunk, so no column ends up holding a
# mix of types (the situation pandas reports with a DtypeWarning).
pluto = pd.read_csv("data/pluto_23v1_1.csv", low_memory=False)

  pluto = pd.read_csv("data/pluto_23v1_1.csv")


In [3]:
# Load the `sql` IPython extension so that the %sql line magic is available.
%load_ext sql

In [4]:
# Point the %sql magic at the `perchwell` Postgres database on localhost:2023
# (user `teleport`, empty password in the URL).
%sql postgresql://teleport:@localhost:2023/perchwell

In [5]:
# Load the buildings extract from the local pickle cache when it is usable;
# otherwise page it out of Postgres and write the cache for next time.
# NOTE(review): pickle.load can execute arbitrary code -- only keep this cache
# if data/buildings.pkl is always produced by this notebook.
try:
    with open('data/buildings.pkl', 'rb') as f:
        preso = pkl.load(f)
except (OSError, EOFError, pkl.UnpicklingError):
    # Missing or unreadable cache: rebuild it from the database. Anything else
    # (e.g. a keyboard interrupt) propagates instead of triggering a re-download.
    db_url = 'postgresql://teleport:@localhost:2023/perchwell'
    page_size = 100000

    # buildings joined to building_geographies, restricted to geography_id 1278
    # (the NYC region code per the original query comment).
    # ORDER BY gives LIMIT/OFFSET a stable row order; without it consecutive
    # pages are allowed to overlap or skip rows.
    query = text('''select b.*
        from buildings b
        inner join building_geographies bg
            on b.id = bg.building_id
        where bg.geography_id = 1278
        order by b.id
        limit :page_size offset :row_offset
        ''')

    engine = create_engine(db_url)
    rowlist = []
    while True:
        params = {'page_size': page_size, 'row_offset': len(rowlist)}
        try:
            with engine.connect() as conn:
                # Materialise the whole page before touching rowlist so a
                # failure mid-page cannot leave a partial page behind.
                page = conn.execute(query, params).fetchall()
        except db.exc.DBAPIError:
            # Connection-level failure: rebuild the engine and retry this page
            # once; a second failure is raised to the caller.
            engine.dispose()
            engine = create_engine(db_url)
            with engine.connect() as conn:
                page = conn.execute(query, params).fetchall()
        rowlist.extend(page)
        print(f'{len(rowlist)} rows appended')
        # A short (or empty) page means the result set is exhausted.
        if len(page) < page_size:
            break

    preso = pd.DataFrame(rowlist)
    with open('data/buildings.pkl', 'wb') as f:
        pkl.dump(preso, f)

In [6]:
# Pairs of (column compared as the current value, column compared as the
# replacement value) in the merged frame. The landmark pair uses the _x/_y
# suffixed names because that column name occurs on both sides of the merge.
mapped_fields = [
    ('zip', 'zipcode'),
    ('year_built', 'yearbuilt'),
    # ('school_district_code', 'schooldist'),  # currently excluded
    ('num_stories', 'numfloors'),
    ('num_units', 'unitsres'),
    ('lot_area', 'lotarea'),
    ('lot_front', 'lotfront'),
    ('lot_depth', 'lotdepth'),
    ('building_class', 'bldgclass'),
    ('building_front', 'bldgfront'),
    ('building_depth', 'bldgdepth'),
    ('building_area', 'bldgarea'),
    ('landmark_x', 'landmark_y'),
]

In [7]:
# Inner-join the buildings extract to PLUTO on source_id == bbl (source_id is
# cast to a numeric dtype first), then report the three frame shapes.
preso['source_id'] = pd.to_numeric(preso['source_id'])
matched_buildings = pd.merge(
    preso,
    pluto,
    how='inner',
    left_on='source_id',
    right_on='bbl',
)
shape_report = (
    f'matched buildings: {matched_buildings.shape}      \n'
    f'buildings in preso: {preso.shape}\n'
    f'buildings in pluto: {pluto.shape}'
)
print(shape_report)

matched buildings: (731141, 183)      
buildings in preso: (748819, 91)
buildings in pluto: (859068, 92)


In [8]:
# Distinct building ids per raw landmark_y value. The _y suffix marks the
# right-hand (pluto) frame's `landmark` column after the merge; rows where it
# is missing are not counted because groupby drops NaN keys by default.
matched_buildings.groupby('landmark_y')['id'].nunique()

landmark_y
INDIVIDUAL AND INTERIOR LANDMARK      93
INDIVIDUAL LANDMARK                 1221
INTERIOR LANDMARK                     28
Name: id, dtype: int64

In [9]:
# Distinct building ids per raw landmark_x value. The _x suffix marks the
# left-hand (preso) frame's `landmark` column after the merge.
matched_buildings.groupby('landmark_x')['id'].nunique()

landmark_x
                                       18133
                                           1
(FORMER) DOLLAR SAVINGS BANK               1
(FORMER) FIREHOUSE ENGINE COMPANY 2        1
(FORMER) HAVEMEYERS & ELDER FILTER         1
                                       ...  
YOUNG MEN'S CHRISTIAN ASSOCIATION (        1
f                                        950
false                                     84
t                                          1
true                                       3
Name: id, Length: 86, dtype: int64

In [10]:
# map mismatched fields
matched_buildings['zip'] = pd.to_numeric(matched_buildings['zip'])
matched_buildings['numfloors'] = np.ceil(matched_buildings['numfloors'])

# map landmark field
matched_buildings['landmark_y'] = [True if type(x) == str else False for x in matched_buildings['landmark_y']]
matched_buildings['landmark_x'] = [x if x else 'f' for x in matched_buildings['landmark_x']]
matched_buildings['landmark_x'] = [False if (x.lower() in ('f', 'false') or x.isspace()) else True for x in matched_buildings['landmark_x']]

In [11]:
# Build a long-format table of candidate updates: for every mapped field, one
# row per matched building whose current value differs from the replacement
# value. Comparison is a plain `!=`, so a pair of NaNs also counts as different.
column_names = ('id', 'field_name', 'old_value', 'new_value', 'in_search')
update_df = {name: [] for name in column_names}
for current_col, replacement_col in mapped_fields:
    differs = matched_buildings[current_col] != matched_buildings[replacement_col]
    wrong = matched_buildings[differs]
    update_df['id'].extend(wrong['id'])
    update_df['old_value'].extend(wrong[current_col])
    update_df['new_value'].extend(wrong[replacement_col])
    update_df['in_search'].extend(wrong['in_search'])
    update_df['field_name'].extend([current_col] * len(wrong))

update_df = pd.DataFrame(update_df)
update_df['id'].nunique()

434074

In [12]:
# Per field: number of distinct values in every other column (id, old_value,
# new_value, in_search) among the candidate updates.
update_df.groupby('field_name').nunique()

Unnamed: 0_level_0,id,old_value,new_value,in_search
field_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
building_area,31444,8738,12082,2
building_class,32344,203,188,2
building_depth,19171,1058,1723,2
building_front,13793,1028,1663,2
landmark_x,1275,2,2,2
lot_area,391161,15236,16529,2
lot_depth,4842,1522,2069,2
lot_front,8009,1751,2599,2
num_stories,54454,80,72,2
num_units,16814,378,399,2


In [13]:
# Discard candidate updates whose replacement value is 0 or missing, printing
# the frame shape and the distinct building count before and after.
# NOTE(review): Python treats False == 0, so boolean False replacement values
# are removed by the zero test as well -- confirm that is intended.
print(update_df.shape, update_df['id'].nunique())
replacement = update_df['new_value']
has_value = replacement.ne(0) & replacement.notna()
update_df = update_df.loc[has_value]
print(update_df.shape, update_df['id'].nunique())

(605116, 5) 434074
(554687, 5) 422172


In [14]:
# Same per-field distinct-value summary, computed on the current update_df.
update_df.groupby('field_name').nunique()

Unnamed: 0_level_0,id,old_value,new_value,in_search
field_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
building_area,27673,8254,12081,2
building_class,32311,203,188,2
building_depth,15550,875,1722,2
building_front,11264,890,1662,2
landmark_x,1263,1,1,2
lot_area,391015,15184,16528,2
lot_depth,4375,1411,2068,2
lot_front,6849,1580,2598,2
num_stories,22479,80,72,2
num_units,15010,365,398,2


In [15]:
# checking year_built and num_stories rules
year_built = update_df[update_df['field_name'] == 'year_built']
num_stories = update_df[update_df['field_name'] == 'num_stories']
update_df = update_df[update_df['field_name'] != 'year_built']
update_df = update_df[update_df['field_name'] != 'num_stories']

In [16]:
def _split_by_direction(changes):
    """Return (rows where new_value > old_value, rows where new_value < old_value)."""
    increased = changes[changes['new_value'] > changes['old_value']]
    decreased = changes[changes['new_value'] < changes['old_value']]
    return increased, decreased


# "ok" = the replacement value is larger than the current one, "bad" = smaller.
# NOTE(review): comparisons against NaN are False both ways, so rows whose
# old_value is missing end up in neither frame -- confirm that is intended.
year_built_ok, year_built_bad = _split_by_direction(year_built)
num_stories_ok, num_stories_bad = _split_by_direction(num_stories)

In [17]:
# Row counts for each field: all candidates, those kept ("ok"), those rejected ("bad").
rule_frames = {
    'year_built': (year_built, year_built_ok, year_built_bad),
    'num_stories': (num_stories, num_stories_ok, num_stories_bad),
}
for field, (candidates, kept, rejected) in rule_frames.items():
    print(f'{field}: {len(candidates)}, {field}_ok: {len(kept)}, {field}_bad: {len(rejected)}')

year_built: 23613, year_built_ok: 19146, year_built_bad: 4400
num_stories: 22493, num_stories_ok: 19659, num_stories_bad: 2758


In [18]:
# Inspect the year_built candidates whose new_value is lower than old_value.
year_built_bad

Unnamed: 0,id,field_name,old_value,new_value,in_search
4098,7069386,year_built,2022.0,2021.0,True
4099,5108862,year_built,2020.0,2019.0,True
4113,5108847,year_built,2022.0,2021.0,True
4120,853600,year_built,2019.0,1884.0,True
4122,661519,year_built,1885.0,1884.0,False
...,...,...,...,...,...
31232,4415,year_built,2017.0,2015.0,True
31233,4360,year_built,2001.0,2000.0,True
31234,4320,year_built,1931.0,1904.0,True
31242,230,year_built,2012.0,2009.0,True


In [19]:
# Inspect the num_stories candidates whose new_value is lower than old_value.
num_stories_bad

Unnamed: 0,id,field_name,old_value,new_value,in_search
31273,1826375,num_stories,17.0,16.0,True
31274,1826374,num_stories,17.0,16.0,True
31275,693348,num_stories,17.0,16.0,True
31299,4003407,num_stories,10.0,9.0,True
31301,230319,num_stories,68.0,57.0,True
...,...,...,...,...,...
85743,4415,num_stories,33.0,30.0,True
85745,4204,num_stories,46.0,41.0,True
85750,43052,num_stories,91.0,71.0,True
85751,24035,num_stories,42.0,41.0,True


In [20]:
# Put the accepted num_stories / year_built rows back with the other fields.
# update_df is rebuilt from itself, so running this cell twice appends the
# accepted rows a second time.
accepted_parts = [update_df, num_stories_ok, year_built_ok]
update_df = pd.concat(accepted_parts)

In [22]:
# Number of distinct buildings that still have at least one proposed update.
update_df['id'].nunique()

420342