In [157]:
import pickle as pkl
import pandas as pd
import sqlalchemy as db
import numpy as np
import geopandas as gpd
import json
import re

from sqlalchemy import create_engine, text
from shapely.geometry import shape
from shapely.geometry.multipolygon import MultiPolygon
from shapely import to_geojson
from sqlalchemy.orm import sessionmaker
from datetime import datetime

from building_refresh_v2 import pull_data

In [158]:
# Load the comparison table that lists which (building_id, field_name) pairs
# are considered "changed". Prefer the cached CSV export; when that file is
# missing, fall back to running the saved SQL through `pull_data`.
try:
    change_ids = pd.read_csv('data/pluto_v17_pw_comparison_2023_07_25 (1).csv')
except FileNotFoundError:
    # No cached export on disk: read the query text and execute it instead.
    # NOTE(review): presumably `pull_data` returns a DataFrame with the same
    # columns as the CSV -- confirm against building_refresh_v2.
    with open('pluto_v17_comparison.sql', 'r') as f:
        sql_query = f.read()
    change_ids = pull_data(sql_query)

In [159]:
# Sanity check: (rows, columns) of the comparison table.
change_ids.shape

(3688, 2)

In [160]:
# Load the proposed field-level updates (columns: id, field_name, new_value)
# and count how many distinct ids are touched for each field.
updates_df = pd.read_csv('updates_MN.csv')
updates_df.groupby('field_name')['id'].nunique()

field_name
building_area     1500
building_class    2888
building_depth    1036
building_front     848
geometry          5331
landmark           384
lot_area           445
lot_depth          274
lot_front          445
num_stories        592
num_units          229
year_built        1108
zip                 69
Name: id, dtype: int64

In [161]:
# Sanity check: (rows, columns) of the raw updates table.
updates_df.shape

(15149, 3)

In [162]:
# Distinct values per column: number of ids, fields and new values involved.
updates_df.nunique()

id            5331
field_name      13
new_value     8111
dtype: int64

In [163]:
# List the distinct field names present in the updates; `get_key` groups
# some of these (lot_* and building_area/front/depth) under a shared key.
set(updates_df['field_name'])

{'building_area',
 'building_class',
 'building_depth',
 'building_front',
 'geometry',
 'landmark',
 'lot_area',
 'lot_depth',
 'lot_front',
 'num_stories',
 'num_units',
 'year_built',
 'zip'}

In [164]:
def get_key(b_id, f_name):
    """Build the string key used to match update rows to comparison rows.

    The key is ``str(b_id)`` followed by a field group name:

    * ``lot_area``, ``lot_front`` and ``lot_depth`` all map to ``'lot'``;
    * ``building_area``, ``building_front`` and ``building_depth`` all map
      to ``'building_area'``;
    * every other field name is used unchanged.

    Args:
        b_id: Building identifier; converted with ``str``.
        f_name: Field name of the update or comparison row.

    Returns:
        The concatenated key, e.g. ``'538zip'`` or ``'12lot'``.
    """
    lot_fields = ('lot_area', 'lot_front', 'lot_depth')
    building_dimension_fields = ('building_area', 'building_front', 'building_depth')

    if f_name in lot_fields:
        group = 'lot'
    elif f_name in building_dimension_fields:
        group = 'building_area'
    else:
        # Fields outside the two groups are keyed by their own name.
        group = f_name
    return str(b_id) + group

In [165]:
# Tag every update row with its comparison key (id + grouped field name),
# pairing the `id` and `field_name` columns element by element.
updates_df['key'] = list(map(get_key, updates_df['id'], updates_df['field_name']))
updates_df

Unnamed: 0,id,field_name,new_value,key
0,538,zip,10038.0,538zip
1,12094,zip,10001.0,12094zip
2,5467,zip,10005.0,5467zip
3,8181,zip,10038.0,8181zip
4,12304,zip,10005.0,12304zip
...,...,...,...,...
15144,34903,landmark,True,34903landmark
15145,35617,landmark,True,35617landmark
15146,37099,landmark,True,37099landmark
15147,39445,landmark,True,39445landmark


In [166]:
# Build the same key on the comparison table so the two frames can be matched
# with `isin` on `key`.
change_ids['key'] = list(map(get_key, change_ids['building_id'], change_ids['field_name']))

In [167]:
# Re-run the per-field distinct-id tally after adding the `key` column;
# adding a column does not change these counts.
updates_df.groupby('field_name')['id'].nunique()

field_name
building_area     1500
building_class    2888
building_depth    1036
building_front     848
geometry          5331
landmark           384
lot_area           445
lot_depth          274
lot_front          445
num_stories        592
num_units          229
year_built        1108
zip                 69
Name: id, dtype: int64

In [168]:
# Export the update rows whose key also appears in the comparison table.
changed_mask = updates_df['key'].isin(change_ids['key'])
updates_df.loc[changed_mask, ['id', 'field_name', 'new_value']].to_csv('data/updates_with_changes.csv')

# NOTE(review): the id file below receives every id in `updates_df`, not only
# the ids of the rows exported above -- confirm this is the intended content
# of 'MN_updates_change_id.txt'.
all_update_ids = set(updates_df['id'])
with open('data/MN_updates_change_id.txt', 'w') as f:
    f.write(str(all_update_ids))

print(len(all_update_ids))

5331


In [169]:
# Keep only the updates whose key is NOT in the comparison table, then
# re-attach the geometry rows for the ids that still have at least one update.
#
# Geometry rows are set aside first so that the key filter is applied to the
# non-geometry fields only.
geos = updates_df[updates_df['field_name'] == 'geometry']
updates_df = updates_df[updates_df['field_name'] != 'geometry']

# The column selection drops `key` from the non-geometry rows here.
updates_df = updates_df[~updates_df['key'].isin(change_ids['key'])][['id', 'field_name', 'new_value']]

# Geometry is kept only for ids that survived the filter above.
updated_geos = geos[geos['id'].isin(updates_df['id'])]

# `DataFrame.append` is deprecated and removed in pandas 2.0; `pd.concat`
# produces the same frame. `updated_geos` still carries `key`, so the result
# has a `key` column that is NaN for every non-geometry row.
# Note: this rebinds `updates_df`, so earlier displays of the full frame are
# stale once this cell has run.
updates_df = pd.concat([updates_df, updated_geos])

  updates_df = updates_df.append(updated_geos)


In [170]:
# Persist the filtered (no-change) update set and the ids it covers.
# These files are written to the working directory, not under 'data/'.
export_columns = ['id', 'field_name', 'new_value']
updates_df.loc[:, export_columns].to_csv('MN_updates_no_change_v17.csv')

no_change_ids = set(updates_df['id'])
with open('MN_updates_no_change_id.txt', 'w') as f:
    f.write(str(no_change_ids))

In [171]:
# Per-field distinct-id tally of the filtered updates.
# At this point only the re-attached geometry rows still have a non-null
# `key`; the other rows lost it in the filtering cell, so their NaN key never
# matches and the mask keeps them all.
# NOTE(review): the mask can therefore only remove geometry rows here --
# confirm whether the extra filter is still needed.
updates_df[~updates_df['key'].isin(change_ids['key'])][['id', 'field_name', 'new_value']].groupby('field_name')['id'].nunique()

field_name
building_area     1448
building_class    2231
building_depth     988
building_front     801
geometry          4047
landmark           384
lot_area           411
lot_depth          244
lot_front          413
num_stories        505
num_units          203
year_built         222
zip                 33
Name: id, dtype: int64

In [172]:
# Final size of the filtered updates; the 4th column is the `key` column
# brought back in by the re-attached geometry rows.
updates_df.shape

(11930, 4)