# Address Matching between PLUTO and Perchwell's `buildings` table
This notebook will perform a second order matching based on address on the buildings for which matches weren't found based on their BBL.

In [1]:
import pandas as pd
import sqlalchemy as db
from sqlalchemy import create_engine, text
import numpy as np
from sqlalchemy.orm import sessionmaker

import pickle as pkl
import re

In [2]:
pluto = pd.read_csv("data/pluto_23v1_1.csv")

  pluto = pd.read_csv("data/pluto_23v1_1.csv")


In [3]:
%load_ext sql

In [4]:
%sql postgresql://teleport:@localhost:2023/perchwell

In [5]:
def pull_data(sql_query:str, file_out="pw.csv", port_num=2023):
    """
    For a given sql_query, this function will connect to Perchwell's database, pull
    the corresponding data, save it to a csv in the file specified (default is 
    pw.csv), and returns that data in the form of a Pandas DataFrame.

    Args:
        sql_query (str): query whose results you wish to download
        file_out (str, optional): filepath for the output file. Defaults to "pw.csv".
        port_num (int, optional): Port number for Teleport Connection. Defaults to 2023.

    Returns:
        pd.DataFrame: DataFrame containing results of sql_query 
    """    
    pw_df = []
    prev_row_count = -1
    current_row = 0
    # Create the SQLAlchemy engine
    engine = create_engine(f"postgresql://teleport:@localhost:{port_num}/perchwell")

    # in order to get around the 300k row limit
    while prev_row_count <= current_row:
        current_row = len(pw_df)
        if prev_row_count == current_row:
            print(f"final row count: {current_row}")
            break
        # Define the SQL query string, but keep last line
        query_string = f"""{sql_query}
            -- NOTE: DO NOT DELETE BELOW THIS
            LIMIT 100000 OFFSET {current_row};
            """
        try:
            # Execute the query
            with engine.connect() as conn:
                result = conn.execute(text(query_string))

                # Process the query result
                for row in result:
                    # Access the row data
                    pw_df.append(row)
        except:
            # in case of timeout
            engine = create_engine(f"postgresql://teleport:@localhost:{port_num}/perchwell")
            with engine.connect() as conn:
                result = conn.execute(text(query_string))
                for row in result:
                    pw_df.append(row)
        print(f"{len(pw_df)} rows appended")
        prev_row_count = current_row
    pw_df = pd.DataFrame(pw_df)
    #pw_df.to_csv(file_out)
    return pw_df

In [6]:
try:
    with open('data/buildings_with_boro_listing_rls.pkl', 'rb') as f:
        preso = pkl.load(f)
except FileNotFoundError:
    query_string = f'''select b.*, rls.*, mlsli.*, l.*, a.*, bg.name as BORO
            from buildings b
            ---join with geographies
            inner join (
                select b_geo.building_id, geo.name
                from building_geographies b_geo
                inner join geographies geo
                    on b_geo.geography_id = geo.id
                where geo.name in ('Manhattan', 'Brooklyn', 'Bronx', 'Queens')
                ) bg on b.id = bg.building_id
            left join (
                select
                    building_id
                    , count(*) as num_other_listings
                    , max(created_at) as last_listing
                from listings
                where not(
                        lower(source) like '%mlsli%' 
                        or lower(source) like '%rls%' 
                        or source = 'real_plus')
                group by building_id
            ) l on l.building_id = b.id
            left join (
                select
                   building_id
                    , count(distinct(id)) as num_rls
                    , max(created_at) as last_rls_listing
                from listings
                where --not(lower(source) like '%mlsli%')
                  lower(source) like '%rls%' or source = 'real_plus'
                group by building_id
            ) rls on rls.building_id = b.id
            left join (
                select
                    building_id
                    , count(distinct(id)) as num_mlsli
                    , max(created_at) as last_mlsli_listing
                from listings
                where lower(source) like '%mlsli%'
                group by building_id
            ) mlsli on mlsli.building_id = b.id
            left join (
                select 
                    building_id
                    , count(distinct(acris_document_id)) as num_acris
                    , max(created_at) as last_acris_created
                    , max(sale_date) as last_acris_sale
                from historicals
                group by building_id
                order by count(distinct(acris_document_id)) desc
            ) a on b.id = a.building_id
            '''
    preso = pull_data(query_string)
    with open('data/buildings_with_boro_listing_rls.pkl', 'wb') as f:
        pkl.dump(preso, f)

In [7]:
preso['source_id'] = pd.to_numeric(preso['source_id'])
matched_buildings = preso.merge(pluto, how='inner', left_on='source_id', right_on='bbl')
print(f'matched buildings: {matched_buildings.shape}\
      \nbuildings in preso: {preso.shape}\nbuildings in pluto: {pluto.shape}')

matched buildings: (731178, 197)      
buildings in preso: (748826, 105)
buildings in pluto: (859068, 92)


In [8]:
in_preso = preso[~preso['id'].isin(matched_buildings['id'])]
in_pluto = pluto[~pluto['bbl'].isin(matched_buildings['bbl'])]

In [9]:
print(in_preso.shape, in_pluto.shape)

(17648, 105) (132033, 92)


In [10]:
in_preso['display_address'].str.upper().str.strip()

44                 303 WEST 46TH STREET
88                    353 SECOND AVENUE
114                     40 WORTH STREET
151                       1205 BROADWAY
206                621 GREENWICH STREET
                      ...              
748816                  213 40TH AVENUE
748817               259-18 80TH STREET
748818    224-17 HORACE HARDING PARKWAY
748819          147-38 SPRINGFIELD LANE
748825             2113 NOSTRAND AVENUE
Name: display_address, Length: 17648, dtype: object

In [11]:
in_pluto.dropna(subset=['address','zipcode'], axis=0, inplace=True)
#in_pluto[in_pluto['address'].str.contains('3RD AVE')]['address']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  in_pluto.dropna(subset=['address','zipcode'], axis=0, inplace=True)


Steps for Address Matching:
1. strip trailing and leading whitespace from PW and PLUTO and convert to upper case
2. drop na from PLUTO addresses
3. convert zip codes to int
4. Convert written out numbers to numbers (i.e. first ave -> 1st ave)
5. Remove suffix of numbers
6. Match buildings based on the street address and zip code being the same

In [12]:
in_preso['display_address'] = in_preso['display_address'].str.upper().str.strip().copy()
in_pluto['address'] = in_pluto['address'].str.upper().str.strip().copy()
in_pluto = in_pluto.dropna(axis=0, subset=['address', 'zipcode'])
in_pluto['zipcode'] = in_pluto['zipcode'].astype(int, errors='ignore').copy()
in_preso['zip'] = in_preso['zip'].astype(int, errors='ignore').copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  in_preso['display_address'] = in_preso['display_address'].str.upper().str.strip().copy()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  in_pluto['address'] = in_pluto['address'].str.upper().str.strip().copy()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  in_preso['zip'] = in_preso['zip'].astype(in

In [13]:
my_dict = {
  "FIRST": "1ST",
  "SECOND": "2ND",
  "THIRD": "3RD",
  "FOURTH": "4TH",
  "FIFTH": "5TH",
  "SIXTH": "6TH",
  "SEVENTH": "7TH",
  "EIGHTH": "8TH",
  "NINTH": "9TH",
  "TENTH": "10TH",
  "ELEVENTH": "11TH",
  "TWELFTH": "12TH",
  "THIRTEENTH": "13TH",
  "FOURTEENTH": "14TH",
  "FIFTEENTH": "15TH",
  "SIXTEENTH": "16TH",
  "SEVENTEENTH": "17TH",
  "EIGHTEENTH": "18TH",
  "NINETEENTH": "19TH",
  "TWENTIETH": "20TH",
  "TWENTY-FIRST": "21ST",
  "TWENTY-SECOND": "22ND",
  "TWENTY-THIRD": "23RD",
  "TWENTY-FOURTH": "24TH",
  "TWENTY-FIFTH": "25TH",
  "TWENTY-SIXTH": "26TH",
  "TWENTY-SEVENTH": "27TH",
  "TWENTY-EIGHTH": "28TH",
  "TWENTY-NINTH": "29TH",
  "THIRTIETH": "30TH",
  "THIRTY-FIRST": "31ST",
  "THIRTY-SECOND": "32ND",
  "THIRTY-THIRD": "33RD",
  "THIRTY-FOURTH": "34TH",
  "THIRTY-FIFTH": "35TH",
  "THIRTY-SIXTH": "36TH",
  "THIRTY-SEVENTH": "37TH",
  "THIRTY-EIGHTH": "38TH",
  "THIRTY-NINTH": "39TH",
  "FORTIETH": "40TH",
  "FORTY-FIRST": "41ST",
  "FORTY-SECOND": "42ND",
  "FORTY-THIRD": "43RD",
  "FORTY-FOURTH": "44TH",
  "FORTY-FIFTH": "45TH",
  "FORTY-SIXTH": "46TH",
  "FORTY-SEVENTH": "47TH",
  "FORTY-EIGHTH": "48TH",
  "FORTY-NINTH": "49TH",
  "FIFTIETH": "50TH",
  "FIFTY-FIRST": "51ST",
  "FIFTY-SECOND": "52ND",
  "FIFTY-THIRD": "53RD",
  "FIFTY-FOURTH": "54TH",
  "FIFTY-FIFTH": "55TH",
  "FIFTY-SIXTH": "56TH",
  "FIFTY-SEVENTH": "57TH",
  "FIFTY-EIGHTH": "58TH",
  "FIFTY-NINTH": "59TH",
  "SIXTIETH": "60TH",
  "SIXTY-FIRST": "61ST",
  "SIXTY-SECOND": "62ND",
  "SIXTY-THIRD": "63RD",
"SIXTY-FOURTH": "64TH",
"SIXTY-FIFTH": "65TH",
"SIXTY-SIXTH": "66TH",
"SIXTY-SEVENTH": "67TH",
"SIXTY-EIGHTH": "68TH",
"SIXTY-NINTH": "69TH",
"SEVENTIETH": "70TH",
"SEVENTY-FIRST": "71ST",
"SEVENTY-SECOND": "72ND",
"SEVENTY-THIRD": "73RD",
"SEVENTY-FOURTH": "74TH",
"SEVENTY-FIFTH": "75TH",
"SEVENTY-SIXTH": "76TH",
"SEVENTY-SEVENTH": "77TH",
"SEVENTY-EIGHTH": "78TH",
"SEVENTY-NINTH": "79TH",
"EIGHTIETH": "80TH",
"EIGHTY-FIRST": "81ST",
"EIGHTY-SECOND": "82ND",
"EIGHTY-THIRD": "83RD",
"EIGHTY-FOURTH": "84TH",
"EIGHTY-FIFTH": "85TH",
"EIGHTY-SIXTH": "86TH",
"EIGHTY-SEVENTH": "87TH",
"EIGHTY-EIGHTH": "88TH",
"EIGHTY-NINTH": "89TH",
"NINETIETH": "90TH",
"NINETY-FIRST": "91ST",
"NINETY-SECOND": "92ND",
"NINETY-THIRD": "93RD",
"NINETY-FOURTH": "94TH",
"NINETY-FIFTH": "95TH",
"NINETY-SIXTH": "96TH",
"NINETY-SEVENTH": "97TH",
"NINETY-EIGHTH": "98TH",
"NINETY-NINTH": "99TH",
"ONE HUNDREDTH": "100TH"
}

In [14]:
def remove_suffix(df, field_name):
    return df[field_name].replace("(?<=\d)(st|nd|rd|th)\b", '', regex=True).copy()

def convert_nums(df, field_name):
    new_dict = {key.replace("-", " "): value for key, value in my_dict.items()}
    new_keys = list(new_dict.keys())
    new_values = list(new_dict.values())
    for i in range(len(new_dict)):
        df[field_name] =  df[field_name].str.replace(new_keys[i], new_values[i]).copy()
    return df

In [15]:
# convert numbers
new_dict = {key.replace('-', ' '): value for key, value in my_dict.items()}
new_keys = list(new_dict.keys())
new_values = list(new_dict.values())
for i in range(len(new_dict)):
    in_pluto['address'] = in_pluto.loc[:,'address'].str.replace(new_keys[i], new_values[i]).copy()
    in_preso['display_address'] = in_preso.loc[:,'display_address'].str.replace(new_keys[i], new_values[i]).copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  in_preso['display_address'] = in_preso.loc[:,'display_address'].str.replace(new_keys[i], new_values[i]).copy()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  in_preso['display_address'] = in_preso.loc[:,'display_address'].str.replace(new_keys[i], new_values[i]).copy()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view

In [16]:
in_pluto['address'] = [re.sub(r"(?<=\d)(ST|ND|RD|TH)\b", '', x) for x in in_pluto['address']]
in_preso['display_address'] = [re.sub(r"(?<=\d)(ST|ND|RD|TH)\b", '', x) for x in in_preso['display_address']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  in_preso['display_address'] = [re.sub(r"(?<=\d)(ST|ND|RD|TH)\b", '', x) for x in in_preso['display_address']]


In [17]:
in_pluto['address']

14           140 AMBER STREET
16          45 ARDSLEY STREET
44          231 OSGOOD AVENUE
80                  SAND LANE
84        38 WOLVERINE STREET
                 ...         
859035          10 DAWN COURT
859036          14 DAWN COURT
859038      218 CHEEVERS LANE
859060      12 EVERETT AVENUE
859063         47 CAROL COURT
Name: address, Length: 130825, dtype: object

In [18]:
in_preso['display_address']

44                   303 WEST 46 STREET
88                         353 2 AVENUE
114                     40 WORTH STREET
151                       1205 BROADWAY
206                621 GREENWICH STREET
                      ...              
748816                    213 40 AVENUE
748817                 259-18 80 STREET
748818    224-17 HORACE HARDING PARKWAY
748819          147-38 SPRINGFIELD LANE
748825             2113 NOSTRAND AVENUE
Name: display_address, Length: 17648, dtype: object

In [19]:
in_preso.groupby('in_search')['id'].nunique()

in_search
False    11056
True      6592
Name: id, dtype: int64

In [20]:
in_preso['source_id'].nunique()

12359

In [21]:
in_preso = in_preso[in_preso['in_search'] == True]

In [22]:
len(in_preso)

6592

In [23]:
second_matches = in_preso.merge(right=in_pluto, how='inner', left_on='display_address', right_on='address')
second_matches[second_matches['zip'] == second_matches['zipcode']]
len(second_matches)

614

In [24]:
second_matches.groupby('in_search')['id'].nunique()

in_search
True    558
Name: id, dtype: int64

In [25]:
second_matches['display_address']

0      621 GREENWICH STREET
1        222 EAST 81 STREET
2       577 WEST 161 STREET
3       776 FRANKLIN AVENUE
4             829 60 STREET
               ...         
609     34 HAWTHORNE AVENUE
610      2 HAWTHORNE AVENUE
611    149 HAWTHORNE AVENUE
612     406 ATLANTIC AVENUE
613           7 BOND STREET
Name: display_address, Length: 614, dtype: object

In [26]:
not_matched = in_preso[~in_preso['id'].isin(second_matches['id'])]

In [27]:
len(not_matched)

6034

In [28]:
not_matched[['display_address', 'zip', 'source', 'id', 'in_search','num_rls', 'num_mlsli', 'num_other_listings', 'num_acris', 'created_at', 'last_listing', 'last_rls_listing', 'last_mlsli_listing', 'last_acris_sale', 'boro', 'source_id']].to_csv('data/not_matched_with_listing.csv')

In [29]:
not_matched.shape

(6034, 105)

In [30]:
not_matched[~not_matched['source_id'].isna()][['display_address', 'zip', 'source', 'id', 'in_search','num_rls', 'num_mlsli', 'num_other_listings', 'num_acris', 'created_at', 'last_listing', 'last_rls_listing', 'last_mlsli_listing', 'boro', 'source_id']].head()

Unnamed: 0,display_address,zip,source,id,in_search,num_rls,num_mlsli,num_other_listings,num_acris,created_at,last_listing,last_rls_listing,last_mlsli_listing,boro,source_id
88,353 2 AVENUE,10010,pluto_v17_1,1341,True,2.0,,1.0,1.0,2014-01-18 04:09:20.476609,2019-05-15 03:58:52.735464,2017-05-02 12:51:54.489809,NaT,Manhattan,1009010000.0
114,40 WORTH STREET,10013,pluto_v17_1,1702,True,,,,,2014-01-18 04:09:27.320681,NaT,NaT,NaT,Manhattan,1001480000.0
281,169 RIVINGTON STREET,10002,pluto_v17_1,4382,True,,,,,2014-01-18 04:10:21.622216,NaT,NaT,NaT,Manhattan,1003480000.0
286,264 5 AVENUE,10001,pluto_v17_1,4489,True,,,,1.0,2014-01-18 04:10:23.786695,NaT,NaT,NaT,Manhattan,1008300000.0
364,67 GREENWICH STREET,10006,pluto_v17_1,5713,True,,,,1.0,2014-01-18 04:10:48.839057,NaT,NaT,NaT,Manhattan,1000190000.0


In [31]:
x = not_matched[not_matched['source'].str.contains('manual')]

In [32]:
pluto.dropna(subset='address', inplace=True)
pluto[pluto['address'].str.contains('BROADWAY AVENUE')][['address', 'borough']]

Unnamed: 0,address,borough
39857,4780 BROADWAY AVENUE,MN


In [33]:
second_matches.groupby('source')['id'].nunique()

source
manual_future_building     25
manually entered            1
manually_entered           29
pitney_bowes              110
pluto_v17_1               393
Name: id, dtype: int64

In [34]:
not_matched.groupby('source')['id'].nunique()

source
                             1
manual_future_building      23
manually entered            26
manually_entered          2265
pitney_bowes               870
pluto_merge                  1
pluto_split                  5
pluto_v09_1                  1
pluto_v13_1                  1
pluto_v13_2                  2
pluto_v14_1                  1
pluto_v14_2                  1
pluto_v15_1                  1
pluto_v16_2                  3
pluto_v17_1               2833
Name: id, dtype: int64

In [35]:
# checking to see if manually entered buildings are duplicated
dup_check = not_matched[~not_matched['source'].str.contains('pluto')]
pluto['address'] = pluto['address'].str.upper().str.strip().copy()
pluto['zipcode'] = pluto['zipcode'].astype(int, errors='ignore').copy()
for i in range(len(new_dict)):
    pluto['address'] = pluto.loc[:,'address'].str.replace(new_keys[i], new_values[i]).copy()

In [36]:
duplicates = dup_check.merge(pluto, how='inner', left_on='display_address', right_on='address')
duplicates.dropna(subset=['zip', 'zipcode'], inplace=True)
duplicates = duplicates[duplicates['zip'].astype(int) == duplicates['zipcode'].astype(int)]

In [37]:
duplicates[['display_address', 'zip', 'source', 'id', 'in_search','num_rls', 'num_mlsli', 'num_other_listings', 'num_acris', 'created_at', 'last_listing', 'last_rls_listing', 'last_mlsli_listing', 'boro', 'source_id']]

Unnamed: 0,display_address,zip,source,id,in_search,num_rls,num_mlsli,num_other_listings,num_acris,created_at,last_listing,last_rls_listing,last_mlsli_listing,boro,source_id
5,276 85 AVENUE,11001,pitney_bowes,1719372,True,,,,,2019-10-01 20:56:54.763149,NaT,NaT,NaT,Queens,
6,32-33 100 STREET,11369,manually_entered,1806401,True,,1.0,,,2020-11-12 05:43:55.820271,NaT,NaT,2019-08-29 00:58:38.116334,Queens,
53,145 AVENUE,11434,manually_entered,1817572,True,,1.0,,,2020-12-02 19:13:24.679026,NaT,NaT,2019-08-28 21:46:16.653146,Queens,
54,145 AVENUE,11434,manually_entered,1817572,True,,1.0,,,2020-12-02 19:13:24.679026,NaT,NaT,2019-08-28 21:46:16.653146,Queens,
57,145 AVENUE,11434,manually_entered,1817572,True,,1.0,,,2020-12-02 19:13:24.679026,NaT,NaT,2019-08-28 21:46:16.653146,Queens,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1061,32-33 105 STREET,11369,manually_entered,1807258,True,,1.0,,,2020-11-12 05:46:09.929628,NaT,NaT,2019-10-03 06:38:11.689643,Queens,
1063,150 OCEANA DRIVE WEST,11235,manually_entered,1810787,True,,2.0,,,2020-12-02 16:53:33.417540,NaT,NaT,2020-11-19 15:35:50.301555,Brooklyn,
1106,115 AVENUE,11434,manually_entered,1818108,True,,1.0,,,2020-12-02 19:15:13.309215,NaT,NaT,2019-08-29 06:23:28.509321,Queens,
1107,115 AVENUE,11434,manually_entered,1818108,True,,1.0,,,2020-12-02 19:15:13.309215,NaT,NaT,2019-08-29 06:23:28.509321,Queens,


In [38]:
duplicates.shape

(161, 197)

In [39]:
not_matched = not_matched[~not_matched['id'].isin(duplicates['id'])]
not_matched['num_listings'] = not_matched['num_listings'].fillna(0).copy()

KeyError: 'num_listings'

In [None]:
not_matched.groupby('num_listings')['id'].nunique()

num_listings
0.0      1627
1.0      2438
2.0       711
3.0       261
4.0        92
         ... 
506.0       1
521.0       1
538.0       1
722.0       1
732.0       1
Name: id, Length: 80, dtype: int64

In [None]:
not_matched['id'].nunique()

5394

In [None]:
no_listings = not_matched[not_matched['num_listings'] == 0]

In [None]:
no_listings.groupby('source')['id'].nunique()

source
manually entered       2
manually_entered       1
pitney_bowes         199
pluto_split            3
pluto_v14_1            1
pluto_v17_1         1421
Name: id, dtype: int64

In [None]:
in_preso.shape

(5964, 101)

In [None]:
in_preso['source_id'].nunique()

3030

In [None]:
no_listings.groupby('num_acris')['id'].nunique()

num_acris
0.0      17
1.0     262
2.0      94
3.0      34
4.0       8
5.0       2
6.0       2
9.0       1
14.0      1
Name: id, dtype: int64

In [None]:
pluto[pluto['address'].str.contains('220 JOHNSON AVE')][['borough','address']]

Unnamed: 0,borough,address
616753,BX,3220 JOHNSON AVENUE


In [None]:
preso[preso['display_address'].str.contains('220 Johnson Ave')]

Unnamed: 0,id,city,state,zip,google_map_params,building_bike_storage,building_doorman,building_elevator,building_gym,building_laundry,...,num_rls,num_mlsli,building_id,num_listings,last_listing,building_id.1,num_acris,last_created,last_acris_sale,boro
430181,771235,Bronx,NY,10463,,,,,,,...,,,,,NaT,,,NaT,,Bronx


In [None]:
in_pluto.groupby('borough')['bbl'].nunique()

borough
BK    1875
BX     457
MN     291
QN    3234
SI    1472
Name: bbl, dtype: int64

In [40]:
len(not_matched)

5950

In [41]:
len(x)

2314

In [42]:
x.groupby('source')['id'].nunique()

source
manual_future_building      23
manually entered            26
manually_entered          2265
Name: id, dtype: int64

In [43]:
nyc_zip = pd.read_csv('data/nyc-zip-codes.csv')
nyc_zip.head()

Unnamed: 0,Borough,Neighborhood,ZipCode
0,Bronx,Central Bronx,10453
1,Bronx,Central Bronx,10457
2,Bronx,Central Bronx,10460
3,Bronx,Bronx Park and Fordham,10458
4,Bronx,Bronx Park and Fordham,10467


In [46]:
nyc_zip.groupby('Borough')['ZipCode'].nunique()

Borough
Bronx            25
Brooklyn         37
Manhattan        43
Queens           61
Staten Island    12
Name: ZipCode, dtype: int64

In [47]:
x['num_mlsli'] = x['num_mlsli'].fillna(0).copy()
x['num_rls'] = x['num_rls'].fillna(0).copy()
x['num_acris'] = x['num_acris'].fillna(0).copy()
x['num_other_listings'] = x['num_other_listings'].fillna(0).copy()

In [49]:
for_mlsli = x[(x['num_mlsli'] > 0) & (x['num_rls'] == 0)]

In [50]:
for_mlsli.shape

(2266, 105)

In [56]:
for_mlsli[~for_mlsli['zip'].astype(int).isin(nyc_zip['ZipCode'])]

Unnamed: 0,id,city,state,zip,google_map_params,building_bike_storage,building_doorman,building_elevator,building_gym,building_laundry,...,num_mlsli,last_mlsli_listing,building_id,num_other_listings,last_listing,building_id.1,num_acris,last_acris_created,last_acris_sale,boro
46508,1806923,Bellerose Manor,NY,11001,,,,,,,...,3.0,2020-09-24 11:36:50.622353,,0.0,NaT,,0.0,NaT,,Queens
46513,1807935,Bushwick,NY,11003,,,,,,,...,1.0,2019-12-24 09:14:56.037376,,0.0,NaT,,0.0,NaT,,Brooklyn
46514,1808131,East Village,NY,11050,,,,,,,...,1.0,2020-01-23 22:43:16.309763,,0.0,NaT,,0.0,NaT,,Manhattan
46528,1810701,Wakefield,NY,11763,,,,,,,...,1.0,2019-08-29 00:56:50.759104,,0.0,NaT,,0.0,NaT,,Bronx
46530,1811182,Flatiron,NY,11050,,,,,,,...,1.0,2020-07-30 21:15:47.519813,,0.0,NaT,,0.0,NaT,,Manhattan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
748759,1815616,New York,NY,11729,,,,,,,...,1.0,2020-10-12 02:57:50.184727,,0.0,NaT,,0.0,NaT,,Manhattan
748763,1815933,New York,NY,11754,,,,,,,...,1.0,2019-08-28 21:46:58.964171,,0.0,NaT,,0.0,NaT,,Queens
748765,1816054,New York,NY,11040,,,,,,,...,1.0,2020-04-29 08:36:08.555518,,0.0,NaT,,0.0,NaT,,Queens
748770,1816398,New York,NY,12775,,,,,,,...,1.0,2020-09-25 20:15:50.705117,,0.0,NaT,,0.0,NaT,,Queens


In [59]:
remove = for_mlsli[~for_mlsli['zip'].astype(int).isin(nyc_zip['ZipCode'])]

In [57]:
nyc_zip[nyc_zip['ZipCode'] == 11729]

Unnamed: 0,Borough,Neighborhood,ZipCode


In [60]:
manual_review = for_mlsli[for_mlsli['zip'].astype(int).isin(nyc_zip['ZipCode'])]

In [61]:
print(f'not in NYC: {remove.shape}\nin NYC: {manual_review.shape}')

not in NYC: (372, 105)
in NYC: (1894, 105)


In [62]:
remove.groupby('boro')['id'].nunique()

boro
Bronx         19
Brooklyn      74
Manhattan    169
Queens       110
Name: id, dtype: int64

In [63]:
manual_review.groupby('boro')['id'].nunique()

boro
Bronx           5
Brooklyn       16
Manhattan      13
Queens       1860
Name: id, dtype: int64

In [67]:
import plotly.graph_objects as go
trace = go.Scattermapbox(
    lat=manual_review['centroid_latitude'],
    lon=manual_review['centroid_longitude'],
    mode='markers',
    marker=dict(size=5, color='blue'),
    text=manual_review['id']
)
# Create layout
layout = go.Layout(
    title='In Perchwell but not in PLUTO',
    autosize=True,
    hovermode='closest',
    mapbox=dict(
        accesstoken='pk.eyJ1IjoiYXJuYXYtcHciLCJhIjoiY2xpYzR1NGFnMGN3ZTNkbXE5MHlid2hleiJ9.m7BGQisN4zz2iwFPWpiqsQ',
        bearing=0,
        center=dict(
            lat=40.7128,  # Center latitude of New York City
            lon=-74.0060  # Center longitude of New York City
        ),
        pitch=0,
        zoom=10
    ),
)
# Create figure
fig = go.Figure(data=[trace], layout=layout)
# Display figure
fig.show()

In [65]:
manual_review.columns

Index(['id', 'city', 'state', 'zip', 'google_map_params',
       'building_bike_storage', 'building_doorman', 'building_elevator',
       'building_gym', 'building_laundry',
       ...
       'num_mlsli', 'last_mlsli_listing', 'building_id', 'num_other_listings',
       'last_listing', 'building_id', 'num_acris', 'last_acris_created',
       'last_acris_sale', 'boro'],
      dtype='object', length=105)

In [68]:
import geopandas as gpd

ModuleNotFoundError: No module named 'geopandas'

In [69]:
pluto.head()

Unnamed: 0,borough,block,lot,cd,bct2020,bctcb2020,ct2010,cb2010,schooldist,council,...,appbbl,appdate,plutomapid,firm07_flag,pfirm15_flag,version,dcpedited,latitude,longitude,notes
0,BK,7104,338,315.0,3040000.0,30400000000.0,400.0,1000.0,21.0,47.0,...,,,1,,,23v1.1,,40.599103,-73.972539,
1,BK,7104,339,315.0,3040000.0,30400000000.0,400.0,1000.0,21.0,47.0,...,,,1,,,23v1.1,,40.599108,-73.972485,
2,BK,3435,19,304.0,3041100.0,30411000000.0,411.0,1000.0,32.0,37.0,...,,,1,,,23v1.1,,40.689043,-73.906925,
3,BK,3106,33,301.0,3049100.0,30491000000.0,491.0,2001.0,14.0,34.0,...,,,1,,,23v1.1,,40.703341,-73.941771,
4,BK,3435,20,304.0,3041100.0,30411000000.0,411.0,1000.0,32.0,37.0,...,,,1,,,23v1.1,,40.689081,-73.906886,


In [74]:
pluto[pluto['address'].str.contains('COVERT')]['address']

342       206 COVERT STREET
2990      247 COVERT STREET
11455     312 COVERT STREET
11472     220 COVERT STREET
11486     252 COVERT STREET
                ...        
831660    290 COVERT STREET
831833    231 COVERT STREET
831856    259 COVERT STREET
832022     72 COVERT STREET
839257    225 COVERT STREET
Name: address, Length: 229, dtype: object

In [80]:
pluto[(pluto['block'] == 3415) & (pluto['address'].str.contains('EVERGREEN'))]['address']

659584    690 EVERGREEN AVENUE
Name: address, dtype: object

In [99]:
pluto[pluto['bbl'] == 3036030010]['address']

Series([], Name: address, dtype: object)

In [97]:
preso[preso['source_id'] == 3036030010]

Unnamed: 0,id,city,state,zip,google_map_params,building_bike_storage,building_doorman,building_elevator,building_gym,building_laundry,...,num_mlsli,last_mlsli_listing,building_id,num_other_listings,last_listing,building_id.1,num_acris,last_acris_created,last_acris_sale,boro
481247,212392,Brooklyn,NY,11212,,,,,,,...,,NaT,,,NaT,212392.0,3.0,2018-05-16 08:20:20.578716,2017-10-11,Brooklyn


In [108]:
pd.options.display.max_rows = 400
pluto[pluto['address'].str.contains(str.upper('Rockaway Avenue'))]['address'].sort_values()

836023        100 ROCKAWAY AVENUE
120209        102 ROCKAWAY AVENUE
278389       1029 ROCKAWAY AVENUE
728693    1034-38 ROCKAWAY AVENUE
624809       1035 ROCKAWAY AVENUE
389590        104 ROCKAWAY AVENUE
624515       1040 ROCKAWAY AVENUE
27719        1045 ROCKAWAY AVENUE
48820        1050 ROCKAWAY AVENUE
48530        1055 ROCKAWAY AVENUE
574844       1058 ROCKAWAY AVENUE
282827        106 ROCKAWAY AVENUE
478398       1061 ROCKAWAY AVENUE
683237       1066 ROCKAWAY AVENUE
578824        108 ROCKAWAY AVENUE
48819        1084 ROCKAWAY AVENUE
48511        1085 ROCKAWAY AVENUE
610947       1095 ROCKAWAY AVENUE
578825        110 ROCKAWAY AVENUE
282830        112 ROCKAWAY AVENUE
282829        116 ROCKAWAY AVENUE
48513        1165 ROCKAWAY AVENUE
48822        1168 ROCKAWAY AVENUE
654306        118 ROCKAWAY AVENUE
550630       1184 ROCKAWAY AVENUE
154750       1185 ROCKAWAY AVENUE
282828        120 ROCKAWAY AVENUE
154749       1205 ROCKAWAY AVENUE
120470        122 ROCKAWAY AVENUE
552227       1

In [117]:
zips = pluto['zipcode'].dropna()

In [118]:
print(len(zips))
zips = set(zips)

856564


In [119]:
len(zips)

217

In [120]:
zips

{10001.0,
 10002.0,
 10003.0,
 10004.0,
 10005.0,
 10006.0,
 10007.0,
 10009.0,
 10010.0,
 10011.0,
 10012.0,
 10013.0,
 10014.0,
 10016.0,
 10017.0,
 10018.0,
 10019.0,
 10020.0,
 10021.0,
 10022.0,
 10023.0,
 10024.0,
 10025.0,
 10026.0,
 10027.0,
 10028.0,
 10029.0,
 10030.0,
 10031.0,
 10032.0,
 10033.0,
 10034.0,
 10035.0,
 10036.0,
 10037.0,
 10038.0,
 10039.0,
 10040.0,
 10044.0,
 10045.0,
 10055.0,
 10065.0,
 10069.0,
 10075.0,
 10103.0,
 10105.0,
 10106.0,
 10110.0,
 10112.0,
 10118.0,
 10119.0,
 10120.0,
 10121.0,
 10122.0,
 10123.0,
 10128.0,
 10151.0,
 10152.0,
 10155.0,
 10158.0,
 10165.0,
 10169.0,
 10170.0,
 10175.0,
 10176.0,
 10271.0,
 10278.0,
 10280.0,
 10281.0,
 10282.0,
 10301.0,
 10302.0,
 10303.0,
 10304.0,
 10305.0,
 10306.0,
 10307.0,
 10308.0,
 10309.0,
 10310.0,
 10312.0,
 10314.0,
 10451.0,
 10452.0,
 10453.0,
 10454.0,
 10455.0,
 10456.0,
 10457.0,
 10458.0,
 10459.0,
 10460.0,
 10461.0,
 10462.0,
 10463.0,
 10464.0,
 10465.0,
 10466.0,
 10467.0,
 10468.0,


In [123]:
zips.__contains__(11103)

True

In [124]:
zips_bx = [10463, 10471,10466, 10469, 10470, 10475,10458, 10467, 10468,10461, 10462, 10464, 10465, 10472, 10473,10453, 10457, 10460 ,10451, 10452, 10456, 10454, 10455, 10459, 10474]
zips_bk = [ 11211, 11222, 11201, 11205, 11215, 11217, 11231, 11213, 11212, 11216, 11233, 11238, 11207, 11208, 11220, 11232, 11204, 11218, 11219, 11230, 11203, 11210, 11225, 11226, 11234, 11236, 11239, 11209, 11214, 11228, 11223, 11224, 11229, 11235, 11206, 11221, 11237]
zips_mn = [10031, 10032, 10033, 10034, 10040,10026, 10027, 10030, 10037, 10039,10029, 10035,10023, 10024, 10025,10021, 10028, 10044, 10128,10001, 10011, 10018, 10019, 10020, 10036,10010, 10016, 10017, 10022,10012, 10013, 10014,10002, 10003, 10009, 10004, 10005, 10006, 10007, 10038, 10280]
zips_qn = [11101, 11102, 11103, 11104, 11105, 11106,11368, 11369, 11370, 11372, 11373, 11377, 11378,11354, 11355, 11356, 11357, 11358, 11359, 11360,11361, 11362, 11363, 11364,11374, 11375, 11379, 11385,11365, 11366, 11367,11414, 11415, 11416, 11417, 11418, 11419, 11420, 11421,11412, 11423, 11432, 11433, 11434, 11435, 11436,11004, 11005, 11411, 11413, 11422, 11426, 11427, 11428, 11429,11691, 11692, 11693, 11694, 11695, 11697]

In [125]:
zips2 = zips_bx + zips_bk + zips_mn + zips_qn
len(zips2)

164

In [131]:
y = [x if x not in zips2 else None for x in zips]

In [132]:
len(set(y))

54

In [137]:
not_matched[~not_matched['zip'].astype(int).isin(zips)]

Unnamed: 0,id,city,state,zip,google_map_params,building_bike_storage,building_doorman,building_elevator,building_gym,building_laundry,...,num_mlsli,last_mlsli_listing,building_id,num_other_listings,last_listing,building_id.1,num_acris,last_acris_created,last_acris_sale,boro
21023,337288,Queens,NY,0,,,,,,,...,,NaT,,,NaT,337288.0,1.0,2018-03-19 11:22:52.175444,2013-04-11,Queens
23741,380346,Queens,NY,0,,,,,,,...,,NaT,,,NaT,380346.0,1.0,2018-03-19 06:43:48.303794,2017-10-20,Queens
46427,1371847,Great Neck,NY,11020,,,,,,,...,1.0,2019-10-23 12:16:15.592998,1371847.0,1.0,2019-08-28 03:26:33.985033,,,NaT,,Queens
46486,1508228,Lawrence,NY,11559,,,,,,,...,,NaT,,,NaT,,,NaT,,Queens
46487,1597806,Valley Stream,NY,11580,,,,,,,...,,NaT,1597806.0,1.0,2019-08-28 02:10:22.547846,,,NaT,,Queens
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
748752,1815392,New York,NY,11746,,,,,,,...,1.0,2019-08-28 23:18:13.707010,,,NaT,,,NaT,,Manhattan
748759,1815616,New York,NY,11729,,,,,,,...,1.0,2020-10-12 02:57:50.184727,,,NaT,,,NaT,,Manhattan
748763,1815933,New York,NY,11754,,,,,,,...,1.0,2019-08-28 21:46:58.964171,,,NaT,,,NaT,,Queens
748770,1816398,New York,NY,12775,,,,,,,...,1.0,2020-09-25 20:15:50.705117,,,NaT,,,NaT,,Queens


In [143]:
for_mlsli[~for_mlsli['zip'].astype(int).isin(zips2)].shape

(372, 105)

In [139]:
for_mlsli.shape

(2266, 105)

In [150]:
mlsli_nyc = for_mlsli[for_mlsli['zip'].astype(int).isin(zips2)]

In [151]:
mlsli_nyc[mlsli_nyc['id'] == 1806923]

Unnamed: 0,id,city,state,zip,google_map_params,building_bike_storage,building_doorman,building_elevator,building_gym,building_laundry,...,num_mlsli,last_mlsli_listing,building_id,num_other_listings,last_listing,building_id.1,num_acris,last_acris_created,last_acris_sale,boro


In [152]:
mlsli_nyc.shape

(1894, 105)

In [153]:
mlsli_nyc.groupby('boro')['id'].nunique()

boro
Bronx           5
Brooklyn       16
Manhattan      13
Queens       1860
Name: id, dtype: int64

In [159]:
pluto[pluto['address'] == '410 TOMPKINS AVENUE'][['bldgclass', 'bbl', 'zipcode']]

Unnamed: 0,bldgclass,bbl,zipcode
432901,RM,3018398000.0,11216.0


In [156]:
pluto.columns

Index(['borough', 'block', 'lot', 'cd', 'bct2020', 'bctcb2020', 'ct2010',
       'cb2010', 'schooldist', 'council', 'zipcode', 'firecomp', 'policeprct',
       'healthcenterdistrict', 'healtharea', 'sanitboro', 'sanitdistrict',
       'sanitsub', 'address', 'zonedist1', 'zonedist2', 'zonedist3',
       'zonedist4', 'overlay1', 'overlay2', 'spdist1', 'spdist2', 'spdist3',
       'ltdheight', 'splitzone', 'bldgclass', 'landuse', 'easements',
       'ownertype', 'ownername', 'lotarea', 'bldgarea', 'comarea', 'resarea',
       'officearea', 'retailarea', 'garagearea', 'strgearea', 'factryarea',
       'otherarea', 'areasource', 'numbldgs', 'numfloors', 'unitsres',
       'unitstotal', 'lotfront', 'lotdepth', 'bldgfront', 'bldgdepth', 'ext',
       'proxcode', 'irrlotcode', 'lottype', 'bsmtcode', 'assessland',
       'assesstot', 'exempttot', 'yearbuilt', 'yearalter1', 'yearalter2',
       'histdist', 'landmark', 'builtfar', 'residfar', 'commfar', 'facilfar',
       'borocode', 'bbl', 'cond

In [161]:
preso[preso['display_address'].str.contains('118 LEROY')]

Unnamed: 0,id,city,state,zip,google_map_params,building_bike_storage,building_doorman,building_elevator,building_gym,building_laundry,...,num_mlsli,last_mlsli_listing,building_id,num_other_listings,last_listing,building_id.1,num_acris,last_acris_created,last_acris_sale,boro
