The master_file serves as the glue for all other files in the project.  It is the central file to load training and test data, tag the addresses, standardize the addresses, and compare the different address lists.

In [50]:
from collections import defaultdict
from address_compare import standardizers as stndrdzr
from address_compare import comparers as comps
from address_compare import matcher as mtch
from address_compare.crf_tagger import AddressTagger
import json
import pandas as pd

In [51]:
# Editable parameters that control the rest of the notebook.
# retrain_crf_tagger: when True, the training-data cell loads the tagged training addresses.
# standardize_addresses: passed as `standardize=` to the tagger when both lists are tagged.
retrain_crf_tagger = False
standardize_addresses = True

# Location and address column of each raw address list.
# Forward slashes are used so the paths resolve on Windows, macOS and Linux alike
# (backslash-separated paths only resolve on Windows).
file_location_raw_addresses_1 = 'address_compare/data/sandbox data.xlsx'
raw_address_field_name_list_1 = 'Single String Address'

file_location_raw_addresses_2 = 'address_compare/data/sandbox data.xlsx'
raw_address_field_name_list_2 = 'Single String Address'

In [52]:
# Placeholder for reading/calling the training data for the CRF Tagger and sending the training data to train the model
if retrain_crf_tagger:
    # The encoding is set explicitly so that reading the JSON file does not depend on
    # the platform's default codec (open() otherwise falls back to the locale encoding).
    with open('data/tagged_addresses.json', encoding='utf-8') as f:
        td = json.load(f)

    #send training data to CRF tagger to train the model here...

In [53]:
# Read the two raw address lists from the Excel files named in the parameter cell.
raw_address_list_1, raw_address_list_2 = (
    pd.read_excel(excel_path)
    for excel_path in (file_location_raw_addresses_1, file_location_raw_addresses_2)
)

In [54]:
# Add new addresses to raw address list 2 so that the two lists differ.
# The column is keyed by the configured list-2 field name rather than a repeated literal,
# so these rows land in the address column even if that setting is changed.
new_addresses_to_add = pd.DataFrame({
    raw_address_field_name_list_2: [
        "52 Main St Apt 75",
        "123 Maple Dr",
        "97531 George Allen Rd",
        "13579 East Elm Ave Bldg 4",
    ]
})
print(new_addresses_to_add)

       Single String Address
0          52 Main St Apt 75
1               123 Maple Dr
2      97531 George Allen Rd
3  13579 East Elm Ave Bldg 4


In [55]:
# Append the extra addresses to list 2 and renumber the index from 0.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
# NOTE: this cell rebinds raw_address_list_2 to the longer frame, so running it again without
# first re-running the cell that loads the raw lists appends the same rows a second time.
raw_address_list_2 = pd.concat([raw_address_list_2, new_addresses_to_add], ignore_index=True)
print(raw_address_list_2)

              Single String Address
0             1732 BROADWAY, STE 20
1                      747 BROADWAY
2                  5 - 433 10TH AVE
3             APT 20, 1420 10TH AVE
4               210 10TH ST, BLDG A
5                  #23 137 11TH AVE
6                APT A 167 11TH AVE
7            UNIT 123 - 319 11TH PL
8                      129 12TH AVE
9   3 BRITISH PROPERTY WAY, UNIT 20
10            1732 BROADWAY, STE 20
11                     747 BROADWAY
12                 5 - 433 10TH AVE
13            APT 20, 1420 10TH AVE
14              210 10TH ST, BLDG A
15                 #23 137 11TH AVE
16               APT A 167 11TH AVE
17           UNIT 123 - 319 11TH PL
18                     129 12TH AVE
19  3 BRITISH PROPERTY WAY, UNIT 20
20               APT A 167 11TH AVE
21           UNIT 123 - 319 11TH PL
22                     129 12TH AVE
23  3 BRITISH PROPERTY WAY, UNIT 20
24                     250 25TH AVE
25                    2500 BROADWAY
26                52 Main St

In [56]:
# Create List for Raw Address Dataframes
# The list holds references to the same two DataFrame objects, so the following cells can
# loop over both lists instead of repeating each step per list.
raw_addresses = [raw_address_list_1, raw_address_list_2]

In [57]:
# Add a field called Record_ID if it doesn't already exist in the raw address files.
# Rebinding only the loop variable would discard whatever the helper returns, so the result
# is written back to the list and to the named frames that later cells read.
# A None return (helper changed the frame in place) leaves the existing frame where it is.
for position, dtfrm in enumerate(raw_addresses):
    with_record_id = stndrdzr.record_id_addition(dtfrm)
    if with_record_id is not None:
        raw_addresses[position] = with_record_id
raw_address_list_1, raw_address_list_2 = raw_addresses

In [58]:
# Add empty placeholder columns to each raw address frame.
# Rebinding only the loop variable would discard whatever the helper returns, so the result
# is written back to the list and to the named frames that later cells read.
# A None return (helper changed the frame in place) leaves the existing frame where it is.
missing_columns = ['CITY','STATE','ZIP_CODE','UNKNOWN']
for position, dtfrm in enumerate(raw_addresses):
    with_missing_columns = stndrdzr.empty_column_addition(dtfrm, missing_columns)
    if with_missing_columns is not None:
        raw_addresses[position] = with_missing_columns
raw_address_list_1, raw_address_list_2 = raw_addresses

In [59]:
# Instantiate an AddressTagger with its default options; per the original note, this gives
# the model trained in `Train CRF Model`.ipynb.
at = AddressTagger()

In [60]:
# Run the tagger over the address column of each raw list.
# `standardize` follows the standardize_addresses switch from the parameter cell.
address_series_1 = raw_address_list_1[raw_address_field_name_list_1]
address_series_2 = raw_address_list_2[raw_address_field_name_list_2]

tagged_address_list_1 = at.series_to_address_df(
    address_series_1,
    standardize=standardize_addresses,
)
tagged_address_list_2 = at.series_to_address_df(
    address_series_2,
    standardize=standardize_addresses,
)

In [61]:
# Attach the remaining raw-frame columns to each tagged frame (index-aligned join).
carried_over_columns = ['Record_ID', 'CITY', 'STATE', 'ZIP_CODE', 'UNKNOWN']

raw_extras_1 = raw_address_list_1[carried_over_columns]
raw_extras_2 = raw_address_list_2[carried_over_columns]

joined_address_list_1 = tagged_address_list_1.join(raw_extras_1)
joined_address_list_2 = tagged_address_list_2.join(raw_extras_2)

In [62]:
# Show tagged list 1 together with the Record_ID/CITY/STATE/ZIP_CODE/UNKNOWN columns joined from the raw frame.
print (joined_address_list_1)

   STREET_NUMBER PRE_DIRECTION       STREET_NAME STREET_TYPE POST_DIRECTION  \
0           1732                        BROADWAY                              
1            747                        BROADWAY                              
2            433                            10TH      AVENUE                  
3           1420                            10TH      AVENUE                  
4            210                            10TH      STREET                  
5            137                            11TH      AVENUE                  
6            167                            11TH      AVENUE                  
7            319                            11TH       PLACE                  
8            129                            12TH      AVENUE                  
9              3                BRITISH PROPERTY         WAY                  
10          1732                        BROADWAY                              
11           747                        BROADWAY    

In [63]:
# Show tagged list 2 together with the Record_ID/CITY/STATE/ZIP_CODE/UNKNOWN columns joined from the raw frame.
print (joined_address_list_2)

   STREET_NUMBER PRE_DIRECTION       STREET_NAME STREET_TYPE POST_DIRECTION  \
0           1732                        BROADWAY                              
1            747                        BROADWAY                              
2            433                            10TH      AVENUE                  
3           1420                            10TH      AVENUE                  
4            210                            10TH      STREET                  
5            137                            11TH      AVENUE                  
6            167                            11TH      AVENUE                  
7            319                            11TH       PLACE                  
8            129                            12TH      AVENUE                  
9              3                BRITISH PROPERTY         WAY                  
10          1732                        BROADWAY                              
11           747                        BROADWAY    

In [64]:
# Consolidate duplicate addresses within each joined list (intra-list grouping).
grouped_address_list_1, grouped_address_list_2 = map(
    stndrdzr.consolidate_address_list,
    (joined_address_list_1, joined_address_list_2),
)

In [65]:
# Show list 1 as returned by consolidate_address_list.
print (grouped_address_list_1)

    UNIT_TYPE UNIT_NUMBER STREET_NUMBER PRE_DIRECTION       STREET_NAME  \
0                                   129                            12TH   
1                                   250                            25TH   
2                                  2500                        BROADWAY   
3                                   747                        BROADWAY   
4                      23           137                            11TH   
5           5                       433                            10TH   
6   APARTMENT          20          1420                            10TH   
7   APARTMENT           A           167                            11TH   
8    BUILDING           A           210                            10TH   
9       SUITE          20          1732                        BROADWAY   
10       UNIT         123           319                            11TH   
11       UNIT          20             3                BRITISH PROPERTY   

   STREET_TYPE POST_DIRE

In [66]:
# Match the two consolidated lists with the exact matcher (no learning matcher is called here).
# The result is read through its 'Record_ID_list_1' and 'Record_ID_list_2' entries in the next cell.
exact_matches = mtch.exact_matcher(grouped_address_list_1, grouped_address_list_2)


In [67]:
# Keep only the consolidated addresses whose Record_ID is not among the exact matches.
# A negated boolean filter is used instead of mask(...).dropna(): dropna() would also discard
# any unmatched row that happens to contain a missing value, and mask() overwrites every
# matched row with NaN before it is dropped.
is_matched_in_list_1 = grouped_address_list_1.Record_ID.isin(exact_matches['Record_ID_list_1'])
is_matched_in_list_2 = grouped_address_list_2.Record_ID.isin(exact_matches['Record_ID_list_2'])

unmatched_address_list_1 = grouped_address_list_1[~is_matched_in_list_1]
unmatched_address_list_2 = grouped_address_list_2[~is_matched_in_list_2]

In [68]:
# Show the list-1 addresses whose Record_ID was not among the exact matches.
print (unmatched_address_list_1)

Empty DataFrame
Columns: [UNIT_TYPE, UNIT_NUMBER, STREET_NUMBER, PRE_DIRECTION, STREET_NAME, STREET_TYPE, POST_DIRECTION, UNKNOWN, CITY, STATE, ZIP_CODE, Record_ID]
Index: []


In [69]:
# Show the list-2 addresses whose Record_ID was not among the exact matches.
print (unmatched_address_list_2)

    UNIT_TYPE UNIT_NUMBER STREET_NUMBER PRE_DIRECTION   STREET_NAME  \
0                                   123                       MAPLE   
5                                 97531                GEORGE ALLEN   
9   APARTMENT          75            52                        MAIN   
11   BUILDING           4         13579          EAST           ELM   

   STREET_TYPE POST_DIRECTION UNKNOWN CITY STATE ZIP_CODE Record_ID  
0        DRIVE                                                (27,)  
5         ROAD                                                (28,)  
9       STREET                                                (26,)  
11      AVENUE                                                (29,)  
