The master_file serves as the glue for all other files in the project. It is the central file that loads the training and test data, tags the addresses, standardizes them, and compares the different address lists.

In [16]:
from collections import defaultdict
from address_compare import standardizers as stndrdzr
from address_compare import comparers as comps
from address_compare import matcher as mtch
from address_compare.crf_tagger import AddressTagger
from address_compare import address_randomizer as add_rndm
import json
import pandas as pd

In [17]:
# Editable parameters that control the rest of the workflow
retrain_crf_tagger = False  # when True, the tagged training data is loaded for retraining the CRF tagger
standardize_addresses = True  # forwarded as `standardize` when the raw addresses are tagged
num_rndm_addresses_to_create = 100  # size of each randomly generated address list
use_raw_address_files = False  # if False, only the specified number of randomly created addresses above will be used

# Column of the raw address DataFrames that is handed to the tagger
field_name_raw_addresses = 'Single String Address'

# Excel workbooks read when use_raw_address_files is True.
# Forward slashes are used as separators because they resolve on Windows as
# well as on POSIX systems, whereas hard-coded backslashes only work on Windows.
file_location_raw_addresses_1 = 'address_compare/data/sandbox data.xlsx'
file_location_raw_addresses_2 = 'address_compare/data/sandbox data.xlsx'

In [18]:
# Placeholder for retraining the CRF tagger: when enabled, load the tagged training data
if retrain_crf_tagger:
    # JSON text is UTF-8 by specification; passing the encoding explicitly keeps
    # the read independent of the platform's default locale encoding.
    with open('data/tagged_addresses.json', encoding='utf-8') as f:
        td = json.load(f)

    # TODO: send the training data `td` to the CRF tagger to train the model here...

In [19]:
# Build the two raw address lists: randomly generated by default, or read from
# the configured Excel files when use_raw_address_files is switched on
if not use_raw_address_files:
    raw_address_list_1, raw_address_list_2 = (
        add_rndm.random_addresses(num_rndm_addresses_to_create, field_name_raw_addresses),
        add_rndm.random_addresses(num_rndm_addresses_to_create, field_name_raw_addresses),
    )
else:
    raw_address_list_1, raw_address_list_2 = (
        pd.read_excel(file_location_raw_addresses_1),
        pd.read_excel(file_location_raw_addresses_2),
    )

In [20]:
# Collect both raw address DataFrames in one list so the following
# preparation steps can loop over them instead of repeating each call
raw_addresses = [raw_address_list_1, raw_address_list_2]

In [21]:
# Add a field called Record_ID if it doesn't already exist in the raw address files.
# The helper's return value is stored back into the list (and the two named
# lists are rebound below) so the result is kept even if the helper returns a
# new DataFrame rather than modifying its argument in place.
for idx, dtfrm in enumerate(raw_addresses):
    updated = stndrdzr.record_id_addition(dtfrm)
    # A helper that works purely in place may return None; keep the frame then.
    if updated is not None:
        raw_addresses[idx] = updated

# Later cells read raw_address_list_1 / raw_address_list_2 directly, so keep
# those names pointing at the processed frames.
raw_address_list_1, raw_address_list_2 = raw_addresses

In [22]:
# Add empty columns to the raw address DataFrames for the fields listed below.
# The helper's return value is stored back into the list (and the two named
# lists are rebound below) so the result is kept even if the helper returns a
# new DataFrame rather than modifying its argument in place.
missing_columns = ['CITY','STATE','ZIP_CODE','UNKNOWN']
for idx, dtfrm in enumerate(raw_addresses):
    updated = stndrdzr.empty_column_addition(dtfrm, missing_columns)
    # A helper that works purely in place may return None; keep the frame then.
    if updated is not None:
        raw_addresses[idx] = updated

# Later cells read raw_address_list_1 / raw_address_list_2 directly, so keep
# those names pointing at the processed frames.
raw_address_list_1, raw_address_list_2 = raw_addresses

In [23]:
# Instantiate an AddressTagger with its default options. Per the original note,
# the defaults give the model trained in the `Train CRF Model`.ipynb notebook.
# The instance is used below to tag both raw address lists.
at = AddressTagger()

In [24]:
# Run the CRF tagger over the single-string address column of each raw list,
# first list first, producing one tagged DataFrame per list
tagged_address_list_1, tagged_address_list_2 = (
    at.series_to_address_df(
        raw_frame[field_name_raw_addresses],
        standardize=standardize_addresses,
    )
    for raw_frame in (raw_address_list_1, raw_address_list_2)
)

In [25]:
# Carry the record id and the placeholder columns over from each raw DataFrame
# onto its tagged counterpart (DataFrame.join aligns the rows on the index)
carried_over_columns = ['Record_ID','CITY','STATE','ZIP_CODE','UNKNOWN']
joined_address_list_1 = tagged_address_list_1.join(raw_address_list_1[carried_over_columns])
joined_address_list_2 = tagged_address_list_2.join(raw_address_list_2[carried_over_columns])

In [26]:
# Consolidate duplicates within each joined address list (list 1 first, then
# list 2) before the two lists are compared with each other
grouped_address_list_1, grouped_address_list_2 = map(
    stndrdzr.consolidate_address_list,
    (joined_address_list_1, joined_address_list_2),
)

In [27]:
# Match the two consolidated address lists against each other.
# Only the exact matcher is called here; the original note also mentions a
# learning-based matcher as an alternative, which this cell does not invoke.
# The result is indexed below by 'Record_ID_list_1' and 'Record_ID_list_2'.
exact_matches = mtch.exact_matcher(grouped_address_list_1, grouped_address_list_2)

In [28]:
# Keep only the addresses whose Record_ID has no exact match in the other list.
# Boolean indexing replaces the former mask(...).dropna() chain: dropna() drops
# every row containing any missing value, so unmatched addresses with an empty
# field were discarded together with the matched ones, and mask() can change
# column dtypes while blanking the matched rows.
matched_in_list_1 = grouped_address_list_1.Record_ID.isin(exact_matches['Record_ID_list_1'])
matched_in_list_2 = grouped_address_list_2.Record_ID.isin(exact_matches['Record_ID_list_2'])
unmatched_address_list_1 = grouped_address_list_1[~matched_in_list_1]
unmatched_address_list_2 = grouped_address_list_2[~matched_in_list_2]