The master_file serves as the glue for all other files in the project.  It is the central file to load training and test data, tag the addresses, standardize the addresses, and compare the different address lists.

In [389]:
from collections import defaultdict
from address_compare import standardizers as stndrdzr
from address_compare import comparers as comps
from address_compare import matcher as mtch
from address_compare.crf_tagger import AddressTagger
from address_compare import address_randomizer as add_rndm
import json
import pandas as pd

In [390]:
# Editable parameters that control the rest of the pipeline.
retrain_crf_tagger = False  # if True, the tagged training data file is loaded for retraining
standardize_addresses = True  # forwarded as `standardize` when the raw addresses are tagged
num_rndm_addresses_to_create = 100
use_raw_address_files = True  # if False, only the specified number of randomly created addresses above will be used
group_addresses_intra_list = True  # if False, duplicates within a list will not be grouped in order to easily compare against the golden/manual matches

# Column names expected in (or added to) the raw address data.
field_name_raw_addresses = 'Single String Address'
field_name_record_id = 'Record_ID'

# Forward slashes are accepted as path separators on Windows as well as on
# POSIX systems; a literal backslash only works on Windows.
file_location_raw_addresses_1 = 'data/MarijuanaApplicants - test data list 1.xlsx'
file_location_raw_addresses_2 = 'data/MarijuanaApplicants - test data list 2.xlsx'

In [391]:
# Placeholder: load the tagged training data that a retraining run of the
# CRF tagger would consume. Nothing is trained here yet.
if retrain_crf_tagger:
    with open('data/tagged_addresses.json') as training_file:
        td = json.load(training_file)

    # TODO: send the training data (`td`) to the CRF tagger to train the model here...

In [392]:
# Build the two raw address lists: either read them from the configured Excel
# files, or generate the configured number of random addresses for each list.
if use_raw_address_files:
    raw_address_list_1, raw_address_list_2 = (
        pd.read_excel(excel_path)
        for excel_path in (file_location_raw_addresses_1, file_location_raw_addresses_2)
    )
else:
    raw_address_list_1, raw_address_list_2 = (
        add_rndm.random_addresses(num_rndm_addresses_to_create, field_name_raw_addresses)
        for _ in range(2)
    )

In [393]:
# Ensure both raw address lists carry the record-id column (named by
# field_name_record_id); per the original note it is added only if missing.
raw_address_list_1, raw_address_list_2 = (
    stndrdzr.record_id_addition(raw_list, field_name_record_id)
    for raw_list in (raw_address_list_1, raw_address_list_2)
)

In [394]:
# Columns that must exist on both raw address lists; empty ones are added
# for any that are missing.
missing_columns = [
    'CITY',
    'STATE',
    'ZIP_CODE',
    'UNKNOWN',
]
raw_address_list_1, raw_address_list_2 = (
    stndrdzr.empty_column_addition(raw_list, missing_columns)
    for raw_list in (raw_address_list_1, raw_address_list_2)
)

In [395]:
# Instantiate the AddressTagger with its default options. Per the original
# note, the defaults give the model trained in the `Train CRF Model`.ipynb notebook.
at = AddressTagger()

In [396]:
# Run the CRF tagger over the raw address column of each list. The
# standardize_addresses flag is forwarded as the `standardize` option.
tagged_address_list_1, tagged_address_list_2 = (
    at.series_to_address_df(raw_list[field_name_raw_addresses], standardize=standardize_addresses)
    for raw_list in (raw_address_list_1, raw_address_list_2)
)

In [397]:
# Zip code / city clean-up on the raw lists: per the original note this checks
# for zip code errors and replaces city names with the zip code's primary city.
raw_address_list_1, raw_address_list_2 = (
    stndrdzr.fix_cities_zips(raw_list)
    for raw_list in (raw_address_list_1, raw_address_list_2)
)

In [398]:
# Add the remaining columns from the raw address DataFrames to the tagged
# address DataFrames.
# The column list is built from field_name_record_id and missing_columns so it
# stays in step with those settings instead of repeating their literal values.
# NOTE(review): DataFrame.join aligns on the index, so this assumes the tagged
# and raw frames share the same index -- confirm against series_to_address_df.
columns_from_raw = [field_name_record_id] + missing_columns + ['Zip_Code_Error']
joined_address_list_1 = tagged_address_list_1.join(raw_address_list_1[columns_from_raw])
joined_address_list_2 = tagged_address_list_2.join(raw_address_list_2[columns_from_raw])

In [399]:
# Collect the addresses flagged with a zip code error (i.e., where the zip
# code is not valid for the given state).
# Boolean indexing selects the flagged rows directly. The earlier
# where(...).dropna() round trip first filled every other row with NaN, which
# upcasts integer columns to float before those rows are dropped again.
# dropna() is kept so rows containing any missing value are still excluded,
# exactly as before.
error_addresses_list_1 = joined_address_list_1[joined_address_list_1.Zip_Code_Error == "Yes"].dropna()
error_addresses_list_2 = joined_address_list_2[joined_address_list_2.Zip_Code_Error == "Yes"].dropna()

In [400]:
# Keep only the addresses without a zip code error (i.e., where the zip code
# is valid for the given state).
# Boolean indexing selects the rows directly. The earlier where(...).dropna()
# round trip first filled every other row with NaN, which upcasts integer
# columns to float before those rows are dropped again.
# dropna() is kept so rows containing any missing value are still excluded,
# exactly as before.
# NOTE(review): rows whose Zip_Code_Error is neither "Yes" nor "No" end up in
# neither list -- confirm that is intended.
nonerror_addresses_list_1 = joined_address_list_1[joined_address_list_1.Zip_Code_Error == "No"].dropna()
nonerror_addresses_list_2 = joined_address_list_2[joined_address_list_2.Zip_Code_Error == "No"].dropna()

In [401]:
# Normalise the record id and zip code columns to strings. The cast goes
# through int first -- presumably so float-typed values render as "80202"
# rather than "80202.0"; verify against the upstream dtypes.
nonerror_addresses_list_1 = (
    nonerror_addresses_list_1
    .astype({'Record_ID': 'int', 'ZIP_CODE': 'int'})
    .astype({'Record_ID': 'str', 'ZIP_CODE': 'str'})
)
nonerror_addresses_list_2 = (
    nonerror_addresses_list_2
    .astype({'Record_ID': 'int', 'ZIP_CODE': 'int'})
    .astype({'Record_ID': 'str', 'ZIP_CODE': 'str'})
)

In [402]:
# Optionally consolidate duplicate addresses within each list before matching.
if not group_addresses_intra_list:
    # Grouping disabled: carry the error-free lists forward as independent copies.
    grouped_address_list_1 = nonerror_addresses_list_1.copy()
    grouped_address_list_2 = nonerror_addresses_list_2.copy()
else:
    grouped_address_list_1, grouped_address_list_2 = (
        stndrdzr.consolidate_address_list(address_list)
        for address_list in (nonerror_addresses_list_1, nonerror_addresses_list_2)
    )

In [403]:
# Match the two (optionally consolidated) address lists with the exact matcher.
# NOTE(review): the earlier comment also mentioned a "Learning Match" function,
# but only mtch.exact_matcher is called here.
exact_matches = mtch.exact_matcher(grouped_address_list_1, grouped_address_list_2)

In [404]:
# Addresses left over after exact matching: keep only the rows whose Record_ID
# does not appear in the corresponding id column of exact_matches. Rows that
# did match are blanked out by where() and then removed by dropna().
unmatched_address_list_1 = grouped_address_list_1.where(
    ~grouped_address_list_1['Record_ID'].isin(exact_matches['Record_ID_list_1'])
).dropna()
unmatched_address_list_2 = grouped_address_list_2.where(
    ~grouped_address_list_2['Record_ID'].isin(exact_matches['Record_ID_list_2'])
).dropna()

In [405]:
# Sheet name -> DataFrame mapping for the Excel report. Insertion order is the
# order in which the sheets are written.
# NOTE(review): the sheet names mix "list_1" and "list1" spellings; they are
# left unchanged here because they are part of the output file.
dataframes_for_excel = {
    'raw_addresses_list_1': raw_address_list_1,
    'raw_addresses_list2': raw_address_list_2,
    'zip_errors_list1': error_addresses_list_1,
    'zip_errors_list2': error_addresses_list_2,
    'exact_matches': exact_matches,
    'unmatched_list_1': unmatched_address_list_1,
    'unmatched_list_2': unmatched_address_list_2,
}

In [406]:
# Write each DataFrame to its own sheet of a single Excel workbook.
# The context manager saves and closes the workbook on exit (also when a
# sheet fails to write). It replaces the explicit ExcelWriter.save() call,
# which was removed in pandas 2.0.
# The forward-slash path works on Windows as well as on POSIX systems.
with pd.ExcelWriter('output/raw_to_matched_addresses.xlsx', engine='xlsxwriter') as writer:
    for sheet, frame in dataframes_for_excel.items():
        frame.to_excel(writer, sheet_name=sheet)

In [407]:
# Load the manually tagged matches and keep the exact ones, for comparison
# with the exact matcher's output. Only done when intra-list grouping is off.
if not group_addresses_intra_list:
    # Forward slashes work on Windows as well as on POSIX systems.
    manual_matches = pd.read_excel('data/marijuana applicants test data - correct matches.xlsx', dtype=str)
    # Boolean indexing selects the "Exact" / "Standardized Exact" rows directly
    # instead of NaN-filling the other rows and dropping them afterwards.
    # dropna() is kept so rows with any missing value are still excluded.
    golden_exact_matches = (
        manual_matches[manual_matches.Match_Type.isin(["Exact", "Standardized Exact"])]
        .dropna()
        .reset_index()
    )


In [408]:
# Compare the matcher's id pairs against the golden (manual) id pairs.
if not group_addresses_intra_list:
    join_cols = ['Record_ID_list_1', 'Record_ID_list_2']

    # Reduce both frames to the id pair and record each row's own index label
    # in a `row_index` column so rows can be traced back after the comparison.
    subset_columns_exact_matches = exact_matches[join_cols].assign(
        row_index=lambda frame: frame.index
    )
    subset_cols_golden_exact_matches = golden_exact_matches[join_cols].assign(
        row_index=lambda frame: frame.index
    )

    test_vs_golden_compare = mtch.exact_matcher(
        subset_columns_exact_matches,
        subset_cols_golden_exact_matches,
        join_cols,
    )
    print(test_vs_golden_compare)

In [409]:
# Golden matches that the exact matcher did not reproduce.
if not group_addresses_intra_list:
    # Keep the golden rows whose row_index is absent from the comparison
    # result. Boolean indexing avoids the mask(...).dropna() round trip, which
    # NaN-filled the matched rows and thereby turned the integer row_index
    # column into floats in the printed report. dropna() is kept so rows with
    # missing values are still excluded.
    missing_golden_matches = subset_cols_golden_exact_matches[
        ~subset_cols_golden_exact_matches.row_index.isin(test_vs_golden_compare.row_index_list_2)
    ].dropna()
    print(missing_golden_matches)

In [410]:
# Matches produced by the exact matcher that are not in the golden set.
if not group_addresses_intra_list:
    # Keep the matcher rows whose row_index is absent from the comparison
    # result. Boolean indexing avoids the mask(...).dropna() round trip, which
    # NaN-filled the matched rows and thereby turned the integer row_index
    # column into floats in the printed report. dropna() is kept so rows with
    # missing values are still excluded.
    matches_not_in_golden = subset_columns_exact_matches[
        ~subset_columns_exact_matches.row_index.isin(test_vs_golden_compare.row_index_list_1)
    ].dropna()
    print(matches_not_in_golden)