The master_file serves as the glue for all other files in the project.  It is the central file to load training and test data, tag the addresses, standardize the addresses, and compare the different address lists.

In [113]:
from collections import defaultdict
from address_compare import standardizers as stndrdzr
from address_compare import comparers as comps
from address_compare import matcher as mtch
from address_compare.crf_tagger import AddressTagger
from address_compare import address_randomizer as add_rndm
import json
import pandas as pd
import sys
import numpy as np
import sklearn

In [114]:
# Editable parameters to control the notebook run.
# Paths use forward slashes so they resolve on Windows and on POSIX systems alike.

# If True, the training file is loaded so the CRF Tagger can be retrained.
retrain_crf_tagger = False
# Passed to the tagger as `standardize`; if True, the tagged address components are standardized.
standardize_addresses = True
# Number of random addresses generated per list; only used when use_raw_address_files is False.
num_rndm_addresses_to_create = 100
# If False, randomly created addresses are used instead of the raw address files below.
use_raw_address_files = True
# If False, duplicates within a list are not grouped, which also enables the comparison
# against the golden/manual matches at the end of the notebook.
group_addresses_intra_list = False

# If True, the notebook only runs the address tagger against file_location_tagger_test_metrics,
# reports the tagger's accuracy and then stops.
view_address_tagger_metrics_against_single_file_only = False

# Name of the field in the raw address files containing the raw address (street information).
field_name_raw_addresses = 'Single String Address'
# Name of the field containing the Record ID in the raw files; if not present in the raw files, populate with None.
field_name_record_id = 'Record_ID'

file_location_raw_addresses_1 = 'data/MarijuanaApplicants - test data list 1.xlsx'
file_location_raw_addresses_2 = 'data/MarijuanaApplicants - test data list 2.xlsx'

# Only used if view_address_tagger_metrics_against_single_file_only is True.
file_location_tagger_test_metrics = 'data/standardized tagged washington state addresses.xlsx'

In [115]:
# Load the tagged training data when a retrain is requested.
# Only the loading is implemented so far; the training call itself is still a placeholder.
if retrain_crf_tagger:
    with open('data/tagged_addresses.json') as training_file:
        td = json.loads(training_file.read())

    # TODO: send the training data (`td`) to the CRF tagger to train the model here

In [116]:
# Run a single, manually tagged file through the tagger and report how well the tagger did.
# When enabled, this cell ends the run with sys.exit(), so none of the cells below it execute.
if view_address_tagger_metrics_against_single_file_only:
    # `import sklearn` alone does not guarantee that the `metrics` submodule is loaded,
    # so import it explicitly before it is used below.
    import sklearn.metrics

    # Read everything as text and keep blanks as empty strings rather than NaN.
    test_file = pd.read_excel(file_location_tagger_test_metrics, keep_default_na=False, dtype=str)
    test_file = stndrdzr.record_id_addition(test_file, field_name_record_id)
    tagger = AddressTagger()
    tagged_test_file = tagger.series_to_address_df(test_file[field_name_raw_addresses], standardize = standardize_addresses)
    crf_tagged_test_file = tagged_test_file.join(test_file['Record_ID'])

    # Map the manually tagged columns of the test file onto the tagger's column names.
    manual_to_tagger_columns = {'Tagged Street Number':'STREET_NUMBER',
                                'Tagged Pre Street Direction':'PRE_DIRECTION',
                                'Tagged Street Name':'STREET_NAME',
                                'Tagged Street Type':'STREET_TYPE',
                                'Tagged Post Street Direction':'POST_DIRECTION',
                                'Tagged Unit Type':'UNIT_TYPE',
                                'Tagged Unit Number':'UNIT_NUMBER'}
    manual_tagged_test_file = test_file[['Record_ID'] + list(manual_to_tagger_columns)].copy()
    manual_tagged_test_file = manual_tagged_test_file.rename(columns=manual_to_tagger_columns)

    cols_for_matcher = ['UNIT_TYPE','UNIT_NUMBER','STREET_NUMBER','PRE_DIRECTION','STREET_NAME','STREET_TYPE','POST_DIRECTION']
    correctly_tagged_addresses = mtch.exact_matcher(crf_tagged_test_file, manual_tagged_test_file, cols_for_matcher)

    # Select the mis-tagged rows with a boolean filter. Unlike mask(...).dropna(), this keeps
    # mis-tagged rows that happen to contain missing values and leaves the column dtypes alone.
    is_correctly_tagged = crf_tagged_test_file.Record_ID.isin(correctly_tagged_addresses['Record_ID_list_1'])
    incorrectly_tagged_addresses = crf_tagged_test_file[~is_correctly_tagged]
    total_records = crf_tagged_test_file.shape[0]
    correctly_tagged = correctly_tagged_addresses.shape[0]
    incorrectly_tagged = incorrectly_tagged_addresses.shape[0]
    # An empty test file has no defined accuracy; report NaN instead of raising ZeroDivisionError.
    tagger_accuracy = correctly_tagged / total_records if total_records else float('nan')
    print ('tagger accuracy = ', tagger_accuracy)

    # Per-column micro-averaged scores of the tagger output against the manual tags.
    for col in cols_for_matcher:
        precision, recall, fscore, ignore = sklearn.metrics.precision_recall_fscore_support(manual_tagged_test_file[col], crf_tagged_test_file[col], pos_label=None,average='micro')
        print ('column = ', col, 'precision = ', precision, 'recall = ', recall, 'f1score = ',fscore)

    # Stop here on purpose: in metrics-only mode the matching pipeline below must not run.
    sys.exit()

In [117]:
# Build the two raw address lists: read them from the configured Excel files,
# or generate the configured number of random addresses for each list.
if use_raw_address_files:
    raw_address_list_1, raw_address_list_2 = (
        pd.read_excel(file_location)
        for file_location in (file_location_raw_addresses_1, file_location_raw_addresses_2)
    )
else:
    raw_address_list_1, raw_address_list_2 = (
        add_rndm.random_addresses(num_rndm_addresses_to_create, field_name_raw_addresses)
        for _ in range(2)
    )

In [118]:
# Make sure both raw address lists carry a Record_ID field (added when the raw files lack one)
raw_address_list_1, raw_address_list_2 = (
    stndrdzr.record_id_addition(raw_list, field_name_record_id)
    for raw_list in (raw_address_list_1, raw_address_list_2)
)

In [119]:
# Pass both raw lists through empty_column_addition with the columns that later cells select by name
missing_columns = ['CITY','STATE','ZIP_CODE','UNKNOWN']
raw_address_list_1, raw_address_list_2 = (
    stndrdzr.empty_column_addition(raw_list, missing_columns)
    for raw_list in (raw_address_list_1, raw_address_list_2)
)

In [120]:
# Instantiate the AddressTagger with its default options; per the original note, the defaults
# give the model trained in the `Train CRF Model` notebook.
# `at` is reused by the tagging cell below for both raw address lists.
at = AddressTagger()

In [121]:
# Tag the raw address strings of both lists with the trained CRF tagger
tagged_address_list_1, tagged_address_list_2 = (
    at.series_to_address_df(raw_list[field_name_raw_addresses], standardize = standardize_addresses)
    for raw_list in (raw_address_list_1, raw_address_list_2)
)

In [122]:
# Check both raw lists for Zip Code errors and replace City names with the Zip Code's primary City.
# The join cell below reads a Zip_Code_Error column from the results.
raw_address_list_1, raw_address_list_2 = (
    stndrdzr.fix_cities_zips(raw_list)
    for raw_list in (raw_address_list_1, raw_address_list_2)
)

In [123]:
# Attach the non-street columns of each raw list to its tagged counterpart (joined on the index)
columns_from_raw = ['Record_ID','CITY','STATE','ZIP_CODE','UNKNOWN','Zip_Code_Error']
joined_address_list_1 = tagged_address_list_1.join(raw_address_list_1[columns_from_raw])
joined_address_list_2 = tagged_address_list_2.join(raw_address_list_2[columns_from_raw])

In [124]:
# Collect the addresses flagged with a Zip Code error (Zip_Code_Error == "Yes").
# Rows that are not flagged are blanked out and then removed by dropna(),
# which also removes any remaining row that contains a missing value.
has_zip_error_1 = joined_address_list_1.Zip_Code_Error == "Yes"
has_zip_error_2 = joined_address_list_2.Zip_Code_Error == "Yes"
error_addresses_list_1 = joined_address_list_1.mask(~has_zip_error_1).dropna()
error_addresses_list_2 = joined_address_list_2.mask(~has_zip_error_2).dropna()

In [125]:
# Keep only the addresses without a Zip Code error (Zip_Code_Error == "No").
# Every other row is blanked out and then removed by dropna(),
# which also removes any remaining row that contains a missing value.
has_valid_zip_1 = joined_address_list_1.Zip_Code_Error == "No"
has_valid_zip_2 = joined_address_list_2.Zip_Code_Error == "No"
nonerror_addresses_list_1 = joined_address_list_1.mask(~has_valid_zip_1).dropna()
nonerror_addresses_list_2 = joined_address_list_2.mask(~has_valid_zip_2).dropna()

In [126]:
# Convert Record_ID and ZIP_CODE to integers and then to strings.
# NOTE(review): presumably this strips a float ".0" left behind by the where() filtering above — confirm.
# NOTE(review): the integer step would drop leading zeros from ZIP codes — confirm that is acceptable.
integer_dtypes = {'Record_ID':'int', 'ZIP_CODE':'int'}
string_dtypes = {'Record_ID':'str', 'ZIP_CODE':'str'}
nonerror_addresses_list_1 = nonerror_addresses_list_1.astype(integer_dtypes).astype(string_dtypes)
nonerror_addresses_list_2 = nonerror_addresses_list_2.astype(integer_dtypes).astype(string_dtypes)

In [127]:
# Optionally consolidate duplicates within each tagged address list.
# Without grouping, the lists are carried forward as independent copies.
if not group_addresses_intra_list:
    grouped_address_list_1 = nonerror_addresses_list_1.copy()
    grouped_address_list_2 = nonerror_addresses_list_2.copy()
else:
    grouped_address_list_1 = stndrdzr.consolidate_address_list(nonerror_addresses_list_1)
    grouped_address_list_2 = stndrdzr.consolidate_address_list(nonerror_addresses_list_2)

In [128]:
# Match the 2 address lists with the exact matcher (the only matcher this notebook calls;
# no learning-based match is run here).
# Later cells read the Record_ID_list_1 and Record_ID_list_2 columns of the result.
exact_matches = mtch.exact_matcher(grouped_address_list_1, grouped_address_list_2)

In [129]:
# Addresses of each list whose Record_ID does not appear among the exact matches.
# Matched rows are blanked out and then removed by dropna(),
# which also removes any remaining row that contains a missing value.
was_matched_1 = grouped_address_list_1.Record_ID.isin(exact_matches['Record_ID_list_1'])
was_matched_2 = grouped_address_list_2.Record_ID.isin(exact_matches['Record_ID_list_2'])
unmatched_address_list_1 = grouped_address_list_1.where(~was_matched_1).dropna()
unmatched_address_list_2 = grouped_address_list_2.where(~was_matched_2).dropna()

In [130]:
# Sheet name -> DataFrame, in the order the sheets are written to the Excel file
dataframes_for_excel = {
    'raw_addresses_list_1': raw_address_list_1,
    'raw_addresses_list2': raw_address_list_2,
    'zip_errors_list1': error_addresses_list_1,
    'zip_errors_list2': error_addresses_list_2,
    'exact_matches': exact_matches,
    'unmatched_list_1': unmatched_address_list_1,
    'unmatched_list_2': unmatched_address_list_2,
}

In [131]:
# Write each DataFrame to its own sheet of the output workbook.
# The context manager saves and closes the workbook even if a sheet fails to write, and it
# replaces the explicit writer.save() call, which no longer exists in pandas 2.x.
# The path uses a forward slash so it resolves on Windows and on POSIX systems alike.
with pd.ExcelWriter('output/raw_to_matched_addresses.xlsx', engine='xlsxwriter') as writer:
    for sheet, frame in dataframes_for_excel.items():
        frame.to_excel(writer, sheet_name = sheet)

In [132]:
# Load the manually tagged matches and keep the exact ones as the golden reference.
# Only done when duplicates were not grouped within each list.
if not group_addresses_intra_list:
    # Forward slash so the path resolves on Windows and on POSIX systems alike.
    manual_matches = pd.read_excel('data/marijuana applicants test data - correct matches.xlsx', dtype=str)
    # Rows with any other Match_Type are blanked out and then removed by dropna(),
    # which also removes any remaining row that contains a missing value.
    golden_exact_matches = manual_matches.where(manual_matches.Match_Type.isin(["Exact","Standardized Exact"])).dropna().reset_index()


In [133]:
# Find the modeled exact matches that also appear among the golden exact matches
if not group_addresses_intra_list:
    join_cols = ['Record_ID_list_1','Record_ID_list_2']

    # Keep only the Record ID pair of each match and remember its original row position,
    # with every value converted to text before the two frames are matched.
    subset_columns_exact_matches = (
        exact_matches[join_cols]
        .copy()
        .assign(row_index=lambda frame: frame.index)
        .astype(str)
    )
    subset_cols_golden_exact_matches = (
        golden_exact_matches[join_cols]
        .copy()
        .assign(row_index=lambda frame: frame.index)
        .astype(str)
    )

    test_vs_golden_compare = mtch.exact_matcher(subset_columns_exact_matches, subset_cols_golden_exact_matches, join_cols)
    print (test_vs_golden_compare)

    Record_ID_list_1 Record_ID_list_2 row_index_list_1 row_index_list_2
0                  1                1                0                0
1                  3                2                1                1
2                  4                3                2                2
3                  5                4                3                3
4                  6                5                4                4
5                  7                6                5                5
6                  8                7                6                6
7                 11                8                7                7
8                 12                9                8                8
9                 13               10                9                9
10                14               12               10               10
11                15               13               11               11
12                16               14               12          

In [134]:
# Golden exact matches that the exact matcher did not find (false negatives)
if not group_addresses_intra_list:
    golden_was_found = subset_cols_golden_exact_matches.row_index.isin(test_vs_golden_compare.row_index_list_2)
    missing_golden_matches = subset_cols_golden_exact_matches.where(~golden_was_found).dropna()
    print (missing_golden_matches)

Empty DataFrame
Columns: [Record_ID_list_1, Record_ID_list_2, row_index]
Index: []


In [135]:
# Modeled exact matches that are absent from the golden exact matches (false positives)
if not group_addresses_intra_list:
    match_is_golden = subset_columns_exact_matches.row_index.isin(test_vs_golden_compare.row_index_list_1)
    matches_not_in_golden = subset_columns_exact_matches.where(~match_is_golden).dropna()
    print (matches_not_in_golden)

Empty DataFrame
Columns: [Record_ID_list_1, Record_ID_list_2, row_index]
Index: []


In [136]:
# Summarize how the modeled exact matches compare with the golden (manual) exact matches
if not group_addresses_intra_list:
    def _safe_ratio(numerator, denominator):
        """Return numerator / denominator, or NaN when the denominator is zero."""
        return numerator / denominator if denominator else float('nan')

    total_records_list_1 = raw_address_list_1.shape[0]
    total_records_list_2 = raw_address_list_2.shape[0]
    total_modeled_matches = exact_matches.shape[0]
    total_manual_exact_matches = golden_exact_matches.shape[0]
    total_correct_positive_matches = test_vs_golden_compare.shape[0]
    false_negatives = missing_golden_matches.shape[0]
    false_positives = matches_not_in_golden.shape[0]

    # A ratio with a zero denominator (an empty list, or no matches at all) is undefined:
    # it is reported as NaN instead of aborting the cell with ZeroDivisionError.
    wrong_matches = false_negatives + false_positives
    accuracy_list_1 = _safe_ratio(total_records_list_1 - wrong_matches, total_records_list_1)
    accuracy_list_2 = _safe_ratio(total_records_list_2 - wrong_matches, total_records_list_2)
    precision = _safe_ratio(total_correct_positive_matches, total_correct_positive_matches + false_positives)
    recall = _safe_ratio(total_correct_positive_matches, total_correct_positive_matches + false_negatives)

    print ('list 1 accuracy = ', accuracy_list_1)
    print ('list 2 accuracy = ', accuracy_list_2)
    print ('precision = ', precision)
    print ('recall = ', recall)

list 1 accuracy =  1.0
list 2 accuracy =  1.0
precision =  1.0
recall =  1.0
