# String Matching using NLP TF_IDF on Large Dataset

## The goal here is to find a last name match of a query list of 2000 names in a list of 1.4 million lookup name list. 

In [2]:
import matplotlib.pyplot as plt
from matplotlib.pyplot import plot, xlabel, ylabel
%matplotlib inline
from matplotlib.path import Path
from matplotlib.figure import Figure
from matplotlib.patches import PathPatch
from matplotlib.patches import Patch
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
import matplotlib.cm as cm
from IPython.core.display import display
import matplotlib.patches as patches
import math
import numpy as np
from numpy import percentile
from numpy.random import seed
import os
from sklearn.preprocessing import StandardScaler
import datetime as dt
import time
import glob
import pickle
import spacy
import re
import pyodbc
import sqlalchemy as sal
from sklearn.preprocessing import normalize 
from sqlalchemy import create_engine
import pandas as pd
from scipy.stats import bartlett
from scipy.stats import boxcox
from sklearn.preprocessing import normalize 
from scipy.stats import jarque_bera
from scipy.stats import levene
from scipy.stats import normaltest
import scipy.stats as stats
from scipy.stats.mstats import winsorize
from scipy.stats import zscore
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity 
from sklearn.cluster import DBSCAN
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score #
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import precision_recall_curve
from sklearn import preprocessing as pp
from sklearn.model_selection import train_test_split 
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.metrics import roc_curve, auc, roc_auc_score
import xlsxwriter
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score, \
    adjusted_mutual_info_score, adjusted_rand_score
from sklearn.metrics import silhouette_samples,  silhouette_score
from sklearn.metrics.cluster import contingency_matrix

In [3]:
import en_core_web_md
nlp = spacy.load('en_core_web_md')
from random import sample 

In [4]:
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100) 
pd.set_option('display.max_colwidth', -1) 

## Import lookup dataset

In [5]:
# Import lookup data
import_path = "D:\\dev_data\\re\\hcad"
owner_cols = ['ACCOUNT', 'MAILTO']
file_name2 = 'ss_owners.csv'
file2 = import_path+"\\"+ file_name2
l1 = pd.read_csv(file2,  dtype= str, encoding = "ISO-8859-1", names=owner_cols, skiprows=1)
l1.shape

(1087036, 2)

In [6]:
l1.head(3)

Unnamed: 0,ACCOUNT,MAILTO
0,32180000021,SANTOS DOLORES ST JOHN
1,32180000022,GRIMALDO ROSIE
2,32180000023,GARCIA ANTONIO


In [7]:
# Utility function for standard text cleaning
def text_cleaner(text):
    text = re.sub(r'"','',text)
    text = re.sub(r'&','',text)
    text = re.sub(r'','',text)
    ext = re.sub("[\[].*?[\]];", "", text)
    text = re.sub(r"(\b|\s+\-?|^\-?)(\d+|\d*\.\d+)\b", " ", text)
    text = ' '.join(text.split())
    return text

## Preprocess lookup dataset

In [8]:
l2 = l1.copy()

In [9]:
# Convert column of names to string and clean
l = l2['MAILTO'].tolist() 
l=['missing' if x is np.nan else x for x in l]
s = '||||'.join(l).lower()
sc = text_cleaner(s)
#sc[:10]
names = [str(x) for x in sc.split('||||') if x]
names[:3]

['santos dolores st john', 'grimaldo rosie', 'garcia antonio']

In [10]:
l3 = pd.DataFrame(names, index=l2.index, columns=['MAILTO_cleaned'])
l3.head(3)

Unnamed: 0,MAILTO_cleaned
0,santos dolores st john
1,grimaldo rosie
2,garcia antonio


In [11]:
# Combine original df with cleaned names
l4 = pd.concat([l2, l3], axis=1)
l4['source'] = 'lookup' # Add this so can identify the array later
l4['de_prop_given'] = '' # Placeholder for future use
l4['id'] = '' # So to match with columns from pro dateset

l4.shape

(1087036, 6)

In [12]:
l4.head(3)

Unnamed: 0,ACCOUNT,MAILTO,MAILTO_cleaned,source,de_prop_given,id
0,32180000021,SANTOS DOLORES ST JOHN,santos dolores st john,lookup,,
1,32180000022,GRIMALDO ROSIE,grimaldo rosie,lookup,,
2,32180000023,GARCIA ANTONIO,garcia antonio,lookup,,


In [13]:
# Remove duplicates
l4 = l4.drop_duplicates(subset='ACCOUNT', keep="last")
l4.shape

(1087036, 6)

In [14]:
# Tag commercial names by regex pattern
def find_pat(text):
    if re.search(r" llc|current owner|  inc| lc| ltd| lp| churchcorp| company|city of houston|\
        county of harris|state of texas| company| harris county|harris county|county of harris|\
            texas department| city of katy|parcel", text):
        return 1
    return   0
# Apply the function
l4['non_person'] = l4['MAILTO_cleaned'].apply(find_pat)
# Number of commercial entries
l4['non_person'].sum()

68266

In [15]:
# Drop commercial entries
l4 = l4.drop(l4[l4.non_person ==1].index)
l4.shape

(1018770, 7)

In [16]:
# Add addresses with the word estate in them
all_l_names = l4['MAILTO_cleaned'].tolist()
r = re.compile("estate")
newlist = list(filter(r.match, all_l_names))
estate_mask = [i in newlist for i in all_l_names]
l4['estate_mask'] = estate_mask
l4['estate_mask'] = l4['estate_mask']*1
l4['estate_mask'].sum()

50

In [17]:
# Extract the last name
l4['l_name'] = l4['MAILTO_cleaned'].str.extract('^([\w\-]+)', expand=True)
l4 = l4[~l4['l_name'].isnull()] # Filter away those names that start with digits
# Drop extra columns
l4 = l4.drop(['MAILTO', 'non_person'], 1)

In [18]:
l4.sample(5)

Unnamed: 0,ACCOUNT,MAILTO_cleaned,source,de_prop_given,id,estate_mask,l_name
466446,1071560000012,watkins richard m patricia,lookup,,,0,watkins
975247,1297510010009,bolden mary,lookup,,,0,bolden
866314,1248730010010,gonzalez jacqueline g,lookup,,,0,gonzalez
553650,1065240000019,jones april m joseph r,lookup,,,0,jones
1021355,1354430010030,christopher frances,lookup,,,0,christopher


In [19]:
# Modify single letter initials in the name so the get picked by in n-gram vectorization

def fix_middle_initial(name):
    namel = []
    namel = list(name.split(' '))
    namel_count = []
    namel_count = [len(i) for i in namel]
    for i, letter_count in enumerate(namel_count):
        if letter_count ==1:
            namel[i] = namel[i] * 3
        else: continue
    return ' '.join(namel)

In [20]:
l4['MAILTO_cleaned2'] = l4['MAILTO_cleaned'].apply(fix_middle_initial)

# Import query data

In [21]:
path = "D:\\dev_data\\re\\pro\\fls\\monthly_download"
all_files = glob.glob(path + "/*.csv")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0 , dtype = str)
    li.append(df)
q1 = pd.concat(li, axis=0, ignore_index=True)
q1['ones'] = 1
q1 = q1.fillna(0)
q1 = q1.astype(str)
q1.shape

(5033, 43)

## Preprocess querry dataset

In [22]:
q1.head(2)

Unnamed: 0,FLS_DOWNL_DATE,DOWNL_SOURCE,DOWNL_COUNTY,DOCK_NUM,DE_KEY,DE_NAME,DE_FIRST_NAME,DE_MIDDLE_NAME,DE_LAST_NAME,DE_STR_ADD,DE_CITY_ZIP,DE_CITY,DE_STATE,DE_ZIP,DE_KEYMAP,DE_PROP_VAL,DE_BR_SQF,DE_YR_BUILD,DE_LEGAL_ADD,EX_KEY,EX_NAME,EX_FIRST_NAME,EX_MIDDLE_NAME,EX_LAST_NAME,EX_STR_ADD,EX_CITY_ZIP,EX_CITY,EX_STATE,EX_ZIP,DOCK_FILE_DATE,DOCK_FILM,ATT_KEY,ATT_NAME,ATT_FIRST_NAME,ATT_MIDDLE_NAME,ATT_LAST_NAME,ATT_ADD,ATT_CITY_ZIP,ATT_CITY,ATT_STATE,ATT_ZIP,LIST_NAME,ones
0,22-Dec-20,FLS,HARRIS,490628,490628_DE_JODZIO,DAVID W JODZIO,DAVID,W,JODZIO,12719 CORNING DR,HOUSTON TX 77089,HOUSTON,TX,77089,616D,$161735,3/1313,1973,LT 30 BLK 9 SCARSDALE 2,490628_EX_JODZIO,HEATHER E JODZIO,HEATHER,E,JODZIO,12719 CORNING DR,HOUSTON TX 77089,HOUSTON,TX,77089,12/17/2020,0,490628_ATT_LEWIS,POLLY LEWIS,POLLY,0,LEWIS,16055 SPACE CENTER BLVD #190,HOUSTON TX 77062,HOUSTON,TX,77062,List1,1
1,22-Dec-20,FLS,HARRIS,490639,490639_DE_BOURGEOIS,LOUISE BOURGEOIS,LOUISE,0,BOURGEOIS,9723 SPRINGMONT DR,HOUSTON TX 77080,HOUSTON,TX,77080,450K,$206326,3/1800,1971,LT 26 BLK 5 KEMPWOOD NORTH,490639_EX_BOURGEOIS,DONALD A BOURGEOIS,DONALD,A,BOURGEOIS,9723 SPRINGMONT DR,HOUSTON TX 77080,HOUSTON,TX,77080,12/17/2020,0,490639_ATT_CORDELL,CHRISTINE CORDELL,CHRISTINE,0,CORDELL,9800 NW FRWY STE 216,HOUSTON TX 77092,HOUSTON,TX,77092,List1,1


In [23]:
# Remove duplicate cases
q2 = q1.drop_duplicates(subset='DE_KEY', keep="first")
q2.shape

(2432, 43)

In [24]:
# Remove duplicated EX SITE ADD
q2 = q2.drop_duplicates(subset='EX_STR_ADD', keep="first")
q2.shape

(2338, 43)

In [25]:
# Convert full date but different format
q2['FLS_DOWNL_DATE'] = pd.to_datetime(q2['FLS_DOWNL_DATE'])
q2['DOCK_FILE_DATE'] = pd.to_datetime(q2['DOCK_FILE_DATE'])

In [26]:
# Convert year only string to int
q2['DE_YR_BUILD'] = pd.to_datetime(q2['DE_YR_BUILD'], format='%Y', errors='coerce')
# q2['DE_YR_BUILD'] = q2['DE_YR_BUILD'].fillna(0)
# #temp_l = q2['DE_YR_BUILD'].tolist()
# #temp_l = [round(num) for num in temp_l]
# #q2['DE_YR_BUILD'] = temp_l
q2['DE_YR_BUILD'] = pd.DatetimeIndex(q2['DE_YR_BUILD']).year
# #q2['DE_YR_BUILD'] = q2['DE_YR_BUILD'].fillna(0)

In [27]:
q_dates = ['FLS_DOWNL_DATE', 'DE_YR_BUILD', 'DOCK_FILE_DATE']
q2[q_dates].sample(3)

Unnamed: 0,FLS_DOWNL_DATE,DE_YR_BUILD,DOCK_FILE_DATE
4662,2020-11-27,,2020-09-21
958,2020-12-22,,2020-10-14
1892,2021-02-08,,2021-01-06


In [28]:
q2[q_dates].dtypes

FLS_DOWNL_DATE    datetime64[ns]
DE_YR_BUILD       float64       
DOCK_FILE_DATE    datetime64[ns]
dtype: object

In [29]:
###############
# Export a version for inspection
excel_path = "D:\\dev_data\\re\\pro\\fls\\excel_reports\\"
q2.to_excel(excel_path+'monthly_combined.xlsx')

In [30]:
# Fill NAs with zero as drop duplicates does not work with nulls
#q2 = q2.astype(str)
q2['de_prop_given'] = '0'
q2['de_prop_given'][q2['DE_STR_ADD'] != '0']='1'
q2.shape

(2338, 44)

In [31]:
q2 = q2.reset_index(drop=True)
# Add incremental index
q2.insert(0, 'id', range(0, q2.shape[0]))
q2[['id', 'DE_STR_ADD', 'de_prop_given']].sample(3, random_state=10)

Unnamed: 0,id,DE_STR_ADD,de_prop_given
168,168,9813 ABIGAIL GRACE CT,1
2226,2226,0,0
47,47,4806 N MAIN,1


In [32]:
# Check to see how many missing values we have for EX_STR_ADD
add_l = q2['EX_STR_ADD'].tolist()
len([x for x in add_l if x in [0]])

0

In [33]:
# Tag each dock that has same EX address as 1 else 0
ex_add_1 = q2['EX_STR_ADD'].tolist()
ex_add_2 = []
ex_add_3 = []
for l in ex_add_1:
    if l not in ex_add_2:
        ex_add_2.append(l) 
        ex_add_3.append(1)
    else:
        ex_add_3.append(0)
q2['unique_ex_add'] = ex_add_3
q2.shape

(2338, 46)

In [34]:
# Add cume EX_STR_ADD count 
q2['unique_ex_add_cume'] = q2.groupby(['EX_STR_ADD'])['unique_ex_add'].cumcount()
q2.sort_values(['EX_STR_ADD', 'unique_ex_add_cume'], ascending=[False, False])[['EX_STR_ADD', 'unique_ex_add_cume']][10:20]

Unnamed: 0,EX_STR_ADD,unique_ex_add_cume
1358,N1098 KNEPPRATH RD,0
1189,9955 KEMPWOOD DR #935,0
1867,9949 WOODWIND LN N,0
1780,9936 COMMON HAWKER CT,0
2310,9931 KEMP FOREST,0
1260,9919 DRIFTWOOD PARK DR,0
2078,9910 ELIZABETHS GLEN LN,0
949,9910 AVES ST,0
2127,9901 SHARPCREST ST #K3,0
334,9825 NE MURDEN COVE,0


## Feature engineering
### Here we use Tf-idf (term frequency–inverse document frequency) by looking at a normalized count where each word count is divided by the number of rows (i.e. document) this word appears in. I chose this method instead of for example bag-of-words becasue we are comparing names where on average every document has 3 words in it. And also I am not looking for similarities but actual exact match.

I use Tf-idf twice, once to eliminate all documents from lookup dataset that do not have the exact last names like in query data. This will improve the performance. In a second round I will then compare the query data with a smaller loopup data using the second round of Tf-idf.

In [36]:
## Prepare TF-IDF for name matching
q3 = q2[['id', 'DOCK_NUM','DE_NAME', 'DE_LAST_NAME', 'de_prop_given']]
q3 = q3.applymap(lambda s:s.lower() if type(s) == str else s)
q3['source'] = 'query' # Add this so can identify the array later
q3['estate_mask'] = '' # Add placeholder as Lookup data has one
q3 = q3.rename(columns={"DE_NAME": "MAILTO_cleaned", "DOCK_NUM": "ACCOUNT"})
# Remove pro duplicates
q3 = q3.drop_duplicates(subset='MAILTO_cleaned', keep="last")
q3.shape

(2336, 7)

In [37]:
q3.sample(2)

Unnamed: 0,id,ACCOUNT,MAILTO_cleaned,DE_LAST_NAME,de_prop_given,source,estate_mask
1295,1295,491243,jessie garza,garza,1,query,
2123,2123,487927,karen houston,houston,0,query,


In [38]:
# Modify single letter initials in the name so the get picked by in n-gram vectorization
def fix_middle_initial(name):
    namel = []
    namel = list(name.split(' '))
    namel_count = []
    namel_count = [len(i) for i in namel]
    for i, letter_count in enumerate(namel_count):
        if letter_count ==1:
            namel[i] = namel[i] * 3
        else: continue
    return ' '.join(namel)

In [39]:
# Extract the last name
#q3['l_name'] = q3['MAILTO_cleaned'].str.extract('([\w\-]+)$', expand=True)
q3['l_name'] = q3['DE_LAST_NAME']
q3 = q3.drop('DE_LAST_NAME', axis=1)
q3['MAILTO_cleaned2'] = q3['MAILTO_cleaned'].apply(fix_middle_initial)

In [40]:
q3.sample(2)

Unnamed: 0,id,ACCOUNT,MAILTO_cleaned,de_prop_given,source,estate_mask,l_name,MAILTO_cleaned2
235,235,488365,kathy a howell,1,query,,howell,kathy aaa howell
263,263,490646,john c meyer,0,query,,meyer,john ccc meyer


In [41]:
# Get difference between two dataset columns before merging them. A match has to return zero element
l_columns = list(l4.columns)
q_columns = list(q3.columns)
set(q_columns) ^ set(l_columns) 

set()

In [42]:
# Combine lookup with query dataset into a new df where can use cosign similarities
result = l4.append(q3, sort=False)
result = result.reset_index(drop=True)

In [43]:
result.groupby('source').sample(n=2, random_state=1)

Unnamed: 0,ACCOUNT,MAILTO_cleaned,source,de_prop_given,id,estate_mask,l_name,MAILTO_cleaned2
874134,1276880010027,carr kirlice v,lookup,,,0.0,carr,carr kirlice vvv
924294,1301260010006,baker kenneth r mary l,lookup,,,0.0,baker,baker kenneth rrr mary lll
1020643,487484,cleta r graham,query,0.0,2298.0,,graham,cleta rrr graham
1019312,488882,vera brown,query,0.0,966.0,,brown,vera brown


In [44]:
# Pick indexes of two groups for future slicing
l_index = result[result['source']=='lookup'].index
q_index = result[result['source']=='query'].index

### Vectorize

In [45]:
# use 1-1 word ngrams on last names only
vectorizer = TfidfVectorizer(decode_error='replace', strip_accents='unicode', analyzer='word'
                                       # ,stop_words='english'
                                       ,ngram_range = (1, 1)
                                       #, min_df = 1
                                       , norm=u'l2', use_idf=True, smooth_idf=True, sublinear_tf=True)#,
                                       
                                      #  max_df=1, max_features=None)
X = vectorizer.fit_transform(result['l_name'])

In [46]:
print(vectorizer.get_feature_names()[100:150])

['abatan', 'abate', 'abatie', 'abatte', 'abaunza', 'abay', 'abaya', 'abayan', 'abayatissa', 'abayomi', 'abaza', 'abazajian', 'abazie', 'abba', 'abbara', 'abbas', 'abbasi', 'abbasian', 'abbasimalayeri', 'abbasmanesh', 'abbaspour', 'abbassi', 'abbassian', 'abbaszadeh', 'abbaszadehrizi', 'abbate', 'abbe', 'abbey', 'abbie', 'abbinanti', 'abbit', 'abbitt', 'abbot', 'abbott', 'abbouchi', 'abboud', 'abboushi', 'abbrat', 'abbruscato', 'abbs', 'abbud', 'abc', 'abcede', 'abd', 'abdal', 'abdala', 'abdali', 'abdalla', 'abdallah', 'abdani']


In [47]:
#nd-array info
X.shape, X.ndim, X.size

((1020683, 120349), 2, 1026165)

In [48]:
# Filter away rows where there is no last name from query list 
# Get similarities of lookup and query dataset 
sim1 = X[l_index].dot(X[q_index].transpose())
sim1.shape, sim1.ndim, sim1.size

((1018347, 2336), 2, 1488330)

In [49]:
# Get non zero values' indexes and their values
nonzero_tup = np.stack(np.nonzero(sim1), axis=-1)
# Filter away zeros and return a list of indexs where there was a match with last names only
res_list1 = [x[0] for x in nonzero_tup]
# Convert list to array as it is expensive to remvoe duplicates in a large list
res_array = np.array(res_list1)
res_array.shape

(1488330,)

In [50]:
res_uniques = np.unique(res_array) # This is the smaller lookup dataset where there is one exact match of last name for query data
res_uniques.shape

(364268,)

## By using TFIDF and cosine similarities we have reduced the size of lookup dataset from 1 million to 360k. This will improve the performance of futher matching solutions.