# Use of TF-IDF on large dataset for string matching

## There are many fuzzy matching algorithms that work fine on small dataset. However, they fall short when used on even modest data sets of greater than a few thousand records. This is because they compare each record to all the other records in the data set. Here we will use TF_IDF to compare list of 2500 names in a lookup database of around 1 million names. The goal is to shrink the lookup dateset to a much smaller size so we can do better n-gram analysis.

In [1]:
from IPython.core.display import display
import matplotlib.patches as patches
import math
import numpy as np
from numpy import percentile
from numpy.random import seed
import os
from sklearn.preprocessing import StandardScaler
import datetime as dt
import time
import spacy
import re
import pandas as pd
from scipy.stats import bartlett
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity 
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import pairwise_distances
from sklearn import preprocessing as pp
from sklearn.model_selection import train_test_split 
from sklearn.metrics import roc_curve, auc, roc_auc_score
import xlsxwriter
from sklearn.metrics import silhouette_samples,  silhouette_score
from sklearn.metrics.cluster import contingency_matrix

In [2]:
import en_core_web_md
nlp = spacy.load('en_core_web_md')
from random import sample 

In [3]:
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100) 
pd.set_option('display.max_colwidth', -1) 

## Import two data sources

In [4]:
## import the query data
import_path = r'D:\dev_data\re\hcad'
file_name1 = 'pro_names.csv'
file1 = import_path+"\\"+ file_name1
dfp0 = pd.read_csv(file1)
dfp0.shape

(2536, 7)

In [5]:
dfp0.head(2)

Unnamed: 0.1,Unnamed: 0,DOCK_NUM,DE_NAME,DE_FIRST_NAME,DE_MIDDLE_NAME,DE_LAST_NAME,de_target_label
0,0,471514,CAROLYN A BROWN,CAROLYN,A,BROWN,0
1,1,485685,LUCIO SOLIS,LUCIO,,SOLIS,1


In [6]:
# Import lookup data
owner_cols = ['ACCOUNT', 'MAILTO']
file_name2 = 'ss_owners.csv'
file2 = import_path+"\\"+ file_name2
dfo0 = pd.read_csv(file2,  dtype= str, encoding = "ISO-8859-1", names=owner_cols, skiprows=1)
dfo0.shape

(1087036, 2)

In [7]:
dfo0.head(3)

Unnamed: 0,ACCOUNT,MAILTO
0,32180000021,SANTOS DOLORES ST JOHN
1,32180000022,GRIMALDO ROSIE
2,32180000023,GARCIA ANTONIO


## Helper functions

In [8]:
# Utility function for standard text cleaning
def text_cleaner(text):
    text = re.sub(r'"','',text)
    text = re.sub(r'&','',text)
    ext = re.sub("[\[].*?[\]];", "", text)
    text = re.sub(r"(\b|\s+\-?|^\-?)(\d+|\d*\.\d+)\b", " ", text)
    text = ' '.join(text.split())
    return text

## Preprocess lookup dataset

In [9]:
dfo1 = dfo0.copy()

In [10]:
# Convert column of names to string and clean
l = dfo1['MAILTO'].tolist() 
l=['missing' if x is np.nan else x for x in l]
s = '||||'.join(l).lower()
sc = text_cleaner(s)
#sc[:10]
names = [str(x) for x in sc.split('||||') if x]
names[:3]

['santos dolores st john', 'grimaldo rosie', 'garcia antonio']

In [11]:
dfo2 = pd.DataFrame(names, index=dfo1.index, columns=['MAILTO_cleaned'])
dfo2.head(3)

Unnamed: 0,MAILTO_cleaned
0,santos dolores st john
1,grimaldo rosie
2,garcia antonio


In [12]:
# Combine original df with cleaned names
dfo3 = pd.concat([dfo1, dfo2], axis=1)
dfo3['source'] = 'owners' # Add this so can identify the array later
dfo3['de_prop_given'] = '' # Placeholder for future use
dfo3.shape

(1087036, 5)

In [13]:
dfo3.head(3)

Unnamed: 0,ACCOUNT,MAILTO,MAILTO_cleaned,source,de_prop_given
0,32180000021,SANTOS DOLORES ST JOHN,santos dolores st john,owners,
1,32180000022,GRIMALDO ROSIE,grimaldo rosie,owners,
2,32180000023,GARCIA ANTONIO,garcia antonio,owners,


In [14]:
# Remove duplicates
dfo3 = dfo3.drop_duplicates(subset='ACCOUNT', keep="last")
dfo3.shape

(1087036, 5)

In [15]:
# Tag commercial names by regex pattern
def find_pat(text):
    if re.search(r" llc|current owner|  inc| lc| ltd| lp| churchcorp| company|city of houston|\
        county of harris|state of texas| company| harris county|harris county|county of harris|\
            texas department| city of katy|parcel", text):
        return 1
    return   0
# Applu the function
dfo3['non_person'] = dfo3['MAILTO_cleaned'].apply(find_pat)

In [16]:
# Number of commercial entries
dfo3['non_person'].sum()

68266

In [17]:
# Drop commercial entries
dfo3 = dfo3.drop(dfo3[dfo3.non_person ==1].index)
dfo3.shape

(1018770, 6)

In [18]:
# Extract the last name
dfo3['l_name'] = dfo3['MAILTO_cleaned'].str.extract('^([\w\-]+)', expand=True)
dfo3 = dfo3[~dfo3['l_name'].isnull()] # Filter away those names that start with digits
dfo3.sample(3)

Unnamed: 0,ACCOUNT,MAILTO,MAILTO_cleaned,source,de_prop_given,non_person,l_name
90835,591250080020,HARDY SAMUEL B,hardy samuel b,owners,,0,hardy
813643,1224580020002,LUNA PETRA M,luna petra m,owners,,0,luna
13406,170550000017,ROSALES RUBIN,rosales rubin,owners,,0,rosales


In [19]:
dfo3.shape

(1018347, 7)

In [20]:
dfo31 = dfo3.copy()
#dfo31 = dfo3[:10000]
dfo3.shape, dfo31.shape

((1018347, 7), (1018347, 7))

## Preprocess querry dataset

In [21]:
dfp1 = dfp0[['DOCK_NUM', 'DE_NAME']]
dfp1 = dfp1.applymap(lambda s:s.lower() if type(s) == str else s)
dfp1['source'] = 'prob' # Add this so can identify the array later
dfp1 = dfp1.rename(columns={"DE_NAME": "MAILTO_cleaned", "DOCK_NUM": "ACCOUNT"})
# Remove pro duplicates
dfp1 = dfp1.drop_duplicates(subset='MAILTO_cleaned', keep="last")

In [22]:
# Extract the last name
dfp1['l_name'] = dfp1['MAILTO_cleaned'].str.extract('([\w\-]+)$', expand=True)
dfp1.sample(5)

Unnamed: 0,ACCOUNT,MAILTO_cleaned,source,l_name
822,490477,eugene chooran,prob,chooran
2363,489542,michele boles,prob,boles
2007,488242,steven henderson,prob,henderson
2433,489680,james l wilkerson,prob,wilkerson
1274,489067,donald g wilson,prob,wilson


In [23]:
# Drop extra columns
dfo31 = dfo31.drop(['MAILTO', 'non_person'], 1)

In [24]:
dfo31.columns

Index(['ACCOUNT', 'MAILTO_cleaned', 'source', 'de_prop_given', 'l_name'], dtype='object')

In [25]:
dfp1.columns

Index(['ACCOUNT', 'MAILTO_cleaned', 'source', 'l_name'], dtype='object')

In [26]:
# Combine lookup with query dataset
result = dfo31.append(dfp1, sort=False)
result = result.reset_index(drop=True)

In [27]:
result.groupby('source').sample(n=2, random_state=1)

Unnamed: 0,ACCOUNT,MAILTO_cleaned,source,de_prop_given,l_name
874134,1276880010027,carr kirlice v,owners,,carr
924294,1301260010006,baker kenneth r mary l,owners,,baker
1018582,490461,elizabeth cowper,prob,,cowper
1018519,490572,george g edwards,prob,,edwards


In [28]:
# Pick indexes of two groups for future slicing
l_index = result[result['source']=='owners'].index
q_index = result[result['source']=='prob'].index

# Feature Engineering

### Here we use Tf-idf (term frequency–inverse document frequency) by looking at a normalized count where each word count is divided by the number of rows (i.e. document) this word appears in. I chose this method instead of for example bag-of-words becasue we are comparing names where on average every document has 3 words in it. And also I am not looking for similarities but actual exact match.

In [29]:
result[result['l_name'].isnull()]

Unnamed: 0,ACCOUNT,MAILTO_cleaned,source,de_prop_given,l_name


In [30]:
# use 1-1 word ngrams on last names only
vectorizer = TfidfVectorizer(decode_error='replace', strip_accents='unicode', analyzer='word'
                                       # ,stop_words='english'
                                       ,ngram_range = (1, 1)
                                       #, min_df = 1
                                       , norm=u'l2', use_idf=True, smooth_idf=True, sublinear_tf=True)#,
                                       
                                      #  max_df=1, max_features=None)
X = vectorizer.fit_transform(result['l_name'])

In [31]:
print(vectorizer.get_feature_names()[:50])

['00i', '1stop', '21st', '26th', '331eg', '396bp', '3c', '3lm', '3zca', '475xj', '4mb', '4mk', '4y', '6g', '88jg', 'a91', 'aaa', 'aad', 'aagaz', 'aah', 'aaker', 'aakerberg', 'aakquanakhann', 'aal', 'aalders', 'aaloori', 'aals', 'aalund', 'aaly', 'aamir', 'aamodt', 'aamoth', 'aana', 'aanderud', 'aanstoos', 'aaqid', 'aardsma', 'aaron', 'aaronfaridi', 'aarons', 'aaronson', 'aarup', 'aasen', 'aaseng', 'aaser', 'aaserud', 'aasgaard', 'aasim', 'aaz', 'aba']


In [32]:
#nd-array info
X.shape, X.ndim, X.size

((1020102, 120327), 2, 1025585)

In [33]:
# Filter away rows where there is no last name from query list 
# Get similarities of lookup and query dataset 
sim1 = X[l_index].dot(X[q_index].transpose())
sim1.shape, sim1.ndim, sim1.size

((1018347, 1755), 2, 1062335)

In [34]:
# Get non zero values' indexes and their values
nonzero_tup = np.stack(np.nonzero(sim1), axis=-1)
# Filter away zeros and return a list of indexs where there was a match with last names only
res_list1 = [x[0] for x in nonzero_tup]
# Convert list to array as it is expensitve to remvoe duplicates in a large list
res_array = np.array(res_list1)
res_uniques = np.unique(res_array) # This is the smaller lookup dataset where there is one exact match of last name for query data
res_uniques.shape

(315440,)

In [35]:
# Drop irrelevant rows from the original 'combined' dataset
result1 = result.iloc[res_uniques]
# Keep only "owner" rows
result1 = result1[result['source']=='owners']
result1.shape

(315440, 5)

In [36]:
result1.groupby('source').sample(n=2, random_state=1)

Unnamed: 0,ACCOUNT,MAILTO_cleaned,source,de_prop_given,l_name
125493,651220080105,ramirez maritza p,owners,,ramirez
847090,1262310030043,martinez victor,owners,,martinez


### We have now reduced the 1 million lookup database to over 300,000 names. This will make it easier to perfomr a faster and better n-gram vectorization for the final match.