# NAMA Demo

First, let's create some simple data and import the packages we need.

In [2]:
import pandas as pd
import numpy as np
from nama import MatchData

# Two toy tables of company names: df1 holds several spellings per company,
# df2 holds a single spelling per company
df1 = pd.DataFrame({
    'name': [
        'ABC Inc.',
        'abc inc',
        'A.B.C. INCORPORATED',
        'The XYZ Company',
        'X Y Z CO',
    ],
})
df2 = pd.DataFrame({'name': ['ABC Inc.', 'XYZ Co.']})

print(f'Toy data:\ndf1=\n{df1}\ndf2=\n{df2}')

  from .autonotebook import tqdm as notebook_tqdm


Toy data:
df1=
                  name
0             ABC Inc.
1              abc inc
2  A.B.C. INCORPORATED
3      The XYZ Company
4             X Y Z CO
df2=
       name
0  ABC Inc.
1   XYZ Co.


## Match Data

Nama is built around an object called `MatchData`, which holds matching information about a set of strings and partitions the strings into non-overlapping groups.
   - Strings in the same group are considered "matched"
   - Strings in different groups are not matched.
Nama provides tools for creating, modifying, saving, and loading matches. Then these matches can be used to generate unique group ids for a set of strings, or perform two-way merges between pandas dataframes according to the match groups.

In [4]:
# Build the matches in one chain: start from an empty MatchData, then add
# every string we want to match (here, the name column of each dataframe)
matches = (
    MatchData()
    .add_strings(df1['name'])
    .add_strings(df2['name'])
)

# Before any uniting, every string sits in its own singleton group
# (Groups are automatically labelled according to the most common string,
# with ties broken alphabetically)
print(f'Initial string groups:\n{matches.groups}')

Initial string groups:
{'ABC Inc.': ['ABC Inc.'], 'abc inc': ['abc inc'], 'A.B.C. INCORPORATED': ['A.B.C. INCORPORATED'], 'The XYZ Company': ['The XYZ Company'], 'X Y Z CO': ['X Y Z CO'], 'XYZ Co.': ['XYZ Co.']}


In [5]:
# With only singleton groups, merging on the match groups pairs up exact
# matches only, so there isn't much point yet
# (equivalent to the pandas merge function)
exact_merge = matches.merge_dfs(df1, df2, on='name')
print(f"Exact matching with singleton groups:\n{exact_merge}")

Exact matching with singleton groups:
     name_x match_group    name_y
0  ABC Inc.    ABC Inc.  ABC Inc.


In [6]:
# To get better results, we need to modify the matches.
# unite merges all groups that contain the passed strings, so these two
# spellings end up in a single group.
abc_spellings = ['ABC Inc.', 'A.B.C. INCORPORATED']
matches = matches.unite(abc_spellings)
print(f'Updated string groups:\n{matches.groups}')

Updated string groups:
{'ABC Inc.': ['ABC Inc.', 'A.B.C. INCORPORATED'], 'abc inc': ['abc inc'], 'The XYZ Company': ['The XYZ Company'], 'X Y Z CO': ['X Y Z CO'], 'XYZ Co.': ['XYZ Co.']}


`unite` is very flexible. We can pass a single set of strings, a nested list of strings, or a mapping from strings to group labels. The mapping can even be a function that evaluates strings and generates a label. This makes it very simple to do hash collision matching.

Hash collision matching works by matching any strings that have the same hash. A hash could be almost anything, but one useful way to do collision matching is to match strings that are identical after simplifying both strings.

Nama provides some useful simplification functions in `nama.utils`. `simplify_corp` strips punctuation and capitalization, and removes common parts of names, such as a leading "the" or a trailing "inc" or "ltd".

In [7]:
from nama import simplify_corp

# Start a separate MatchData from the same strings for comparison, then
# unite the strings that share the same simplify_corp representation
corp_matches = MatchData(matches.strings()).unite(simplify_corp)

print(f'Groups after uniting by simplify_corp:\n{corp_matches.groups}')

Groups after uniting by simplify_corp:
{'A.B.C. INCORPORATED': ['A.B.C. INCORPORATED', 'abc inc', 'ABC Inc.'], 'The XYZ Company': ['The XYZ Company', 'XYZ Co.'], 'X Y Z CO': ['X Y Z CO']}


We can also inspect the united groups

In [8]:
example_name = 'A.B.C. INCORPORATED'

# First, we can look up the group that any string belongs to
print(matches[example_name])
# Next, we can list all the strings in the same group (i.e. that match)
print(matches.matches(example_name))
# Lastly, we can convert the matches to a dataframe
print(matches.to_df())

ABC Inc.
['ABC Inc.', 'A.B.C. INCORPORATED']
                string  count            group
0             ABC Inc.      2         ABC Inc.
1  A.B.C. INCORPORATED      1         ABC Inc.
2      The XYZ Company      1  The XYZ Company
3             X Y Z CO      1         X Y Z CO
4              XYZ Co.      1          XYZ Co.
5              abc inc      1          abc inc


The matches can also be converted to a dataframe if we want to cluster the names in one dataset or create a mapping to string groups that can be used across multiple datasets.

In [9]:
# Convert the matches to a dataframe with one row per string
# (columns: string, count, group) and display it
matches_df = matches.to_df()
matches_df

Unnamed: 0,string,count,group
0,ABC Inc.,2,ABC Inc.
1,A.B.C. INCORPORATED,1,ABC Inc.
2,The XYZ Company,1,The XYZ Company
3,X Y Z CO,1,X Y Z CO
4,XYZ Co.,1,XYZ Co.
5,abc inc,1,abc inc


Finally, we can save the matches in csv format for later use

In [10]:
# Write the matches to 'matches.csv' (a relative path) so they can be reloaded later
matches.to_csv('matches.csv')

In [11]:
from nama import read_csv

# ...and load the matches again at a later time from the same CSV file
loaded_matches = read_csv('matches.csv')
# Display the loaded matches as a dataframe
loaded_matches.to_df()

Unnamed: 0,string,count,group
0,ABC Inc.,2,ABC Inc.
1,A.B.C. INCORPORATED,1,ABC Inc.
2,The XYZ Company,1,The XYZ Company
3,X Y Z CO,1,X Y Z CO
4,XYZ Co.,1,XYZ Co.
5,abc inc,1,abc inc


# Embedding Similarity

The Embedding Similarity model allows us to predict the similarity of larger and more complex strings

First we'll need to train a Similarity Model to predict the similarity of larger and more complex matches for which we have some target values

In [12]:
from nama import SimilarityModel

# Training settings, forwarded as keyword arguments to sim.train below
train_kwargs = dict(
    max_epochs=2,
    warmup_frac=0.2,
    transformer_lr=1e-5,
    score_lr=10,
    batch_size=8,
)

sim = SimilarityModel()

# Train on the match groups built above; the return value is kept as history_df
history_df = sim.train(matches, verbose=True, **train_kwargs)

# Save our trained model to disk
sim.save("path-to-model.bin")

training epoch 0: 100%|██████████| 1/1 [00:09<00:00,  9.70s/it]
training epoch 1: 100%|██████████| 1/1 [00:06<00:00,  6.76s/it]


The Embeddings Model has some powerful functions that allow us to unite strings in various ways.

The `unite_similar` function allows us to match similar strings based on their predicted pairwise similarity.

The `unite_nearest` function allows us to unite embedded strings with their most similar target strings. This function is particularly useful in scenarios where you have a set of target strings and want to match each embedded string to its nearest corresponding target string.

In [13]:
from nama import load_similarity_model

# We can use our trained model directly or load it from the saved file
sim = load_similarity_model("path-to-model.bin")

# Or we can use the standard model from huggingface
# .... TBD

# Then we'll have the model embed our matches
embeddings = sim.embed(matches)

# Now we can do some matching.
# Option 1: unite strings according to their predicted pairwise similarity
sim_matches_similar = embeddings.unite_similar(threshold=0.5)

# Option 2: unite each embedded string with its most similar target string.
# This needs a set of target strings; here we reuse the strings from corp_matches
nearest_targets = corp_matches.strings()
sim_matches_nearest = embeddings.unite_nearest(target_strings=nearest_targets, threshold=0)

# The embeddings can also be sliced; here we keep only the first one
first_embedding = embeddings[0:1]
print("Embedding shape: ", first_embedding.V.shape)

# Lastly we can save the embeddings for later use
embeddings.save("path-to-save-embeddings.bin")

Embedding shape:  torch.Size([1, 128])


With a trained model we can run some tests

In [14]:
# We can test the similarity model against the known matches with a single threshold
test_scores = sim.test(matches, threshold=0.5)
# Wrap the result in a list so pandas displays it as a one-row table
pd.DataFrame([test_scores])

Unnamed: 0,TP,FP,TN,FN,coverage,accuracy,precision,recall,F1
0,2,18,0,0,1.0,0.1,0.1,1.0,0.181818


In [15]:
# Or we can also run a test over multiple thresholds to find the optimal one
# (np.linspace(0, 1, 11) gives the thresholds 0.0, 0.1, ..., 1.0)
test_scores = sim.test(matches, threshold=np.linspace(0,1,11))
pd.DataFrame(test_scores)

Unnamed: 0,TP,FP,TN,FN,coverage,accuracy,precision,recall,F1,threshold
0,2,18,0,0,1.0,0.1,0.1,1.0,0.181818,0.0
1,2,18,0,0,1.0,0.1,0.1,1.0,0.181818,0.1
2,2,18,0,0,1.0,0.1,0.1,1.0,0.181818,0.2
3,2,18,0,0,1.0,0.1,0.1,1.0,0.181818,0.3
4,2,18,0,0,1.0,0.1,0.1,1.0,0.181818,0.4
5,2,18,0,0,1.0,0.1,0.1,1.0,0.181818,0.5
6,2,18,0,0,1.0,0.1,0.1,1.0,0.181818,0.6
7,2,18,0,0,1.0,0.1,0.1,1.0,0.181818,0.7
8,2,18,0,0,1.0,0.1,0.1,1.0,0.181818,0.8
9,0,6,12,2,1.0,0.0,0.0,0.0,0.0,0.9
