# Exploring Fuzzy Match Thresholds

This notebook analyzes incorrect deduplication results to find the minimum thresholds needed to avoid false matches.

In [1]:
import pandas as pd
from rapidfuzz import fuzz
from itertools import combinations

In [2]:
# Read the CSV of incorrect dedupe groups and preview the first rows
incorrect_dedupes_path = "output/incorrect_dedupes.csv"
df = pd.read_csv(incorrect_dedupes_path)
df.head()

Unnamed: 0,representative_name,entity_name_varieties,types,num_sections,Unnamed: 4
0,Santa Clara County Ordinance Code,County of Santa Clara Ordinance Code | Ordinan...,"organization, other",15,Santa Clara County Ordinance Code Division A37
1,County Health Officer,County Health Officer | County Public Health O...,role,12,Deputy Health Officer
2,Ord. No. NS-304.1,Ord. No. NS-1004.121 | Ord. No. NS-1100.113 | ...,other,11,Many incongruencies
3,Chief Operating Officer,Chief Operating Officer | Chief Probation Officer,role,10,Chief Probation Officer
4,Ord. No. NS-3.16,Ord. No. NS-3.16 | Ord. No. NS-3.19 | Ord. No....,other,9,Many incongruencies


In [4]:
def normalize_entity_name(name: str) -> str:
    """
    Lowercase an entity name and trim leading/trailing whitespace.

    Meant to mirror the normalization used in fuzzy_match.py
    (that file is not visible here - keep the two in sync).
    """
    lowered = name.lower()
    return lowered.strip()


def get_max_similarity(varieties_str: str) -> dict:
    """
    For a pipe-separated list of entity name varieties,
    calculate all pairwise similarities and return info about the max.

    Args:
        varieties_str: Name varieties separated by "|". Whitespace around
            each variety is stripped. A non-string value (e.g. None, or the
            NaN pandas uses for a missing cell) is treated as no varieties.

    Returns:
        A dict that always carries the same four keys:
            "max_score": highest pairwise fuzz.token_sort_ratio score, or
                None when there are fewer than two varieties.
            "max_pair": the (name1, name2) tuple that produced max_score
                (the first such pair on ties), or None when there are
                fewer than two varieties.
            "all_scores": list of (name1, name2, score) tuples, one per pair.
            "num_varieties": number of varieties found.
    """
    if isinstance(varieties_str, str):
        varieties = [v.strip() for v in varieties_str.split("|")]
    else:
        varieties = []

    if len(varieties) < 2:
        # Return the same keys as the normal path so callers can index the
        # result without special-casing groups that have nothing to compare.
        return {
            "max_score": None,
            "max_pair": None,
            "all_scores": [],
            "num_varieties": len(varieties),
        }

    # Scores are computed on the normalized names, but the stripped original
    # spellings are what gets reported back.
    all_scores = [
        (
            name1,
            name2,
            fuzz.token_sort_ratio(
                normalize_entity_name(name1),
                normalize_entity_name(name2),
            ),
        )
        for name1, name2 in combinations(varieties, 2)
    ]

    # max() keeps the first of equally-scored pairs and, unlike a running
    # maximum seeded with 0, still reports a pair when every score is 0.
    best_name1, best_name2, max_score = max(all_scores, key=lambda entry: entry[2])

    return {
        "max_score": max_score,
        "max_pair": (best_name1, best_name2),
        "all_scores": all_scores,
        "num_varieties": len(varieties),
    }


In [5]:
# Calculate similarities for each incorrect group.
#
# Only groups with exactly two name varieties (a single "|" separator) are
# analysed. The filtered rows get their own name so `df` keeps the full CSV
# and re-running an earlier cell such as df.head() still shows the raw data.
two_variety_df = df[df["entity_name_varieties"].str.count(r"\|") == 1]

results = []
for _, row in two_variety_df.iterrows():
    info = get_max_similarity(row["entity_name_varieties"])
    results.append({
        "representative_name": row["representative_name"],
        "max_score": info["max_score"],
        "max_pair": info["max_pair"],
        "num_varieties": info["num_varieties"],
        "all_scores": info["all_scores"]
    })

# Passing the columns explicitly keeps the frame well-formed (and sortable)
# even when no group passes the filter; a bare pd.DataFrame([]) would have no
# "max_score" column and sort_values would raise KeyError.
result_columns = [
    "representative_name",
    "max_score",
    "max_pair",
    "num_varieties",
    "all_scores",
]
results_df = pd.DataFrame(results, columns=result_columns)
results_df = results_df.sort_values("max_score", ascending=False)
results_df.head(20)

Unnamed: 0,representative_name,max_score,max_pair,num_varieties,all_scores
40,26 United States Code Section 501(c)(3),97.435897,"(26 United States Code Section 501(c)(3), 26 U...",2,"[(26 United States Code Section 501(c)(3), 26 ..."
12,Code of Civil Procedure Section 1094.6,96.202532,"(Code of Civil Procedure Section 1094.6, Code ...",2,"[(Code of Civil Procedure Section 1094.6, Code..."
44,Class 1 Electric Bicycle,95.833333,"(Class 1 Electric Bicycle, Class 2 Electric Bi...",2,"[(Class 1 Electric Bicycle, Class 2 Electric B..."
51,Special Event Permit for a Major Special Event,95.652174,(Special Event Permit for a Major Special Even...,2,[(Special Event Permit for a Major Special Eve...
52,Trimble Road (west of State Route,94.117647,"(Trimble Road (east of State Route 1, Trimble ...",2,"[(Trimble Road (east of State Route 1, Trimble..."
37,SUPPLEMENT NO. 43,94.117647,"(SUPPLEMENT NO. 43, SUPPLEMENT NO. 44)",2,"[(SUPPLEMENT NO. 43, SUPPLEMENT NO. 44, 94.117..."
50,Section 1207.1.5,93.75,"(Section 1207.1.5, Section 1207.5.5)",2,"[(Section 1207.1.5, Section 1207.5.5, 93.75)]"
31,Health and Safety Code Section 18860 et seq.,92.134831,(Health and Safety Code Section 119300 et seq....,2,[(Health and Safety Code Section 119300 et seq...
24,California Building Standards Code,91.891892,"(California Building Standards Code, Californi...",2,"[(California Building Standards Code, Californ..."
23,CHAPTER 2.40,91.666667,"(CHAPTER 2.40, Chapter 5.40)",2,"[(CHAPTER 2.40, Chapter 5.40, 91.66666666666666)]"


## Key Finding: Required Threshold

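To avoid ALL incorrect matches, the threshold must be **higher than the maximum similarity score** found above. Note that only groups with exactly two name varieties were scored, so this bound does not reflect larger groups.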

In [None]:
# The largest score among the incorrect groups is the value a threshold has to exceed
max_incorrect_score = results_df["max_score"].max()
for message in (
    f"Highest similarity score among incorrect matches: {max_incorrect_score}",
    f"Required threshold to avoid ALL incorrect matches: > {max_incorrect_score}",
):
    print(message)

In [None]:
# Summary statistics (count, mean, quartiles, ...) of the per-group max scores
score_summary = results_df["max_score"].describe()
print("Distribution of max similarity scores in incorrect groups:\n")
print(score_summary)

In [None]:
# For each candidate threshold, count the incorrect groups whose max score is
# strictly below it ("avoided") versus the rest ("remaining").
thresholds = list(range(86, 101))

print("Incorrect matches avoided at different thresholds:\n")
header = f"{'Threshold':<12} {'Avoided':<12} {'Remaining':<12} {'% Avoided'}"
print(header)
print("-" * 50)

total = len(results_df)
max_scores = results_df["max_score"]
for thresh in thresholds:
    is_below = max_scores < thresh
    avoided = is_below.sum()
    remaining = total - avoided
    pct = (avoided / total) * 100
    print(f"{thresh:<12} {avoided:<12} {remaining:<12} {pct:.1f}%")

## Detailed View: Highest Scoring Incorrect Pairs

These are the hardest cases to separate - pairs that are very similar but should NOT be matched.

In [None]:
# Print the 20 highest-scoring incorrect groups, one short paragraph per group
print("Top 20 incorrect matches by similarity score:\n")
top_rows = results_df.head(20)
for idx, row in top_rows.iterrows():
    report_lines = [
        f"Score: {row['max_score']}",
        f"  Representative: {row['representative_name']}",
    ]
    best_pair = row['max_pair']
    if best_pair:
        # Only groups that produced a pair get the extra line
        report_lines.append(f"  Highest pair: '{best_pair[0]}' <-> '{best_pair[1]}'")
    print("\n".join(report_lines))
    print()

In [None]:
# Histogram of the per-group max scores, with the current threshold marked.
# matplotlib is treated as optional: without it the plot is skipped.
try:
    import matplotlib.pyplot as plt

    fig, ax = plt.subplots(figsize=(10, 6))
    plotted_scores = results_df["max_score"].dropna()
    ax.hist(plotted_scores, bins=20, edgecolor="black")
    ax.axvline(x=85, color="red", linestyle="--", label="Current threshold (85)")
    ax.set_xlabel("Max Similarity Score")
    ax.set_ylabel("Number of Incorrect Groups")
    ax.set_title("Distribution of Max Similarity Scores in Incorrect Dedupes")
    ax.legend()
    plt.show()
except ImportError:
    print("matplotlib not installed - skipping histogram")

## All Pairwise Scores

Flatten all pairwise comparisons to see the full picture.

In [None]:
# Flatten all pairwise scores: one output row per (name1, name2, score) tuple
# stored in each group's "all_scores" list.
all_pairs = []
for _, row in results_df.iterrows():
    for name1, name2, score in row["all_scores"]:
        all_pairs.append({
            "representative": row["representative_name"],
            "name1": name1,
            "name2": name2,
            "score": score
        })

# Explicit columns keep the frame sortable when there are no pairs at all;
# pd.DataFrame([]) has no "score" column and sort_values would raise KeyError.
pair_columns = ["representative", "name1", "name2", "score"]
pairs_df = pd.DataFrame(all_pairs, columns=pair_columns)
pairs_df = pairs_df.sort_values("score", ascending=False)
print(f"Total pairwise comparisons: {len(pairs_df)}")
pairs_df.head(30)

In [None]:
# Split the pairwise comparisons at the current threshold of 85 (inclusive)
meets_threshold = pairs_df["score"].ge(85)
above_85 = meets_threshold.sum()
below_85 = len(pairs_df) - above_85
print(f"Pairs with score >= 85: {above_85}")
print(f"Pairs with score < 85: {below_85}")