In [2]:
!pip install sentence-transformers scikit-learn



In [3]:
# All imports live together at the top of the cell so that every dependency
# is resolved before any expensive work (the model load below) starts.
import numpy as np
import pandas as pd

# To access BigQuery dataset
from google.cloud import bigquery
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Checkpoint passed to SentenceTransformer. Per the original author's note this
# is an encoder-based (BERT-based) model trained for semantic similarity.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'

# Load the sentence-embedding model once; later cells reuse `model`.
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [4]:
# Define GCP project and select candidates data
# This cell only prepares the BigQuery client and the SQL text; the query
# itself is not executed here.

# Project that the BigQuery client is bound to.
project_id = "clean-energy-projects"
client = bigquery.Client(project=project_id)

# Selects every column of the candidates table. The table reference carries
# no project prefix — presumably it resolves against the client's project
# above (TODO confirm).
query = """
SELECT * FROM `county_officials_demo.candidates`
"""

In [5]:
# Run the candidates query, then materialise its result as a pandas DataFrame
candidates_job = client.query(query)
df_candidates = candidates_job.to_dataframe()

In [7]:
# Inspect
# Only the last expression of a cell is auto-displayed, so a bare
# `df_candidates.shape` in the middle of the cell would be evaluated and
# silently discarded. Print it explicitly, then preview the first rows.
print(f"Shape (rows, columns): {df_candidates.shape}")
df_candidates.head(15)

Unnamed: 0,record_a_id,name_a,title_a,source_a,record_b_id,name_b,title_b,source_b,true_id_a,true_id_b
0,Campbell_Clem_Scott_clean,Scott Clem,Board of County Commissioners - District 4,county website,Campbell_Clem_Scott_news,Commissioner Clem,,news_article,Scott Clem,Scott Clem
1,Campbell_Clem_Scott_clean,Scott Clem,Board of County Commissioners - District 4,county website,Campbell_Ford_Jim_news,Commissioner Ford,,news_article,Scott Clem,Jim Ford
2,Campbell_Clem_Scott_clean,Scott Clem,Board of County Commissioners - District 4,county website,Campbell_Jordan_Bob_news,Commissioner Jordan,,news_article,Scott Clem,Bob Jordan
3,Campbell_Clem_Scott_clean,Scott Clem,Board of County Commissioners - District 4,county website,Campbell_McCreery_Kelley_news,Commissioner McCreery,,news_article,Scott Clem,Kelley McCreery
4,Campbell_Clem_Scott_clean,Scott Clem,Board of County Commissioners - District 4,county website,Campbell_Means_Jerry_news,Commissioner Means,,news_article,Scott Clem,Jerry Means
5,Campbell_Clem_Scott_email,"Clem, Scott",D4,email_list,Campbell_Clem_Scott_news,Commissioner Clem,,news_article,Scott Clem,Scott Clem
6,Campbell_Clem_Scott_email,"Clem, Scott",D4,email_list,Campbell_Ford_Jim_news,Commissioner Ford,,news_article,Scott Clem,Jim Ford
7,Campbell_Clem_Scott_email,"Clem, Scott",D4,email_list,Campbell_McCreery_Kelley_news,Commissioner McCreery,,news_article,Scott Clem,Kelley McCreery
8,Campbell_Clem_Scott_email,"Clem, Scott",D4,email_list,Campbell_Jordan_Bob_news,Commissioner Jordan,,news_article,Scott Clem,Bob Jordan
9,Campbell_Clem_Scott_email,"Clem, Scott",D4,email_list,Campbell_Means_Jerry_news,Commissioner Means,,news_article,Scott Clem,Jerry Means


In [8]:
# Build vector embeddings of names for both a and b sides.
# The name columns are pulled out as plain Python lists first, then each list
# is encoded with the sentence-embedding model.
names_a = df_candidates['name_a'].tolist()
names_b = df_candidates['name_b'].tolist()

embeddings_a = model.encode(names_a)
embeddings_b = model.encode(names_b)

In [9]:
# Calculate similarity for each pair of names.
# Row i of embeddings_a is only ever compared with row i of embeddings_b, so
# the cosine is computed row-wise in a single vectorized pass rather than
# issuing one scikit-learn call per pair on 1x1 inputs.
vectors_a = np.asarray(embeddings_a)
vectors_b = np.asarray(embeddings_b)

# cos(a, b) = (a . b) / (|a| * |b|), evaluated independently for every row.
dot_products = (vectors_a * vectors_b).sum(axis=1)
norm_products = np.linalg.norm(vectors_a, axis=1) * np.linalg.norm(vectors_b, axis=1)

# A zero-length vector has no direction. Its dot product is already 0, so
# dividing by 1 instead of 0 scores such a pair as 0 and avoids NaN/inf.
safe_norm_products = np.where(norm_products > 0, norm_products, 1.0)
similarities = dot_products / safe_norm_products

df_candidates['similarity_score'] = similarities

print(f"  Mean: {np.mean(similarities):.3f}")
print(f"  Min: {np.min(similarities):.3f}")
print(f"  Max: {np.max(similarities):.3f}")

  Mean: 0.349
  Min: 0.003
  Max: 1.000


In [10]:
# Ground truth: a pair is labelled 1 when both records carry the same true id,
# otherwise 0.
same_person = df_candidates['true_id_a'] == df_candidates['true_id_b']
df_candidates['is_true_match'] = same_person.astype(int)

# Split the pairs by label so the two score distributions can be compared.
true_matches = df_candidates[same_person]
false_matches = df_candidates[~same_person]

match_scores = true_matches['similarity_score']
non_match_scores = false_matches['similarity_score']

print("TRUE MATCHES (same person):")
print(f"  Count: {len(true_matches)}")
print(f"  Mean similarity: {match_scores.mean():.3f}")
print(f"  Min similarity: {match_scores.min():.3f}")
print("\nFALSE MATCHES (different people):")
print(f"  Count: {len(false_matches)}")
print(f"  Mean similarity: {non_match_scores.mean():.3f}")
print(f"  Max similarity: {non_match_scores.max():.3f}")

TRUE MATCHES (same person):
  Count: 170
  Mean similarity: 0.752
  Min similarity: 0.290

FALSE MATCHES (different people):
  Count: 580
  Mean similarity: 0.230
  Max similarity: 0.532


In [13]:
# Sweep several similarity thresholds; the goal is to find the threshold that
# best separates true matches from non-matches.
#
# The grid is 0.20, 0.25, ..., 0.75. It is built with linspace + round instead
# of np.arange(0.2, 0.8, 0.05): with a fractional step, arange accumulates
# floating-point error, so neither the number of grid points nor their exact
# values (e.g. 0.5500000000000002 instead of 0.55) can be relied on.
thresholds = np.round(np.linspace(0.20, 0.75, 12), 2)


def evaluate_threshold(scores, labels, threshold):
    """Score the rule ``score >= threshold`` against 0/1 ground-truth labels.

    Args:
        scores: pandas Series of similarity scores, one per candidate pair.
        labels: pandas Series aligned with ``scores``; 1 marks a true match.
        threshold: pairs scoring at or above this value are predicted matches.

    Returns:
        dict with keys ``threshold``, ``accuracy``, ``precision``, ``recall``
        and ``f1``. A metric whose denominator is zero is reported as 0.0.
    """
    predicted = scores >= threshold
    actual = labels == 1

    # Confusion-matrix cells: true/false positives and true/false negatives.
    tp = int((predicted & actual).sum())
    fp = int((predicted & ~actual).sum())
    tn = int((~predicted & ~actual).sum())
    fn = int((~predicted & actual).sum())

    n_pairs = len(scores)
    accuracy = (tp + tn) / n_pairs if n_pairs > 0 else 0.0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    if (precision + recall) > 0:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0.0

    return {
        'threshold': float(threshold),
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }


# Predictions are kept local to evaluate_threshold, so the sweep does not leave
# a stale 'predicted_match' column (for the last threshold tried) on
# df_candidates.
results = [
    evaluate_threshold(
        df_candidates['similarity_score'],
        df_candidates['is_true_match'],
        threshold,
    )
    for threshold in thresholds
]

df_results = pd.DataFrame(results)
print(df_results.sort_values('f1', ascending=False).head())

   threshold  accuracy  precision    recall        f1
6       0.50  0.989333   0.987952  0.964706  0.976190
7       0.55  0.989333   1.000000  0.952941  0.975904
5       0.45  0.984000   0.954023  0.976471  0.965116
8       0.60  0.973333   1.000000  0.882353  0.937500
4       0.40  0.966667   0.887701  0.976471  0.929972


In [14]:
# Apply the chosen threshold - 0.55
# NOTE(review): in the recorded sweep output 0.50 has the highest F1 (0.9762);
# 0.55 is a close second (0.9759) but reaches precision 1.0. Presumably 0.55
# was preferred to avoid false positives — confirm the intent.
# NOTE(review): the threshold is evaluated on the same rows it was tuned on, so
# the metrics below are likely optimistic; a held-out set would be a fairer
# estimate.
BEST_THRESHOLD = 0.55
df_candidates['predicted_match'] = (df_candidates['similarity_score'] >= BEST_THRESHOLD).astype(int)

# Calculate final metrics from the four confusion-matrix cells
predicted_positive = df_candidates['predicted_match'] == 1
actual_positive = df_candidates['is_true_match'] == 1

tp = (predicted_positive & actual_positive).sum()
fp = (predicted_positive & ~actual_positive).sum()
tn = (~predicted_positive & ~actual_positive).sum()
fn = (~predicted_positive & actual_positive).sum()

# Guard the ratios the same way the threshold sweep does: a zero denominator
# is reported as 0 instead of producing NaN and a runtime warning.
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0
accuracy = (tp + tn) / len(df_candidates) if len(df_candidates) > 0 else 0

print(f"FINAL RESULTS (threshold = {BEST_THRESHOLD}):")
print(f"  True Positives: {tp}")
print(f"  False Positives: {fp}")
print(f"  True Negatives: {tn}")
print(f"  False Negatives: {fn}")
print(f"\n  Precision: {precision:.1%}")
print(f"  Recall: {recall:.1%}")
print(f"  Accuracy: {accuracy:.1%}")

# Show example matches (true positives)
print("\n=== EXAMPLE CORRECT MATCHES ===")
correct_matches = df_candidates[predicted_positive & actual_positive]
print(correct_matches[['name_a', 'name_b', 'similarity_score']].head(5))

# Show missed matches (false negatives): true matches scored below threshold
print("\n=== MISSED MATCHES (False Negatives) ===")
missed = df_candidates[~predicted_positive & actual_positive]
print(missed[['name_a', 'name_b', 'similarity_score', 'true_id_a']])

FINAL RESULTS (threshold = 0.55):
  True Positives: 162
  False Positives: 0
  True Negatives: 580
  False Negatives: 8

  Precision: 100.0%
  Recall: 95.3%
  Accuracy: 98.9%

=== EXAMPLE CORRECT MATCHES ===
         name_a             name_b  similarity_score
0    Scott Clem  Commissioner Clem          0.682982
5   Clem, Scott  Commissioner Clem          0.679469
14   Scott Clem  Commissioner Clem          0.682982
16     Clem, S.  Commissioner Clem          0.751724
20     Jim Ford  Commissioner Ford          0.695869

=== MISSED MATCHES (False Negatives) ===
                name_a              name_b  similarity_score  \
56         Jerry Means  Commissioner Means          0.290064   
57        Means, Jerry  Commissioner Means          0.310387   
58         Jerry Means  Commissioner Means          0.290064   
59           Means, J.  Commissioner Means          0.302019   
109  Kate Brophy McGee    Supervisor McGee          0.454088   
112  Kate Brophy McGee    Supervisor McGee      