In [None]:
!pip install sentence-transformers scikit-learn

In [None]:
# All imports live together at the top of the cell so the notebook's
# dependencies are visible in one place.
import numpy as np
import pandas as pd

# To access BigQuery dataset
from google.cloud import bigquery
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load an encoder-based model (BERT-based)
# This model is trained specifically for semantic similarity
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# SQL that pulls every column of the candidate-pairs table
query = """
SELECT * FROM `county_officials_demo.candidates`
"""

# BigQuery client bound to the GCP project that will run the query
project_id = "clean-energy-projects"
client = bigquery.Client(project=project_id)

In [None]:
# Run the query and materialise the result set as a pandas DataFrame
query_job = client.query(query)
df_candidates = query_job.to_dataframe()

In [None]:
# Inspect: dimensions first, then a sample of rows.
# Only the LAST expression of a cell is echoed automatically, so the shape has
# to be printed explicitly; as a bare expression it was silently discarded.
print(df_candidates.shape)
df_candidates.head(15)

In [None]:
# Pull the two name columns out as plain Python lists, then embed each side.
# Later cells compare embeddings_a[i] with embeddings_b[i], i.e. by position.
names_a = df_candidates['name_a'].tolist()
names_b = df_candidates['name_b'].tolist()

embeddings_a = model.encode(names_a)
embeddings_b = model.encode(names_b)

In [None]:
# Calculate cosine similarity for each (name_a, name_b) pair.
#
# The similarity of row i is dot(a_i, b_i) / (|a_i| * |b_i|). Computing all rows
# in one vectorised NumPy expression replaces one sklearn call (and its input
# validation) per row.
vectors_a = np.asarray(embeddings_a)
vectors_b = np.asarray(embeddings_b)

row_dots = np.sum(vectors_a * vectors_b, axis=1)
row_norms = np.linalg.norm(vectors_a, axis=1) * np.linalg.norm(vectors_b, axis=1)
# A zero vector has no direction; treat its norm as 1 so the score is 0 instead
# of NaN (this mirrors how sklearn's cosine_similarity normalises zero rows).
row_norms[row_norms == 0] = 1

# One score per row of df_candidates, in row order
similarities = row_dots / row_norms

df_candidates['similarity_score'] = similarities

print(f"  Mean: {np.mean(similarities):.3f}")
print(f"  Min: {np.min(similarities):.3f}")
print(f"  Max: {np.max(similarities):.3f}")

In [None]:
# Ground truth label: 1 when both sides of the pair carry the same true ID, else 0
same_id = df_candidates['true_id_a'].eq(df_candidates['true_id_b'])
df_candidates['is_true_match'] = same_id.astype(int)

# Partition the pairs by label so their score distributions can be compared
is_match = df_candidates['is_true_match'].eq(1)
true_matches = df_candidates[is_match]
false_matches = df_candidates[~is_match]

true_scores = true_matches['similarity_score']
false_scores = false_matches['similarity_score']

# For real matches the interesting extreme is the lowest score ...
print("TRUE MATCHES (same person):")
print(f"  Count: {len(true_matches)}")
print(f"  Mean similarity: {true_scores.mean():.3f}")
print(f"  Min similarity: {true_scores.min():.3f}")

# ... for non-matches it is the highest score.
print("\nFALSE MATCHES (different people):")
print(f"  Count: {len(false_matches)}")
print(f"  Mean similarity: {false_scores.mean():.3f}")
print(f"  Max similarity: {false_scores.max():.3f}")

In [None]:
# Sweep a range of similarity thresholds and score each one against the
# ground-truth labels. Goal is to find the best similarity threshold.
#
# The grid is 0.20, 0.25, ..., 0.75. It is built from an integer range and then
# scaled, because np.arange with a fractional step is subject to floating-point
# rounding: it can emit values such as 0.35000000000000003 and may even include
# an extra element at the stop value.
thresholds = np.arange(20, 80, 5) / 100
results = []

scores = df_candidates['similarity_score']
actual = df_candidates['is_true_match'] == 1   # is_true_match holds only 0/1
n_pairs = len(df_candidates)

for threshold in thresholds:
    # Predict matches. Kept as a local mask so the sweep does not leave the
    # predictions of whichever threshold ran last inside df_candidates.
    predicted = scores >= threshold

    # Find true positives (tp), false positives (fp), true negatives (tn),
    # and false negatives (fn)
    tp = (predicted & actual).sum()
    fp = (predicted & ~actual).sum()
    tn = (~predicted & ~actual).sum()
    fn = (~predicted & actual).sum()

    # Calculate classic evaluation metrics. Each ratio falls back to 0 when its
    # denominator is empty (e.g. nothing predicted positive at a high threshold).
    accuracy = (tp + tn) / n_pairs
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0

    # F1 is the harmonic mean of precision and recall
    if (precision + recall) > 0:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0

    results.append({
        'threshold': threshold,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    })

# One row per threshold; show the five with the highest F1
df_results = pd.DataFrame(results)
print(df_results.sort_values('f1', ascending=False).head())

In [None]:
# Apply best threshold - 0.55
# NOTE(review): the value is hard-coded, presumably read off the threshold sweep
# output; re-check it whenever the data or the model changes.
BEST_THRESHOLD = 0.55
df_candidates['predicted_match'] = (df_candidates['similarity_score'] >= BEST_THRESHOLD).astype(int)

# Boolean views of prediction and ground truth (both columns hold only 0/1)
predicted = df_candidates['predicted_match'] == 1
actual = df_candidates['is_true_match'] == 1

# Calculate final confusion-matrix counts
tp = (predicted & actual).sum()
fp = (predicted & ~actual).sum()
tn = (~predicted & ~actual).sum()
fn = (~predicted & actual).sum()

# Guard every ratio against an empty denominator, the same way the threshold
# sweep does; unguarded, a threshold with no predicted (or no actual) matches
# would report "nan%" and emit a divide warning.
n_pairs = len(df_candidates)
precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
accuracy = (tp + tn) / n_pairs if n_pairs > 0 else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

print(f"FINAL RESULTS (threshold = {BEST_THRESHOLD}):")
print(f"  True Positives: {tp}")
print(f"  False Positives: {fp}")
print(f"  True Negatives: {tn}")
print(f"  False Negatives: {fn}")
print(f"\n  Precision: {precision:.1%}")
print(f"  Recall: {recall:.1%}")
print(f"  Accuracy: {accuracy:.1%}")
# F1 is the metric the sweep ranks thresholds by, so report it here as well
print(f"  F1: {f1:.1%}")

# Show example matches
print("\n=== EXAMPLE CORRECT MATCHES ===")
correct_matches = df_candidates[predicted & actual]
print(correct_matches[['name_a', 'name_b', 'similarity_score']].head(5))

# Show missed matches (false negatives)
print("\n=== MISSED MATCHES (False Negatives) ===")
missed = df_candidates[~predicted & actual]
print(missed[['name_a', 'name_b', 'similarity_score', 'true_id_a']])