In [3]:
pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [8]:
import timeit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import fuzz
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from concurrent.futures import ProcessPoolExecutor

def combine_name_address(pair):
    """
    Combines the name and address of a given pair into a single string.

    Args:
    pair (tuple): A tuple containing name and address.

    Returns:
    str: A combined string of name and address.
    """
    return f"{pair[0]} {pair[1]}"

def compute_cosine_similarity(args):
    """
    Computes cosine similarity for given pairs of combined vectors.

    Args:
    args (tuple): A tuple containing combined vectors, indices, and similarity threshold.

    Returns:
    tuple: Indices of the pairs and their similarity score, and a boolean indicating if they are similar.
    """
    combined_vector1, combined_vector2, i, j, similarity_threshold = args
    vectorizer = TfidfVectorizer().fit(combined_vector1 + combined_vector2)
    tfidf_vector1 = vectorizer.transform(combined_vector1)
    tfidf_vector2 = vectorizer.transform(combined_vector2)
    cosine_sim = cosine_similarity(tfidf_vector1, tfidf_vector2)
    return i, j, cosine_sim[i, j] > similarity_threshold, cosine_sim[i, j]

def find_similar_pairs(vector1, vector2, similarity_threshold=0.5):
    """
    Finds pairs of similar names and addresses using cosine similarity.

    Args:
    vector1 (list): List of tuples with names and addresses.
    vector2 (list): List of tuples with names and addresses.
    similarity_threshold (float): Threshold for considering pairs as similar.

    Returns:
    list: List of tuples containing pairs and their similarity scores.
    """
    combined_vector1 = [combine_name_address(pair) for pair in vector1]
    combined_vector2 = [combine_name_address(pair) for pair in vector2]

    args = [(combined_vector1, combined_vector2, i, j, similarity_threshold) for i in range(len(vector1)) for j in range(len(vector2))]

    with ProcessPoolExecutor() as executor:
        results = executor.map(compute_cosine_similarity, args)

    similar_pairs = [(vector1[i], vector2[j], score) for i, j, is_similar, score in results if is_similar]

    return similar_pairs

def compute_fuzzy_similarity(args):
    """
    Computes fuzzy similarity for given pairs of combined vectors.

    Args:
    args (tuple): A tuple containing combined vectors, indices, and similarity threshold.

    Returns:
    tuple: Indices of the pairs and their similarity score, and a boolean indicating if they are similar.
    """
    combined_vector1, combined_vector2, i, j, similarity_threshold = args
    score = fuzz.ratio(combined_vector1[i], combined_vector2[j])
    return i, j, score > similarity_threshold, score

def find_fuzzy_similar_pairs(vector1, vector2, similarity_threshold=70):
    """
    Finds pairs of similar names and addresses using fuzzy matching.

    Args:
    vector1 (list): List of tuples with names and addresses.
    vector2 (list): List of tuples with names and addresses.
    similarity_threshold (int): Threshold for considering pairs as similar.

    Returns:
    list: List of tuples containing pairs and their similarity scores.
    """
    combined_vector1 = [combine_name_address(pair) for pair in vector1]
    combined_vector2 = [combine_name_address(pair) for pair in vector2]

    args = [(combined_vector1, combined_vector2, i, j, similarity_threshold) for i in range(len(vector1)) for j in range(len(vector2))]

    with ProcessPoolExecutor() as executor:
        results = executor.map(compute_fuzzy_similarity, args)

    similar_pairs = [(vector1[i], vector2[j], score) for i, j, is_similar, score in results if is_similar]

    return similar_pairs

def tokenize_bigrams(text):
    """
    Tokenizes text into bigrams.

    Args:
    text (str): Input text.

    Returns:
    list: List of bigrams.
    """
    return [text[i:i+2] for i in range(len(text) - 1)]

def jaccard_similarity(vector1, vector2):
    """
    Computes Jaccard similarity between two binary vectors.

    Args:
    vector1 (np.ndarray): First binary vector.
    vector2 (np.ndarray): Second binary vector.

    Returns:
    float: Jaccard similarity score.
    """
    intersection = np.logical_and(vector1, vector2).sum()
    union = np.logical_or(vector1, vector2).sum()
    return intersection / union

def compute_jaccard_similarity(args):
    """
    Computes Jaccard similarity for given pairs of binary vectors.

    Args:
    args (tuple): A tuple containing binary vectors, indices, and similarity threshold.

    Returns:
    tuple: Indices of the pairs and their similarity score, and a boolean indicating if they are similar.
    """
    binary_vector1, binary_vector2, i, j, similarity_threshold = args
    score = jaccard_similarity(binary_vector1[i], binary_vector2[j])
    return i, j, score > similarity_threshold, score

def find_jaccard_similar_pairs(vector1, vector2, similarity_threshold=0.5):
    """
    Finds pairs of similar names and addresses using Jaccard similarity.

    Args:
    vector1 (list): List of tuples with names and addresses.
    vector2 (list): List of tuples with names and addresses.
    similarity_threshold (float): Threshold for considering pairs as similar.

    Returns:
    list: List of tuples containing pairs and their similarity scores.
    """
    combined_vector1 = [combine_name_address(pair) for pair in vector1]
    combined_vector2 = [combine_name_address(pair) for pair in vector2]

    bigrams_vector1 = [tokenize_bigrams(text) for text in combined_vector1]
    bigrams_vector2 = [tokenize_bigrams(text) for text in combined_vector2]

    mlb = MultiLabelBinarizer()
    all_bigrams = bigrams_vector1 + bigrams_vector2
    mlb.fit(all_bigrams)
    binary_vector1 = mlb.transform(bigrams_vector1)
    binary_vector2 = mlb.transform(bigrams_vector2)

    args = [(binary_vector1, binary_vector2, i, j, similarity_threshold) for i in range(len(binary_vector1)) for j in range(len(binary_vector2))]

    with ProcessPoolExecutor() as executor:
        results = executor.map(compute_jaccard_similarity, args)

    similar_pairs = [(vector1[i], vector2[j], score) for i, j, is_similar, score in results if is_similar]

    return similar_pairs

# Example vectors of name/address pairs
vector1 = [
    ("John Doe", "123 Main St, Springfield"),
    ("Jane Smith", "456 Maple Ave, Springfield"),
    ("Jim Brown", "789 Oak St, Springfield"),
    ("Alice Johnson", "101 Pine St, Springfield"),
    ("Bob Davis", "202 Cedar Ave, Springfield"),
    ("Carol White", "303 Birch Blvd, Springfield"),
    ("David Harris", "404 Elm St, Springfield"),
    ("Eva Adams", "505 Spruce Ln, Springfield"),
    ("Frank Clark", "606 Willow Rd, Springfield"),
    ("Grace Lewis", "707 Fir Ct, Springfield")
]

vector2 = [
    ("Jonathan Doe", "123 Main St, Springfield"),
    ("Janet Smith", "456 Maple Avenue, Springfield"),
    ("James Brown", "789 Oak Street, Springfield"),
    ("Alicia Johnson", "101 Pine Street, Springfield"),
    ("Robert Davis", "202 Cedar Avenue, Springfield"),
    ("Caroline White", "303 Birch Boulevard, Springfield"),
    ("Davidson Harris", "404 Elm Street, Springfield"),
    ("Evelyn Adams", "505 Spruce Lane, Springfield"),
    ("Franklin Clark", "606 Willow Road, Springfield"),
    ("Gracie Lewis", "707 Fir Court, Springfield")
]

# Define the setup for timeit to ensure all functions and data are in scope
setup = """
from __main__ import find_similar_pairs, find_fuzzy_similar_pairs, find_jaccard_similar_pairs, vector1, vector2
"""

# Time the cosine similarity algorithm
cosine_time = timeit.timeit(
    "find_similar_pairs(vector1, vector2)",
    setup=setup,
    number=10
)

# Time the fuzzy matching algorithm
fuzzy_time = timeit.timeit(
    "find_fuzzy_similar_pairs(vector1, vector2)",
    setup=setup,
    number=10
)

# Time the Jaccard similarity algorithm
jaccard_time = timeit.timeit(
    "find_jaccard_similar_pairs(vector1, vector2)",
    setup=setup,
    number=10
)

# Print the results
print(f"Cosine Similarity time: {cosine_time:.2f} seconds")
print(f"Fuzzy Matching time: {fuzzy_time:.2f} seconds")
print(f"Jaccard Similarity time: {jaccard_time:.2f} seconds")


Cosine Similarity time: 5.29 seconds
Fuzzy Matching time: 0.73 seconds
Jaccard Similarity time: 1.15 seconds
