In [2]:
!pip install rapidfuzz

Collecting rapidfuzz
  Downloading rapidfuzz-3.14.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading rapidfuzz-3.14.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m37.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.14.0


In [6]:
import pandas as pd
import re
from fuzzywuzzy import fuzz
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data (if not already downloaded)
def _ensure_nltk_data(probe, *packages):
    """Call ``probe``; if it raises LookupError, download each named NLTK package.

    ``probe`` is a zero-argument callable that touches the resource, so a
    missing corpus surfaces as the LookupError NLTK raises for absent data.
    """
    try:
        probe()
    except LookupError:
        for package in packages:
            nltk.download(package)


_ensure_nltk_data(lambda: stopwords.words('english'), 'stopwords')
# omw-1.4 (Open Multilingual Wordnet) is fetched together with wordnet for WordNetLemmatizer
_ensure_nltk_data(lambda: WordNetLemmatizer().lemmatize('test'), 'wordnet', 'omw-1.4')

# Load the datasets
# Both frames are built inline so this cell runs without any external files.
# Each resolved query is written next to its ID, and each variation next to the
# ID of the resolved query it should match, to keep the pairs visibly aligned.
resolved_records = [
    (1, 'Unable to connect to the internet'),
    (2, 'Payment failed during checkout'),
    (3, 'App crashes when opening settings'),
    (4, 'Forgot password and unable to reset'),
    (5, 'Unable to upload files to the server'),
]
data_resolved = {
    'Query_ID': [query_id for query_id, _ in resolved_records],
    'Pre_Resolved_Query': [query for _, query in resolved_records],
}
df_resolved = pd.DataFrame(data_resolved)

variation_records = [
    ('Unabel to conect to the internet', 1),
    ('Can’t connect to internet', 1),
    ('Intenet not working', 1),
    ('Payment failed while chekout', 2),
    ('Payment did not go through during chckout', 2),
    ('Payment issue at check out', 2),
    ('Application crashes when opening setings', 3),
    ('App crash when going to settings', 3),
    ('Settings cause the app to chrash', 3),
    ('Forgot passwrd and cant reset', 4),
    ('"Forgotten password, unable to reset"', 4),
    ('I can’t reset my password', 4),
    ('Unable to uplod file to server', 5),
    ("Can't upload files on to the server", 5),
    ('File uploading to server not working', 5),
    ("No internet connection, please help", 1),
    ("Checkout page says payment failed", 2),
    ("Settings page crashes app immediately", 3),
    ("Password reset link not working", 4),
    ("Server upload fails with an error message", 5),
]
data_new = {
    'Variation_Query': [variation for variation, _ in variation_records],
    'Matches_With_Query_ID': [expected_id for _, expected_id in variation_records],
}
df_new = pd.DataFrame(data_new)

# Module-level NLP resources read by clean_text when its optional flags are enabled.
lemmatizer = WordNetLemmatizer()
stop_words = {word for word in stopwords.words('english')}

def clean_text(text, remove_stopwords=False, lemmatize=False):
    """Normalise a query string before similarity scoring.

    The input is coerced to ``str``, lowercased, stripped of every character
    that is neither a word character nor whitespace, and has whitespace runs
    collapsed to single spaces with the ends trimmed.

    Args:
        text: Value to clean; anything accepted by ``str()``.
        remove_stopwords: When true, drop tokens found in the module-level
            ``stop_words`` set.
        lemmatize: When true, replace each remaining token with
            ``lemmatizer.lemmatize(token)`` (module-level ``lemmatizer``).

    Returns:
        The cleaned, space-joined string.
    """
    normalized = re.sub(r'[^\w\s]', '', str(text).lower())
    normalized = re.sub(r'\s+', ' ', normalized).strip()

    if remove_stopwords:
        # Filtering runs after punctuation stripping, so tokens are compared in their stripped form.
        normalized = ' '.join(
            token for token in normalized.split() if token not in stop_words
        )

    if lemmatize:
        normalized = ' '.join(
            lemmatizer.lemmatize(token) for token in normalized.split()
        )

    return normalized

# Light cleaning only (default flags): lowercase, strip punctuation, collapse whitespace.
df_resolved['Cleaned_Query'] = df_resolved['Pre_Resolved_Query'].map(clean_text)
df_new['Cleaned_Query'] = df_new['Variation_Query'].map(clean_text)

def find_fuzzy_matches(new_query, resolved_df, scorer, threshold):
    """Return the resolved query that scores highest against ``new_query``.

    Args:
        new_query: Cleaned text of the incoming query.
        resolved_df: DataFrame with ``Cleaned_Query`` and ``Query_ID`` columns.
        scorer: Callable ``scorer(a, b)`` returning a comparable similarity score.
        threshold: Minimum score for the best candidate to count as a match.

    Returns:
        ``(query_id, score)`` when the best score is at least ``threshold``;
        otherwise ``(None, score)``. The score is always the best one seen,
        starting from 0, and the first row wins ties.
    """
    top_id, top_score = None, 0

    for _, candidate in resolved_df.iterrows():
        candidate_score = scorer(new_query, candidate['Cleaned_Query'])
        # Strict comparison keeps the earliest row when scores tie.
        if candidate_score > top_score:
            top_id, top_score = candidate['Query_ID'], candidate_score

    matched_id = top_id if top_score >= threshold else None
    return matched_id, top_score

# --- Fuzzy Search with fuzz.token_set_ratio (Adjusted Threshold 60) ---
print("\n--- Fuzzy Search with fuzz.token_set_ratio (Adjusted Threshold 60) ---")
threshold_token_set_adj_low = 60


def _token_set_record(query_row):
    """Score one new-query row against all resolved queries and package the outcome."""
    match_id, score = find_fuzzy_matches(
        query_row['Cleaned_Query'],
        df_resolved,
        fuzz.token_set_ratio,
        threshold_token_set_adj_low,
    )
    return {
        'New_Query': query_row['Variation_Query'],
        'Actual_Match_ID': query_row['Matches_With_Query_ID'],
        'Predicted_Match_ID': match_id,
        'Similarity_Score': score
    }


results_token_set_adj_low = [_token_set_record(query_row) for _, query_row in df_new.iterrows()]
df_results_token_set_adj_low = pd.DataFrame(results_token_set_adj_low)

# A prediction of None (best score below the threshold) never equals an actual ID, so it counts as a miss.
accuracy_token_set_adj_low = df_results_token_set_adj_low['Actual_Match_ID'].eq(
    df_results_token_set_adj_low['Predicted_Match_ID']
).mean()

print(f"Token Set Ratio (Threshold={threshold_token_set_adj_low}):")
print(df_results_token_set_adj_low)
print(f"Accuracy: {accuracy_token_set_adj_low:.2f}")


--- Fuzzy Search with fuzz.token_set_ratio (Adjusted Threshold 60) ---
Token Set Ratio (Threshold=60):
                                    New_Query  Actual_Match_ID  \
0            Unabel to conect to the internet                1   
1                   Can’t connect to internet                1   
2                         Intenet not working                1   
3                Payment failed while chekout                2   
4   Payment did not go through during chckout                2   
5                  Payment issue at check out                2   
6    Application crashes when opening setings                3   
7            App crash when going to settings                3   
8            Settings cause the app to chrash                3   
9               Forgot passwrd and cant reset                4   
10      "Forgotten password, unable to reset"                4   
11                  I can’t reset my password                4   
12             Unable to uplod file to

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


In [7]:
# Apply more extensive cleaning for BoW/TF-IDF:
# stop words are dropped and the remaining tokens lemmatized before vectorization.
df_resolved['Cleaned_Query_BOW'] = df_resolved['Pre_Resolved_Query'].apply(
    clean_text, remove_stopwords=True, lemmatize=True
)
df_new['Cleaned_Query_BOW'] = df_new['Variation_Query'].apply(
    clean_text, remove_stopwords=True, lemmatize=True
)

print("\nResolved Queries (Cleaned for BoW/TF-IDF):", df_resolved[['Query_ID', 'Cleaned_Query_BOW']], sep="\n")
print("\nNew Queries (Cleaned for BoW/TF-IDF):", df_new[['Variation_Query', 'Cleaned_Query_BOW']], sep="\n")

# --- Shared helpers for the TF-IDF and BoW cosine-similarity experiments ---
def _match_by_cosine(matrix_new, matrix_resolved, threshold, new_df, resolved_df):
    """Pair every new query with its most similar resolved query.

    Args:
        matrix_new: Vectorized new queries, one row per row of ``new_df``.
        matrix_resolved: Vectorized resolved queries, one row per row of ``resolved_df``.
        threshold: Minimum cosine similarity for the best candidate to count as a match.
        new_df: Frame providing ``Variation_Query`` and ``Matches_With_Query_ID``.
        resolved_df: Frame providing ``Query_ID``.

    Returns:
        A list with one dict per new query, holding the keys ``New_Query``,
        ``Actual_Match_ID``, ``Predicted_Match_ID`` (``None`` when the best
        score is below ``threshold``) and ``Similarity_Score``.
    """
    # One call gives the whole (n_new, n_resolved) similarity matrix.
    similarities = cosine_similarity(matrix_new, matrix_resolved)
    best_positions = similarities.argmax(axis=1)

    records = []
    for row_pos, best_pos in enumerate(best_positions):
        best_score = similarities[row_pos, best_pos]

        predicted_match_id = None
        if best_score >= threshold:
            # argmax is positional, so look up with .iloc; .loc would only be
            # right while the frame still carries a default RangeIndex.
            predicted_match_id = resolved_df['Query_ID'].iloc[best_pos]

        records.append({
            'New_Query': new_df['Variation_Query'].iloc[row_pos],
            'Actual_Match_ID': new_df['Matches_With_Query_ID'].iloc[row_pos],
            'Predicted_Match_ID': predicted_match_id,
            'Similarity_Score': best_score
        })
    return records


def _report_matches(label, records, threshold):
    """Build the results frame, print it with its accuracy, and return ``(frame, accuracy)``.

    Accuracy is the fraction of rows whose predicted ID equals the actual ID;
    a ``None`` prediction never equals an ID and therefore counts as a miss.
    """
    df_results = pd.DataFrame(records)
    accuracy = (df_results['Actual_Match_ID'] == df_results['Predicted_Match_ID']).mean()
    print(f"{label} (Threshold={threshold}):")
    print(df_results)
    print(f"Accuracy: {accuracy:.2f}")
    return df_results, accuracy


# --- TF-IDF Vectorization and Cosine Similarity ---
print("\n--- TF-IDF with Cosine Similarity ---")

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit on resolved queries only, then project the new queries into the same vocabulary
tfidf_matrix_resolved = tfidf_vectorizer.fit_transform(df_resolved['Cleaned_Query_BOW'])
tfidf_matrix_new = tfidf_vectorizer.transform(df_new['Cleaned_Query_BOW'])

threshold_tfidf = 0.4 # Initial threshold
results_tfidf = _match_by_cosine(
    tfidf_matrix_new, tfidf_matrix_resolved, threshold_tfidf, df_new, df_resolved
)
df_results_tfidf, accuracy_tfidf = _report_matches(
    "TF-IDF Cosine Similarity", results_tfidf, threshold_tfidf
)

# Adjust threshold for TF-IDF if needed
print("\n--- TF-IDF with Cosine Similarity (Adjusted Threshold 0.5) ---")
threshold_tfidf_adj = 0.5
results_tfidf_adj = _match_by_cosine(
    tfidf_matrix_new, tfidf_matrix_resolved, threshold_tfidf_adj, df_new, df_resolved
)
df_results_tfidf_adj, accuracy_tfidf_adj = _report_matches(
    "TF-IDF Cosine Similarity", results_tfidf_adj, threshold_tfidf_adj
)


# --- BoW (CountVectorizer) Vectorization and Cosine Similarity ---
print("\n--- BoW with Cosine Similarity ---")

# Initialize CountVectorizer
count_vectorizer = CountVectorizer()

# Fit on resolved queries only, then project the new queries into the same vocabulary
count_matrix_resolved = count_vectorizer.fit_transform(df_resolved['Cleaned_Query_BOW'])
count_matrix_new = count_vectorizer.transform(df_new['Cleaned_Query_BOW'])

threshold_bow = 0.7 # Initial threshold (BoW scores can be higher than TF-IDF for short, similar texts)
results_bow = _match_by_cosine(
    count_matrix_new, count_matrix_resolved, threshold_bow, df_new, df_resolved
)
df_results_bow, accuracy_bow = _report_matches(
    "BoW Cosine Similarity", results_bow, threshold_bow
)

# Adjust threshold for BoW
print("\n--- BoW with Cosine Similarity (Adjusted Threshold 0.6) ---")
threshold_bow_adj = 0.6
results_bow_adj = _match_by_cosine(
    count_matrix_new, count_matrix_resolved, threshold_bow_adj, df_new, df_resolved
)
df_results_bow_adj, accuracy_bow_adj = _report_matches(
    "BoW Cosine Similarity", results_bow_adj, threshold_bow_adj
)


Resolved Queries (Cleaned for BoW/TF-IDF):
   Query_ID             Cleaned_Query_BOW
0         1       unable connect internet
1         2       payment failed checkout
2         3     app crash opening setting
3         4  forgot password unable reset
4         5     unable upload file server

New Queries (Cleaned for BoW/TF-IDF):
                              Variation_Query  \
0            Unabel to conect to the internet   
1                   Can’t connect to internet   
2                         Intenet not working   
3                Payment failed while chekout   
4   Payment did not go through during chckout   
5                  Payment issue at check out   
6    Application crashes when opening setings   
7            App crash when going to settings   
8            Settings cause the app to chrash   
9               Forgot passwrd and cant reset   
10      "Forgotten password, unable to reset"   
11                  I can’t reset my password   
12             Unable to upl

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


In [8]:
import pandas as pd
import re
from fuzzywuzzy import fuzz
import numpy as np

# Load the datasets
# Both CSVs are read from absolute Kaggle input paths. If either file is missing,
# the except branch rebuilds BOTH frames from the inline literals below, so the
# rest of the cell can still run.
try:
    df_variations = pd.read_csv('/kaggle/input/nlp-assignment-4/name_variations.csv')
    df_base = pd.read_csv('/kaggle/input/nlp-assignment-4/base_names.csv')
except FileNotFoundError:
    # NOTE(review): this message mentions "the same directory", but the paths
    # attempted above are absolute /kaggle/input/... locations.
    print("Ensure 'name_variations.csv' and 'base_names.csv' are in the same directory.")
    # Fallback for demonstration if files are not found
    # 'Variation' and 'Matches_With_Base_Name' are parallel lists: entry i of the
    # second is the expected base name for entry i of the first.
    data_variations = {
        'Variation': [
            'Thomas  King', 'ThomasKing', 'Maria Garcia', 'MaryLewis', 'Nancy W.', 'Dani3l Scott',
            'JOHN  smith', 'linda johnson', 'N@ncy Wright', 'William Davis', 'Susan  Clark', 'SusanClark',
            'Jennifer- Brown', "Michael O'Connor", 'Rodriguez James', 'Elizabeth Wil5on', 'Hill Sandra',
            'THOMAS KING', 'Christoph3r Green', 'Wi11iam Davis', 'linda johnson', 'John tihSm', 'Nancy Wright',
            'Robert L.', 'Christopher Green', 'Thomas King', 'Rob3rt Lee', 'Kar3n Young', 'DAVID  martinez',
            'N@ncy Wright', 'Elizab3th Wilson', 'Maria  Garcia', 'Robert  Lee', 'Nancy Wright', 'Maria G.',
            'S@ndr@ Hill', 'Mary Lew', 'Jessic@ Adams', 'William  Davis', 'Maria  Garcia', 'Jennifer Brown',
            'jennifer brown', 'Mary  Lewis', 'Susan Clark', "Michael  O'Connor", 'Clark Susan', 'Sandra Hill.',
            'Mich@el O\'Connor', 'Christopher  Green', 'Daniel Scott', 'J. Smith', 'Jessica  Adams',
            'Mary Lew', 'Elizab3th Wilson', 'sandra hill', 'linda johnson', 'Adams Jessica', 'Daniel  Scott',
            'PAUL ALLEN', 'Paul Allen.', 'Susan Clark', 'JamesRodriguez', 'Thomas King', 'Linda  Johnson',
            'elizabeth wilson', 'Karen Young', 'MICHAEL O\'CONNOR', 'Linda Johnson', 'Maria  Garcia', 'Sandra Hil',
            'J. Adams', 'Lewis Mary', 'Christopher  Green', 'Rodriguez James', 'christopher green',
            'William  Davis', 'James- Rodriguez', 'Robert L.', 'Rob Lee', 'Kar Young', 'elizabeth wilson',
            'David M.', 'Daniel  Scott', 'D@vid Martinez', 'Jam Rodriguez', 'Karen  Young', 'WilliamDavis',
            'John- Smith', 'John  Smith', 'David Martinez', 'Young Karen', 'Jessica Adams', 'Michael  O\'Connor',
            'paul allen', 'JENNIFER  brown', 'Jennifer- Brown', 'Daniel- Scott', 'David M.', 'Paul Allen.',
            'Paul  Allen'
        ],
        'Matches_With_Base_Name': [
            'Thomas King', 'Thomas King', 'Maria Garcia', 'Mary Lewis', 'Nancy Wright', 'Daniel Scott',
            'John Smith', 'Linda Johnson', 'Nancy Wright', 'William Davis', 'Susan Clark', 'Susan Clark',
            'Jennifer Brown', 'Michael O\'Connor', 'James Rodriguez', 'Elizabeth Wilson', 'Sandra Hill',
            'Thomas King', 'Christopher Green', 'William Davis', 'Linda Johnson', 'John Smith', 'Nancy Wright',
            'Robert Lee', 'Christopher Green', 'Thomas King', 'Robert Lee', 'Karen Young', 'David Martinez',
            'Nancy Wright', 'Elizabeth Wilson', 'Maria Garcia', 'Robert Lee', 'Nancy Wright', 'Maria Garcia',
            'Sandra Hill', 'Mary Lewis', 'Jessica Adams', 'William Davis', 'Maria Garcia', 'Jennifer Brown',
            'Jennifer Brown', 'Mary Lewis', 'Susan Clark', 'Michael O\'Connor', 'Susan Clark', 'Sandra Hill',
            'Michael O\'Connor', 'Christopher Green', 'Daniel Scott', 'John Smith', 'Jessica Adams',
            'Mary Lewis', 'Elizabeth Wilson', 'Sandra Hill', 'Linda Johnson', 'Jessica Adams', 'Daniel Scott',
            'Paul Allen', 'Paul Allen', 'Susan Clark', 'James Rodriguez', 'Thomas King', 'Linda Johnson',
            'Elizabeth Wilson', 'Karen Young', 'Michael O\'Connor', 'Linda Johnson', 'Maria Garcia', 'Sandra Hill',
            'Jessica Adams', 'Mary Lewis', 'Christopher Green', 'James Rodriguez', 'Christopher Green',
            'William Davis', 'James Rodriguez', 'Robert Lee', 'Robert Lee', 'Karen Young', 'Elizabeth Wilson',
            'David Martinez', 'Daniel Scott', 'David Martinez', 'James Rodriguez', 'Karen Young', 'William Davis',
            'John Smith', 'John Smith', 'David Martinez', 'Karen Young', 'Jessica Adams', 'Michael O\'Connor',
            'Paul Allen', 'Jennifer Brown', 'Jennifer Brown', 'Daniel Scott', 'David Martinez', 'Paul Allen',
            'Paul Allen'
        ]
    }
    df_variations = pd.DataFrame(data_variations)

    # The 20 canonical names that every variation above is expected to resolve to.
    data_base = {
        'Base_Name_ID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
        'Base_Name': [
            'John Smith', 'Jennifer Brown', 'Michael O\'Connor', 'Maria Garcia', 'Robert Lee',
            'Linda Johnson', 'William Davis', 'Elizabeth Wilson', 'David Martinez', 'Susan Clark',
            'James Rodriguez', 'Mary Lewis', 'Paul Allen', 'Karen Young', 'Thomas King',
            'Nancy Wright', 'Daniel Scott', 'Sandra Hill', 'Christopher Green', 'Jessica Adams'
        ]
    }
    df_base = pd.DataFrame(data_base)

# Quick look at the first rows of each frame (heading and frame on separate lines).
print("Name Variations (first 5 rows):", df_variations.head(), sep="\n")
print("\nBase Names:", df_base.head(), sep="\n")

# --- Preprocessing Function ---
# Digits/symbols that commonly stand in for a letter in typo'd names.
_LOOKALIKE_CHARS = str.maketrans({
    '0': 'o', '1': 'l', '3': 'e', '4': 'a', '5': 's', '7': 't', '@': 'a',
})


def clean_name(name):
    """Normalise a personal name for fuzzy comparison.

    Steps, in order:
      1. Coerce to ``str`` and lowercase.
      2. Map look-alike characters back to the letter they imitate
         (``3`` -> ``e``, ``@`` -> ``a``, ``1`` -> ``l``, ``5`` -> ``s``, ...)
         so that e.g. ``Dani3l`` becomes ``daniel`` instead of losing a letter.
      3. Turn hyphens and commas into spaces so the tokens they separate stay
         separate.
      4. Delete remaining digits, periods and apostrophes (straight or curly).
      5. Collapse whitespace runs and trim.

    Args:
        name: Value to clean; anything accepted by ``str()``.

    Returns:
        The cleaned, lowercase name.
    """
    name = str(name).lower().translate(_LOOKALIKE_CHARS)
    # Separators become spaces rather than vanishing, so "mary-lewis" keeps two tokens.
    name = re.sub(r'[\-,]', ' ', name)
    # Whatever digits/punctuation are left carry no letter information.
    name = re.sub(r"[0-9\.'’]", '', name)
    name = re.sub(r'\s+', ' ', name).strip() # Replace multiple spaces with one, strip leading/trailing
    return name

# Apply cleaning to both datasets so variations and base names are compared in the same normalised form
df_variations['Cleaned_Variation'] = df_variations['Variation'].map(clean_name)
df_base['Cleaned_Base_Name'] = df_base['Base_Name'].map(clean_name)

print("\nCleaned Name Variations (first 5 rows):", df_variations[['Variation', 'Cleaned_Variation']].head(), sep="\n")
print("\nCleaned Base Names:", df_base[['Base_Name', 'Cleaned_Base_Name']].head(), sep="\n")

# --- Fuzzy Matching Implementation ---
def find_best_fuzzy_match(variation_name, base_names_df, scorer, threshold):
    """Return the base name that scores highest against ``variation_name``.

    Args:
        variation_name: Cleaned name variation to look up.
        base_names_df: DataFrame with ``Cleaned_Base_Name`` (compared) and
            ``Base_Name`` (returned) columns.
        scorer: Callable ``scorer(a, b)`` returning a comparable similarity score.
        threshold: Minimum score for the best candidate to count as a match.

    Returns:
        ``(base_name, score)`` with the original, uncleaned ``Base_Name`` when
        the best score is at least ``threshold``; otherwise ``(None, score)``.
        The score is the best one seen, starting from 0; the first row wins ties.
    """
    top_name, top_score = None, 0

    for _, candidate in base_names_df.iterrows():
        candidate_score = scorer(variation_name, candidate['Cleaned_Base_Name'])
        # Strict comparison keeps the earliest row when scores tie.
        if candidate_score > top_score:
            top_name, top_score = candidate['Base_Name'], candidate_score

    matched_name = top_name if top_score >= threshold else None
    return matched_name, top_score

# Experiment with fuzz.token_set_ratio and different thresholds
def _results_at_threshold(variations_df, best_matches, threshold):
    """Apply one acceptance threshold to pre-computed best matches.

    Args:
        variations_df: Frame with ``Variation``, ``Cleaned_Variation`` and
            ``Matches_With_Base_Name`` columns.
        best_matches: One ``(best_base_name, best_score)`` pair per row of
            ``variations_df``, in row order.
        threshold: Minimum score for the best candidate to be kept as the
            prediction; below it the prediction is ``None``.

    Returns:
        ``(records, df_results, accuracy)``: the list of per-row dicts, the
        same data as a DataFrame, and the fraction of rows whose prediction
        equals ``Matches_With_Base_Name`` (a ``None`` prediction is a miss).
    """
    records = []
    for (_, row), (best_name, best_score) in zip(variations_df.iterrows(), best_matches):
        records.append({
            'Variation': row['Variation'],
            'Cleaned_Variation': row['Cleaned_Variation'],
            'Actual_Base_Name': row['Matches_With_Base_Name'],
            'Predicted_Base_Name': best_name if best_score >= threshold else None,
            'Similarity_Score': best_score
        })
    df_results = pd.DataFrame(records)
    accuracy = (df_results['Actual_Base_Name'] == df_results['Predicted_Base_Name']).mean()
    return records, df_results, accuracy


print("\n--- Fuzzy Matching with fuzz.token_set_ratio ---")
print("Choosing fuzz.token_set_ratio as it is robust to word order and partial matches.")

# The best candidate and its score do not depend on the acceptance threshold,
# so score every variation once (threshold 0 always returns the best candidate)
# and reuse the result for each threshold below.
best_matches = [
    find_best_fuzzy_match(cleaned_variation, df_base, fuzz.token_set_ratio, 0)
    for cleaned_variation in df_variations['Cleaned_Variation']
]

# Test with an initial high threshold (e.g., 85) for names to ensure high confidence
print("\nThreshold = 85:")
threshold = 85
results_high_threshold, df_results_high, accuracy_high = _results_at_threshold(
    df_variations, best_matches, threshold
)
print(df_results_high.to_string())
print(f"\nAccuracy (Threshold={threshold}): {accuracy_high:.2f}")

# Test with a slightly lower threshold (e.g., 75) to capture more variations
print("\n--- Fuzzy Matching with fuzz.token_set_ratio ---")
print("Threshold = 75 (to capture more variations including initials/typos):")
threshold = 75
results_medium_threshold, df_results_medium, accuracy_medium = _results_at_threshold(
    df_variations, best_matches, threshold
)
print(df_results_medium.to_string())
print(f"\nAccuracy (Threshold={threshold}): {accuracy_medium:.2f}")

# Test with an even lower threshold (e.g., 60) for very loose matching
print("\n--- Fuzzy Matching with fuzz.token_set_ratio ---")
print("Threshold = 60 (very loose matching):")
threshold = 60
results_low_threshold, df_results_low, accuracy_low = _results_at_threshold(
    df_variations, best_matches, threshold
)
print(df_results_low.to_string())
print(f"\nAccuracy (Threshold={threshold}): {accuracy_low:.2f}")

Name Variations (first 5 rows):
      Variation Matches_With_Base_Name
0  Thomas  King            Thomas King
1    ThomasKing            Thomas King
2  Maria Garcia           Maria Garcia
3     MaryLewis             Mary Lewis
4      Nancy W.           Nancy Wright

Base Names:
   Base_Name_ID         Base_Name
0             1        John Smith
1             2    Jennifer Brown
2             3  Michael O'Connor
3             4      Maria Garcia
4             5        Robert Lee

Cleaned Name Variations (first 5 rows):
      Variation Cleaned_Variation
0  Thomas  King       thomas king
1    ThomasKing        thomasking
2  Maria Garcia      maria garcia
3     MaryLewis         marylewis
4      Nancy W.           nancy w

Cleaned Base Names:
          Base_Name Cleaned_Base_Name
0        John Smith        john smith
1    Jennifer Brown    jennifer brown
2  Michael O'Connor   michael oconnor
3      Maria Garcia      maria garcia
4        Robert Lee        robert lee

--- Fuzzy Matching wit