In [1]:
import pandas as pd
import numpy as np
import re
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from collections import defaultdict

In [2]:
# Both sheets live in the same workbook; passing a list of sheet names makes pandas
# open and parse the file once and return a {sheet_name: DataFrame} mapping.
WORKBOOK_PATH = "Unmatched_Data.xlsx"
SAIBA_SHEET = "Saiba_Dump"
LOMBARD_SHEET = "Lombard_Statement"

workbook_sheets = pd.read_excel(WORKBOOK_PATH, sheet_name=[SAIBA_SHEET, LOMBARD_SHEET])
data1 = workbook_sheets[SAIBA_SHEET]      # rows from the "Saiba_Dump" sheet
data2 = workbook_sheets[LOMBARD_SHEET]    # rows from the "Lombard_Statement" sheet

In [3]:
# Preprocess name function

# Honorifics and legal-form tokens stripped while punctuation is still in place.
# Lookarounds are used instead of \b so that tokens which start or end with
# punctuation ("m/s", "P.L.C.") are matched as whole tokens. The dots in the PLC
# alternative are literal and optional, so "P.L.C.", "P.L.C" and "PLC" are all
# removed, while ordinary words such as "Palace" are left alone.
_TITLE_PATTERN = re.compile(
    r'(?<!\w)(?:p\.?l\.?c\.?|m/s|mr|ms|ltd|llp|pvt|private|limited|llc|lp|pllp|pvtltd)(?!\w)',
    flags=re.IGNORECASE,
)

# Generic business words removed after dots and commas have been dropped
# (so that e.g. "L.t.d" -> "Ltd" is still caught on this second pass).
_WORDS_TO_OMIT = (
    'industry', 'industries', 'corp', 'corporation', 'inc', 'incorporated', 'foundation',
    'company', 'co', 'limited', 'ltd', 'pvt', 'llc', 'llp', 'and', 'pvtltd', 'm/s', 'ms',
)
_OMIT_PATTERN = re.compile(
    r'(?<!\w)(?:' + '|'.join(re.escape(word) for word in _WORDS_TO_OMIT) + r')(?!\w)',
    flags=re.IGNORECASE,
)


def preprocess_name(name):
    """Normalise a customer name into a compact lowercase key for fuzzy matching.

    Steps, in order:
      1. Missing values (None / NaN / NaT) become the empty string.
      2. Honorifics and legal-form tokens (Mr, Ms, M/s, Ltd, Pvt, Private, Limited,
         LLC, LLP, LP, PLLP, PLC, ...) are removed as whole tokens.
      3. Every "&" is treated as a separator, which makes "A & B", "A&B" and
         "A and B" normalise to the same key (the word "and" is removed in step 5).
      4. Dots and commas are deleted.
      5. Generic business words (industry, corp, inc, company, co, and, ...) are
         removed as whole tokens.
      6. All whitespace is removed and the result is lowercased.

    Parameters
    ----------
    name : Any
        Raw name value; non-strings are converted with ``str``.

    Returns
    -------
    str
        The normalised key, possibly empty.
    """
    if pd.isna(name):
        return ""
    name = str(name)
    # Strip titles / legal suffixes before punctuation is removed, so "Co.Ltd" still
    # splits into two removable tokens.
    name = _TITLE_PATTERN.sub('', name)
    # "&" is not a word character, so a \b-anchored pattern cannot match it when it
    # is surrounded by spaces; replace it unconditionally instead.
    name = name.replace('&', ' ')
    # Remove dots (.) and commas (,)
    name = re.sub(r'[.,]', '', name)
    # Remove generic business words
    name = _OMIT_PATTERN.sub('', name)
    # Remove spaces and make lowercase
    return re.sub(r'\s+', '', name).lower()

# Compute similarity function
def _indices_by_normalised_name(frame, name_column):
    """Map each non-empty normalised name in `name_column` to the `Index` values of its rows."""
    lookup = defaultdict(list)
    normalised_names = frame[name_column].apply(preprocess_name).tolist()
    for normalised, row_index in zip(normalised_names, frame['Index'].tolist()):
        # Blank / missing names carry no information and must never be matched.
        if normalised:
            lookup[normalised].append(row_index)
    return lookup


def compute_similarity(data1, data2, threshold=71):
    """Fuzzy-match the customer names of two sheets.

    Names are normalised with ``preprocess_name`` and every distinct normalised name
    from ``data1['CustName']`` is compared against every distinct normalised name from
    ``data2['INSURED_CUSTOMER_NAME']`` using ``fuzz.ratio``.

    Parameters
    ----------
    data1 : pandas.DataFrame
        Must contain the columns ``CustName`` and ``Index``.
    data2 : pandas.DataFrame
        Must contain the columns ``INSURED_CUSTOMER_NAME`` and ``Index``.
    threshold : int, default 71
        Minimum ``fuzz.ratio`` score for a pair to be kept.

    Returns
    -------
    results : dict
        ``{score: [(name1, name2), ...]}`` for every pair scoring at least ``threshold``.
    index_dict_1 : defaultdict(list)
        Normalised ``data1`` name -> ``Index`` values of the rows carrying that name
        (only names that took part in at least one match).
    index_dict_2 : defaultdict(list)
        Same as ``index_dict_1`` for ``data2``.

    Notes
    -----
    * Each column is normalised exactly once, not once per matching pair.
    * Missing names are normalised to "" and skipped; they are no longer converted to
      the literal string "nan" and compared against real names.
    * Raw names that normalise to the same key are compared once, and each row
      ``Index`` appears once in the returned lists.
    """
    rows_by_name_1 = _indices_by_normalised_name(data1, 'CustName')
    rows_by_name_2 = _indices_by_normalised_name(data2, 'INSURED_CUSTOMER_NAME')

    results = {}
    index_dict_1 = defaultdict(list)
    index_dict_2 = defaultdict(list)

    # Dict iteration preserves first-appearance order, so pairs are emitted in the
    # same relative order as the rows of the input sheets.
    for name1, rows1 in rows_by_name_1.items():
        for name2, rows2 in rows_by_name_2.items():
            similarity = fuzz.ratio(name1, name2)
            if similarity < threshold:
                continue
            results.setdefault(similarity, []).append((name1, name2))
            # Assign (not extend) so repeated matches do not duplicate indices.
            index_dict_1[name1] = list(rows1)
            index_dict_2[name2] = list(rows2)

    return results, index_dict_1, index_dict_2

In [4]:
# Compute similarity with a threshold of 71%
similarity_dict, index_dict_1, index_dict_2 = compute_similarity(data1, data2, threshold=71)

# Sort the similarity_dict by keys in descending order (best matches first)
sorted_similarity_dict = dict(sorted(similarity_dict.items(), key=lambda item: item[0], reverse=True))

name_data1 = []      # matched normalised names from data1, best score first
name_data2 = []      # the corresponding normalised names from data2
index_list_1 = []    # data1 Index values behind each matched name (repeats possible)
index_list_2 = []    # data2 Index values behind each matched name (repeats possible)

# Collect results: one {data1_index: data2_index} dict per distinct row pairing,
# in order of first appearance. A set of already-seen (i, j) tuples replaces the
# linear "dict not in list" scan, which made this loop quadratic in the pair count.
indexPairs = []
seen_index_pairs = set()
for similarity, pairs in sorted_similarity_dict.items():
    for name1, name2 in pairs:
        for i in index_dict_1[name1]:
            for j in index_dict_2[name2]:
                if (i, j) not in seen_index_pairs:
                    seen_index_pairs.add((i, j))
                    indexPairs.append({i: j})
        name_data1.append(name1)
        name_data2.append(name2)
        index_list_1.extend(index_dict_1[name1])
        index_list_2.extend(index_dict_2[name2])

In [5]:
def sort_dicts_by_numeric_key(lst):
    """Return a new list of dicts ordered by the number embedded in each first key.

    The first key of every dict is expected to be one prefix character followed by
    digits (e.g. 'S219'); the dicts are sorted ascending by that integer. The sort
    is stable and the input is not modified.
    """
    def numeric_suffix(entry):
        first_key = [*entry][0]       # e.g. 'S219'
        return int(first_key[1:])     # -> 219

    ordered = list(lst)
    ordered.sort(key=numeric_suffix)
    return ordered

In [6]:
# Order the {data1_index: data2_index} pairs by the numeric part of the data1 index
# (the dict key), so the matches can be read in data1 row-id order.
sorted_list = sort_dicts_by_numeric_key(indexPairs)
# sorted_list  # uncomment to display the ordered pairs

In [7]:
# Disabled: keep only the rows of each sheet whose Index took part in a fuzzy match.
#filtered_data1 = data1[data1['Index'].isin(index_list_1)]
#filtered_data2 = data2[data2['Index'].isin(index_list_2)]