In [None]:
import pandas as pd
import re
from fuzzywuzzy import fuzz

# Read the source spreadsheet export into a DataFrame
data = pd.read_csv("C:\\Users\\HP\\OneDrive\\Desktop\\Book1.csv", encoding='ISO-8859-1')  # Replace with your file path

# Header cells may carry stray padding; strip it so the column lookups below work
data.columns = data.columns.str.strip()

# Strip leading/trailing whitespace from the two name columns that get compared
for name_column in ('NIBSS FullName', 'ACCT_NAME'):
    data[name_column] = data[name_column].str.strip()



In [None]:
# Function to clean up names by removing unwanted punctuation and special characters
def clean_name(name):
    """Return ``name`` with disallowed characters removed and outer whitespace trimmed.

    Only ASCII letters, digits, whitespace, periods and hyphens are kept; every
    other character is deleted (not replaced by a space).

    Args:
        name: The raw name. Normally a ``str``; ``None`` and float NaN (what
            pandas produces for an empty CSV cell) are treated as an empty
            name, and any other non-string value is converted with ``str()``.

    Returns:
        The cleaned name as a ``str`` (``""`` for missing input).
    """
    if not isinstance(name, str):
        # `name != name` is only true for NaN; without this guard re.sub would
        # raise TypeError on the first blank cell and abort the whole run.
        if name is None or (isinstance(name, float) and name != name):
            return ""
        name = str(name)

    cleaned_name = re.sub(r'[^a-zA-Z0-9\s.-]', '', name)  # Allow alphanumeric characters, spaces, hyphens, and periods
    return cleaned_name.strip()




In [None]:
# Terms whose presence in either name marks the pair as a business account.
# Every entry is lowercase because it is compared against lowercased names;
# multi-word entries are matched as whole phrases, not as single tokens.
_BUSINESS_TERMS = (
    "church", "enterprise", "company", "lodge", "family", "society", "foundation", "youth",
    "services", "visa", "prepaid", "fbn", "est.", "limited", "ltd", "client", "clients",
    "admin", "union", "club", "cathedral", "port harcourt", "ministries", "assemblies",
    "league", "first bank", "hospital", "age grade", "local govt", "local government",
    "lga account", "anglican", "zone", "investment", "institute", "nipost", "ass",
    "association", "school", "nig", "technology", "technologies", "nigeria",
)


def _has_business_term(normalized_name):
    """Return True if any business term occurs as a whole word or phrase in the name."""
    # Collapse whitespace runs and pad with spaces so that " term " only matches
    # on token boundaries ("ass" must not match inside "bassey").
    padded = " " + " ".join(normalized_name.split()) + " "
    return any(" " + term + " " in padded for term in _BUSINESS_TERMS)


# Function to classify names based on user-defined rules
def classify_name(nibss_name, acct_name):
    """Classify how an account name relates to the corresponding NIBSS name.

    Both names are cleaned with ``clean_name``, lowercased, and have hyphens
    replaced by spaces before the rules are evaluated in this order:

    1. "Business"       - either name contains a business term (word or phrase).
    2. "Misarrangement" - ``fuzz.token_sort_ratio`` is exactly 100.
    3. "Spelling"       - ``fuzz.token_set_ratio`` is in the range [60, 100).
    4. "Somewhat"       - the names share at least one whole word.
    5. "X"              - none of the above.

    Args:
        nibss_name: Name as recorded by NIBSS.
        acct_name: Name on the account.

    Returns:
        One of "Business", "Misarrangement", "Spelling", "Somewhat" or "X".
    """
    # Clean the names
    cleaned_nibss_name = clean_name(nibss_name)
    cleaned_acct_name = clean_name(acct_name)

    # Convert both names to lowercase for case-insensitive comparison
    normalized_nibss_name = cleaned_nibss_name.lower().replace('-', ' ')
    normalized_acct_name = cleaned_acct_name.lower().replace('-', ' ')

    # Rule 5: Check for presence of business-like terms in any name.
    # Matching is done on the normalized string so that multi-word terms such
    # as "first bank" can match; a per-token set lookup can never find them.
    if _has_business_term(normalized_nibss_name) or _has_business_term(normalized_acct_name):
        return "Business"

    # Rule 3: Misarrangement - check if names are the same but ordered differently
    # NOTE(review): two identical names also score 100 here and are therefore
    # labelled "Misarrangement" - confirm that is the intended label.
    token_sort_ratio = fuzz.token_sort_ratio(normalized_nibss_name, normalized_acct_name)
    if token_sort_ratio == 100:
        return "Misarrangement"

    # Rule 1: Spelling - similar but not a perfect token-set match
    token_set_ratio = fuzz.token_set_ratio(normalized_nibss_name, normalized_acct_name)
    if 60 <= token_set_ratio < 100:
        return "Spelling"

    # Rule 2: Somewhat - check if at least one word matches
    nibss_words = set(normalized_nibss_name.split())
    acct_words = set(normalized_acct_name.split())
    if nibss_words.intersection(acct_words):
        return "Somewhat"

    # Rule 4: X - no correlation
    return "X"


In [None]:
# Classify every row by comparing its NIBSS name with its account name
def _classify_row(row):
    """Run classify_name on the two name fields of a single DataFrame row."""
    return classify_name(row['NIBSS FullName'], row['ACCT_NAME'])


data['Classification'] = data.apply(_classify_row, axis=1)


In [None]:
# Write the classified dataset back to disk (row index is not exported)
output_csv_path = 'C:\\Users\\HP\\OneDrive\\Desktop\\output_file4.csv'  # Replace with your desired output file path
data.to_csv(output_csv_path, index=False)
