# In [None]:
# WORKING GREAT NO ID GENERATION 02-18-2025

import difflib
import re
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
import unicodedata
from unidecode import unidecode
from ftfy import fix_text

# Trailing credentials / generational markers that clean_name strips from the
# end of a name (matched case-insensitively). Listed alphabetically.
SUFFIXES = {
    'ACAS', 'AIA', 'ASA',
    'BA', 'BEng', 'BS', 'BSc',
    'CBE', 'CFA', 'CFP', 'CGMA', 'ChFC', 'CLU', 'CMgr', 'CPA',
    'DBE', 'DC', 'DCL', 'DDS', 'DEng', 'DLitt', 'DM', 'DMin', 'DNP', 'DO',
    'DPA', 'DPhil', 'DPT', 'DSc', 'DVM',
    'EdD', 'Emeritus', 'Esq',
    'FAAN', 'FACS', 'FAIA', 'FCA', 'FCAS', 'FCMA', 'FIA', 'FRCOG', 'FRCP',
    'FRCPsych', 'FRCS', 'FRSC',
    'II', 'III', 'IV',
    'JD', 'Jr',
    'KBE',
    'LLB', 'LLM',
    'MA', 'MActSc', 'MBA', 'MBE', 'MD', 'MEng', 'MFA', 'MLIS', 'MPH', 'MS',
    'MSc', 'MTh',
    'NP',
    'OBE',
    'PE', 'PhD', 'PsyD',
    'Ret', 'RN', 'RPh',
    'Sr',
    'ThD',
    'USA', 'USAF', 'USN',
    'V', 'VI',
}

# Leading titles / honorifics that clean_name strips from the start of a name
# (matched case-insensitively). Listed alphabetically.
PREFIXES = {
    'Abbess', 'Abbot', 'Adm', 'Air Chief Marshal', 'Air Marshal',
    'Air Vice Marshal', 'Ambassador', 'Archbishop', 'Attorney', 'Ayatollah',
    'Bishop', 'Brigadier', 'Brother',
    'Canon', 'Capt', 'Cardinal', 'Chairman', 'Chairwoman', 'Chancellor',
    'Chaplain', 'Col', 'Commander', 'Commodore', 'Cpl',
    'Dame', 'Deacon', 'Dean', 'Dr',
    'Elder', 'Esq',
    'Father', 'Field Marshal', 'Fleet Admiral', 'Fr',
    'Gen', 'General of the Army', 'Governor', 'Grand Master',
    'Hon',
    'Imam',
    'Judge', 'Justice',
    'Lady', 'Lord', 'Lt',
    'Maj', 'Marshal', 'Master', 'Metropolitan', 'Minister', 'Miss', 'Mother',
    'Mr', 'Mrs', 'Ms', 'Mullah', 'Mx',
    'Pastor', 'Pope', 'President', 'Principal', 'Prior', 'Prof', 'Professor',
    'Provost',
    'Rabbi', 'Rear Admiral', 'Regent', 'Rev', 'Reverend Dr',
    'Senator', 'Sgt', 'Sheikh', 'Sir', 'Sister', 'Sovereign', 'Sr',
    'Supreme Commander',
    'The Honorable',
    'Venerable', 'Vice Admiral',
    'Warden',
}

def clean_name(name):
    """Normalise a personal name so that spelling variants compare equal.

    Steps, in order: repair mojibake with ``fix_text``; drop combining
    accents; transliterate with ``unidecode``; rewrite a lone capital letter
    followed by whitespace as ``X.`` so initials have one spelling; strip
    leading ``PREFIXES`` and trailing ``SUFFIXES`` (case-insensitively);
    replace punctuation other than periods and name-internal hyphens with
    spaces; collapse whitespace and title-case the result.

    Titles and credentials are removed repeatedly until none is left, so
    stacked forms such as ``Dr. Prof. X`` or ``X, PhD, MD`` are stripped
    completely and the result does not depend on the order in which the
    sets happen to be iterated.

    Args:
        name: Raw name string.

    Returns:
        The cleaned, title-cased name. May be an empty string when the input
        consisted only of titles and/or punctuation.
    """
    def strip_repeatedly(pattern, text):
        # Each substitution removes one title; repeat until nothing matches.
        # Every successful match consumes at least the title itself, so the
        # text strictly shrinks and the loop terminates.
        while True:
            shorter = re.sub(pattern, '', text, count=1)
            if shorter == text:
                return text
            text = shorter

    def alternation(words):
        # Longest first so multi-word titles win over their own prefixes;
        # the alphabetical tie-break keeps the pattern identical across runs.
        ordered = sorted(words, key=lambda word: (-len(word), word))
        return '|'.join(re.escape(word) for word in ordered)

    # Fix multi-layer encoding errors
    name = fix_text(name, normalization='NFC')

    # Decompose, then drop combining marks (accents)
    name = unicodedata.normalize('NFKD', name)
    name = ''.join(c for c in name if not unicodedata.combining(c))

    # Transliterate any remaining non-ASCII characters
    name = unidecode(name)

    # Standardize initials: "J Smith" / "J. Smith" -> "J. Smith"
    name = re.sub(r'\b([A-Z])[.]?\s+', r'\1. ', name)

    # Remove prefixes/suffixes.
    # NOTE(review): several entries are also ordinary names ("Dean", "Ma",
    # "Do", ...) and are stripped just the same; confirm that is acceptable
    # for the data being processed.
    if PREFIXES:
        name = strip_repeatedly(
            fr'(?i)^\s*(?:{alternation(PREFIXES)})\b[.,]?\s*', name)
    if SUFFIXES:
        name = strip_repeatedly(
            fr'(?i)\s*\b(?:{alternation(SUFFIXES)})[.,]?\s*$', name)

    # Final cleanup: keep word characters, whitespace, periods (initials) and
    # hyphens (compound names); everything else becomes a space.
    name = re.sub(r'[^\w\s.-]', ' ', name)
    # A hyphen only counts as part of a name when it joins two word
    # characters; stray or dangling hyphens are treated as separators.
    name = re.sub(r'(?<!\w)-+|-+(?!\w)', ' ', name)
    name = re.sub(r'\s+', ' ', name).strip().title()

    return name

def _is_initial(part):
    """Return True when *part* is a single letter once periods are removed."""
    letters = part.replace('.', '').strip()
    return len(letters) == 1 and letters.isalpha()


def _base_first(first):
    """Return the portion of a first name that precedes its first hyphen."""
    return first.split('-')[0].strip()


def _parse_name(name):
    """Clean *name* and split it into first / middle / last components."""
    cleaned = clean_name(name)
    # Split on whitespace, but not directly after a hyphen
    parts = re.split(r'(?<!-)\s+(?![a-z])', cleaned)
    if len(parts) == 1:
        # A lone token is filed under its surname only. Using it as the
        # first name as well would double it in the canonical name
        # ("Madonna" -> "Madonna Madonna").
        return {'original': name, 'first': '', 'middles': [], 'last': parts[0]}
    return {
        'original': name,
        'first': parts[0],
        'middles': parts[1:-1],
        'last': parts[-1],
    }


def _standardize_group(last, group):
    """Return ``(key, original)`` pairs for every parsed name sharing *last*.

    ``key`` is ``(first, middles, last)`` after expanding initials to the
    longest matching full name found in the same group.
    """
    # Longest full spelling per base first name ("Mary" -> "Mary-Jane")
    full_firsts = {}
    first_initials = set()
    for person in group:
        base = _base_first(person['first'])
        if _is_initial(base):
            first_initials.add(base.replace('.', '').upper())
        elif base not in full_firsts or len(person['first']) > len(full_firsts[base]):
            full_firsts[base] = person['first']

    # First-name initial -> longest full first name starting with that letter
    first_initial_map = {}
    for initial in first_initials:
        candidates = [full for base, full in full_firsts.items()
                      if base.upper().startswith(initial)]
        if candidates:
            first_initial_map[initial] = max(candidates, key=len)

    std_firsts = []
    for person in group:
        base = _base_first(person['first'])
        if _is_initial(base):
            initial = base.replace('.', '').upper()
            std_firsts.append(first_initial_map.get(initial, person['first']))
        else:
            std_firsts.append(full_firsts.get(base, person['first']))

    # Longest full middle name per (standardized first name, initial).
    # Keying on the first name keeps one person's middle name from being
    # applied to a different person who merely shares the surname.
    full_middles = {}
    for person, std_first in zip(group, std_firsts):
        for middle in person['middles']:
            if _is_initial(middle):
                continue
            slot = (std_first, middle[0].upper())
            if len(middle) > len(full_middles.get(slot, '')):
                full_middles[slot] = middle

    keyed = []
    for person, std_first in zip(group, std_firsts):
        std_middles = []
        for middle in person['middles']:
            if _is_initial(middle):
                initial = middle.replace('.', '').upper()
                std_middles.append(full_middles.get((std_first, initial), middle))
            else:
                std_middles.append(middle)
        key = (
            std_first.strip().title(),
            ' '.join(std_middles).strip(),
            last.strip().title(),
        )
        keyed.append((key, person['original']))
    return keyed


def _merge_by_initials(grouped):
    """Merge keys that share base first name, surname and middle initials.

    The first key seen for a ``(base_first, last)`` pair becomes the anchor.
    A later key is folded into it only when both have the same set of middle
    initials; otherwise it stays a separate group. When the folded key has
    more middle tokens it replaces the anchor's key. Each resulting group is
    returned exactly once, as a list of ``{'key', 'variations'}`` records.
    """
    anchors = {}
    records = []
    for key, variations in grouped.items():
        first, middles, last = key
        tokens = middles.split()
        initials = {token[0].upper() for token in tokens}
        base_key = (_base_first(first), last)

        anchor = anchors.get(base_key)
        if anchor is None:
            anchor = {
                'key': key,
                'variations': list(variations),
                'tokens': tokens,
                'initials': initials,
            }
            anchors[base_key] = anchor
            records.append(anchor)
        elif initials == anchor['initials']:
            anchor['variations'].extend(variations)
            # Prefer the key that carries more middle-name tokens
            if len(tokens) > len(anchor['tokens']):
                anchor['key'] = key
                anchor['tokens'] = tokens
        else:
            records.append({'key': key, 'variations': list(variations)})
    return records


def group_names(names):
    """Group spelling variants of the same person using rule-based matching.

    Each name is cleaned with ``clean_name`` and split into first, middle and
    last parts. Within a surname, first-name and middle-name initials are
    expanded to the longest matching full name (middle names only among
    entries with the same standardized first name), and groups that differ
    only in how their middle names are written are merged.

    Args:
        names: Iterable of raw name strings.

    Returns:
        A list of dicts sorted by ``'canonical_name'``. Each dict has
        ``'canonical_name'`` (the cleaned representative name) and
        ``'variations'`` (sorted, de-duplicated original names in the group).
    """
    by_last = {}
    for name in names:
        parsed = _parse_name(name)
        by_last.setdefault(parsed['last'], []).append(parsed)

    grouped = {}
    for last, group in by_last.items():
        for key, original in _standardize_group(last, group):
            grouped.setdefault(key, []).append(original)

    final_groups = []
    for record in _merge_by_initials(grouped):
        first, middles, last = record['key']
        # Re-cleaning also trims the leading space left by an empty first name
        canonical = clean_name(' '.join([first, *middles.split(), last]))
        final_groups.append({
            'canonical_name': canonical,
            'variations': sorted(set(record['variations'])),
        })

    final_groups.sort(key=lambda entry: entry['canonical_name'])
    return final_groups

def cluster_protected_names(names, eps=0.2):  # Reduced eps for tighter clusters
    """Cluster names with DBSCAN while keeping rule-based groups together.

    Names that ``group_names`` assigns the same canonical name get distance
    0. All other pairs get ``1 - SequenceMatcher.ratio()`` computed on their
    cleaned canonical names.

    Args:
        names: Sequence of raw name strings (indexable, supports ``len``).
        eps: DBSCAN neighbourhood radius on the 0..1 distance scale.

    Returns:
        ``(cluster_dict, rule_groups)`` where ``cluster_dict`` maps each
        DBSCAN label to the list of original names in that cluster and
        ``rule_groups`` is the output of ``group_names(names)``.
    """
    rule_groups = group_names(names)

    n = len(names)
    if n == 0:
        # DBSCAN rejects an empty matrix; there is nothing to cluster anyway
        return {}, rule_groups

    # Create name to canonical mapping
    name_to_canonical = {
        variation: group['canonical_name']
        for group in rule_groups
        for variation in group['variations']
    }
    canonicals = [name_to_canonical[name] for name in names]

    # Clean each distinct canonical name once instead of once per pair
    cleaned = {canonical: clean_name(canonical) for canonical in set(canonicals)}

    # Build distance matrix using canonical names. Ratios are cached per
    # ordered pair because SequenceMatcher.ratio() may depend on argument order.
    distance_matrix = np.ones((n, n))
    ratio_cache = {}
    for i, canon_i in enumerate(canonicals):
        for j, canon_j in enumerate(canonicals):
            if canon_i == canon_j:
                # Same rule group (this also covers the diagonal)
                distance_matrix[i, j] = 0.0
                continue
            pair = (canon_i, canon_j)
            if pair not in ratio_cache:
                ratio_cache[pair] = difflib.SequenceMatcher(
                    None, cleaned[canon_i], cleaned[canon_j]).ratio()
            distance_matrix[i, j] = 1 - ratio_cache[pair]

    # Cluster with DBSCAN; min_samples=1 means every name gets a cluster
    dbscan = DBSCAN(eps=eps, min_samples=1, metric='precomputed')
    clusters = dbscan.fit_predict(distance_matrix)

    # Organize clusters
    cluster_dict = {}
    for idx, cluster_id in enumerate(clusters):
        cluster_dict.setdefault(cluster_id, []).append(names[idx])

    return cluster_dict, rule_groups

def create_attribution_df(rule_groups):
    """Flatten rule groups into a table with one row per original name.

    Args:
        rule_groups: Iterable of dicts with ``'canonical_name'`` and
            ``'variations'`` keys, as produced by ``group_names``.

    Returns:
        A DataFrame with columns ``'Original Name'``, ``'Attributed Name'``
        (the group's canonical name) and ``'Cleaned Name'`` (the original
        passed through ``clean_name``, for verification).
    """
    rows = []
    for entry in rule_groups:
        attributed = entry['canonical_name']
        rows.extend(
            {
                'Original Name': original,
                'Attributed Name': attributed,
                'Cleaned Name': clean_name(original),  # For verification
            }
            for original in entry['variations']
        )
    return pd.DataFrame(rows)

if __name__ == '__main__':
    # Load names (make sure path is correct)
    name_column = pd.read_excel("E:\\Students\\Ace\\NameMatcherClass\\trial_input_1.xlsx")['name']

    # Blank cells are read as NaN floats and numeric cells as numbers; the
    # Unicode normalisation in clean_name needs str, so drop the blanks and
    # coerce everything else to text before processing.
    names = name_column.dropna().astype(str).tolist()

    # Process names
    clusters, rule_groups = cluster_protected_names(names, eps=0.2)

    # Build the attribution table from the rule groups (not the DBSCAN clusters)
    result_df = create_attribution_df(rule_groups)

    # Print results
    print("Name Attribution Results:")
    print(result_df.head())

    # Save to Excel
    result_df.to_excel("E:\\Students\\Ace\\NameMatcherClass\\trial_output_0.2.xlsx", index=False)
    print("\nSaved Output to Excel!")