In [1]:
from collections import defaultdict
from itertools import combinations

import pandas as pd

def read():
    """Load the two input CSV files.

    Returns:
        A 2-tuple of DataFrames: the contents of ``merged_profiles.csv``
        followed by the contents of ``personnel.csv``.
    """
    input_paths = (
        "../data/input/merged_profiles.csv",
        "../data/input/personnel.csv",
    )
    merged_profiles, personnel = (pd.read_csv(path) for path in input_paths)
    return merged_profiles, personnel

# Load both input tables.
dfa, dfb = read()

# Tag each table with its origin and fill the columns set here with the
# "n/a" placeholder (presumably columns only the other table carries --
# verify against the CSV schemas).
dfa["source"] = "wci"
dfa["agency"] = "n/a"

dfb["source"] = "llead"
dfb["officer_role"] = "n/a"
dfb["officer_context"] = "n/a"

df = pd.concat([dfa, dfb])

# Normalise the name columns: lower-case, then trim surrounding whitespace.
for name_col in ("first_name", "last_name"):
    df.loc[:, name_col] = df[name_col].str.lower().str.strip()

# Initials of the first and last name ("" when the name is missing).
for initial_col, name_col in (("fc", "first_name"), ("lc", "last_name")):
    df.loc[:, initial_col] = df[name_col].fillna("").map(lambda x: x[:1])

# Keep only the columns used from here on.
kept_columns = ["first_name", "last_name", "fc", "lc", "source", "uid", "agency"]
df = df[kept_columns]

print(f"DF SHAPE BEFORE {df.shape}")

# One row per uid (the first occurrence wins), then renumber the rows 0..n-1.
df = df.drop_duplicates(subset=["uid"]).reset_index(drop=True)

print(f"DF SHAPE AFTER {df.shape}")

df.loc[:, "full_name"] = df.first_name.str.cat(df.last_name, sep=" ")

# Discard rows whose full name is missing or empty; str.cat yields a missing
# value when either part is missing.
has_full_name = df.full_name.fillna("") != ""
df = df[has_full_name]

full_names = df.first_name.str.cat(df.last_name, sep=" ")


def jaro_winkler_similarity(s1, s2):
    """Return the Jaro-Winkler similarity of two strings.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        A float between 0.0 and 1.0. Identical non-empty strings score 1.0;
        0.0 is returned when either argument is empty/falsy or when the
        strings share no matching characters.
    """
    if not s1 or not s2:
        return 0.0

    len1, len2 = len(s1), len(s2)

    # Characters match only if they sit within this many positions of each
    # other. Clamp at zero: for two one-character strings the raw formula
    # yields -1, which makes the search window empty and would score
    # identical inputs such as "a" vs "a" as 0.0.
    match_distance = max(0, max(len1, len2) // 2 - 1)

    s1_matched = [False] * len1
    s2_matched = [False] * len2
    matches = 0

    for i, ch in enumerate(s1):
        start = max(0, i - match_distance)
        end = min(i + match_distance + 1, len2)
        for j in range(start, end):
            if not s2_matched[j] and s2[j] == ch:
                s1_matched[i] = s2_matched[j] = True
                matches += 1
                break

    if matches == 0:
        return 0.0

    # Walk the matched characters of both strings in order; every position
    # where they disagree counts as one (half-)transposition.
    s1_hits = [ch for ch, hit in zip(s1, s1_matched) if hit]
    s2_hits = [ch for ch, hit in zip(s2, s2_matched) if hit]
    transpositions = sum(a != b for a, b in zip(s1_hits, s2_hits))

    jaro = (
        matches / len1
        + matches / len2
        + (matches - transpositions / 2) / matches
    ) / 3

    # Length of the common prefix, capped at 4 characters.
    prefix = 0
    for a, b in zip(s1, s2):
        if a != b or prefix == 4:
            break
        prefix += 1

    # Winkler adjustment: boost the Jaro score for a shared prefix.
    return jaro + (0.1 * prefix * (1 - jaro))

def custom_matcher(df, threshold=0.80):
    """Find likely-matching name pairs across different sources.

    Rows are grouped by the pair of initials ``(fc, lc)`` and only rows in
    the same group are compared. Within a group, every pair of rows whose
    ``source`` values differ is scored as the mean of the Jaro-Winkler
    similarities of their first names and of their last names.

    Args:
        df: DataFrame with the columns ``first_name``, ``last_name``, ``fc``,
            ``lc``, ``source`` and ``agency``. Its index labels are reported
            as ``row_key1`` / ``row_key2``.
        threshold: Minimum mean similarity (inclusive) for a pair to be
            reported. Defaults to 0.80.

    Returns:
        A DataFrame with one row per reported pair, sorted by ``sim_score``
        in descending order. When no pair qualifies, an empty DataFrame with
        the same columns is returned.
    """
    result_columns = [
        'pair_idx', 'sim_score',
        'row_key1', 'first_name1', 'last_name1', 'fc1', 'source1', 'agency1',
        'row_key2', 'first_name2', 'last_name2', 'fc2', 'source2', 'agency2',
    ]

    # Group records by first-name initial and last-name initial so that only
    # rows sharing both initials are ever compared.
    groups = defaultdict(list)
    for row_key, record in zip(df.index, df.to_dict("records")):
        groups[(record['fc'], record['lc'])].append((row_key, record))

    results = []
    for group in groups.values():
        # combinations() yields each unordered pair once, in input order.
        for (key1, rec1), (key2, rec2) in combinations(group, 2):
            # Only cross-source pairs are of interest.
            if rec1['source'] == rec2['source']:
                continue

            first_name_sim = jaro_winkler_similarity(rec1['first_name'], rec2['first_name'])
            last_name_sim = jaro_winkler_similarity(rec1['last_name'], rec2['last_name'])
            sim_score = (first_name_sim + last_name_sim) / 2

            if sim_score < threshold:
                continue

            results.append({
                # Sequential id in the order the pairs were found.
                'pair_idx': len(results),
                'sim_score': sim_score,
                'row_key1': key1,
                'first_name1': rec1['first_name'],
                'last_name1': rec1['last_name'],
                'fc1': rec1['fc'],
                'source1': rec1['source'],
                'agency1': rec1['agency'],
                'row_key2': key2,
                'first_name2': rec2['first_name'],
                'last_name2': rec2['last_name'],
                'fc2': rec2['fc'],
                'source2': rec2['source'],
                'agency2': rec2['agency'],
            })

    # Passing the columns explicitly keeps the schema (and the sort key)
    # present even when no pair met the threshold.
    result_df = pd.DataFrame(results, columns=result_columns)

    return result_df.sort_values('sim_score', ascending=False)

# Score the prepared frame; `review` holds the candidate pairs that
# custom_matcher returns, sorted by similarity.
review = custom_matcher(df)

# Bare expression: presumably left as the last line so the notebook renders
# the table as the cell's output.
review

DF SHAPE BEFORE (72060, 7)
DF SHAPE AFTER (71136, 7)


Unnamed: 0,pair_idx,sim_score,row_key1,first_name1,last_name1,fc1,source1,agency1,row_key2,first_name2,last_name2,fc2,source2,agency2
770,770,1.000000,589,david,wilson,d,wci,,1396,david,wilson,d,llead,new-orleans-pd
607,607,1.000000,572,marlon,defillo,m,wci,,22129,marlon,defillo,m,llead,new-orleans-constables-office
468,468,1.000000,591,sarah,johnson,s,wci,,43399,sarah,johnson,s,llead,natchitoches-pd
792,792,1.000000,589,david,wilson,d,wci,,38996,david,wilson,d,llead,department-of-corrections
385,385,1.000000,545,thomas,smegal,t,wci,,57151,thomas,smegal,t,llead,new-orleans-pd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53,53,0.801250,486,david,dimaggio,d,wci,,51480,david,delaughter,d,llead,baton-rouge-pd
55,55,0.801250,486,david,dimaggio,d,wci,,54833,david,delaughter,d,llead,baton-rouge-pd
310,310,0.801058,535,robert,hankenhof,r,wci,,55357,robert,hattala,r,llead,hammond-pd
568,568,0.800397,559,denise,spencer,d,wci,,50210,dennis,sawyer,d,llead,new-orleans-so
