In [None]:
import pandas as pd
from collections import defaultdict
import hashlib
from sklearn.preprocessing import MinMaxScaler
import pickle

def read():
    """Load both input tables from ``../data/input``.

    Returns:
        A ``(merged_profiles, personnel)`` tuple of DataFrames, read in that
        order from ``merged_profiles.csv`` and ``personnel.csv``.
    """
    input_paths = (
        "../data/input/merged_profiles.csv",
        "../data/input/personnel.csv",
    )
    profiles, personnel = (pd.read_csv(path) for path in input_paths)
    return profiles, personnel

def read_model(file_path):
    """Unpickle and return the object stored at ``file_path``.

    NOTE: ``pickle.load`` can execute arbitrary code embedded in the file, so
    this must only be pointed at model files from a trusted source.
    """
    with open(file_path, mode='rb') as model_file:
        loaded = pickle.load(model_file)
    return loaded

# Load the two officer tables and the pickled classifier used for scoring.
dfa, dfb = read()
model = read_model('../../ts-train-model/data/output/trained_lr_model.pkl')

# Tag every row with its origin; custom_matcher splits on these labels.
# The first table has no agency information, so it gets a placeholder.
dfa["source"] = "wci"
dfa["agency"] = "n/a"

# The second table lacks the role/context columns, so fill placeholders.
dfb["source"] = "llead"
dfb["officer_role"] ="n/a"
dfb["officer_context"] = "n/a"
# Keep only the two Orleans agencies. na=False treats a missing agency as
# "no match"; without it str.contains yields NaN for those rows and the
# boolean indexing raises instead of filtering them out.
dfb = dfb[dfb.agency.str.contains("orleans-pd|orleans-so", na=False)]

def create_hash_uid(row):
    """Return a deterministic MD5 hex id for one matched pair.

    The id is the hash of both records' first/last names and sources joined
    with ``|``, so the same pair always produces the same identifier.
    """
    key_fields = (
        'first_name1',
        'last_name1',
        'first_name2',
        'last_name2',
        'source1',
        'source2',
    )
    unique_string = "|".join(f"{row[field]}" for field in key_fields)
    digest = hashlib.md5(unique_string.encode())
    return digest.hexdigest()

# Give each source's identifier its own column name so both survive the
# concatenation side by side.
dfa = dfa.rename(columns={"person_uid":  "wcoi_uid"})
dfb = dfb.rename(columns={"uid": "llead_uid"})

# Stack both sources. ignore_index gives every row a unique label, so the
# column assignments below never have to align on duplicated index values.
df = pd.concat([dfa, dfb], ignore_index=True)

# Normalise names so comparisons are case- and whitespace-insensitive.
df.loc[:, "first_name"] = df.first_name.str.lower().str.strip()
df.loc[:, "last_name"] = df.last_name.str.lower().str.strip()

# First character of each name ("" when the name is missing); custom_matcher
# only compares rows whose fc and lc both agree.
df.loc[:, "fc"] = df.first_name.fillna("").map(lambda x: x[:1])
df.loc[:, "lc"] = df.last_name.fillna("").map(lambda x: x[:1])

df = df[["first_name", "last_name", "fc", "lc", "source", "wcoi_uid", "llead_uid", "agency"]]

# One row per identifier pair; only one of the two ids is set for any row.
print(f"DF SHAPE BEFORE {df.shape}")
df = df.drop_duplicates(subset=["wcoi_uid", "llead_uid"])
print(f"DF SHAPE AFTER {df.shape}")

df = df.reset_index(drop=True)
df.loc[:, "full_name"] = df.first_name.str.cat(df.last_name, sep=" ")
# Drop rows without a usable name. str.cat yields NaN when either part is
# missing, and two blank parts join to " " (the separator alone), so strip
# before comparing; otherwise nameless rows slip through the filter.
df = df[df.full_name.fillna("").str.strip() != ""]

# Not referenced by the code visible here; kept in case later cells use it.
full_names = df.first_name.str.cat(df.last_name, sep=" ")

def _char_overlap(name_a, name_b):
    """Share of distinct characters common to both names, relative to the longer name."""
    longest = max(len(name_a), len(name_b))
    if longest == 0:
        # Two empty names: nothing to compare, and avoids dividing by zero.
        return 0.0
    return len(set(name_a) & set(name_b)) / longest


def calculate_similarity(row1, row2, scaler=None):
    """Score how likely two name records refer to the same person.

    Six features are derived from the two rows and passed to the module-level
    ``model``: absolute length difference of the first and of the last names,
    whether the first characters of the first/last names agree (0 or 1), and
    the character-set overlap of the first/last names.

    Args:
        row1: Mapping (e.g. a DataFrame row) with string ``first_name`` and
            ``last_name`` entries.
        row2: Second record, same layout as ``row1``.
        scaler: Optional, already-fitted scaler exposing ``transform``. Pass
            the scaler that was fitted on the training features so inference
            uses the same scaling as training.

    Returns:
        The positive-class probability, ``model.predict_proba(...)[0][1]``.

    NOTE(review): when ``scaler`` is omitted the historical behaviour is kept:
    a fresh ``MinMaxScaler`` is fitted on this single feature row. With one
    sample every feature's min equals its max, so the scaled vector is all
    zeros and the model returns the same probability for every pair. This
    default is retained only for backward compatibility.
    """
    first1, last1 = row1['first_name'], row1['last_name']
    first2, last2 = row2['first_name'], row2['last_name']

    features = [
        abs(len(first1) - len(first2)),
        abs(len(last1) - len(last2)),
        # [:1] rather than [0] so an empty name compares instead of raising.
        int(first1[:1] == first2[:1]),
        int(last1[:1] == last2[:1]),
        _char_overlap(first1, first2),
        _char_overlap(last1, last2),
    ]

    if scaler is None:
        # Legacy path, see the NOTE in the docstring.
        scaled_features = MinMaxScaler().fit_transform([features])
    else:
        scaled_features = scaler.transform([features])

    # Column 1 of predict_proba is the probability of the positive class.
    return model.predict_proba(scaled_features)[0][1]

def custom_matcher(df):
    """Pair every 'wci' row with its best-scoring 'llead' row.

    Only rows whose ``fc`` and ``lc`` values both agree are compared. Among
    those candidates the one with the highest ``calculate_similarity`` score
    wins; on a tie the candidate that appears first in ``df`` is kept. 'wci'
    rows without any candidate are omitted from the output.

    Args:
        df: DataFrame with columns ``first_name``, ``last_name``, ``fc``,
            ``lc``, ``source``, ``agency``, ``wcoi_uid`` and ``llead_uid``.

    Returns:
        DataFrame with one row per matched 'wci' record, sorted by
        ``sim_score`` descending. Columns suffixed ``1`` come from the 'wci'
        row and those suffixed ``2`` from the matched 'llead' row. The columns
        are present even when nothing matched.
    """
    result_columns = [
        'sim_score',
        'first_name1', 'last_name1', 'fc1', 'source1', 'agency1',
        'wcoi_uid1', 'llead_uid1',
        'first_name2', 'last_name2', 'fc2', 'source2', 'agency2',
        'wcoi_uid2', 'llead_uid2',
    ]

    wci_df = df[df['source'] == 'wci']
    llead_df = df[df['source'] == 'llead']

    # Index the llead rows by (fc, lc) once, so each wci row only visits its
    # own candidates instead of rescanning the whole llead table. Insertion
    # order is preserved, which keeps the original tie-breaking.
    candidates_by_initials = defaultdict(list)
    for _, llead_row in llead_df.iterrows():
        fc, lc = llead_row['fc'], llead_row['lc']
        if pd.isna(fc) or pd.isna(lc):
            # A missing initial never compares equal to anything.
            continue
        candidates_by_initials[(fc, lc)].append(llead_row)

    results = []
    for _, wci_row in wci_df.iterrows():
        fc, lc = wci_row['fc'], wci_row['lc']
        if pd.isna(fc) or pd.isna(lc):
            continue

        best_match = None
        best_score = -1
        # .get avoids inserting empty buckets into the defaultdict.
        for llead_row in candidates_by_initials.get((fc, lc), ()):
            sim_score = calculate_similarity(wci_row, llead_row)
            # Strict '>' keeps the earliest candidate when scores tie.
            if sim_score > best_score:
                best_score = sim_score
                best_match = llead_row

        if best_match is None:
            continue

        results.append({
            'sim_score': best_score,
            'first_name1': wci_row['first_name'],
            'last_name1': wci_row['last_name'],
            'fc1': wci_row['fc'],
            'source1': wci_row['source'],
            'agency1': wci_row['agency'],
            'wcoi_uid1': wci_row['wcoi_uid'],
            'llead_uid1': wci_row['llead_uid'],
            'first_name2': best_match['first_name'],
            'last_name2': best_match['last_name'],
            'fc2': best_match['fc'],
            'source2': best_match['source'],
            'agency2': best_match['agency'],
            'wcoi_uid2': best_match['wcoi_uid'],
            'llead_uid2': best_match['llead_uid'],
        })

    # Explicit columns keep the frame well-formed when nothing matched;
    # otherwise sorting on the absent 'sim_score' column raises KeyError.
    result_df = pd.DataFrame(results, columns=result_columns)
    result_df = result_df.sort_values('sim_score', ascending=False)

    return result_df

# Pair each 'wci' profile with its best 'llead' candidate, then attach a
# deterministic id derived from the matched names and sources.
df = custom_matcher(df)
df = df.assign(person_uid=df.apply(create_hash_uid, axis=1))

# Persist the matches without the positional index column.
df.to_csv(
    "../data/output/merged_officer_profiles_with_best_matches.csv",
    index=False,
)

# Summary of what was written.
print(f"Final DataFrame shape: {df.shape}")
print(f"Number of unique WCI entities matched: {df['wcoi_uid1'].nunique()}")
print(f"Number of unique LLEAD entities matched: {df['llead_uid2'].nunique()}")