In [70]:
import pandas as pd
import pickle
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from transformers import BertTokenizer, BertModel
import nltk
from itertools import combinations
from sentence_transformers import SentenceTransformer
import jellyfish
from sklearn.preprocessing import MinMaxScaler

In [71]:
def read_csv():
    df = pd.read_csv("../../ts-feature-engineering/data/output/features.csv")

    df['normalized_label'] = (df['label']
                              .str.lower()
                              .str.strip()
                              .str.replace(r"(easy match|hard match)", "1", regex=True)
                              .str.replace(r"(easy non-match|hard non-match)", "0", regex=True)
    )

    df.loc[:, "label_numeric"] = df["normalized_label"].astype(int)
    return df 


In [72]:
def split_rows_with_multiple_officers(df):
    df = (
        df.drop("split_indv", axis=1)
        .join(
            df["split_indv"]
            .str.split("@", expand=True)
            .stack()
            .reset_index(level=1, drop=True)
            .rename("split_indv"),
            how="outer",
        )
        .reset_index(drop=True)
    )
    return df



In [73]:
nltk.download('punkt')  

stemmer = PorterStemmer()

def stem_tokens(tokens):
    return [stemmer.stem(token) for token in tokens]

def tokenize_and_stem(context):
    tokens = word_tokenize(context)  
    stemmed_tokens = stem_tokens(tokens) 
    return stemmed_tokens

def calculate_string_similarity(s1, s2):
    return jellyfish.jaro_winkler_similarity(s1, s2)

def calculate_context_similarity(contexts1, contexts2):
    model = SentenceTransformer('all-MiniLM-L6-v2')
    
    embeddings1 = model.encode(contexts1, convert_to_tensor=True)
    embeddings2 = model.encode(contexts2, convert_to_tensor=True)
    
    embeddings1_np = embeddings1.cpu().numpy()
    embeddings2_np = embeddings2.cpu().numpy()
    
    cosine_similarities = np.diag(cosine_similarity(embeddings1_np, embeddings2_np)).tolist()
    
    return cosine_similarities

def generate_pairwise_comparisons(df):
    df = df.fillna("")
    comparison_data = []

    # Flatten the list of all blocking keys to find unique keys across the dataset
    all_keys = set([key for sublist in df['blocking_keys'] for key in sublist])
    
    for key in all_keys:
        # Find all records that have this blocking key
        filtered_df = df[df['blocking_keys'].apply(lambda x: key in x)]
        unique_entities = filtered_df['person_uid'].unique()

        # Generate all combinations of unique entities within this filtered group
        for entity_1, entity_2 in combinations(unique_entities, 2):
            entity_1_row = filtered_df[filtered_df['person_uid'] == entity_1].iloc[0]
            entity_2_row = filtered_df[filtered_df['person_uid'] == entity_2].iloc[0]

            # Compute similarities and differences between entities
            features = {
                'entity_1_uid': entity_1,
                'entity_1_first_name': entity_1_row['first_name'],
                'entity_1_last_name': entity_1_row['last_name'],
                'entity_1_role': entity_1_row.get('officer_role', ''),
                'entity_1_context': entity_1_row.get('officer_context', ''),
                'entity_2_uid': entity_2,
                'entity_2_first_name': entity_2_row['first_name'],
                'entity_2_last_name': entity_2_row['last_name'],
                'entity_2_role': entity_2_row.get('officer_role', ''),
                'entity_2_context': entity_2_row.get('officer_context', ''),
                'first_name_similarity': calculate_string_similarity(entity_1_row['first_name'], entity_2_row['first_name']),
                'last_name_similarity': calculate_string_similarity(entity_1_row['last_name'], entity_2_row['last_name']),
                'role_similarity': calculate_string_similarity(entity_1_row.get('officer_role', ''), entity_2_row.get('officer_role', '')),
                'first_name_length_diff': abs(len(entity_1_row['first_name']) - len(entity_2_row['first_name'])),
                'last_name_length_diff': abs(len(entity_1_row['last_name']) - len(entity_2_row['last_name'])),
                'context_similarity': calculate_context_similarity([entity_1_row.get('officer_context', '')], [entity_2_row.get('officer_context', '')])[0],
            }

            comparison_data.append(features)

    # Convert comparison data to DataFrame and scale features
    comparison_df = pd.DataFrame(comparison_data)
    scaler = MinMaxScaler()
    features_to_scale = ['first_name_similarity', 'last_name_similarity', 
                         'role_similarity', 'first_name_length_diff', 
                         'last_name_length_diff', 'context_similarity']
    comparison_df[features_to_scale] = scaler.fit_transform(comparison_df[features_to_scale])

    return comparison_df

[nltk_data] Error loading punkt: <urlopen error [Errno 8] nodename nor
[nltk_data]     servname provided, or not known>


In [74]:
df = read_csv()

df


df.loc[:, "indv_1"] = df.entity_1_first_name.str.cat(df.entity_1_last_name, sep="*").str.cat(df.entity_1_rank, sep="*").str.cat(df.entity_1_context, sep="*")
df.loc[:, "indv_2"] = df.entity_2_first_name.str.cat(df.entity_2_last_name, sep="*").str.cat(df.entity_2_rank, sep="*").str.cat(df.entity_2_context, sep="*")

df.loc[:, "split_indv"] = df.indv_1.str.cat(df.indv_2, sep="@")

df = df.pipe(split_rows_with_multiple_officers)

info = df.split_indv.str.extract(r"(.+)*(\w+)*(\w+)*(\w+)")

df.loc[:, "first_name"] = info[0]

df.loc[:, "last_name"] = info[1]

df.loc[:, "officer_role"] = info[2]

df.loc[:, "officer_context"] = info[3]



df


# df.loc[:, "person_1", "person_2"] = df.split_indv.str.split("@")






df

# df = df.iloc[:10]

# df = df.pipe(generate_pairwise_comparisons)
# df

Unnamed: 0,entity_1_first_name,entity_1_last_name,entity_1_rank,entity_1_context,entity_2_first_name,entity_2_last_name,entity_2_rank,entity_2_context,label,first_name_similarity,...,entity_2_uid,normalized_label,label_numeric,indv_1,indv_2,split_indv,first_name,last_name,officer_role,officer_context
0,,rodrigue,investigating detective,Directly involved in the investigation of the ...,,rodriguez,detective,Mentioned as someone the speaker went to the c...,Hard non-match,0.000000,...,dd4c983730a24553301ee69e1efb32e00fd03839ab1673...,0,0,,,,,,,
1,dewilliam,trepagnier #1504,detective,Mentioned in context with the search and inter...,dewilliam,trepagnier,detective,Mentioned in a context related to the crime la...,Easy match,1.000000,...,48313dc154307cb1cc114afc7b7084b2ecf73cbd4230c0...,1,1,dewilliam*trepagnier #1504*detective*Mentioned...,dewilliam*trepagnier*detective*Mentioned in a ...,dewilliam*trepagnier #1504*detective*Mentioned...,dewillia,,,m
2,dewilliam,trepagnier #1504,detective,Mentioned in context with the search and inter...,dewilliam,trepagnier,detective,Mentioned in a context related to the crime la...,Easy match,1.000000,...,48313dc154307cb1cc114afc7b7084b2ecf73cbd4230c0...,1,1,dewilliam*trepagnier #1504*detective*Mentioned...,dewilliam*trepagnier*detective*Mentioned in a ...,dewilliam*trepagnier*detective*Mentioned in a ...,dewillia,,,m
3,dewilliam,trepagnier #1504,detective,Mentioned in context with the search and inter...,dewilliam,trepagnier #1504,detective,Mentioned in relation to conducting interviews...,Easy match,1.000000,...,fd9e8ef4a53913e12dc042aad6006403ecd4bad2a2d826...,1,1,dewilliam*trepagnier #1504*detective*Mentioned...,dewilliam*trepagnier #1504*detective*Mentioned...,dewilliam*trepagnier #1504*detective*Mentioned...,dewillia,,,m
4,dewilliam,trepagnier #1504,detective,Mentioned in context with the search and inter...,dewilliam,trepagnier #1504,detective,Mentioned in relation to conducting interviews...,Easy match,1.000000,...,fd9e8ef4a53913e12dc042aad6006403ecd4bad2a2d826...,1,1,dewilliam*trepagnier #1504*detective*Mentioned...,dewilliam*trepagnier #1504*detective*Mentioned...,dewilliam*trepagnier #1504*detective*Mentioned...,dewillia,,,m
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021,debbie,coffee,lead detective,Mentioned as the lead police officer in a case...,louis,berard,detective,Testified as a Detective and Police Officer in...,easy non-match,0.455556,...,aa667086e0c5d4e2d5ae428eef9e55d33646832c7680f2...,0,0,debbie*coffee*lead detective*Mentioned as the ...,louis*berard*detective*Testified as a Detectiv...,louis*berard*detective*Testified as a Detectiv...,loui,,,s
2022,debbie,coffee,rape detective,Mentioned as working in the Rape Section.,louis,berard,homicide division detective,Mentioned as a Detective from the New Orleans ...,easy non-match,0.455556,...,a56ec822462dda139e8642d3dfd1f70bdd5423073c254a...,0,0,debbie*coffee*rape detective*Mentioned as work...,louis*berard*homicide division detective*Menti...,debbie*coffee*rape detective*Mentioned as work...,debbi,,,e
2023,debbie,coffee,rape detective,Mentioned as working in the Rape Section.,louis,berard,homicide division detective,Mentioned as a Detective from the New Orleans ...,easy non-match,0.455556,...,a56ec822462dda139e8642d3dfd1f70bdd5423073c254a...,0,0,debbie*coffee*rape detective*Mentioned as work...,louis*berard*homicide division detective*Menti...,louis*berard*homicide division detective*Menti...,loui,,,s
2024,debbie,coffee,lead detective,Mentioned as collaborating with Detective Al S...,louis,berard,detective (homicide division),Testified as a Police Officer from the Homicid...,easy non-match,0.455556,...,6590f2642b350b42e0acb0aa4348ddd6f6a8cc52987707...,0,0,debbie*coffee*lead detective*Mentioned as coll...,louis*berard*detective (homicide division)*Tes...,debbie*coffee*lead detective*Mentioned as coll...,debbi,,,e


In [75]:
# df.to_csv("../data/output/output-pairwise-scales.csv", index=False)