In [5]:
import pandas as pd
import pickle
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from transformers import BertTokenizer, BertModel
import nltk
from itertools import combinations
from sentence_transformers import SentenceTransformer
import jellyfish
from sklearn.preprocessing import MinMaxScaler

In [6]:
def read_csv(path="../../blocking/data/output/blocks.csv"):
    """Load the blocking-stage output into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        CSV file to read. Defaults to the blocking output consumed by this
        notebook, so calling ``read_csv()`` with no arguments behaves exactly
        as before.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pandas.read_csv`` with default
        options. No cleaning is applied here: missing values stay NaN.
        NOTE(review): list-like columns such as ``blocking_keys`` appear to be
        stored as their string repr in the CSV and will therefore come back
        as strings, not lists -- confirm against the blocking step.
    """
    return pd.read_csv(path)

# def read_csv():
#     df = pd.read_csv("../../preprocessing/data/output/clean.csv")
#     df = df.fillna("")
#     return df 

In [7]:
# word_tokenize() needs the Punkt tokenizer models. Only hit the network when
# they are not already installed locally, so the cell also runs offline.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# Single shared Porter stemmer used by stem_tokens().
stemmer = PorterStemmer()

def stem_tokens(tokens):
    """Return a new list holding the Porter stem of every token, in order."""
    stemmed = []
    for word in tokens:
        stemmed.append(stemmer.stem(word))
    return stemmed

def tokenize_and_stem(context):
    """Split ``context`` into word tokens with NLTK, then stem each token.

    Returns the list of stemmed tokens in their original order.
    """
    return stem_tokens(word_tokenize(context))

def calculate_string_similarity(s1, s2):
    """Return the Jaro-Winkler similarity of ``s1`` and ``s2`` via jellyfish."""
    similarity = jellyfish.jaro_winkler_similarity(s1, s2)
    return similarity

# Loaded SentenceTransformer models, keyed by model name. Instantiating the
# model is expensive, and calculate_context_similarity() is called many times,
# so each model is created once and then reused.
_CONTEXT_MODELS = {}


def _get_context_model(model_name='all-MiniLM-L6-v2'):
    """Return the SentenceTransformer for ``model_name``, loading it at most once."""
    model = _CONTEXT_MODELS.get(model_name)
    if model is None:
        model = SentenceTransformer(model_name)
        _CONTEXT_MODELS[model_name] = model
    return model


def calculate_context_similarity(contexts1, contexts2):
    """Cosine similarity between position-matched context strings.

    ``contexts1[i]`` is compared with ``contexts2[i]`` after both are embedded
    with the 'all-MiniLM-L6-v2' sentence-transformer model.

    Parameters
    ----------
    contexts1, contexts2 : list of str (a single str is treated as a
        one-element list)
        Texts to compare pairwise. If the two inputs differ in length, only
        the first ``min(len(contexts1), len(contexts2))`` positions are
        compared, which matches taking the diagonal of the full similarity
        matrix.

    Returns
    -------
    list of float
        One cosine similarity per compared position; ``[]`` when either input
        is empty (the model is not loaded in that case).
    """
    if isinstance(contexts1, str):
        contexts1 = [contexts1]
    if isinstance(contexts2, str):
        contexts2 = [contexts2]
    contexts1 = list(contexts1)
    contexts2 = list(contexts2)

    pair_count = min(len(contexts1), len(contexts2))
    if pair_count == 0:
        return []

    model = _get_context_model()

    embeddings1 = model.encode(contexts1[:pair_count], convert_to_tensor=True)
    embeddings2 = model.encode(contexts2[:pair_count], convert_to_tensor=True)

    embeddings1_np = embeddings1.cpu().numpy()
    embeddings2_np = embeddings2.cpu().numpy()

    # Row-wise cosine similarity: only the matched pairs are needed, so avoid
    # building the full pair_count x pair_count matrix just to read its diagonal.
    dot_products = np.einsum('ij,ij->i', embeddings1_np, embeddings2_np)
    norms = np.linalg.norm(embeddings1_np, axis=1) * np.linalg.norm(embeddings2_np, axis=1)
    # A zero vector has no direction; report similarity 0 instead of dividing by zero.
    norms[norms == 0] = 1.0

    return (dot_products / norms).tolist()

def _parse_blocking_keys(value):
    """Normalize one ``blocking_keys`` cell into a list of keys.

    Handles both in-memory collections and the string repr that results from a
    CSV round trip (e.g. ``"['dal', 'ton', 'cer']"``). Empty strings and
    non-collection values (such as NaN) yield ``[]``; a string that is not a
    Python collection literal is treated as one key.
    """
    import ast  # stdlib; imported locally so this block is self-contained

    if isinstance(value, str):
        text = value.strip()
        if not text:
            return []
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError):
            return [text]
        if isinstance(parsed, (list, tuple, set, frozenset)):
            return list(parsed)
        return [text]
    if isinstance(value, (list, tuple, set, frozenset)):
        return list(value)
    return []


def generate_pairwise_comparisons(df, context_batch_size=64):
    """Build one feature row per pair of entities that share a blocking key.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``blocking_keys``, ``person_uid``, ``first_name`` and
        ``last_name``. ``officer_role`` and ``officer_context`` are optional
        and default to ``''`` when absent. ``blocking_keys`` cells may be real
        collections or their string repr (as produced by a CSV round trip).
    context_batch_size : int, optional
        Number of pairs sent to ``calculate_context_similarity`` per call.
        Batching avoids one embedding call per pair.

    Returns
    -------
    pandas.DataFrame
        One row per unordered pair of distinct ``person_uid`` values that share
        at least one blocking key, with the raw name/role/context of both
        entities plus six similarity/difference features. Those six features
        are min-max scaled to [0, 1] over the returned rows. When no pair
        exists an empty frame with the same columns is returned.

    Notes
    -----
    * Each entity is represented by its first row inside the block in which
      the pair is first encountered.
    * A pair sharing several keys is emitted once, not once per key.
    * Keys are processed in sorted order so the row order is reproducible.
    """
    df = df.fillna("")

    features_to_scale = ['first_name_similarity', 'last_name_similarity',
                         'role_similarity', 'first_name_length_diff',
                         'last_name_length_diff', 'context_similarity']
    output_columns = [
        'entity_1_uid', 'entity_1_first_name', 'entity_1_last_name',
        'entity_1_role', 'entity_1_context',
        'entity_2_uid', 'entity_2_first_name', 'entity_2_last_name',
        'entity_2_role', 'entity_2_context',
    ] + features_to_scale

    # Inverted index: blocking key -> positional row indices (in frame order).
    # Built in a single pass instead of re-filtering the frame for every key.
    positions_by_key = {}
    for position, cell in enumerate(df['blocking_keys']):
        for key in set(_parse_blocking_keys(cell)):
            positions_by_key.setdefault(key, []).append(position)

    comparison_data = []
    context_pairs = []   # (context_1, context_2) aligned with comparison_data
    seen_pairs = set()   # unordered uid pairs already emitted

    for key in sorted(positions_by_key, key=str):
        block = df.iloc[positions_by_key[key]]
        # First row per entity within this block (same row the original
        # per-entity filter + iloc[0] would pick).
        representatives = block.drop_duplicates(subset='person_uid', keep='first')
        if len(representatives) < 2:
            continue
        rows = [row for _, row in representatives.iterrows()]

        for entity_1_row, entity_2_row in combinations(rows, 2):
            entity_1 = entity_1_row['person_uid']
            entity_2 = entity_2_row['person_uid']

            pair_id = frozenset((entity_1, entity_2))
            if pair_id in seen_pairs:
                continue
            seen_pairs.add(pair_id)

            role_1 = entity_1_row.get('officer_role', '')
            role_2 = entity_2_row.get('officer_role', '')
            context_1 = entity_1_row.get('officer_context', '')
            context_2 = entity_2_row.get('officer_context', '')

            # Coerce to str so a stray non-string cell cannot break the
            # string metrics; the raw values are still what gets stored.
            first_1, first_2 = str(entity_1_row['first_name']), str(entity_2_row['first_name'])
            last_1, last_2 = str(entity_1_row['last_name']), str(entity_2_row['last_name'])

            comparison_data.append({
                'entity_1_uid': entity_1,
                'entity_1_first_name': entity_1_row['first_name'],
                'entity_1_last_name': entity_1_row['last_name'],
                'entity_1_role': role_1,
                'entity_1_context': context_1,
                'entity_2_uid': entity_2,
                'entity_2_first_name': entity_2_row['first_name'],
                'entity_2_last_name': entity_2_row['last_name'],
                'entity_2_role': role_2,
                'entity_2_context': context_2,
                'first_name_similarity': calculate_string_similarity(first_1, first_2),
                'last_name_similarity': calculate_string_similarity(last_1, last_2),
                'role_similarity': calculate_string_similarity(str(role_1), str(role_2)),
                'first_name_length_diff': abs(len(first_1) - len(first_2)),
                'last_name_length_diff': abs(len(last_1) - len(last_2)),
            })
            context_pairs.append((str(context_1), str(context_2)))

    if not comparison_data:
        # Nothing to compare: return the expected schema instead of failing
        # on the column selection in the scaling step.
        return pd.DataFrame(columns=output_columns)

    # Embed contexts in batches; 'context_similarity' is added last so the
    # column order matches the feature list above.
    batch_size = max(1, int(context_batch_size))
    for start in range(0, len(comparison_data), batch_size):
        stop = start + batch_size
        left = [pair[0] for pair in context_pairs[start:stop]]
        right = [pair[1] for pair in context_pairs[start:stop]]
        similarities = calculate_context_similarity(left, right)
        for features, similarity in zip(comparison_data[start:stop], similarities):
            features['context_similarity'] = similarity

    # Convert comparison data to DataFrame and scale features
    comparison_df = pd.DataFrame(comparison_data)
    scaler = MinMaxScaler()
    comparison_df[features_to_scale] = scaler.fit_transform(comparison_df[features_to_scale])

    return comparison_df

[nltk_data] Error loading punkt: <urlopen error [Errno 8] nodename nor
[nltk_data]     servname provided, or not known>


In [8]:
# Load the blocking output produced by the previous pipeline stage.
df = read_csv()

# Bare expression: the notebook renders the (truncated) frame as this cell's output.
df

# Optional: restrict to the first 10 rows for a quick smoke test.
# df = df.iloc[:10]

# Pairwise feature generation, currently disabled in this run.
# df = df.pipe(generate_pairwise_comparisons)
# df

Unnamed: 0,officer_context,officer_role,page_number,fn,query,prompt_template_for_hyde,prompt_template_for_model,chunk_size,chunk_overlap,temperature,...,num_of_queries,model,uid,officer_name,first_name,last_name,fc,lc,person_uid,blocking_keys
0,mentioned as one of the officers who verified ...,verifying officer,"[1, 1, 1]",Magistrate - Arrest Report.json,"Identify each individual in the transcript, by...",,,500,250,1,...,1,gpt-3.5-turbo-1603-finetuned-300-labels,612edb73,Dalton,,dalton,,dalto,6123425393,"['dal', 'ton', 'cer']"
1,mentioned as providing assistance to officer d...,assisting officer,"[1, 1, 1]",Magistrate - Arrest Report.json,"Identify each individual in the transcript, by...",,,500,250,1,...,1,gpt-3.5-turbo-1603-finetuned-300-labels,612edb73,Victoria Guidry,victoria,guidry,victo,guidr,7006194877,"['vic', 'ria', 'gui', 'dry', 'cer']"
2,mentioned as one of the officers who arrested ...,arresting officer,"[1, 1, 1]",Magistrate - Arrest Report.json,"Identify each individual in the transcript, by...",,,500,250,1,...,1,gpt-3.5-turbo-1603-finetuned-300-labels,612edb73,Carolyn Dalton,carolyn,dalton,carol,dalto,3613126420,"['car', 'lyn', 'dal', 'ton', 'cer']"
3,mentioned as one of the officers who booked th...,booking officer,"[1, 1, 1]",Magistrate - Arrest Report.json,"Identify each individual in the transcript, by...",,,500,250,1,...,1,gpt-3.5-turbo-1603-finetuned-300-labels,612edb73,Terry Bean,terry,bean,terry,bean,2271130809,"['ter', 'rry', 'bea', 'ean', 'cer']"
4,mentioned as one of the officers who verified ...,verifying officer,"[1, 1, 1]",Magistrate - Arrest Report.json,"Identify each individual in the transcript, by...",,,500,250,1,...,1,gpt-3.5-turbo-1603-finetuned-300-labels,612edb73,Dalton,,dalton,,dalto,3153431315,"['dal', 'ton', 'cer']"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5102,asked if he was the one who arrived to transpo...,investigating officer,"[2, 9, 113, 50, 52, 33, 30, 112, 170, 2, 98, 1...",Seward - Suppression Hearing Transcript.json,"Identify each individual in the transcript, by...",,,500,250,1,...,1,gpt-3.5-turbo-1603-finetuned-300-labels,2a8d20df,Hoyt,,hoyt,,hoyt,6353150479,"['hoy', 'oyt', 'cer']"
5103,referred to as the one the witness wanted to b...,investigating officer,"[2, 9, 113, 50, 52, 33, 30, 112, 170, 2, 98, 1...",Seward - Suppression Hearing Transcript.json,"Identify each individual in the transcript, by...",,,500,250,1,...,1,gpt-3.5-turbo-1603-finetuned-300-labels,2a8d20df,Dillman,,dillman,,dillm,8327113279,"['dil', 'man', 'cer']"
5104,involved in taking a statement and typed the s...,sergeant,"[2, 9, 113, 50, 52, 33, 30, 112, 170, 2, 98, 1...",Seward - Suppression Hearing Transcript.json,"Identify each individual in the transcript, by...",,,500,250,1,...,1,gpt-3.5-turbo-1603-finetuned-300-labels,2a8d20df,London,,london,,londo,8822828103,"['lon', 'don', 'ant']"
5105,mentioned as being with other officers during ...,investigating officer,"[2, 9, 113, 50, 52, 33, 30, 112, 170, 2, 98, 1...",Seward - Suppression Hearing Transcript.json,"Identify each individual in the transcript, by...",,,500,250,1,...,1,gpt-3.5-turbo-1603-finetuned-300-labels,2a8d20df,Dantagnan,,dantagnan,,danta,5489285986,"['dan', 'nan', 'cer']"


In [9]:
# df.to_csv("../data/output/output-pairwise-scales.csv", index=False)