In [6]:
import pandas as pd
import pickle
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from transformers import BertTokenizer, BertModel
import nltk
from itertools import combinations
from sentence_transformers import SentenceTransformer
import jellyfish
from sklearn.preprocessing import MinMaxScaler

In [7]:
def read_csv(path="../../preprocessing/data/output/clean.csv"):
    """Load the cleaned CSV into a DataFrame.

    Args:
        path: Location of the CSV to read. Anything ``pd.read_csv`` accepts
            (path string, ``Path``, or file-like object). Defaults to the
            cleaned preprocessing output this notebook originally hard-coded,
            so calling ``read_csv()`` with no arguments behaves as before.

    Returns:
        pandas.DataFrame with the parsed CSV contents.
    """
    return pd.read_csv(path)

In [8]:
# Fetch the NLTK 'punkt' resource; tokenize_and_stem below calls word_tokenize.
# NOTE(review): presumably word_tokenize needs this resource at runtime — confirm
# against the installed NLTK version.
nltk.download('punkt')  

# Single module-level Porter stemmer instance reused by stem_tokens().
stemmer = PorterStemmer()

def stem_tokens(tokens):
    """Stem every token using the module-level ``stemmer``.

    Args:
        tokens: Iterable of token strings.

    Returns:
        list with ``stemmer.stem(token)`` for each token, in input order.
    """
    stemmed = []
    for word in tokens:
        stemmed.append(stemmer.stem(word))
    return stemmed

def tokenize_and_stem(context):
    """Tokenize ``context`` with ``word_tokenize`` and stem the tokens.

    Args:
        context: Text to process.

    Returns:
        list of stemmed tokens as produced by ``stem_tokens``.
    """
    return stem_tokens(word_tokenize(context))

def calculate_string_similarity(s1, s2):
    """Return the Jaro-Winkler similarity between two strings.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        The value of ``jellyfish.jaro_winkler_similarity(s1, s2)``.
    """
    score = jellyfish.jaro_winkler_similarity(s1, s2)
    return score

# Loaded SentenceTransformer models keyed by model name. Constructing the model
# is expensive, and this function is called repeatedly, so the instance is
# created once and reused for the rest of the kernel session.
CONTEXT_MODEL_CACHE = {}


def calculate_context_similarity(contexts1, contexts2):
    """Cosine similarity between positionally paired context strings.

    ``contexts1[i]`` is compared with ``contexts2[i]``. Both inputs are
    embedded with the 'all-MiniLM-L6-v2' sentence-transformer model.

    Args:
        contexts1: Sequence of strings (a single string is treated as a
            one-element sequence).
        contexts2: Sequence of strings, paired by position with ``contexts1``.

    Returns:
        list of floats, one per pair. If the inputs differ in length only the
        first ``min(len(contexts1), len(contexts2))`` pairs are scored, which
        matches the length of the diagonal the previous implementation
        returned. Empty input yields ``[]``.
    """
    if isinstance(contexts1, str):
        contexts1 = [contexts1]
    if isinstance(contexts2, str):
        contexts2 = [contexts2]

    pair_count = min(len(contexts1), len(contexts2))
    if pair_count == 0:
        # Nothing to compare; avoid loading the model or encoding an empty batch.
        return []

    model_name = 'all-MiniLM-L6-v2'
    model = CONTEXT_MODEL_CACHE.get(model_name)
    if model is None:
        model = SentenceTransformer(model_name)
        CONTEXT_MODEL_CACHE[model_name] = model

    # Only the first pair_count entries can ever be paired, so skip the rest.
    embeddings1 = model.encode(list(contexts1)[:pair_count], convert_to_tensor=True)
    embeddings2 = model.encode(list(contexts2)[:pair_count], convert_to_tensor=True)

    embeddings1_np = np.asarray(embeddings1.cpu().numpy(), dtype=np.float64)
    embeddings2_np = np.asarray(embeddings2.cpu().numpy(), dtype=np.float64)

    # Row-wise cosine: only the matched pairs are needed, so compute them
    # directly instead of building the full N x N similarity matrix and
    # reading its diagonal.
    dot_products = np.einsum('ij,ij->i', embeddings1_np, embeddings2_np)
    norm_products = (
        np.linalg.norm(embeddings1_np, axis=1) * np.linalg.norm(embeddings2_np, axis=1)
    )
    # A zero-norm embedding gets similarity 0.0 rather than a division error.
    cosine_similarities = np.divide(
        dot_products,
        norm_products,
        out=np.zeros_like(dot_products),
        where=norm_products > 0,
    )

    return cosine_similarities.tolist()

def split_entities(df):
    """Stack the entity_1_* and entity_2_* column groups into one entity table.

    Columns whose name contains 'entity_1' (resp. 'entity_2') are selected,
    the 'entity_1_' / 'entity_2_' prefix is stripped, the two frames are
    concatenated vertically and exact duplicate rows are dropped.

    The uid column is selected exactly once per side. The previous version
    appended 'entity_N_uid' unconditionally even though the substring filter
    had already picked it up, which produced a duplicated 'uid' column.

    Args:
        df: DataFrame with entity_1_* and entity_2_* columns, including
            'entity_1_uid' and 'entity_2_uid'.

    Returns:
        DataFrame with the de-prefixed columns, duplicates removed and a
        fresh 0..n-1 index.

    Raises:
        KeyError: if 'entity_1_uid' or 'entity_2_uid' is missing from ``df``.
    """
    def _select_side(prefix):
        # Extract one side's columns and strip its prefix.
        uid_col = f'{prefix}_uid'
        side_cols = [col for col in df.columns if prefix in col]
        if uid_col not in side_cols:
            # Keep the uid mandatory: selecting a missing column raises KeyError.
            side_cols.append(uid_col)
        side_df = df[side_cols].copy()
        side_df.columns = [col.replace(f'{prefix}_', '') for col in side_cols]
        return side_df

    combined_df = pd.concat(
        [_select_side('entity_1'), _select_side('entity_2')],
        ignore_index=True,
    )
    return combined_df.drop_duplicates().reset_index(drop=True)

def generate_pairwise_comparisons(df, context_batch_size=256):
    """Build a min-max scaled feature table for every pair of unique entities.

    Missing values are replaced with "" first. For each distinct
    ``person_uid`` the first matching row is used, and every unordered pair
    of those rows produces one output row holding both entities' fields plus
    name/role string similarities, name length differences and the context
    embedding similarity.

    Args:
        df: DataFrame with columns 'person_uid', 'first_name', 'last_name',
            'officer_role' and 'officer_context'.
        context_batch_size: Number of pairs sent to
            ``calculate_context_similarity`` per call. Batching avoids one
            model invocation per pair while keeping each call bounded.

    Returns:
        DataFrame with one row per pair. The six similarity/difference
        columns are rescaled to [0, 1] with ``MinMaxScaler`` fitted on this
        table. When there are fewer than two unique entities an empty
        DataFrame with the same columns is returned (previously this raised
        KeyError because the empty frame had no feature columns).
    """
    features_to_scale = ['first_name_similarity', 'last_name_similarity',
                         'role_similarity', 'first_name_length_diff',
                         'last_name_length_diff', 'context_similarity']
    output_columns = [
        'entity_1_uid', 'entity_1_first_name', 'entity_1_last_name',
        'entity_1_role', 'entity_1_context',
        'entity_2_uid', 'entity_2_first_name', 'entity_2_last_name',
        'entity_2_role', 'entity_2_context',
    ] + features_to_scale

    df = df.fillna("")

    # One record per uid (its first row), in order of first appearance. This
    # is the same row/order the per-pair boolean filter + .iloc[0] selected,
    # without rescanning the whole frame twice for every pair.
    entities = df.drop_duplicates(subset='person_uid', keep='first').to_dict('records')
    pairs = list(combinations(entities, 2))
    if not pairs:
        return pd.DataFrame(columns=output_columns)

    # Score contexts in bounded chunks instead of one call per pair.
    chunk_size = max(1, int(context_batch_size))
    context_similarities = []
    for start in range(0, len(pairs), chunk_size):
        chunk = pairs[start:start + chunk_size]
        context_similarities.extend(
            calculate_context_similarity(
                [left['officer_context'] for left, _ in chunk],
                [right['officer_context'] for _, right in chunk],
            )
        )

    comparison_data = []
    for (left, right), context_similarity in zip(pairs, context_similarities):
        comparison_data.append({
            'entity_1_uid': left['person_uid'],
            'entity_1_first_name': left['first_name'],
            'entity_1_last_name': left['last_name'],
            'entity_1_role': left['officer_role'],
            'entity_1_context': left['officer_context'],
            'entity_2_uid': right['person_uid'],
            'entity_2_first_name': right['first_name'],
            'entity_2_last_name': right['last_name'],
            'entity_2_role': right['officer_role'],
            'entity_2_context': right['officer_context'],
            'first_name_similarity': calculate_string_similarity(left['first_name'], right['first_name']),
            'last_name_similarity': calculate_string_similarity(left['last_name'], right['last_name']),
            'role_similarity': calculate_string_similarity(left['officer_role'], right['officer_role']),
            'first_name_length_diff': abs(len(left['first_name']) - len(right['first_name'])),
            'last_name_length_diff': abs(len(left['last_name']) - len(right['last_name'])),
            'context_similarity': context_similarity,
        })

    comparison_df = pd.DataFrame(comparison_data, columns=output_columns)

    # Scaling is relative to the pairs in this table, not an absolute scale.
    scaler = MinMaxScaler()
    comparison_df[features_to_scale] = scaler.fit_transform(comparison_df[features_to_scale])
    return comparison_df

[nltk_data] Downloading package punkt to /Users/ayyub/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
# Build pairwise comparison features from only the first 10 rows of the
# cleaned data, then display the resulting table.
df = generate_pairwise_comparisons(read_csv().iloc[:10])
df

Unnamed: 0,entity_1_uid,entity_1_first_name,entity_1_last_name,entity_1_role,entity_1_context,entity_2_uid,entity_2_first_name,entity_2_last_name,entity_2_role,entity_2_context,first_name_similarity,last_name_similarity,role_similarity,first_name_length_diff,last_name_length_diff,context_similarity
0,6123425393,,dalton,verifying officer,mentioned as one of the officers who verified ...,7006194877,victoria,guidry,assisting officer,mentioned as providing assistance to officer d...,0.0,0.0,0.060627,1.0,0.0,0.27756
1,6123425393,,dalton,verifying officer,mentioned as one of the officers who verified ...,3613126420,carolyn,dalton,arresting officer,mentioned as one of the officers who arrested ...,0.0,1.0,0.127816,0.875,0.0,0.590726
2,6123425393,,dalton,verifying officer,mentioned as one of the officers who verified ...,2271130809,terry,bean,booking officer,mentioned as one of the officers who booked th...,0.0,0.611111,0.0,0.625,1.0,0.339507
3,6123425393,,dalton,verifying officer,mentioned as one of the officers who verified ...,3153431315,,dalton,verifying officer,mentioned as one of the officers who verified ...,0.0,1.0,1.0,0.0,0.0,0.863474
4,6123425393,,dalton,verifying officer,mentioned as one of the officers who verified ...,3337373700,victoria,guidry,assisting officer,mentioned as assisting officer dalton in verif...,0.0,0.0,0.060627,1.0,0.0,0.540643
5,6123425393,,dalton,verifying officer,mentioned as one of the officers who verified ...,8146016027,carolyn,dalton,arresting officer,mentioned as one of the officers who took the ...,0.0,1.0,0.127816,0.875,0.0,0.441523
6,6123425393,,dalton,verifying officer,mentioned as one of the officers who verified ...,2694730618,terry,bean,assisting officer,mentioned as accompanying officer carolyn dalt...,0.0,0.611111,0.060627,0.625,1.0,0.258215
7,6123425393,,dalton,verifying officer,mentioned as one of the officers who verified ...,8803301450,,dalton,verifying officer,verified the subject's outstanding warrant for...,0.0,1.0,1.0,0.0,0.0,0.909304
8,6123425393,,dalton,verifying officer,mentioned as one of the officers who verified ...,8404085376,victoria,guidry,assisting officer,assisted officer dalton in verifying the subje...,0.0,0.0,0.060627,1.0,0.0,0.424056
9,7006194877,victoria,guidry,assisting officer,mentioned as providing assistance to officer d...,3613126420,carolyn,dalton,arresting officer,mentioned as one of the officers who arrested ...,0.490079,0.0,0.355859,0.125,0.0,0.245926


In [10]:
# Write the pairwise feature table to CSV; index=False omits the row index column.
# NOTE(review): assumes ../data/output/ already exists — confirm before running.
df.to_csv("../data/output/output-pairwise-scales.csv", index=False)