In [9]:
import pandas as pd
import pickle
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from transformers import BertTokenizer, BertModel
import nltk
from itertools import combinations
from sentence_transformers import SentenceTransformer
import jellyfish

In [10]:
def read_csv():
    df = pd.read_csv("../../preprocessing/data/output/output-clean.csv")
    return df 

In [11]:
nltk.download('punkt')  

stemmer = PorterStemmer()

def stem_tokens(tokens):
    return [stemmer.stem(token) for token in tokens]

def tokenize_and_stem(context):
    tokens = word_tokenize(context)  
    stemmed_tokens = stem_tokens(tokens) 
    return stemmed_tokens

def calculate_string_similarity(s1, s2):
    return jellyfish.jaro_winkler_similarity(s1, s2)

def calculate_context_similarity(contexts1, contexts2):
    model = SentenceTransformer('all-MiniLM-L6-v2')
    
    embeddings1 = model.encode(contexts1, convert_to_tensor=True)
    embeddings2 = model.encode(contexts2, convert_to_tensor=True)
    
    embeddings1_np = embeddings1.cpu().numpy()
    embeddings2_np = embeddings2.cpu().numpy()
    
    cosine_similarities = np.diag(cosine_similarity(embeddings1_np, embeddings2_np)).tolist()
    
    return cosine_similarities

def split_entities(df):
    entity_1_cols = [col for col in df.columns if 'entity_1' in col] + ['entity_1_uid']
    entity_2_cols = [col for col in df.columns if 'entity_2' in col] + ['entity_2_uid']
    
    entity_1_df = df[entity_1_cols].copy()
    entity_2_df = df[entity_2_cols].copy()
    
    entity_1_df.columns = [col.replace('entity_1_', '') for col in entity_1_cols]
    entity_2_df.columns = [col.replace('entity_2_', '') for col in entity_2_cols]
    
    combined_df = pd.concat([entity_1_df, entity_2_df], ignore_index=True).drop_duplicates().reset_index(drop=True)
    return combined_df

def generate_pairwise_comparisons(df):
    df = df.fillna("")
    unique_entities = df['uid'].unique()
    pairs = list(combinations(unique_entities, 2))
    comparison_data = []

    for pair in pairs:
        entity_1, entity_2 = pair
        
        entity_1_row = df[df['uid'] == entity_1].iloc[0]
        entity_2_row = df[df['uid'] == entity_2].iloc[0]
        
        context_similarity = calculate_context_similarity([entity_1_row['officer_context']], [entity_2_row['officer_context']])[0]
        
        features = {
            'entity_1_uid': entity_1,
            'entity_1_first_name': entity_1_row['first_name'],
            'entity_1_last_name': entity_1_row['last_name'],
            'entity_1_role': entity_1_row['officer_role'],
            'entity_1_context': entity_1_row['officer_context'],
            'entity_2_uid': entity_2,
            'entity_2_first_name': entity_2_row['first_name'],
            'entity_2_last_name': entity_2_row['last_name'],
            'entity_2_role': entity_2_row['officer_role'],
            'entity_2_context': entity_2_row['officer_context'],
            'first_name_similarity': calculate_string_similarity(entity_1_row['first_name'], entity_2_row['first_name']),
            'last_name_similarity': calculate_string_similarity(entity_1_row['last_name'], entity_2_row['last_name']),
            'role_similarity': calculate_string_similarity(entity_1_row['officer_role'], entity_2_row['officer_role']),
            'first_name_length_diff': abs(len(entity_1_row['first_name']) - len(entity_2_row['first_name'])),
            'last_name_length_diff': abs(len(entity_1_row['last_name']) - len(entity_2_row['last_name'])),
            'context_similarity': context_similarity,
        }
        
        comparison_data.append(features)
    
    comparison_df = pd.DataFrame(comparison_data)
    return comparison_df


def generate_uid(row, desired_length=10):
    raw_uid = abs(hash(f"{row['first_name']}{row['last_name']}{row['officer_role']}{row['officer_context']}"))
    raw_uid_str = str(raw_uid)
    if len(raw_uid_str) > desired_length:
        uid_str = raw_uid_str[:desired_length]
    else:
        uid_str = raw_uid_str.zfill(desired_length)
    
    return int(uid_str)


[nltk_data] Downloading package punkt to /Users/ayyub/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
df = read_csv()

df = df.rename(columns={"uid": "doc_uid"})
df = df[["first_name", "last_name", "officer_role", "officer_context"]]

df['uid'] = df.apply(lambda row: generate_uid(row, 10), axis=1)

df = df.iloc[:5]

df = df.pipe(generate_pairwise_comparisons)
df

Unnamed: 0,entity_1_uid,entity_1_first_name,entity_1_last_name,entity_1_role,entity_1_context,entity_2_uid,entity_2_first_name,entity_2_last_name,entity_2_role,entity_2_context,first_name_similarity,last_name_similarity,role_similarity,first_name_length_diff,last_name_length_diff,context_similarity
0,8257682566,,dalton,verifying officer,Mentioned as one of the officers who verified ...,6082939390,victoria,guidry,assisting officer,Mentioned as providing assistance to Officer D...,0.0,0.0,0.720588,8,0,0.52549
1,8257682566,,dalton,verifying officer,Mentioned as one of the officers who verified ...,7858057238,carolyn,dalton,arresting officer,Mentioned as one of the officers who arrested ...,0.0,1.0,0.740573,7,0,0.69113
2,8257682566,,dalton,verifying officer,Mentioned as one of the officers who verified ...,6922728269,terry,bean,booking officer,Mentioned as one of the officers who booked th...,0.0,0.611111,0.702555,5,2,0.558255
3,8257682566,,dalton,verifying officer,Mentioned as one of the officers who verified ...,3362754676,,dalton,verifying officer,Mentioned as one of the officers who verified ...,0.0,1.0,1.0,0,0,0.835393
4,6082939390,victoria,guidry,assisting officer,Mentioned as providing assistance to Officer D...,7858057238,carolyn,dalton,arresting officer,Mentioned as one of the officers who arrested ...,0.490079,0.0,0.808403,1,0,0.508758
5,6082939390,victoria,guidry,assisting officer,Mentioned as providing assistance to Officer D...,6922728269,terry,bean,booking officer,Mentioned as one of the officers who booked th...,0.55,0.0,0.702555,3,2,0.452902
6,6082939390,victoria,guidry,assisting officer,Mentioned as providing assistance to Officer D...,3362754676,,dalton,verifying officer,Mentioned as one of the officers who verified ...,0.0,0.0,0.720588,8,0,0.630117
7,7858057238,carolyn,dalton,arresting officer,Mentioned as one of the officers who arrested ...,6922728269,terry,bean,booking officer,Mentioned as one of the officers who booked th...,0.561905,0.611111,0.793464,2,2,0.611255
8,7858057238,carolyn,dalton,arresting officer,Mentioned as one of the officers who arrested ...,3362754676,,dalton,verifying officer,Mentioned as one of the officers who verified ...,0.0,1.0,0.740573,7,0,0.619506
9,6922728269,terry,bean,booking officer,Mentioned as one of the officers who booked th...,3362754676,,dalton,verifying officer,Mentioned as one of the officers who verified ...,0.0,0.611111,0.702555,5,2,0.607873


In [None]:
df.to_csv("../data/output/output-pairwise.csv", index=False)