In [16]:
import pandas as pd
import difflib
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel
import torch
from sklearn.preprocessing import MinMaxScaler
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import nltk
import hashlib
import torch  
from sentence_transformers import SentenceTransformer
import numpy as np
import jellyfish

# Shared Porter stemmer instance used by stem_tokens().
stemmer = PorterStemmer()

# Fetch the 'punkt' tokenizer data that nltk's word_tokenize relies on.
nltk.download('punkt')

def calculate_context_similarity(contexts1, contexts2, model=None):
    """Cosine similarity between same-position pairs of context strings.

    The i-th entry of the result compares ``contexts1[i]`` with
    ``contexts2[i]``. Only those N pairs are computed; the full N x N
    similarity matrix is never materialised.

    Args:
        contexts1: Sequence of texts for the first entity of each pair.
        contexts2: Sequence of texts for the second entity of each pair.
        model: Optional object exposing ``encode(list_of_texts, ...)`` that
            returns one embedding row per text. When omitted, a
            ``SentenceTransformer('all-MiniLM-L6-v2')`` is created, so pass a
            pre-loaded model to avoid reloading it on repeated calls.

    Returns:
        list[float]: One cosine similarity per pair. If the inputs differ in
        length, only the first ``min(len(contexts1), len(contexts2))`` pairs
        are scored. Empty input yields ``[]``. A pair involving an all-zero
        embedding scores 0.0.
    """
    texts1 = list(contexts1)
    texts2 = list(contexts2)
    pair_count = min(len(texts1), len(texts2))
    if pair_count == 0:
        # Nothing to compare; skip loading the model altogether.
        return []

    if model is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')

    embeddings1 = np.asarray(model.encode(texts1[:pair_count], convert_to_numpy=True))
    embeddings2 = np.asarray(model.encode(texts2[:pair_count], convert_to_numpy=True))

    # Row-wise dot product: the diagonal of embeddings1 @ embeddings2.T
    # without building the whole matrix.
    dot_products = np.einsum('ij,ij->i', embeddings1, embeddings2)

    norms1 = np.linalg.norm(embeddings1, axis=1)
    norms2 = np.linalg.norm(embeddings2, axis=1)
    # A zero vector has a zero dot product, so dividing by 1 keeps the score
    # at 0.0 instead of producing NaN.
    norms1 = np.where(norms1 == 0, 1.0, norms1)
    norms2 = np.where(norms2 == 0, 1.0, norms2)

    return (dot_products / (norms1 * norms2)).tolist()

def stem_tokens(tokens):
    """Return the stem of every token, in input order.

    Each token is passed through the module-level ``stemmer``.
    """
    stemmed = []
    for token in tokens:
        stemmed.append(stemmer.stem(token))
    return stemmed

def tokenize_and_stem(context):
    """Split ``context`` with ``word_tokenize`` and return the stemmed tokens."""
    return stem_tokens(word_tokenize(context))


def string_similarity(s1, s2):
    """Jaro-Winkler similarity between two values treated as text.

    Missing values (``None``, NaN, ``pd.NA``) are treated as the empty string
    and other non-string values are converted with ``str()``, so cells read
    from a DataFrame with gaps do not raise inside jellyfish.

    Args:
        s1: First value to compare.
        s2: Second value to compare.

    Returns:
        The score returned by ``jellyfish.jaro_winkler_similarity``.
    """
    def _as_text(value):
        if isinstance(value, str):
            return value
        if pd.isna(value):
            return ""
        return str(value)

    return jellyfish.jaro_winkler_similarity(_as_text(s1), _as_text(s2))

def feature_engineering(df, first_name_col_1, last_name_col_1, rank_col_1, context_col_1,
                        first_name_col_2, last_name_col_2, rank_col_2, context_col_2):
    """Add pairwise comparison features for two entities described on each row.

    The frame is modified in place and also returned. Columns added:
    ``first_name_similarity``, ``last_name_similarity``,
    ``first_name_length_diff``, ``last_name_length_diff``,
    ``role_similarity`` and ``context_similarity``. All six are then min-max
    scaled across the rows of ``df``.

    Missing values in any of the eight input columns are treated as empty
    strings when computing features. As before, only the two first-name
    columns are written back to ``df`` with NaN replaced by ``""``; the other
    input columns are left untouched.

    Args:
        df: DataFrame holding one entity pair per row.
        first_name_col_1, last_name_col_1, rank_col_1, context_col_1:
            Column names describing the first entity.
        first_name_col_2, last_name_col_2, rank_col_2, context_col_2:
            Column names describing the second entity.

    Returns:
        The same ``df`` object with the feature columns added.
    """
    df[first_name_col_1] = df[first_name_col_1].fillna("")
    df[first_name_col_2] = df[first_name_col_2].fillna("")

    def _text_values(column):
        # Missing cells become "" and anything else is coerced to str, so
        # len(), jellyfish and the sentence encoder all receive real strings.
        return ["" if pd.isna(value) else str(value) for value in df[column].tolist()]

    first_1, first_2 = _text_values(first_name_col_1), _text_values(first_name_col_2)
    last_1, last_2 = _text_values(last_name_col_1), _text_values(last_name_col_2)
    rank_1, rank_2 = _text_values(rank_col_1), _text_values(rank_col_2)
    context_1, context_2 = _text_values(context_col_1), _text_values(context_col_2)

    # Plain zips over column values instead of row-wise DataFrame.apply:
    # same results, far less per-row overhead, and well-defined on 0 rows.
    df['first_name_similarity'] = [string_similarity(a, b) for a, b in zip(first_1, first_2)]
    df['last_name_similarity'] = [string_similarity(a, b) for a, b in zip(last_1, last_2)]

    df['first_name_length_diff'] = [abs(len(a) - len(b)) for a, b in zip(first_1, first_2)]
    df['last_name_length_diff'] = [abs(len(a) - len(b)) for a, b in zip(last_1, last_2)]

    df['role_similarity'] = [string_similarity(a, b) for a, b in zip(rank_1, rank_2)]

    df['context_similarity'] = calculate_context_similarity(context_1, context_2)

    features_to_scale = [
        'first_name_length_diff', 'last_name_length_diff',
        'role_similarity',
        'context_similarity', 'first_name_similarity', 'last_name_similarity',
    ]

    # NOTE(review): the scaler is fit on this frame, so the scaled values
    # depend on which rows are present in df.
    if len(df) > 0:
        # MinMaxScaler cannot be fit on zero samples; skip scaling then.
        df[features_to_scale] = MinMaxScaler().fit_transform(df[features_to_scale])
    return df

def _uid_component(value):
    """Return ``value`` as text for hashing; missing values become ''."""
    if isinstance(value, str):
        return value
    if pd.isna(value):
        return ''
    return str(value)


def generate_entity_uids(df):
    """Add ``entity_1_uid`` and ``entity_2_uid`` columns to ``df`` in place.

    For each entity ``i`` in (1, 2) the UID is the SHA-256 hex digest of the
    entity's first name, last name, rank and context joined with ``'_'``,
    read from the ``entity_{i}_first_name``, ``entity_{i}_last_name``,
    ``entity_{i}_rank`` and ``entity_{i}_context`` columns.

    Missing values (NaN/None) contribute an empty string. Non-string values
    are converted with ``str()`` rather than raising. An empty frame simply
    gains two empty UID columns.

    Args:
        df: DataFrame containing the eight entity columns listed above.

    Returns:
        None. ``df`` is modified in place.
    """
    for i in range(1, 3):
        entity_cols = [f'entity_{i}_first_name', f'entity_{i}_last_name',
                       f'entity_{i}_rank', f'entity_{i}_context']
        column_values = [df[col].tolist() for col in entity_cols]
        # NOTE(review): '_' is also a legal character inside the fields, so
        # two different field splits can hash identically; the separator is
        # kept unchanged so previously generated UIDs stay stable.
        df[f'entity_{i}_uid'] = [
            hashlib.sha256('_'.join(_uid_component(v) for v in row).encode()).hexdigest()
            for row in zip(*column_values)
        ]

# Input file and the column names describing each entity of a pair,
# in the order feature_engineering() expects them.
TRAINING_SET_PATH = "../../ts-cluster/data/output/training-set-2-23-2024.csv"
ENTITY_1_COLUMNS = ['entity_1_first_name', 'entity_1_last_name', 'entity_1_rank', 'entity_1_context']
ENTITY_2_COLUMNS = ['entity_2_first_name', 'entity_2_last_name', 'entity_2_rank', 'entity_2_context']

# NOTE(review): the displayed frame carries an "Unnamed: 0" column and labels
# with mixed casing ("Easy match" / "easy match") -- confirm both are intended.
df = pd.read_csv(TRAINING_SET_PATH)

# Add the pairwise features, then the per-entity UID columns (added in place).
df = feature_engineering(df, *ENTITY_1_COLUMNS, *ENTITY_2_COLUMNS)
generate_entity_uids(df)

df

[nltk_data] Downloading package punkt to /Users/ayyub/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,entity_1_first_name,entity_1_last_name,entity_1_rank,entity_1_context,entity_2_first_name,entity_2_last_name,entity_2_rank,entity_2_context,label,first_name_similarity,last_name_similarity,first_name_length_diff,last_name_length_diff,role_similarity,context_similarity,entity_1_uid,entity_2_uid
0,,rodrigue,investigating detective,Directly involved in the investigation of the ...,,rodriguez,detective,Mentioned as someone the speaker went to the c...,Hard non-match,0.000000,0.977778,0.000000,0.04,0.321339,0.438451,5cdd9e50d8ed1da53888c2c4af908863a31c0a6a963766...,dd4c983730a24553301ee69e1efb32e00fd03839ab1673...
1,dewilliam,trepagnier #1504,detective,Mentioned in context with the search and inter...,dewilliam,trepagnier,detective,Mentioned in a context related to the crime la...,Easy match,1.000000,0.925000,0.000000,0.24,1.000000,0.635558,03e5fd476f72b8e364d8a7a363b53591bb2dd8c7a48818...,48313dc154307cb1cc114afc7b7084b2ecf73cbd4230c0...
2,dewilliam,trepagnier #1504,detective,Mentioned in context with the search and inter...,dewilliam,trepagnier #1504,detective,Mentioned in relation to conducting interviews...,Easy match,1.000000,1.000000,0.000000,0.00,1.000000,0.643929,03e5fd476f72b8e364d8a7a363b53591bb2dd8c7a48818...,fd9e8ef4a53913e12dc042aad6006403ecd4bad2a2d826...
3,,p.o.sauvage,police officer,Met the detectives upon their arrival at the 6...,,p.o.sauvage#1462,police officer,Met with DETs. SPONG and URSIN upon their arri...,Easy match,0.000000,0.937500,0.000000,0.20,1.000000,0.604374,42e23de48e76151773d60451b8094a79cc650f942dc949...,e02d391d65be180e49e9c775b7abae4ef6af138e595b85...
4,,p.o.sauvage#1462,police officer,Met the detectives upon their arrival at the l...,,p.o.sauvage#1462,police officer,p.o.sauvage#1462,easy match,0.000000,1.000000,0.000000,0.00,1.000000,0.172406,7a187eac837729b03b2a5d90d7a02b338d44de868b22d3...,8e714fedeb62269a202f871be2da2a92696ce303634bd3...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1140,debbie,coffee,detective,Mentioned as someone who was believed to be in...,louis,berard,detective,Testified as a Detective and Police Officer in...,easy non-match,0.455556,0.000000,0.111111,0.00,1.000000,0.467386,2243b7516581bae31e3289d311d5b734e33dc703e3aff7...,aa667086e0c5d4e2d5ae428eef9e55d33646832c7680f2...
1141,debbie,coffee,rape detective,Mentioned as part of the Rape Section.,louis,berard,detective (homicide division),Testified as a Police Officer from the Homicid...,easy non-match,0.455556,0.000000,0.111111,0.00,0.421360,0.283393,a48d982ff150801cd74f8a3f56c8095ce5f9896366a39a...,6590f2642b350b42e0acb0aa4348ddd6f6a8cc52987707...
1142,debbie,coffee,lead detective,Mentioned as the lead police officer in a case...,louis,berard,detective,Testified as a Detective and Police Officer in...,easy non-match,0.455556,0.000000,0.111111,0.00,0.613027,0.535369,9dc9bcbeebc54071fb6c2c21bd38e8e9faef6c5dc91972...,aa667086e0c5d4e2d5ae428eef9e55d33646832c7680f2...
1143,debbie,coffee,rape detective,Mentioned as working in the Rape Section.,louis,berard,homicide division detective,Mentioned as a Detective from the New Orleans ...,easy non-match,0.455556,0.000000,0.111111,0.00,0.344828,0.278920,c7493e4dd525b927017d2393fdeac805cff357e11dcd7e...,a56ec822462dda139e8642d3dfd1f70bdd5423073c254a...


In [None]:
# Write the feature table to CSV without the row index.
FEATURES_OUTPUT_PATH = "../data/output/features.csv"
df.to_csv(FEATURES_OUTPUT_PATH, index=False)