In [8]:
import pandas as pd
import difflib
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from transformers import BertTokenizer, BertModel
import torch

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import jaccard_score
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import nltk
from itertools import combinations
import pickle


def create_blocking_keys(df, first_name_col, last_name_col):
    """
    Attach name-prefix blocking keys to a dataframe.

    The lower-cased first three characters of each name (or the whole name
    when it is shorter) are written to 'first_name_key' and 'last_name_key',
    and the two are joined with an underscore into 'blocking_key'. The columns
    are added to the dataframe that was passed in, which is also the object
    that is returned.

    Args:
    df (pd.DataFrame): DataFrame containing the relevant columns.
    first_name_col (str): Column name for the first name.
    last_name_col (str): Column name for the last name.

    Returns:
    pd.DataFrame: The same DataFrame with 'first_name_key', 'last_name_key'
    and 'blocking_key' columns added.
    """
    prefix_sources = (
        ('first_name_key', first_name_col),
        ('last_name_key', last_name_col),
    )
    for key_col, source_col in prefix_sources:
        lowered = df[source_col].str.lower()
        # Slicing past the end of a short string simply yields the whole string.
        df[key_col] = lowered.str[:3]

    # Composite key: "<first prefix>_<last prefix>"
    df['blocking_key'] = df['first_name_key'] + '_' + df['last_name_key']
    return df


def generate_pairs(df, blocking_key_col):
    """
    List every unordered pair of row labels that share a blocking key.

    Args:
    df (pd.DataFrame): DataFrame with a blocking key column.
    blocking_key_col (str): The name of the column containing the blocking keys.

    Returns:
    List of tuples: Each tuple holds the index labels of two records that fall
    in the same block. Blocks are visited in groupby order and a block with a
    single record contributes no pairs.
    """
    return [
        pair
        for _, block in df.groupby(blocking_key_col)
        for pair in combinations(block.index, 2)
    ]

def create_comparison_dataframe(df, pairs_to_compare):
    """
    Create a new dataframe where each row represents a pair of records,
    with columns formatted like the training data.

    For each pair, the 'first_name', 'last_name', 'officer_role' and
    'officer_context' values of both records are copied into
    'entity_1_*' / 'entity_2_*' columns ('officer_role' is stored as '*_rank'
    and 'officer_context' as '*_context').

    Args:
    df (pd.DataFrame): Original DataFrame containing the records.
    pairs_to_compare (List of tuples): Each tuple contains the index labels of
    two records to be compared.

    Returns:
    pd.DataFrame: DataFrame with each row representing a pair of records. The
    eight entity columns are always present, even when no pairs are supplied.
    """
    # (source column in df, suffix used in the comparison frame)
    field_map = (
        ('first_name', 'first_name'),
        ('last_name', 'last_name'),
        ('officer_role', 'rank'),
        ('officer_context', 'context'),
    )
    columns = [
        f'entity_{side}_{suffix}'
        for side in (1, 2)
        for _, suffix in field_map
    ]

    rows = []
    for index1, index2 in pairs_to_compare:
        row = {}
        for side, record in ((1, df.loc[index1]), (2, df.loc[index2])):
            for source_col, suffix in field_map:
                row[f'entity_{side}_{suffix}'] = record[source_col]
        rows.append(row)

    # Passing the column list keeps the schema stable when `rows` is empty,
    # so downstream code can still look the columns up by name.
    return pd.DataFrame(rows, columns=columns)


def calculate_context_similarity(contexts1, contexts2, tokenizer, model):
    """
    Score aligned pairs of contexts by the cosine similarity of their
    mean-pooled model embeddings.

    Args:
    contexts1: First collection of context strings.
    contexts2: Second collection of context strings; item i is compared with
    item i of contexts1.
    tokenizer: Callable invoked with padding=True, truncation=True and
    return_tensors='pt'; its result is unpacked into the model call.
    model: Callable whose output exposes `last_hidden_state`.

    Returns:
    list: One cosine similarity per position in contexts1.
    """
    # Each side is tokenized as one padded batch.
    batches = [
        tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
        for texts in (contexts1, contexts2)
    ]

    # Inference only: no gradients are tracked for the forward passes.
    with torch.no_grad():
        hidden_states = [model(**batch).last_hidden_state for batch in batches]

    # Average over dim=1 to get one vector per context.
    # NOTE(review): the mean runs over every position in the padded batch, so
    # padding positions presumably contribute to the average - confirm this is
    # intended before changing batch composition.
    left_vectors, right_vectors = (
        states.mean(dim=1).detach().cpu().numpy() for states in hidden_states
    )

    similarities = []
    for row in range(len(contexts1)):
        left = left_vectors[row].reshape(1, -1)
        right = right_vectors[row].reshape(1, -1)
        similarities.append(cosine_similarity(left, right)[0][0])

    return similarities

# Download the NLTK 'punkt' resource when this module is executed
# (presumably required by word_tokenize - confirm against the NLTK version in use).
nltk.download('punkt') 

# Module-level Porter stemmer instance, reused by stem_tokens.
stemmer = PorterStemmer()

def stem_tokens(tokens):
    """Return a list holding the module-level stemmer's stem of each token, in order."""
    return list(map(stemmer.stem, tokens))

def tokenize_and_stem(context):
    """Split `context` into tokens with word_tokenize and return their stems as a list."""
    return stem_tokens(word_tokenize(context))

def calculate_stemmed_shared_tokens(context1, context2):
    """Count the distinct stemmed tokens that occur in both contexts."""
    left_stems, right_stems = (
        set(tokenize_and_stem(text)) for text in (context1, context2)
    )
    return len(left_stems & right_stems)

def calculate_stemmed_jaccard_similarity(context1, context2):
    """
    Jaccard similarity between the sets of stemmed tokens of two contexts.

    Args:
    context1 (str): First context.
    context2 (str): Second context.

    Returns:
    float: |intersection| / |union| of the two stemmed-token sets. When both
    sets are empty the result is 1.0 (two empty sets are identical); when only
    one of them is empty the result is 0.0.
    """
    tokens1 = set(tokenize_and_stem(context1))
    tokens2 = set(tokenize_and_stem(context2))
    union = tokens1 | tokens2
    if not union:
        # Only the both-empty case needs a special value (it would be 0/0).
        # A single empty side falls through to 0 / len(union) == 0.0 instead
        # of being reported as a perfect match.
        return 1.0
    return len(tokens1 & tokens2) / len(union)

def string_similarity(s1, s2):
    """Return the difflib SequenceMatcher ratio (0.0 to 1.0) between s1 and s2."""
    matcher = difflib.SequenceMatcher(a=s1, b=s2)
    return matcher.ratio()


def feature_engineering(df, first_name_col_1, last_name_col_1, rank_col_1, context_col_1,
                        first_name_col_2, last_name_col_2, rank_col_2, context_col_2):
    """
    Add pairwise similarity features to a comparison dataframe.

    Adds three columns:
    'first_name_similarity' and 'last_name_similarity' (string_similarity of
    the paired name columns) and 'context_similarity' (output of
    calculate_context_similarity using the 'bert-base-uncased' tokenizer and
    model).

    Args:
    df (pd.DataFrame): One row per record pair.
    first_name_col_1, last_name_col_1, context_col_1 (str): Columns of the
    first entity.
    first_name_col_2, last_name_col_2, context_col_2 (str): Columns of the
    second entity.
    rank_col_1, rank_col_2 (str): Accepted but not used by this function.

    Returns:
    pd.DataFrame: A new DataFrame (missing values replaced by "") with the
    three similarity columns added. An empty input yields an empty frame that
    still has the three columns, and no model is loaded.
    """
    # fillna returns a new frame; every missing value in every column becomes
    # "", so no per-column fill is needed afterwards.
    df = df.fillna("")

    name_features = (
        ('first_name_similarity', first_name_col_1, first_name_col_2),
        ('last_name_similarity', last_name_col_1, last_name_col_2),
    )
    for feature, left_col, right_col in name_features:
        scores = [
            string_similarity(left, right)
            for left, right in zip(df[left_col], df[right_col])
        ]
        # Assign positionally as a float array; this also works for zero rows,
        # where a row-wise DataFrame.apply does not return a Series.
        df[feature] = np.asarray(scores, dtype=float)

    if len(df) == 0:
        # Nothing to embed: skip loading the model and keep the schema stable.
        df['context_similarity'] = np.empty(0, dtype=float)
        return df

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')

    context_similarities = calculate_context_similarity(
        df[context_col_1].tolist(), df[context_col_2].tolist(), tokenizer, model
    )
    df['context_similarity'] = context_similarities

    return df


def generate_blocking_keys(df):
    """
    Add pair-level blocking keys to a comparison dataframe.

    Reads 'entity_1_first_name', 'entity_2_first_name', 'entity_1_last_name',
    'entity_2_last_name' and 'context_similarity', and adds:
    'last_name_blocking_key' / 'first_name_blocking_key' (lower-cased
    three-character prefixes of both entities joined by '_'),
    'context_similar' ('context_similar' when context_similarity > 0.7, else
    'context_not_similar') and 'combined_blocking_key' (first-name key,
    last-name key and context label joined by '_').

    Args:
    df (pd.DataFrame): Comparison dataframe; it is modified in place
    (missing values are replaced by '' and the key columns are added).

    Returns:
    pd.DataFrame: The same DataFrame that was passed in.
    """
    df.fillna('', inplace=True)

    def name_prefix(column):
        # Lower-cased first three characters (whole value when shorter).
        return df[column].str.lower().str[:3]

    df['last_name_blocking_key'] = (
        name_prefix('entity_1_last_name') + '_' + name_prefix('entity_2_last_name')
    )
    df['first_name_blocking_key'] = (
        name_prefix('entity_1_first_name') + '_' + name_prefix('entity_2_first_name')
    )

    df['context_similar'] = np.where(
        df['context_similarity'].astype(float) > 0.7,
        'context_similar',
        'context_not_similar',
    )

    # Column-wise concatenation gives the same strings as joining row by row,
    # and also works when the frame has no rows.
    df['combined_blocking_key'] = (
        df['first_name_blocking_key']
        + '_' + df['last_name_blocking_key']
        + '_' + df['context_similar']
    )

    return df

def yield_record_pairs(df):
    """
    Lazily yield every unordered pair of index labels whose rows share the
    same 'combined_blocking_key' value.

    Groups with fewer than two rows produce no pairs.
    """
    for _, block in df.groupby('combined_blocking_key'):
        # combinations() is already empty for a block with fewer than two rows.
        yield from combinations(block.index, 2)

def prepare_features_for_model(df, record_pairs):
    """
    Build the feature matrix passed to the trained model.

    Args:
    df (pd.DataFrame): Frame containing 'first_name_similarity',
    'last_name_similarity' and 'context_similarity' columns.
    record_pairs (iterable of tuples): Pairs of index labels into df.

    Returns:
    np.ndarray: Float array of shape (number of pairs, 3), one row per pair in
    the order given. With no pairs the shape is (0, 3).

    NOTE(review): the features of each pair are taken from the FIRST label of
    the pair only; the second label does not influence the feature vector.
    This matches the previous behaviour - confirm it is what the model was
    trained on.
    """
    feature_columns = [
        'first_name_similarity',
        'last_name_similarity',
        'context_similarity',
    ]
    first_labels = [pair[0] for pair in record_pairs]
    if not first_labels:
        # Keep the result two-dimensional so callers always get (n, 3).
        return np.empty((0, len(feature_columns)), dtype=float)

    # One label-based selection instead of two row lookups per pair.
    return df.loc[first_labels, feature_columns].to_numpy(dtype=float)

def generate_candidate_pairs(df, trained_model):
    """
    Run the trained model over all blocked record pairs and keep the matches.

    Args:
    df (pd.DataFrame): Frame with a 'combined_blocking_key' column and the
    similarity feature columns read by prepare_features_for_model.
    trained_model: Object with a `predict(X)` method returning one label per
    row of X.

    Returns:
    pd.DataFrame: Columns 'RecordID_1', 'RecordID_2' and 'predicted_match',
    one row per pair whose prediction equals 1. Empty when there are no pairs
    or no predicted matches.
    """
    record_pairs = list(yield_record_pairs(df))

    if not record_pairs:
        # No candidate pairs: skip prediction rather than handing the model an
        # empty feature matrix, and return the usual (empty) result schema.
        return pd.DataFrame({
            'RecordID_1': [],
            'RecordID_2': [],
            'predicted_match': [],
        })

    X_test = prepare_features_for_model(df, record_pairs)
    predictions = trained_model.predict(X_test)

    likely_matches = [
        pair
        for pair, prediction in zip(record_pairs, predictions)
        if prediction == 1
    ]
    candidate_pairs = pd.DataFrame({
        'RecordID_1': [first for first, _ in likely_matches],
        'RecordID_2': [second for _, second in likely_matches],
        'predicted_match': [1] * len(likely_matches),
    })

    return candidate_pairs


# --- Pipeline driver -------------------------------------------------------
# Load the cleaned records from a path relative to the working directory.
df = pd.read_csv("../../ts-cluster/data/output/output-clean.csv")

# Adds 'first_name_key', 'last_name_key' and 'blocking_key' to `df` in place;
# `df_with_blocking_keys` is the same object as `df`.
df_with_blocking_keys = create_blocking_keys(df, 'first_name', 'last_name')

# All within-block pairs of row labels.
pairs_to_compare = generate_pairs(df_with_blocking_keys, 'blocking_key')

# One row per pair, using the entity_1_* / entity_2_* column layout.
comparison_df = create_comparison_dataframe(df_with_blocking_keys, pairs_to_compare)

# Drop identical pair rows, then keep only the first 500 rows.
comparison_df = comparison_df.drop_duplicates()
comparison_df = comparison_df.iloc[:500]
# Bare expression: has no effect when run as a script (presumably left over
# from displaying the frame in a notebook).
comparison_df

# Adds first/last name and context similarity columns.
comparison_df = feature_engineering(comparison_df, 'entity_1_first_name', 'entity_1_last_name', 'entity_1_rank', 'entity_1_context',
                         'entity_2_first_name', 'entity_2_last_name', 'entity_2_rank', 'entity_2_context')

# Adds the pair-level blocking key columns, including 'combined_blocking_key'.
comparison_df = generate_blocking_keys(comparison_df)


# NOTE(security): pickle.load can execute arbitrary code contained in the
# file - only load model files from a trusted source.
with open('../../ts-train-model/data/output/trained_blocking_model.pkl', 'rb') as f:
    trained_model = pickle.load(f)

candidate_pairs = generate_candidate_pairs(comparison_df, trained_model)
# Bare expression: the distinct values of 'predicted_match' (not stored or printed).
candidate_pairs.predicted_match.unique()

# Disabled reporting / export code, kept for reference:
# initial_candidates_count = len(pairs_to_compare)
# print(f"Initial number of candidate pairs for review: {initial_candidates_count}")

# candidates_after_model_count = len(candidate_pairs)
# print(f"Number of candidate pairs for review after model prediction: {candidates_after_model_count}")

# reduction_percentage = ((initial_candidates_count - candidates_after_model_count) / initial_candidates_count) * 100
# print(f"Reduction in candidate pairs due to model filtering: {reduction_percentage:.2f}%")

# candidate_pairs.to_csv('../data/output/candidate_pairs.csv', index=False)

[nltk_data] Downloading package punkt to /Users/ayyub/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


array([1])