In [1]:
import pandas as pd
from itertools import product

In [2]:
def read_csv(path="../../preprocessing/data/output/clean.csv"):
    """Load the cleaned records and replace missing values with empty strings.

    Args:
        path: Location of the CSV file to load. Defaults to the preprocessing
            output this notebook was written against, so existing zero-argument
            calls behave exactly as before.

    Returns:
        pandas.DataFrame: the file contents with every missing cell replaced
        by "".
    """
    df = pd.read_csv(path)
    # Empty strings instead of NaN let create_blocking_keys use plain
    # truthiness and len() checks on the text columns.
    return df.fillna("")

def create_blocking_keys(row, prefix_len=3, suffix_len=3):
    """Build the list of blocking keys for a single record.

    Keys are produced, in this order, from:
      * the first ``prefix_len`` and last ``suffix_len`` characters of
        ``first_name`` (each only when the name is at least that long),
      * the same prefix/suffix of ``last_name``,
      * the last three characters of ``officer_role`` (the whole role when it
        is shorter than three characters).

    All keys are lower-cased. Repeated keys are dropped, keeping the first
    occurrence, so a three-letter name such as "lee" contributes one key
    rather than two identical ones.

    Args:
        row: Mapping-like record exposing ``.get`` (a dict or a pandas
            Series) with optional ``first_name``, ``last_name`` and
            ``officer_role`` entries.
        prefix_len: Number of leading characters taken from each name.
        suffix_len: Number of trailing characters taken from each name.

    Returns:
        list[str]: the unique keys in first-seen order, or ``['missing']``
        when no field yields a key.
    """
    def _text(field):
        # A NaN, None or numeric cell would crash on len()/slicing below;
        # treat anything that is not a string as a missing value.
        value = row.get(field, '')
        return value if isinstance(value, str) else ''

    keys = []

    # Prefix and suffix keys for the first name, then for the last name.
    for name in (_text('first_name'), _text('last_name')):
        if not name:
            continue
        if len(name) >= prefix_len:
            keys.append(name[:prefix_len].lower())
        if len(name) >= suffix_len:
            keys.append(name[-suffix_len:].lower())

    # Role key: slicing with [-3:] already yields the entire string when the
    # role has fewer than three characters, so one branch covers both cases.
    role = _text('officer_role')
    if role:
        keys.append(role[-3:].lower())

    # Drop repeated keys; dict preserves insertion order, so the order of
    # first appearance is kept.
    keys = list(dict.fromkeys(keys))

    # Fallback for records without enough name or role information.
    if not keys:
        keys.append('missing')

    return keys

In [3]:
# Load the cleaned records (missing values already replaced by "").
df = read_csv()

# Attach one list of blocking keys per record; the function is applied
# row-wise with its default prefix/suffix lengths.
df['blocking_keys'] = df.apply(create_blocking_keys, axis=1)

df

Unnamed: 0,officer_context,officer_role,page_number,fn,query,prompt_template_for_hyde,prompt_template_for_model,chunk_size,chunk_overlap,temperature,...,num_of_queries,model,uid,officer_name,first_name,last_name,fc,lc,person_uid,blocking_keys
0,mentioned as one of the officers who verified ...,verifying officer,"[1, 1, 1]",Magistrate - Arrest Report.json,"Identify each individual in the transcript, by...",,,500,250,1,...,1,gpt-3.5-turbo-1603-finetuned-300-labels,612edb73,Dalton,,dalton,,dalto,6123425393,"[dal, ton, cer]"
1,mentioned as providing assistance to officer d...,assisting officer,"[1, 1, 1]",Magistrate - Arrest Report.json,"Identify each individual in the transcript, by...",,,500,250,1,...,1,gpt-3.5-turbo-1603-finetuned-300-labels,612edb73,Victoria Guidry,victoria,guidry,victo,guidr,7006194877,"[vic, ria, gui, dry, cer]"
2,mentioned as one of the officers who arrested ...,arresting officer,"[1, 1, 1]",Magistrate - Arrest Report.json,"Identify each individual in the transcript, by...",,,500,250,1,...,1,gpt-3.5-turbo-1603-finetuned-300-labels,612edb73,Carolyn Dalton,carolyn,dalton,carol,dalto,3613126420,"[car, lyn, dal, ton, cer]"
3,mentioned as one of the officers who booked th...,booking officer,"[1, 1, 1]",Magistrate - Arrest Report.json,"Identify each individual in the transcript, by...",,,500,250,1,...,1,gpt-3.5-turbo-1603-finetuned-300-labels,612edb73,Terry Bean,terry,bean,terry,bean,2271130809,"[ter, rry, bea, ean, cer]"
4,mentioned as one of the officers who verified ...,verifying officer,"[1, 1, 1]",Magistrate - Arrest Report.json,"Identify each individual in the transcript, by...",,,500,250,1,...,1,gpt-3.5-turbo-1603-finetuned-300-labels,612edb73,Dalton,,dalton,,dalto,3153431315,"[dal, ton, cer]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5102,asked if he was the one who arrived to transpo...,investigating officer,"[2, 9, 113, 50, 52, 33, 30, 112, 170, 2, 98, 1...",Seward - Suppression Hearing Transcript.json,"Identify each individual in the transcript, by...",,,500,250,1,...,1,gpt-3.5-turbo-1603-finetuned-300-labels,2a8d20df,Hoyt,,hoyt,,hoyt,6353150479,"[hoy, oyt, cer]"
5103,referred to as the one the witness wanted to b...,investigating officer,"[2, 9, 113, 50, 52, 33, 30, 112, 170, 2, 98, 1...",Seward - Suppression Hearing Transcript.json,"Identify each individual in the transcript, by...",,,500,250,1,...,1,gpt-3.5-turbo-1603-finetuned-300-labels,2a8d20df,Dillman,,dillman,,dillm,8327113279,"[dil, man, cer]"
5104,involved in taking a statement and typed the s...,sergeant,"[2, 9, 113, 50, 52, 33, 30, 112, 170, 2, 98, 1...",Seward - Suppression Hearing Transcript.json,"Identify each individual in the transcript, by...",,,500,250,1,...,1,gpt-3.5-turbo-1603-finetuned-300-labels,2a8d20df,London,,london,,londo,8822828103,"[lon, don, ant]"
5105,mentioned as being with other officers during ...,investigating officer,"[2, 9, 113, 50, 52, 33, 30, 112, 170, 2, 98, 1...",Seward - Suppression Hearing Transcript.json,"Identify each individual in the transcript, by...",,,500,250,1,...,1,gpt-3.5-turbo-1603-finetuned-300-labels,2a8d20df,Dantagnan,,dantagnan,,danta,5489285986,"[dan, nan, cer]"


In [4]:
# Persist the records together with their blocking keys; the index is omitted.
df.to_csv(path_or_buf="../data/output/blocks.csv", index=False)