In [5]:
import pandas as pd
from itertools import product

In [6]:
def read_csv(path="../../preprocessing/data/output/clean.csv"):
    """Load a CSV of records and blank out missing values.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV to read. Defaults to the preprocessing output
        this notebook has always used, so calling ``read_csv()`` with no
        arguments behaves as before.

    Returns
    -------
    pandas.DataFrame
        The file's contents with every missing value replaced by ``""``,
        so text fields can be checked with a plain truthiness test.
    """
    df = pd.read_csv(path)
    # Empty strings (rather than NaN) keep later `if value:` checks simple.
    return df.fillna("")

def _as_text(value):
    """Return ``value`` as a string, mapping ``None`` and float NaN to ``''``."""
    if isinstance(value, str):
        return value
    # NaN is the only float that is not equal to itself.
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return str(value)


def _affix_keys(text, prefix_len, suffix_len):
    """Return the lower-cased prefix and suffix of ``text`` that it is long enough to supply."""
    found = []
    if 0 < prefix_len <= len(text):
        found.append(text[:prefix_len].lower())
    # The `0 <` guard matters: text[-0:] would be the whole string, not an empty suffix.
    if 0 < suffix_len <= len(text):
        found.append(text[-suffix_len:].lower())
    return found


def create_blocking_keys(row, prefix_len=3, suffix_len=3, role_suffix_len=3):
    """Build the list of blocking keys for one record.

    Keys are produced, in this order, from:

    1. the first ``prefix_len`` and last ``suffix_len`` characters of
       ``first_name``;
    2. the same prefix/suffix of ``last_name``;
    3. the last ``role_suffix_len`` characters of ``officer_role`` (or the
       whole role when it is shorter than that).

    All keys are lower-cased. A name shorter than the requested affix length
    contributes no key for that affix.

    Parameters
    ----------
    row : mapping-like
        Any object with a ``.get(key, default)`` method (a ``dict`` or a
        ``pandas.Series`` row) holding ``first_name``, ``last_name`` and
        ``officer_role``. Missing fields, ``None`` and NaN are treated as
        empty; other non-string values are converted with ``str``.
    prefix_len : int, optional
        Number of leading characters taken from each name (default 3).
    suffix_len : int, optional
        Number of trailing characters taken from each name (default 3).
    role_suffix_len : int, optional
        Number of trailing characters taken from the role (default 3).

    Returns
    -------
    list of str
        Unique keys in first-seen order, or ``['missing']`` when the record
        yields no key at all.

    Notes
    -----
    Name keys and role keys share one flat list with no field tag, so a role
    suffix can equal a name affix. NOTE(review): confirm the downstream
    blocking step is fine with that before relying on the role key.
    """
    first_name = _as_text(row.get('first_name', ''))
    last_name = _as_text(row.get('last_name', ''))
    officer_role = _as_text(row.get('officer_role', ''))

    keys = []
    keys.extend(_affix_keys(first_name, prefix_len, suffix_len))
    keys.extend(_affix_keys(last_name, prefix_len, suffix_len))

    if officer_role:
        if 0 < role_suffix_len <= len(officer_role):
            keys.append(officer_role[-role_suffix_len:].lower())
        else:
            # Role shorter than the requested suffix: use the entire role.
            keys.append(officer_role.lower())

    # Drop repeats (e.g. a 3-letter name whose prefix and suffix coincide)
    # while keeping first-seen order.
    keys = list(dict.fromkeys(keys))

    # Fallback for records without enough name or role information.
    if not keys:
        keys.append('missing')

    return keys

In [7]:
# Load the cleaned records, then compute each row's list of blocking keys.
df = read_csv()
df['blocking_keys'] = df.apply(create_blocking_keys, axis=1)

df

Unnamed: 0,officer_title,officer_context,officer_role,page_number,fn,query,chunk_size,chunk_overlap,temperature,k,...,uid,officer_name,first_name,middle_name,last_name,suffix,fc,lc,person_uid,blocking_keys
0,,mentioned as a police officer iv in the 1997 a...,"police officer iv, police sergeant","[36, 32, 56, 38, 60, 60, 58, 60, 8, 2, 44, 62,...",09091300.json,"Identify each individual in the transcript, by...",500,250,1,20,...,7d5abe38,fred k austin,fred,k,austin,,fre,aus,4563497773,"[fre, red, aus, tin, ant]"
1,,disciplined for false or inaccurate reports in...,police officer,"[36, 32, 56, 38, 60, 60, 58, 60, 8, 2, 44, 62,...",09091300.json,"Identify each individual in the transcript, by...",500,250,1,20,...,7d5abe38,fred k austin,fred,k,austin,,fre,aus,5468369957,"[fre, red, aus, tin, cer]"
2,,evaluation notes mention his knowledge of the ...,police lieutenant,"[36, 32, 56, 38, 60, 60, 58, 60, 8, 2, 44, 62,...",09091300.json,"Identify each individual in the transcript, by...",500,250,1,20,...,7d5abe38,fred k austin,fred,k,austin,,fre,aus,8766617654,"[fre, red, aus, tin, ant]"
3,,evaluation notes highlight his broad job knowl...,police officer iii,"[36, 32, 56, 38, 60, 60, 58, 60, 8, 2, 44, 62,...",09091300.json,"Identify each individual in the transcript, by...",500,250,1,20,...,7d5abe38,fred k austin,fred,k,austin,,fre,aus,9010557927,"[fre, red, aus, tin, iii]"
4,,evaluation notes emphasize his exemplary leade...,police lieutenant,"[36, 32, 56, 38, 60, 60, 58, 60, 8, 2, 44, 62,...",09091300.json,"Identify each individual in the transcript, by...",500,250,1,20,...,7d5abe38,fred k austin,fred,k,austin,,fre,aus,6474835402,"[fre, red, aus, tin, ant]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30348,,witnessed evidence collection and testing proc...,deputy clerk of court,"[206, 30, 224, 1, 186, 226, 186, 224, 172, 186...",Reliagene Case # F-31598.json,"Identify each individual in the transcript, by...",500,250,1,20,...,98e16e00,michelle watson,michelle,,watson,,mic,wat,8032774515,"[mic, lle, wat, son, urt]"
30349,,contact person for evidence pickup in the refe...,deputy clerk of court,"[206, 2, 208, 18, 172, 12, 21, 121, 201, 186, ...",Reliagene Case # F-31598.json,"Identify each individual in the transcript, by...",500,250,1,20,...,98e16e00,t,t,,t,,t,t,6682040467,[urt]
30350,,contact person for evidence pickup in the refe...,deputy clerk of court,"[206, 2, 208, 18, 172, 12, 21, 121, 201, 186, ...",Reliagene Case # F-31598.json,"Identify each individual in the transcript, by...",500,250,1,20,...,98e16e00,a polk,a,,polk,,a,pol,2394570516,"[pol, olk, urt]"
30351,,identified as a witness for the items of physi...,deputy clerk of court,"[174, 186, 178, 195, 12, 228, 173, 168, 1, 172...",Reliagene Case # F-31598.json,"Identify each individual in the transcript, by...",500,250,1,20,...,98e16e00,michelle watson,michelle,,watson,,mic,wat,2023713203,"[mic, lle, wat, son, urt]"


In [8]:
# Save the frame, including the blocking_keys column, without the row index.
# The list-valued blocking_keys cells are written as text, so a reader of
# this file has to parse them back into lists.
df.to_csv(path_or_buf="../data/output/clean-index-with-blocks.csv", index=False)