In [60]:
import pandas as pd
from lib import clean_column_names
import re

In [61]:
def read_csv():
    df = pd.read_csv("../data/input/reports.csv")
    
    return df

In [62]:
def split_rows_with_multiple_officers(df):
    df = (
        df.drop("officer_name", axis=1)
        .join(
            df["officer_name"]
            .str.split("and", expand=True)
            .stack()
            .reset_index(level=1, drop=True)
            .rename("officer_name"),
            how="outer",
        )
        .reset_index(drop=True)
    )
    return df


def sanitize_officer_context(df):
    df.loc[:, "officer_context"] = df.officer_context.str.lower().str.strip()   
    return df


def filter_by_role(df):
    df.loc[:, "officer_role"] = df.officer_role.str.lower().str.strip()

    officer_roles = ['officer', 'ofc.', 'lieutenant', 'lt.', 'detective', 'det.', 
                    'sergeant', 'sgt.', 'captain', 'cpt.', 'chief', 'inspector', 
                    'deputy', 'marshal', 'corporal', 'cpl.', 'commander', 'cmdr.', 
                    'agent', 'patrolman', 'trooper', 'sheriff', "coroner", "dr.", "doctor",
                    "p/o", "police officer"]

    regex_pattern = '|'.join(officer_roles)
    regex_pattern = r'\b(?:' + regex_pattern + r')\b'
    regex_pattern = re.compile(regex_pattern, re.IGNORECASE)

    df = df[df['officer_role'].fillna("").str.contains(regex_pattern)]

    df.loc[:, "officer_role"] = (df
                                .officer_role.str.replace(r"^n/a(.+)", "", regex=True)
                                .str.replace(r"(.+)?presumed(.+)", "", regex=True)
                                .str.replace(r"(.+)?(not explicitly mentioned as a law enforcement officer|not a law enforcement officer)(.+)?", "", regex=True)
                                .str.replace(r"(.+)?(suspect|civilian|unclear from the context|role not specified)(.+)?", "", regex=True)
    )

    df = df[~((df.officer_role == ""))]
    return df 


def remove_titles(df):
    df.loc[:, "officer_name"] = df.officer_name.str.lower().str.strip().str.replace(r"^(sgt\.?|sergeant|lt\.?|lieutenant|cpl\.|corporal|reporting officer|detective|det\.|p\/o|officer|police officer|chief|chief of trials|police|coroner\'s|coroner\'s investigator|unit #\d+of|driver|cap|superintendent|not specified)", "", regex=True)
    return df 

def blocking_keys(df):
    df.loc[:, "fc"] = df.fillna("").first_name.map(lambda x: x[:3])
    df.loc[:, "lc"] = df.fillna("").last_name.map(lambda x: x[:3])
    df = df.drop_duplicates(subset=["first_name", "last_name", "officer_context", "officer_role"])

    full_names = df.first_name.str.cat(df.last_name, sep=" ")
    df = df[~((df.first_name == "") & (df.last_name == ""))]
    return df, full_names
def split_officer_name(name):
    if pd.isna(name) or not isinstance(name, str):
        return pd.Series({'first_name': '', 'middle_name': '', 'last_name': '', 'suffix': ''})
    
    name = name.strip().rstrip('.,')
    
    suffixes = {'JR', 'SR', 'II', 'III', 'IV', 'V'}
    
    suffix = ''
    for s in suffixes:
        if name.upper().endswith(' ' + s) or name.upper().endswith(',' + s):
            suffix = s
            name = name[:-(len(s)+1)].strip().rstrip(',')
            break
    
    parts = re.split(r',\s*|\s+', name)
    
    if ',' in name:  
        last_name = parts[0]
        first_name = parts[1] if len(parts) > 1 else ''
        middle_name = ' '.join(parts[2:]) or ''
    elif name.isupper():  
        last_name = parts[0]
        first_name = parts[1] if len(parts) > 1 else ''
        middle_name = ' '.join(parts[2:]) or ''
    else:  
        first_name = parts[0]
        last_name = parts[-1]
        middle_name = ' '.join(parts[1:-1]) or ''
    
    return pd.Series({'first_name': first_name, 'middle_name': middle_name, 'last_name': last_name, 'suffix': suffix})

def generate_uid(row, desired_length=10):
    raw_uid = abs(hash(f"{row['first_name']}{row['last_name']}{row['officer_role']}{row['officer_context']}"))
    raw_uid_str = str(raw_uid)
    if len(raw_uid_str) > desired_length:
        uid_str = raw_uid_str[:desired_length]
    else:
        uid_str = raw_uid_str.zfill(desired_length)
    
    return int(uid_str)


In [63]:

df = read_csv()

df = df.pipe(clean_column_names).pipe(split_rows_with_multiple_officers).pipe(filter_by_role).pipe(sanitize_officer_context).pipe(remove_titles)
 
name_parts = df['officer_name'].apply(split_officer_name)

df[['first_name', 'middle_name', 'last_name', 'suffix']] = name_parts

df, full_names = df.pipe(blocking_keys)

df.officer_name.unique()

array(['fred k austin', 'fred k. austin', 'austin', ..., 'mcmillian',
       'michelle watson', 'a polk'], dtype=object)

In [64]:
df

Unnamed: 0,officer_title,officer_context,officer_role,page_number,fn,query,chunk_size,chunk_overlap,temperature,k,...,num_of_queries,model,uid,officer_name,first_name,middle_name,last_name,suffix,fc,lc
0,,mentioned as a police officer iv in the 1997 a...,"police officer iv, police sergeant","[36, 32, 56, 38, 60, 60, 58, 60, 8, 2, 44, 62,...",09091300.json,"Identify each individual in the transcript, by...",500,250,1,20,...,1,gpt-3.5-turbo-0125,7d5abe38,fred k austin,fred,k,austin,,fre,aus
1,,disciplined for false or inaccurate reports in...,police officer,"[36, 32, 56, 38, 60, 60, 58, 60, 8, 2, 44, 62,...",09091300.json,"Identify each individual in the transcript, by...",500,250,1,20,...,1,gpt-3.5-turbo-0125,7d5abe38,fred k austin,fred,k,austin,,fre,aus
2,,evaluation notes mention his knowledge of the ...,police lieutenant,"[36, 32, 56, 38, 60, 60, 58, 60, 8, 2, 44, 62,...",09091300.json,"Identify each individual in the transcript, by...",500,250,1,20,...,1,gpt-3.5-turbo-0125,7d5abe38,fred k austin,fred,k,austin,,fre,aus
3,,evaluation notes highlight his broad job knowl...,police officer iii,"[36, 32, 56, 38, 60, 60, 58, 60, 8, 2, 44, 62,...",09091300.json,"Identify each individual in the transcript, by...",500,250,1,20,...,1,gpt-3.5-turbo-0125,7d5abe38,fred k austin,fred,k,austin,,fre,aus
4,,evaluation notes emphasize his exemplary leade...,police lieutenant,"[36, 32, 56, 38, 60, 60, 58, 60, 8, 2, 44, 62,...",09091300.json,"Identify each individual in the transcript, by...",500,250,1,20,...,1,gpt-3.5-turbo-0125,7d5abe38,fred k austin,fred,k,austin,,fre,aus
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51577,,witnessed evidence collection and testing proc...,deputy clerk of court,"[206, 30, 224, 1, 186, 226, 186, 224, 172, 186...",Reliagene Case # F-31598.json,"Identify each individual in the transcript, by...",500,250,1,20,...,1,gpt-3.5-turbo-0125,98e16e00,michelle watson,michelle,,watson,,mic,wat
51579,,contact person for evidence pickup in the refe...,deputy clerk of court,"[206, 2, 208, 18, 172, 12, 21, 121, 201, 186, ...",Reliagene Case # F-31598.json,"Identify each individual in the transcript, by...",500,250,1,20,...,1,gpt-3.5-turbo-0125,98e16e00,t,t,,t,,t,t
51580,,contact person for evidence pickup in the refe...,deputy clerk of court,"[206, 2, 208, 18, 172, 12, 21, 121, 201, 186, ...",Reliagene Case # F-31598.json,"Identify each individual in the transcript, by...",500,250,1,20,...,1,gpt-3.5-turbo-0125,98e16e00,a polk,a,,polk,,a,pol
51584,,identified as a witness for the items of physi...,deputy clerk of court,"[174, 186, 178, 195, 12, 228, 173, 168, 1, 172...",Reliagene Case # F-31598.json,"Identify each individual in the transcript, by...",500,250,1,20,...,1,gpt-3.5-turbo-0125,98e16e00,michelle watson,michelle,,watson,,mic,wat


In [65]:
df['person_uid'] = df.apply(lambda row: generate_uid(row, 10), axis=1)


In [None]:
df.to_csv("../data/output/clean.csv", index=False) 