In [11]:
import hashlib
import re

import pandas as pd

from lib import clean_column_names

In [12]:
def read_csv(
    reports_path="../data/input/reports-new.csv",
    transcripts_path="../data/input/transcripts-new.csv",
):
    """Load the reports and transcripts CSVs and merge them into one frame.

    Report rows whose ``uid`` also occurs in the transcripts file are
    discarded, so for a shared ``uid`` only the transcript rows survive.

    Parameters
    ----------
    reports_path : str or path-like, optional
        Location of the reports CSV. Defaults to the original hard-coded path.
    transcripts_path : str or path-like, optional
        Location of the transcripts CSV. Defaults to the original hard-coded
        path.

    Returns
    -------
    pandas.DataFrame
        Remaining report rows followed by all transcript rows, with a fresh
        0..n-1 index.

    Raises
    ------
    KeyError
        If either file lacks a ``uid`` column.
    """
    reports = pd.read_csv(reports_path)
    transcripts = pd.read_csv(transcripts_path)

    # Transcripts win on conflict: drop report rows whose uid is also in transcripts.
    reports_only = reports[~reports["uid"].isin(transcripts["uid"])]

    return pd.concat([reports_only, transcripts], ignore_index=True)

In [13]:
def split_rows_with_multiple_officers(df):
    """Give every officer listed in ``officer_name`` a row of their own.

    A value such as ``"John Smith and Jane Doe"`` is expanded into two rows
    that share all other column values. The split only happens on the
    standalone word ``and`` surrounded by whitespace, so names that merely
    contain those letters ("Sandra", "Anderson") stay intact.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``officer_name`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with one officer per row and a fresh 0..n-1 index. Rows
        whose ``officer_name`` is missing are kept (outer join) with a
        missing name.
    """
    officer_names = (
        df["officer_name"]
        # Whitespace-delimited "and" only; a bare substring split would cut
        # through names like "Sandra Anderson".
        .str.split(r"\s+and\s+", regex=True, expand=True)
        .stack()
        # Rows with fewer officers than the widest row leave empty cells in
        # the expanded frame; drop them explicitly instead of relying on
        # stack() to discard them.
        .dropna()
        .reset_index(level=1, drop=True)
        .rename("officer_name")
    )

    return (
        df.drop("officer_name", axis=1)
        .join(officer_names, how="outer")
        .reset_index(drop=True)
    )


def sanitize_officer_context(df):
    """Lower-case and trim the ``officer_context`` column.

    The column is overwritten on the frame that was passed in, and that same
    frame is returned so the function can be used with ``DataFrame.pipe``.
    """
    normalized_context = df["officer_context"].str.lower().str.strip()
    df.loc[:, "officer_context"] = normalized_context
    return df


def filter_by_role(df):
    """Keep only rows whose ``officer_role`` mentions a known officer role.

    Steps:
      1. Lower-case and trim ``officer_role`` (written back onto the frame
         that was passed in).
      2. Keep rows whose role contains one of the role keywords below.
      3. Blank out roles that are flagged as n/a, presumed, not law
         enforcement, suspect, civilian or unspecified, then drop the rows
         whose role became empty.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``officer_role`` column; missing roles are treated as
        non-matching.

    Returns
    -------
    pandas.DataFrame
        Independent copy containing only the retained rows. The original
        index labels are preserved (not reset).
    """
    df.loc[:, "officer_role"] = df.officer_role.str.lower().str.strip()

    officer_roles = ['officer', 'ofc.', 'lieutenant', 'lt.', 'detective', 'det.',
                    'sergeant', 'sgt.', 'captain', 'cpt.', 'chief', 'inspector',
                    'deputy', 'marshal', 'corporal', 'cpl.', 'commander', 'cmdr.',
                    'agent', 'patrolman', 'trooper', 'sheriff', "coroner", "dr.", "doctor",
                    "p/o", "police officer"]

    # Escape every keyword so the "." in abbreviations is a literal dot rather
    # than a wildcard ("dr." must not match "dry"). A trailing \b is only
    # meaningful after a word character: after a literal "." it would require
    # a letter to follow and reject a bare "sgt." or "sgt. smith".
    alternatives = [
        re.escape(role) + (r"\b" if role[-1].isalnum() else "")
        for role in officer_roles
    ]
    role_pattern = r"\b(?:" + "|".join(alternatives) + r")"

    has_officer_role = (
        df["officer_role"]
        .fillna("")
        .str.contains(role_pattern, flags=re.IGNORECASE, regex=True)
    )
    # Copy so the cleanup below writes to its own frame instead of a slice of
    # the input (avoids SettingWithCopy ambiguity).
    df = df[has_officer_role].copy()

    # Each replacement wipes the matched text; a role that is fully consumed
    # becomes "" and is removed below.
    df.loc[:, "officer_role"] = (
        df.officer_role
        .str.replace(r"^n/a(.+)", "", regex=True)
        .str.replace(r"(.+)?presumed(.+)", "", regex=True)
        .str.replace(r"(.+)?(not explicitly mentioned as a law enforcement officer|not a law enforcement officer)(.+)?", "", regex=True)
        .str.replace(r"(.+)?(suspect|civilian|unclear from the context|role not specified)(.+)?", "", regex=True)
    )

    df = df[df.officer_role != ""]
    return df


def split_name(name):
    """Split a raw officer name into ``[first_name, last_name]``.

    Processing order:
      1. Missing or blank input yields ``['Unknown', 'Unknown']``.
      2. A leading title (e.g. "Detective", "Chief of Trials") is removed.
      3. A trailing badge/unit marker, filler phrase or parenthesised note is
         removed.
      4. A space is inserted wherever a lowercase letter is directly followed
         by an uppercase one ("JohnSmith" -> "John Smith").
      5. A single stray capital glued to the front of a capitalised word is
         dropped ("JSmith" -> "Smith").
      6. The first whitespace-separated token is the first name; everything
         after it is the last name.

    Parameters
    ----------
    name : str or missing value
        Raw name text.

    Returns
    -------
    list of str
        ``[first, last]``. A single remaining token is returned as the last
        name with ``'Unknown'`` as the first name; nothing remaining yields
        ``['Unknown', 'Unknown']``.
    """
    if pd.isnull(name) or name.strip() == '':
        return ['Unknown', 'Unknown']

    # Longer titles must come before their own prefixes ("Chief of Trials"
    # before "Chief", "Coroner's Investigator" before "Coroner's"); regex
    # alternation takes the first alternative that matches, so the shorter
    # one would otherwise win and leave "of Trials ..." in the name.
    # NOTE(review): `\s*` allows zero whitespace after the title, so a name
    # that merely starts with one of these words (e.g. "Cap...") also loses
    # those letters -- confirm this is intended for glued-together input.
    name = re.sub(r'^(Reporting Officer|Detective|Det\.|P\/O|Officer|Police Officer|Chief of Trials|Chief|Police|Coroner\'s Investigator|Coroner\'s|Unit #\d+Of|Driver|Cap|Superintendent|Not specified)\s*', '', name, flags=re.IGNORECASE)
    name = re.sub(r'\s*(Badge\s*\d+|Unit\s*#\d+|Type\s*Name|Mentioned\s*Again|Later|\(.*?\))\s*$', '', name, flags=re.IGNORECASE)

    # Re-insert the space lost between camel-cased words.
    name = re.sub(r'(?<=[a-z])([A-Z])', r' \1', name)

    # Drop one leading capital that is fused to a following capitalised word.
    name = re.sub(r'^([A-Z])([A-Z][a-z]+)', r'\2', name)

    parts = name.split()
    if not parts:
        return ['Unknown', 'Unknown']
    if len(parts) == 1:
        return ['Unknown', parts[0]]
    return [parts[0], ' '.join(parts[1:])]


def sanitize_names(df):
    """Normalise name columns, add 5-character name prefixes, and de-duplicate.

    ``first_name`` and ``last_name`` are lower-cased, trimmed and stripped of
    the literal text "unknown" (plus a leading "p.o." on first names). The
    ``fc`` / ``lc`` columns hold the first five characters of each name.
    These four columns are written onto the frame that was passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``first_name``, ``last_name``, ``officer_context`` and
        ``officer_role`` columns.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        * The frame with duplicate (first_name, last_name, officer_context,
          officer_role) rows removed and rows whose first and last names are
          both empty dropped.
        * ``"<first> <last>"`` strings for the de-duplicated rows. This series
          is built before the empty-name rows are dropped, so it can contain
          entries for rows that are absent from the returned frame.
    """
    df.loc[:, "first_name"] = (
        df.first_name
        .str.lower()
        .str.strip()
        .str.replace(r"unknown", "", regex=True)
        .str.replace(r"^p\.o\.", "", regex=True)
    )
    df.loc[:, "last_name"] = (
        df.last_name
        .str.lower()
        .str.strip()
        .str.replace(r"unknown", "", regex=True)
        # Removing "unknown" can expose a single leading/trailing space.
        .str.replace(r"^ (\w+)", r"\1", regex=True)
        .str.replace(r"(\w+) $", r"\1", regex=True)
    )

    # .str slicing leaves missing names as missing instead of raising the way
    # a plain Python slice on NaN would.
    df.loc[:, "fc"] = df.first_name.str[:5]
    df.loc[:, "lc"] = df.last_name.str[:5]

    df = df.drop_duplicates(subset=["first_name", "last_name", "officer_context", "officer_role"])

    full_names = df.first_name.str.cat(df.last_name, sep=" ")
    df = df[~((df.first_name == "") & (df.last_name == ""))]
    return df, full_names


def generate_uid(row, desired_length=10):
    """Derive a reproducible numeric identifier for a person row.

    The identifier is taken from the SHA-256 digest of the row's
    ``first_name``, ``last_name``, ``officer_role`` and ``officer_context``
    values. Unlike the built-in ``hash()``, whose string hashes are salted
    per interpreter process, the digest is identical on every run, so the
    same person always receives the same id.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the four fields listed above.
    desired_length : int, optional
        Maximum number of decimal digits in the result (default 10).

    Returns
    -------
    int
        The leading ``desired_length`` decimal digits of the digest.

    Raises
    ------
    ValueError
        If ``desired_length`` is 0 (no digits left to convert).
    """
    fields = ("first_name", "last_name", "officer_role", "officer_context")
    # Join with a delimiter so ("ab", "c") and ("a", "bc") hash differently.
    key = "|".join(str(row[field]) for field in fields)

    digest = hashlib.sha256(key.encode("utf-8")).hexdigest()
    digits = str(int(digest, 16))

    # Zero-padding a short value would not change the integer, so truncation
    # is the only adjustment needed.
    return int(digits[:desired_length])


In [14]:

# Load the merged input, then run the row-level cleaning steps in order.
# clean_column_names comes from the project-local `lib` module (presumably
# normalises column labels -- confirm against lib).
df = (
    read_csv()
    .pipe(clean_column_names)
    .pipe(split_rows_with_multiple_officers)
    .pipe(filter_by_role)
    .pipe(sanitize_officer_context)
)

# Expand each officer_name into separate first/last name columns.
df[["first_name", "last_name"]] = df["officer_name"].apply(
    lambda raw_name: pd.Series(split_name(raw_name))
)

# Normalise the names and drop duplicate / empty-name rows.
df, full_names = sanitize_names(df)

# One 10-digit identifier per remaining row.
df["person_uid"] = df.apply(generate_uid, axis=1, args=(10,))

df.columns

Index(['officer_context', 'officer_role', 'page_number', 'fn', 'query',
       'prompt_template_for_hyde', 'prompt_template_for_model', 'chunk_size',
       'chunk_overlap', 'temperature', 'k', 'hyde', 'iteration',
       'num_of_queries', 'model', 'uid', 'officer_name', 'first_name',
       'last_name', 'fc', 'lc', 'person_uid'],
      dtype='object')

In [15]:
# df.to_csv("../data/output/clean.csv", index=False) 