In [27]:
from datamatch import (
    ThresholdMatcher,
    ColumnsIndex,
    JaroWinklerSimilarity,
    MaxScorer,
    SimSumScorer,
    AlterScorer,
    MultiIndex,
)
from datavalid.spinner import Spinner
import pandas as pd
from lib import clean_column_names
import re

In [28]:
def read():
    df = pd.read_csv("../data/input/reports-transcripts.csv").pipe(clean_column_names)
    return df

In [29]:
df = read()
df

Unnamed: 0,officer_name,officer_context,officer_role,page_number,fn,query,prompt_template_for_hyde,prompt_template_for_model,chunk_size,chunk_overlap,temperature,k,hyde,iteration,num_of_queries,model,uid
0,Wayne Tamborella,"Mentioned multiple times in various contexts, ...",Lead Detective (Homicide Unit),"[21, 92, 31, 4, 96, 19, 90, 38, 38, 76, 90, 19...",Supplemental Police Report.json,"Identify each individual in the transcript, by...",input_variables=['question'] template='\n Y...,"input_variables=['docs', 'question'] template=...",500,250,1,20,1,1,1,gpt-3.5-turbo-1603-finetuned-300-labels,9cc6b328
1,CapJohn Hughes,Listed as the Night Supervisor at the scene of...,Supervising Officer,"[21, 92, 31, 4, 96, 19, 90, 38, 38, 76, 90, 19...",Supplemental Police Report.json,"Identify each individual in the transcript, by...",input_variables=['question'] template='\n Y...,"input_variables=['docs', 'question'] template=...",500,250,1,20,1,1,1,gpt-3.5-turbo-1603-finetuned-300-labels,9cc6b328
2,MichaeChimano,Mentioned as being part of the Third Police Di...,Sergeant,"[21, 92, 31, 4, 96, 19, 90, 38, 38, 76, 90, 19...",Supplemental Police Report.json,"Identify each individual in the transcript, by...",input_variables=['question'] template='\n Y...,"input_variables=['docs', 'question'] template=...",500,250,1,20,1,1,1,gpt-3.5-turbo-1603-finetuned-300-labels,9cc6b328
3,P/O Shawn Dupre,Listed as being part of the 301 unit in the Th...,Police Officer,"[21, 92, 31, 4, 96, 19, 90, 38, 38, 76, 90, 19...",Supplemental Police Report.json,"Identify each individual in the transcript, by...",input_variables=['question'] template='\n Y...,"input_variables=['docs', 'question'] template=...",500,250,1,20,1,1,1,gpt-3.5-turbo-1603-finetuned-300-labels,9cc6b328
4,P/O DarryCoulon and DarryGordon,Listed together as part of the 304 unit in the...,Police Officers,"[21, 92, 31, 4, 96, 19, 90, 38, 38, 76, 90, 19...",Supplemental Police Report.json,"Identify each individual in the transcript, by...",input_variables=['question'] template='\n Y...,"input_variables=['docs', 'question'] template=...",500,250,1,20,1,1,1,gpt-3.5-turbo-1603-finetuned-300-labels,9cc6b328
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12215,Travis Hayes,Referred to as the defendant in a postconvicti...,Defendant,"[1, 2, 1, 1, 2, 1, 2, 2, 2, 1]","Exhibit 1b Hayes, Travis excerpt of transcript...","Identify each individual in the transcript, by...",,,500,250,1,20,1,3,1,gpt-3.5-turbo-1603-finetuned-300-labels,9c161082
12216,MWolff,"Speaks on behalf of the state, objecting to th...",State Representative,"[1, 2, 1, 1, 2, 1, 2, 2, 2, 1]","Exhibit 1b Hayes, Travis excerpt of transcript...","Identify each individual in the transcript, by...",,,500,250,1,20,1,3,1,gpt-3.5-turbo-1603-finetuned-300-labels,9c161082
12217,Ryan Matthews,The case of Ryan Matthews is mentioned in rela...,Involved in a related case,"[1, 2, 1, 1, 2, 1, 2, 2, 2, 1]","Exhibit 1b Hayes, Travis excerpt of transcript...","Identify each individual in the transcript, by...",,,500,250,1,20,1,3,1,gpt-3.5-turbo-1603-finetuned-300-labels,9c161082
12218,,There are no specific individuals identified a...,,"[2, 1, 2, 1, 2, 2, 2, 1, 1, 1]","Exhibit 1b Hayes, Travis excerpt of transcript...","Identify each individual in the transcript, by...",,,500,250,1,20,1,5,1,gpt-3.5-turbo-1603-finetuned-300-labels,9c161082


In [30]:
def split_rows_with_multiple_officers(df):
    df = (
        df.drop("officer_name", axis=1)
        .join(
            df["officer_name"]
            .str.split("and", expand=True)
            .stack()
            .reset_index(level=1, drop=True)
            .rename("officer_name"),
            how="outer",
        )
        .reset_index(drop=True)
    )
    return df


def filter_by_role(df):
    df.loc[:, "officer_role"] = df.officer_role.str.lower().str.strip()

    officer_roles = ['officer', 'ofc.', 'lieutenant', 'lt.', 'detective', 'det.', 
                    'sergeant', 'sgt.', 'captain', 'cpt.', 'chief', 'inspector', 
                    'deputy', 'marshal', 'corporal', 'cpl.', 'commander', 'cmdr.', 
                    'agent', 'patrolman', 'trooper', 'sheriff', "coroner", "dr.", "doctor",
                    "p/o", "police officer"]

    regex_pattern = '|'.join(officer_roles)
    regex_pattern = r'\b(?:' + regex_pattern + r')\b'
    regex_pattern = re.compile(regex_pattern, re.IGNORECASE)

    df = df[df['officer_role'].fillna("").str.contains(regex_pattern)]

    df.loc[:, "officer_role"] = (df
                                .officer_role.str.replace(r"^n/a(.+)", "", regex=True)
                                .str.replace(r"(.+)?presumed(.+)", "", regex=True)
                                .str.replace(r"(.+)?(not explicitly mentioned as a law enforcement officer|not a law enforcement officer)(.+)?", "", regex=True)
                                .str.replace(r"(.+)?(suspect|civilian|unclear from the context|role not specified)(.+)?", "", regex=True)
    )

    df = df[~((df.officer_role == ""))]
    return df 


def split_name(name):
    if pd.isnull(name) or name.strip() == '':
        return ['Unknown', 'Unknown']

    name = re.sub(r'^(Reporting Officer|Detective|P\/O|Officer|Police Officer|Chief|Chief of Trials|Police|Coroner\'s|Coroner\'s Investigator|Unit #\d+Of|Driver|Cap|Superintendent|Not specified)\s*', '', name, flags=re.IGNORECASE)
    name = re.sub(r'\s*(Badge\s*\d+|Unit\s*#\d+|Type\s*Name|Mentioned\s*Again|Later|\(.*?\))\s*$', '', name, flags=re.IGNORECASE)

    name = re.sub(r'(?<=[a-z])([A-Z])', r' \1', name)

    name = re.sub(r'^([A-Z])([A-Z][a-z]+)', r'\2', name)

    # Split the name
    parts = name.split()
    if len(parts) == 1:
        return ['Unknown', parts[0]]
    elif len(parts) >= 2:
        return [parts[0], ' '.join(parts[1:])]
    else:
        return ['Unknown', 'Unknown']


def sanitize_names(df):
    df.loc[:, "first_name"] = df.first_name.str.lower().str.strip().str.replace(r"unknown", "", regex=True)
    df.loc[:, "last_name"] = df.last_name.str.lower().str.strip()
    df.loc[:, "fc"] = df.first_name.map(lambda x: x[:5])
    df.loc[:, "lc"] = df.last_name.map(lambda x: x[:5])
    df = df.drop_duplicates(subset=["first_name", "last_name", "officer_context", "officer_role"])

    full_names = df.first_name.str.cat(df.last_name, sep=" ")
    return df, full_names

In [31]:
df = df.pipe(split_rows_with_multiple_officers).pipe(filter_by_role)
 
df[['first_name', 'last_name']] = df['officer_name'].apply(lambda x: pd.Series(split_name(x)))

df, full_names = df.pipe(sanitize_names)

df = df[["first_name", "last_name", "officer_role", "officer_context"]]

df

Unnamed: 0,first_name,last_name,officer_role,officer_context
0,wayne,tamborella,lead detective (homicide unit),"Mentioned multiple times in various contexts, ..."
1,john,hughes,supervising officer,Listed as the Night Supervisor at the scene of...
2,michae,chimano,sergeant,Mentioned as being part of the Third Police Di...
3,shawn,dupre,police officer,Listed as being part of the 301 unit in the Th...
6,audie,jackson,police officer,Handling the district report at the scene of a...
...,...,...,...,...
6342,gwendolyn,norwood,lieutenant,Mentioned as the immediate supervisor of the i...
6343,gary,dupard,detective,Mentioned as someone who assisted the lead inv...
6344,la,fleur young,detective,Mentioned as the detective the lieutenant swit...
6345,,rice,sergeant,Mentioned as the person who was over the case ...


In [32]:
def jarowink_matcher(personnel):
    excel_path = "../data/output/jarowink_matches_tbl_full.xlsx"
    per = personnel
    per
    matcher = ThresholdMatcher(
        index=MultiIndex(
            [
                ColumnsIndex(["fc", "lc"]),
            ]
        ),
        scorer=MaxScorer(
            [
                AlterScorer(
                    scorer=SimSumScorer(
                        {
                            "first_name": JaroWinklerSimilarity(),
                            "last_name": JaroWinklerSimilarity(),
                        }
                    ),
                    # but for pairs that have the same name and their name is common
                    values=full_names,
                    # give a penalty of -.2 which is enough to eliminate them
                    alter=lambda score: score - 0.2,
                ),
            ]
        ),
        dfa=per,
    )
    decision = .5
    with Spinner("saving matched clusters to Excel file"):
        matcher.save_clusters_to_excel(excel_path, decision, lower_bound=decision)
    clusters = matcher.get_index_clusters_within_thresholds(decision)
    print("saved %d clusters to %s" % (len(clusters), excel_path))

    return clusters


In [33]:
jarowink_matcher(df)