In [161]:
from datamatch import (
    ThresholdMatcher,
    DissimilarFilter,
    NonOverlappingFilter,
    ColumnsIndex,
    JaroWinklerSimilarity,
    MaxScorer,
    SimSumScorer,
    AlterScorer,
    MultiIndex,
    AbsoluteScorer,
)
from datavalid.spinner import Spinner
import pandas as pd
from lib import clean_column_names
import re

### rules
# blocking features: first name, last name; rank description; officer context

In [162]:
def read():
    df = pd.read_csv("../data/input/reports-transcripts.csv").pipe(clean_column_names)
    return df

In [163]:
df = read()
df


Unnamed: 0,officer_name,officer_context,officer_role,page_number,fn,query,prompt_template_for_hyde,prompt_template_for_model,chunk_size,chunk_overlap,temperature,k,hyde,iteration,num_of_queries,model,uid
0,Wayne Tamborella,"Mentioned multiple times in various contexts, ...",Lead Detective (Homicide Unit),"[21, 92, 31, 4, 96, 19, 90, 38, 38, 76, 90, 19...",Supplemental Police Report.json,"Identify each individual in the transcript, by...",input_variables=['question'] template='\n Y...,"input_variables=['docs', 'question'] template=...",500,250,1,20,1,1,1,gpt-3.5-turbo-1603-finetuned-300-labels,9cc6b328
1,CapJohn Hughes,Listed as the Night Supervisor at the scene of...,Supervising Officer,"[21, 92, 31, 4, 96, 19, 90, 38, 38, 76, 90, 19...",Supplemental Police Report.json,"Identify each individual in the transcript, by...",input_variables=['question'] template='\n Y...,"input_variables=['docs', 'question'] template=...",500,250,1,20,1,1,1,gpt-3.5-turbo-1603-finetuned-300-labels,9cc6b328
2,MichaeChimano,Mentioned as being part of the Third Police Di...,Sergeant,"[21, 92, 31, 4, 96, 19, 90, 38, 38, 76, 90, 19...",Supplemental Police Report.json,"Identify each individual in the transcript, by...",input_variables=['question'] template='\n Y...,"input_variables=['docs', 'question'] template=...",500,250,1,20,1,1,1,gpt-3.5-turbo-1603-finetuned-300-labels,9cc6b328
3,P/O Shawn Dupre,Listed as being part of the 301 unit in the Th...,Police Officer,"[21, 92, 31, 4, 96, 19, 90, 38, 38, 76, 90, 19...",Supplemental Police Report.json,"Identify each individual in the transcript, by...",input_variables=['question'] template='\n Y...,"input_variables=['docs', 'question'] template=...",500,250,1,20,1,1,1,gpt-3.5-turbo-1603-finetuned-300-labels,9cc6b328
4,P/O DarryCoulon and DarryGordon,Listed together as part of the 304 unit in the...,Police Officers,"[21, 92, 31, 4, 96, 19, 90, 38, 38, 76, 90, 19...",Supplemental Police Report.json,"Identify each individual in the transcript, by...",input_variables=['question'] template='\n Y...,"input_variables=['docs', 'question'] template=...",500,250,1,20,1,1,1,gpt-3.5-turbo-1603-finetuned-300-labels,9cc6b328
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12215,Travis Hayes,Referred to as the defendant in a postconvicti...,Defendant,"[1, 2, 1, 1, 2, 1, 2, 2, 2, 1]","Exhibit 1b Hayes, Travis excerpt of transcript...","Identify each individual in the transcript, by...",,,500,250,1,20,1,3,1,gpt-3.5-turbo-1603-finetuned-300-labels,9c161082
12216,MWolff,"Speaks on behalf of the state, objecting to th...",State Representative,"[1, 2, 1, 1, 2, 1, 2, 2, 2, 1]","Exhibit 1b Hayes, Travis excerpt of transcript...","Identify each individual in the transcript, by...",,,500,250,1,20,1,3,1,gpt-3.5-turbo-1603-finetuned-300-labels,9c161082
12217,Ryan Matthews,The case of Ryan Matthews is mentioned in rela...,Involved in a related case,"[1, 2, 1, 1, 2, 1, 2, 2, 2, 1]","Exhibit 1b Hayes, Travis excerpt of transcript...","Identify each individual in the transcript, by...",,,500,250,1,20,1,3,1,gpt-3.5-turbo-1603-finetuned-300-labels,9c161082
12218,,There are no specific individuals identified a...,,"[2, 1, 2, 1, 2, 2, 2, 1, 1, 1]","Exhibit 1b Hayes, Travis excerpt of transcript...","Identify each individual in the transcript, by...",,,500,250,1,20,1,5,1,gpt-3.5-turbo-1603-finetuned-300-labels,9c161082


In [164]:

def split_rows_with_multiple_officers(df):
    df = (
        df.drop("officer_name", axis=1)
        .join(
            df["officer_name"]
            .str.split("and", expand=True)
            .stack()
            .reset_index(level=1, drop=True)
            .rename("officer_name"),
            how="outer",
        )
        .reset_index(drop=True)
    )
    return df

df = df.pipe(split_rows_with_multiple_officers)

df

Unnamed: 0,officer_context,officer_role,page_number,fn,query,prompt_template_for_hyde,prompt_template_for_model,chunk_size,chunk_overlap,temperature,k,hyde,iteration,num_of_queries,model,uid,officer_name
0,"Mentioned multiple times in various contexts, ...",Lead Detective (Homicide Unit),"[21, 92, 31, 4, 96, 19, 90, 38, 38, 76, 90, 19...",Supplemental Police Report.json,"Identify each individual in the transcript, by...",input_variables=['question'] template='\n Y...,"input_variables=['docs', 'question'] template=...",500,250,1,20,1,1,1,gpt-3.5-turbo-1603-finetuned-300-labels,9cc6b328,Wayne Tamborella
1,Listed as the Night Supervisor at the scene of...,Supervising Officer,"[21, 92, 31, 4, 96, 19, 90, 38, 38, 76, 90, 19...",Supplemental Police Report.json,"Identify each individual in the transcript, by...",input_variables=['question'] template='\n Y...,"input_variables=['docs', 'question'] template=...",500,250,1,20,1,1,1,gpt-3.5-turbo-1603-finetuned-300-labels,9cc6b328,CapJohn Hughes
2,Mentioned as being part of the Third Police Di...,Sergeant,"[21, 92, 31, 4, 96, 19, 90, 38, 38, 76, 90, 19...",Supplemental Police Report.json,"Identify each individual in the transcript, by...",input_variables=['question'] template='\n Y...,"input_variables=['docs', 'question'] template=...",500,250,1,20,1,1,1,gpt-3.5-turbo-1603-finetuned-300-labels,9cc6b328,MichaeChimano
3,Listed as being part of the 301 unit in the Th...,Police Officer,"[21, 92, 31, 4, 96, 19, 90, 38, 38, 76, 90, 19...",Supplemental Police Report.json,"Identify each individual in the transcript, by...",input_variables=['question'] template='\n Y...,"input_variables=['docs', 'question'] template=...",500,250,1,20,1,1,1,gpt-3.5-turbo-1603-finetuned-300-labels,9cc6b328,P/O Shawn Dupre
4,Listed together as part of the 304 unit in the...,Police Officers,"[21, 92, 31, 4, 96, 19, 90, 38, 38, 76, 90, 19...",Supplemental Police Report.json,"Identify each individual in the transcript, by...",input_variables=['question'] template='\n Y...,"input_variables=['docs', 'question'] template=...",500,250,1,20,1,1,1,gpt-3.5-turbo-1603-finetuned-300-labels,9cc6b328,P/O DarryCoulon
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12801,Referred to as the defendant in a postconvicti...,Defendant,"[1, 2, 1, 1, 2, 1, 2, 2, 2, 1]","Exhibit 1b Hayes, Travis excerpt of transcript...","Identify each individual in the transcript, by...",,,500,250,1,20,1,3,1,gpt-3.5-turbo-1603-finetuned-300-labels,9c161082,Travis Hayes
12802,"Speaks on behalf of the state, objecting to th...",State Representative,"[1, 2, 1, 1, 2, 1, 2, 2, 2, 1]","Exhibit 1b Hayes, Travis excerpt of transcript...","Identify each individual in the transcript, by...",,,500,250,1,20,1,3,1,gpt-3.5-turbo-1603-finetuned-300-labels,9c161082,MWolff
12803,The case of Ryan Matthews is mentioned in rela...,Involved in a related case,"[1, 2, 1, 1, 2, 1, 2, 2, 2, 1]","Exhibit 1b Hayes, Travis excerpt of transcript...","Identify each individual in the transcript, by...",,,500,250,1,20,1,3,1,gpt-3.5-turbo-1603-finetuned-300-labels,9c161082,Ryan Matthews
12804,There are no specific individuals identified a...,,"[2, 1, 2, 1, 2, 2, 2, 1, 1, 1]","Exhibit 1b Hayes, Travis excerpt of transcript...","Identify each individual in the transcript, by...",,,500,250,1,20,1,5,1,gpt-3.5-turbo-1603-finetuned-300-labels,9c161082,


In [165]:
# Drop rows where the officer's role does not correspond to law enforcement 

df.loc[:, "officer_role"] = df.officer_role.str.lower().str.strip()

officer_roles = ['officer', 'ofc.', 'lieutenant', 'lt.', 'detective', 'det.', 
                 'sergeant', 'sgt.', 'captain', 'cpt.', 'chief', 'inspector', 
                 'deputy', 'marshal', 'corporal', 'cpl.', 'commander', 'cmdr.', 
                 'agent', 'patrolman', 'trooper', 'sheriff', "coroner", "dr.", "doctor",
                 "p/o", "police officer"]

regex_pattern = '|'.join(officer_roles)
regex_pattern = r'\b(?:' + regex_pattern + r')\b'
regex_pattern = re.compile(regex_pattern, re.IGNORECASE)

df = df[df['officer_role'].fillna("").str.contains(regex_pattern)]

df.loc[:, "officer_role"] = (df
                             .officer_role.str.replace(r"^n/a(.+)", "", regex=True)
                             .str.replace(r"(.+)?presumed(.+)", "", regex=True)
                             .str.replace(r"(.+)?(not explicitly mentioned as a law enforcement officer|not a law enforcement officer)(.+)?", "", regex=True)
                             .str.replace(r"(.+)?(suspect|civilian|unclear from the context|role not specified)(.+)?", "", regex=True)
)

df = df[~((df.officer_role == ""))]

df.shape

df.officer_role.unique()

array(['lead detective (homicide unit)', 'supervising officer',
       'sergeant', 'police officer', 'detective', 'captain',
       'reporting officer', 'patrol officer', 'deputy', 'corporal',
       "coroner's office investigator", 'lead detective',
       'homicide detective', 'investigating detective',
       'transporting officer', 'arresting officer', 'warrant officer',
       'officer', 'investigating officer',
       'officer attending magistrate court', 'first arresting officer',
       'second arresting officer', 'crime scene technician sergeant',
       'responding officer', 'assisting officer', 'assistant coroner',
       'lead detective (homicide division)',
       'detective, assigned to the m. c. i. unit of the seventh district',
       'detective, assigned to the m.c.i. unit of the seventh district',
       'coroner', "coroner's office staff",
       'officer name: p.o. raymond loosemore \nofficer context: cleared several burglaries in the area through the identification

In [166]:
def split_name(name):
    if pd.isnull(name) or name.strip() == '':
        return ['Unknown', 'Unknown']

    # Remove common prefixes, suffixes, and job titles with case-insensitive matching
    name = re.sub(r'^(Reporting Officer|Detective|P\/O|Officer|Police Officer|Chief|Chief of Trials|Police|Coroner\'s|Coroner\'s Investigator|Unit #\d+Of|Driver|Cap|Superintendent|Not specified)\s*', '', name, flags=re.IGNORECASE)
    name = re.sub(r'\s*(Badge\s*\d+|Unit\s*#\d+|Type\s*Name|Mentioned\s*Again|Later|\(.*?\))\s*$', '', name, flags=re.IGNORECASE)

    # Adjust regex to handle concatenated names better, including those with no clear separator
    # This pattern looks for a lowercase letter followed by an uppercase letter and inserts a space
    name = re.sub(r'(?<=[a-z])([A-Z])', r' \1', name)

    # Handle names starting with two capital letters, removing the first if followed by lowercase letters
    name = re.sub(r'^([A-Z])([A-Z][a-z]+)', r'\2', name)

    # Split the name
    parts = name.split()
    if len(parts) == 1:
        return ['Unknown', parts[0]]
    elif len(parts) >= 2:
        return [parts[0], ' '.join(parts[1:])]
    else:
        return ['Unknown', 'Unknown']


df[['first_name', 'last_name']] = df['officer_name'].apply(lambda x: pd.Series(split_name(x)))

df.loc[:, "first_name"] = df.first_name.str.lower().str.strip()
df.loc[:, "last_name"] = df.last_name.str.lower().str.strip()

df.loc[:, "fc"] = df.first_name.map(lambda x: x[:5])
df.loc[:, "lc"] = df.last_name.map(lambda x: x[:5])

full_names = df.first_name.str.cat(df.last_name, sep=" ")


df = df[["first_name", "last_name", "fc", "lc", "officer_role", "officer_context"]]

df

Unnamed: 0,first_name,last_name,fc,lc,officer_role,officer_context
0,wayne,tamborella,wayne,tambo,lead detective (homicide unit),"Mentioned multiple times in various contexts, ..."
1,john,hughes,john,hughe,supervising officer,Listed as the Night Supervisor at the scene of...
2,michae,chimano,micha,chima,sergeant,Mentioned as being part of the Third Police Di...
3,shawn,dupre,shawn,dupre,police officer,Listed as being part of the 301 unit in the Th...
6,audie,jackson,audie,jacks,police officer,Handling the district report at the scene of a...
...,...,...,...,...,...,...
12745,gwendolyn,norwood,gwend,norwo,lieutenant,Mentioned as the immediate supervisor of the i...
12746,gary,dupard,gary,dupar,detective,Mentioned as someone who assisted the lead inv...
12747,la,fleur young,la,fleur,detective,Mentioned as the detective the lieutenant swit...
12748,unknown,rice,unkno,rice,sergeant,Mentioned as the person who was over the case ...


In [167]:
df.loc[:, "first_name"] = df.first_name.str.lower().str.strip()
df.loc[:, "last_name"] = df.last_name.str.lower().str.strip()

df.loc[:, "fc"] = df.first_name.map(lambda x: x[:5])
df.loc[:, "lc"] = df.last_name.map(lambda x: x[:5])

full_names = df.first_name.str.cat(df.last_name, sep=" ")


df = df[["first_name", "last_name", "fc", "lc", "officer_role", "officer_context"]]

def jarowink_matcher(personnel):
    excel_path = "../data/output/jarowink_matches.xlsx"
    per = personnel
    per
    matcher = ThresholdMatcher(
        index=MultiIndex(
            [
                ColumnsIndex(["fc", "lc"]),
            ]
        ),
        scorer=MaxScorer(
            [
                AlterScorer(
                    scorer=SimSumScorer(
                        {
                            "first_name": JaroWinklerSimilarity(),
                            "last_name": JaroWinklerSimilarity(),
                        }
                    ),
                    # but for pairs that have the same name and their name is common
                    values=full_names,
                    # give a penalty of -.2 which is enough to eliminate them
                    alter=lambda score: score - 0.2,
                ),
            ]
        ),
        dfa=per,
    )
    decision = .5
    with Spinner("saving matched clusters to Excel file"):
        matcher.save_clusters_to_excel(excel_path, decision, lower_bound=decision)
    clusters = matcher.get_index_clusters_within_thresholds(decision)
    print("saved %d clusters to %s" % (len(clusters), excel_path))

    return clusters


In [168]:
jarowink_matcher(df)

  saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving mat

KeyboardInterrupt: 