In [13]:
import pandas as pd
import re
from lib import clean_column_names

In [14]:
def split_officer_name(name):
    """Split a raw officer name into first, middle, last and suffix parts.

    Layouts recognised, checked in this order:

    * ``"LAST, FIRST MIDDLE"`` -- any name that contains a comma.
    * ``"LAST FIRST MIDDLE"``  -- an all-uppercase name without a comma.
    * a single token           -- treated as a last name only.
    * ``"First Middle Last"``  -- everything else.

    A trailing generational suffix (JR, SR, II, III, IV, V), matched
    case-insensitively and optionally followed by ``.`` or ``,``, is removed
    before the rest of the name is parsed.

    NOTE: a trailing stand-alone "V" is always read as a suffix, so a final
    middle initial "V" (e.g. "SMITH, JOHN V") is reported as the suffix.

    Parameters
    ----------
    name : str or missing value
        Raw name. Anything that is not a ``str`` (None, NaN, numbers, ...)
        is treated as missing.

    Returns
    -------
    pandas.Series
        Keys ``first_name``, ``middle_name``, ``last_name`` and ``suffix``.
        Name parts are title-cased, the suffix is upper-case, and every
        component that could not be determined is ``None``.
    """
    # NaN, None and pd.NA are never str instances, so this single check
    # covers missing values as well as non-text input.
    if not isinstance(name, str):
        return pd.Series({'first_name': None, 'middle_name': None, 'last_name': None, 'suffix': None})

    name = name.strip().rstrip('.,')

    # A tuple keeps the check order deterministic (a set's iteration order
    # varies between interpreter runs).
    suffixes = ('III', 'II', 'IV', 'JR', 'SR', 'V')

    suffix = None
    upper_name = name.upper()
    for candidate in suffixes:
        # The suffix must be separated from the name by a space or a comma.
        if upper_name.endswith((' ' + candidate, ',' + candidate)):
            suffix = candidate
            name = name[:-(len(candidate) + 1)].strip().rstrip(',')
            break

    # Split on any run of commas/whitespace and drop empty tokens so stray
    # spacing such as "SMITH , JOHN" cannot shift the name parts.
    parts = [token for token in re.split(r'[,\s]+', name) if token]

    if not parts:
        return pd.Series({'first_name': None, 'middle_name': None, 'last_name': None, 'suffix': suffix})

    # Comma-separated and all-uppercase names are written last-name-first.
    # A lone token is handled here too, so it is reported once (as the last
    # name) instead of being copied into both first_name and last_name.
    if ',' in name or name.isupper() or len(parts) == 1:
        last_name = parts[0].title()
        first_name = parts[1].title() if len(parts) > 1 else None
        middle_name = ' '.join(parts[2:]).title() or None
    else:
        first_name = parts[0].title()
        last_name = parts[-1].title()
        middle_name = ' '.join(parts[1:-1]).title() or None

    return pd.Series({'first_name': first_name, 'middle_name': middle_name, 'last_name': last_name, 'suffix': suffix})

# Read the officer records produced by the upstream extraction step.
source_csv = "../../post-extract/data/output/all_officers_output.csv"
df = pd.read_csv(source_csv)

# Parse each raw "Officer Name" value into its components, then attach the
# resulting four columns to the loaded frame.
name_columns = ['first_name', 'middle_name', 'last_name', 'suffix']
name_parts = df['Officer Name'].apply(split_officer_name)
df[name_columns] = name_parts


def clean_matching_data(df):
    """Normalise the name and agency columns used for record matching.

    The ``first_name``, ``middle_name``, ``last_name`` and ``suffix`` columns
    are lower-cased and stripped of surrounding whitespace. ``agency_name``
    is lower-cased and stripped, runs of whitespace become a single hyphen,
    and dotted two-letter abbreviations such as ``p.d.`` collapse to ``pd``.

    The frame is modified in place and the same object is returned so the
    function can be used with ``DataFrame.pipe``.
    """
    for name_col in ("first_name", "middle_name", "last_name", "suffix"):
        df.loc[:, name_col] = df[name_col].str.lower().str.strip()

    agency = df["agency_name"].str.lower().str.strip()
    # Whitespace runs -> single hyphen.
    agency = agency.str.replace(r"\s+", "-", regex=True)
    # "x.y." -> "xy" (drops the dots of a two-letter abbreviation).
    agency = agency.str.replace(r"(\w{1})\.(\w{1})\.", r"\1\2", regex=True)
    df.loc[:, "agency_name"] = agency

    return df

def filter_for_orleans_officers(df):
    """Keep every row of each person who has at least one Orleans agency.

    Rows are grouped by ``person_id``; a group is retained in full when any
    of its ``agency_name`` values contains "orleans" (case-insensitive).
    Rows from other agencies are kept as long as they belong to such a person.
    """
    def _worked_in_orleans(person_rows):
        matches = person_rows['agency_name'].str.contains('orleans', case=False)
        return matches.any()

    return df.groupby('person_id').filter(_worked_in_orleans)


print(f"Number of rows before filtering: {len(df)}")

# Apply the stages in order: normalise column names, normalise the matching
# fields, then keep only people with at least one Orleans agency record.
df = filter_for_orleans_officers(
    clean_matching_data(
        clean_column_names(df)
    )
)
df

Number of rows before filtering: 671


Unnamed: 0,hash_id,person_id,officer_name,officer_sex,agency_name,status,hire_date,separation_date,reason_for_separation,filename,page_number,first_name,middle_name,last_name,suffix
15,f1f787032861e3c306e6c4593ee42cac,4715216ab590dde009e36d953a27ba20,"AGUILLARD, DYNASTY JARANEISHA",Female,new-orleans-pd,,11/10/2019,,,LA_Peace_Officer_Reports_7_12_24.pdf,2,dynasty,jaraneisha,aguillard,
16,fc11cc693d7bd1d68d0af761e84fdd53,67abadc2d1d03144a6a580777aeabdfa,"ALLEN, KIRK",Male,new-orleans-pd,FULL-TIME,8/8/1981,5/8/2016,,LA_Peace_Officer_Reports_7_12_24.pdf,3,kirk,,allen,
18,15e0a96b310deb691687a19e685bde47,da0100ca799212ad1943f0af0ded62fe,"ARROYO, STEVEN RICHARD",Male,new-orleans-pd,FULL-TIME,1/8/1989,12/19/2023,RETIRED,LA_Peace_Officer_Reports_7_12_24.pdf,5,steven,richard,arroyo,
19,92040c696328862497ca68c3dc1a530e,da0100ca799212ad1943f0af0ded62fe,"ARROYO, STEVEN RICHARD",Male,st-tammany-parish-so,RETIRED,,,,LA_Peace_Officer_Reports_7_12_24.pdf,5,steven,richard,arroyo,
20,fe4d56f45be7e7b5d281759898e8a23e,4002f7918e096b0b87198a354596258e,"BRADFORD, HENRY GERARD, SR.",Male,new-orleans-pd,RETIRED,,,,LA_Peace_Officer_Reports_7_12_24.pdf,6,henry,gerard,bradford,sr
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
660,f01b4cb7837e365c1169edd6b2dbe28b,057776af2a606666781c3b79ff840514,"CAHN, MIKE",Male,new-orleans-pd,,12/11/99,,,LA_Peace_Officer_Reports_7_12_24_pt.4.pdf,18,mike,,cahn,
663,11eaa142d0fc50188ad5155fe17fd9fc,cfd6710c4b7defe302d5e88eeeb44f66,"THOMAS, JOHN DERRICK",Male,new-orleans-pd,FULL-TIME,8/19/1991,12/11/2021,Other,LA_Peace_Officer_Reports_7_12_24_pt.4.pdf,22,john,derrick,thomas,
664,62229b9e62ef31a4a19578aa8da8235d,c588dd666c8b97378518928cb15242cd,"BLAIR, CRAIG",Male,new-orleans-pd,FULL-TIME,6/14/1981,,,LA_Peace_Officer_Reports_7_12_24_pt.4.pdf,24,craig,,blair,
665,aef22f8e11bdadaf7bc8178ae725f0ac,c588dd666c8b97378518928cb15242cd,"BLAIR, CRAIG",Male,new-orleans-pd,RETIRED,6/14/1981,10/22/2015,,LA_Peace_Officer_Reports_7_12_24_pt.4.pdf,24,craig,,blair,


In [15]:
# Write the cleaned, Orleans-filtered frame to disk; index=False leaves the
# DataFrame index out of the CSV.
df.to_csv("../data/output/post_clean.csv", index=False)