In [40]:
import pandas as pd
import re
from datetime import datetime
import numpy as np

In [41]:
def clean_agency(df):
    """Expand the abbreviation "P.D." to "Police Department" in ``agency_name``.

    The ``agency_name`` column is overwritten on the frame that was passed in,
    and that same frame is returned so the function can be used with
    ``DataFrame.pipe``.
    """
    # Dots are escaped so only the literal text "P.D." is replaced.
    expanded = df["agency_name"].str.replace(r"P\.D\.", "Police Department", regex=True)
    df.loc[:, "agency_name"] = expanded
    return df

def convert_date(date_value):
    """Convert an ``MM/DD/YY`` date into an ISO ``YYYY-MM-DD`` string.

    Parameters
    ----------
    date_value : str or NaN-like
        A single cell value. Missing values (anything ``pd.isna`` accepts)
        are passed through as ``np.nan``.

    Returns
    -------
    str or float
        The ISO-formatted date, or ``np.nan`` when the value is missing or
        cannot be parsed (a warning is printed in the latter case).

    Notes
    -----
    ``%y`` maps two-digit years 00-68 to 2000-2068, so a date such as
    ``06/15/65`` would be parsed as 2065. Any parsed date that lies in the
    future is therefore moved back one century.
    NOTE(review): this assumes the data never contains genuine future dates -
    confirm against the source extract.
    """
    if pd.isna(date_value):
        return np.nan

    try:
        # Surrounding whitespace would otherwise make strptime reject the value.
        date_obj = datetime.strptime(str(date_value).strip(), '%m/%d/%y')
        if date_obj > datetime.now():
            # Two-digit year landed on the wrong side of the %y pivot.
            date_obj = date_obj.replace(year=date_obj.year - 100)
        return date_obj.strftime('%Y-%m-%d')
    except ValueError:
        print(f"Warning: Could not convert date '{date_value}'")
        return np.nan


def clean_dates(df):
    """Run ``convert_date`` over the ``start_date`` and ``end_date`` columns.

    Both columns are replaced on the frame that was passed in; the same frame
    is returned so the call can be chained with ``DataFrame.pipe``.
    """
    for column in ("start_date", "end_date"):
        df[column] = df[column].apply(convert_date)
    return df

def split_names(df):
    """Split ``full_name`` ("Last, First Middle Suffix") into separate columns.

    Adds four columns to ``df`` (in place) and returns the same frame:

    * ``last_name``   - everything before the first comma, so compound
      surnames such as "O'Brien", "Smith-Jones" or "Van Horn" stay intact.
      NaN when the value does not contain a "Last, First" pattern.
    * ``first_name``  - the first token after the comma (may contain
      hyphens, apostrophes or a trailing period). NaN when unmatched.
    * ``middle_name`` - whatever follows the first name once a suffix has
      been removed; empty string when absent.
    * ``suffix``      - "Jr", "Sr" (with optional trailing period), "II",
      "III" or "IV", as written in the source; empty string when absent.
      A suffix attached to the surname ("Davis Jr, Sam") is moved here too.

    NOTE(review): suffix matching is case-sensitive ("JR" is not recognised);
    confirm the casing used in the source data.
    """
    # Anchored at the start so the surname is the whole text before the
    # first comma rather than just the last run of word characters.
    name_pattern = r"^\s*([^,]+?)\s*,\s*([^\s,]+)\s*(.+)?"

    suffix_tokens = r"(?:Jr|Sr)\.?|I{2,3}|IV"
    # (?!\w) instead of a trailing \b: \b can never match after the period
    # of "Jr.", which used to leave a stray "." behind in the middle name.
    suffix_anywhere = re.compile(r"\b(?:" + suffix_tokens + r")(?!\w)")
    suffix_at_end = re.compile(r"^(.*\S)\s+(" + suffix_tokens + r")$")

    def tidy(text):
        """Trim whitespace and separator commas left around a name part."""
        return text.strip().strip(",").strip()

    def split_row(last, rest):
        """Return ``(last_name, middle_name, suffix)`` for one record."""
        middle, suffix = "", ""
        if not pd.isna(rest):
            found = suffix_anywhere.search(rest)
            if found:
                suffix = found.group()
                rest = suffix_anywhere.sub("", rest)
            middle = tidy(rest)
        # Fall back to a suffix written before the comma ("Davis Jr, Sam").
        if not suffix and not pd.isna(last):
            tail = suffix_at_end.match(last)
            if tail:
                last, suffix = tail.group(1), tail.group(2)
        return last, middle, suffix

    names = df["full_name"].str.extract(name_pattern)

    # Plain lists are built instead of using ``Series.apply`` with a
    # Series-returning function: that form yields an empty Series (not a
    # frame) for empty input, so the column lookups afterwards would fail.
    rows = [split_row(last, rest) for last, rest in zip(names[0], names[2])]

    df["last_name"] = pd.Series([row[0] for row in rows], index=df.index, dtype=object)
    df["first_name"] = names[1]
    df["middle_name"] = [row[1] for row in rows]
    df["suffix"] = [row[2] for row in rows]

    return df

def read_tbl(path="../../../../data/KY/2022-10-13/ky-employment-history.csv"):
    """Load the Kentucky employment-history CSV and normalise its column names.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to the 2022-10-13 extract this
        notebook was written against, so existing ``read_tbl()`` calls behave
        as before.

    Returns
    -------
    pandas.DataFrame
        The file contents with the source headers renamed to snake_case
        names. Headers not listed in the mapping are left untouched.
    """
    column_names = {
        "FULL_NAME": "full_name",
        "gender": "sex",
        "Certification": "certification",
        "Certification Status": "certification_status",
        # NOTE(review): "Acadmey" presumably mirrors a misspelled header in
        # the source file - verify before correcting it here.
        "Acadmey Id": "academy_id",
        "Pops Certification Number": "person_nbr",
        "YearOfBirth": "year_of_birth",
        "Employing Agency": "agency_name",
        "Employment Title": "rank",
        "Employment Hire Date": "start_date",
        "Employment End Date": "end_date",
    }
    return pd.read_csv(path).rename(columns=column_names)


# Load the raw extract, run each cleaning stage in order, then write the
# cleaned index to disk.
df = (
    read_tbl()
    .pipe(clean_agency)
    .pipe(clean_dates)
    .pipe(split_names)
)
df.to_csv("../data/output/kentucky_index.csv", index=False)

In [42]:
# Spot-check the cleaned agency names (the "P.D." expansion in particular).
df["agency_name"].unique()

array(['Paducah Police Department', 'Fayette Co. Sheriff',
       'Bardstown Police Department', 'Lebanon Police Department',
       'Springfield Police Department', 'Audubon Park Police Department',
       'Louisville Metro Police Department', 'Marion Co. Sheriff',
       'Cumberland Co. Sheriff', 'Burkesville Police Department',
       'Commercial Vehicle Enforcement', 'Lexington Police Department',
       'Harlan Co. Sheriff', 'KSP Academy', 'Erlanger Police Department',
       'KY Dept. of Fish & Wildlife', 'Franklin Co. Sheriff',
       'Frankfort Police Department',
       'Cincinnati/N. KY Airport Police Department', 'Laurel Co. Sheriff',
       'Oldham Co. Sheriff', 'Daviess Co. Sheriff',
       'KY Office of Charitable Gaming', 'Trigg Co. Sheriff',
       'Cadiz Police Department', 'Cold Spring Police Department',
       'Pendleton Co. Sheriff', 'Military Leave', 'Nelson Co. Sheriff',
       'Danville Police Department', 'Mayfield Police Department',
       'Magoffin Co. Sheri