In [353]:
import pandas as pd
import re

In [354]:
def clean_person_nbr(df):
    df.loc[:, "person_nbr"] = df.person_nbr.str.lower().str.strip().str.replace(r"\s+", "", regex=True)
    return df 

def clean_case_id(df):
    df.loc[:, "case_id"] = df.case_id.str.lower().str.strip().str.replace(r"\s+", "", regex=True)
    return df 

In [355]:
def read_employment():
    df = pd.read_csv("../../data/GA/5-10-2024/officer_employment.csv")
    
    df = df.rename(columns={"NAME": "full_name", 
                            "START DATE": "start_date", 
                            "END DATE":"end_date", 
                            "OKEY": "person_nbr",
                            "AGENCY": "agcy_name",
                            "RANK": "ofc_rank",
                            "STATUS": "employment_status"
       
                           })
    return df

def split_names(df):
    # Convert to lowercase, strip whitespace, and ensure last name is separated by comma
    df.loc[:, "full_name"] = (df
                              .full_name
                              .str.lower()
                              .str.strip()
                              .str.replace(r"\.", "", regex=True)
                              .str.replace(r"^(\w+)\/ ?(\w+)", r"\1 \2", regex=True)
                              .str.replace(r"\((\w+)\/?(\w+)\) ", "", regex=True)
                              .str.replace(r" n\/a$", "", regex=True)
                              .str.replace(r"^([\w\'\-]+) (.+)", r"\1, \2", regex=True)
                              .str.replace(r"(\w+)\’$", r"\1", regex=True)
                              .str.replace(r"(\w+)\`$", r"\1", regex=True)
                              .str.replace(r"de\`andrea", r"de'andrea", regex=True)
                              .str.replace(r"\, \/", ", ", regex=True)
                              .str.replace(r"n\/a (\w+)$", r"\1", regex=True)
    )
    
    # Remove extra spaces
    df.loc[:, "full_name"] = df.full_name.str.replace(r'(\w+)  (\w+)', r'\1 \2', regex=True)
    
    # Define regex patterns
    name_pattern = r'^([\w\'\-]+),\s*([\w\’?\'?\-?\w?\s]+?)(?:\s+([\w\’?\w?\s]+?))?(?:\s+(jr\.?|sr\.?|i{2,3}|iv))?$'
    suffixes = r'\b(jr\.?|sr\.?|i{2,3}|iv)\b'
    
    # Extract name components
    names = df['full_name'].str.extract(name_pattern, flags=re.IGNORECASE)
    
    # Assign columns
    df['last_name'] = names[0]
    df['first_name'] = names[1]
    df['middle_name'] = names[2]
    df['suffix'] = names[3]
    
    # Clean up middle name and suffix
    df['suffix'] = df['suffix'].fillna('').str.strip()
    
    # Handle cases where suffix might be in the middle_name column
    mask = df['middle_name'].str.contains(suffixes, case=False, na=False)
    df.loc[mask, 'suffix'] = df.loc[mask, 'middle_name'].str.extract(f'({suffixes})', flags=re.IGNORECASE)[0]
    df.loc[mask, 'middle_name'] = df.loc[mask, 'middle_name'].str.replace(suffixes, '', flags=re.IGNORECASE).str.strip()
    return df[~((df.last_name.fillna("") == ""))]

def remove_suffix(df):
    df.loc[:, "middle_name"] = (df
                                .middle_name
                                .str.replace(r"\s+", "", regex=True)
                                .str.replace(r"^iii$", "", regex=True)

    )
    return df 


def clean_rank(df):
    df.loc[:, "rank"] = (
        df.ofc_rank.str.lower()
        .str.strip()
        .str.replace(r"^ (\w+)", r"\1", regex=True)
        .str.replace(r"(\w+) $", r"\1", regex=True)
        .str.replace(r"comm\.", "communications", regex=True)
        .str.replace(r"corr\. \/", "corrections", regex=True)
        .str.replace(
            r"transfer ofc \(corr ofc certified\)",
            "transfer officer (certified corrrections officer)",
            regex=True,
        )
        .str.replace(
            r"unit manager \(corr ofc certified\)",
            "unit manager (certified corrections officer)",
            regex=True,
        )
        .str.replace(
            r"reserve ofc - lt \(peace ofc\)",
            "reserve lieutenant (peace officer)",
            regex=True,
        )
        .str.replace(r"(.+)? ?fire (.+)?", "", regex=True)
        .str.replace(r" \(s\.a\.c\.\)", "", regex=True)
        .str.replace(
            r"reserve ofc-lt\. col\.\(peace ofc\)",
            "reserve lieutenant colonel (peace officer)",
            regex=True,
        )
        .str.replace(r"\(corr ofc - sworn\)", "(sworn corrections officer)")
        .str.replace(r"corr\. off\.", "corrections officer", regex=True)
        .str.replace(r"ofc$", "officer", regex=True)
        .str.replace(r"asst\.? ", "assistant ", regex=True)
        .str.replace(r"med\.", "medical", regex=True)
        .str.replace(r"admin\.", "administrative", regex=True)
        .str.replace(
            r"reserve ofc - sgt \(peace ofc\)",
            "reserve sergeant (peace officer)",
            regex=True,
        )
        .str.replace(r"crime scene inv ", "crime scene investigator ", regex=True)
        .str.replace(r"- peace ofc \(sworn\)", "(sworn peace officer)", regex=True)
        .str.replace(r"reserve ofc - major \(peace ofc\)", "reserve major", regex=True)
        .str.replace(r"\(corr ofc - sworn\)", "(sworn corrections officer)", regex=True)
        .str.replace(r"part time", "part-time", regex=False)
        .str.replace(r"\(corr ofc certified\)", "(certified corrections officer)")
        .str.replace(r"mun\. prob\.", "municipal probation", regex=True)
        .str.replace(r"reg\. director", "regional director", regex=True)
        .str.replace(
            r"\(corr ofc certified\)", "certified corrections officer", regex=True
        )
        .str.replace(r"inv\.", "investigator", regex=True)
        .str.replace(r"reserve ofc - cpt \(peace ofc\)", "reserve captain", regex=True)
        .str.replace(r"dep\.", "deputy", regex=True)
        .str.replace(r"commissnr", "commissioner", regex=False)
        .str.replace(r"comm ", "communications ", regex=False)
        .str.replace(r"chief probation ofc", "chief probation officer", regex=False)
        .str.replace(r"i\.d\. technician", "identification technician", regex=True)
        .str.replace(r"gbi agent", "georgia bureau of investigation agent", regex=False)
        .str.replace(r"(\w+)-(\w+)", r"\1 \2", regex=True)
        .str.replace(r"non sworn", "non-sworn", regex=False)
        .str.replace(r"part time", "part-time", regex=False)
        .str.replace(r"(\w+)- (\w+)", r"\1-\2", regex=True)
    ).str.title()

    df.loc[:, "agcy_name"] = df.agcy_name.str.replace(r"Iii", "III", regex=False)
    return df


def clean_agency(df):
    df.loc[:, "agcy_name"] = (
        df.agcy_name.str.lower()
        .str.strip()
        .str.replace(r"^g(\w{4}) (.+)", r"\2", regex=True)
        .str.replace(r" \/ (\w+)$", "", regex=True)
        .str.replace(r" 911$", "", regex=True)
        .str.replace(r"c\.i\.", "correctional institution", regex=True)
        .str.replace(r"dept\.?", "department ", regex=True)
        .str.replace(r"sheriffs", "sheriff's", regex=False)
        .str.replace(r"dept\.$", "department", regex=True)
        .str.replace(r"^ (\w+)", r"\1", regex=True)
        .str.replace(r"(\w+) $", r"\1", regex=True)
        .str.replace(
            r"georgia d\.n\.r\. (.+)",
            "georgia department of natural resources",
            regex=True,
        )
        .str.replace(r"\/(inactive|18 mos\.)$", "", regex=True)
        .str.replace(r"l\.e\.a\.", "law enforcement academy", regex=True)
        .str.replace(r"d\.p\.s\.", "department of public safety", regex=True)
        .str.replace(r" \(inactive\)$", "", regex=True)
        .str.replace(r"c\. ?i?\.?$", "correctional institution", regex=True)
        .str.replace(r"^not found$", "", regex=True)
        .str.replace(r" & ", " and ", regex=False)
        .str.replace(r"juv\.justice", "juvenile justice", regex=True)
        .str.replace(r"(\w)  (\w+)", r"\1 \2", regex=True)
        .str.replace(r"metro\.", "metro", regex=True)
        .str.replace(r"tech\.", "tech", regex=True)
        .str.replace(r"^gdc ", "georgia department of corrections", regex=True)
        .str.replace(r"d\.o\.t\.", "department of transportation", regex=True)
        .str.replace(
            r"eot \(equivalency of training\)", "equivalency of training", regex=True
        )
        .str.replace(r"co\. ", "county ", regex=True)
        .str.replace(r"e?-?9-1-1", "911", regex=True)
        .str.replace(r" ci$", "correctional institute", regex=True)
        .str.replace(r"cherokee co\.", "cherokee county", regex=True)
        .str.replace(r"^ (\w+)", r"\1", regex=True)
        .str.replace(r"(\w+) $", r"\1", regex=True)
    ).str.title()

    df.loc[:, "agcy_name"] = df.agcy_name.str.replace(r"\'S", "s", regex=True)
    return df

def clean_employment_status(df):
    df.loc[:, "employment_status"] = (
        df.employment_status.str.replace(r"^ (\w+)", r"\1", regex=True)
        .str.replace(r"^zz$", "", regex=True)
        .str.replace(r"w\/in", "within", regex=True)
        .str.title()
    )
    return df

def upper_case(df):
    for col in df.columns:
        df = df.apply(lambda col: col.astype(str).str.upper())

    return df
dfa = read_employment()

dfa = dfa.pipe(split_names).pipe(clean_person_nbr).pipe(remove_suffix).pipe(clean_rank).pipe(clean_agency).pipe(clean_employment_status)

dfa

  mask = df['middle_name'].str.contains(suffixes, case=False, na=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, "rank"] = (


Unnamed: 0,person_nbr,full_name,agcy_name,ofc_rank,employment_status,start_date,end_date,last_name,first_name,middle_name,suffix,rank
0,o143810,"a'giza, dalila",Dekalb County Police Department,PEACE OFFICER,Voluntary Resignation,2007-09-10,2007-09-10,a'giza,dalila,,,Peace Officer
1,o255181,"aagaard, jeffrey alan",Franklin County Sheriffs Office,JAILOR,Actively Employed In Law Enforcement,2022-11-25,0000-00-00,aagaard,jeffrey,alan,,Jailor
2,o246465,"aamir, wishah",Elbert County,COMM. OFFICER,Actively Employed In Law Enforcement,2020-06-01,0000-00-00,aamir,wishah,,,Communications Officer
3,o095227,"aanerud, damon h",Chatham County Sheriffs Office,JAILOR,Voluntary Resignation,1999-04-19,2000-10-07,aanerud,damon,h,,Jailor
4,o095227,"aanerud, damon h",Savannah Police Department,PEACE OFFICER,Voluntary Resignation,2000-10-09,2001-03-30,aanerud,damon,h,,Peace Officer
...,...,...,...,...,...,...,...,...,...,...,...,...
482658,o110791,"zygaj, stephen m",Atlanta Police Department,PEACE OFFICER,Rank Change - Promotion,2001-12-18,2020-01-08,zygaj,stephen,m,,Peace Officer
482659,o110791,"zygaj, stephen m",Atlanta Police Department,CAPTAIN,Rank Change - Promotion,2020-01-09,2022-04-13,zygaj,stephen,m,,Captain
482660,o110791,"zygaj, stephen m",Atlanta Police Department,MAJOR,Career Retirement,2022-04-14,2023-09-20,zygaj,stephen,m,,Major
482661,o226212,"zysk, justin michael",Smyrna Police Department,PEACE OFFICER,Voluntary Resignation,2016-09-26,2017-02-03,zysk,justin,michael,,Peace Officer


In [356]:
def read_data():
    df = pd.read_csv("../../data/GA/5-10-2024/officer_data.csv")
    df = df.rename(columns={"OKEY": "person_nbr", "YOB": "birth_year", "SEX": "sex", "RACE": "race"})
    return df

dfb = read_data()

dfb = dfb.pipe(clean_person_nbr)

dfb = dfb[["person_nbr", "birth_year","race", "sex"]]

personnel = pd.merge(dfa, dfb, on="person_nbr")

personnel = personnel.rename(columns={"ofc_rank": "rank"})

personnel = personnel.pipe(upper_case)

personnel.to_csv("data/bln/ga-2024-index.csv", index=False)