In [1]:
import pandas as pd

In [2]:
def read_tbl(
    path="../data/input/006_PR_AllOfficers_w_AppointmentsAndFinalActions.csv",
):
    """Load the officers CSV and return it with standardized column names.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the source CSV. Defaults to the file this notebook has
        always read, so a bare ``read_tbl()`` call behaves as before.

    Returns
    -------
    pandas.DataFrame
        An independent frame holding only the renamed columns, in this order:
        ``agency_name``, ``person_nbr``, ``first_name``, ``middle_name``,
        ``last_name``, ``start_date``, ``end_date``, ``separation_reason``.

    Raises
    ------
    KeyError
        If any of the expected source columns is absent from the CSV
        (``rename`` ignores unknown keys; the column selection does not).
    """
    # Source column -> output column, listed in the output column order.
    column_names = {
        "Agency": "agency_name",
        "PostId": "person_nbr",
        "First_Name": "first_name",
        "Middle_Name": "middle_name",
        "Last_Name": "last_name",
        "AppointedDate": "start_date",
        "TerminationDate": "end_date",
        "Term_Desc": "separation_reason",
    }

    raw = pd.read_csv(path)
    selected = raw.rename(columns=column_names)[list(column_names.values())]

    # Detach the subset from `raw`: downstream cleaners assign to columns of
    # the returned frame, which on pandas without copy-on-write can otherwise
    # emit SettingWithCopyWarning.
    return selected.copy()


def clean_separation_reason(df):
    """Normalize the ``separation_reason`` column of ``df`` and return ``df``.

    Values are lower-cased and stripped, and missing entries become ``""``.
    The labels "misconduct - no", "active", "medical" and "other/unknown" are
    deleted wherever they occur, and "misconduct - yes" is shortened to
    "misconduct". The column is overwritten on the frame that was passed in.
    """
    reasons = df["separation_reason"].str.lower().str.strip().fillna("")

    # Ordered (pattern, replacement) pairs, each applied as a regular
    # expression to the result of the previous one.
    rewrites = [
        (r"misconduct - no|active|medical|other\/unknown", ""),
        (r"misconduct - yes", "misconduct"),
        (r"active", ""),
    ]
    for pattern, replacement in rewrites:
        reasons = reasons.str.replace(pattern, replacement, regex=True)

    df.loc[:, "separation_reason"] = reasons
    return df


def clean_agency(df):
    """Normalize the ``agency_name`` column of ``df`` and return ``df``.

    Names are lower-cased and stripped, then a fixed sequence of replacements
    expands abbreviations ("az", "dept", "pd", "cty", "enf", "&"), completes
    truncated endings ("departme", "departm", "contr", "animal se"), adds a
    space after a trailing hyphen and after commas between words, and drops a
    leading "ret" / "ret," prefix. Missing values stay missing.

    The column is overwritten on the frame that was passed in; that same
    frame is returned so the function can be used with ``DataFrame.pipe``.
    """
    # Ordered (pattern, replacement, is_regex) steps. They run in sequence, so
    # each step sees the output of the previous one.
    steps = (
        (r"^az ", "arizona ", True),
        (r" dept ", " department ", False),
        (r"dept$", "department", True),
        (r"departme$", "department", True),
        (r" enf ", " enforcement ", False),
        (r" az ", " arizona ", False),
        (r" & ", " and ", False),
        (r"contr$", "control", True),
        (r"pd$", "police department", True),
        (r"departm$", "department", True),
        (r" cty ", " county ", False),
        (r"-(\w+)$", r"- \1", True),
        (r"(\w+)\,(\w+)", r"\1, \2", True),
        # Only expand the truncated form: the word boundary keeps a name that
        # already reads "animal services" from becoming
        # "animal servicesrvices".
        (r"animal se\b", "animal services", True),
        (r"^ret\,? ", "", True),
    )

    names = df["agency_name"].str.lower().str.strip()
    for pattern, replacement, is_regex in steps:
        names = names.str.replace(pattern, replacement, regex=is_regex)

    df.loc[:, "agency_name"] = names
    return df


# Load the input table, then normalize separation reasons and agency names.
df = clean_agency(clean_separation_reason(read_tbl()))

In [3]:
# Write the cleaned table to the output directory, omitting the index column.
df.to_csv(path_or_buf="../data/output/arizona_index.csv", index=False)