In [7]:
import pandas as pd


def read_tbl():
    df = pd.read_excel("../data/input/ms_response_copyofpostrosterfile.xlsx")

    df = df.rename(
        columns={
            "FIRST": "first_name",
            "LAST": "last_name",
            "STATUS": "employment_status",
            "DOH": "start_date",
            "DOT": "end_date",
            "CERT #": "person_nbr",
        }
    )
    return df


def filter_uid(df):
    # Create number pattern for 4 or 5 digits
    pattern = r"^\d{4,5}$"

    # Filter rows where cert matches pattern
    mask = df["person_nbr"].astype(str).str.match(pattern)

    return df[mask]


def fix_agency_name(df):
    df.loc[:, "TYPE"] = df.TYPE.str.replace(r"(\w+) $", r"\1", regex=True)
    df.loc[:, "agency_name"] = df.Agency.str.cat(df.TYPE, sep=" ")
    return df


def clean_dates(df):
    df["start_date"] = pd.to_datetime(df["start_date"], errors="coerce")
    df["end_date"] = pd.to_datetime(df["end_date"], errors="coerce")

    df.loc[:, "start_date"] = (
        df.start_date.astype(str)
        .str.lower()
        .str.replace(r"nat", "", regex=True)
    )
    df.loc[:, "end_date"] = (
        df.end_date.astype(str).str.lower().str.replace(r"nat", "", regex=True)
    )
    return df[~((df.start_date == ""))]


def clean_agency_name(df):
    df.loc[:, "agency_name"] = (
        df.agency_name.str.lower()
        .str.strip()
        .str.replace(r" (so|sheriff)$", " sheriff's office", regex=True)
        .str.replace(r" pd$", " police department", regex=True)
        .str.replace(r"(\w+)\s\+(\w+)", r"\1 \2", regex=True)
        .str.replace(r" dist ", " district ", regex=True)
        .str.replace(r" co ", " county ", regex=False)
        .str.replace(r"(\w+)  (\w+)", r"\1 \2", regex=True)
        .str.replace(r" off$", " office", regex=True)
        .str.replace(r" dept ", " department ", regex=False)
        .str.replace(r"lev brd", "levee board", regex=False)
    )
    return df[~((df.agency_name.fillna("").str.contains("fire")))]


def drop_cols(df):
    df = df.drop(columns=["Agency", "TYPE"])


df = read_tbl()

df = (
    df.pipe(filter_uid)
    .pipe(fix_agency_name)
    .pipe(clean_dates)
    .pipe(clean_agency_name)
)

df.sort_values("person_nbr").head(30)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[selected_item_labels] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['start_date'] = pd.to_datetime(df['start_date'], errors="coerce")
A value is trying to be set on a copy of a slice from a DataFrame.
Try usin

Unnamed: 0,first_name,last_name,Agency,TYPE,employment_status,start_date,end_date,person_nbr,agency_name
36224,Jerry,Hinton,Reservoir Patrol,PD,Full-Time,2001-04-25,2010-04-30,1001,reservoir patrol police department
2850,Bruce,Addison,Brookhaven,PD,Full-Time,2014-12-15,2015-04-07,1002,brookhaven police department
39471,Arvin,Addison,Summit,PD,Part-Time,2008-04-21,2012-06-20,1002,summit police department
20250,Ronald,Sumrall,Jones Co Schools,Schools,Full-Time,1994-09-16,,1004,jones county schools schools
20180,Ronald,Sumrall,Jones Co,SO,Part-Time,2005-11-08,,1004,jones county sheriff's office
487,Don,Scott,AG - MFCU,State,Full-Time,2002-10-01,2008-04-02,1005,ag - mfcu state
20035,Don,Scott,Jones Co,SO,Full-Time,2008-05-01,2016-08-31,1005,jones county sheriff's office
25692,Bobby,Johnson,MDOC,State,Full-Time,1998-10-05,,1006,mdoc state
464,,Dearman,AG - MFCU,State,Full-Time,1998-08-14,2008-03-31,1009,ag - mfcu state
21401,Walter,Martin,Laurel,Chief,Full-Time,1978-04-17,2011-05-27,1010,laurel chief


In [8]:
## done


df.to_csv("../data/output/mississippi-processed.csv", index=False)