In [106]:
import pandas as pd
import datetime

In [107]:
## Intent: drop the status-date column(s), since their meaning is unclear. NOTE(review): nothing visible in this notebook actually drops them — confirm.

In [108]:
def read_tbl(path="../data/input/EmployeePublicRecordRequestReport.csv"):
    """Load the raw employee public-record-request report.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV export. The default is the path this notebook
        has always read, so a bare ``read_tbl()`` call behaves as before.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pd.read_csv`` with default options.
    """
    return pd.read_csv(path)

# Load the raw CSV export; the pipeline at the bottom of this cell starts from this frame.
df = read_tbl()


### proceed with cleaning

def clean_agency(df):
    """Normalize ``agency_name`` and drop rows whose agency mentions "fire".

    Normalization steps, in order: lowercase, strip surrounding whitespace,
    expand ``dept.`` to ``department``, replace ``&`` with ``and``, expand the
    standalone token ``pd`` to ``police department``, and collapse a run of
    two or more whitespace characters between words (when the run starts with
    a space) into a single space.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column ``agency_name``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the normalized ``agency_name``; rows whose
        normalized name contains ``"fire"`` are removed. The input frame is
        left untouched.
    """
    agency = (
        df["agency_name"]
        .str.lower()
        .str.strip()
        .str.replace(r"dept\.", "department", regex=True)
        .str.replace(r"&", "and", regex=False)
        .str.replace(r"\bpd\b", "police department", regex=True)
        .str.replace(r"(\w+) \s+(\w+)", r"\1 \2", regex=True)
    )
    # na=False: a missing agency name is "not a fire agency" rather than NaN,
    # so the mask is always boolean and can be safely negated.
    is_fire = agency.str.contains("fire", na=False)
    # assign() builds a new frame instead of writing into the caller's.
    return df.assign(agency_name=agency)[~is_fire]

def rename_cols(df):
    """Return a copy of ``df`` whose report headers use the notebook's column names.

    Columns not listed in the mapping are passed through unchanged.
    """
    header_to_name = {
        "First Name": "first_name",
        "Last Name": "last_name",
        "Start Date": "start_date",
        "End Date": "end_date",
        "End Action": "separation_reason",
        "Agency": "agency_name",
        "Certification": "certification_type",
        "Status": "employment_status",
        "POST ID": "person_nbr",
    }
    return df.rename(columns=header_to_name)

def drop_duplicate_rows(df):
    """Drop rows that repeat an earlier (person_nbr, start_date, end_date) triple.

    The first occurrence of each triple is kept (pandas' default).
    NOTE(review): ``agency_name`` is not part of the key, so two rows with the
    same person and dates but different agencies collapse to one — confirm
    that is intended.
    """
    dedupe_keys = ["person_nbr", "start_date", "end_date"]
    deduped = df.drop_duplicates(subset=dedupe_keys)
    return deduped


def clean_date(date_str, min_year=1800, max_year=2100):
    """Return ``date_str`` if it starts with a plausible four-digit year, else ``None``.

    Parameters
    ----------
    date_str : str
        Date text whose first four characters are expected to be the year.
        Any value that cannot be sliced or parsed (``None``, NaN, garbage
        text, ...) yields ``None``.
    min_year, max_year : int, optional
        Inclusive bounds for an acceptable year. Defaults match the range the
        notebook has always used (1800-2100).

    Returns
    -------
    str or None
        The unchanged input when the year is within bounds; otherwise ``None``.
    """
    try:
        year = int(date_str[:4])
    except Exception:
        # Best-effort parse: any ordinary failure means "not a usable date".
        # Unlike a bare ``except``, this lets KeyboardInterrupt and SystemExit
        # propagate so a long ``.apply`` can still be interrupted.
        return None
    if year < min_year or year > max_year:
        return None
    return date_str


def collapse_contiguous_stints(
    df: pd.DataFrame, bycols=["person_nbr", "agency_name"]
) -> pd.DataFrame:
    """Merge back-to-back employment rows into one row per stint.

    Within each ``bycols`` group, rows are ordered by ``start_date``. A row
    opens a new stint when it starts more than one day after the previous
    row's ``end_date``; otherwise it is folded into the current stint. Each
    stint is reported with its earliest start, its latest end, and, for every
    other column, the value from the stint's last row.

    Missing start dates are treated as 1800-01-01 and missing end dates as
    today while computing stints; starts before 1800-01-01 and ends after
    today are clamped to those sentinels. On output, a start equal to the
    1800-01-01 sentinel or an end equal to today's date is written as
    ``None``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``start_date``, ``end_date`` and every column named in
        ``bycols``. The frame is not modified.
    bycols : list of str, optional
        Columns that identify one person/agency series of stints.

    Returns
    -------
    pandas.DataFrame
        One row per stint: ``bycols`` first, then the remaining input columns
        in their original order, with ``start_date``/``end_date`` formatted as
        ``YYYY-MM-DD`` strings (or ``None``).

    Notes
    -----
    NOTE(review): rows whose ``bycols`` values are missing are excluded by
    pandas' default ``groupby(dropna=True)`` — confirm that is acceptable.
    """
    bycols = list(bycols)  # private copy; never touch the shared default list

    one_day = pd.Timedelta(days=1)
    # Midnight today, so the sentinel is stable for the whole calendar day.
    today = pd.Timestamp.today().normalize()
    ancient = pd.Timestamp("1800-01-01")
    date_format = "%Y-%m-%d"

    # assign() guarantees real datetime64 columns on a *new* frame. Writing
    # through ``df.loc[:, col] = ...`` would mutate the caller's frame and may
    # keep the original object dtype, which breaks the ``.dt`` accessor below.
    working = df.assign(
        start_date=pd.to_datetime(df["start_date"]),
        end_date=pd.to_datetime(df["end_date"]),
    ).sort_values(bycols + ["start_date"])

    # Sentinels are applied after sorting, so rows with a missing start keep
    # their sorted-last position within the group.
    working["start_date"] = working["start_date"].fillna(ancient)
    working["end_date"] = working["end_date"].fillna(today)
    working.loc[working["start_date"] < ancient, "start_date"] = ancient
    working.loc[working["end_date"] > today, "end_date"] = today

    # The first row of a group has no predecessor; using today as its
    # "previous end" keeps it from being flagged as a new stint.
    prv_end = working.groupby(bycols)["end_date"].shift(1, fill_value=today)
    working["new_stint"] = (working["start_date"] - prv_end) > one_day
    # The running count of stint openings numbers the stints 0, 1, 2, ...
    working["stint_id"] = working.groupby(bycols)["new_stint"].cumsum()

    def _last_value(values):
        # Scalar from the final row of the stint.
        return values.iloc[-1]

    summaries = {col: _last_value for col in df.columns if col not in bycols}
    summaries["start_date"] = "min"
    summaries["end_date"] = "max"

    out = working.groupby(bycols + ["stint_id"]).aggregate(summaries).reset_index()
    out["start_date"] = out["start_date"].dt.strftime(date_format)
    out["end_date"] = out["end_date"].dt.strftime(date_format)
    # Turn the sentinels back into missing values.
    out.loc[out["end_date"] == today.strftime(date_format), "end_date"] = None
    out.loc[out["start_date"] == ancient.strftime(date_format), "start_date"] = None
    return out.drop(columns=["stint_id"])



# Pipeline: rename headers -> drop duplicate rows -> derive full_name ->
# merge contiguous stints.
# NOTE(review): clean_agency is defined above but is not part of this chain —
# confirm whether it should be.
df = (
    df.pipe(rename_cols)
    .pipe(drop_duplicate_rows)
    .assign(full_name=lambda frame: frame.first_name + " " + frame.last_name)
    .pipe(collapse_contiguous_stints)
)

# Write the collapsed index to disk without the pandas row index.
df.to_csv("../data/output/id-index-2025.csv", index=False)

In [109]:
# df.to_csv("../data/output/idaho_index_new.csv", index=False)