In [1]:
import re

import pandas as pd

In [2]:
def clean_person_nbr(df):
    df.loc[:, "person_nbr"] = (
        df.person_nbr.str.lower()
        .str.strip()
        .str.replace(r"\s+", "", regex=True)
    )
    return df


def clean_case_id(df):
    df.loc[:, "case_id"] = (
        df.case_id.str.lower().str.strip().str.replace(r"\s+", "", regex=True)
    )
    return df

In [3]:
def read_employment():
    df = pd.read_csv("../data/input/5-10-2024/officer_employment.csv")

    df = df.rename(
        columns={
            "NAME": "full_name",
            "START DATE": "start_date",
            "END DATE": "end_date",
            "OKEY": "person_nbr",
            "AGENCY": "agency_name",
            "RANK": "rank",
            "STATUS": "employment_status",
        }
    )
    return df


def split_names(df):
    # Convert to lowercase, strip whitespace, and ensure last name is separated by comma
    df.loc[:, "full_name"] = (
        df.full_name.str.lower()
        .str.strip()
        .str.replace(r"\.", "", regex=True)
        .str.replace(r"^(\w+)\/ ?(\w+)", r"\1 \2", regex=True)
        .str.replace(r"\((\w+)\/?(\w+)\) ", "", regex=True)
        .str.replace(r" n\/a$", "", regex=True)
        .str.replace(r"^([\w\'\-]+) (.+)", r"\1, \2", regex=True)
        .str.replace(r"(\w+)\’$", r"\1", regex=True)
        .str.replace(r"(\w+)\`$", r"\1", regex=True)
        .str.replace(r"de\`andrea", r"de'andrea", regex=True)
        .str.replace(r"\, \/", ", ", regex=True)
        .str.replace(r"n\/a (\w+)$", r"\1", regex=True)
    )

    # Remove extra spaces
    df.loc[:, "full_name"] = df.full_name.str.replace(
        r"(\w+)  (\w+)", r"\1 \2", regex=True
    )

    # Define regex patterns
    name_pattern = r"^([\w\'\-]+),\s*([\w\’?\'?\-?\w?\s]+?)(?:\s+([\w\’?\w?\s]+?))?(?:\s+(jr\.?|sr\.?|i{2,3}|iv))?$"
    suffixes = r"\b(jr\.?|sr\.?|i{2,3}|iv)\b"

    # Extract name components
    names = df["full_name"].str.extract(name_pattern, flags=re.IGNORECASE)

    # Assign columns
    df["last_name"] = names[0]
    df["first_name"] = names[1]
    df["middle_name"] = names[2]
    df["suffix"] = names[3]

    # Clean up middle name and suffix
    df["suffix"] = df["suffix"].fillna("").str.strip()

    # Handle cases where suffix might be in the middle_name column
    mask = df["middle_name"].str.contains(suffixes, case=False, na=False)
    df.loc[mask, "suffix"] = df.loc[mask, "middle_name"].str.extract(
        f"({suffixes})", flags=re.IGNORECASE
    )[0]
    df.loc[mask, "middle_name"] = (
        df.loc[mask, "middle_name"]
        .str.replace(suffixes, "", flags=re.IGNORECASE)
        .str.strip()
    )
    return df[~((df.last_name.fillna("") == ""))]


def remove_suffix(df):
    df.loc[:, "middle_name"] = df.middle_name.str.replace(
        r"\s+", "", regex=True
    ).str.replace(r"^iii$", "", regex=True)
    return df


dfa = read_employment()

dfa = dfa.pipe(split_names).pipe(clean_person_nbr).pipe(remove_suffix)

  mask = df['middle_name'].str.contains(suffixes, case=False, na=False)
  df.loc[mask, 'middle_name'] = df.loc[mask, 'middle_name'].str.replace(suffixes, '', flags=re.IGNORECASE).str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[selected_item_labels] = value


In [4]:
def read_data():
    df = pd.read_csv("../data/input/5-10-2024/officer_data.csv")
    df = df.rename(
        columns={
            "OKEY": "person_nbr",
            "YOB": "year_of_birth",
            "SEX": "sex",
            "RACE": "race",
        }
    )
    return df


dfb = read_data()

dfb = dfb.pipe(clean_person_nbr)

dfb = dfb[["person_nbr", "year_of_birth", "race", "sex"]]

personnel = pd.merge(dfa, dfb, on="person_nbr")

personnel.to_csv("georgia_index.csv", index=False)

In [5]:
def read_sanctions():
    df = pd.read_csv("../data/input/5-10-2024/officer_sanctions.csv")
    df = df.rename(
        columns={
            "OKEY": "person_nbr",
            "CASE": "case_id",
            "DATE": "sanction_date",
            "SANCTION": "sanction",
        }
    )
    return df


dfc = read_sanctions().pipe(clean_person_nbr).pipe(clean_case_id)

dfc = dfc[["case_id", "person_nbr", "sanction", "sanction_date"]]

In [6]:
def read_violations():
    df = pd.read_csv("../data/input/5-10-2024/officer_violations.csv")
    df = df.rename(
        columns={
            "CASE": "case_id",
            "OKEY": "person_nbr",
            "VIOLATION": "violation",
            "VIOLATION DATE": "violation_date",
        }
    )
    return df


dfd = read_violations()

dfd = dfd.pipe(clean_person_nbr).pipe(clean_case_id)


dfd = dfd[["case_id", "person_nbr", "violation", "violation_date"]]

cprr = pd.merge(dfc, dfd, on=["case_id", "person_nbr"])

cprr = cprr[~((cprr.violation_date == "0000-00-00"))]

cprr

Unnamed: 0,case_id,person_nbr,sanction,sanction_date,violation,violation_date
1,0045701195,o061330,ADMINISTRATIVE DISMISSAL,1996-05-09,DEPARTMENTAL RULE(S) VIOLATIONS,1994-07-11
12,0024490603,o097012,REVOKE CERTIFICATION,2004-04-08,VIOLATION OF OATH,2003-04-29
13,0024490603,o097012,REVOKE CERTIFICATION,2004-04-08,SEXUAL ASSAULT AGAINST PERSONS IN CUSTODY,2003-04-29
26,0059401005,o073211,REVOKE CERTIFICATION,2006-10-05,VIOLATION OF OATH,2005-10-20
27,0059401005,o073211,REVOKE CERTIFICATION,2006-10-05,"EAVESDROPPING, SURVEILLANCE WHICH INVADES PRIV...",2005-10-20
...,...,...,...,...,...,...
78915,0054361007,o133176,PROBATION 36 MONTHS,2011-06-08,TESTED POSITIVE FOR DRUGS IN SYSTEM,2007-09-05
78916,0054361007,o133176,PROBATION 36 MONTHS,2011-06-08,POSSESSION OF CONTROLLED DRUGS - COCAINE,2007-09-05
78917,0054361007,o133176,REINSTATEMENT OF CERTIFICATION,2011-06-08,DEPARTMENTAL RULE(S) VIOLATIONS,2007-09-05
78918,0054361007,o133176,REINSTATEMENT OF CERTIFICATION,2011-06-08,TESTED POSITIVE FOR DRUGS IN SYSTEM,2007-09-05


In [7]:
merged_df = pd.merge(cprr, personnel, on="person_nbr", how="left")

# Filter to keep only records where violation_date is between start_date and end_date
merged_df = merged_df[
    (merged_df["violation_date"] >= merged_df["start_date"])
    & (merged_df["violation_date"] <= merged_df["end_date"])
]


def normalize_dataframe(df):
    for col in df.columns:
        if df[col].dtype == "object":
            print(f"Cleaning column: {col}")
            # Convert to string, handle NaN values, then clean
            df[col] = (
                df[col].astype(str).replace("nan", "").str.lower().str.strip()
            )
        else:
            print(f"Skipping non-object column: {col}")
    return df


merged_df = normalize_dataframe(merged_df)


def norm_cols(df):
    df.loc[:, "year_of_birth"] = df.year_of_birth.astype(str).str.replace(
        r"\.0", "", regex=True
    )
    df.loc[:, "sanction"] = df.sanction.str.title()
    df.loc[:, "violation"] = df.violation.str.title()
    return df


merged_df = merged_df.pipe(norm_cols)

merged_df = merged_df.drop(columns=["employment_status"])

Cleaning column: case_id
Cleaning column: person_nbr
Cleaning column: sanction
Cleaning column: sanction_date
Cleaning column: violation
Cleaning column: violation_date
Cleaning column: full_name
Cleaning column: agency_name
Cleaning column: rank
Cleaning column: employment_status
Cleaning column: start_date
Cleaning column: end_date
Cleaning column: last_name
Cleaning column: first_name
Cleaning column: middle_name
Cleaning column: suffix
Skipping non-object column: year_of_birth
Cleaning column: race
Cleaning column: sex


In [8]:
merged_df.to_csv("../data/output/georgia-discipline_index.csv", index=False)