In [19]:
import pandas as pd
import re

In [20]:
def clean_person_nbr(df):
    df.loc[:, "person_nbr"] = df.person_nbr.str.lower().str.strip().str.replace(r"\s+", "", regex=True)
    return df 

def clean_case_id(df):
    df.loc[:, "case_id"] = df.case_id.str.lower().str.strip().str.replace(r"\s+", "", regex=True)
    return df 

In [21]:
def read_employment():
    df = pd.read_csv("../../data/GA/5-10-2024/officer_employment.csv")
    
    df = df.rename(columns={"NAME": "full_name", 
                            "START DATE": "start_date", 
                            "END DATE":"end_date", 
                            "OKEY": "person_nbr",
                            "AGENCY": "agency_name",
                            "RANK": "rank",
                            "STATUS": "employment_status"
       
                           })
    return df

def split_names(df):
    df.loc[:, "full_name"] = df.full_name.str.lower().str.strip().str.replace(r"^(\w+) (.+)", r"\1, \2", regex=True)
    names = df['full_name'].str.extract(r"(\w+),\s+(\w+)\s*(.+)?")
    
    df['last_name'] = names[0]
    df['first_name'] = names[1]
    
    suffixes = r"\b(Jr\.?|Sr\.?|I{2,3}|IV)\b"
    
    def split_rest(rest):
        if pd.isna(rest):
            return pd.Series({'middle_name': "", 'suffix': ""})
        
        # Check for suffix first
        suffix_match = re.search(suffixes, rest)
        if suffix_match:
            suffix = suffix_match.group()
            # Remove suffix from rest
            rest = re.sub(suffixes, '', rest).strip()
            # If anything remains, it's the middle name
            middle_name = rest if rest else ""
        else:
            suffix = ""
            middle_name = rest.strip() if rest.strip() else ""
        
        return pd.Series({'middle_name': middle_name, 'suffix': suffix})
    
    # Apply the split_rest function
    rest_split = names[2].apply(split_rest)
    
    df['middle_name'] = rest_split['middle_name']
    df['suffix'] = rest_split['suffix']
    
    return df


dfa = read_employment()

dfa = dfa.pipe(split_names).pipe(clean_person_nbr)

In [22]:
def read_data():
    df = pd.read_csv("../../data/GA/5-10-2024/officer_data.csv")
    df = df.rename(columns={"OKEY": "person_nbr", "YOB": "year_of_birth", "SEX": "sex", "RACE": "race"})
    return df 

dfb = read_data()

dfb = dfb.pipe(clean_person_nbr)

dfb = dfb[["person_nbr", "year_of_birth","race", "sex"]]

personnel = pd.merge(dfa, dfb, on="person_nbr")



In [23]:

def read_sanctions():
    df = pd.read_csv("../../data/GA/5-10-2024/officer_sanctions.csv")
    df = df.rename(columns={"OKEY": "person_nbr", "CASE": "case_id", "DATE": "sanction_date", "SANCTION": "sanction"})
    return df 

dfc = read_sanctions().pipe(clean_person_nbr).pipe(clean_case_id)

dfc = dfc[["case_id", "person_nbr", "sanction", "sanction_date"]]


In [24]:
def read_violations():
    df = pd.read_csv("../../data/GA/5-10-2024/officer_violations.csv")
    df = df.rename(columns={"CASE": "case_id", "OKEY": "person_nbr", "VIOLATION": "violation", "VIOLATION DATE": "violation_date"})
    return df 

dfd = read_violations()

dfd = dfd.pipe(clean_person_nbr).pipe(clean_case_id)


dfd = dfd[["case_id", "person_nbr", "violation", "violation_date"]]

cprr = pd.merge(dfc, dfd, on=["case_id", "person_nbr"])

cprr = cprr[~((cprr.violation_date == "0000-00-00"))]

cprr

Unnamed: 0,case_id,person_nbr,sanction,sanction_date,violation,violation_date
1,0045701195,o061330,ADMINISTRATIVE DISMISSAL,1996-05-09,DEPARTMENTAL RULE(S) VIOLATIONS,1994-07-11
12,0024490603,o097012,REVOKE CERTIFICATION,2004-04-08,VIOLATION OF OATH,2003-04-29
13,0024490603,o097012,REVOKE CERTIFICATION,2004-04-08,SEXUAL ASSAULT AGAINST PERSONS IN CUSTODY,2003-04-29
26,0059401005,o073211,REVOKE CERTIFICATION,2006-10-05,VIOLATION OF OATH,2005-10-20
27,0059401005,o073211,REVOKE CERTIFICATION,2006-10-05,"EAVESDROPPING, SURVEILLANCE WHICH INVADES PRIV...",2005-10-20
...,...,...,...,...,...,...
78915,0054361007,o133176,PROBATION 36 MONTHS,2011-06-08,TESTED POSITIVE FOR DRUGS IN SYSTEM,2007-09-05
78916,0054361007,o133176,PROBATION 36 MONTHS,2011-06-08,POSSESSION OF CONTROLLED DRUGS - COCAINE,2007-09-05
78917,0054361007,o133176,REINSTATEMENT OF CERTIFICATION,2011-06-08,DEPARTMENTAL RULE(S) VIOLATIONS,2007-09-05
78918,0054361007,o133176,REINSTATEMENT OF CERTIFICATION,2011-06-08,TESTED POSITIVE FOR DRUGS IN SYSTEM,2007-09-05


In [25]:
merged_df = pd.merge(
    cprr,
    personnel,
    on='person_nbr',
    how='left'
)

# Filter to keep only records where violation_date is between start_date and end_date
merged_df = merged_df[
    (merged_df['violation_date'] >= merged_df['start_date']) &
    (merged_df['violation_date'] <= merged_df['end_date'])
]

def normalize_dataframe(df):
    for col in df.columns:
        if df[col].dtype == 'object':
            print(f"Cleaning column: {col}")
            # Convert to string, handle NaN values, then clean
            df[col] = df[col].astype(str).replace('nan', '').str.lower().str.strip()
        else:
            print(f"Skipping non-object column: {col}")    
    return df

merged_df = normalize_dataframe(merged_df)

merged_df

Cleaning column: case_id
Cleaning column: person_nbr
Cleaning column: sanction
Cleaning column: sanction_date
Cleaning column: violation
Cleaning column: violation_date
Cleaning column: full_name
Cleaning column: agency_name
Cleaning column: rank
Cleaning column: employment_status
Cleaning column: start_date
Cleaning column: end_date
Cleaning column: last_name
Cleaning column: first_name
Cleaning column: middle_name
Cleaning column: suffix
Skipping non-object column: year_of_birth
Cleaning column: race
Cleaning column: sex


Unnamed: 0,case_id,person_nbr,sanction,sanction_date,violation,violation_date,full_name,agency_name,rank,employment_status,start_date,end_date,last_name,first_name,middle_name,suffix,year_of_birth,race,sex
0,0045701195,o061330,administrative dismissal,1996-05-09,departmental rule(s) violations,1994-07-11,"aaron, audrey r",g1276 metro state prison/inactive,peace officer,terminated,1994-02-01,1994-07-11,aaron,audrey,r,,1951.0,black or african american (not hispanic or lat...,female
1,0024490603,o097012,revoke certification,2004-04-08,violation of oath,2003-04-29,"abad, gilberto",g1505 atlanta police department,peace officer,resigned in lieu of termination,1999-12-21,2003-07-03,abad,gilberto,,,1972.0,hispanic or latino,male
2,0024490603,o097012,revoke certification,2004-04-08,sexual assault against persons in custody,2003-04-29,"abad, gilberto",g1505 atlanta police department,peace officer,resigned in lieu of termination,1999-12-21,2003-07-03,abad,gilberto,,,1972.0,hispanic or latino,male
3,0059401005,o073211,revoke certification,2006-10-05,violation of oath,2005-10-20,"abair, perry l",g1682 americus police department,peace officer,resigned in lieu of termination,1996-04-01,2005-11-03,abair,perry,l,,1969.0,white (not hispanic or latino),male
4,0059401005,o073211,revoke certification,2006-10-05,"eavesdropping, surveillance which invades priv...",2005-10-20,"abair, perry l",g1682 americus police department,peace officer,resigned in lieu of termination,1996-04-01,2005-11-03,abair,perry,l,,1969.0,white (not hispanic or latino),male
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120787,0054361007,o133176,probation 36 months,2011-06-08,tested positive for drugs in system,2007-09-05,"zwierko, michael r",g1515 lee county sheriffs office,peace officer,terminated,2005-09-23,2007-09-05,zwierko,michael,r,,1981.0,white (not hispanic or latino),male
120794,0054361007,o133176,probation 36 months,2011-06-08,possession of controlled drugs - cocaine,2007-09-05,"zwierko, michael r",g1515 lee county sheriffs office,peace officer,terminated,2005-09-23,2007-09-05,zwierko,michael,r,,1981.0,white (not hispanic or latino),male
120801,0054361007,o133176,reinstatement of certification,2011-06-08,departmental rule(s) violations,2007-09-05,"zwierko, michael r",g1515 lee county sheriffs office,peace officer,terminated,2005-09-23,2007-09-05,zwierko,michael,r,,1981.0,white (not hispanic or latino),male
120808,0054361007,o133176,reinstatement of certification,2011-06-08,tested positive for drugs in system,2007-09-05,"zwierko, michael r",g1515 lee county sheriffs office,peace officer,terminated,2005-09-23,2007-09-05,zwierko,michael,r,,1981.0,white (not hispanic or latino),male


In [29]:
print(merged_df.violation.value_counts(ascending=False).head(20))

violation
departmental rule(s) violations           8180
conduct unbecoming an officer             2909
failed to notify post of arrest           2028
driving under the influence               1717
involvement with inmates                  1571
undue force                               1340
insubordination                           1133
simple battery                            1074
domestic violence                          914
untrue or deceptive statements             898
deceptive in an internal investigation     739
negligence                                 565
falsified departmental records             538
tested positive for drugs in system        537
inside guard line with contraband          509
theft by taking                            501
sexual harassment                          499
misuse of departmental equipment           442
sleeping on duty                           408
sexual misconduct                          349
Name: count, dtype: int64


In [28]:
print(merged_df.sanction.value_counts(ascending=False).head(20))

sanction
revoke certification                 11790
public reprimand                      6019
probation 24 months                   4643
post application denied               3087
no action                             2759
probation 12 months                   1897
administrative dismissal              1791
probation 36 months                   1503
approve certification                 1092
accept surrender of certification      585
administratively closed                326
suspension                             278
reinstatement of certification         249
probation 18 months                    185
12 months suspension                   163
reinstatement/denied                   153
not sustained                          106
24 months suspension                    97
6 months suspension                     81
rescind previous action                 60
Name: count, dtype: int64
