In [102]:
import pandas as pd
from helper import clean_column_names
from uid import gen_uid

In [104]:
def standardize_key(df):
    df.loc[:, "OKEY"] = df.OKEY.str.lower().str.strip().str.replace(r"\s+", "", regex=True)
    return df 

def standardize_case(df):
    df.loc[:, "CASE"] = df.CASE.str.lower().str.strip().str.replace(r"\s+", "", regex=True)
    return df 


def read_sanctions():
    df = pd.read_csv("../../../data/GA/5-10-2024/officer_sanctions.csv")
    return df

def read_violations():
    df = pd.read_csv("../../../data/GA/5-10-2024/officer_violations.csv")
    return df

def read_demo():
    df = pd.read_csv("../../../data/GA/5-10-2024/officer_data.csv")
    return df


def read_history():
    df = pd.read_csv("../../../data/GA/5-10-2024/officer_employment.csv")
    return df

dfa = read_sanctions()
 
dfa = dfa.pipe(standardize_key).pipe(standardize_case)

dfb = read_violations()

dfb = dfb.pipe(standardize_key).pipe(standardize_case)

# dfb

dfc = read_demo()

dfc = dfc.pipe(standardize_key)


dfd = read_history()

dfd = dfd.pipe(standardize_key)

df1 = pd.merge(dfc, dfd, on="OKEY")
df1 = df1.pipe(clean_column_names)

df1



Unnamed: 0,okey,last_name,first_name,middle,suffix,yob,sex,race,name,agency,rank,status,start_date,end_date
0,o143810,A'GIZA,DALILA,,,1976,Female,Black or African American (Not Hispanic or Lat...,A'GIZA DALILA,G1720 DEKALB COUNTY POLICE DEPARTMENT,PEACE OFFICER,Voluntary Resignation,2007-09-10,2007-09-10
1,o255181,AAGAARD,JEFFREY,ALAN,,1974,Male,White (Not Hispanic or Latino),AAGAARD JEFFREY ALAN,G1318 FRANKLIN COUNTY SHERIFFS OFFICE,JAILOR,Actively Employed in Law Enforcement,2022-11-25,0000-00-00
2,o246465,AAMIR,WISHAH,,,1996,Female,Asian (Not Hispanic or Latino),AAMIR WISHAH,G1928 ELBERT COUNTY 911,COMM. OFFICER,Actively Employed in Law Enforcement,2020-06-01,0000-00-00
3,o095227,AANERUD,DAMON,H,,1972,Male,White (Not Hispanic or Latino),AANERUD DAMON H,G1161 CHATHAM COUNTY SHERIFFS OFFICE,JAILOR,Voluntary Resignation,1999-04-19,2000-10-07
4,o095227,AANERUD,DAMON,H,,1972,Male,White (Not Hispanic or Latino),AANERUD DAMON H,G1244 SAVANNAH POLICE DEPARTMENT,PEACE OFFICER,Voluntary Resignation,2000-10-09,2001-03-30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
482658,o110791,ZYGAJ,STEPHEN,M,,1962,Male,White (Not Hispanic or Latino),ZYGAJ STEPHEN M,G1505 ATLANTA POLICE DEPARTMENT,PEACE OFFICER,Rank Change - Promotion,2001-12-18,2020-01-08
482659,o110791,ZYGAJ,STEPHEN,M,,1962,Male,White (Not Hispanic or Latino),ZYGAJ STEPHEN M,G1505 ATLANTA POLICE DEPARTMENT,CAPTAIN,Rank Change - Promotion,2020-01-09,2022-04-13
482660,o110791,ZYGAJ,STEPHEN,M,,1962,Male,White (Not Hispanic or Latino),ZYGAJ STEPHEN M,G1505 ATLANTA POLICE DEPARTMENT,MAJOR,Career Retirement,2022-04-14,2023-09-20
482661,o226212,ZYSK,JUSTIN,MICHAEL,,1989,Male,White (Not Hispanic or Latino),ZYSK JUSTIN MICHAEL,G1072 SMYRNA POLICE DEPARTMENT,PEACE OFFICER,Voluntary Resignation,2016-09-26,2017-02-03


In [105]:

dfa = dfa[["SANCTION", "DATE", "CASE"]]

df2 = pd.merge(dfa, dfb, on="CASE")

df2 = df2.pipe(clean_column_names)

df2 = df2.rename(columns={"date": "action_date", "sanction": "action", "violation": "allegation", "violation_date": "allegation_date"})

def clean_action(df):
    df.loc[:, "action"] = df.action.str.lower().str.strip().str.title()
    return df

def clean_allegation(df):
    df.loc[:, "allegation"] = df.allegation.str.lower().str.strip()
    return df 

def extract_years(df):
    df.loc[:, "action_year"] = df.action_date.str.replace(r"^(\w{4})(.+)", r"\1", regex=True)
    df.loc[:, "allegation_year"] = df.allegation_date.str.replace(r"^(\w{4})(.+)", r"\1", regex=True)
    return df 

df2 = df2.pipe(clean_action).pipe(clean_allegation).pipe(extract_years)

df2 = df2.drop(columns=["name", "action_year", "allegation_year"])

df2.okey.nunique()

31924

In [106]:
df2.head()

Unnamed: 0,action,action_date,case,okey,allegation,allegation_date
0,Probation 24 Months,2019-01-08,11400218,o239177,conduct unbecoming an officer,0000-00-00
1,Administrative Dismissal,1996-05-09,45701195,o061330,departmental rule(s) violations,1994-07-11
2,No Action,2018-10-12,42591017,o152547,sexual harassment,0000-00-00
3,Probation 24 Months,2013-08-14,5801011,o152547,conduct unbecoming an officer,0000-00-00
4,Probation 24 Months,2013-08-14,5801011,o152547,insubordination,0000-00-00


In [107]:
def custom_date_parser(date_string):
    if pd.isna(date_string) or date_string == "0000-00-00":
        return pd.NaT  
    try:
        return pd.to_datetime(date_string)
    except ValueError:
        return pd.NaT  

df1['start_date'] = df1['start_date'].apply(custom_date_parser)
df1['end_date'] = df1['end_date'].apply(custom_date_parser)
df2['action_date'] = df2['action_date'].apply(custom_date_parser)
df2['allegation_date'] = df2['allegation_date'].apply(custom_date_parser)

df_merged = pd.merge(df1, df2, on='okey', how='left')

# Create a new column to indicate if action_date or allegation_date falls within start and end date range
df_merged['within_range'] = (
    ((df_merged['action_date'] >= df_merged['start_date']) & (df_merged['action_date'] <= df_merged['end_date'])) |
    ((df_merged['allegation_date'] >= df_merged['start_date']) & (df_merged['allegation_date'] <= df_merged['end_date'])) |
    (df_merged['end_date'].isnull())
)

# Filter the merged DataFrame to keep only rows where action_date or allegation_date is within range
df_final = df_merged[df_merged['within_range']]

df_final

Unnamed: 0,okey,last_name,first_name,middle,suffix,yob,sex,race,name,agency,rank,status,start_date,end_date,action,action_date,case,allegation,allegation_date,within_range
1,o255181,AAGAARD,JEFFREY,ALAN,,1974,Male,White (Not Hispanic or Latino),AAGAARD JEFFREY ALAN,G1318 FRANKLIN COUNTY SHERIFFS OFFICE,JAILOR,Actively Employed in Law Enforcement,2022-11-25,NaT,,NaT,,,NaT,True
2,o246465,AAMIR,WISHAH,,,1996,Female,Asian (Not Hispanic or Latino),AAMIR WISHAH,G1928 ELBERT COUNTY 911,COMM. OFFICER,Actively Employed in Law Enforcement,2020-06-01,NaT,,NaT,,,NaT,True
6,o095227,AANERUD,DAMON,H,,1972,Male,White (Not Hispanic or Latino),AANERUD DAMON H,G1177 POOLER POLICE DEPARTMENT,SERGEANT,Actively Employed in Law Enforcement,2009-09-28,NaT,,NaT,,,NaT,True
9,o258654,AARON,ANN,MYREE,,1997,Female,White (Not Hispanic or Latino),AARON ANN MYREE,G1533 OGLETHORPE COUNTY SHERIFFS OFFICE,COMM. OFFICER,Actively Employed in Law Enforcement,2023-07-24,NaT,,NaT,,,NaT,True
15,o061330,AARON,AUDREY,R,,1951,Female,Black or African American (Not Hispanic or Lat...,AARON AUDREY R,G1276 METRO STATE PRISON/INACTIVE,PEACE OFFICER,Terminated,1994-02-01,1994-07-11,Administrative Dismissal,1996-05-09,0045701195,departmental rule(s) violations,1994-07-11,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
620771,o133176,ZWIERKO,MICHAEL,R,,1981,Male,White (Not Hispanic or Latino),ZWIERKO MICHAEL R,G1693 ELLAVILLE POLICE DEPARTMENT,PEACE OFFICER,Actively Employed in Law Enforcement,2020-11-06,NaT,Probation 36 Months,2011-06-08,0054361007,possession of controlled drugs - cocaine,2007-09-05,True
620772,o133176,ZWIERKO,MICHAEL,R,,1981,Male,White (Not Hispanic or Latino),ZWIERKO MICHAEL R,G1693 ELLAVILLE POLICE DEPARTMENT,PEACE OFFICER,Actively Employed in Law Enforcement,2020-11-06,NaT,Reinstatement Of Certification,2011-06-08,0054361007,departmental rule(s) violations,2007-09-05,True
620773,o133176,ZWIERKO,MICHAEL,R,,1981,Male,White (Not Hispanic or Latino),ZWIERKO MICHAEL R,G1693 ELLAVILLE POLICE DEPARTMENT,PEACE OFFICER,Actively Employed in Law Enforcement,2020-11-06,NaT,Reinstatement Of Certification,2011-06-08,0054361007,tested positive for drugs in system,2007-09-05,True
620774,o133176,ZWIERKO,MICHAEL,R,,1981,Male,White (Not Hispanic or Latino),ZWIERKO MICHAEL R,G1693 ELLAVILLE POLICE DEPARTMENT,PEACE OFFICER,Actively Employed in Law Enforcement,2020-11-06,NaT,Reinstatement Of Certification,2011-06-08,0054361007,possession of controlled drugs - cocaine,2007-09-05,True


In [108]:
# Filter out rows where the 'allegation' column is missing (NaN)
df_final_allegation = df_final[df_final['allegation'].notna()]

df_final_allegation

Unnamed: 0,okey,last_name,first_name,middle,suffix,yob,sex,race,name,agency,rank,status,start_date,end_date,action,action_date,case,allegation,allegation_date,within_range
15,o061330,AARON,AUDREY,R,,1951,Female,Black or African American (Not Hispanic or Lat...,AARON AUDREY R,G1276 METRO STATE PRISON/INACTIVE,PEACE OFFICER,Terminated,1994-02-01,1994-07-11,Administrative Dismissal,1996-05-09,0045701195,departmental rule(s) violations,1994-07-11,True
121,o097012,ABAD,GILBERTO,,,1972,Male,Hispanic or Latino,ABAD GILBERTO,G1505 ATLANTA POLICE DEPARTMENT,PEACE OFFICER,Resigned in Lieu of Termination,1999-12-21,2003-07-03,Revoke Certification,2004-04-08,0024490603,violation of oath,2003-04-29,True
122,o097012,ABAD,GILBERTO,,,1972,Male,Hispanic or Latino,ABAD GILBERTO,G1505 ATLANTA POLICE DEPARTMENT,PEACE OFFICER,Resigned in Lieu of Termination,1999-12-21,2003-07-03,Revoke Certification,2004-04-08,0024490603,sexual assault against persons in custody,2003-04-29,True
127,o080105,ABAD,JOSEPH,A,,1972,Male,White (Not Hispanic or Latino),ABAD JOSEPH A,G1065 TIFTON POLICE DEPARTMENT,POLICE OFFICER,Career Retirement,2014-11-30,2023-02-28,No Action,2016-04-08,0057951114,false accusations against fellow employees,NaT,True
128,o080105,ABAD,JOSEPH,A,,1972,Male,White (Not Hispanic or Latino),ABAD JOSEPH A,G1065 TIFTON POLICE DEPARTMENT,POLICE OFFICER,Career Retirement,2014-11-30,2023-02-28,No Action,2016-04-08,0057951114,neglect of duty,NaT,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
620770,o133176,ZWIERKO,MICHAEL,R,,1981,Male,White (Not Hispanic or Latino),ZWIERKO MICHAEL R,G1693 ELLAVILLE POLICE DEPARTMENT,PEACE OFFICER,Actively Employed in Law Enforcement,2020-11-06,NaT,Probation 36 Months,2011-06-08,0054361007,tested positive for drugs in system,2007-09-05,True
620771,o133176,ZWIERKO,MICHAEL,R,,1981,Male,White (Not Hispanic or Latino),ZWIERKO MICHAEL R,G1693 ELLAVILLE POLICE DEPARTMENT,PEACE OFFICER,Actively Employed in Law Enforcement,2020-11-06,NaT,Probation 36 Months,2011-06-08,0054361007,possession of controlled drugs - cocaine,2007-09-05,True
620772,o133176,ZWIERKO,MICHAEL,R,,1981,Male,White (Not Hispanic or Latino),ZWIERKO MICHAEL R,G1693 ELLAVILLE POLICE DEPARTMENT,PEACE OFFICER,Actively Employed in Law Enforcement,2020-11-06,NaT,Reinstatement Of Certification,2011-06-08,0054361007,departmental rule(s) violations,2007-09-05,True
620773,o133176,ZWIERKO,MICHAEL,R,,1981,Male,White (Not Hispanic or Latino),ZWIERKO MICHAEL R,G1693 ELLAVILLE POLICE DEPARTMENT,PEACE OFFICER,Actively Employed in Law Enforcement,2020-11-06,NaT,Reinstatement Of Certification,2011-06-08,0054361007,tested positive for drugs in system,2007-09-05,True


In [109]:
df_final_allegation.okey.nunique()

20195