In [203]:
# person_nbr
# full_name
# last_name
# first_name
# middle_name
# middle_initial
# suffix
# birth_year
# dob
# age
# agcy_name
# type
# rank
# start_date
# end_date

In [204]:
# ADD to README: 

# RANKS
# 1. the array has values for both "peace officer" and "police officer". should one be picked?
# 2. is "certified corrections officer" analagous to "sworn corrections officer"? Currently these are two distinct values in the rank 
# 3. reserve officer ranks were simplified by removing "officer" when the rank contained a higher rank. 
#    For example "reserve ofc - lt (peace ofc)" was changed to reserve lieutenant (peace officer)

# AGENCIES
# 1. agency_data-20231220144255.csv was used as the groundtruth dataset. 
# Agency data in employment-history.csv was replaced by agency_data-20231220144255.csv.
# Both datasets were nearly analagous, however, as you can see from the get_unique_fuzzy_matches function 
# there was a discrepancy in 11 rows between the datasets. 
# the chosen rows were derived from agency_data-20231220144255.csv

In [205]:
import pandas as pd
from fuzzywuzzy import fuzz

In [206]:
def read_personnel_and_employment_hist():
    df = pd.read_csv("../../../../data/GA/2022-09-27/employment-history.csv")
    
    df = df.rename(columns={" NAME": "full_name", 
                            " START DATE": "start_date", 
                            " END DATE ":"end_date", 
                            "OKEY": "person_nbr",
                            " AGENCY": "agcy_name",
                            " RANK": "ofc_rank",
                            " STATUS": "type"
       
                           })
    return df


def split_names(df):
    names = df.full_name.str.title().str.strip().str.extract(r"(\w+) (\w+) ?(.+)")

    df.loc[:, "last_name"] = names[0]
    df.loc[:, "middle_name"] = names[2].str.title()
    df.loc[:, "first_name"] = names[1]

    df.loc[:, "full_name"] = df.full_name.str.title()
    return df 


def clean_per_nbr(df):
    df.loc[:, "person_nbr"] = df.person_nbr.str.lower().str.strip().str.replace(r"\s+", "", regex=True)
    return df 


def clean_rank(df):
    df.loc[:, "rank"] = (df
                             .ofc_rank
                             .str.lower()
                             .str.strip()
                             .str.replace(r"^ (\w+)", r"\1", regex=True)
                             .str.replace(r"(\w+) $", r"\1", regex=True)
                             .str.replace(r"comm\.", "communications", regex=True)
                             .str.replace(r"corr\. \/", "corrections", regex=True)
                             .str.replace(r"transfer ofc \(corr ofc certified\)", "transfer officer (certified corrrections officer)", regex=True)
                             .str.replace(r"unit manager \(corr ofc certified\)", "unit manager (certified corrections officer)", regex=True)
                             .str.replace(r"reserve ofc - lt \(peace ofc\)", "reserve lieutenant (peace officer)", regex=True)
                             .str.replace(r"(.+)? ?fire (.+)?", "", regex=True)
                             .str.replace(r" \(s\.a\.c\.\)", "", regex=True)
                             .str.replace(r"reserve ofc-lt\. col\.\(peace ofc\)", "reserve lieutenant colonel (peace officer)", regex=True)
                             .str.replace(r"\(corr ofc - sworn\)", "(sworn corrections officer)")
                             .str.replace(r"corr\. off\.", "corrections officer", regex=True)
                             .str.replace(r"ofc$", "officer", regex=True)
                             .str.replace(r"asst\.? ", "assistant ", regex=True)
                             .str.replace(r"med\.", "medical", regex=True)
                             .str.replace(r"admin\.", "administrative", regex=True)
                             .str.replace(r"reserve ofc - sgt \(peace ofc\)", "reserve sergeant (peace officer)", regex=True)
                             .str.replace(r"crime scene inv ", "crime scene investigator ", regex=True)
                             .str.replace(r"- peace ofc \(sworn\)", "(sworn peace officer)", regex=True)
                             .str.replace(r"reserve ofc - major \(peace ofc\)", "reserve major", regex=True)
                             .str.replace(r"\(corr ofc - sworn\)", "(sworn corrections officer)", regex=True)
                             .str.replace(r"part time", "part-time", regex=False)
                             .str.replace(r"\(corr ofc certified\)", "(certified corrections officer)")
                             .str.replace(r"mun\. prob\.", "municipal probation", regex=True)
                             .str.replace(r"reg\. director", "regional director", regex=True)
                             .str.replace(r"\(corr ofc certified\)", "certified corrections officer", regex=True)
                             .str.replace(r"inv\.", "investigator", regex=True)
                             .str.replace(r"reserve ofc - cpt \(peace ofc\)", "reserve captain", regex=True)
                             .str.replace(r"dep\.", "deputy", regex=True)
                             .str.replace(r"commissnr", "commissioner", regex=False)
                             .str.replace(r"comm ", "communications ", regex=False)
                             .str.replace(r"chief probation ofc", "chief probation officer", regex=False)
                             .str.replace(r"i\.d\. technician", "identification technician", regex=True)
                             .str.replace(r"gbi agent", "georgia bureau of investigation agent", regex=False)
                             .str.replace(r"(\w+)-(\w+)", r"\1 \2", regex=True)
                             .str.replace(r"non sworn", "non-sworn", regex=False)
                             .str.replace(r"part time", "part-time", regex=False)
                             .str.replace(r"(\w+)- (\w+)", r"\1-\2", regex=True)
    ).str.title()

    df.loc[:, "agcy_name"] = df.agcy_name.str.replace(r"Iii", "III", regex=False)
    return df

dfa = read_personnel_and_employment_hist()


dfa = dfa.pipe(split_names).pipe(clean_per_nbr).pipe(clean_rank)


dfa = dfa[~((dfa.agcy_name.fillna("") == ""))]

dfa = dfa.drop(columns=["ofc_rank"])


In [207]:
def read_demo_data():
    df = pd.read_csv("../../../../data/GA/2022-09-27/officer_data-20231220144255.csv", encoding="latin1")

    df = df.rename(columns={" SEX": "sex", 
                            " RACE ": "race", 
                              " YOB": "birth_year", 
                              " MIDDLE": "middle_name",
                              " LAST NAME": "last_name", 
                              " FIRST NAME": "first_name",
                                "OKEY": "person_nbr",
                              })
    return df


dfb = read_demo_data()


dfb = dfb.pipe(clean_per_nbr)


dfb = dfb[["birth_year", "person_nbr"]]

In [208]:
df = pd.merge(dfa, dfb, on="person_nbr")


df = df.drop(df.columns[0], axis=1)

In [209]:
def clean_agency(df):
    df.loc[:, "agcy_name"] = (df.agcy_name
                              .str.lower()
                              .str.strip()
                              .str.replace(r"^g(\w{4}) (.+)", r"\2", regex=True)
                              .str.replace(r" \/ (\w+)$", "", regex=True)
                              .str.replace(r" 911$", "", regex=True)
                              .str.replace(r"c\.i\.", "correctional institution", regex=True)
                              .str.replace(r"dept\.?", "department ", regex=True)
                              .str.replace(r"sheriffs", "sheriff's", regex=False)
                              .str.replace(r"dept\.$", "department", regex=True)
                              .str.replace(r"^ (\w+)", r"\1", regex=True)
                              .str.replace(r"(\w+) $", r"\1", regex=True)
                              .str.replace(r"georgia d\.n\.r\. (.+)", "georgia department of natural resources", regex=True)
                              .str.replace(r"\/(inactive|18 mos\.)$", "", regex=True)
                              .str.replace(r"l\.e\.a\.", "law enforcement academy", regex=True)
                              .str.replace(r"d\.p\.s\.", "department of public safety", regex=True)
                              .str.replace(r" \(inactive\)$", "", regex=True)
                              .str.replace(r"c\. ?i?\.?$", "correctional institution", regex=True)
                              .str.replace(r"^not found$", "", regex=True)
                              .str.replace(r" & ", " and ", regex=False)
                              .str.replace(r"juv\.justice", "juvenile justice", regex=True)
                              .str.replace(r"(\w)  (\w+)", r"\1 \2", regex=True)
                              .str.replace(r"metro\.", "metro", regex=True)
                              .str.replace(r"tech\.", "tech", regex=True)
                              .str.replace(r"^gdc ", "georgia department of corrections", regex=True)
                              .str.replace(r"d\.o\.t\.", "department of transportation", regex=True)
                              .str.replace(r"eot \(equivalency of training\)", "equivalency of training", regex=True)
                              .str.replace(r"co\. ", "county ", regex=True)
                              .str.replace(r"e?-?9-1-1", "911", regex=True)
                              .str.replace(r" ci$", "correctional institute", regex=True) 
                              .str.replace(r"cherokee co\.", "cherokee county", regex=True)
                              .str.replace(r"^ (\w+)", r"\1", regex=True)
                              .str.replace(r"(\w+) $", r"\1", regex=True)
                 
    ).str.title()

    df.loc[:, "agcy_name"] = df.agcy_name.str.replace(r"\'S", "s", regex=True)
    return df 

def extract_agcy_uid(df):
  df.loc[:, "agcy_uid"] = df.agcy_name.str.replace(r"G(\w+) (.+)", r"G\1", regex=True)
  return df 

df = df.pipe(extract_agcy_uid).pipe(clean_agency)


In [210]:
def read_agency():
    df = pd.read_csv("../../../../data/GA/2022-09-27/agency_data-20231220144255.csv", encoding="latin1")

    df = df.rename(columns={"AKEY": "agcy_name"})

    df = df[["agcy_name"]]
    return df

agencies = read_agency().reset_index()

agencies = agencies.rename(columns={"index": "agcy_uid"})

agencies.loc[:, "agcy_uid"] = agencies.agcy_uid.str.replace(r"^ (\w+)", r"\1", regex=True).str.replace(r"(\w+) $", r"\1", regex=True)

def clean_gt_agcy(df): 
    df.loc[:, "agcy_name"] = (df
                                    .agcy_name
                                    .str.lower()
                                    .str.strip()
                                    .str.replace(r"^ (\w+)", r"\1", regex=True)
                                    .str.replace(r"(\w+) $", r"\1", regex=True)
                                    .str.replace(r" ?\/ ?i]nactive$", "", regex=True)
                                    .str.replace(r"dept\.?", "department", regex=True)
                                    .str.replace(r"dept$", "department", regex=True)
                                    .str.replace(r"d\.n\.r\.", "department of natural resources", regex=True)
                                    .str.replace(r"^gdc", "georgia department of corrections", regex=True)
                                    .str.replace(r"c\.? ?i\.?$", "correctional institution", regex=True)
                                    .str.replace(r"d\.p\.s\.", "department of public safety", regex=True)
                                    .str.replace(r"d\.o\.t\.", "department of transportation", regex=True)
                                    .str.replace(r" ?\/ ?inactive$", "", regex=True)
                                    .str.replace(r"co\.", "county", regex=True)
                                    .str.replace(r"e - 911", "911", regex=False)
                                    .str.replace(r"&", "and", regex=False)
                                    .str.replace(r"juv\.justice", "juvenile justice", regex=True)
                                    .str.replace(r"sheriffs", "sheriff's", regex=True)
                                    .str.replace(r"tech\.", "tech", regex=True)
                                    .str.replace(r"metro\.", "metro", regex=True)
                                    .str.replace(r"standards-investigation", "standards and investigation", regex=False)
                                    .str.replace(r"(\w+)  (\w+)", r"\1 \2", regex=True)
                                    .str.replace(r"(.+)? fire (.+)?", "", regex=True)
                                    .str.replace(r"spalding co ", "spalding county ", regex=False)
                                    .str.replace(r"athens-clarke co ", "athens-clarke county ", regex=False)
                                    ).str.title()
    df.loc[:, "agcy_name"] = df.agcy_name.str.replace(r"\'S", "s", regex=True)
    return df[~((df.agcy_name.fillna("") == ""))] 

agencies = agencies.pipe(clean_gt_agcy)

agencies


Unnamed: 0,agcy_uid,agcy_name
0,G1603,A.B.A.C. Police Department
1,G1639,Abbeville Police Department
2,G1510,Acworth Police Department
3,G1129,Adairsville Police Department
4,G2496,Adel City Marshals Office
...,...,...
1211,G1003,Worth County Sheriffs Office
1212,G1358,Wrens Police Department
1213,G1520,Wrightsville Police Department
1214,G1773,Young Harris College Dps


In [211]:
merged_df = pd.merge(df, agencies, on="agcy_uid")

# calculate scores
merged_df['fuzzy_score'] = merged_df.apply(lambda x: fuzz.ratio(x['agcy_name_x'], x['agcy_name_y']), axis=1)

In [212]:
def get_unique_fuzzy_matches(df, threshold):
    # Create an empty list to store the results
    results = []
    
    # Get the unique fuzzy scores between the threshold and 100
    unique_scores = df[(df['fuzzy_score'] < 100) & (df['fuzzy_score'] >= threshold)]['fuzzy_score'].unique()
    unique_scores = sorted(unique_scores)
    
    for score in unique_scores:
        subset_df = df[df['fuzzy_score'] == score]
        agcy_name_x = subset_df['agcy_name_x'].iloc[0]
        agcy_name_y = subset_df['agcy_name_y'].iloc[0]
        results.append({'fuzzy_score': score, 'agcy_name_x': agcy_name_x, 'agcy_name_y': agcy_name_y})
    
    comparison_df = pd.DataFrame(results)
    return comparison_df

get_unique_fuzzy_matches(merged_df, 1)

Unnamed: 0,fuzzy_score,agcy_name_x,agcy_name_y
0,81,Georgia Department Of Natural Resources,Georgia Department Of Natural Resources (Law E...
1,83,Griffin Spalding Co,Griffin Spalding County 911
2,85,Tift County,Tift County 911
3,86,Towns County,Towns County 911
4,87,Elbert County,Elbert County 911
5,88,Lowndes County,Lowndes County 911
6,89,Oglethorpe County,Oglethorpe County 911
7,91,Jackson Countycorrectional Institute,Jackson County Correctional Institution
8,94,Georgia Department Of Correctionsoffice Of Pro...,Georgia Department Of Corrections Office Of Pr...
9,97,Georgia Department Of Correctionsspecial Opera...,Georgia Department Of Corrections Special Oper...


In [213]:
merged_df = merged_df.drop(columns=["agcy_name_x", "agcy_uid", "fuzzy_score"])

cleaned_df = merged_df.rename(columns={"agcy_name_y": "agcy_name"})

cleaned_df

Unnamed: 0,person_nbr,full_name,type,start_date,end_date,last_name,middle_name,first_name,rank,birth_year,agcy_name
0,o246465,Aamir Wishah,Actively Employed in Law Enforcement,2020-06-01,0000-00-00,Aamir,H,Wisha,Communications Officer,1996,Elbert County 911
1,o095227,Aanerud Damon H,Voluntary Resignation,1999-04-19,2000-10-07,Aanerud,H,Damon,Jailor,1972,Chatham County Sheriffs Office
2,o095227,Aanerud Damon H,Voluntary Resignation,2000-10-09,2001-03-30,Aanerud,H,Damon,Peace Officer,1972,Savannah Police Department
3,o095227,Aanerud Damon H,Rank Change - Promotion,2001-03-10,2009-09-27,Aanerud,H,Damon,Corporal,1972,Pooler Police Department
4,o095227,Aanerud Damon H,Actively Employed in Law Enforcement,2009-09-28,0000-00-00,Aanerud,H,Damon,Sergeant,1972,Pooler Police Department
...,...,...,...,...,...,...,...,...,...,...,...
134658,o136250,Zydonyk Ashby L,Rank Change - Promotion,2020-01-20,2021-01-01,Zydonyk,L,Ashby,Captain,1985,Newington Police Department
134659,o136250,Zydonyk Ashby L,Actively Employed in Law Enforcement,2021-01-01,0000-00-00,Zydonyk,L,Ashby,Chief,1985,Newington Police Department
134660,o110791,Zygaj Stephen M,Rank Change - Promotion,2001-12-18,2020-01-08,Zygaj,M,Stephen,Peace Officer,1962,Atlanta Police Department
134661,o110791,Zygaj Stephen M,Rank Change - Promotion,2020-01-09,2022-04-13,Zygaj,M,Stephen,Captain,1962,Atlanta Police Department


In [214]:
def case_types(df):
    df.loc[:, "type"] = (df.type.str.replace(r"^ (\w+)", r"\1", regex=True)
                             .str.replace(r"^zz$", "", regex=True)
                             .str.replace(r"w\/in", "within", regex=True)
                             .str.capitalize())
    return df 

cleaned_df = cleaned_df.pipe(case_types)

cleaned_df.person_nbr.value_counts(ascending=False)

review = cleaned_df[cleaned_df.person_nbr.str.contains("o112776")]

review

Unnamed: 0,person_nbr,full_name,type,start_date,end_date,last_name,middle_name,first_name,rank,birth_year,agcy_name
67332,o112776,Knight Adam L,Voluntary resignation,2002-04-01,2003-01-05,Knight,L,Adam,Jail Officer,1981,Acworth Police Department
67333,o112776,Knight Adam L,Transfer,2002-10-14,2003-06-22,Knight,L,Adam,Reserve Officer,1981,Euharlee Police Department
67334,o112776,Knight Adam L,Voluntary resignation,2003-06-23,2003-10-28,Knight,L,Adam,Peace Officer,1981,Euharlee Police Department
67335,o112776,Knight Adam L,Voluntary resignation,2003-07-28,2003-10-28,Knight,L,Adam,Peace Officer,1981,Emerson Police Department
67336,o112776,Knight Adam L,Voluntary resignation,2003-10-30,2005-04-30,Knight,L,Adam,Peace Officer,1981,Cartersville Police Department
67337,o112776,Knight Adam L,Voluntary resignation,2005-05-02,2005-08-02,Knight,L,Adam,Peace Officer,1981,Rome Police Department
67338,o112776,Knight Adam L,Voluntary resignation,2005-08-03,2006-01-20,Knight,L,Adam,Peace Officer,1981,Cartersville Police Department
67339,o112776,Knight Adam L,Voluntary resignation,2006-01-22,2007-02-20,Knight,L,Adam,Peace Officer,1981,Floyd County Police Department
67340,o112776,Knight Adam L,Voluntary resignation,2006-02-28,2007-11-10,Knight,L,Adam,Peace Officer,1981,City Of White Police Department
67341,o112776,Knight Adam L,Reduction in force/lay off,2006-06-20,2007-04-23,Knight,L,Adam,Peace Officer,1981,Emerson Police Department


In [215]:
# # Assuming your DataFrame is named 'df'
# specified_values = [
#     "Terminated",
#     "Resigned in lieu of termination",
#     "Retired in lieu of termination",
#     "Resigned while under investigation",
#     "Demotion (disciplinary)",
#     "Retired while under investigation"
# ]

# # Filter the DataFrame based on the specified values
# filtered_df = cleaned_df[cleaned_df['type'].isin(specified_values)]

# filtered_df

# filtered_df.loc[:, "start_year"] = filtered_df.start_date.str.replace(r"^(\w{4})-(.+)", r"\1", regex=True)

# filtered_df.loc[:, "end_year"] = filtered_df.end_date.str.replace(r"^(\w{4})-(.+)", r"\1", regex=True)

# filtered_df.loc[:, "start_year"] = filtered_df.start_date.str.replace(r"^(\w{4})-(.+)", r"\1", regex=True)

# filtered_df.start_year.value_counts(ascending=False)

In [216]:
# filtered_df.loc[:, "end_year"] = filtered_df.end_date.str.replace(r"^(\w{4})-(.+)", r"\1", regex=True)

# filtered_df.end_year.value_counts(ascending=False)

In [217]:
# cleaned_df.type.unique()

In [218]:
# cleaned_df.loc[:, "start_year"] = cleaned_df.start_date.str.replace(r"^(\w+)-(\w+)-(\w+)", r"\1", regex=True)


# cleaned_df.start_year.value_counts(ascending=False).to_list()

In [None]:
# cleaned_df.start_year.value_counts().sort_values().to_csv("runit.csv")

In [None]:
# cleaned_df.to_csv("data/cleaned_df.csv", index=False)