In [60]:
import pandas as pd
import gzip

In [61]:
def read_tbl(path="../data/input/Ad Hoc Export 2024_11_01 02_29_45 PM.csv"):
    """Load the raw ad hoc export into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file to read. Defaults to the export file this
        notebook was written against, so calling ``read_tbl()`` with no
        arguments behaves as before.

    Returns
    -------
    pandas.DataFrame
        The CSV contents exactly as parsed by ``pd.read_csv``.
    """
    return pd.read_csv(path)

import pandas as pd

def normalize_employment_history(df, max_employment_sets=21):
    """Reshape the wide export into one row per person per employment record.

    The input has one row per person, with the employment columns repeated
    side by side: the first set is unsuffixed and later sets carry ``.1``,
    ``.2``, ... suffixes. Each set that holds at least one non-null value
    becomes its own output row, prefixed with the person's base columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide export containing the person columns and one or more complete
        sets of employment columns.
    max_employment_sets : int, optional
        How many employment sets to look for (unsuffixed plus ``.1`` through
        ``.{max_employment_sets - 1}``). Defaults to 21.

    Returns
    -------
    pandas.DataFrame
        Person columns followed by the unsuffixed employment columns, in that
        order. Rows keep the input order (person by person, then set by set).
        If nothing qualifies, an empty frame with those columns is returned.

    Raises
    ------
    KeyError
        If one of the person columns is missing from ``df`` (only reached
        when ``df`` has at least one row).
    """
    # Person-level columns that are copied onto every output row.
    base_columns = [
        'Person LEPS Subcommittee ID',
        'Person First Name',
        'Person Middle Name',
        'Person Last Name',
        'Person Suffix',
        'Person UDF - Date Certified as an Officer'
    ]

    # Unsuffixed names of the columns that repeat once per employment record.
    employment_base_cols = [
        'Employment Start Date',
        'Employment End Date',
        'Employment Appointment Type',
        'Employment Employment Type',
        'Employment Title/Rank (Current)',
        'Employment Status',
        'Employment Change Reason',
        'Employment Change Comment',
        'Is Primary Employment',
        'Employment UDF - Out of State Certification',
        'Employing Organization Name',
        'Employing Organization ID',
        'Employing Organization Parent Name',
        'Employing Organization Abbreviation',
        'Employing Organization Status',
        'Employing Organization Department UCR Number',
        'Employing Organization Is Employer',
        'Employing Organization Show In Portal',
        'Show As Beneficiary'
    ]

    # Collect every employment set that is completely present in the frame.
    # A set with any column missing is skipped as a whole.
    employment_sets = []
    for i in range(max_employment_sets):
        suffix = '' if i == 0 else f'.{i}'
        current_set = [col + suffix for col in employment_base_cols]
        if all(col in df.columns for col in current_set):
            employment_sets.append(current_set)

    normalized_rows = []
    for _, row in df.iterrows():
        base_data = {col: row[col] for col in base_columns}

        for emp_cols in employment_sets:
            # Pair each suffixed column with its unsuffixed name directly
            # instead of splitting on '.', which would truncate any column
            # name that itself contains a dot.
            employment_data = {
                name: row[col]
                for name, col in zip(employment_base_cols, emp_cols)
            }

            # Skip sets that are entirely empty for this person.
            if any(pd.notna(v) for v in employment_data.values()):
                normalized_rows.append({**base_data, **employment_data})

    # Passing columns= fixes the column order and also yields a correctly
    # shaped empty frame when no rows were collected.
    sorted_columns = base_columns + employment_base_cols
    return pd.DataFrame(normalized_rows, columns=sorted_columns)


def rename_cols(df):
    """Return a copy of ``df`` with the export's column labels replaced by
    short snake_case names.

    Columns that are not listed in the mapping keep their original labels.
    """
    column_map = {
        "Person LEPS Subcommittee ID": "person_nbr",
        "Person First Name": "first_name",
        "Person Middle Name": "middle_name",
        "Person Last Name": "last_name",
        "Person Suffix": "suffix",
        "Person UDF - Date Certified as an Officer": "certification_date",
        "Employment Start Date": "start_date",
        "Employment End Date": "end_date",
        "Employment Employment Type": "employment_status",
        "Employing Organization Name": "agency_name",
        "Employment Title/Rank (Current)": "rank",
        "Employment Status": "separation_reason",
    }
    renamed = df.rename(columns=column_map)
    return renamed

def filter_cols(df):
    """Keep only the renamed columns used downstream, in a fixed order.

    Raises ``KeyError`` if any of the expected columns is absent.
    """
    wanted = [
        "person_nbr",
        "first_name",
        "middle_name",
        "last_name",
        "suffix",
        "certification_date",
        "start_date",
        "end_date",
        "employment_status",
        "agency_name",
        "rank",
        "separation_reason",
    ]
    subset = df[wanted]
    return subset


def clean_separation_reason(df):
    """Replace missing ``separation_reason`` values with an empty string.

    Returns a new DataFrame; the frame passed in is left untouched, so the
    function is safe to call on a slice of another frame.
    """
    return df.assign(separation_reason=df["separation_reason"].fillna(""))

def clean_middle_name(df):
    """Strip every period from ``middle_name`` (e.g. ``"J."`` becomes ``"J"``).

    Missing values stay missing. Returns a new DataFrame; the frame passed in
    is left untouched.
    """
    without_periods = df["middle_name"].str.replace(r"\.", "", regex=True)
    return df.assign(middle_name=without_periods)

def clean_agency_name(df):
    """Normalize ``agency_name`` and drop rows whose name ends up empty.

    Steps, in order: lower-case, trim surrounding whitespace, expand a
    trailing " pd" to " police department", expand "dept" at a word start to
    "department", and blank out one known junk entry. Rows whose cleaned name
    is the empty string are removed; rows with a missing name are kept.

    Returns a new DataFrame; the frame passed in is left untouched.
    """
    cleaned = (
        df["agency_name"]
        .str.lower()
        .str.strip()
        .str.replace(r" pd$", " police department", regex=True)
        .str.replace(r"\bdept", "department", regex=True)
        # One-off junk value in the export; blanking it lets the filter
        # below drop the row.
        .str.replace(r"wellsville oh pd   1992-94", "", regex=True)
    )
    out = df.assign(agency_name=cleaned)
    return out[~(out["agency_name"] == "")]

def clean_suffix(df):
    """Normalize ``suffix``: lower-case, trim whitespace, remove periods.

    Missing values stay missing. Returns a new DataFrame; the frame passed in
    is left untouched.
    """
    cleaned = (
        df["suffix"]
        .str.lower()
        .str.strip()
        .str.replace(r"\.", "", regex=True)
    )
    return df.assign(suffix=cleaned)


def fix_dates(df):
    """Convert ``start_date`` and ``end_date`` to ``YYYY-MM-DD`` strings and
    drop rows without a usable start date.

    Values that cannot be parsed as dates become the empty string. Rows whose
    ``start_date`` is empty after conversion are removed; an empty
    ``end_date`` is kept.

    Formatting uses ``strftime`` rather than ``astype(str)`` followed by
    stripping "nat", so the result does not depend on how the intermediate
    column happens to be stored. Returns a new DataFrame; the frame passed in
    is left untouched.
    """
    def _to_iso_date(values):
        # Unparseable or missing entries become NaT, which strftime turns
        # into NaN; those are then written out as "".
        parsed = pd.to_datetime(values, errors="coerce")
        return parsed.dt.strftime("%Y-%m-%d").fillna("")

    out = df.assign(
        start_date=_to_iso_date(df["start_date"]),
        end_date=_to_iso_date(df["end_date"]),
    )
    return out[out["start_date"] != ""]


# Load the raw export, then reshape it to one row per employment record and
# clean each field. `df` ends this cell holding the cleaned result that the
# following cells inspect and write out.
df = read_tbl()

df = (df
      .pipe(normalize_employment_history)
      .pipe(rename_cols)
      .pipe(filter_cols)
      .pipe(clean_separation_reason)
      .pipe(clean_middle_name)
      .pipe(clean_agency_name)
      .pipe(clean_suffix)
      .pipe(fix_dates)
)


In [62]:
# Distinct suffix values after cleaning. Only the last expression in a cell
# is rendered, so this result is computed but not shown.
df["suffix"].unique()

# Render the cleaned frame as the cell output.
df

Unnamed: 0,person_nbr,first_name,middle_name,last_name,suffix,certification_date,start_date,end_date,employment_status,agency_name,rank,separation_reason
0,0880-0382,Chad,J,Clatterbuck,,4/27/2007,2005-12-18,,Full Time,ohio county sheriff's office,Corporal,Active
1,0880-0382,Chad,J,Clatterbuck,,4/27/2007,2007-09-17,2012-01-15,,clearview police department,Officer,Resigned - Status Unknown
2,0880-0382,Chad,J,Clatterbuck,,4/27/2007,2013-03-12,2019-06-24,Part Time,west liberty state university police department,Officer,Resigned - In Good Standing
3,3412-3729,Michael,J,Billiter,,4/27/2007,2005-12-19,2013-07-03,,paden city police department,Officer,Other
4,6031-5728,William,F,Kearns,,,2005-12-13,2007-06-04,,new cumberland police department,Officer,Resigned - Status Unknown
...,...,...,...,...,...,...,...,...,...,...,...,...
18371,3948-7424,Jayden,Garrett,Straughn,,,2024-03-05,,Full Time,nicholas county sheriff's office,Deputy,Active
18372,7685-8787,Rafael,,Martinez-Reynoso,,,2024-05-13,2024-06-07,,south charleston police department,Officer,Separated
18373,9085-9393,Zachary,Michael,Blosser,,,2024-06-27,,Full Time,beckley police department,Officer,Active
18374,2137-2350,Jonathan,Bruce,Murphy,,,2024-10-15,,Full Time,kanawha county sheriff's office,Deputy,Active


In [63]:
# Write the cleaned frame to the output directory, without the index column.
df.to_csv(path_or_buf="../data/output/west-virginia-processed.csv", index=False)