In [19]:
import numpy as np
import pandas as pd
from datamatch import (
    ThresholdMatcher,
    DissimilarFilter,
    NonOverlappingFilter,
    ColumnsIndex,
    JaroWinklerSimilarity,
    MaxScorer,
    SimSumScorer,
    AlterScorer,
    MultiIndex,
)
from datavalid.spinner import Spinner

In [20]:
def read_personnel():
    """Load the Georgia and Florida personnel files and stack them.

    The Georgia file stores the birth year as ``year_of_birth``; it is renamed
    to ``birth_year`` so both states share the same schema. Only the columns
    listed in ``keep`` survive.

    Returns:
        a frame with the Georgia rows followed by the Florida rows; each
        state's original row index is kept as-is (no re-indexing)
    """
    keep = ["first_name", "last_name", "middle_name", "race", "sex", "birth_year", "state", "uid"]

    georgia = pd.read_csv("./../../../data/GA/match/personnel_georgia.csv").rename(
        columns={"year_of_birth": "birth_year"}
    )
    florida = pd.read_csv("./../../../data/FL/match/personnel_florida.csv")

    return pd.concat([georgia[keep], florida[keep]])


def read_events():
    """Load the Florida and Georgia event files into one frame with split dates.

    Georgia dates arrive as ``YYYY-MM-DD`` (with ``0000-00-00`` standing in for
    "no date"); they are rewritten to the ``MM/DD/YYYY`` layout that the
    parsing below expects, so both states go through the same extraction.

    Rows whose ``event_date`` is empty or does not contain a
    ``<month>/<day>/<year>`` pattern are dropped.

    Returns:
        a frame with columns ``uid, event_uid, kind, agency, state, day,
        month, year`` where ``day``/``month``/``year`` are integer columns;
        each state's original row index is kept as-is
    """
    keep = ["uid", "event_uid", "event_type", "event_date", "agency", "state"]

    florida = pd.read_csv("./../../../data/FL/match/events_florida.csv")[keep]

    georgia = pd.read_csv("./../../../data/GA/match/events_georgia.csv")
    georgia = georgia.assign(
        event_date=georgia.event_date.astype(str)
        # placeholder date -> empty string, filtered out further down
        .str.replace(r"0000-00-00", "", regex=True)
        # YYYY-MM-DD -> MM/DD/YYYY
        .str.replace(r"(\w{4})-(\w{2})-(\w{2})", r"\2/\3/\1", regex=True)
    )[keep]

    events = pd.concat([florida, georgia]).rename(columns={"event_type": "kind"})
    events = events[events.event_date.fillna("") != ""]

    # group 0 = month, group 1 = day, group 2 = year; all NaN when no match
    parts = events.event_date.str.extract(r"(\w+)\/(\w+)\/(\w+)")
    matched = parts[1].notna()
    events = events[matched]
    parts = parts[matched].astype(int)

    # assign() builds new integer columns on a fresh frame. Writing through
    # ``.loc[:, col]`` on the filtered frame raises SettingWithCopyWarning and,
    # on pandas >= 2.0, keeps the old object dtype instead of casting to int.
    return events.assign(day=parts[1], month=parts[0], year=parts[2]).drop(
        columns=["event_date"]
    )

In [21]:
# Load the combined FL + GA events (dates already split into day/month/year)
# and display the frame as a quick visual sanity check.
events_df = read_events()
events_df

## TODO: check whether `left_date` events appear in the `kind` column

Unnamed: 0,uid,event_uid,kind,agency,state,day,month,year
0,b4cbffe2a853f9474ec550c2ccbaff90,fa75a27a3d71b1ee5d71eece2a07ea77,hire_date,Oak Hill Police Department,FL,18,1,2000
1,b4cbffe2a853f9474ec550c2ccbaff90,d7fb7ea1856440187493093b1b8e1097,hire_date,Oak Hill Police Department,FL,5,9,2001
2,dd83af4c6302340ce7ccfc9ac46a0d9f,f1e316950e7483dd2060ba6defb46d77,hire_date,Orange County Sheriff's Office,FL,15,10,1992
3,dd83af4c6302340ce7ccfc9ac46a0d9f,473c370586c14e716e73ea47024be8c6,hire_date,Orange County Sheriff's Office,FL,18,2,1992
4,417a46bf049b6a59cc7ed0809336f7bb,732a44c858d555f907cdedb23fb2c002,hire_date,Pahokee Police Department,FL,4,9,1998
...,...,...,...,...,...,...,...,...
123624,2384666398b703dda2d32bbb5dd933e2,03a2ee6b7377ad0af434faba8fff7f14,hire_date,G1313 SCREVEN COUNTY SHERIFFS OFFICE,GA,1,7,2022
123625,2d5b49921b5f8e2f0ca5b6558823c1ea,cb97b65c88a984163ba1f833c4595b04,hire_date,G1587 NEWINGTON POLICE DEPARTMENT,GA,19,1,2020
123626,2d5b49921b5f8e2f0ca5b6558823c1ea,e1bf8f9164879051288605c6a01ada6d,hire_date,G1587 NEWINGTON POLICE DEPARTMENT,GA,1,1,2021
123628,7bf00889d124f7d472aed71e14bdff11,5677c4083b7f5f8a77c9517a906a49be,hire_date,G1505 ATLANTA POLICE DEPARTMENT,GA,8,1,2020


In [22]:
# Load the stacked GA + FL personnel roster and display it for inspection.
# The same uid can appear on several rows; de-duplication happens later in
# cross_match_officers_between_agencies.
per_df = read_personnel()
per_df

Unnamed: 0,first_name,last_name,middle_name,race,sex,birth_year,state,uid
0,damon,aanerud,h,,,1972.0,GA,0e1d95f0673a4b7e993594913922775c
1,damon,aanerud,h,,,1972.0,GA,3a8f5b0685aa676c4dc62586482461e5
2,damon,aanerud,h,,,1972.0,GA,ccf851ef98e6b8834435773e21bbf12e
3,damon,aanerud,h,,,1972.0,GA,ccf851ef98e6b8834435773e21bbf12e
4,edna,aaron,m,,,1957.0,GA,e6022bac58c1cbbb6c9b146f79c70818
...,...,...,...,...,...,...,...,...
629841,zy rieana,wilson,demetra kantrell,black,female,2003.0,FL,d99377169fc413574b6e6c8339466b13
629842,jason,arellano,,hispanic,male,1985.0,FL,1a2d0f6d2f64093b6414ec2b9cfde210
629843,prudenciano,ibarra,,hispanic,male,1977.0,FL,047f0b790beb67838b8e991781c37bc4
629844,kiandre,hope,m,black,male,1996.0,FL,eea413fb8ecbf330765c311a42803932


In [23]:

def combine_date_columns(
    df: pd.DataFrame, year_col: str, month_col: str, day_col: str
) -> pd.Series:
    """Build one datetime series out of separate year/month/day columns.

    Args:
        df (pd.DataFrame):
            frame holding the three date-part columns
        year_col (str):
            name of the year column
        month_col (str):
            name of the month column
        day_col (str):
            name of the day column

    Returns:
        a datetime series aligned with ``df``; combinations that do not form
        a valid calendar date become ``NaT`` instead of raising
    """
    # pd.to_datetime assembles dates from a frame only when the columns carry
    # these exact names, hence the relabelling of the selected copy.
    parts = df[[year_col, month_col, day_col]].set_axis(
        ["year", "month", "day"], axis=1
    )
    return pd.to_datetime(parts, errors="coerce")


def combine_datetime_columns(
    df: pd.DataFrame, year_col: str, month_col: str, day_col: str, time_col: str
) -> pd.Series:
    """Combines date columns and a time-of-day column into a single column

    The time column is split on ``:``. Both ``HH:MM`` and ``HH:MM:SS`` values
    are accepted; when the column mixes the two, rows without a seconds part
    are treated as ``:00``.

    Args:
        df (pd.DataFrame):
            the frame to process
        year_col (str):
            year column
        month_col (str):
            month column
        day_col (str):
            day column
        time_col (str):
            time column, string values shaped like ``HH:MM`` or ``HH:MM:SS``

    Returns:
        the combined datetime series

    Raises:
        ValueError: if splitting the time column yields fewer than 2 or more
            than 3 parts, or if the parts do not form valid datetimes
    """
    clock = df[time_col].str.split(":", expand=True)
    n_parts = clock.shape[1]
    if n_parts not in (2, 3):
        raise ValueError(
            "column %r must hold HH:MM or HH:MM:SS values, got %d ':'-separated parts"
            % (time_col, n_parts)
        )
    clock.columns = ["hour", "minute", "second"][:n_parts]
    if n_parts == 3:
        # rows that only had HH:MM get no third part from the split
        clock["second"] = clock["second"].fillna("0")

    dates = pd.concat([df[[year_col, month_col, day_col]], clock], axis=1)
    dates.columns = ["year", "month", "day"] + list(clock.columns)
    return pd.to_datetime(dates)


def discard_rows(
    events: pd.DataFrame, bool_index: pd.Series, desc: str, reset_index: bool = False
) -> pd.DataFrame:
    """Keep only the rows selected by a boolean mask and report what was dropped.

    Args:
        events (pd.DataFrame):
            the frame to filter (not modified)
        bool_index (pd.Series):
            boolean mask; rows where it is True are kept
        desc (str):
            description of the dropped rows, used in the printed message
        reset_index (bool):
            when True the result gets a fresh 0..n-1 index

    Returns:
        the filtered frame; a "discarded ..." line is printed only when at
        least one row was removed
    """
    n_before = len(events)
    kept = events[bool_index]
    if reset_index:
        kept = kept.reset_index(drop=True)

    n_dropped = n_before - len(kept)
    if n_dropped > 0:
        message = "discarded %d %s (%.1f%%)" % (
            n_dropped,
            desc,
            n_dropped / n_before * 100,
        )
        print(message)
    return kept


def assign_min_col(events: pd.DataFrame, per: pd.DataFrame, col: str):
    """Add a ``min_<col>`` column to ``per`` with each uid's smallest event value.

    Args:
        events (pd.DataFrame):
            events frame whose index has a level named ``uid``
        per (pd.DataFrame):
            personnel frame indexed by uid; modified in place
        col (str):
            events column to aggregate

    Returns:
        None. ``per`` gains the column ``"min_" + col``; uids that have no
        event get NaN.
    """
    # Group by 'uid' and calculate the minimum
    min_by_uid = events.groupby(level="uid")[col].min().to_dict()
    # Map minimum values to the 'per' DataFrame.
    # np.nan (lower case) is required: the np.NaN alias was removed in
    # NumPy 2.0 and the default is evaluated on every lookup.
    per.loc[:, "min_" + col] = per.index.map(lambda uid: min_by_uid.get(uid, np.nan))


def assign_max_col(events: pd.DataFrame, per: pd.DataFrame, col: str):
    """Add a ``max_<col>`` column to ``per`` with each uid's largest event value.

    Args:
        events (pd.DataFrame):
            events frame whose index has a level named ``uid``
        per (pd.DataFrame):
            personnel frame indexed by uid; modified in place
        col (str):
            events column to aggregate

    Returns:
        None. ``per`` gains the column ``"max_" + col``; uids that have no
        event get NaN.
    """
    # Group by 'uid' and calculate the maximum
    max_by_uid = events.groupby(level="uid")[col].max().to_dict()
    # Map maximum values to the 'per' DataFrame.
    # np.nan (lower case) is required: the np.NaN alias was removed in
    # NumPy 2.0 and the default is evaluated on every lookup.
    per.loc[:, "max_" + col] = per.index.map(lambda uid: max_by_uid.get(uid, np.nan))


def cross_match_officers_between_agencies(personnel: pd.DataFrame, events: pd.DataFrame):
    """Match officers across states by name and birth year and save the clusters.

    Steps, in order:
      1. Drop events with no ``uid``, no ``day``, a ``day`` above 31, or
         year/month/day values that do not form a valid date; add ``date``
         and ``timestamp`` columns.
      2. Reduce ``personnel`` to one row per ``uid`` with both names present,
         add 3-character name prefixes (``fc``/``lc``) and an ``agency``
         looked up from the events; officers without any event are dropped.
      3. Attach each officer's min/max event ``date`` and ``timestamp``.
      4. Score candidate pairs with a datamatch ``ThresholdMatcher``, write
         the clusters to an Excel file and print how many were saved.

    Args:
        personnel (pd.DataFrame):
            needs the columns uid, first_name, last_name, race, sex,
            birth_year, state
        events (pd.DataFrame):
            needs the columns uid, event_uid, agency, day, month, year

    Returns:
        a 2-tuple: the clusters returned by
        ``matcher.get_index_clusters_within_thresholds`` and the uid-indexed
        officer frame restricted to max_timestamp, agency, race, sex,
        birth_year, state

    Side effects:
        writes ``../../../data/FL/match/cross_agency_officers.xlsx`` and
        prints progress / discard counts.
    """
    events = discard_rows(
        events, events.uid.notna(), "events with empty uid column", reset_index=True
    )
    events = discard_rows(
        events, events.day.notna(), "events with empty day column", reset_index=True
    )
    events = discard_rows(
        events, events.day <= 31, "events with impossible day column", reset_index=True
    )
    # NOTE(review): on pandas >= 2.0 a full-slice ``.loc[:, col] = ...`` may keep
    # the column's existing dtype rather than casting — confirm the dtypes here.
    for col in ["year", "month", "day"]:
        events.loc[:, col] = events[col].astype(int)
    # invalid year/month/day combinations come back as NaT and are dropped next
    events.loc[:, "date"] = combine_date_columns(events, "year", "month", "day")
    events = discard_rows(
        events, events.date.notna(), "events with empty date", reset_index=True
    )

    # Re-parse 'date' as datetime (combine_date_columns above already
    # returns the result of pd.to_datetime)
    events['date'] = pd.to_datetime(events['date'], errors='coerce')

    # Create 'timestamp' from 'date': Timestamp.timestamp() gives POSIX seconds
    # as a float, then the column is forced to a numeric dtype
    events.loc[:, "timestamp"] = events["date"].map(lambda x: x.timestamp())
    events['timestamp'] = pd.to_numeric(events['timestamp'], errors='coerce')


    per = personnel[["uid", "first_name", "last_name", "race", "sex", "birth_year", "state"]]

    # birth_year is compared as text by the string similarity below.
    # NOTE(review): a missing birth year presumably becomes the string "nan",
    # so two officers with unknown birth years would compare as identical — confirm.
    per.loc[:, "birth_year"] = per.birth_year.astype(str)
    
    # keep the first row seen for each uid
    per = per.drop_duplicates(subset=["uid"])

    per = discard_rows(
        per,
        per.first_name.notna() & per.last_name.notna(),
        "officers without either first name or last name",
        reset_index=True,
    )
    # first three characters of each name, used as the blocking key below
    per.loc[:, "fc"] = per.first_name.map(lambda x: x[:3])
    per.loc[:, "lc"] = per.last_name.map(lambda x: x[:3])
    # uid -> agency lookup; when a uid has events at several agencies only one
    # of them (the last one in the de-duplicated list) survives in the dict
    agency_dict = (
        events.loc[:, ["uid", "agency"]]
        .drop_duplicates()
        .set_index("uid")
        .agency.to_dict()
    )
    per.loc[:, "agency"] = per.uid.map(lambda x: agency_dict.get(x, ""))
    per = discard_rows(
        per, per.agency != "", "officers not linked to any event", reset_index=True
    )


    per = per.set_index("uid")

    # aggregating min/max date
    # (the assign_* helpers group by the index level named 'uid')
    events = events.set_index(["uid", "event_uid"])
    assign_min_col(events, per, "date")
    assign_max_col(events, per, "date")
    assign_min_col(events, per, "timestamp")
    assign_max_col(events, per, "timestamp")
    per = discard_rows(per, per.min_date.notna(), "officers with no event")

    # concatenate first name and last name to get a series of full names
    full_names = per.first_name.str.cat(per.last_name, sep=" ")
    # The "common names only" filter below is commented out, so full_names
    # holds the name of every remaining officer, not just the common ones.
    # common_names_sr = pd.Series([x for x in full_names])
    
    
    # TODO: check race and sex columns compatibility between the two states

    excel_path = "../../../data/FL/match/cross_agency_officers.xlsx"
    matcher = ThresholdMatcher(
        # NOTE(review): presumably only pairs sharing both the fc and lc
        # prefixes are scored — confirm against the datamatch ColumnsIndex docs
        index=MultiIndex(
            [
                ColumnsIndex(["fc", "lc"]),
            ]
        ),
        # MaxScorer wraps a single scorer here
        scorer=MaxScorer(
            [
                AlterScorer(
                    # calculate similarity score (0-1) based on name and
                    # birth-year string similarity
                    scorer=SimSumScorer(
                        {
                            "first_name": JaroWinklerSimilarity(.9),
                            "last_name": JaroWinklerSimilarity(.9),
                            "birth_year": JaroWinklerSimilarity(),
                        }
                    ),
                    # values the alteration applies to: every officer's full
                    # name (see the note where full_names is built).
                    # NOTE(review): confirm AlterScorer's exact matching rule.
                    values=full_names,
                    # subtract a penalty of 0.1 from the score
                    alter=lambda score: score - 0.1,
                ),
            ]
        ),
        dfa=per,
        filters=[
            # filter on the "state" column — NOTE(review): presumably drops
            # pairs from the same state (not the same agency); confirm
            DissimilarFilter("state"),
            # don't match officers who appear in overlapping time ranges
            NonOverlappingFilter("min_timestamp", "max_timestamp"),
        ],
        show_progress=True,
    )
    # score threshold used both for the Excel export and for the returned clusters
    decision = .5
    with Spinner("saving matched clusters to Excel file"):
        matcher.save_clusters_to_excel(excel_path, decision, lower_bound=decision)
    clusters = matcher.get_index_clusters_within_thresholds(decision)
    print("saved %d clusters to %s" % (len(clusters), excel_path))

    return clusters, per[["max_timestamp", "agency", "race", "sex", "birth_year", "state"]]


In [24]:
# Run the cross-state match. `matches` is a 2-tuple: (clusters, per-officer
# attribute frame). The call also writes the Excel report as a side effect.
matches = cross_match_officers_between_agencies(per_df, events_df)

discarded 27 events with empty date (0.0%)
discarded 46 officers without either first name or last name (0.0%)
discarded 15806 officers not linked to any event (2.7%)


scoring pairs: 769768it [00:52, 14566.16it/s]


saved 39082 clusters to ../../../data/FL/match/cross_agency_officers.xlsxfile saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving matched clusters to Excel file saving mat