In [105]:
import numpy as np
import pandas as pd
from datamatch import (
    ThresholdMatcher,
    DissimilarFilter,
    NonOverlappingFilter,
    ColumnsIndex,
    JaroWinklerSimilarity,
    MaxScorer,
    SimSumScorer,
    AlterScorer,
    MultiIndex,
)
from datavalid.spinner import Spinner
from jellyfish import jaro_winkler_similarity


In [106]:
def read_personnel():
    dfa = pd.read_csv("../../clean/GA/per_ga_new.csv")
    dfb = pd.read_csv("./../../../data/FL/match/personnel_florida_dropped_na_left_dates.csv")

    dfa = dfa[["first_name", "last_name", "middle_name", "race", "sex", "birth_year", "state", "uid"]]
    print(dfa.head())

    dfb = dfb[["first_name", "last_name", "middle_name", "race", "sex", "birth_year", "state", "uid"]]
    print(dfb.head())

    df = pd.concat([dfa, dfb])
    return df


def read_events():
    dfa = pd.read_csv("./../../../data/FL/match/events_florida_dropped_na_left_dates.csv")

    dfa = dfa[["uid", "event_uid", "event_type", "event_date", "agency", "state"]]
    dfb = pd.read_csv("../../clean/GA/events_ga_new.csv")

    dfb = dfb[["uid", "event_uid", "event_type", "event_date", "agency", "state"]]

    df = pd.concat([dfa, dfb])

    df = df[~((df.event_date.fillna("") == ""))]

    df = df.rename(columns={"event_type": "kind"})

    dates = df.event_date.str.extract(r"(\w+)\/(\w+)\/(\w+)")

    df.loc[:, "day"] = dates[1].fillna("")
    df.loc[:, "month"] = dates[0].fillna("")
    df.loc[:, "year"] = dates[2].fillna("")

    df = df[~((df.day == ""))]

    df.loc[:, "day"] = df.day.astype(int)

    df.loc[:, "month"] = df.month.astype(int)

    df.loc[:, "year"] = df.year.astype(int)

    df = df.drop(columns=["event_date"])

    return df, dfa, dfb

In [107]:
per = read_personnel()

per

  first_name last_name middle_name   race     sex  birth_year    state  \
0     dalila    a'giza         NaN  black  Female        1976  georgia   
1      damon   aanerud           h  white    Male        1972  georgia   
2      damon   aanerud           h  white    Male        1972  georgia   
3      damon   aanerud           h  white    Male        1972  georgia   
4     angela     aaron    linettie  black  Female        1973  georgia   

       uid  
0  o143810  
1  o095227  
2  o095227  
3  o095227  
4  o206100  
  first_name   last_name middle_name   race   sex  birth_year state  \
0    michael  piwowarski           p  white  male      1969.0    FL   
1    michael  piwowarski           p  white  male      1969.0    FL   
2    michael  piwowarski           p  white  male      1969.0    FL   
3    michael  piwowarski           p  white  male      1969.0    FL   
4    michael  piwowarski           p  white  male      1969.0    FL   

                                uid  
0  b4cbffe2a

Unnamed: 0,first_name,last_name,middle_name,race,sex,birth_year,state,uid
0,dalila,a'giza,,black,Female,1976.0,georgia,o143810
1,damon,aanerud,h,white,Male,1972.0,georgia,o095227
2,damon,aanerud,h,white,Male,1972.0,georgia,o095227
3,damon,aanerud,h,white,Male,1972.0,georgia,o095227
4,angela,aaron,linettie,black,Female,1973.0,georgia,o206100
...,...,...,...,...,...,...,...,...
550248,eliezer,rojas,leonardo,hispanic,male,1984.0,FL,00c79e0a486150b6076b319d6529aafb
550249,nicholas,lesses,,white,male,2002.0,FL,8e0ed14abc2336c1dc0c5b30357bd71f
550250,austin,smith,tyler,white,male,1995.0,FL,4c874798027c548e7d0d2fa6e6e9da7f
550251,devon,williams,,black,male,2000.0,FL,13539545d26fc06603eee4975dc187b6


In [108]:
events_df, dfa, dfb = read_events()
events_df

## check if ledt_date in kind 

Unnamed: 0,uid,event_uid,kind,agency,state,day,month,year
0,b4cbffe2a853f9474ec550c2ccbaff90,fa75a27a3d71b1ee5d71eece2a07ea77,hire_date,Oak Hill Police Department,FL,18,1,2000
1,b4cbffe2a853f9474ec550c2ccbaff90,d7fb7ea1856440187493093b1b8e1097,hire_date,Oak Hill Police Department,FL,5,9,2001
2,dd83af4c6302340ce7ccfc9ac46a0d9f,f1e316950e7483dd2060ba6defb46d77,hire_date,Orange County Sheriff's Office,FL,15,10,1992
3,dd83af4c6302340ce7ccfc9ac46a0d9f,473c370586c14e716e73ea47024be8c6,hire_date,Orange County Sheriff's Office,FL,18,2,1992
4,417a46bf049b6a59cc7ed0809336f7bb,732a44c858d555f907cdedb23fb2c002,hire_date,Pahokee Police Department,FL,4,9,1998
...,...,...,...,...,...,...,...,...
844525,o110791,ed12b16e4186abe3075f652636f8640d,left_date,G1505 ATLANTA POLICE DEPARTMENT,georgia,8,1,2020
844526,o110791,3feccbaaad39346c40944e7dc9b71966,left_date,G1505 ATLANTA POLICE DEPARTMENT,georgia,13,4,2022
844527,o110791,aa76b91592d9c2c17cc67e0c9cd30f15,left_date,G1505 ATLANTA POLICE DEPARTMENT,georgia,20,9,2023
844528,o226212,a06921debcb5f5ac99b68e51154ebf11,left_date,G1072 SMYRNA POLICE DEPARTMENT,georgia,3,2,2017


In [109]:
dfa

Unnamed: 0,uid,event_uid,event_type,event_date,agency,state
0,b4cbffe2a853f9474ec550c2ccbaff90,fa75a27a3d71b1ee5d71eece2a07ea77,hire_date,01/18/2000,Oak Hill Police Department,FL
1,b4cbffe2a853f9474ec550c2ccbaff90,d7fb7ea1856440187493093b1b8e1097,hire_date,09/05/2001,Oak Hill Police Department,FL
2,dd83af4c6302340ce7ccfc9ac46a0d9f,f1e316950e7483dd2060ba6defb46d77,hire_date,10/15/1992,Orange County Sheriff's Office,FL
3,dd83af4c6302340ce7ccfc9ac46a0d9f,473c370586c14e716e73ea47024be8c6,hire_date,02/18/1992,Orange County Sheriff's Office,FL
4,417a46bf049b6a59cc7ed0809336f7bb,732a44c858d555f907cdedb23fb2c002,hire_date,09/04/1998,Pahokee Police Department,FL
...,...,...,...,...,...,...
1062949,fa9193c2b02a434a7b41da5921b98f4c,a10198c28e3c0ef8c1abae114cd572c5,left_date,10/14/2022,Fort Lauderdale Police Department,FL
1062950,00c79e0a486150b6076b319d6529aafb,572b228479f0f5cc45f96492b07c664b,left_date,04/03/2023,New Port Richey Police Department,FL
1062951,8e0ed14abc2336c1dc0c5b30357bd71f,2cb05b597b93e3d5e9035eeee9ee449e,left_date,03/24/2023,Lady Lake Police Department,FL
1062952,4c874798027c548e7d0d2fa6e6e9da7f,70e74859544c5ca96bd600377b2be8ad,left_date,03/23/2023,Blackwater River Correctional Facility,FL


In [110]:
dfb

Unnamed: 0,uid,event_uid,event_type,event_date,agency,state
0,o143810,6730850702701c45e8b5bfcc7bc69910,hire_date,09/10/2007,G1720 DEKALB COUNTY POLICE DEPARTMENT,georgia
1,o095227,254c62c3cf4de55a7e0397a8c46fc7bd,hire_date,04/19/1999,G1161 CHATHAM COUNTY SHERIFFS OFFICE,georgia
2,o095227,5ce6379fa43f4839627d5dc714169835,hire_date,10/09/2000,G1244 SAVANNAH POLICE DEPARTMENT,georgia
3,o095227,31aaa48b54195a1b49a0bf4a41c4e665,hire_date,03/10/2001,G1177 POOLER POLICE DEPARTMENT,georgia
4,o206100,096ff43918a5195fba37a3ac2e4abcff,hire_date,11/05/2012,G1424 MARTA POLICE DEPARTMENT,georgia
...,...,...,...,...,...,...
844525,o110791,ed12b16e4186abe3075f652636f8640d,left_date,01/08/2020,G1505 ATLANTA POLICE DEPARTMENT,georgia
844526,o110791,3feccbaaad39346c40944e7dc9b71966,left_date,04/13/2022,G1505 ATLANTA POLICE DEPARTMENT,georgia
844527,o110791,aa76b91592d9c2c17cc67e0c9cd30f15,left_date,09/20/2023,G1505 ATLANTA POLICE DEPARTMENT,georgia
844528,o226212,a06921debcb5f5ac99b68e51154ebf11,left_date,02/03/2017,G1072 SMYRNA POLICE DEPARTMENT,georgia


In [111]:
per_df = read_personnel()
per_df

  first_name last_name middle_name   race     sex  birth_year    state  \
0     dalila    a'giza         NaN  black  Female        1976  georgia   
1      damon   aanerud           h  white    Male        1972  georgia   
2      damon   aanerud           h  white    Male        1972  georgia   
3      damon   aanerud           h  white    Male        1972  georgia   
4     angela     aaron    linettie  black  Female        1973  georgia   

       uid  
0  o143810  
1  o095227  
2  o095227  
3  o095227  
4  o206100  
  first_name   last_name middle_name   race   sex  birth_year state  \
0    michael  piwowarski           p  white  male      1969.0    FL   
1    michael  piwowarski           p  white  male      1969.0    FL   
2    michael  piwowarski           p  white  male      1969.0    FL   
3    michael  piwowarski           p  white  male      1969.0    FL   
4    michael  piwowarski           p  white  male      1969.0    FL   

                                uid  
0  b4cbffe2a

Unnamed: 0,first_name,last_name,middle_name,race,sex,birth_year,state,uid
0,dalila,a'giza,,black,Female,1976.0,georgia,o143810
1,damon,aanerud,h,white,Male,1972.0,georgia,o095227
2,damon,aanerud,h,white,Male,1972.0,georgia,o095227
3,damon,aanerud,h,white,Male,1972.0,georgia,o095227
4,angela,aaron,linettie,black,Female,1973.0,georgia,o206100
...,...,...,...,...,...,...,...,...
550248,eliezer,rojas,leonardo,hispanic,male,1984.0,FL,00c79e0a486150b6076b319d6529aafb
550249,nicholas,lesses,,white,male,2002.0,FL,8e0ed14abc2336c1dc0c5b30357bd71f
550250,austin,smith,tyler,white,male,1995.0,FL,4c874798027c548e7d0d2fa6e6e9da7f
550251,devon,williams,,black,male,2000.0,FL,13539545d26fc06603eee4975dc187b6


In [112]:
df_for_filtering = pd.read_csv("./../../../data/GA/5-10-2024/reciprocity.csv")


names = df_for_filtering["NAME"].str.extract(r"(.+)\, (\w+) (.+)")

df_for_filtering.loc[:, "last_name"] = names[0].str.lower().str.strip()
df_for_filtering.loc[:, "first_name"] = names[1].str.lower().str.strip()
df_for_filtering.loc[:, "middle_name"] = names[2].str.lower().str.strip()


df_for_filtering = df_for_filtering[["last_name", "first_name", "middle_name"]]


def combine_date_columns(
    df: pd.DataFrame, year_col: str, month_col: str, day_col: str
) -> pd.Series:
    """Combines date columns into a single column"""
    dates = df[[year_col, month_col, day_col]]
    dates.columns = ["year", "month", "day"]
    return pd.to_datetime(dates, errors="coerce")


def combine_datetime_columns(
    df: pd.DataFrame, year_col: str, month_col: str, day_col: str, time_col: str
) -> pd.Series:
    """Combines datetime columns into a single column"""
    time_frame = df[time_col].str.split(":", expand=True)
    dates = pd.concat([df[[year_col, month_col, day_col]], time_frame], axis=1)
    dates.columns = ["year", "month", "day", "hour", "minute"]
    return pd.to_datetime(dates)


def discard_rows(
    events: pd.DataFrame, bool_index: pd.Series, desc: str, reset_index: bool = False
) -> pd.DataFrame:
    before = events.shape[0]
    events = events[bool_index]
    if reset_index:
        events = events.reset_index(drop=True)
    after = events.shape[0]
    if before > after:
        print(
            "discarded %d %s (%.1f%%)"
            % (before - after, desc, (before - after) / before * 100)
        )
    return events


def assign_min_col(events: pd.DataFrame, per: pd.DataFrame, col: str):
    min_series = events.groupby(level='uid')[col].min()
    min_dict = min_series.to_dict()
    per.loc[:, "min_" + col] = per.index.map(lambda x: min_dict.get(x, np.NaN))


def assign_max_col(events: pd.DataFrame, per: pd.DataFrame, col: str):
    max_series = events.groupby(level='uid')[col].max()
    max_dict = max_series.to_dict()
    per.loc[:, "max_" + col] = per.index.map(lambda x: max_dict.get(x, np.NaN))


def filter_similar_names(per_df: pd.DataFrame, df_for_filtering: pd.DataFrame) -> pd.DataFrame:
    def is_similar(name1, name2):
        return jaro_winkler_similarity(name1.lower(), name2.lower()) > 0.9

    similar_first_names = per_df['first_name'].apply(
        lambda x: any(is_similar(x, name) for name in df_for_filtering['first_name'].dropna().unique())
    )
    similar_last_names = per_df['last_name'].apply(
        lambda x: any(is_similar(x, name) for name in df_for_filtering['last_name'].dropna().unique())
    )

    return per_df[similar_first_names & similar_last_names]


def cross_match_officers_between_agencies(personnel, events, df_for_filtering):
    events = discard_rows(
        events, events.uid.notna(), "events with empty uid column", reset_index=True
    )
    events = discard_rows(
        events, events.day.notna(), "events with empty day column", reset_index=True
    )
    events = discard_rows(
        events, events.day <= 31, "events with impossible day column", reset_index=True
    )
    for col in ["year", "month", "day"]:
        events.loc[:, col] = events[col].astype(int)
    events.loc[:, "date"] = combine_date_columns(events, "year", "month", "day")
    events = discard_rows(
        events, events.date.notna(), "events with empty date", reset_index=True
    )

    events['date'] = pd.to_datetime(events['date'], errors='coerce')
    events.loc[:, "timestamp"] = events["date"].map(lambda x: x.timestamp())
    events['timestamp'] = pd.to_numeric(events['timestamp'], errors='coerce')

    per = personnel[["uid", "first_name", "last_name", "race", "sex", "birth_year", "state"]]
    per.loc[:, "birth_year"] = per.birth_year.astype(str)
    per = per.drop_duplicates(subset=["uid"])

    per = discard_rows(
        per,
        per.first_name.notna() & per.last_name.notna(),
        "officers without either first name or last name",
        reset_index=True,
    )
    per.loc[:, "fc"] = per.first_name.map(lambda x: x[:1])
    per.loc[:, "lc"] = per.last_name.map(lambda x: x[:1])
    agency_dict = (
        events.loc[:, ["uid", "agency"]]
        .drop_duplicates()
        .set_index("uid")
        .agency.to_dict()
    )
    per.loc[:, "agency"] = per.uid.map(lambda x: agency_dict.get(x, ""))
    per = discard_rows(
        per, per.agency != "", "officers not linked to any event", reset_index=True
    )

    # Filter per for names similar to those in filtered_df
    per = filter_similar_names(per, df_for_filtering)

    per = per.set_index("uid")

    events = events.set_index(["uid", "event_uid"])
    assign_min_col(events, per, "date")
    assign_max_col(events, per, "date")
    assign_min_col(events, per, "timestamp")
    assign_max_col(events, per, "timestamp")
    per = discard_rows(per, per.min_date.notna(), "officers with no event")

    full_names = per.first_name.str.cat(per.last_name, sep=" ")

    excel_path = "../../../data/FL/match/cross_agency_officers_3-8-2024.xlsx"
    matcher = ThresholdMatcher(
        index=MultiIndex(
            [
                ColumnsIndex(["fc", "lc"]),
            ]
        ),
        scorer=MaxScorer(
            [
                AlterScorer(
                    scorer=SimSumScorer(
                        {
                            "first_name": JaroWinklerSimilarity(),
                            "last_name": JaroWinklerSimilarity(),
                            "birth_year": JaroWinklerSimilarity(),
                            "race": JaroWinklerSimilarity(),
                            "sex": JaroWinklerSimilarity(),
                        }
                    ),
                    values=full_names,
                    alter=lambda score: score - 0.2,
                ),
            ]
        ),
        dfa=per,
        filters=[
            DissimilarFilter("state"),
            NonOverlappingFilter("min_timestamp", "max_timestamp"),
        ],
        show_progress=True,
    )
    decision = .5
    with Spinner("saving matched clusters to Excel file"):
        matcher.save_clusters_to_excel(excel_path, decision, lower_bound=decision)
    clusters = matcher.get_index_clusters_within_thresholds(decision)
    print("saved %d clusters to %s" % (len(clusters), excel_path))

    return clusters, per[["max_timestamp", "agency", "race", "sex", "birth_year", "state"]]

matches = cross_match_officers_between_agencies(per_df, events_df, df_for_filtering)

discarded 1338 events with empty date (0.1%)


  per.loc[:, "birth_year"] = per.birth_year.astype(str)


discarded 63 officers without either first name or last name (0.0%)


scoring pairs: 4173373it [02:03, 33745.05it/s]


KeyboardInterrupt: 