In [675]:
# import sys
# sys.path.append("../")
from lib.clean import standardize_desc_cols, clean_column_names

In [676]:
import pandas as pd
from datamatch import (
    JaroWinklerSimilarity,
    ThresholdMatcher,
    ColumnsIndex,
)

In [677]:
def read_data():
    df = pd.read_csv("../data/OK/aggregate_members_oathkeepers_cleaned.csv")
    return df 

In [678]:
dfa = read_data()

In [679]:
dfa = dfa.fillna("")

In [680]:
dfa.loc[:, "state"] = dfa.state.str.lower().str.strip().str.replace(r"ma\.", "ma", regex=True)

In [681]:
dfa.columns

Index(['id_number', 'group_name', 'bool', 'city', 'state', 'email',
       'undefined_date', 'notes', 'first_name', 'middle_name', 'last_name'],
      dtype='object')

In [682]:
dfa = dfa[dfa.state.isin(["ma"])]
dfa.loc[:, "notes"] = dfa.notes.str.lower().str.strip()
dfa.loc[:, "first_name"] = dfa.first_name.str.replace(r"^ (\w+)", r"\1", regex=True).str.replace(r"(\w+) $", r"\1", regex=True)
dfa.loc[:, "last_name"] = dfa.last_name.str.replace(r"^ (\w+)", r"\1", regex=True).str.replace(r"(\w+) $", r"\1", regex=True)
dfa.loc[:, "city"] = dfa.city.str.lower().str.strip().str.replace(r"\,.+", "", regex=True)

In [683]:
dfa.state.unique()

array(['ma'], dtype=object)

In [684]:
search_1 = dfa[dfa.notes.str.contains("law enforcement")]
search_1

Unnamed: 0,id_number,group_name,bool,city,state,email,undefined_date,notes,first_name,middle_name,last_name


In [685]:
search_2 = dfa[dfa.notes.str.contains("police")]
search_2

Unnamed: 0,id_number,group_name,bool,city,state,email,undefined_date,notes,first_name,middle_name,last_name
4122,1043,Annual,No,hopkinton,ma,cjdcarving@gmail.com,,"civilian - brothers are ret. mil, policeman, m...",cecilia,del,gaudio
8197,6075,Annual,No,westport,ma,David.Bellavance@us.ngrid.com,,cpr/fa/aed instuctor if people want to be cert...,david,,bellavance
13353,11462,Annual,No,braintree,ma,beungood8@aol.com,1/6/2012,i have been a fulltime sworn police officer f...,john,,ouellette


In [686]:
search_3 = dfa[dfa.notes.str.contains("patrol")]
search_3.notes.unique()

array(['i am a sergeant supervising a patrol shift in falmouth, massachusetts.  i am a member of the sandwich american legion post 188 and a member of the leathernecks motorcycle club.'],
      dtype=object)

In [687]:
search_4 = dfa[dfa.notes.str.contains("officer")]
search_4.notes.unique()

array(["also worked as k9 officer, also worked with nd attorney general's drug enforcement unit",
       "cpr/fa/aed instuctor if people want to be certified i can provide that. first responder trained for medical if needed. the skills i've learned as a soldier and part time police officer.",
       'i have been  a fulltime sworn police officer for 16 years.  i am into cycling,hunting fishing,martial arts ,kayaking and cross country skiing'],
      dtype=object)

In [688]:
search_5 = dfa[dfa.notes.str.contains("swat")]
search_5

Unnamed: 0,id_number,group_name,bool,city,state,email,undefined_date,notes,first_name,middle_name,last_name


In [689]:
search_6 = dfa[dfa.notes.str.contains("explosive")]
search_6

Unnamed: 0,id_number,group_name,bool,city,state,email,undefined_date,notes,first_name,middle_name,last_name


In [690]:
search_7 = dfa[dfa.notes.str.contains("bomb")]
search_7

Unnamed: 0,id_number,group_name,bool,city,state,email,undefined_date,notes,first_name,middle_name,last_name


In [691]:
search_8 = dfa[dfa.notes.str.contains("active duty")]
search_8.notes.unique()

array(['spreading the word, and god forbid forming a neighborhood / town militia to keep the oath. i am the son of a us army msgt who died active duty (cid)1971, british boys brigade at 8yo,shot my first m60 at age of 9 at green berit day new cumberland army depot,attended west point youth camp 4 years.i know what kind of people can be counted on and trusted.and i will never submit to any enemy foriegn or domestic.'],
      dtype=object)

In [692]:
dfs = [search_1, search_2, search_3, search_4, search_5, search_6, search_7, search_8]
dfb = pd.concat(dfs, axis=0)

In [693]:
def read_misconduct_data():
    df1 = pd.read_csv("../data/MA/misconduct/nacdl/billerica/ia/uof_billerica_2015_2020.csv")
    df2 = pd.read_csv("../data/MA/misconduct/nacdl/Brookline/IIU/cprr_brookline_2010_2021.csv")
    return df1, df2

In [694]:
df1, df2 = read_misconduct_data()

In [695]:
def split_names(df):
    df = df.pipe(clean_column_names)
    df = df[~((df.name.fillna("") == ""))]
    names = df.name.str.lower().str.strip()\
        .str.extract(r"(.+)\, (\w+) ?(\w+)?")
    
    df.loc[:, "last_name"] = names[0]
    df.loc[:, "first_name"] = names[1]
    df.loc[:, "middle_name"] = names[2]
    return df

In [696]:
df1[["first_name", "middle_name"]] = ""
df1 = df1.drop_duplicates(subset=["last_name"])
df2 = df2.pipe(split_names)
df2 = df2.drop_duplicates(subset=["name"])

dfc = pd.concat([df1, df2])
dfc = dfc[["first_name", "middle_name", "last_name"]]
dfc = dfc.pipe(standardize_desc_cols, ["first_name", "middle_name", "last_name"])
dfc

Unnamed: 0,first_name,middle_name,last_name
0,,,eidens
4,,,gualtieri
5,,,parker
6,,,slaney
7,,,not given
...,...,...,...
429,christopher,k,elcock
435,brian,m,gallagher
438,gregory,,ander
445,richard,a,regan


In [697]:

def match_ok_names_to_personnel(ok, agency):
    dfa = ok[["first_name", "middle_name", "last_name", "notes", "city", "state"]]
    dfa = dfa.fillna("")
    dfa = dfa.drop_duplicates(subset=["first_name", "middle_name", "last_name"])

    dfb = agency[["first_name", "middle_name", "last_name"]]
    dfb = dfb.fillna("")
    dfb["city"] = ""
    dfb["state"] = ""
    dfb["notes"] = ""

    dfb = dfb.drop_duplicates(subset=["first_name", "middle_name", "last_name"])

    matcher = ThresholdMatcher(
        ColumnsIndex([]),
        {
            "first_name": JaroWinklerSimilarity(),
            "last_name": JaroWinklerSimilarity(),
        },
        dfa,
        dfb,
    )
    decision = 0.800
    matcher.save_pairs_to_excel(
        ("../data/MA/oathkeeper_names_v_ma.xlsx"), decision
    )
    return matcher

In [698]:
match = match_ok_names_to_personnel(dfa, dfc)

ValueError: Dataframe index contains duplicates. Both frames need to have index free of duplicates.

In [None]:
def read_matches():
    df = pd.read_excel("../data/MA/oathkeeper_names_v_ma.xlsx")
    return df 

In [None]:
matches = read_matches()

In [None]:
matches

Unnamed: 0,score_range,pair_idx,sim_score,row_key,first_name,middle_name,last_name,notes,city,state,county,lc
0,1.00-0.95,0.0,0.95263,17756,michael,f,keaney,,planville,ma,,k
1,,,,343,michael,joseph,keaveney,,,,,k
2,0.95-0.90,0.0,0.901347,14872,robert,,smalser,,sturbridge,ma,,s
3,,,,346,robert,e,mayer,,,,,m
4,,1.0,0.901347,4167,patrick,michael,fahey,,ipswich,ma,,f
5,,,,139,patrick,l,mahoney,,,,,m
6,0.90-0.85,0.0,0.860887,4433,robert,l,taylor,98z50m,ayer,ma,,t
7,,,,132,robert,paul,lawlor,,,,,l
8,,1.0,0.860104,17128,russell,richard,lassonde,,springfield,ma,,l
9,,,,405,russell,t,lloyd,,,,,l
