In [27]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from helper import clean_column_names
import networkx as nx

In [28]:
def clean_brady_table(df):
    df.columns = [col.lstrip('\'') for col in df.columns]

    df = df.applymap(lambda x: x.lstrip('\'') if isinstance(x, str) else x)
    
    df = df.applymap(lambda x: x.lower().strip() if isinstance(x, str) else x)


    df.loc[:, "last_name"] = (df.last_name
                              .str.replace(r"\'RULE(.+)", "", regex=True)
                              .str.replace(r"^\'$", "", regex=True)
    )

    df = df[~((df.last_name.fillna("") == ""))]

    df.loc[:, "tracking_id"] = (df.tracking_id
                                .str
                                .lower()
                                .str.strip()
                                .str.replace(r"\s+", "", regex=True)
                                .str.replace(r"\'", "", regex=True)
    )

    df.loc[:, "allegation"] = (df.allegation_rule
                               .str.cat(df.allegation_paragraph, sep=" ")
    )

    df.loc[:, "allegation"] = (df.allegation
                               .str.lower()
                               .str.strip()
                               .str.replace(r"\'", "", regex=True)
    )

    df = df.drop(columns=["allegation_rule", "allegation_paragraph"])

    df.loc[:, "allegation_desc"] = (df.allegation_desc.str
                                    .lower()
                                    .str.strip()
                                    .str.replace(r"\'$", "", regex=True)
                                    .str.replace(r"^$", "missing", regex=True)
    )



    return df 

def read_brady_tbl():
    df = pd.read_csv("../../brady/data/input/brady_2016_2023.csv")
    return df 

def filter_brady_tbl(df):
    df.loc[:, "tracking_id"] = df.tracking_id.str.replace(r"\-(\w+)$", "", regex=True)

    df = df[df.allegation_desc.str.contains("domestic")]
    return df 

def read_gist_tbl():
    df = pd.read_csv("../data/output/labeled_gists.csv")
    return df 

filtered_brady_tbl = read_brady_tbl()
filtered_brady_tbl = filtered_brady_tbl.pipe(clean_brady_table).pipe(clean_column_names).pipe(filter_brady_tbl)

gist_tbl = read_gist_tbl()

gist_tbl = gist_tbl[~(gist_tbl.keep.astype(str) == "0.0")]

gist_tbl


unfiltered_brady_tbl = read_brady_tbl()
unfiltered_brady_tbl = unfiltered_brady_tbl.pipe(clean_brady_table).pipe(clean_column_names)

# ind unique tracking_ids in dfa
unique_ids_in_dfa = filtered_brady_tbl['tracking_id'].unique()

# Filter dfb to find tracking_ids not in dfa
gist_unique = gist_tbl[~gist_tbl['tracking_id'].isin(unique_ids_in_dfa)]

gist_unique_list = [n for n in gist_unique["tracking_id"]]

unfiltered_brady_tbl.loc[:, "tracking_id"] = unfiltered_brady_tbl.tracking_id.str.replace(r"\-(\w{1})$", "", regex=True)

dfc = unfiltered_brady_tbl[unfiltered_brady_tbl.tracking_id.isin(gist_unique_list)]

final_df = pd.concat([filtered_brady_tbl, dfc])

final_df.loc[:, "tracking_id"] = final_df.tracking_id.str.replace(r"\-(\w{1})$", "", regex=True)

final_df


final_df.to_csv("../data/output/gist_w_brady_post_labels.csv", index=False)

  df = df.applymap(lambda x: x.lstrip('\'') if isinstance(x, str) else x)
  df = df.applymap(lambda x: x.lower().strip() if isinstance(x, str) else x)
  df = df.applymap(lambda x: x.lstrip('\'') if isinstance(x, str) else x)
  df = df.applymap(lambda x: x.lower().strip() if isinstance(x, str) else x)


In [29]:
final_df

Unnamed: 0,tracking_id,incident_type,investigation_status,accused_employee_id,first_name,last_name,allegation_number,allegation_desc,disposition,year,allegation
113,2021-0737,public initiated,completed,030307,anna,mcelveen,30580,mississippi code title 97-3- 7 (3) domestic vi...,sustained - rui resign,2021,rule 2: moral conduct paragraph 01 - adherence...
145,2020-0063,public initiated,completed,025966,mark,lynch,28395,nopd policy: chapter 42.4 - domestic violence;...,exonerated,2020,rule 4: perf of duty paragraph 04 - neglect of...
307,2018-0267,public initiated,completed,008913,jeffrey,vappie ii,25045,nopd policy: chapter 42.4 - domestic violence,unfounded,2018,rule 4: perf of duty paragraph 04 - neglect of...
308,2018-0267,public initiated,completed,018876,lawrence,weathersby,25046,nopd policy: chapter 42.4 domestic violence,unfounded,2018,rule 4: perf of duty paragraph 04 - neglect of...
317,2020-0067,public initiated,forwarded,030450,ridge,petes,28765,nopd policy: chapter 42.4 - domestic violence,unfounded,2020,rule 4: perf of duty paragraph 04 - neglect of...
...,...,...,...,...,...,...,...,...,...,...,...
10495,2018-0127,public initiated,completed,019456,sebastian,trigo,24727,missing,unfounded,2018,rule 5: rest activities paragraph 07 - acting ...
10496,2018-0127,public initiated,completed,019456,sebastian,trigo,24728,missing,unfounded,2018,rule 3: prof conduct paragraph 01 - profession...
10497,2018-0127,public initiated,completed,029836,nicholas,wallis,24729,missing,unfounded,2018,rule 3: prof conduct paragraph 01 - profession...
10515,2018-0085,rank initiated,completed,006864,kevin,williams,24779,paragraph b: any employee with supervisory res...,negotiated settlement,2018,rule 4: perf of duty paragraph 04 - neglect of...


In [30]:

# final_df_review = final_df[final_df.year.astype(str).str.contains("2023")]

# final_df_review = final_df_review[~(final_df_review.tracking_id.isin(gist_unique_list))]

# final_df_review.tracking_id.nunique()

In [31]:
find_missing = final_df.copy()
find_missing

result = final_df[final_df.groupby('tracking_id')['allegation_desc'].transform(lambda x: (x == 'missing').all())]
missing_tracking_ids = result['tracking_id'].unique()

missing_tracking_ids

array(['2022-0283', '2022-0204', '2022-0418', '2019-0251', '2019-0064',
       '2021-0130', '2019-0205', '2021-0178', '2021-0303', '2019-0070',
       '2023-0223', '2023-0670', '2022-0113', '2020-0205', '2019-0518',
       '2017-0181', '2016-0678', '2017-0585', '2017-0220', '2017-0612',
       '2016-0148', '2017-0058', '2018-0132', '2018-0127'], dtype=object)

In [32]:
merged_df = gist_tbl.merge(filtered_brady_tbl, on='tracking_id', how='outer', indicator=True)
dfb_only = merged_df[merged_df['_merge'] == 'left_only']

tracking_ids_in_dfb_not_dfa = dfb_only['tracking_id'].tolist()

print("Tracking IDs of allegations in dfb that don't exist in dfa:")
print(tracking_ids_in_dfb_not_dfa)

mapping_df = read_brady_tbl()

mapping_df = mapping_df.pipe(clean_brady_table).pipe(clean_column_names)

mapping_df = mapping_df[mapping_df.tracking_id.str.contains("2022-0283")]

mapping_df

  df = df.applymap(lambda x: x.lstrip('\'') if isinstance(x, str) else x)
  df = df.applymap(lambda x: x.lower().strip() if isinstance(x, str) else x)


Tracking IDs of allegations in dfb that don't exist in dfa:
['2016-0097', '2016-0108', '2016-0148', '2016-0232', '2016-0271', '2016-0377', '2016-0379', '2016-0388', '2016-0438', '2016-0486', '2016-0502', '2016-0524', '2016-0560', '2016-0569', '2016-0625', '2016-0678', '2017-0022', '2017-0058', '2017-0181', '2017-0191', '2017-0220', '2017-0537', '2017-0557', '2017-0562', '2017-0576', '2017-0585', '2017-0588', '2017-0604', '2017-0612', '2018-0017', '2018-0085', '2018-0127', '2018-0132', '2018-0294', '2018-0653', '2019-0007', '2019-0064', '2019-0070', '2019-0205', '2019-0251', '2019-0262', '2019-0273', '2019-0278', '2019-0383', '2019-0518', '2019-0539', '2019-0572', '2020-0167', '2020-0194', '2020-0205', '2020-0271', '2020-0272', '2020-0329', '2020-0329', '2020-0425', '2020-0604', '2021-0130', '2021-0178', '2021-0303', '2021-0553', '2022-0113', '2022-0178', '2022-0204', '2022-0204', '2022-0214', '2022-0246', '2022-0246', '2022-0283', '2022-0418', '2022-0418', '2023-0020', '2023-0129', '20

Unnamed: 0,tracking_id,incident_type,investigation_status,accused_employee_id,first_name,last_name,allegation_number,allegation_desc,disposition,year,allegation
325,2022-0283-p,public initiated,completed,7660,russell,philibert,31138,missing,not sustained,2022,rule 3: prof conduct paragraph 01 - profession...
