In [103]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import plotly.express as px

from helper import clean_column_names

In [104]:
dfa = pd.read_csv("../data/input/brady_2016_2023.csv")


def clean_table(df):
    df.loc[:, "last_name"] = df.last_name.str.replace(r"\'RULE(.+)", "", regex=True).str.replace(r"^\'$", "", regex=True)

    df = df[~((df.last_name.fillna("") == ""))]

    df.loc[:, "tracking_id"] = df.tracking_id.str.replace(r"^\'", "", regex=True)

    df = df.sort_values("tracking_id", ascending=False)

    df.loc[:, "allegation"] = df.allegation_rule.str.cat(df.allegation_paragraph, sep=" ")

    df.loc[:, "allegation"] = df.allegation.str.lower().str.strip().str.replace(r"\'", "", regex=True)

    df = df.drop(columns=["allegation_rule", "allegation_paragraph"])

    df.loc[:, "allegation_desc"] = (df.allegation_desc
                                    .str.replace(r"\'$", "", regex=True)
                                    .str.replace(r"^$", "missing", regex=True)
    )

    return df 

dfa = dfa.pipe(clean_table)

dfa.loc[:, "tracking_id"] = dfa.tracking_id.str.lower().str.strip().str.replace(r"\s+", "", regex=True).str.replace(r"\'", "", regex=True)

dfa

doubles = dfa[dfa.tracking_id.str.contains("2016-0449")]

doubles.allegation_desc.unique()

array(["'Paragraph C: Subparagraph 8 Failing to thoroughly search for, collect, preserve, and identify evidence in an arrest or investigative situation."],
      dtype=object)

In [105]:

collected_reports = [
'2016-0505',
 '2019-0344',
 '2014-0524',
 '2016-0504',
 '2016-0506',
 '2020-0269',
 '2019-0730',
 '2019-0547',
 '2019-0530',
 '2016-0076',
 '2016-0177',
 '2019-0741',
 '2014-0528',
 '2019-0732',
 '2015-0228',
 '2016-0157',
 '2015-0002',
 '2014-0073',
 '2014-0468',
 '2019-0359',
 '2015-0674',
 '2016-0126',
 '2017-0631',
 '2017-0393',
 '2017-0321',
 '2016-0075',
 '2016-0708',
 '2018-0476',
 '2018-0369',
 '2019-0014',
 '2014-0741',
 '2017-0442',
 '2018-0508',
 '2018-0651',
 '2018-0435',
 '2016-0368',
 '2019-0190',
 '2017-0549',
 '2017-0529',
 '2017-0262',
 '2016-0201',
 '2018-0651',
 '2019-0533',
 '2020-0135',
 '2017-0315',
 '2019-0314',
 '2019-0123',
 '2017-0646',
 '2020-0274',
 '2020-0260',
 '2017-0486',
 '2016-0085',
 '2019-0549',
 '2020-0007',
 '2020-0358',
 '2016-0506',
 '2014-0052',
 '2018-0041',
 '2018-0561',
 '2017-0248',
 '2014-0056',
 '2017-0688',
 '2018-0054',
 '2017-0350',
 '2020-0417',
 '2020-0520',
 '2017-0700',
 '2018-0442',
 '2016-0633',
 '2017-0720'
 ]


pattern = '|'.join(collected_reports)
reports_df = dfa[dfa['tracking_id'].str.contains(pattern)]

# Count of tracking IDs not found in the DataFrame
missing_count = sum(dfa['tracking_id'].str.contains(tracking_id).any() for tracking_id in collected_reports)

# Calculate how many are missing
missing_tracking_ids = len(collected_reports) - missing_count

print("Number of missing tracking IDs:", missing_tracking_ids)

# List to hold missing tracking IDs
missing_tracking_ids = []

for tracking_id in collected_reports:
    if not dfa['tracking_id'].str.contains(tracking_id).any():
        missing_tracking_ids.append(tracking_id)

print("Missing tracking IDs:", missing_tracking_ids)

Number of missing tracking IDs: 10
Missing tracking IDs: ['2014-0524', '2014-0528', '2015-0228', '2015-0002', '2014-0073', '2014-0468', '2015-0674', '2014-0741', '2014-0052', '2014-0056']


In [106]:
dfb = pd.read_csv("../data/input/data_nola_gov.csv")

dfb = dfb.rename(columns={"Complaint Tracking Number": "tracking_id"})

dfb = dfb[['tracking_id' ,'Date Complaint Occurred',
       'Date Complaint Received by NOPD (PIB)',
       'Date Complaint Investigation Complete', 'Complaint classification', 'Bureau of Complainant',
       'Division of Complainant', 'Unit of Complainant',
       'Unit Additional Details of Complainant', 
       'Working Status of Complainant', 'Shift of Complainant',
       'Rule Violation', 'Paragraph Violation', 'Unique Officer Allegation ID',
       'Officer Race Ethnicity', 'Officer Gender', 'Officer Age',
       'Officer years of service', 'Complainant Gender',
       'Complainant Ethnicity', 'Complainant Age', "Disposition"]]

dfb.loc[:, "tracking_id"] = dfb.tracking_id.str.lower().str.strip().str.replace(r"\s+", "", regex=True)


doubles_b = dfb[dfb.tracking_id.fillna("").str.contains("2016-0629")]

doubles_b

Unnamed: 0,tracking_id,Date Complaint Occurred,Date Complaint Received by NOPD (PIB),Date Complaint Investigation Complete,Complaint classification,Bureau of Complainant,Division of Complainant,Unit of Complainant,Unit Additional Details of Complainant,Working Status of Complainant,...,Paragraph Violation,Unique Officer Allegation ID,Officer Race Ethnicity,Officer Gender,Officer Age,Officer years of service,Complainant Gender,Complainant Ethnicity,Complainant Age,Disposition
853,2016-0629-p,,2016-09-26,,DI-1,FOB - Field Operations Bureau,Unknown District/Division,Bourbon Promenade,Unknown Assignment,Unknown Working Status,...,PARAGRAPH 02 - FALSE OR INACCURATE REPORTS,35869.0,Black,Male,,,Male,Black,,Pending


In [107]:
df = pd.merge(dfa, dfb, on="tracking_id")

df = df.drop_duplicates(subset=["allegation_number"])

In [108]:
df.loc[:, "allegation_desc"] = df.allegation_desc.str.lower().str.strip()

df.loc[:, "Disposition"] = df["Disposition"].str.lower().str.strip()

KeyError: 'Disposition'

In [None]:
top_allegations = df['allegation'].value_counts().head(25)

# Create a horizontal bar chart
fig = px.bar(top_allegations, orientation='h', labels={'index': 'Allegation', 'value': 'Count'}, 
             title='Top 25 Allegations')
fig.update_layout(
    xaxis_title="Count",
    yaxis_title="Allegation",
    yaxis={'categoryorder':'total ascending'},
    height=900  # Adjusted for better visibility with more categories
)
fig.update_yaxes(automargin=True, tickmode='array', tickvals=list(range(len(top_allegations))), ticktext=top_allegations.index)
fig.show()

In [None]:
# df = df[~((df.allegation_desc == "missing"))]

# # Top 25 allegation descriptions
# top_allegation_desc = df['allegation_desc'].value_counts().head(25)
# fig_desc = px.bar(top_allegation_desc, orientation='h', labels={'index': 'Allegation Description', 'value': 'Count'}, 
#                   title='Top 25 Allegation Descriptions')
# fig_desc.update_layout(
#     xaxis_title="Count",
#     yaxis_title="Allegation Description",
#     yaxis={'categoryorder':'total ascending'},
#     height=800  # Increased height to accommodate more bars
# )
# fig_desc.update_yaxes(automargin=True, tickmode='array', tickvals=list(range(len(top_allegation_desc))), ticktext=top_allegation_desc.index)
# fig_desc.show()

In [None]:
df.loc[:, "tag"] = (df
                    .allegation_desc
                    .str.lower()
                    .str.replace(r"(.+)(domestic violence|domestic abuse|domestic battery)(.+)?", "domestic violence", regex=True)
                    .str.replace(r"(.+)battery(.+)?", "battery", regex=True)
                    .str.replace(r"(.+)?rape(.+)?", "rape", regex=True)
                    .str.replace(r"(.+)sexual harassment(.+)?", "sexual harassment", regex=True)
                    .str.replace(r"(.+)assault(.+)?", "assault", regex=True)
                    .str.replace(r"(.+)body-? ?worn cameras?(.+)?", "body worn camera", regex=True)
                    .str.replace(r"(.+)theft(.+)?", "theft", regex=True)
                    .str.replace(r"(.+)stop(.+)?", "terry stop", regex=True)
                    .str.replace(r"(.+)(authorized firearm|pr312 firearms|possession of a firearm)(.+)?", "firearms")
                    .str.replace(r"(.+)(firearms training|firearms recertification)(.+)?", "firearms training")
                    .str.replace(r"(.+)bias(.+)?", "biased based policing", regex=True)
                    .str.replace(r"(.+)stalking(.+)?", "stalking", regex=True)
                    .str.replace(r"(.+)(search & seizure|search and seizure)(.+)?", "search and seizure", regex=True)
                    .str.replace(r"(.+)supervise(.+)?", "failure to supervise", regex=True)
                    .str.replace(r"(.+)use of force(.+)?", "use of force", regex=True)
                    .str.replace(r"(.+)?necessary police action(.+)?", "failure to take appropriate and necessary action", regex=True)
                    .str.replace(r"(.+)?collect(.+)?", "failure to collect, preserve, and identify evidence", regex=True)
                    .str.replace(r"(.+)?payroll(.+)?", "payroll fraud", regex=True)
                    .str.replace(r"^\'((.+)to cruelty to juveniles|(.+)relative to indecent behavior with juvenile|(.+)relative to indecent behavior with a juvenile|(.+)relative to molestation of a juvenile|(.+)cruelty to juveniles|(.+)?child abuse(.+)?)", "violence against juvenile", regex=True)
)


df = df[~((df.tag.fillna("") == ""))]

In [None]:
dv_df = df[df.tag.str.contains("domestic violence|battery|rape|assault|violence against juvenile")]

dv_df = dv_df[dv_df.year.astype(str).str.contains("2022")]

dv_df.tag.value_counts()
# dv_df

tag
battery                      5
domestic violence            5
violence against juvenile    1
Name: count, dtype: int64

In [None]:

# # Group by year and count the occurrences
# yearly_counts = dv_df.groupby('year').size().reset_index(name='Count')

# # Create a line graph
# fig = px.line(yearly_counts, x='year', y='Count', title='Domestic Violence Complaints per Year', markers=True)
# fig.update_traces(textposition='top center')
# fig.show()

In [None]:
# # # Count occurrences of each disposition
# # disposition_counts = dv_df['Disposition'].value_counts().reset_index()
# # disposition_counts.columns = ['Disposition', 'Count']

# # # Create a bar chart
# # fig = px.bar(disposition_counts, x='Disposition', y='Count', title='Disposition of Domestic Violence Complaints')
# # fig.show()

# # Count occurrences of each disposition by year
# disposition_counts = dv_df.groupby(['year', 'Disposition']).size().reset_index(name='Count')

# # Calculate the total count of dispositions for each year
# total_counts = disposition_counts.groupby('year')['Count'].transform('sum')

# # Calculate the proportion of each disposition for each year
# disposition_counts['Proportion'] = disposition_counts['Count'] / total_counts

# # Create a stacked bar chart
# fig = px.bar(disposition_counts, x='year', y='Proportion', color='Disposition',
#              title='Proportions of Dispositions by Year')
# fig.show()

In [None]:
# # Count occurrences of each incident type for domestic violence complaints
# incident_type_counts = dv_df["'Incident Type"].value_counts().reset_index()
# incident_type_counts.columns = ["'Incident Type", 'Count']

# # Create a bar chart for Incident Type
# fig = px.bar(incident_type_counts, x="'Incident Type", y='Count', title='Incident Types for Domestic Violence Complaints')
# fig.show()

In [None]:
# public_initiated_dv_df = df[(df['tag'] == 'domestic violence') & (df["'Incident Type"] == "'Rank Initiated")]

# # Count occurrences of each disposition for domestic violence 'Public Initiated' incident types
# disposition_counts = public_initiated_dv_df['Disposition'].value_counts().reset_index()
# disposition_counts.columns = ['Disposition', 'Count']

# # Create a bar chart for Disposition counts of Public Initiated Domestic Violence Incidents
# fig = px.bar(disposition_counts, x='Disposition', y='Count', title='Disposition Counts for "Public Initiated" Domestic Violence Incidents')
# fig.show()

In [None]:
# # Count occurrences of each disposition for domestic violence 'Public Initiated' incident types
# disposition_counts_dv = public_initiated_dv_df['Disposition'].value_counts()

# # Calculate the proportion of 'Sustained' dispositions
# total_public_initiated_dv = disposition_counts_dv.sum()  # Total number of Public Initiated DV incidents
# sustained_count_dv = disposition_counts_dv.get('sustained', 0)  # Number of Sustained dispositions
# sustained_proportion_dv = sustained_count_dv / total_public_initiated_dv if total_public_initiated_dv > 0 else 0

# # Output for comparison
# sustained_proportion_dv

In [None]:
# public_initiated_df = df[df["'Incident Type"] == "'Public Initiated"]

# # Count occurrences of each disposition for 'Public Initiated' incident types
# disposition_counts = public_initiated_df['Disposition'].value_counts()

# # Calculate the proportion of 'Sustained' dispositions
# total_public_initiated = disposition_counts.sum()  # Total number of Public Initiated incidents
# sustained_count = disposition_counts.get('sustained', 0)  # Number of Sustained dispositions
# sustained_proportion = sustained_count / total_public_initiated if total_public_initiated > 0 else 0

# # Print the proportion of 'Sustained' dispositions
# sustained_proportion

In [None]:
# # Filter rows for domestic violence
# evidence_df = df[df['tag'] == 'failure to collect, preserve, and identify evidence']

# # Group by year and count the occurrences
# yearly_counts = evidence_df.groupby('year').size().reset_index(name='Count')

# # Create a line graph
# fig = px.line(yearly_counts, x='year', y='Count', title='Evidence Complaints per Year', markers=True)
# fig.update_traces(textposition='top center')
# fig.show()

In [None]:
# # Count occurrences of each disposition
# disposition_counts = evidence_df['Disposition'].value_counts().reset_index()
# disposition_counts.columns = ['Disposition', 'Count']
# # 
# # Create a bar chart
# fig = px.bar(disposition_counts, x='Disposition', y='Count', title='Disposition of Evidence Complaints')
# fig.show()

In [None]:
# # Count occurrences of each incident type for domestic violence complaints
# incident_type_counts = evidence_df["'Incident Type"].value_counts().reset_index()
# incident_type_counts.columns = ["'Incident Type", 'Count']

# # Create a bar chart for Incident Type
# fig = px.bar(incident_type_counts, x="'Incident Type", y='Count', title='Complainant Types for Evidence Complaints')
# fig.show()

In [None]:
# evidence_type_df = df[(df['tag'] == 'failure to collect, preserve, and identify evidence') & (df["'Incident Type"] == "'Public Initiated")]

# # Count occurrences of each disposition for domestic violence 'Public Initiated' incident types
# disposition_counts = evidence_type_df['Disposition'].value_counts().reset_index()
# disposition_counts.columns = ['Disposition', 'Count']

# # Create a bar chart for Disposition counts of Public Initiated Domestic Violence Incidents
# fig = px.bar(disposition_counts, x='Disposition', y='Count', title='Disposition Counts for "Public Initiated" Evidence Complaints')
# fig.show()

In [None]:
# evidence_type_df = df[(df['tag'] == 'failure to collect, preserve, and identify evidence') & (df["'Incident Type"] == "'Public Initiated")]

# # Count occurrences of each disposition for domestic violence 'Public Initiated' incident types
# disposition_counts = evidence_type_df['Disposition'].value_counts().reset_index()
# disposition_counts.columns = ['Disposition', 'Count']

# # Create a bar chart for Disposition counts of Public Initiated Domestic Violence Incidents
# fig = px.bar(disposition_counts, x='Disposition', y='Count', title='Disposition Counts for "Rank Initiated" Evidence Complaints')
# fig.show()

In [None]:
# # Count occurrences of each disposition for domestic violence 'Public Initiated' incident types
# disposition_counts_dv = evidence_type_df['Disposition'].value_counts()

# # Calculate the proportion of 'Sustained' dispositions
# total_public_initiated_dv = disposition_counts_dv.sum()  # Total number of Public Initiated DV incidents
# sustained_count_dv = disposition_counts_dv.get('sustained', 0)  # Number of Sustained dispositions
# sustained_proportion_dv = sustained_count_dv / total_public_initiated_dv if total_public_initiated_dv > 0 else 0

# # Output for comparison
# sustained_proportion_dv