In [19]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from helper import clean_column_names

In [20]:
### clean data nola gov tbl ### 
def clean_gov_table(df):
    df = df.applymap(lambda x: x.lower().strip() if isinstance(x, str) else x)

    df.loc[:, "tracking_id"] = (df["complaint_tracking_number"].str
                                .lower()
                                .str.strip()
                                .str.replace(r"\s+", "", regex=True)
                                .str.replace(r"\'", "", regex=True)
    )


    return df 

def read_gov_tbl():
    df = pd.read_csv("data/input/data_nola_gov.csv")
    return df 

def extract_year(df):
    year = df.tracking_id.str.extract(r"^(\w{4})")

    df.loc[:, "year"] = year[0].astype(str)
    return df

def read_brady_criminal_tbl():
    df = pd.read_csv("../brady/data/output/dv_df_brady_criminal.csv")
    return df 

def read_gist_tbl():
    df = pd.read_csv("../gist/data/output/gist_w_brady_post_labels.csv")
    return df 

In [21]:
dv_df_gist = read_gist_tbl()

In [22]:
dv_df_brady_criminal = read_brady_criminal_tbl()
dv_df_brady_criminal.loc[:, "tracking_id"] = dv_df_brady_criminal.tracking_id.str.replace(r"\-(\w{1})$", "", regex=True)

# Create a boolean mask for rows in dv_df_gist that have tracking_ids present in dv_df_brady_criminal
mask = dv_df_gist['tracking_id'].isin(dv_df_brady_criminal['tracking_id'])

# Filter out rows from dv_df_gist that have matching tracking_ids
dv_df_gist = dv_df_gist[~mask]

dv_df_gist.shape

# dv_df_gist.to_csv("data/output/final_gist_for_summary_table.csv", index=False)

In [23]:

dv_ids = [n for n in dv_df_gist["tracking_id"]]

gov_tbl = read_gov_tbl()
gov_tbl = gov_tbl.pipe(clean_column_names).pipe(clean_gov_table).pipe(extract_year)

gov_tbl = gov_tbl[~((gov_tbl.incident_type.fillna("") == ""))]
gov_tbl = gov_tbl[~((gov_tbl.disposition.fillna("") == ""))]

gov_tbl = gov_tbl[gov_tbl.year.str.contains("2016|2017|2018|2019|2020|2021|2022|2023")]

gov_tbl.loc[:, "tracking_id_short"] = gov_tbl.tracking_id.str.replace(r"\-(\w{1})$", "", regex=True)


DataFrame.applymap has been deprecated. Use DataFrame.map instead.



In [24]:
gov_tbl_domestic_violence = gov_tbl[gov_tbl.tracking_id_short.isin(dv_ids)]

gov_tbl_domestic_violence = gov_tbl_domestic_violence[~((gov_tbl_domestic_violence.disposition.fillna("") == ""))]

print(gov_tbl_domestic_violence.tracking_id.nunique())

# gov_tbl_domestic_violence.to_csv("data/output/gov_tbl_w_gist_and_brady.csv", index=False)

181


In [25]:
print(f"Before: {gov_tbl_domestic_violence.shape}")
gov_tbl_domestic_violence = gov_tbl_domestic_violence.drop_duplicates(subset=["tracking_id", "rule_violation", "paragraph_violation", "disposition", "unique_officer_allegation_id"])
print(f"After: {gov_tbl_domestic_violence.shape}")

Before: (292, 27)
After: (290, 27)


In [26]:
# Calculate the percentage of "sustained" dispositions for each year and incident type
public_initiated_pct_dv = (gov_tbl_domestic_violence[(gov_tbl_domestic_violence.incident_type == "public initiated") & (gov_tbl.disposition == "sustained")].groupby("year").size() / 
                        gov_tbl_domestic_violence[gov_tbl_domestic_violence.incident_type == "public initiated"].groupby("year").size()) * 100

rank_initiated_pct_dv = (gov_tbl_domestic_violence[(gov_tbl_domestic_violence.incident_type == "rank initiated") & (gov_tbl.disposition == "sustained")].groupby("year").size() / 
                      gov_tbl_domestic_violence[gov_tbl_domestic_violence.incident_type == "rank initiated"].groupby("year").size()) * 100

# Create the traces for the bar chart
trace1 = go.Bar(
    x=public_initiated_pct_dv.index,
    y=public_initiated_pct_dv,
    name='Public Initiated',
    marker_color='blue',
    opacity=0.8,
    text=[f"{pct:.2f}%" for pct in public_initiated_pct_dv],
    textposition='auto',
    textangle=0
)

trace2 = go.Bar(
    x=rank_initiated_pct_dv.index,
    y=rank_initiated_pct_dv,
    name='Rank Initiated',
    marker_color='green',
    opacity=0.8,
    text=[f"{pct:.2f}%" for pct in rank_initiated_pct_dv],
    textposition='auto',
    textangle=0
)

# Create the layout for the chart
layout = go.Layout(
    title='Percentage of Sustained Dispositions by Incident Type and Year for Domestic Violence Subset',
    xaxis=dict(title='Year', tickangle=0),
    yaxis=dict(title='Percentage of Sustained Dispositions'),
    barmode='group'
)

# Create the figure and display the chart
fig = go.Figure(data=[trace1, trace2], layout=layout)
fig.show()


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.



In [27]:
def read_calls_for_service_tbl():
    df = pd.read_csv("../gov-cfs/data/output/calls_for_service_2016_2023.csv")
    return df 

calls_for_service_tbl = read_calls_for_service_tbl()

# Calculate the counts for filtered data
public_initiated_counts = gov_tbl_domestic_violence[gov_tbl_domestic_violence.incident_type == "public initiated"].groupby("year").size()
rank_initiated_counts = gov_tbl_domestic_violence[gov_tbl_domestic_violence.incident_type == "rank initiated"].groupby("year").size()

# Calculate percent changes for filtered data
public_initiated_pct_change = public_initiated_counts.pct_change() * 100
rank_initiated_pct_change = rank_initiated_counts.pct_change() * 100

# Calculate the counts for all incidents
all_incident_counts = gov_tbl_domestic_violence.groupby("year").size()

# Calculate percent changes for all incidents
all_incident_pct_change = all_incident_counts.pct_change() * 100

def format_labels(count, pct):
    if not pd.isna(pct):
        return f'{count}<br>({pct:.1f}%)'
    else:
        return f'{count}'
    
    
# Calculate the counts for calls for service data
calls_for_service_counts = calls_for_service_tbl.groupby("year").size()

# Calculate percent changes for calls for service data
calls_for_service_pct_change = calls_for_service_counts.pct_change() * 100

# Create subplots for each incident chart
fig1 = make_subplots(rows=2, cols=1, subplot_titles=("Public Initiated Incidents", "Calls for Service"), vertical_spacing=0.15)
fig2 = make_subplots(rows=2, cols=1, subplot_titles=("Rank Initiated Incidents", "Calls for Service"), vertical_spacing=0.15)
fig3 = make_subplots(rows=2, cols=1, subplot_titles=("Domestic Allegations", "Calls for Service"), vertical_spacing=0.15)

# Add traces for incident data
fig1.add_trace(go.Scatter(
    x=public_initiated_counts.index,
    y=public_initiated_counts,
    mode='lines+markers+text',
    name='Public Initiated',
    marker=dict(color='yellow'),
    text=[format_labels(count, pct) for count, pct in zip(public_initiated_counts, public_initiated_pct_change)],
    textposition='top center',
    textfont=dict(size=10),
    texttemplate='%{text}',  # Add this line to display the formatted text
    cliponaxis=False  # Add this line to prevent labels from being clipped
), row=1, col=1)

fig2.add_trace(go.Scatter(
    x=rank_initiated_counts.index,
    y=rank_initiated_counts,
    mode='lines+markers+text',
    name='Rank Initiated',
    marker=dict(color='green'),
    text=[format_labels(count, pct) for count, pct in zip(rank_initiated_counts, rank_initiated_pct_change)],
    textposition='top center',
    textfont=dict(size=10),
    texttemplate='%{text}',  # Add this line to display the formatted text
    cliponaxis=False  # Add this line to prevent labels from being clipped
), row=1, col=1)

fig3.add_trace(go.Scatter(
    x=all_incident_counts.index,
    y=all_incident_counts,
    mode='lines+markers+text',
    name='All Incidents',
    marker=dict(color='red'),
    text=[format_labels(count, pct) for count, pct in zip(all_incident_counts, all_incident_pct_change)],
    textposition='top center',
    textfont=dict(size=10),
    texttemplate='%{text}',  # Add this line to display the formatted text
    cliponaxis=False  # Add this line to prevent labels from being clipped
), row=1, col=1)

# Add traces for calls for service data
fig1.add_trace(go.Scatter(
    x=calls_for_service_counts.index,
    y=calls_for_service_counts,
    mode='lines+markers+text',
    name='Domestic Calls for Service',
    marker=dict(color='blue'),
    text=[format_labels(count, pct) for count, pct in zip(calls_for_service_counts, calls_for_service_pct_change)],
    textposition='top center',
    textfont=dict(size=10),
    texttemplate='%{text}',  # Add this line to display the formatted text
    cliponaxis=False  # Add this line to prevent labels from being clipped
), row=2, col=1)

fig2.add_trace(go.Scatter(
    x=calls_for_service_counts.index,
    y=calls_for_service_counts,
    mode='lines+markers+text',
    name='Domestic Calls for Service',
    marker=dict(color='blue'),
    text=[format_labels(count, pct) for count, pct in zip(calls_for_service_counts, calls_for_service_pct_change)],
    textposition='top center',
    textfont=dict(size=10),
    texttemplate='%{text}',  # Add this line to display the formatted text
    cliponaxis=False  # Add this line to prevent labels from being clipped
), row=2, col=1)

fig3.add_trace(go.Scatter(
    x=calls_for_service_counts.index,
    y=calls_for_service_counts,
    mode='lines+markers+text',
    name='Domestic Calls for Service',
    marker=dict(color='blue'),
    text=[format_labels(count, pct) for count, pct in zip(calls_for_service_counts, calls_for_service_pct_change)],
    textposition='top center',
    textfont=dict(size=10),
    texttemplate='%{text}',  # Add this line to display the formatted text
    cliponaxis=False  # Add this line to prevent labels from being clipped
), row=2, col=1)

# Update layout for each subplot
fig1.update_layout(
    title='Total Counts and Percent Change by Year for Public Initiated Domestic Allegations',
    xaxis=dict(title='Year', tickangle=0),
    yaxis=dict(title='Count'),
    xaxis2=dict(title='Year', tickangle=0),
    yaxis2=dict(title='Calls for Service Count'),
    height=800,  # Increase the height of the plot
    width=1000   # Increase the width of the plot
)

fig2.update_layout(
    title='Total Counts and Percent Change by Year for Rank Initiated Domestic Allegations',
    xaxis=dict(title='Year', tickangle=0),
    yaxis=dict(title='Count'),
    xaxis2=dict(title='Year', tickangle=0),
    yaxis2=dict(title='Calls for Service Count'),
    height=800,  # Increase the height of the plot
    width=1000   # Increase the width of the plot
)

fig3.update_layout(
    title='Total Counts and Percent Change by Year for Domestic Allegations',
    xaxis=dict(title='Year', tickangle=0),
    yaxis=dict(title='Count'),
    xaxis2=dict(title='Year', tickangle=0),
    yaxis2=dict(title='Calls for Service Count'),
    height=800,  # Increase the height of the plot
    width=1000   # Increase the width of the plot
)

# Show the charts
fig3.show()
fig1.show()
fig2.show()


Columns (1,23) have mixed types. Specify dtype option on import or set low_memory=False.

