# Setup

In [1]:
# Opt out of the PyGEOS backend. The GeoPandas warning recorded with this cell says the
# variable has to be set before geopandas is imported, so it comes ahead of every other
# import (the project-local modules may import geopandas themselves).
import os
os.environ['USE_PYGEOS'] = '0'

# Third-party packages.
import contextily as cx
import geopandas
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Project-local helpers.
import constants
import figure_utilities
from panel_utilities import get_value_variable_names, prepare_df_for_DiD
from stats_utilities import produce_summary_statistics, select_controls, test_balance

# Save all figures at print resolution.
plt.rcParams['savefig.dpi'] = 300

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
# Store paths.
# All paths are relative to the directory the notebook is run from.
# Inputs: the monthly case-level panel, tract-level covariates, the Census tract
# shapefile used for the map, and the spreadsheet of offense codes.
INPUT_DATA_PANEL = "../data/03_cleaned/crime_analysis_monthly.csv"
INPUT_DATA_TRACTS = "../data/02_intermediate/tracts.csv"
INPUT_DATA_BOSTON_TRACTS_SHAPEFILE = "../data/01_raw/Census_2010_Tracts"
INPUT_DATA_OFFENSE_CODES = "../data/01_raw/rmsoffensecodes.xlsx"
# Outputs: directories for LaTeX tables and figures, plus a single .tex file that
# collects \def macros for statistics written by later cells.
OUTPUT_TABLES = "../output/final_paper/tables"
OUTPUT_FIGURES = "../output/final_paper/figures"
OUTPUT_STATISTICS = "../output/final_paper/statistics.tex"

# Summary Statistics

## Map of Evictions, Colored by Poverty Rate in Census Tract

In [3]:
# Load the unrestricted panel; reset_index() so we can use 'case_number' like a column.
df = pd.read_csv(INPUT_DATA_PANEL).reset_index()

# Build one point per row from its coordinates (EPSG:4326), then reproject to
# EPSG:3857 so the points line up with the basemap requested below.
unrestricted_gdf = (gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['Longitude'], df['Latitude']))
                    .set_crs("EPSG:4326")
                    .to_crs("EPSG:3857"))

# Draw the cases as small black dots on a frame with no ticks or tick labels.
fig, ax = plt.subplots(figsize=(8, 10))
for clear_axis_decoration in (ax.set_yticklabels, ax.set_yticks, ax.set_xticklabels, ax.set_xticks):
    clear_axis_decoration([])
unrestricted_gdf.plot(ax=ax, color='black', markersize=0.05)
cx.add_basemap(ax=ax, crs="EPSG:3857", source=cx.providers.CartoDB.Positron)

# Overlay census tracts shaded by their 2010 poverty share.
boston_tracts_gdf = gpd.read_file(INPUT_DATA_BOSTON_TRACTS_SHAPEFILE)[['GEOID10', 'geometry']].set_index('GEOID10')
# Cast the shapefile GEOIDs to int before aligning them with 'tract_geoid' from the CSV.
boston_tracts_gdf.index = boston_tracts_gdf.index.astype(int)
tract_poverty_rates_df = pd.read_csv(INPUT_DATA_TRACTS,
                                     usecols=['tract_geoid', 'poor_share2010'],
                                     index_col='tract_geoid')
# Align on GEOID, keep tracts that have both a shape and a poverty share, and remove
# tract 25025990101 (NOTE(review): reason for excluding this tract is not visible here).
boston_tracts_gdf = pd.concat([boston_tracts_gdf, tract_poverty_rates_df], axis=1)
boston_tracts_gdf = boston_tracts_gdf.dropna(subset=['geometry', 'poor_share2010'])
boston_tracts_gdf = boston_tracts_gdf.drop(index=25025990101)
poverty_legend_kwds = {'label': "Poverty Rate of Census Tract",
                       'orientation': "horizontal",
                       'shrink': 0.25}
boston_tracts_gdf.plot(ax=ax,
                       column=boston_tracts_gdf['poor_share2010'],
                       cmap='OrRd',
                       alpha=0.4,
                       legend=True,
                       legend_kwds=poverty_legend_kwds)

figure_utilities.save_figure_and_close(fig, os.path.join(OUTPUT_FIGURES, "evictions_map.png"))

## Eviction Filings Over Time

In [4]:
# Count eviction filings per calendar month.
# Each filing date is moved to the last day of its month so that all filings from the
# same month share one group key.
file_dates = pd.to_datetime(df['file_date'])
df.loc[:, 'last_day_of_file_month'] = file_dates + pd.tseries.offsets.MonthEnd(0)
filings_per_month = df.groupby('last_day_of_file_month')['case_number'].count()

# Draw the monthly counts as a black line above the grid and save the figure.
fig, ax = plt.subplots()
filings_per_month.plot(ax=ax, kind='line', color='black', zorder=100)
ax.set_xlabel("Month")
ax.set_ylabel("Number of Evictions")
ax.grid(True)
filings_figure_path = os.path.join(OUTPUT_FIGURES, "filings_over_time.png")
figure_utilities.save_figure_and_close(fig, filings_figure_path)


In [5]:
# Read unrestricted dataset into memory.
df = pd.read_csv(INPUT_DATA_PANEL)
df = df.reset_index() # So we can use 'case_number' like a column

# Produce summary statistics table.
treatment_date_variable = 'latest_docket_date'

outcomes_of_interest = [f'group_{i}_crimes_{constants.Analysis.MAIN_RESULTS_RADIUS}m' for i in range(6)]
# NOTE(review): the traceback saved with this cell reports group_8 columns as "not in index",
# which is the message pandas gives for list-based selection; it probably originates inside
# produce_summary_statistics (columns missing from the panel) rather than in this cell --
# TODO confirm against the helper and the cleaned data.
summary_statistics_unrestricted, variable_display_names_dict = produce_summary_statistics(df)

# Name the two index levels and sort rows by panel, then variable.
summary_statistics_unrestricted.index = summary_statistics_unrestricted.index.set_names(["Panel", "Variable"])
column_display_names_dict = {'mean': "Mean", 'std': "S.D.", 'count': "N", '50%': 'Median'}
summary_statistics_unrestricted = summary_statistics_unrestricted.sort_values(['Panel', 'Variable'])

# Keep only outcomes of interest.
# Every other outcome contributes up to three rows (one per prefix). They are removed in a
# single pass with a boolean mask, so a row that is already absent is simply skipped
# instead of raising a KeyError.
summary_row_prefixes = ('total_twenty_eighteen_',
                        'relative_pre_treatment_change_in_',
                        'pre_treatment_change_in_')
rows_to_drop = [f'{prefix}{outcome}'
                for outcome in constants.Variables.outcomes
                if outcome not in outcomes_of_interest
                for prefix in summary_row_prefixes]
is_row_to_drop = summary_statistics_unrestricted.index.get_level_values(1).isin(rows_to_drop)
summary_statistics_unrestricted = summary_statistics_unrestricted.loc[~is_row_to_drop, :]

# Drop median column.
summary_statistics_unrestricted = summary_statistics_unrestricted.drop(columns='50%')

# Export to LaTeX.
# The median column was dropped above, so only the remaining columns need a formatter.
# The "{*}" -> "{3cm}" replacement gives the multirow panel labels a fixed width.
filename = os.path.join(OUTPUT_TABLES, "summary_statistics.tex")
latex = (summary_statistics_unrestricted
         .rename(index=variable_display_names_dict)
         .rename(columns=column_display_names_dict)
         .style
         .format(formatter={
    'Mean': "{:,.2f}",
    'S.D.': "{:,.2f}",
    'N': "{:,.0f}"})
         .format_index("\\textit{{{}}}", escape="latex", axis=0, level=0)
         .to_latex(None,
                   column_format="llcccc",
                   hrules=True,
                   clines="skip-last;data")).replace("{*}", "{3cm}")
with open(filename, 'w') as file:
    file.write(latex)
summary_statistics_unrestricted

KeyError: "['total_twenty_eighteen_group_8_crimes_250m', 'pre_treatment_change_in_group_8_crimes_250m', 'relative_pre_treatment_change_in_group_8_crimes_250m', 'total_twenty_eighteen_group_8_crimes_300m', 'pre_treatment_change_in_group_8_crimes_300m', 'relative_pre_treatment_change_in_group_8_crimes_300m', 'total_twenty_eighteen_group_8_crimes_350m', 'pre_treatment_change_in_group_8_crimes_350m', 'relative_pre_treatment_change_in_group_8_crimes_350m', 'total_twenty_eighteen_group_8_crimes_250_to_300m', 'pre_treatment_change_in_group_8_crimes_250_to_300m', 'relative_pre_treatment_change_in_group_8_crimes_250_to_300m', 'total_twenty_eighteen_group_8_crimes_250_to_350m', 'pre_treatment_change_in_group_8_crimes_250_to_350m', 'relative_pre_treatment_change_in_group_8_crimes_250_to_350m', 'total_twenty_eighteen_group_8_crimes_250_to_400m', 'pre_treatment_change_in_group_8_crimes_250_to_400m', 'relative_pre_treatment_change_in_group_8_crimes_250_to_400m'] not in index"

In [None]:
# Percentage of non-entity plaintiffs (isEntityP == 0) with an attorney, to two decimals.
is_non_entity_plaintiff = df['isEntityP'] == 0
share_non_entity_plaintiffs_with_attorney = (100 * df.loc[is_non_entity_plaintiff, 'hasAttyP'].mean()).round(2)

# Start the statistics file afresh ('w') with this value wrapped in a \def macro.
# NOTE(review): the macro name contains underscores -- confirm the LaTeX side handles that.
macro_value = str(share_non_entity_plaintiffs_with_attorney)
with open(OUTPUT_STATISTICS, 'w') as file:
    file.write(f"\n\\def\\share_non_entity_plaintiffs_with_attorney{{{macro_value}}}\n")

In [None]:
# Build the treatment timings table: number of cases per latest docket month, split by
# whether judgment was for the plaintiff.
cases_by_month_and_outcome = df.groupby(['latest_docket_month', 'judgment_for_plaintiff'])['case_number'].count()
treatment_timings = cases_by_month_and_outcome.reset_index().fillna(0)
# One row per month, one column per judgment outcome; month/outcome pairs with no cases become 0.
treatment_timings = treatment_timings.pivot(index='latest_docket_month', columns='judgment_for_plaintiff')
treatment_timings = treatment_timings.fillna(0)
# Labels are assigned by position -- presumably the pivot yields exactly two columns,
# judgment_for_plaintiff == 0 first and == 1 second (TODO confirm).
treatment_timings.columns = ["Cases Won By Defendant", "Cases Won By Plaintiff"]

# Share of the whole dataset that each month accounts for.
cases_per_month = treatment_timings['Cases Won By Plaintiff'] + treatment_timings['Cases Won By Defendant']
portion_of_all_cases = cases_per_month / len(df)
treatment_timings = pd.concat([treatment_timings, portion_of_all_cases.rename('Portion of All Cases')],
                              axis=1)

# Put a totals row on top of the per-month rows.
sum_across_filing_date = treatment_timings.sum(axis=0).to_frame().T
sum_across_filing_date.index = ["All Months"]
treatment_timings = pd.concat([sum_across_filing_date, treatment_timings], axis=0)
treatment_timings = treatment_timings.rename_axis(index="Last Docket Date")

# Export to LaTeX.
filename = os.path.join(OUTPUT_TABLES, "treatment_timings.tex")
treatment_timings_formats = {'Cases Won By Plaintiff': '{:,.0f}',
                             'Cases Won By Defendant': '{:,.0f}',
                             'Portion of All Cases': '{:0.2f}'}
(treatment_timings
 .style
 .format(formatter=treatment_timings_formats)
 .to_latex(filename, column_format="lccc", hrules=True))
treatment_timings

In [None]:
# Percentage of plaintiff-won cases located in tracts whose 2010 poverty share exceeds 20%.
# Note that df is narrowed to plaintiff-won cases from here on.
won_by_plaintiff = df['judgment_for_plaintiff'] == 1
df = df.loc[won_by_plaintiff, :]
original_N = len(df)
in_poor_tract = df['poor_share2010'] > 0.20
cases_in_poor_tracts = len(df.loc[in_poor_tract, :])
share_cases_in_poor_tracts = round(100 * (cases_in_poor_tracts / original_N), 2)

# Append ('a') the value to the statistics file as a \def macro.
macro_value = str(share_cases_in_poor_tracts)
with open(OUTPUT_STATISTICS, 'a') as file:
    file.write(f"\n\\def\\share_cases_in_poor_tracts{{{macro_value}}}\n")

In [None]:
# Produce table describing crime groups and the offense codes each one contains.
columns = ['Offense Code', 'Description']
# Read offense codes and their descriptions from BPD spreadsheet
offense_codes_and_descriptions = pd.read_excel(INPUT_DATA_OFFENSE_CODES)
offense_codes_and_descriptions.columns = columns
offense_codes_and_descriptions = offense_codes_and_descriptions.set_index('Offense Code')

# Placeholder row for the all-crimes group, indexed like the per-group frames built below.
group_0_crimes = pd.DataFrame([['All', '']], columns=columns)
group_0_crimes = pd.concat([group_0_crimes], axis=0, keys=['All Crimes']).reset_index(level=1, drop=True)
group_0_crimes.index.name = "Incident Group"
group_0_crimes = group_0_crimes.reset_index().set_index(["Incident Group", 'Offense Code'])

crime_group_dfs = [group_0_crimes]
offense_code_groups = [constants.Analysis.larceny,
                       constants.Analysis.motor_vehicle,
                       constants.Analysis.vandalism,
                       constants.Analysis.assault,
                       constants.Analysis.auto_theft]
# One label per entry of offense_code_groups, in the same order. The former "Investigation"
# label had no matching code list and shifted "Auto Theft" off the end of the zip, so it is
# removed (NOTE(review): re-add it together with its code list if that group is wanted).
labels = ["Larceny", "Motor Vehicle Accident", "Vandalism", "Assault", "Auto Theft"]
if len(offense_code_groups) != len(labels):
    raise ValueError("offense_code_groups and labels must have the same length")
for offense_codes, label in zip(offense_code_groups, labels):
    # Look up the descriptions for this group's codes and tag every row with the group label.
    labelled_group_df = offense_codes_and_descriptions.loc[offense_codes, :].reset_index().drop_duplicates()
    labelled_group_df = pd.concat([labelled_group_df], axis=0, keys=[label]).reset_index(level=1, drop=True)
    labelled_group_df.index.name = "Incident Group"
    labelled_group_df = labelled_group_df.reset_index().set_index(["Incident Group", 'Offense Code'])
    crime_group_dfs.append(labelled_group_df)
crime_group_df = pd.concat(crime_group_dfs, axis=0)

# Export to LaTeX.
# Clean description column to avoid silent LaTeX errors
crime_group_df.loc[:, 'Description'] = crime_group_df['Description'].str.replace("&", "\\&", regex=False)
crime_group_df.loc[:, 'Description'] = crime_group_df['Description'].str.replace("$", "\\$", regex=False)

# Split into two pages
crime_group_df_page_1 = crime_group_df.loc[["Larceny", "Motor Vehicle Accident"], :]
crime_group_df_page_2 = crime_group_df.loc[["Vandalism", "Assault",  "Auto Theft"], :]

# This one description is left out of the exported tables for space reasons.
drop_for_space_reasons = "RECOVERED - MV RECOVERED IN BOSTON (STOLEN OUTSIDE BOSTON)"
# The loop variable is deliberately not called df, so the notebook-wide df is not
# overwritten with a table page.
for page_number, crime_group_page in enumerate([crime_group_df_page_1, crime_group_df_page_2], start=1):
    latex = (crime_group_page
             .loc[crime_group_page['Description'] != drop_for_space_reasons, :]
             .style
             .format_index("\\textit{{{}}}", escape="latex", axis=0, level=0)
             .to_latex(None,
                       column_format="lll",
                       hrules=True,
                       clines="skip-last;data"))
    with open(os.path.join(OUTPUT_TABLES, f"crime_groups_page_{page_number}.tex"), 'w') as file:
        file.write(latex)

In [None]:
# Display the first page of the crime-group table (Larceny and Motor Vehicle Accident).
crime_group_df_page_1

In [None]:
# Display the second page of the crime-group table (Vandalism, Assault and Auto Theft).
crime_group_df_page_2