# Setup

In [14]:
import contextily as cx
import figure_utilities
import statsmodels.api as sm
import constants
from stats_utilities import produce_summary_statistics, select_controls, test_balance
import geopandas as gpd
import matplotlib.pyplot as plt
from panel_utilities import get_value_variable_names, prepare_df_for_DiD
import numpy as np
# Default resolution (dots per inch) used whenever a figure is saved to disk.
plt.rcParams['savefig.dpi'] = 300
import statsmodels.api as sm
import os
import pandas as pd

In [15]:
# Locations of every file this notebook reads or writes, relative to the notebook's directory.

# Raw inputs.
INPUT_DATA_NEIGHBORHOODS = "../data/01_raw/2015-2019_neighborhood_tables_2021.12.21.xlsm"
INPUT_DATA_BOSTON_TRACTS_SHAPEFILE = "../data/01_raw/Census_2010_Tracts"

# Intermediate inputs.
INPUT_DATA_TRACTS = "../data/02_intermediate/tracts.csv"

# Cleaned inputs.
INPUT_DATA_PANEL = "../data/03_cleaned/crime_analysis_monthly.csv"
INPUT_DATA_NEIGHBORHOOD_CRIME_COUNTS = "../data/03_cleaned/neighborhood_crime_counts.csv"

# Output directories for the paper's figures and tables.
OUTPUT_FIGURES = "../output/final_paper/figures"
OUTPUT_TABLES = "../output/final_paper/tables"

# Summary Statistics

## Map of Evictions, Colored by Poverty Rate in Census Tract

In [16]:
# Read unrestricted dataset into memory.
df = pd.read_csv(INPUT_DATA_PANEL)
# NOTE(review): read_csv is called without index_col, so this call only adds an 'index' column holding
# the default row numbers; 'case_number' is presumably already a regular column -- confirm before removing.
df = df.reset_index()

# Create spatial data: one point per row built from its longitude/latitude (declared as EPSG:4326),
# then reprojected to the CRS shared by every layer of the map.
MAP_CRS = "EPSG:3857"
unrestricted_gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['Longitude'], df['Latitude']))
unrestricted_gdf = unrestricted_gdf.set_crs("EPSG:4326")
unrestricted_gdf = unrestricted_gdf.to_crs(MAP_CRS)

# Plot the points on an axis with no ticks or tick labels, then add the basemap underneath.
fig, ax = plt.subplots(figsize=(8, 10))
ax.set_yticklabels([])
ax.set_yticks([])
ax.set_xticklabels([])
ax.set_xticks([])
unrestricted_gdf.plot(ax=ax,
                      color='black',
                      markersize=0.05)
cx.add_basemap(ax=ax, crs=MAP_CRS, source=cx.providers.CartoDB.Positron)

# Color census tracts by poverty rate.
boston_tracts_gdf = gpd.read_file(INPUT_DATA_BOSTON_TRACTS_SHAPEFILE)[['GEOID10', 'geometry']].set_index('GEOID10')
# The axis is in MAP_CRS, so the tract polygons must be too; otherwise the overlay only lines up when the
# shapefile happens to be stored in that CRS already. A shapefile without a declared CRS cannot be
# reprojected, so it is plotted with its coordinates as-is (the previous behavior).
if boston_tracts_gdf.crs is not None:
    boston_tracts_gdf = boston_tracts_gdf.to_crs(MAP_CRS)
boston_tracts_gdf.index = boston_tracts_gdf.index.astype(int)
tract_poverty_rates_df = pd.read_csv(INPUT_DATA_TRACTS, usecols=['tract_geoid', 'poor_share2010'],
                                     index_col='tract_geoid')
# Align shapes and poverty rates on tract ID, keeping only tracts that have both.
# NOTE(review): tract 25025990101 is excluded explicitly -- the reason is not recorded here; confirm and document.
boston_tracts_gdf = pd.concat([boston_tracts_gdf, tract_poverty_rates_df], axis=1).dropna(
    subset=['geometry', 'poor_share2010']).drop(index=25025990101)
boston_tracts_gdf.plot(ax=ax, column='poor_share2010', cmap='OrRd', alpha=0.4, legend=True,
                       legend_kwds={'label': "Poverty Rate of Census Tract",
                                    'orientation': "horizontal",
                                    'shrink': 0.25})

figure_utilities.save_figure_and_close(fig, os.path.join(OUTPUT_FIGURES, "evictions_map.png"))

## Eviction Filings Over Time

In [17]:
# Count eviction filings per month. Each filing date is mapped to the last day of its month
# (MonthEnd(0) leaves dates that already fall on a month end unchanged), then cases are counted per month.
file_month_end = pd.to_datetime(df['file_date']) + pd.tseries.offsets.MonthEnd(0)
df.loc[:, 'last_day_of_file_month'] = file_month_end
filings_per_month = df.groupby('last_day_of_file_month')['case_number'].count()

# Draw the monthly counts as a single black line and save the figure.
fig, ax = plt.subplots()
filings_per_month.plot.line(ax=ax, color='black', zorder=100)
ax.set_ylabel("Number of Evictions")
ax.set_xlabel("Month")
ax.grid(True)
filings_figure_path = os.path.join(OUTPUT_FIGURES, "filings_over_time.png")
figure_utilities.save_figure_and_close(fig, filings_figure_path)


In [18]:
# Read unrestricted dataset into memory.
df = pd.read_csv(INPUT_DATA_PANEL)
# NOTE(review): read_csv is called without index_col, so this call only adds an 'index' column holding
# the default row numbers; 'case_number' is presumably already a regular column -- confirm before removing.
df = df.reset_index()

# Produce summary statistics table.
treatment_date_variable = 'latest_docket_date'
outcomes_of_interest = [f'group_0_crimes_{constants.Analysis.MAIN_RESULTS_RADIUS}m']
summary_statistics_unrestricted, variable_display_names_dict = produce_summary_statistics(df)

# Name the two index levels and order the rows by panel, then by variable.
summary_statistics_unrestricted.index = summary_statistics_unrestricted.index.set_names(["Panel", "Variable"])
column_display_names_dict = {'mean': "Mean", 'std': "S.D.", 'count': "N", '50%': 'Median'}
summary_statistics_unrestricted = summary_statistics_unrestricted.sort_values(['Panel', 'Variable'])

# Keep only outcomes of interest: for every other outcome, remove each of its derived pre-treatment rows
# that is present. Labels that do not appear in the table are simply never matched.
outcomes = constants.Variables.outcomes.copy()
derived_row_prefixes = ("relative_pre_treatment_change_in_",
                        "pre_treatment_change_in_",
                        "total_twenty_seventeen_")
rows_to_drop = [f"{prefix}{outcome}"
                for outcome in outcomes
                if outcome not in outcomes_of_interest
                for prefix in derived_row_prefixes]
is_kept_row = ~summary_statistics_unrestricted.index.get_level_values(1).isin(rows_to_drop)
summary_statistics_unrestricted = summary_statistics_unrestricted.loc[is_kept_row]

# Drop median column.
summary_statistics_unrestricted = summary_statistics_unrestricted.drop(columns='50%')

# Export to LaTeX.
filename = os.path.join(OUTPUT_TABLES, "summary_statistics.tex")
# One left-aligned column per index level and one centered column per statistic, so the column spec always
# matches the exported table (a hard-coded spec would declare one column too many once the median is dropped).
latex_column_format = ("l" * summary_statistics_unrestricted.index.nlevels
                       + "c" * len(summary_statistics_unrestricted.columns))
latex = (summary_statistics_unrestricted
         .rename(index=variable_display_names_dict)
         .rename(columns=column_display_names_dict)
         .style
         .format(formatter={
    'Mean': "{:,.2f}",
    'Median': "{:,.2f}",
    'S.D.': "{:,.2f}",
    'N': "{:,.0f}"})
         .format_index("\\textit{{{}}}", escape="latex", axis=0, level=0)
         .to_latex(None,
                   column_format=latex_column_format,
                   hrules=True,
                   clines="skip-last;data")).replace("{*}", "{3cm}")  # Swap every "{*}" in the generated LaTeX for a fixed 3cm width.
with open(filename, 'w') as file:
    file.write(latex)
summary_statistics_unrestricted

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,count
Panel,Variable,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Panel A: Pre-treatment Outcomes,pre_treatment_change_in_group_0_crimes_250m,-43.568679,121.045313,1507.0
Panel A: Pre-treatment Outcomes,relative_pre_treatment_change_in_group_0_crimes_250m,-3.830126,26.445119,1507.0
Panel A: Pre-treatment Outcomes,total_twenty_seventeen_group_0_crimes_250m,349.495023,263.922444,1507.0
Panel B: Census Tract Characteristics,frac_coll_plus2010,0.322629,0.218759,1507.0
Panel B: Census Tract Characteristics,job_density_2013,18066.397982,45507.927709,1507.0
Panel B: Census Tract Characteristics,med_hhinc2016,46685.742535,25233.006826,1507.0
Panel B: Census Tract Characteristics,poor_share2010,0.286367,0.148444,1507.0
Panel B: Census Tract Characteristics,popdensity2010,23259.527496,14182.431408,1507.0
Panel B: Census Tract Characteristics,share_white2010,0.314273,0.271775,1507.0
Panel C: Case Initiation,for_cause,0.140677,0.347803,1507.0


In [19]:
# Share of non-entity plaintiffs (isEntityP == 0) who are represented by an attorney,
# computed as the mean of hasAttyP over those rows.
is_non_entity_plaintiff = df['isEntityP'].eq(0)
df.loc[is_non_entity_plaintiff, 'hasAttyP'].mean()

0.42735042735042733

In [20]:
# Produce treatment timings table: case counts per latest docket month, split by judgment,
# with month/judgment combinations that never occur counted as zero.
treatment_timings = (df
                     .groupby(['latest_docket_month', 'judgment_for_plaintiff'])['case_number']
                     .count()
                     .unstack('judgment_for_plaintiff')
                     .fillna(0))
# Columns come out ordered by judgment_for_plaintiff value, so the first is the defendant-won count.
treatment_timings.columns = ["Cases Won By Defendant", "Cases Won By Plaintiff"]

# Each month's share of all rows in df.
portion_of_all_cases = (treatment_timings['Cases Won By Plaintiff']
                        + treatment_timings['Cases Won By Defendant']) / len(df)
treatment_timings['Portion of All Cases'] = portion_of_all_cases

# Put a totals row above the per-month rows.
sum_across_filing_date = treatment_timings.sum(axis=0).to_frame("All Months").T
treatment_timings = pd.concat([sum_across_filing_date, treatment_timings], axis=0)
treatment_timings = treatment_timings.rename_axis("Last Docket Date")

# Export to LaTeX.
filename = os.path.join(OUTPUT_TABLES, "treatment_timings.tex")
treatment_timings_formats = {'Cases Won By Plaintiff': '{:,.0f}',
                             'Cases Won By Defendant': '{:,.0f}',
                             'Portion of All Cases': '{:0.2f}'}
(treatment_timings
 .style
 .format(formatter=treatment_timings_formats)
 .to_latex(filename, column_format="lccc", hrules=True))
treatment_timings

Unnamed: 0_level_0,Cases Won By Defendant,Cases Won By Plaintiff,Portion of All Cases
Last Docket Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
All Months,715.0,792.0,1.0
2019-06,46.0,13.0,0.039151
2019-07,62.0,42.0,0.069011
2019-08,67.0,87.0,0.10219
2019-09,73.0,92.0,0.109489
2019-10,88.0,83.0,0.11347
2019-11,61.0,63.0,0.082283
2019-12,66.0,65.0,0.086928
2020-01,78.0,123.0,0.133378
2020-02,83.0,131.0,0.142004


In [21]:
# Calculate the share of cases in high-poverty neighborhoods (poor_share2010 above 0.20).
# Note that df is first narrowed to cases with judgment_for_plaintiff == 1, so both the
# numerator and the denominator below refer to that subset, and df stays narrowed afterwards.
df = df[df['judgment_for_plaintiff'].eq(1)]
original_N = len(df)
is_in_poor_tract = df['poor_share2010'].gt(0.20)
cases_in_poor_tracts = int(is_in_poor_tract.sum())
cases_in_poor_tracts / original_N

0.6439393939393939