# Setup

In [1]:
import contextily as cx
import figure_utilities
import statsmodels.api as sm
import constants
from stats_utilities import produce_summary_statistics, test_balance
import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from panel_utilities import get_value_variable_names, prepare_df_for_DiD
import numpy as np
plt.rcParams['savefig.dpi'] = 300
import os
import pandas as pd


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
# Set paths
INPUT_DATA_PANEL = "/Users/ashanmu1/Documents/GitHub/seniorthesis/data/03_cleaned/crime_analysis_monthly.csv"
INPUT_DATA_PANEL_FULL_SAMPLE = "/Users/ashanmu1/Documents/GitHub/seniorthesis/data/03_cleaned/crime_analysis_monthly_full_sample.csv"
INPUT_DATA_BOSTON_TRACTS_SHAPEFILE = "/Users/ashanmu1/Documents/GitHub/seniorthesis/data/01_raw/Census_2010_Tracts"
INPUT_DATA_TRACTS = "/Users/ashanmu1/Documents/GitHub/seniorthesis/data/02_intermediate/tracts.csv"
INPUT_DATA_OFFENSE_CODES = "/Users/ashanmu1/Documents/GitHub/seniorthesis/data/01_raw/rmsoffensecodes.xlsx"
OUTPUT_TABLES = "/Users/ashanmu1/Documents/GitHub/seniorthesis/output/final_paper/tables"
OUTPUT_FIGURES = "/Users/ashanmu1/Documents/GitHub/seniorthesis/output/final_paper/figures"
OUTPUT_STATISTICS = "/Users/ashanmu1/Documents/GitHub/seniorthesis/output/final_paper/summary_statistics_numbers_to_cite.tex"

# Summary Statistics

## Map of Evictions, Colored by Poverty Rate in Census Tract

In [3]:
# Read unrestricted dataset into memory.
df = pd.read_csv(INPUT_DATA_PANEL_FULL_SAMPLE)
df = df.reset_index() # So we can use 'case_number' like a column

# Create spatial data 
df = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['Longitude'], df['Latitude']))
df.loc[df['main_analysis_sample'] == 0, 'geometry'] = df.loc[df['main_analysis_sample'] == 0, 'geometry'].buffer(.0003)
df.loc[df['main_analysis_sample'] == 1, 'geometry'] = df.loc[df['main_analysis_sample'] == 1, 'geometry'].buffer(.0005)
df = df.set_crs("EPSG:4326")
df = df.to_crs("EPSG:3857")


# Plot
fig, ax = plt.subplots(figsize=(8, 10))
ax.set_yticklabels([])
ax.set_yticks([])
ax.set_xticklabels([])
ax.set_xticks([])

# Color census tracts by poverty rate.
boston_tracts_gdf = gpd.read_file(INPUT_DATA_BOSTON_TRACTS_SHAPEFILE)[['GEOID10', 'geometry']].set_index('GEOID10')
boston_tracts_gdf.index = boston_tracts_gdf.index.astype(int)
tract_poverty_rates_df = pd.read_csv(INPUT_DATA_TRACTS, usecols=['tract_geoid', 'poor_share2010'],
                                     index_col='tract_geoid')
boston_tracts_gdf = pd.concat([boston_tracts_gdf, tract_poverty_rates_df], axis=1).dropna(
    subset=['geometry', 'poor_share2010']).drop(index=[25025990101, 25025980101])
boston_tracts_gdf.plot(ax=ax, column=boston_tracts_gdf['poor_share2010'], cmap='OrRd', alpha=0.4, legend=True,
                       legend_kwds={'label': "Poverty Rate of Census Tract",
                                    'shrink': 0.25})
boston_tracts_gdf = boston_tracts_gdf.reset_index()

df.loc[:, 'main_analysis_sample'] = df['main_analysis_sample'].replace({1: "Properties in Analysis Sample", 0: "Properties not in Analysis Sample"})
df.plot(ax=ax,
        column="main_analysis_sample",
        legend=True,
        cmap=colors.ListedColormap(['green', 'red']))

cx.add_basemap(ax=ax, crs="EPSG:3857", source=cx.providers.CartoDB.Positron)




figure_utilities.save_figure_and_close(fig, os.path.join(OUTPUT_FIGURES, "evictions_map.png"))

## Eviction Filings Over Time

In [4]:
df = pd.read_csv(INPUT_DATA_PANEL)

# Plot the number of eviction filings over time.
df.loc[:, 'last_day_of_file_month'] = (pd.to_datetime(df['file_date']) +
                                       pd.tseries.offsets.MonthEnd(0))
filings_per_month = df.groupby('last_day_of_file_month')['case_number'].count()

# Plot eviction filing counts.
fig, ax = plt.subplots()
filings_per_month.plot(ax=ax, kind='line', color='black',
                       zorder=100)
ax.set_ylabel("Number of Evictions")
ax.set_xlabel("Month")
ax.grid(True)
figure_utilities.save_figure_and_close(fig, os.path.join(OUTPUT_FIGURES, "filings_over_time.png"))


## Summary Statistics

In [5]:
# Read dataset into memory.
df = pd.read_csv(INPUT_DATA_PANEL).reset_index()
sample_size = len(df)
df_full_sample = pd.read_csv(INPUT_DATA_PANEL_FULL_SAMPLE).reset_index()

outcomes_of_interest = [f'group_{i}_crimes_{constants.Analysis.MAIN_RESULTS_RADIUS}m' for i in range(5)]
summary_statistics_table = produce_summary_statistics(df)
all_excluded_properties_means = produce_summary_statistics(df_full_sample)

# Rename columns.
summary_statistics_table.index = summary_statistics_table.index.set_names(["Panel", "Variable"])
column_display_names_dict = {'mean': "Mean", 'std': "S.D.",'50%': 'Median'}
summary_statistics_table = summary_statistics_table.sort_values(['Panel', 'Variable'])
summary_statistics_table.index.names = [None, None]

# Keep only outcomes of interest
for outcome in constants.Variables.outcomes:
    if outcome not in outcomes_of_interest:
        summary_statistics_table = summary_statistics_table.drop(f'total_twenty_seventeen_{outcome}',
                                                                                   level=1, axis=0)
        summary_statistics_table = summary_statistics_table.drop(f'month_neg_twelve_{outcome}',
                                                                                   level=1, axis=0)
        summary_statistics_table = summary_statistics_table.drop(f'month_neg_six_{outcome}',
                                                                                   level=1, axis=0)

        
        all_excluded_properties_means = all_excluded_properties_means.drop(f'total_twenty_seventeen_{outcome}',
                                                                                   level=1, axis=0)
        all_excluded_properties_means = all_excluded_properties_means.drop(f'month_neg_twelve_{outcome}',
                                                                                   level=1, axis=0)
        all_excluded_properties_means = all_excluded_properties_means.drop(f'month_neg_six_{outcome}',
                                                                                   level=1, axis=0)




# Calculate differences in characteristics between the main analysis sample and selected alternative samples
summary_statistics_table = pd.concat([pd.concat([summary_statistics_table], keys=["Analysis Sample (N=1,471)"], axis=1), 
                                      pd.concat([all_excluded_properties_means], keys=["Full Sample (N=7,842)"], axis=1)], axis=1)



summary_statistics_table = (summary_statistics_table
                                   .rename(index=constants.Variables.variable_display_names_dict)
                                   .rename(columns=column_display_names_dict))

# Sort
summary_statistics_table = summary_statistics_table.sort_index(level=[0, 1])

# Add column numbers
summary_statistics_table = figure_utilities.add_column_numbers(summary_statistics_table)

# Add spacer column
summary_statistics_table.insert(3, "", pd.Series(index=summary_statistics_table.index))

# Export to LaTeX.
filename = os.path.join(OUTPUT_TABLES, "summary_statistics.tex")
latex = (summary_statistics_table
         .style
         .format(formatter={
    ('Analysis Sample (N=1,471)', 'Mean', '(1)'): "{:,.2f}",
    ('Analysis Sample (N=1,471)', 'Median', '(3)'): "{:,.2f}",
    ('Analysis Sample (N=1,471)', 'S.D.', '(2)'): "{:,.2f}",
    ('Full Sample (N=7,842)', 'Mean', '(4)'): "{:,.2f}",
    ('Full Sample (N=7,842)', 'Median', '(6)'): "{:,.2f}",
    ('Full Sample (N=7,842)', 'S.D.', '(5)'): "{:,.2f}"}, na_rep="")
         .format_index("\\textit{{{}}}", escape="latex", axis=0, level=0)
         .format_index("\\textit{{{}}}", escape="latex", axis=1, level=0)
         .to_latex(None,
                   column_format="llccccccc",
                    multicol_align='c',
                   hrules=True,
                   clines="skip-last;data")).replace("{*}", "{3cm}")

latex = latex.split("\\\\\n")
latex.insert(1, "\\cline{3-5} \\cline{7-9} \n")
latex = "\\\\\n".join(latex)

with open(filename, 'w') as file:
    file.write(latex)
summary_statistics_table

Unnamed: 0_level_0,Unnamed: 1_level_0,"Analysis Sample (N=1,471)","Analysis Sample (N=1,471)","Analysis Sample (N=1,471)",Unnamed: 5_level_0,"Full Sample (N=7,842)","Full Sample (N=7,842)","Full Sample (N=7,842)"
Unnamed: 0_level_1,Unnamed: 1_level_1,Mean,S.D.,Median,Unnamed: 5_level_1,Mean,S.D.,Median
Unnamed: 0_level_2,Unnamed: 1_level_2,(1),(2),(3),Unnamed: 5_level_2,(4),(5),(6)
Panel A: Pre-Treatment Crime Levels,"All Crime Reports, 2017",353.609109,264.137598,311.0,,332.137227,248.87223,299.0
Panel A: Pre-Treatment Crime Levels,"All Crime Reports, Month -12",27.959211,20.850807,24.0,,23.341538,21.742185,19.0
Panel A: Pre-Treatment Crime Levels,"All Crime Reports, Month -6",25.827328,19.750073,22.0,,22.600179,21.0155,18.0
Panel A: Pre-Treatment Crime Levels,"Assault Reports, 2017",57.957852,39.621194,52.0,,55.786124,39.176169,50.0
Panel A: Pre-Treatment Crime Levels,"Assault Reports, Month -12",5.07138,4.043598,4.0,,3.960464,3.880119,3.0
Panel A: Pre-Treatment Crime Levels,"Assault Reports, Month -6",4.814412,4.003598,4.0,,3.858691,3.835944,3.0
Panel A: Pre-Treatment Crime Levels,"Drug Crime Reports, 2017",19.479266,42.73458,7.0,,17.222931,38.427062,7.0
Panel A: Pre-Treatment Crime Levels,"Drug Crime Reports, Month -12",1.422842,3.511206,0.0,,1.070399,2.868352,0.0
Panel A: Pre-Treatment Crime Levels,"Drug Crime Reports, Month -6",1.116927,3.118072,0.0,,0.860605,2.398039,0.0
Panel A: Pre-Treatment Crime Levels,"Larceny Reports, 2017",36.227736,51.977302,22.0,,32.523913,44.996108,20.0


In [6]:
crime_diff_analysis_full = (summary_statistics_table.loc[("Panel A: Pre-Treatment Crime Levels", "All Crime Reports, 2017"), ("Analysis Sample (N=1,471)", "Mean", "(1)")] - 
summary_statistics_table.loc[("Panel A: Pre-Treatment Crime Levels", "All Crime Reports, 2017"), ("Full Sample (N=7,842)", "Mean", "(4)")])

In [7]:
df_full_sample = pd.read_csv(INPUT_DATA_PANEL_FULL_SAMPLE).reset_index()

median = df_full_sample['poor_share2010'].quantile(.5)
crime_diff_below_above_median_poverty = (df_full_sample.loc[df_full_sample['poor_share2010'] > median, 'total_twenty_seventeen_group_0_crimes_250m'].mean() -
                                         df_full_sample.loc[df_full_sample['poor_share2010'] < median, 'total_twenty_seventeen_group_0_crimes_250m'].mean())
crime_diff_analysis_full/crime_diff_below_above_median_poverty

0.13734482569403778

In [8]:
# Share of plaintiffs who are entities
share_plaintiffs_individuals = (100 * (1 - df['isEntityP']).mean()).round(2)

# Share of non entity plaintiffs who are represented by an attorney
share_non_entity_plaintiffs_with_attorney = (100 * df.loc[df['isEntityP'] == 0, 'hasAttyP'].mean()).round(2)

In [9]:
# Statistics of money judgments 
money_judgment = pd.read_csv(INPUT_DATA_PANEL)['judgment']
sixtieth_percentile_value = money_judgment.quantile(.6)
share_with_money_judgment_0 = 100 * (money_judgment == 0).sum() / len(money_judgment) 

In [10]:
money_judgment.median()

170.89

In [11]:
# Produce treatment timings table.
treatment_timings = (df
                     .groupby(['file_month', 'judgment_for_plaintiff'])['case_number']
                     .count()
                     .reset_index()
                     .fillna(0))
treatment_timings = treatment_timings.pivot(index='file_month', columns='judgment_for_plaintiff').fillna(0)
treatment_timings.columns = ["Cases Won By Defendant", "Cases Won By Plaintiff"]
portion_of_all_cases = (treatment_timings['Cases Won By Plaintiff'] + treatment_timings[
    'Cases Won By Defendant']) / len(df)
treatment_timings = pd.concat([treatment_timings, portion_of_all_cases.rename('Portion of All Cases')], axis=1)
sum_across_filing_date = pd.DataFrame(treatment_timings.sum(axis=0)).T
sum_across_filing_date.index = ["All Months"]
treatment_timings = pd.concat([sum_across_filing_date, treatment_timings], axis=0)
# treatment_timings.index = treatment_timings.index.rename("Month of Filing")

# Add column numbers
treatment_timings = figure_utilities.add_column_numbers(treatment_timings)

# Export to LaTeX.
filename = os.path.join(OUTPUT_TABLES, "treatment_timings.tex")
treatment_timings.style.format(formatter={('Cases Won By Plaintiff', '(2)'): '{:,.0f}',
                                          ('Cases Won By Defendant', '(1)'): '{:,.0f}',
                                          ('Portion of All Cases', '(3)'): '{:0.2f}'}).to_latex(filename, column_format="lccc",
                                                                                       hrules=True)
treatment_timings

Unnamed: 0_level_0,Cases Won By Defendant,Cases Won By Plaintiff,Portion of All Cases
Unnamed: 0_level_1,(1),(2),(3)
All Months,694.0,777.0,1.0
2019-05,16.0,15.0,0.021074
2019-06,70.0,83.0,0.104011
2019-07,82.0,127.0,0.14208
2019-08,76.0,86.0,0.110129
2019-09,68.0,90.0,0.10741
2019-10,89.0,71.0,0.10877
2019-11,49.0,75.0,0.084296
2019-12,79.0,105.0,0.125085
2020-01,70.0,72.0,0.096533


In [12]:
# Calculate percent of cases which are in high poverty neighborhoods.
df = df.loc[df['judgment_for_plaintiff'] == 1, :]
original_N = len(df)
cases_in_poor_tracts = len(df.loc[df['poor_share2010'] > 0.20, :])
share_cases_in_poor_tracts = round(100 * (cases_in_poor_tracts / original_N), 2)

In [15]:
# Produce table describing crime groups and their frequency
columns = ['Offense Code', 'Description']
# Read offense codes and their descriptions from BPD spreadsheet
offense_codes_and_descriptions = pd.read_excel(INPUT_DATA_OFFENSE_CODES, usecols=[0, 1])
offense_codes_and_descriptions.columns = columns
offense_codes_and_descriptions = offense_codes_and_descriptions.set_index('Offense Code')

group_0_crimes = pd.DataFrame([['All', '']], columns=columns)
group_0_crimes = pd.concat([group_0_crimes], axis=0, keys=['All Crimes']).reset_index(level=1, drop=True)
group_0_crimes.index.name = "Incident Group"
group_0_crimes = group_0_crimes.reset_index().set_index(["Incident Group", 'Offense Code'])

crime_group_dfs = [group_0_crimes]
offense_code_groups = [constants.Analysis.larceny,
                       constants.Analysis.drugs,
                       constants.Analysis.vandalism,
                       constants.Analysis.assault,
                       constants.Analysis.auto_theft]
labels = ["Larceny", "Drugs", "Vandalism", "Assault", "Auto Theft"]
for offense_codes, label in zip(offense_code_groups, labels):
    crime_group_df = offense_codes_and_descriptions.loc[offense_codes, :].reset_index().drop_duplicates()
    crime_group_df = pd.concat([crime_group_df], axis=0, keys=[label]).reset_index(level=1, drop=True)
    crime_group_df.index.name = "Incident Group"
    crime_group_df = crime_group_df.reset_index().set_index(["Incident Group", 'Offense Code'])
    crime_group_dfs.append(crime_group_df)
crime_group_df = pd.concat(crime_group_dfs, axis=0)

# Export to LaTeX.
# Clean description column to avoid silent LaTeX errrors
crime_group_df.loc[:, 'Description'] = crime_group_df['Description'].str.replace("&", "\\&", regex=False)
crime_group_df.loc[:, 'Description'] = crime_group_df['Description'].str.replace("$", "\\$", regex=False)

# Split into two pages
crime_group_df_page_1 = crime_group_df.loc[["Auto Theft", "Assault"], :]
crime_group_df_page_2 = crime_group_df.loc[["Drugs"], :]
crime_group_df_page_3 = crime_group_df.loc[["Larceny", "Vandalism"], :]

for df, page_number in zip([crime_group_df_page_1, crime_group_df_page_2, crime_group_df_page_3], [1, 2, 3]):
    drop_for_space_reasons = "RECOVERED - MV RECOVERED IN BOSTON (STOLEN OUTSIDE BOSTON)"
    latex = (df
             .loc[df['Description'] != drop_for_space_reasons, :]
             .style
             .format_index("\\textit{{{}}}", escape="latex", axis=0, level=0)
             .to_latex(None,
                       column_format="lll",
                       hrules=True,
                       clines="skip-last;data"))
    with open(os.path.join(OUTPUT_TABLES, f"crime_groups_page_{page_number}.tex"), 'w') as file:
        file.write(latex)

## Write Statistics to File

In [16]:
with open(OUTPUT_STATISTICS, 'w') as file:
    file.write(f"\n\\def\\shareplaintiffsindividuals{'{' + str(share_plaintiffs_individuals) + ' }'}\n")
    file.write(f"\n\\def\\sharenonentityplaintiffswithattorney{'{' + str(share_non_entity_plaintiffs_with_attorney) + ' }'}\n")
    file.write(f"\n\\def\\sharecasesinpoortracts{'{' + str(share_cases_in_poor_tracts) + ' }'}\n")
    file.write(f"\n\\def\\sample_size{'{' + str(sample_size) + ' }'}\n")
    file.write(f"\n\\def\\sharewithmoneyjudgmentzero{'{' +  f'{share_with_money_judgment_0 :0f}'+ ' }'}\n")
    file.write(f"\n\\def\\sixtiethpercentilevalue{'{' +  f'{round(sixtieth_percentile_value, -1):g}' + ' }'}\n")

In [17]:
f'{round(sixtieth_percentile_value, -1):g}'

'450'