# 06_merge.ipynb

In [20]:
import pandas as pd
import geopandas as gpd
from dask.distributed import Client
import dask.dataframe as dd
from dask_jobqueue import SLURMCluster
import matplotlib.pyplot as plt
import contextily as cx
from build_utilities import generate_variable_names, aggregate_crime_to_case_month
import dask_geopandas
# Inputs read by this notebook.
INPUT_DATA_EVICTIONS = "../../data/02_intermediate/evictions.csv"
INPUT_DATA_TRACTS = "../../data/02_intermediate/tracts.csv"
INPUT_DATA_TAX_PARCELS = "../../data/02_intermediate/tax_parcels.gpkg"
INPUT_DATA_ZESTIMATES = "../../data/02_intermediate/zestimates.csv"
# Directory of crime incident files; every "*.csv" inside it is read in section 5a.
INPUT_DATA_CRIME = "../../data/01_raw/crime_incidents"
# Outputs written by this notebook.
OUTPUT_DATA_UNRESTRICTED = "../../data/03_cleaned/unrestricted.csv"
OUTPUT_DATA_ZILLOW = "../../data/03_cleaned/zestimates_analysis.csv"
OUTPUT_DATA_CRIME = "../../data/03_cleaned/crime_analysis.csv"
# When True, each step prints how many observations it drops or matches.
VERBOSE = True
# Initial partition count handed to dask_geopandas; frames are repartitioned by size right after creation.
N_PARTITIONS = 1
# A list of DataFrames, where each DataFrame contains the panel data for a single outcome variable
# and has case_number as its index.
value_vars_to_concat = []

## 1. Loading Evictions Data

In [21]:
# Load evictions data, skipping columns that are not used in this notebook.
to_drop = {'Accuracy Score', 'Accuracy Type', 'Number', 'Street', 'Unit Type', 'Unit Number',
           'State', 'Zip', 'Country', 'Source', 'Census Year', 'State FIPS', 'County FIPS',
           'Place Name', 'Place FIPS', 'Census Tract Code', 'Census Block Code', 'Census Block Group',
           'Metro/Micro Statistical Area Code', 'Metro/Micro Statistical Area Type',
           'Combined Statistical Area Code', 'Metropolitan Division Area Code', 'court_location',
           'defendant', 'defendant_atty', 'defendant_atty_address_apt',
           'defendant_atty_address_city', 'defendant_atty_address_name', 'defendant_atty_address_state',
           'defendant_atty_address_street', 'defendant_atty_address_zip', 'docket_history', 'execution', 'judgment_for',
           'judgment_total', 'latest_docket_date', 'plaintiff', 'plaintiff_atty', 'plaintiff_atty_address_apt',
           'plaintiff_atty_address_city', 'plaintiff_atty_address_name', 'plaintiff_atty_address_state',
           'plaintiff_atty_address_street', 'plaintiff_atty_address_zip', 'Metropolitan Division Area Name',
           'property_address_city', 'property_address_state', 'property_address_street',
           'property_address_zip'}
# A callable `usecols` lets pandas parse the header itself, so quoted column names (including any that
# contain commas) are handled by the CSV parser instead of by splitting the first line on "," by hand.
evictions_df = pd.read_csv(INPUT_DATA_EVICTIONS, usecols=lambda column_name: column_name not in to_drop)
original_N = len(evictions_df)
if VERBOSE:
    print(f"Beginning with {original_N} observations.")

# Drop cases missing file_date.
mask = evictions_df['file_date'].notna()
if VERBOSE:
    print(
        f"Dropping {(~mask).sum()} observations where file_date is missing.")
# Copy explicitly so the column assignments below write to an independent frame rather than to a slice.
evictions_df = evictions_df.loc[mask, :].copy()

# Add file month and year to dataset (parse the dates once and reuse the result).
file_dates = pd.to_datetime(evictions_df['file_date'])
evictions_df.loc[:, 'file_month'] = file_dates.dt.strftime('%Y-%m')
evictions_df.loc[:, 'file_year'] = file_dates.dt.year

# Clean the values in the judgment_for_pdu variable (capitalise the lower-case spellings).
judgment_for_pdu_replacement_dict = {"unknown": "Unknown",
                                     "plaintiff": "Plaintiff",
                                     "defendant": "Defendant"}
evictions_df.loc[:, "judgment_for_pdu"] = (evictions_df.loc[:, "judgment_for_pdu"]
                                           .replace(judgment_for_pdu_replacement_dict))

# Replace missing values in money judgment column with zeroes.
evictions_df.loc[:, 'judgment'] = evictions_df['judgment'].fillna(0)

# Rename duration to case_duration.
evictions_df = evictions_df.rename(columns={'duration': 'case_duration'})

# Drop malformed addresses.
# na=False treats a missing address as "not malformed" so that negating the mask cannot fail on NaN;
# regex=False matches the marker text literally.
malformed_address_mask = evictions_df['property_address_full'].str.contains("span, span span",
                                                                            na=False,
                                                                            regex=False)
if VERBOSE:
    print(f"Dropping {malformed_address_mask.sum()} observations which "
          f"have malformed addresses.")
evictions_df = evictions_df.loc[~malformed_address_mask, :]

# Drop addresses without latitude and longitude.
if VERBOSE:
    print(f"Dropping {evictions_df[['longitude', 'latitude']].isna().any(axis=1).sum()} evictions missing latitude "
          f"or longitude.")
evictions_df = evictions_df.dropna(subset=['longitude', 'latitude'])

Beginning with 40759 observations.
Dropping 0 observations where file_date is missing.
Dropping 24 observations which have malformed addresses.
Dropping 1 evictions missing latitude or longitude.


## 2. Merging Evictions With Census Tract Characteristics

In [22]:
# Attach census tract characteristics to each eviction, then index the result by case_number.
evictions_df = evictions_df.rename(columns={'Full FIPS (tract)': 'tract_geoid'})
# The tract key is read as float; presumably this matches the dtype of the key in evictions_df — confirm.
tracts_df = pd.read_csv(INPUT_DATA_TRACTS, dtype={'tract_geoid': float})
# validate='m:1' makes pandas raise if a tract_geoid appears more than once in the tract file.
evictions_tracts_df = evictions_df.merge(tracts_df, on='tract_geoid', how='left', validate='m:1')
evictions_tracts_df = evictions_tracts_df.set_index('case_number')
if VERBOSE:
    # A non-missing med_hhinc2016 is used as the indicator that a tract row was found.
    n_merged_with_tracts = evictions_tracts_df['med_hhinc2016'].notna().sum()
    print(f"Successfully merged {n_merged_with_tracts} observations with census tracts.")

Successfully merged 40732 observations with census tracts.


## 3. Merging Evictions With Zestimates

In [27]:
# Attach Zestimates to the eviction/tract data by case_number. The right join keeps every eviction,
# whether or not a Zestimate row exists for it; validate='1:1' raises on duplicate case numbers.
zestimates_raw_df = pd.read_csv(INPUT_DATA_ZESTIMATES, index_col='case_number')
evictions_tracts_zestimates_df = zestimates_raw_df.merge(evictions_tracts_df,
                                                         right_index=True,
                                                         left_index=True,
                                                         how='right',
                                                         validate='1:1')
if VERBOSE:
    # An eviction counts as matched when its 2022-12 Zestimate is present.
    successfully_matched_observations = evictions_tracts_zestimates_df['2022-12'].notna().sum()
    print(
        f"Successfully matched {successfully_matched_observations} evictions "
        f"({100 * (successfully_matched_observations / len(evictions_tracts_zestimates_df)) :.2f} percent of observations) to "
        f"Zestimates.")

# Rename columns containing Zestimates.
# The raw columns are named "YYYY-MM", running from 2012-12 through 2022-12.
years = [str(year) for year in range(2013, 2023)]
months = [f"{month:02d}" for month in range(1, 13)]
value_vars = ["2012-12"] + [f"{year}-{month}" for year in years for month in months]
value_vars_zestimates, _, _ = generate_variable_names('zestimate')
# One mapping-based rename replaces a rename-per-column loop, which copied the whole frame each iteration.
# zip() pairs old and new names positionally and stops at the shorter list, exactly as the loop did.
evictions_tracts_zestimates_df = evictions_tracts_zestimates_df.rename(
    columns=dict(zip(value_vars, value_vars_zestimates)))
value_vars_to_concat.append(evictions_tracts_zestimates_df[value_vars_zestimates])

Successfully matched 11496 evictions (28.22 percent of observations) to Zestimates.


## 4. Merging Evictions with Tax Parcels

In [28]:
# Request computing resources from SLURM: each job is submitted to the 'batch' queue with 32 cores,
# 230 GB of memory and a one-hour wall-time limit.
cluster = SLURMCluster(
    queue='batch',
    cores=32,
    memory='230 GB',
    walltime='01:00:00',
    scheduler_options={'dashboard_address': '8787'},
)
# Ask for a single job.
cluster.scale(jobs=1)


In [29]:
# Connect a Dask distributed client to the SLURM-backed cluster requested in the previous cell.
client = Client(cluster)

In [35]:
# Create a GeoDataFrame containing eviction Points as geometry and case_number as a column.
evictions_gdf = gpd.GeoDataFrame(evictions_df, geometry=gpd.points_from_xy(evictions_df['longitude'], evictions_df['latitude']))[['case_number', 'geometry']]
# Coordinates are longitude/latitude (EPSG:4326); reproject to EPSG:26986 (NAD83 / Massachusetts Mainland).
evictions_gdf = evictions_gdf.set_crs("EPSG:4326", allow_override=True).to_crs('EPSG:26986')
# Convert this GeoDataFrame to a Dask GeoDataFrame
evictions_dgdf = dask_geopandas.from_geopandas(evictions_gdf, npartitions=N_PARTITIONS).repartition(partition_size='25 MB')

# Read in tax parcel data.
tax_parcels_dgdf = dask_geopandas.read_file(INPUT_DATA_TAX_PARCELS, npartitions=N_PARTITIONS, layer='layer').repartition(partition_size='25 MB')

# Join tax parcels with evictions, keeping only the geometry of the tax parcels.
evictions_tax_parcels_dgdf = dask_geopandas.sjoin(tax_parcels_dgdf, evictions_dgdf, how='inner', predicate='contains').drop(columns='index_right')
# Drop the eviction which erroneously merges to two parcels.
# The mask is built from the frame being filtered; the previous code indexed `ddf`, a name that is not
# defined at this point in the notebook.
evictions_tax_parcels_dgdf = evictions_tax_parcels_dgdf.loc[evictions_tax_parcels_dgdf['LOC_ID'] != "F_819960_2934955", :]

# Set index to case_number and drop all columns besides geometry.
evictions_tax_parcels_dgdf = evictions_tax_parcels_dgdf.set_index('case_number')['geometry']

Index(['LOC_ID', 'geometry', 'index_right', 'case_number'], dtype='object')


NameError: name 'notavariable' is not defined

## 5a. Merge Evictions With Own-Parcel Crime Data

In [37]:
# Convert eviction data to a Dask-GeoDataFrame with small partitions, ready for the spatial join.
# NOTE(review): `ddf` is not assigned in any earlier cell shown in this notebook; presumably it should be
# derived from `evictions_tax_parcels_dgdf` (parcel geometry per case_number) — confirm before running
# on a fresh kernel.
ddf = dask_geopandas.from_geopandas(ddf, npartitions=N_PARTITIONS).repartition(partition_size='25 MB')

# Read every crime incident CSV lazily with Dask and clean it.
unused_crime_columns = ['OFFENSE_CODE', 'OFFENSE_CODE_GROUP', 'OFFENSE_DESCRIPTION', 'DISTRICT',
                        'REPORTING_AREA', 'SHOOTING', 'YEAR', 'MONTH', 'DAY_OF_WEEK', 'HOUR',
                        'UCR_PART', 'STREET', 'Location']
crime_ddf = dd.read_csv(f"{INPUT_DATA_CRIME}/*.csv", dtype={'REPORTING_AREA': 'object', 'SHOOTING': 'object'})
crime_ddf = crime_ddf.dropna(subset=['Long', 'Lat', 'OCCURRED_ON_DATE'])
crime_ddf = crime_ddf.rename(columns={'OCCURRED_ON_DATE': 'month_of_crime_incident'})
crime_ddf = crime_ddf.drop(columns=unused_crime_columns)
# Keep only the first ten characters of the timestamp (the date part) and reduce it to a "YYYY-MM" string.
incident_dates = dd.to_datetime(crime_ddf['month_of_crime_incident'].str[:10])
crime_ddf['month_of_crime_incident'] = incident_dates.dt.to_period("M").astype(str)
crime_df = crime_ddf.compute()  # Materialise as a pandas DataFrame.

# Convert crime data to a GeoDataFrame of Points, reprojected from EPSG:4326 to EPSG:26986.
crime_points = gpd.points_from_xy(crime_df['Long'], crime_df['Lat'])
crime_gdf = gpd.GeoDataFrame(crime_df, geometry=crime_points)
crime_gdf = crime_gdf.set_crs("EPSG:4326", allow_override=True).to_crs("EPSG:26986")

# Convert crime data to a Dask-GeoDataFrame; dissolve rows that share an INCIDENT_NUMBER into one row each.
crime_dgdf = dask_geopandas.from_geopandas(crime_gdf, npartitions=N_PARTITIONS)
crime_dgdf = crime_dgdf.repartition(partition_size='25 MB')
crime_dgdf = crime_dgdf.dissolve(by='INCIDENT_NUMBER').reset_index()

# Join crime incidents to the eviction geometries that contain them.
crimes_within_evictions_dgdf = dask_geopandas.sjoin(crime_dgdf, ddf, how='inner', predicate='within')

# We no longer have any use for crime incident geometry.
ddf = crimes_within_evictions_dgdf.drop(columns='geometry')

## 5b. Aggregating Own-Parcel Crime Data to Case-Month Level 

In [38]:
# Aggregate the joined crime/eviction rows with the project helper from build_utilities.
# Presumably this returns one row per case_number with one crime-count column per month — confirm in
# build_utilities.aggregate_crime_to_case_month.
ddf = aggregate_crime_to_case_month(ddf)

In [39]:
# Now, ddf contains wide format crime incident data for the 792 properties which experienced crime.
# Merge with the evictions that experienced no crime, stored in df.
# NOTE(review): `df` is not assigned in any earlier cell shown in this notebook; presumably it is the
# eviction-level frame that carries longitude/latitude (both are read from the result in section 6a) —
# confirm and define it explicitly.
# NOTE(review): this cell is not re-runnable as written; after the first run case_number is already the
# index of both frames, so set_index('case_number') no longer finds the column.
ddf = ddf.set_index('case_number')
df = df.set_index('case_number')
# Column-wise concat aligns the two frames on their case_number index.
ddf = pd.concat([df, ddf], axis=1)

In [40]:
# Rename columns containing own-parcel crime incident counts.
# The raw columns are named "YYYY-MM", running from 2015-06 through 2023-01.
years = [str(year) for year in range(2015, 2023)]
months = [f"{month:02d}" for month in range(1, 13)]
value_vars = [f"{year}-{month}" for year in years for month in months]
value_vars = value_vars[5:]  # Skip 2015-01 through 2015-05.
value_vars.append('2023-01')
value_vars_crimes_own_parcel, _, _ = generate_variable_names('crimes_own_parcel')
# One mapping-based rename replaces a rename-per-column loop, which copied the whole frame each iteration.
# zip() pairs old and new names positionally and stops at the shorter list, exactly as the loop did.
ddf = ddf.rename(columns=dict(zip(value_vars, value_vars_crimes_own_parcel)))

In [41]:
# Replace missing crime data with zero for evictions that were not matched to crimes.
# Only the renamed own-parcel crime-count columns are filled; every other column keeps its missing values.
ddf.loc[:, value_vars_crimes_own_parcel] = ddf[value_vars_crimes_own_parcel].fillna(0)

In [42]:
# Set the own-parcel crime counts aside in their own frame.
crimes_own_parcel_data = ddf[value_vars_crimes_own_parcel]
# NOTE(review): the Zestimate panel was appended to `value_vars_to_concat`, whereas this starts a separate
# list named `to_concat`; presumably only one of the two lists is intended — confirm.
to_concat = [crimes_own_parcel_data]
# Remove the counts from ddf so that we are not spatially joining unnecessary data.
ddf = ddf.drop(columns=value_vars_crimes_own_parcel)

## 6a. Merge Evictions with Crimes Within Varying Distances

```text
ddf : GeoDataFrame with only a geometry column, containing Points corresponding to evictions.

for each radius in [60, 90, 140, 200]:
    buffered_gdf = ddf.copy()
    buffered_gdf.geometry = buffered_gdf.geometry.buffer(radius)
```

In [43]:
# Build a fresh Point geometry from each eviction's longitude/latitude, reproject it from EPSG:4326 to
# EPSG:26986, and move case_number back into a column (it is needed as a column for the groupby operation).
eviction_points = gpd.points_from_xy(ddf['longitude'], ddf['latitude'])
ddf = (gpd.GeoDataFrame(ddf, geometry=eviction_points)
          .set_crs("EPSG:4326", allow_override=True)
          .to_crs('EPSG:26986')
          .reset_index())

# TODO: perform merge at each value of radius
for radius in [60, 90, 140, 200]:
    # Copy the eviction geometries and grow each Point into a buffer of the current radius.
    current_gdf = ddf.copy()
    current_gdf.geometry = current_gdf.geometry.buffer(radius)

    # Convert to Dask-GeoDataFrame and repartition.
    current_dgdf = (dask_geopandas.from_geopandas(current_gdf, npartitions=N_PARTITIONS)
                                  .repartition(partition_size='25 MB'))

    # Keep the crime incidents that fall within an eviction's buffer.
    current_dgdf = dask_geopandas.sjoin(crime_dgdf, current_dgdf, how='inner', predicate='within')
    print(type(current_dgdf))

    # We no longer have any use for crime incident geometry.
    current_dgdf = current_dgdf.drop(columns='geometry')
    current_dgdf = aggregate_crime_to_case_month(current_dgdf)

    # Work in progress: stop after the first radius until the remaining steps are written.
    break
    # TODO: finish aggregating crimes to case month, rename value vars, add to to_concat,
    # then continue the for loop to the next iteration!
current_dgdf

<class 'dask_geopandas.core.GeoDataFrame'>


month_of_crime_incident,case_number,2015-06,2015-07,2015-08,2015-09,2015-10,2015-11,2015-12,2016-01,2016-02,...,2022-04,2022-05,2022-06,2022-07,2022-08,2022-09,2022-10,2022-11,2022-12,2023-01
0,19H79SP002248,3.0,6.0,4.0,3.0,13.0,6.0,6.0,2.0,2.0,...,4.0,6.0,10.0,5.0,9.0,2.0,5.0,3.0,2.0,2.0
1,19H82SP01078,,,,,,,,,,...,,,,,,,,,,
2,19H84SP001836,,2.0,2.0,4.0,1.0,1.0,3.0,3.0,3.0,...,2.0,3.0,1.0,1.0,3.0,1.0,1.0,,3.0,3.0
3,19H84SP001837,4.0,9.0,5.0,6.0,6.0,5.0,7.0,10.0,6.0,...,9.0,7.0,4.0,3.0,8.0,8.0,2.0,7.0,7.0,5.0
4,19H84SP001838,1.0,3.0,4.0,4.0,2.0,3.0,7.0,6.0,4.0,...,2.0,2.0,2.0,4.0,1.0,4.0,2.0,3.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5455,21H84SP001196,1.0,4.0,5.0,3.0,1.0,3.0,7.0,1.0,1.0,...,,,1.0,1.0,2.0,1.0,1.0,2.0,,
5456,21H84SP001198,2.0,4.0,3.0,1.0,2.0,,6.0,3.0,2.0,...,1.0,,,1.0,4.0,1.0,1.0,1.0,2.0,
5457,21H84SP001199,4.0,3.0,9.0,5.0,3.0,1.0,4.0,1.0,4.0,...,1.0,2.0,2.0,2.0,5.0,2.0,2.0,1.0,1.0,
5458,21H84SP001202,5.0,,1.0,1.0,3.0,2.0,5.0,2.0,2.0,...,2.0,,,2.0,,,,1.0,,


## 7. Producing the Unrestricted Dataset

In [None]:
# Drop the geometry column, convert to a plain pandas DataFrame, and write the unrestricted dataset to disk.
# NOTE(review): `unrestricted_df` is not assigned in any earlier cell shown in this notebook; presumably
# it should be assembled from the eviction data and the panels collected in `value_vars_to_concat` —
# confirm and define it before this cell.
unrestricted_df = pd.DataFrame(unrestricted_df.drop(columns='geometry'))
unrestricted_df.to_csv(OUTPUT_DATA_UNRESTRICTED)

## 8. Producing the Samples Used in Analysis

In [None]:
# Exclude cases whose disposition_found is "Mediated".
mediated_mask = unrestricted_df['disposition_found'].eq("Mediated")
if VERBOSE:
    print(f"Dropping {mediated_mask.sum()} cases resolved through mediation.")
unrestricted_df = unrestricted_df[~mediated_mask]

In [None]:
# Drop cases resolved via voluntary dismissal (dropped by plaintiff).
# The disposition text is matched literally (regex=False); rows with a missing disposition are kept (na=False).
voluntary_dismissal_mask = unrestricted_df['disposition'].str.contains("R 41(a)(1) Voluntary Dismissal on", na=False, regex=False)
if VERBOSE:
    # Message spelling corrected ("Droppping" -> "Dropping") to match the other log lines.
    print(f"Dropping {voluntary_dismissal_mask.sum()} cases resolved through voluntary dismissal.")
unrestricted_df = unrestricted_df.loc[~voluntary_dismissal_mask, :]

In [None]:
# Exclude cases whose disposition_found is "Other".
disposition_found_other_mask = unrestricted_df['disposition_found'].eq("Other")
if VERBOSE:
    print(f"Dropping {disposition_found_other_mask.sum()} cases where disposition_found is \"Other\"")
unrestricted_df = unrestricted_df[~disposition_found_other_mask]

In [None]:
# Drop rows whose disposition_found and judgment_for_pdu values are inconsistent with each other.

# Case 1: disposition_found is "Defaulted" but judgment_for_pdu is "Defendant".
is_defaulted = unrestricted_df['disposition_found'].eq("Defaulted")
judged_for_defendant = unrestricted_df['judgment_for_pdu'].eq("Defendant")
inconsistent_mask_1 = is_defaulted & judged_for_defendant
if VERBOSE:
    print(f"Dropping {inconsistent_mask_1.sum()} observations where disposition_found is \"Defaulted\" but judgment_for_pdu is \"Defendant\".")
unrestricted_df = unrestricted_df[~inconsistent_mask_1]

# Case 2 (evaluated on the rows that remain): disposition_found is "Dismissed" yet judgment_for_pdu is "Plaintiff".
is_dismissed = unrestricted_df['disposition_found'].eq("Dismissed")
judged_for_plaintiff = unrestricted_df['judgment_for_pdu'].eq("Plaintiff")
inconsistent_mask_2 = is_dismissed & judged_for_plaintiff
if VERBOSE:
    print(f"Dropping {inconsistent_mask_2.sum()} observations where disposition_found is \"Dismissed\" but judgment_for_pdu is \"Plaintiff\".")
unrestricted_df = unrestricted_df[~inconsistent_mask_2]

In [None]:
# Indicator for judgment in favor of the defendant: 1 when the case was dismissed or judgment_for_pdu
# names the defendant, 0 otherwise.
defendant_won_mask = (unrestricted_df['disposition_found'].eq("Dismissed") |
                      unrestricted_df['judgment_for_pdu'].eq("Defendant"))
unrestricted_df.loc[:, 'judgment_for_defendant'] = defendant_won_mask.astype('int64')

# Indicator for judgment in favor of the plaintiff: the complement of the defendant indicator.
unrestricted_df.loc[:, 'judgment_for_plaintiff'] = 1 - unrestricted_df['judgment_for_defendant']


### 8a. Producing the Zillow Sample

In [None]:
# Build the Zillow sample: keep only evictions with a Zestimate in every month column.
zestimates_df = unrestricted_df.copy()
missing_any_zestimate = zestimates_df[value_vars_zestimates].isna().any(axis=1)
has_all_zestimates_mask = ~missing_any_zestimate
if VERBOSE:
    print(f"Limiting sample to {has_all_zestimates_mask.sum()} evictions for which we observe Zestimates at every month from 2012-12 to 2022-12.")
zestimates_df = zestimates_df[has_all_zestimates_mask]
zestimates_df.to_csv(OUTPUT_DATA_ZILLOW)

### 8b. Producing the Crime Sample

In [None]:
# Build the crime sample: evictions in Suffolk County, excluding Chelsea, Revere and Winthrop,
# which is how this notebook identifies Boston.
crime_df = unrestricted_df.copy()
in_suffolk_county = crime_df['County'].eq("Suffolk County")
in_other_suffolk_city = crime_df['City'].isin(["Chelsea", "Revere", "Winthrop"])
boston_mask = in_suffolk_county & ~in_other_suffolk_city
if VERBOSE:
    print(f"Limiting sample to {boston_mask.sum()} observations which are in Boston.")
crime_df = crime_df[boston_mask]
crime_df.to_csv(OUTPUT_DATA_CRIME)