# 01_summary_statistics.ipynb

In [36]:
import matplotlib.pyplot as plt
plt.rcParams["figure.dpi"] = 300
plt.rcParams['savefig.dpi'] = 300
import numpy as np
import pandas as pd

# Store paths.
INPUT_DATA_UNRESTRICTED = "/Users/arjunshanmugam/Documents/GitHub/seniorthesis/data/03_cleaned/unrestricted.csv"
INPUT_DATA_RESTRICTED = "/Users/arjunshanmugam/Documents/GitHub/seniorthesis/data/03_cleaned/restricted.csv"

# Read unrestricted dataset into memory.
df = pd.read_csv(INPUT_DATA_UNRESTRICTED)

In [45]:
# Panel A: Case Initiaton
panel_A_columns = ['for_cause', 'foreclosure', 'no_cause', 'non_payment', 'for_cause_transfer', 'foreclosure_transfer',
                   'non_payment_transfer', 'no_cause_transfer']
origin_columns = ['initiating_action', 'initiating_action', 'initiating_action','initiating_action', 
                  'initiating_action', 'initiating_action', 'initiating_action', 'initiating_action']
target_values = ["SP Summons and Complaint - Cause",  "SP Summons and Complaint - Foreclosure",
                 "SP Summons and Complaint - No Cause", "SP Summons and Complaint - Non-payment of Rent",
                 "SP Transfer - Cause", "SP Transfer - Foreclosure", "SP Transfer - Non-payment of Rent",
                 "SP Transfer- No Cause"]

for dummy_column, origin_column, target_value in zip(panel_A_columns, origin_columns, target_values):
    df.loc[:, dummy_column] = np.where(df[origin_column] == target_value, 1, 0)

panel_A = df[sorted(panel_A_columns)].describe().T
panel_A = pd.concat([panel_A], keys=["Panel A: Case Initiation"])
panel_A

Unnamed: 0,Unnamed: 1,count,mean,std,min,25%,50%,75%,max
Panel A: Case Initiation,for_cause,37966.0,0.115946,0.320164,0.0,0.0,0.0,0.0,1.0
Panel A: Case Initiation,for_cause_transfer,37966.0,0.006453,0.080073,0.0,0.0,0.0,0.0,1.0
Panel A: Case Initiation,foreclosure,37966.0,0.01928,0.137511,0.0,0.0,0.0,0.0,1.0
Panel A: Case Initiation,foreclosure_transfer,37966.0,0.000975,0.031203,0.0,0.0,0.0,0.0,1.0
Panel A: Case Initiation,no_cause,37966.0,0.099299,0.299067,0.0,0.0,0.0,0.0,1.0
Panel A: Case Initiation,no_cause_transfer,37966.0,0.006216,0.078598,0.0,0.0,0.0,0.0,1.0
Panel A: Case Initiation,non_payment,37966.0,0.735843,0.440889,0.0,0.0,1.0,1.0,1.0
Panel A: Case Initiation,non_payment_transfer,37966.0,0.015988,0.12543,0.0,0.0,0.0,0.0,1.0


In [46]:
# Panel B: Case Resolution
panel_B_columns = ['mediated', 'dismissed', 'voluntary_dismissal', 'defaulted', 'heard']
origin_columns = ['disposition_found', 'disposition_found', 'voluntary_dismissal', 'disposition_found',
                  'disposition_found']
target_values = ["Mediated", "Dismissed", 1, "Defaulted", "Heard"]

for dummy_column, origin_column, target_value in zip(panel_B_columns, origin_columns, target_values):
    if dummy_column == 'voluntary_dismissal':
        continue
    df.loc[:, dummy_column] = np.where(df[origin_column] == target_value, 1, 0)

panel_B_columns.remove('voluntary_dismissal')
panel_B = df[sorted(panel_B_columns)].describe().T
panel_B = pd.concat([panel_B], keys=["Panel B: Case Resolution"])
panel_B

KeyError: 'voluntary_dismissal'

In [None]:
# Panel E: Zestimates Around Treatment Time
# Get month of latest docket date for each row and use to grab Zestimates at different times prior to treatment.
df.loc[:, 'latest_docket_date'] = pd.to_datetime(df['latest_docket_date'])
df.loc[:, 'nan'] = np.nan
panel_E_columns = []
for i in range(-5, 4):
    # This column contains the year-month which is i years relative to treatment for each property.
    offset_docket_month = (df['latest_docket_date'] + pd.tseries.offsets.DateOffset(years=i)).dt.strftime('%Y-%m').copy()
    
    # Some of the year-months will be outside of the range of our data.
    # For instance, we do not have Zestimates 2 years post-treatment for evictions which occurred in 2022. 
    # For these observations, the offset docket month needs to map to the column of nans we created earlier.
    offset_docket_month.loc[~offset_docket_month.isin(df.columns)] = 'nan'
    
    # Set column accordingly. 
    idx, cols = pd.factorize(offset_docket_month)
    new_col_name = f'zestimate_{i}_years_relative_to_treatment'
    panel_E_columns.append(new_col_name)
    df.loc[:, new_col_name] = df.reindex(cols, axis=1).to_numpy()[np.arange(len(df)), idx]

In [None]:
panel_E = df[panel_E_columns].describe().T
panel_E = pd.concat([panel_E], keys=["Panel E: Zestimates Around Treatment"])
panel_E