# 01_summary_statistics.ipynb

In [1]:
import matplotlib.pyplot as plt
# Use 300 DPI both for figures rendered in the notebook and for figures written with savefig.
plt.rcParams["figure.dpi"] = 300
plt.rcParams['savefig.dpi'] = 300
import numpy as np
import os
import pandas as pd


# Store paths.
# Every path hangs off a single project root. The default is the original checkout location;
# set the SENIORTHESIS_ROOT environment variable to run the notebook from another location.
PROJECT_ROOT = os.environ.get("SENIORTHESIS_ROOT",
                              "/Users/arjunshanmugam/Documents/GitHub/seniorthesis")
INPUT_DATA_UNRESTRICTED = os.path.join(PROJECT_ROOT, "data", "03_cleaned", "unrestricted.csv")
INPUT_DATA_RESTRICTED = os.path.join(PROJECT_ROOT, "data", "03_cleaned", "restricted.csv")
OUTPUT_TABLES = os.path.join(PROJECT_ROOT, "output", "summary_statistics", "tables")
OUTPUT_FIGURES = os.path.join(PROJECT_ROOT, "output", "summary_statistics", "figures")

# Read unrestricted dataset into memory.
unrestricted_df = pd.read_csv(INPUT_DATA_UNRESTRICTED)

# Drop rows where voluntary dismissal occurred.
# Take an explicit copy: produce_summary_statistics assigns new columns onto the frame it
# receives, and assigning onto a filtered slice triggers pandas' SettingWithCopy behaviour.
unrestricted_df = unrestricted_df.loc[unrestricted_df['voluntary_dismissal'] == 0, :].copy()

In [2]:
def _add_dummies(df: pd.DataFrame, specs: list) -> list:
    """Add one 0/1 column to `df` per (dummy_column, origin_column, target_value) spec.

    The dummy equals 1 where `df[origin_column] == target_value` and 0 otherwise.
    Returns the dummy column names in the order given.
    """
    for dummy_column, origin_column, target_value in specs:
        df[dummy_column] = np.where(df[origin_column] == target_value, 1, 0)
    return [dummy_column for dummy_column, _, _ in specs]


def _describe_panel(df: pd.DataFrame, columns: list, title: str) -> pd.DataFrame:
    """Return `df[columns].describe()` with one row per variable, nested under row key `title`."""
    return pd.concat([df[columns].describe().T], keys=[title])


# Master summary statistics function.
def produce_summary_statistics(df: pd.DataFrame) -> pd.DataFrame:
    """Build the five-panel summary statistics table for an evictions dataset.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns 'initiating_action', 'disposition_found',
        'voluntary_dismissal', 'hasAttyD', 'isEntityD', 'hasAttyP', 'isEntityP', 'judgment',
        'TOTAL_VAL', 'BLDG_VAL', 'LAND_VAL', 'OTHER_VAL', 'UNITS' and 'latest_docket_date',
        plus one column per available month named 'YYYY-MM' (looked up as Zestimates).

    Returns
    -------
    pd.DataFrame
        Columns 'mean', 'std' and 'count'; rows indexed by (panel title, variable name).
        Within Panels A-D the variables are sorted by name; Panel E is ordered from five
        years before to three years after the latest docket date.

    Notes
    -----
    `df` is modified in place: the dummy columns, a helper column named 'nan', the
    'zestimate_{i}_years_relative_to_treatment' columns and a datetime-typed
    'latest_docket_date' are written onto it.
    """
    # Panel A: Case Initiation
    panel_A_columns = _add_dummies(df, [
        ('for_cause', 'initiating_action', "SP Summons and Complaint - Cause"),
        ('foreclosure', 'initiating_action', "SP Summons and Complaint - Foreclosure"),
        ('no_cause', 'initiating_action', "SP Summons and Complaint - No Cause"),
        ('non_payment', 'initiating_action', "SP Summons and Complaint - Non-payment of Rent"),
        ('for_cause_transfer', 'initiating_action', "SP Transfer - Cause"),
        ('foreclosure_transfer', 'initiating_action', "SP Transfer - Foreclosure"),
        ('non_payment_transfer', 'initiating_action', "SP Transfer - Non-payment of Rent"),
        # NOTE(review): no space before the hyphen, unlike the other labels; kept as in the
        # original on the assumption that it matches the raw data -- confirm.
        ('no_cause_transfer', 'initiating_action', "SP Transfer- No Cause"),
    ])
    panel_A = _describe_panel(df, sorted(panel_A_columns), "Panel A: Case Initiation")

    # Panel B: Case Resolution
    panel_B_columns = _add_dummies(df, [
        ('mediated', 'disposition_found', "Mediated"),
        ('dismissed', 'disposition_found', "Dismissed"),
        ('voluntary_dismissal', 'voluntary_dismissal', 1),
        ('defaulted', 'disposition_found', "Defaulted"),
        ('heard', 'disposition_found', "Heard"),
    ])
    panel_B = _describe_panel(df, sorted(panel_B_columns), "Panel B: Case Resolution")

    # Panel C: Defendant and Plaintiff Characteristics
    panel_C_columns = ['hasAttyD', 'isEntityD', 'hasAttyP', 'isEntityP', 'judgment']
    panel_C = _describe_panel(df, sorted(panel_C_columns),
                              "Panel C: Defendant and Plaintiff Characteristics")

    # Panel D: Tax Assessment Records From F.Y. Following Eviction Filing
    # (The title previously read "Panel C: ...", giving the table two Panel C headings.)
    panel_D_columns = ['TOTAL_VAL', 'BLDG_VAL', 'LAND_VAL', 'OTHER_VAL', 'UNITS']
    panel_D = _describe_panel(df, sorted(panel_D_columns),
                              "Panel D: Assessor Records From Post-Filing F.Y.")

    # Panel E: Zestimates Around Treatment Time
    # Parse the docket dates once into a datetime Series. The column is replaced with
    # `df[...] =` rather than `df.loc[:, ...] =`: on pandas >= 2.0 the latter writes in place,
    # leaves an object-dtype column, and the `.dt` accessor below would then raise.
    docket_dates = pd.to_datetime(df['latest_docket_date'])
    df['latest_docket_date'] = docket_dates
    # All-missing column that out-of-range months are mapped to.
    df['nan'] = np.nan
    row_positions = np.arange(len(df))
    panel_E_columns = []
    for i in range(-5, 4):
        # Year-month which is i years relative to treatment for each property.
        offset_docket_month = (docket_dates + pd.tseries.offsets.DateOffset(years=i)).dt.strftime('%Y-%m')

        # Some of the year-months will be outside of the range of our data.
        # For instance, we do not have Zestimates 2 years post-treatment for evictions which
        # occurred in 2022. Those rows (and rows with a missing date) point at the 'nan' column.
        offset_docket_month = offset_docket_month.where(offset_docket_month.isin(df.columns), 'nan')

        # For every row, pick the value from the column named by that row's year-month.
        idx, cols = pd.factorize(offset_docket_month)
        new_col_name = f'zestimate_{i}_years_relative_to_treatment'
        panel_E_columns.append(new_col_name)
        df[new_col_name] = df.reindex(cols, axis=1).to_numpy()[row_positions, idx]

    panel_E = _describe_panel(df, panel_E_columns, "Panel E: Zestimates Around Treatment")

    # Concatenate Panels A-E
    summary_statistics = pd.concat([panel_A, panel_B, panel_C, panel_D, panel_E], axis=0)[['mean', 'std', 'count']]

    return summary_statistics

In [8]:
# Produce summary statistics for unrestricted sample.
summary_statistics_unrestricted = produce_summary_statistics(unrestricted_df)

# Rename rows.
# Display names are keyed by variable name instead of being zipped positionally against the
# row order. Panels A-D arrive sorted by variable name, so a positional list silently
# mislabels rows whenever its order differs from the sorted order (the assessor rows sort as
# BLDG_VAL, LAND_VAL, OTHER_VAL, TOTAL_VAL, UNITS).
variable_display_names_dict = {
    # Panel A: Case Initiation
    'for_cause': "For cause",
    'for_cause_transfer': "For cause (transfer)",
    'foreclosure': "Foreclosure",
    'foreclosure_transfer': "Foreclosure (transfer)",
    'no_cause': "No cause",
    'no_cause_transfer': "No cause (transfer)",
    'non_payment': "Non-payment of rent",
    'non_payment_transfer': "Non-payment of rent (transfer)",
    # Panel B: Case Resolution
    'defaulted': "Case defaulted",
    'dismissed': "Case involuntarily dismissed",
    'heard': "Case heard",
    'mediated': "Case mediated",
    'voluntary_dismissal': "Case voluntarily dismissed",
    # Panel C: Defendant and Plaintiff Characteristics
    'hasAttyD': "Defendant has an attorney",
    'hasAttyP': "Plaintiff has an attorney",
    'isEntityD': "Defendant is an entity",
    'isEntityP': "Plaintiff is an entity",
    'judgment': "Money judgment",
    # Panel D: Assessor Records
    'TOTAL_VAL': "Total property value",
    'BLDG_VAL': "Building value",
    'LAND_VAL': "Land value",
    'OTHER_VAL': "Other value",
    'UNITS': "Units",
    # Panel E: Zestimates Around Treatment
    'zestimate_-5_years_relative_to_treatment': "Five years before latest docket date",
    'zestimate_-4_years_relative_to_treatment': "Four years before latest docket date",
    'zestimate_-3_years_relative_to_treatment': "Three years before latest docket date",
    'zestimate_-2_years_relative_to_treatment': "Two years before latest docket date",
    'zestimate_-1_years_relative_to_treatment': "One year before latest docket date",
    'zestimate_0_years_relative_to_treatment': "Latest docket date",
    'zestimate_1_years_relative_to_treatment': "One year after latest docket date",
    'zestimate_2_years_relative_to_treatment': "Two years after latest docket date",
    'zestimate_3_years_relative_to_treatment': "Three years after latest docket date",
}

# Fail loudly if the table contains a variable with no display name.
variable_names = summary_statistics_unrestricted.index.get_level_values(1)
unlabelled_variables = [name for name in variable_names if name not in variable_display_names_dict]
if unlabelled_variables:
    raise KeyError(f"No display name defined for: {unlabelled_variables}")
display_names = [variable_display_names_dict[name] for name in variable_names]

summary_statistics_unrestricted = summary_statistics_unrestricted.rename(index=variable_display_names_dict)


# Rename columns.
summary_statistics_unrestricted.index = summary_statistics_unrestricted.index.set_names(["Panel", "Variable"])
column_display_names_dict = {'mean': "Mean", 'std': "S.D.", 'count': "N"}
summary_statistics_unrestricted = summary_statistics_unrestricted.rename(columns=column_display_names_dict)

# Export to LaTeX.
filename = os.path.join(OUTPUT_TABLES, "summary_statistics.tex")
latex = (summary_statistics_unrestricted
         .style
         .format(thousands=",",
                 formatter={'Mean': "{:.2f}",
                            'S.D.': "{:.2f}",
                            'N': "{:,.0f}"})
         # Italicise the panel titles (index level 0).
         .format_index("\\textit{{{}}}", escape="latex", axis=0, level=0)
         .to_latex(None,
                   hrules=True,
                   clines="skip-last;data",
                   position="H")).replace("{*}", "{4cm}")  # Fixed 4cm width for the multirow panel-title cells.
with open(filename, 'w') as file:
    file.write(latex)

In [14]:
# Load restricted data.
restricted_df = pd.read_csv(INPUT_DATA_RESTRICTED)
# Drop rows where voluntary dismissal occurred.
restricted_df = restricted_df.loc[restricted_df['voluntary_dismissal'] == 0, :]

# Separate into treatment and control groups.
# Explicit copies, because produce_summary_statistics writes new columns onto its input.
restricted_df_treatment = restricted_df.loc[restricted_df['judgment_for_plaintiff'] == 1, :].copy()
restricted_df_control = restricted_df.loc[restricted_df['judgment_for_plaintiff'] == 0, :].copy()

# Produce summary statistics for treatment group.
summary_statistics_treatment = produce_summary_statistics(restricted_df_treatment).drop(columns='std')
summary_statistics_treatment = pd.concat({"Treatment Group": summary_statistics_treatment}, axis=1, names=["", ""])

# Produce summary statistics for control group.
summary_statistics_control = produce_summary_statistics(restricted_df_control).drop(columns='std')
summary_statistics_control = pd.concat({"Control Group": summary_statistics_control}, axis=1, names=["", ""])

# Combine tables.
summary_statistics_restricted = pd.concat([summary_statistics_control, summary_statistics_treatment], axis=1)

# Rename rows.
summary_statistics_restricted = summary_statistics_restricted.rename(index=variable_display_names_dict)

# Rename columns.
summary_statistics_restricted.index = summary_statistics_restricted.index.set_names(["Panel", "Variable"])
column_display_names_dict = {'mean': "Mean", 'std': "S.D.", 'count': "N"}
summary_statistics_restricted = summary_statistics_restricted.rename(columns=column_display_names_dict)

# Export to LaTeX.
# Build the formatter from the columns actually present, so it stays correct whether or not
# the 'std' column is dropped above. N uses "{:,.0f}" to match the unrestricted table: the
# `thousands` argument alone does not add separators when an explicit format string is given.
statistic_formats = {"Mean": "{:.2f}", "S.D.": "{:.2f}", "N": "{:,.0f}"}
balance_table_formatter = {column: statistic_formats[column[1]]
                           for column in summary_statistics_restricted.columns}

filename = os.path.join(OUTPUT_TABLES, "balance_table.tex")
latex = (summary_statistics_restricted
         .style
         .format(thousands=",", formatter=balance_table_formatter)
         # Italicise the panel titles (index level 0).
         .format_index("\\textit{{{}}}", escape="latex", axis=0, level=0)
         .to_latex(None,
                   hrules=True,
                   clines="skip-last;data",
                   position="H",
                   multicol_align="c")).replace("{*}", "{4cm}")  # Fixed 4cm width for the multirow panel-title cells.
with open(filename, 'w') as file:
    file.write(latex)
summary_statistics_restricted

Unnamed: 0_level_0,Unnamed: 1_level_0,Control Group,Control Group,Control Group,Treatment Group,Treatment Group,Treatment Group
Unnamed: 0_level_1,Unnamed: 1_level_1,Mean,S.D.,N,Mean,S.D.,N
Panel,Variable,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Panel A: Case Initiation,For cause,0.120241,0.3252877,3651.0,0.1194434,0.3243247,10708.0
Panel A: Case Initiation,For cause (transfer),0.01013421,0.1001711,3651.0,0.005883452,0.07648126,10708.0
Panel A: Case Initiation,Foreclosure,0.01451657,0.1196234,3651.0,0.03259245,0.1775757,10708.0
Panel A: Case Initiation,Foreclosure (transfer),0.0008216927,0.02865733,3651.0,0.001307434,0.03613649,10708.0
Panel A: Case Initiation,No cause,0.1196932,0.3246469,3651.0,0.1016063,0.3021439,10708.0
Panel A: Case Initiation,No cause (transfer),0.01506437,0.1218257,3651.0,0.004856182,0.06952015,10708.0
Panel A: Case Initiation,Non-payment of rent,0.6858395,0.4642442,3651.0,0.7217968,0.448135,10708.0
Panel A: Case Initiation,Non-payment of rent (transfer),0.0336894,0.1804532,3651.0,0.01251401,0.1111691,10708.0
Panel B: Case Resolution,Case defaulted,0.0,0.0,3651.0,0.6970489,0.4595557,10708.0
Panel B: Case Resolution,Case involuntarily dismissed,0.9917831,0.09028644,3651.0,0.0,0.0,10708.0
