In [1]:
import os
import itertools
import pandas as pd
import matplotlib.pyplot as plt
import const
from pathlib import Path
src_path = Path.cwd()
save_plots_dir = src_path / "results" / "final_plots"

RESULTS_DIR = "results/diff"
placebo_vals = [True, False]
imputed_vals = [True, False]
incident_order = ['arrest', 'indictment filed', 'conviction']
methods_l = ['OLS', 'OLS + FE', 'Clustered SE', 'Permutation', 'Mean P-Values']

# colors codes: purple, turquoise, blue, pink
methods_colors = ['#76448a', '#2980b9', '#16a085', '#d188c8']
# incident colors: orange, yellow, brown
incidents_colors = ['#dc7633', '#f1c40f', '#935116']
p_val_line_color = '#b03a2e'
p_val_mean_color = '#7f8c8d'

# put all data together:
did_df = pd.DataFrame()

for placebo, imputed in itertools.product(placebo_vals, imputed_vals):
    for col_to_predict in const.columns_to_predict:

        data_path = os.path.join(RESULTS_DIR, col_to_predict, f"did_results_placebo={placebo}_imputed={imputed}_k=10_cosine.csv")

        # Load data
        df = pd.read_csv(data_path)

        df["placebo"] = placebo
        df["imputed"] = imputed
        df.rename({"column": "col_to_predict"}, inplace=True)

        did_df = pd.concat([did_df, df])

# Reverse the Hebrew text label for each authority
did_df['target_name'] = did_df['target_name'].apply(lambda x: x[::-1])
did_df = did_df[did_df['column'].isin(const.columns_to_predict)]
did_df['Mean P-Values'] = did_df[methods_l[:-1]].mean(axis=1)

In [None]:
did_df[(did_df['column'] == 'female_out_migration') & (did_df['placebo'] == False) & (did_df['imputed'] == False) & (did_df['incident_type'] == 'indictment filed')]

## The bootom line: 
### Bar Plot of %cases with effect (Do we see effect of the event on y)

For specific did method (OLS)
- X: col_to_predict
    - Cols 1...49
- Y: % cases over different auth
- Color: event type

In [12]:
def plot_grouped_bar(df, method, subtitle=''):
    # Pivot the dataframe to have 'column' and 'incident_type' as index and 'method' as values
    pivot_df = df.pivot_table(index='column', columns='incident_type', values=method)

    # Reorder the columns based on the desired incident_type order
    pivot_df = pivot_df[incident_order]

    # Plot
    pivot_df.plot(kind='bar', figsize=(16, 6), color=incidents_colors)
    plt.title(f'P-values for {method} across columns and incident types')
    if subtitle:
        plt.suptitle(subtitle)
    plt.ylabel('P-value')
    plt.xlabel('Column')
    plt.axhline(0.05, color=p_val_line_color, linestyle='--', label='Significance threshold (0.05)')
    plt.legend(title='Incident Type')
    # add line on y=1
    plt.axhline(1, color='black', linestyle='-')
    # add ligth green color in the background of the plot where y<0.05
    plt.axhspan(0, 0.05, color='lightgreen', alpha=0.5)
    plt.tight_layout()
    plt.savefig(save_plots_dir / f'pvals_{method}_{subtitle}.png')
    plt.close()

# Plot for each method

for placebo, imputed in itertools.product(placebo_vals, imputed_vals):
    df = did_df[(did_df.placebo == placebo) & (did_df.imputed == imputed)]
    # groipby column and method and avg over authority
    df_numeric = df.select_dtypes(include='number')  # Select only numeric columns
    m_df = df.groupby(['column', 'incident_type'])[df_numeric.columns].mean().reset_index()  # Perform groupby on numeric columns
    for method in methods_l:
        plot_grouped_bar(df=m_df, method=method, subtitle=f'Placebo={placebo}, Imputed={imputed}')
print("finish")

finish


In [10]:
def plot_grouped_bar(df, method, subtitle=''):
    # Pivot the dataframe to have 'column' as index and 'incident_type' as columns and 'method' as values
    pivot_df = df.pivot_table(index='column', columns='incident_type', values=method)

    # Reorder the columns based on the desired incident_type order
    pivot_df = pivot_df[incident_order]

    # Plot
    pivot_df.plot(kind='bar', figsize=(16, 6), color=incidents_colors)
    plt.title(f'Percentage of Significant Authorities for {method} across columns and incident types')
    if subtitle:
        plt.suptitle(subtitle)
    plt.ylabel('Percentage of Authorities (%)')  # Changed ylabel to reflect percentage
    plt.xlabel('Column')
    plt.legend(title='Incident Type')
    print(f" ### plot significance percentage: {method=}, {subtitle}")
    # add y line on 50%
    plt.axhline(50, color="grey", linestyle='--')
    plt.tight_layout()
    plt.savefig(save_plots_dir / f'significance_percentage_{method}_{subtitle}.png')
    plt.close()

# Plot for each method

for placebo, imputed in itertools.product(placebo_vals, imputed_vals):
    df = did_df[(did_df.placebo == placebo) & (did_df.imputed == imputed)]
    
    # Instead of averaging p-values over authorities, calculate the percentage of authorities with p-value < 0.05
    def calc_percent_significant(group):
        result = {}
        for method in methods_l:
            p_values = group[method]
            # Calculate the percentage of p-values less than 0.05
            percent = (p_values < 0.05).mean() * 100
            result[method] = percent
        return pd.Series(result)

    # Apply the percentage calculation to each group defined by 'column' and 'incident_type'
    df = df.groupby(['column', 'incident_type']).apply(calc_percent_significant).reset_index()
    
    for method in methods_l:
        plot_grouped_bar(df=df, method=method, subtitle=f'Placebo={placebo}, Imputed={imputed}')
print("finish")

  df = df.groupby(['column', 'incident_type']).apply(calc_percent_significant).reset_index()


 ### plot significance percentage: method='OLS', Placebo=True, Imputed=True
 ### plot significance percentage: method='OLS + FE', Placebo=True, Imputed=True
 ### plot significance percentage: method='Clustered SE', Placebo=True, Imputed=True
 ### plot significance percentage: method='Permutation', Placebo=True, Imputed=True
 ### plot significance percentage: method='Mean P-Values', Placebo=True, Imputed=True
 ### plot significance percentage: method='OLS', Placebo=True, Imputed=False


  df = df.groupby(['column', 'incident_type']).apply(calc_percent_significant).reset_index()


 ### plot significance percentage: method='OLS + FE', Placebo=True, Imputed=False
 ### plot significance percentage: method='Clustered SE', Placebo=True, Imputed=False
 ### plot significance percentage: method='Permutation', Placebo=True, Imputed=False
 ### plot significance percentage: method='Mean P-Values', Placebo=True, Imputed=False


  df = df.groupby(['column', 'incident_type']).apply(calc_percent_significant).reset_index()


 ### plot significance percentage: method='OLS', Placebo=False, Imputed=True
 ### plot significance percentage: method='OLS + FE', Placebo=False, Imputed=True
 ### plot significance percentage: method='Clustered SE', Placebo=False, Imputed=True
 ### plot significance percentage: method='Permutation', Placebo=False, Imputed=True
 ### plot significance percentage: method='Mean P-Values', Placebo=False, Imputed=True


  df = df.groupby(['column', 'incident_type']).apply(calc_percent_significant).reset_index()


 ### plot significance percentage: method='OLS', Placebo=False, Imputed=False
 ### plot significance percentage: method='OLS + FE', Placebo=False, Imputed=False
 ### plot significance percentage: method='Clustered SE', Placebo=False, Imputed=False
 ### plot significance percentage: method='Permutation', Placebo=False, Imputed=False
 ### plot significance percentage: method='Mean P-Values', Placebo=False, Imputed=False
finish


## Zoom in: 
### Bar plot of p vals

for specific column (deficit) and specific auth (tiberias)
- Line on 0.05
- Line on average of p vals
- Y: p_val
- X: method


In [6]:
# Create a function to generate zoomed-in bar plots for each authority
def plot_pvals_for_authority(df, subtitle=''):
    # Loop over each unique authority (target_name)
    for target_id in df['target_id'].unique():
        # Filter the dataframe for the specific authority
        authority_df = df[df['target_id'] == target_id]
        authority_n = authority_df['target_name'].iloc[0]
        methods = methods_l[:-1]

        for year, df_year in authority_df.groupby('incident_year'):
            # Extract the methods' p-values for each column
            pvals = df_year[['column'] + methods]

            # Set the index to 'column' for plotting
            pvals.set_index('column', inplace=True)

            # Plot the p-values for each method
            pvals.plot(kind='bar', figsize=(16, 6), color=methods_colors)

            # Add title and labels
            plt.title(f'P-values for Authority: {authority_n}, Year: {int(year)}')
            if subtitle:
                plt.suptitle(subtitle)
            plt.ylabel('P-value')
            plt.xlabel('Column')

            # Add significance threshold line at 0.05
            plt.axhline(0.05, color=p_val_line_color, linestyle='--', label='Significance threshold (0.05)')

            # Add the average of p-values across methods in dots
            avg_pvals = pvals.mean(axis=1)
            plt.plot(avg_pvals.index, avg_pvals.values, color=p_val_mean_color, marker='o', linestyle='', label='Average P-value')

            # Show legend
            plt.legend(loc='upper right')

            print(f" ### plot pvals per authoroty: target_id={target_id}, {year=}, {subtitle}")
            plt.tight_layout()
            plt.savefig(save_plots_dir / f'pvals_authority_{target_id}_{year}_{subtitle}.png')
            plt.close()

# Call the function to plot for each authority
for placebo, imputed, incident_type in itertools.product(placebo_vals, imputed_vals, incident_order):
    df = did_df[(did_df.placebo == placebo) & (did_df.imputed == imputed) & (did_df.incident_type == incident_type)]
    plot_pvals_for_authority(df, subtitle=f'Incident Type: {incident_type}, Placebo={placebo}, Imputed={imputed}')
    # if incident_type == 'indictment filed' and placebo and imputed:
    #     break
print("finish")

 ### plot pvals per authoroty: target_id=31, year=2012.0, Incident Type: arrest, Placebo=True, Imputed=True
 ### plot pvals per authoroty: target_id=70, year=2010.0, Incident Type: arrest, Placebo=True, Imputed=True
 ### plot pvals per authoroty: target_id=9600, year=2009.0, Incident Type: arrest, Placebo=True, Imputed=True
 ### plot pvals per authoroty: target_id=9200, year=2007.0, Incident Type: arrest, Placebo=True, Imputed=True
 ### plot pvals per authoroty: target_id=9700, year=2017.0, Incident Type: arrest, Placebo=True, Imputed=True
 ### plot pvals per authoroty: target_id=8500, year=2006.0, Incident Type: arrest, Placebo=True, Imputed=True
 ### plot pvals per authoroty: target_id=7800, year=2016.0, Incident Type: arrest, Placebo=True, Imputed=True
 ### plot pvals per authoroty: target_id=587, year=2013.0, Incident Type: indictment filed, Placebo=True, Imputed=True
 ### plot pvals per authoroty: target_id=70, year=2012.0, Incident Type: indictment filed, Placebo=True, Imputed=Tr

In [20]:
# plot for each column the p-values of the different methods for each incident type and for each authority
# x axis: auth name, y axis: p-value, color: method, facet: incident
def plot_p_vals_per_column(df, subtitle=''):
    # Loop over each unique column
    for column in df['column'].unique():
        # Filter the dataframe for the specific column
        column_df = df[df['column'] == column]
        methods = methods_l[:-1]

        for type, df_type in column_df.groupby('incident_type'):
            # new index name_year
            df_type['new_index_col'] = df_type['target_name'] + "_" + df_type['incident_year'].astype(int).astype(str)
            # Extract the methods' p-values for each authority
            pvals = df_type[['new_index_col'] + methods]

            # Set the index to 'target_name' for plotting
            pvals.set_index(['new_index_col'], inplace=True)

            # Plot the p-values for each method
            pvals.plot(kind='bar', figsize=(16, 6), color=methods_colors)

            # Add title and labels
            plt.title(f'P-values for Column: {column}, Event Type: {type}')
            if subtitle:
                plt.suptitle(subtitle)
            plt.ylabel('P-value')
            plt.xlabel('Authority')

            # Add significance threshold line at 0.05
            plt.axhline(0.05, color=p_val_line_color, linestyle='--', label='Significance threshold (0.05)')

            # Add the average of p-values across methods in dots
            avg_pvals = pvals.mean(axis=1)
            plt.plot(avg_pvals.index, avg_pvals.values, color=p_val_mean_color, marker='o', linestyle='', label='Average P-value')

            # Show legend
            plt.legend(loc='upper right')
            plt.tight_layout()
            plt.savefig(save_plots_dir / f'pvals_column_{column}_{type}_{subtitle}.png')
            plt.close()

# Call the function to plot for each authority
for placebo, imputed, incident_type in itertools.product(placebo_vals, imputed_vals, incident_order):
    df = did_df[(did_df.placebo == placebo) & (did_df.imputed == imputed) & (did_df.incident_type == incident_type)]
    plot_p_vals_per_column(df, subtitle=f'Incident Type: {incident_type}, Placebo={placebo}, Imputed={imputed}')

In [None]:
df[df['target_id'] ==9200]
# df[df['target_id'] ==70]
# df