In [1]:
from sklearn.linear_model import LogisticRegression as lr
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn import metrics
import math
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('classic')
%matplotlib inline

import seaborn as sns
sns.set(rc={'figure.figsize':(16,10)}, font_scale=1.3)

import networkx as nx
from networkx.algorithms.tree.branchings import maximum_branching
import scipy.stats as stats
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix

import random

In [2]:
def get_treatment_level(input, bins):
    """Map a value onto an ordinal treatment level using cut points.

    The level is the position of the first cut point in ``bins`` that is
    strictly greater than ``input``. A value that is not below any cut point
    gets level ``len(bins)``. With three cut points this yields the levels
    0, 1, 2 and 3; any other number of cut points is supported as well.

    Parameters
    ----------
    input : number
        Value to discretise. (The name shadows the builtin but is kept so
        that existing keyword callers keep working.)
    bins : sequence of numbers
        Cut points, tested in the order given; expected to be ascending.

    Returns
    -------
    int
        Level in the range ``0 .. len(bins)``.
    """
    for level, upper_bound in enumerate(bins):
        if input < upper_bound:
            return level
    # Not below any cut point: the value belongs to the top level.
    return len(bins)

def logit(p):
    """Return the log-odds ``ln(p / (1 - p))`` of the probability ``p``."""
    odds = p / (1 - p)
    return math.log(odds)

def plot_propensity_plots(predictions, predictions_logit, T):
    """Show side-by-side density plots of the propensity score and its logit.

    The left panel is a KDE of ``predictions[:, 1]`` and the right panel a KDE
    of ``predictions_logit``; both are coloured by ``T``. The right panel also
    gets a dashed vertical reference line at -0.4.
    """
    figure, (score_ax, logit_ax) = plt.subplots(1, 2)
    figure.suptitle('Density distribution plots for propensity score and logit(propensity score).')

    # Left panel: raw propensity score (second column of the predictions).
    sns.kdeplot(x=predictions[:, 1], hue=T, ax=score_ax)
    score_ax.set_title('Propensity Score')

    # Right panel: logit-transformed score with the reference line.
    sns.kdeplot(x=predictions_logit, hue=T, ax=logit_ax)
    logit_ax.axvline(-0.4, ls='--')
    logit_ax.set_title('Logit of Propensity Score')

    plt.show()

def create_maximum_branching_graph(df_data):
    """Build a weighted graph over the rows of ``df_data`` and return its maximum branching.

    ``df_data`` must expose ``osward`` after ``reset_index()`` and contain the
    columns ``treatment`` and ``propensity_score_logit``. Every ``osward``
    value becomes a node. Each pair of rows with different ``treatment``
    values is joined by a single undirected edge whose weight is

        (|logit_a - logit_b| + 0.0001) / |treatment_a - treatment_b|

    The result of networkx's ``maximum_branching`` on that graph is returned.
    """
    epsilon = 0.0001  # added to the logit gap before dividing by the treatment gap
    frame = df_data.reset_index()

    G = nx.Graph()
    G.add_nodes_from(frame['osward'].tolist())

    for _, row in frame.iterrows():
        # Only rows with a different treatment value are candidate partners,
        # so the treatment gap used as the divisor below is never zero.
        differently_treated = frame[frame['treatment'] != row.treatment]
        for _, other in differently_treated.iterrows():
            # The graph is undirected: a pair already linked from the other
            # side is skipped (the weight is symmetric anyway).
            if G.has_edge(row.osward, other.osward):
                continue
            logit_gap = abs(row.propensity_score_logit - other.propensity_score_logit)
            treatment_gap = abs(row.treatment - other.treatment)
            G.add_edge(row.osward, other.osward, weight=(logit_gap + epsilon) / treatment_gap)

    return maximum_branching(G)

In [3]:
# Load the per-year confounder tables and index each one by its 'osward' column.
full_dataset_confounders_2011 = pd.read_csv(
    'processed_data/full_dataset_confounders_2011.csv'
).set_index('osward')
full_dataset_confounders_2012 = pd.read_csv(
    'processed_data/full_dataset_confounders_2012.csv'
).set_index('osward')
full_dataset_confounders_2013 = pd.read_csv(
    'processed_data/full_dataset_confounders_2013.csv'
).set_index('osward')

# Every count column plus the outcome column ('QUANTITY').
all_columns_to_drop = [
    'arts_count', 'cinema_count', 'gallery_count', 'comm_center_count',
    'dance_count', 'lgbt_count', 'library_count', 'museum_count',
    'music_count', 'outdoor_count', 'pub_count', 'skate_count',
    'theatre_count', 'total_count', 'QUANTITY',
]


def _drop_list_keeping(kept_column):
    """Return ``all_columns_to_drop`` without ``kept_column``, order preserved."""
    return [column for column in all_columns_to_drop if column != kept_column]


# One drop list per analysis: everything above except the single column kept.
all_columns_to_drop_year = _drop_list_keeping("total_count")
all_columns_to_drop_arts = _drop_list_keeping("arts_count")
all_columns_to_drop_cinema = _drop_list_keeping("cinema_count")
all_columns_to_drop_gallery = _drop_list_keeping("gallery_count")
all_columns_to_drop_comm_center = _drop_list_keeping("comm_center_count")
all_columns_to_drop_dance = _drop_list_keeping("dance_count")
all_columns_to_drop_lgbt = _drop_list_keeping("lgbt_count")
all_columns_to_drop_library = _drop_list_keeping("library_count")
all_columns_to_drop_museum = _drop_list_keeping("museum_count")
all_columns_to_drop_music = _drop_list_keeping("music_count")
all_columns_to_drop_outdoor = _drop_list_keeping("outdoor_count")
all_columns_to_drop_pub = _drop_list_keeping("pub_count")
all_columns_to_drop_skate = _drop_list_keeping("skate_count")
all_columns_to_drop_theatre = _drop_list_keeping("theatre_count")

In [14]:
def null_model(input_data, treatment_column, columns_to_drop, bins, iterations, random_state=None):
    """Estimate treatment effects under randomly permuted treatment labels.

    A propensity model (standardisation followed by logistic regression) is
    fitted on the confounders against the *observed* treatment levels. In
    every iteration the values of ``treatment_column`` are shuffled across
    rows, binned with ``bins`` and used as the ``treatment`` labels for the
    maximum-branching matching. For each edge ``(u, v)`` of the resulting
    branching one effect is computed as
    ``(outcome_u - outcome_v) / (treatment_u - treatment_v)``.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Indexed by ``osward``. Must contain ``treatment_column``, the outcome
        column ``QUANTITY`` and the confounder columns. It is not modified.
    treatment_column : str
        Column holding the raw treatment values (cast to ``int``).
    columns_to_drop : list of str
        Columns removed before fitting the propensity model. It must not
        contain ``treatment_column``.
    bins : sequence of numbers
        Cut points passed to ``get_treatment_level``.
    iterations : int
        Number of permutations to run.
    random_state : int or None, optional
        Seed for the permutations. With ``None`` (default) the global NumPy
        random state is used, exactly as before this parameter existed.

    Returns
    -------
    list of list
        One inner list per iteration, holding one effect per branching edge.
    """
    if iterations <= 0:
        return []

    # Use the global NumPy RNG unless a seed is given, so that notebooks that
    # rely on np.random.seed() keep producing the same permutations.
    rng = np.random if random_state is None else np.random.default_rng(random_state)

    # --- Propensity model -------------------------------------------------
    # The model is fitted on the observed (unpermuted) treatment levels and
    # therefore does not depend on the permutation; it is fitted once here
    # instead of being refitted with identical inputs in every iteration.
    subset = input_data.drop(columns_to_drop, axis=1)
    subset[treatment_column] = [
        get_treatment_level(x, bins) for x in list(subset[treatment_column].astype('int'))
    ]
    T = subset[treatment_column]
    X = subset.loc[:, subset.columns != treatment_column]

    # NOTE(review): the recorded notebook output shows the solver stopping at
    # its iteration limit here. Raising max_iter would change the fitted
    # scores, so the default is kept.
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('logistic_classifier', lr())
    ])
    pipe.fit(X, T)

    # The propensity score is the second column of predict_proba.
    predictions = pipe.predict_proba(X)
    predictions_logit = np.array([logit(xi) for xi in predictions[:, 1]])
    # plot_propensity_plots(predictions, predictions_logit, T)

    # Frame shared by all iterations: confounders, scores and outcome.
    # An explicit copy avoids writing into a slice of ``subset``.
    matching_base = X.copy()
    matching_base['propensity_score'] = predictions[:, 1]
    matching_base['propensity_score_logit'] = predictions_logit
    matching_base['outcome'] = input_data['QUANTITY']
    outcome = matching_base['outcome']

    raw_treatment = input_data[treatment_column].astype('int').to_numpy()

    # --- Permutation loop ---------------------------------------------------
    effects = []
    for _ in range(iterations):
        permuted_treatment = rng.permutation(raw_treatment)

        df_data = matching_base.copy()
        df_data['treatment'] = [get_treatment_level(x, bins) for x in permuted_treatment]
        treatment = df_data['treatment']

        edmonds_applied = create_maximum_branching_graph(df_data)
        # Label-based lookups (.loc) by the osward identifiers stored as nodes.
        treatment_effect = [
            (outcome.loc[u] - outcome.loc[v]) / (treatment.loc[u] - treatment.loc[v])
            for (u, v) in edmonds_applied.edges()
        ]
        effects.append(treatment_effect)

    return effects

In [36]:
# Run the permutation null model on each yearly dataset with identical settings.
null_model_settings = {
    'treatment_column': 'total_count',
    'columns_to_drop': all_columns_to_drop_year,
    'bins': [4, 7, 12],
    'iterations': 10,
}
effects_2011 = null_model(full_dataset_confounders_2011, **null_model_settings)
effects_2012 = null_model(full_dataset_confounders_2012, **null_model_settings)
effects_2013 = null_model(full_dataset_confounders_2013, **null_model_settings)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [32]:
def calculate_averages(effects):
    """Print summary statistics of the effects, averaged over all iterations.

    For every iteration the mean, minimum, 25th percentile, 75th percentile
    and maximum of its effects are computed; each statistic is then averaged
    over the number of iterations actually present in ``effects``.

    Parameters
    ----------
    effects : list of sequences of numbers
        One sequence of effects per iteration (as returned by ``null_model``).

    Raises
    ------
    ValueError
        If ``effects`` contains no iterations.
    """
    n_iterations = len(effects)
    if n_iterations == 0:
        raise ValueError('effects must contain at least one iteration')

    ate = 0
    minimum = 0
    percentile_25 = 0
    percentile_75 = 0
    maximum = 0
    for eff in effects:
        ate += np.mean(eff)
        minimum += np.min(eff)
        percentile_25 += np.percentile(eff, 25)
        percentile_75 += np.percentile(eff, 75)
        maximum += np.max(eff)

    # Divide by the real number of iterations rather than a fixed 10, so the
    # averages stay correct when null_model is run with another count.
    print('ATE:', ate/n_iterations, 'Min:', minimum/n_iterations,'25th %:', percentile_25/n_iterations, '75th %:', percentile_75/n_iterations, 'Max:', maximum/n_iterations)

In [37]:
# Print the averaged effect statistics for each year under a year heading.
for year_label, year_effects in (
    ('2011', effects_2011),
    ('2012', effects_2012),
    ('2013', effects_2013),
):
    print(year_label)
    calculate_averages(year_effects)

2012
ATE: 7.728811473101918 Min: -536.2983123280869 25th %: -71.50712998435475 75th %: 85.9880156042402 Max: 584.4928193578542
