In [3]:
import pandas as pd
import numpy as np
import multiprocessing
from multiprocessing import Pool
from scipy.stats import ttest_ind
import statsmodels.api as sm

Steps for regression table
- Get meta distributions for each state in 2017
- Store N*N state meta distribution comparison
- Loop pairs through map/reduce
- For each pair: do
    - Run t-test
    - Run regression
    - Store t-test and regression results

This dataset isn't the final one, because we still need to actually cast all of the numeric fields as numeric.

I'm still going to use it, because we need to make progress on the regression table programming. Once we're closer to having clean data, we can come back and use this.

In [4]:
all_states = pd.read_csv('../../Data/transformed_missing_replaced_with_zero.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
all_states.drop(['PERWT', 'YRIMMIG'] , axis=1, inplace=True)

In [6]:
states = list(set((all_states['STATEICP'])))

In [7]:
def get_pairs():
    rt = {}
    
    for idx, s in enumerate(states):

        to_iterate = states[idx+1:]

        for j in to_iterate:

            if s[0] < j[0]:
                string = '{}-{}'.format(s,j)
            else:
                string = '{}-{}'.format(j,s)

            if string not in rt:
                rt[string] = {}
    
    return rt

pairs = get_pairs()

In [8]:
numerics = list(all_states.select_dtypes(include=[np.number]))
numerics.append('STATEICP')
df = all_states[ numerics]

In [9]:
def get_distributions(states, df):
    '''
    Returns a dictionary with the state averages on all columns, for each state
    '''
    rt = {}

    for s in states:
        
        meta = []
        
        state_df = df[ df['STATEICP'] == s]
        
        for h in list(state_df):
            if h == 'STATEICP':
                continue
            mean = state_df[h].mean()
            meta.append(mean)
        
        rt[s] = meta
    return rt
        
distributions = get_distributions(states, df)

In [26]:
def run_regression(s1, s2):
    
    OUTCOME_COL = 'POVERTY'
    
    reg_data = df[ df['STATEICP'].isin([s1,s2])]
    
    reg_data = pd.get_dummies(reg_data, columns = ['STATEICP']).drop('YEAR', axis=1,inplace=False)
        
    ys = reg_data[OUTCOME_COL]
    
    xs = reg_data.drop(OUTCOME_COL, inplace=False, axis=1)

    sm.add_constant(xs)
    model = sm.OLS(ys, xs).fit()

    s = str(model.summary())
    
    splits = s.split('STATEICP')[1].split(' ')
    
    cleaned_splits = [i for i in splits if '.' in i]

    treatment_beta = cleaned_splits[0]
    t_value = cleaned_splits[2]
    return [s1, s2, treatment_beta, t_value]

effect = transform_pair('Arizona-New Jersey')

In [28]:
effect

['Arizona', 'New Jersey', '583.0883', '13.507']

In [37]:
def transform_pair(string):
    significance_level = 0.05
    
    # get distributions for both states
    splits = string.split('-')
    s1, s2 = splits[0], splits[1]
    dist1 = distributions[s1]
    dist2 = distributions[s2]
    
    # run t-test
    sim_t, p_value = ttest_ind(dist1, dist2)
    
    # Need to add some logic here for evaluating policy levers. 
    # Ideally, only one lever is different, and this is the one that they are about to enact 
    
    # if les than .05, fail to reject null hypothesis and conclude difference. Otherwise, they're similar
    if p_value < significance_level:
        print ('Cannot run an RCT at this level of similarity')
        return []
    
    effect = run_regression(s1, s2)
    effect.append(sim_t)
    
    return effect

In [38]:
def run_regressions():

    pool = Pool(processes=multiprocessing.cpu_count())

    regressions = pool.map(transform_pair, list(pairs.keys()))
    pool.close() 
    pool.join()    
    
    return regressions

regressions = run_regressions()

In [40]:
reg_df = pd.DataFrame(regressions, columns = ['Treatment State', 'Control 2', 'Treatment Beta', 'Treatment T Value', 'Similarity T Value'])

In [41]:
reg_df.to_csv('../../Data/regression_table.csv', index=False)

In [42]:
!aws s3 cp '../../Data/regression_table.csv' s3://ep-agent

Completed 68.9 KiB/68.9 KiB (631.0 KiB/s) with 1 file(s) remainingupload: ../../Data/regression_table.csv to s3://ep-agent/regression_table.csv
