In [1]:
import os
import numpy as np
import pandas as pd
import csv
from datetime import datetime
from sklearn.metrics import r2_score

from sklearn.preprocessing import StandardScaler 

import SparseSC

from definitions import target_var, country_col, date_col, save_output, fake_num, show_plots, \
    save_figs, year_col, donor_countries_all, stat, incl_countries, incl_years
from helper_functions_general import get_table_path, get_impl_date, get_trans, get_donor_countries, read_data, get_data_path, flatten
from plot_functions import plot_predictions, plot_diff, plot_cumsum, plot_cumsum_impl, plot_qq
from statistical_tests import shapiro_wilk_test, t_test_result

In [9]:
# --- Run configuration ---------------------------------------------------
# Treated unit and the model label used when building the results path.
treatment_country = 'united_kingdom'
model = 'did'

# `timeframe` and `stat` are interpolated into the input file name when the
# data is read; `timeframe` is also passed to the path helpers.
# NOTE(review): this rebinds `stat`, which is also imported from `definitions`.
timeframe = 'm'
stat = 'stat'

# Implementation date and the number of years kept on each side of it when
# the analysis window is built.
impl_date = '2013-04-01'
x_years = 3

# Forwarded unchanged to get_donor_countries.
prox = False

# Values derived from the settings above.
donor_countries = get_donor_countries(treatment_country=treatment_country, prox=prox)
tables_path_res = get_table_path(
    timeframe=timeframe,
    folder='results',
    country=treatment_country,
    model=model,
)

In [20]:
# Load the input table that matches the configured timeframe and stat.
source_dir = get_data_path(timeframe=timeframe)
df_stat = read_data(source_path=source_dir, file_name=f'total_{timeframe}_{stat}')

# Optional restriction to selected countries/years (currently disabled):
# df = df_stat[(df_stat[country_col].isin(incl_countries)) & (df_stat[year_col].isin(incl_years))]
# With the filter off, `df` is just a second name for `df_stat` (no copy is made).
df = df_stat
df

Unnamed: 0,country,date,year,month,co2,gdp,ind_prod,infl,unempl,pop,brent
0,austria,2001-01-01,2001,1,-0.019138,-0.002088,-0.019,-0.004,-0.001,-99999.0,-0.001560
1,austria,2001-02-01,2001,2,0.039611,-0.001670,0.018,0.003,0.000,-99999.0,0.070813
2,austria,2001-03-01,2001,3,0.022160,-0.001253,-0.011,-0.001,0.001,-99999.0,-0.115513
3,austria,2001-04-01,2001,4,0.119718,-0.000835,-0.015,0.003,0.002,-99999.0,0.046260
4,austria,2001-05-01,2001,5,0.024326,-0.000423,0.000,-0.003,0.001,-99999.0,0.098282
...,...,...,...,...,...,...,...,...,...,...,...
4300,united_kingdom,2019-06-01,2019,6,-0.017357,0.004439,-0.005,-0.003,0.000,-99999.0,-0.104862
4301,united_kingdom,2019-07-01,2019,7,-0.034553,0.006109,-0.002,0.000,0.000,-99999.0,-0.004682
4302,united_kingdom,2019-08-01,2019,8,-0.020662,0.003989,-0.008,0.004,0.000,-99999.0,-0.079417
4303,united_kingdom,2019-09-01,2019,9,-0.005156,0.001869,0.004,-0.003,-0.001,-99999.0,0.062217


In [21]:
# Inspect the date column (all rows) of the loaded frame.
df.loc[:, date_col]

0       2001-01-01
1       2001-02-01
2       2001-03-01
3       2001-04-01
4       2001-05-01
           ...    
4300    2019-06-01
4301    2019-07-01
4302    2019-08-01
4303    2019-09-01
4304    2019-10-01
Name: date, Length: 4305, dtype: object

In [23]:
# Build the analysis window around the implementation date, then pivot the
# target variable to wide format (one row per date, one column per country)
# and standardise it.
#
# Everything is derived from the raw frame `df_stat` (not from `df`, which this
# cell rebinds), so the cell runs on a fresh kernel and gives the same result
# however often it is re-run.
# NOTE(review): if the optional country/year filter in the load cell is
# re-enabled, start from that filtered frame instead of `df_stat`.
window_len = int(12 * x_years)  # rows kept on each side; assumes 12 rows per year — TODO confirm for other timeframes
date_series = df_stat[date_col]

# Position of the first row carrying the implementation date
# (list.index raises ValueError if the date is absent).
impl_date_index = list(date_series).index(impl_date)

# Clamp the start at 0: a negative start would silently wrap around to the
# end of the series instead of stopping at the first available row.
window_start = max(impl_date_index - window_len, 0)
all_periods = date_series.iloc[window_start:impl_date_index + window_len]

# Keep donors plus the treated country, restricted to the window.
panel_countries = donor_countries + [treatment_country]
df = df_stat[(df_stat[country_col].isin(panel_countries)) &
             (df_stat[date_col].isin(all_periods))]

df_pivot = (
    df.pivot(index=date_col, columns=country_col, values=target_var)
    .replace({fake_num: np.nan})   # turn the placeholder value into NaN
    .dropna(axis=1, how='all')     # drop countries without any observation
    .dropna(axis=0, how='any')     # keep only dates observed for every remaining country
)

SS = StandardScaler()
# NOTE(review): `SS.fit` returns `SS` itself, so `SS_treatmentfit` is the same
# object as `SS`, and the pooled fit below is overwritten by the per-column
# `fit_transform` on the next statement. If a pooled scaler is intended, it
# needs its own StandardScaler instance.
SS_treatmentfit = SS.fit(np.array(df_pivot).reshape(-1, 1))
df_pivot_stand = pd.DataFrame(
    SS.fit_transform(df_pivot),
    index=df_pivot.index,
    columns=df_pivot.columns,
)

df_pivot

country,austria,belgium,bulgaria,croatia,czech_republic,germany,hungary,italy,lithuania,netherlands,romania,slovakia,spain,united_kingdom
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2010-04-01,0.125437,0.136824,0.074803,0.040649,0.093205,0.165627,0.033476,0.045961,0.069580,0.103060,0.033835,0.133499,-0.096567,0.048213
2010-05-01,0.119872,0.166816,0.075328,-0.020169,0.055488,0.177549,0.011983,0.035396,0.056182,0.126765,-0.004058,0.068806,-0.048592,0.051327
2010-06-01,0.078225,0.066576,0.057895,-0.029487,0.011787,0.061822,0.034318,0.019321,0.041737,0.050713,0.007701,0.071286,-0.058775,0.009992
2010-07-01,0.117510,0.064822,0.075414,-0.022273,0.061689,0.102458,0.015554,0.046950,0.054507,0.054566,0.012478,0.089266,-0.038976,0.034165
2010-08-01,0.100553,0.041623,0.084914,-0.035519,0.013379,0.055834,0.006551,0.003542,0.003001,0.049765,-0.020114,0.072796,-0.028913,0.018835
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015-11-01,0.020155,-0.000749,0.060059,0.058394,-0.052692,-0.074088,0.069380,0.046194,0.027922,-0.021328,-0.073992,-0.002895,0.066871,-0.065992
2015-12-01,0.010103,-0.125926,0.046151,-0.017037,0.018896,-0.024054,0.022770,-0.002934,0.032294,-0.118717,0.006410,0.024422,-0.012566,-0.158139
2016-01-01,0.040247,-0.068232,-0.049438,0.043975,0.037034,0.003652,0.048144,0.005455,0.103105,-0.053525,0.028571,0.047429,-0.058930,-0.084699
2016-02-01,-0.058547,-0.076030,-0.150211,-0.129142,-0.063887,-0.090846,-0.137455,-0.091369,-0.075183,-0.061972,-0.134975,-0.038392,-0.045915,-0.080322


In [6]:
# Donor pool for the treated country (same call as in the configuration cell).
donor_countries = get_donor_countries(treatment_country=treatment_country, prox=prox)

In [8]:
# Wide table of the target variable for donors + treated country, built from
# whatever `df` currently holds: one row per date, one column per country.
panel_mask = df[country_col].isin(donor_countries + [treatment_country])
df_pivot = (
    df.copy()[panel_mask]
    .pivot(index=date_col, columns=country_col, values=target_var)
    .replace({fake_num: np.nan})   # placeholder value -> NaN
    .dropna(axis=1, how='all')     # countries with no data at all
    .dropna(axis=0, how='any')     # dates missing for any remaining country
)

# Column-wise standardisation, re-wrapped with the original index and columns.
SS = StandardScaler()
scaled_values = SS.fit_transform(df_pivot)
df_pivot_stand = pd.DataFrame(scaled_values, index=df_pivot.index, columns=df_pivot.columns)

df_pivot_stand

country,austria,belgium,bulgaria,croatia,czech_republic,germany,hungary,italy,lithuania,netherlands,romania,slovakia,spain,united_kingdom
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2001-01-01,-0.466642,-0.330772,-0.613861,-1.712273,-0.043002,0.009677,-1.592931,-1.121013,0.161349,0.639140,-1.181376,-1.535142,-1.985825,1.521928
2001-02-01,0.510201,-0.620917,-0.244584,-0.172182,0.339447,0.183391,-0.620744,-0.331569,0.573183,0.386520,-0.275821,-0.549601,-0.576534,0.998912
2001-03-01,0.220037,-0.108355,-0.245114,-1.210309,-0.534371,0.165872,-1.177256,-1.094667,0.863545,0.872550,-0.382276,-0.513527,-1.659520,1.252491
2001-04-01,1.842188,0.430435,0.748654,2.147232,1.057379,1.131645,1.066399,0.999716,1.262776,1.042231,1.628927,2.282259,-1.165086,0.502009
2001-05-01,0.256052,0.028440,0.539953,0.937708,-0.255616,-0.046664,0.363184,0.334136,0.323099,0.533759,1.110982,1.179097,0.305849,0.448073
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-06-01,0.297791,0.000689,-0.289968,0.324622,-0.472677,-0.964563,0.090259,0.391003,-0.619327,-0.435710,-0.068294,-0.811897,-0.777427,-0.043687
2019-07-01,0.379227,0.073213,-0.349029,0.353126,-1.022911,-1.120770,0.073156,0.141715,0.075548,0.131661,-0.084696,-0.752426,-0.811245,-0.361733
2019-08-01,0.201079,0.037662,-0.335756,0.341210,-0.553238,-1.064497,0.022538,0.335218,-0.144153,-0.816472,-0.098586,-0.959914,-0.678730,-0.104817
2019-09-01,0.321701,0.040508,-0.295247,0.343386,0.325310,-0.995458,0.086839,0.205076,0.892070,-0.449566,-0.105338,-0.547589,-0.677188,0.181950


In [None]:
# Prepare the long-format difference-in-differences input: one row per
# (date, country) inside the window around the implementation date, with
# treatment / post / interaction dummies.
tables_path_res = get_table_path(timeframe=timeframe, folder='results', country=treatment_country, model=model)
donor_countries = get_donor_countries(prox=prox, treatment_country=treatment_country)

impl_date = get_impl_date(treatment_country=treatment_country)

# The window is derived from the raw frame `df_stat`, not from `df`: this cell
# rebinds `df` to a date-indexed frame, so reading `df[date_col]` here would
# fail with a KeyError the second time the cell is run.
# NOTE(review): if the optional country/year filter in the load cell is
# re-enabled, start from that filtered frame instead of `df_stat`.
window_len = int(12 * x_years)  # rows kept on each side; assumes 12 rows per year — TODO confirm for other timeframes
date_series = df_stat[date_col]

# Position of the first row carrying the implementation date
# (list.index raises ValueError if the date is absent).
impl_date_index = list(date_series).index(impl_date)

# Clamp the start at 0 so a short pre-period cannot wrap the slice around.
window_start = max(impl_date_index - window_len, 0)
all_periods = date_series.iloc[window_start:impl_date_index + window_len]

panel_countries = donor_countries + [treatment_country]
df = (
    df_stat[(df_stat[country_col].isin(panel_countries)) &
            (df_stat[date_col].isin(all_periods))]
    .set_index(date_col)[[country_col, target_var]]
    .replace({fake_num: np.nan})   # placeholder value -> NaN
    .dropna(axis=0, how='any')     # drop rows with a missing target
)

df_sel = df.copy()
# 1 for rows of the treated country, 0 for donors.
df_sel['treatment_dummy'] = np.where(df_sel[country_col] == treatment_country, 1, 0)
# 1 from the implementation date onwards. The index holds the date column;
# NOTE(review): the comparison is only chronological if dates are ISO
# 'YYYY-MM-DD' strings or real datetimes — confirm what get_impl_date returns.
df_sel['post_dummy'] = np.where(df_sel.index >= impl_date, 1, 0)
# Interaction term: treated country in the post period.
df_sel['treatment_post_dummy'] = df_sel['treatment_dummy'] * df_sel['post_dummy']
df_sel = df_sel[[country_col, target_var, 'treatment_dummy', 'post_dummy', 'treatment_post_dummy']]
df_sel

# Not yet used: the four DiD groups as separate frames.
# treatment_pre = df_sel[(df_sel['treatment_dummy'] == 1) & (df_sel['post_dummy'] == 0)]
# treatment_post = df_sel[(df_sel['treatment_dummy'] == 1) & (df_sel['post_dummy'] == 1)]

# donors_pre = df_sel[(df_sel['treatment_dummy'] == 0) & (df_sel['post_dummy'] == 0)]
# donors_post = df_sel[(df_sel['treatment_dummy'] == 0) & (df_sel['post_dummy'] == 1)]