In [178]:
import os
import numpy as np
import pandas as pd
import csv
from datetime import datetime
from sklearn.metrics import r2_score
import seaborn as sns
import statsmodels.api as sm

from scipy.stats import ttest_1samp, ttest_ind, t
from scipy.stats import shapiro, normaltest, kstest
from statsmodels.stats.stattools import durbin_watson

from definitions import target_var, country_col, date_col, save_output, fake_num, show_plots, save_figs, fig_size, sign_level
from util_general import get_table_path, get_impl_date, get_trans, get_donor_countries
from plot_functions import plot_predictions, plot_diff, plot_cumsum, plot_cumsum_impl, plot_qq
from statistical_tests import shapiro_wilk_test, t_test_result

# custom functions
from definitions import all_paths, country_col, year_col, stat, incl_countries, incl_years, model_val
from util_general import read_data, validate_input, get_trans, get_data_path, get_impl_date
from estimators import arco, sc, did

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 12})

from sklearn.linear_model import Lasso
from definitions import fig_size, show_plots, save_figs, treatment_countries
from util_general import get_impl_date, get_fig_path, get_formal_title, get_model_color, get_formal_country_name

from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.seasonal import seasonal_decompose
import matplotlib.pyplot as plt

In [159]:
# Run configuration: which fitted model, which treated country and which data
# frequency the cells below operate on.
model = 'lasso'
treatment_country = 'switzerland'
timeframe = 'm'  # presumably the monthly data set — confirm against get_data_path

In [160]:
# Load the table named `total_<timeframe>` from the data directory that
# belongs to the chosen timeframe, then display it.
df = read_data(
    source_path=get_data_path(timeframe=timeframe),
    file_name=f'total_{timeframe}',
)
df

Unnamed: 0,country,date,year,month,co2,gdp,ind_prod,infl,unempl,pop,brent
0,austria,2000-01-01,2000,1,9.726928e+09,0.009051,-0.028,-0.004,0.043,8.003000e+06,25.51
1,austria,2000-02-01,2000,2,8.532685e+09,0.008627,0.043,0.008,0.043,8.004700e+06,27.78
2,austria,2000-03-01,2000,3,8.134567e+09,0.008203,0.006,0.001,0.041,8.006400e+06,27.49
3,austria,2000-04-01,2000,4,6.546738e+09,0.007778,0.018,-0.002,0.038,8.008100e+06,22.76
4,austria,2000-05-01,2000,5,5.603262e+09,0.007029,0.020,-0.001,0.037,8.010100e+06,27.74
...,...,...,...,...,...,...,...,...,...,...,...
4517,united_kingdom,2019-06-01,2019,6,2.864851e+10,0.004439,-0.005,0.000,0.038,6.684433e+07,64.22
4518,united_kingdom,2019-07-01,2019,7,2.872032e+10,0.006109,-0.002,0.000,0.038,6.686800e+07,63.92
4519,united_kingdom,2019-08-01,2019,8,2.809851e+10,0.003989,-0.008,0.004,0.038,6.689167e+07,59.04
4520,united_kingdom,2019-09-01,2019,9,3.070698e+10,0.001869,0.004,0.001,0.037,6.691533e+07,62.83


In [161]:
# Load the `total_<timeframe>_stat` companion table from the same data
# directory (presumably the stationarity-transformed series — confirm with
# the data-preparation code), then display it.
df_stat = read_data(
    source_path=get_data_path(timeframe=timeframe),
    file_name=f'total_{timeframe}_stat',
)
df_stat

Unnamed: 0,country,date,year,month,co2,gdp,ind_prod,infl,unempl,pop,brent
0,austria,2001-01-01,2001,1,-0.019138,-0.002088,-0.019,-0.004,-0.027399,-99999.0,-0.001560
1,austria,2001-02-01,2001,2,0.039611,-0.001670,0.018,0.003,0.000000,-99999.0,0.070813
2,austria,2001-03-01,2001,3,0.022160,-0.001253,-0.011,-0.001,0.027399,-99999.0,-0.115513
3,austria,2001-04-01,2001,4,0.119718,-0.000835,-0.015,0.003,0.052644,-99999.0,0.046260
4,austria,2001-05-01,2001,5,0.024326,-0.000423,0.000,-0.003,0.025318,-99999.0,0.098282
...,...,...,...,...,...,...,...,...,...,...,...
4300,united_kingdom,2019-06-01,2019,6,-0.017357,0.004439,-0.005,-0.003,0.000000,-99999.0,-0.104862
4301,united_kingdom,2019-07-01,2019,7,-0.034553,0.006109,-0.002,0.000,0.000000,-99999.0,-0.004682
4302,united_kingdom,2019-08-01,2019,8,-0.020662,0.003989,-0.008,0.004,0.000000,-99999.0,-0.079417
4303,united_kingdom,2019-09-01,2019,9,-0.005156,0.001869,0.004,-0.003,-0.026668,-99999.0,0.062217


In [162]:
# Read the stored result table for this model / country / timeframe and index
# it by date. It carries the columns 'act', 'pred' and 'error'.
file_name = f'{model}_{treatment_country}_{timeframe}_act_pred_log_diff'
act_pred_log_diff = read_data(
    get_table_path(timeframe, 'results', treatment_country, model),
    file_name,
).set_index('date')

# Keep each column under its own name for the cells below.
act_log_diff, pred_log_diff, error_log_diff = (
    act_pred_log_diff[column] for column in ('act', 'pred', 'error')
)

act_pred_log_diff

Unnamed: 0_level_0,act,pred,error
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2001-01-01,-0.026118,-0.011255,-0.014862
2001-02-01,0.019276,0.015656,0.003619
2001-03-01,-0.007401,0.018095,-0.025496
2001-04-01,0.089918,0.084675,0.005244
2001-05-01,0.008302,0.011037,-0.002736
...,...,...,...
2019-06-01,0.003582,0.020332,-0.016750
2019-07-01,0.014090,0.028164,-0.014075
2019-08-01,0.027670,0.018290,0.009380
2019-09-01,-0.000445,0.020978,-0.021423


In [163]:
# summarize chosen configuration
date_start = df['date'].iloc[0]
date_end = df['date'].iloc[-1]
log, diff_level, = get_trans(timeframe=timeframe)[target_var]

In [164]:
# Untransformed target series of the treated country, restricted to the
# inclusive window [date_start, date_end] and indexed by date.
orig = (
    df
    .loc[lambda frame: frame[country_col].eq(treatment_country)
         & frame[date_col].between(date_start, date_end)]
    .set_index(date_col)[target_var]
)
orig

date
2000-01-01    5.615683e+09
2000-02-01    4.956583e+09
2000-03-01    4.815360e+09
2000-04-01    4.247155e+09
2000-05-01    3.567299e+09
                  ...     
2019-06-01    2.876220e+09
2019-07-01    2.840084e+09
2019-08-01    2.793635e+09
2019-09-01    3.445180e+09
2019-10-01    3.819263e+09
Name: co2, Length: 238, dtype: float64

In [165]:
# Re-apply the configured transformation (optional log, optional differencing
# at lag `diff_level`) to the raw target series so that it can be compared
# with the stored 'act' column.
orig_log = np.log(orig) if log else orig
orig_log_diff = (orig_log.diff(diff_level) if diff_level != 0 else orig_log).dropna()

# Sanity check: the recomputed series must reproduce the stored one.
# The comparison is positional (like the zip below) and uses the absolute
# deviation per observation. A signed sum lets positive and negative errors
# cancel, never flags negative errors, and turns into NaN (which compares
# False) as soon as the two indexes do not align.
n_overlap = min(len(orig_log_diff), len(act_log_diff))
abs_deviation = np.abs(
    orig_log_diff.to_numpy()[:n_overlap] - act_log_diff.to_numpy()[:n_overlap]
)
if not np.all(abs_deviation <= 1e-5):  # also trips on NaN
    raise ValueError('Fault in conversion')

# save act_pred_log_diff_check
# Stored actuals, recomputed actuals ('check') and predictions side by side,
# on the index of the prediction series.
act_pred_log_diff_check = pd.DataFrame(
    list(zip(act_log_diff, orig_log_diff, pred_log_diff)),
    columns=['act', 'check', 'pred'],
).set_index(pred_log_diff.index)
act_pred_log_diff_check['error'] = act_pred_log_diff_check['act'] - act_pred_log_diff_check['pred']
act_pred_log_diff_check

Unnamed: 0_level_0,act,check,pred,error
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2001-01-01,-0.026118,-0.026118,-0.011255,-0.014862
2001-02-01,0.019276,0.019276,0.015656,0.003619
2001-03-01,-0.007401,-0.007401,0.018095,-0.025496
2001-04-01,0.089918,0.089918,0.084675,0.005244
2001-05-01,0.008302,0.008302,0.011037,-0.002736
...,...,...,...,...
2019-06-01,0.003582,0.003582,0.020332,-0.016750
2019-07-01,0.014090,0.014090,0.028164,-0.014075
2019-08-01,0.027670,0.027670,0.018290,0.009380
2019-09-01,-0.000445,-0.000445,0.020978,-0.021423


In [166]:
# Undo the differencing: rebuild the (log-)level series for both the actual
# and the predicted values from their differenced counterparts.
n_obs = len(orig_log)
orig_log_values = orig_log.to_numpy(dtype=float)
act_log = np.zeros(n_obs)
pred_log = np.zeros(n_obs)

if diff_level != 0:
    # The first `diff_level` observations have no differenced value; seed both
    # series with the observed levels.
    act_log[:diff_level] = orig_log_values[:diff_level]
    pred_log[:diff_level] = orig_log_values[:diff_level]

    # Plain arrays give explicit positional access; integer keys on a
    # label-indexed Series rely on a deprecated positional fallback.
    act_steps = act_log_diff.to_numpy()
    pred_steps = pred_log_diff.to_numpy()
    for i in range(diff_level, n_obs):
        # Each level is the level `diff_level` steps back plus the difference.
        # The predicted path builds on its own earlier predictions.
        pred_log[i] = pred_log[i - diff_level] + pred_steps[i - diff_level]
        act_log[i] = act_log[i - diff_level] + act_steps[i - diff_level]
else:
    # No differencing was applied, so the stored series already are the
    # levels. Without this branch both arrays would stay all-zero.
    act_log[:] = act_log_diff.to_numpy()
    pred_log[:] = pred_log_diff.to_numpy()

# Sanity check on the absolute per-observation deviation. A signed sum would
# let errors cancel and would never flag negative deviations.
if not np.all(np.abs(orig_log_values - act_log) <= 1e-5):  # also trips on NaN
    raise ValueError('Fault in conversion')

# act_pred_log
act_pred_log = pd.DataFrame({'act': act_log, 'pred': pred_log}, index=orig_log.index)
act_pred_log['error'] = act_pred_log['act'] - act_pred_log['pred']
# Drop the seed rows, for which no prediction exists.
act_pred_log = act_pred_log.iloc[diff_level:]
act_pred_log

Unnamed: 0_level_0,act,pred,error
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2001-01-01,22.422712,22.437574,-0.014862
2001-02-01,22.343258,22.339639,0.003619
2001-03-01,22.287675,22.313172,-0.025496
2001-04-01,22.259434,22.254190,0.005244
2001-05-01,22.003376,22.006112,-0.002736
...,...,...,...
2019-06-01,21.779743,22.026285,-0.246542
2019-07-01,21.767099,22.058116,-0.291016
2019-08-01,21.750609,21.952070,-0.201460
2019-09-01,21.960242,22.102584,-0.142343


In [167]:
# Undo the log transform, but only if one was applied. `log` is the same flag
# that decided whether np.log was taken when `orig_log` was built.
act = np.exp(act_log) if log else act_log.copy()
pred = np.exp(pred_log) if log else pred_log.copy()

# Sanity check on the level scale: the reconstructed actuals must match the
# raw series. Repeating the log-scale check here would not validate the
# back-transformation at all. The relative tolerance is slightly looser than
# the 1e-5 log-scale bound, so a series that passed that bound passes here too.
if not np.allclose(act, orig.to_numpy(dtype=float), rtol=1e-4, atol=1e-5):
    raise ValueError('Fault in conversion')

# act_pred
act_pred = pd.DataFrame({'act': act, 'pred': pred}, index=orig.index)
act_pred['error'] = act_pred['act'] - act_pred['pred']
# Drop the seed rows, for which no prediction exists.
act_pred = act_pred.iloc[diff_level:]
act_pred

Unnamed: 0_level_0,act,pred,error
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2001-01-01,5.470914e+09,5.552830e+09,-8.191621e+07
2001-02-01,5.053050e+09,5.034795e+09,1.825548e+07
2001-03-01,4.779852e+09,4.903287e+09,-1.234352e+08
2001-04-01,4.646748e+09,4.622447e+09,2.430181e+07
2001-05-01,3.597037e+09,3.606892e+09,-9.854046e+06
...,...,...,...
2019-06-01,2.876220e+09,3.680391e+09,-8.041703e+08
2019-07-01,2.840084e+09,3.799426e+09,-9.593422e+08
2019-08-01,2.793635e+09,3.417140e+09,-6.235051e+08
2019-09-01,3.445180e+09,3.972194e+09,-5.270145e+08
