In [1]:
import os
import numpy as np
import pandas as pd
import csv
from datetime import datetime
from sklearn.metrics import r2_score

from sklearn.preprocessing import StandardScaler 

import SparseSC

from definitions import target_var, country_col, date_col, save_output, fake_num, show_plots, \
    save_figs, year_col, donor_countries_all, stat, incl_countries, incl_years
from util_general import get_table_path, get_impl_date, get_trans, get_donor_countries, read_data, get_data_path, flatten
from plot_functions import plot_predictions, plot_diff, plot_cumsum, plot_cumsum_impl, plot_qq
from statistical_tests import shapiro_wilk_test, t_test_result

In [2]:
# --- Run configuration -------------------------------------------------------
# Treated unit and the date from which it counts as treated
# (rows with date >= impl_date form the post-treatment window).
treatment_country = 'united_kingdom'
impl_date = '2013-04-01'

# Model label and timeframe code ('m' — presumably monthly; confirm against the
# data helpers). Both are forwarded to the path helpers used further down.
model = 'sc'
timeframe = 'm'

# Suffix of the data-file variant to load (total_<timeframe>_<stat>).
# NOTE: this rebinds the `stat` name imported from `definitions` above.
stat = 'stat'

# Forwarded to get_donor_countries(prox=...) when the donor pool is selected.
prox = False

In [4]:
# Load the base panel and its `stat`-suffixed variant for the chosen timeframe.
df = read_data(
    source_path=get_data_path(timeframe=timeframe),
    file_name=f'total_{timeframe}',
)
df_stat = read_data(
    source_path=get_data_path(timeframe=timeframe),
    file_name=f'total_{timeframe}_{stat}',
)

# Keep only rows whose country and year are both in scope.
df = df.loc[
    lambda frame: frame[country_col].isin(incl_countries) & frame[year_col].isin(incl_years)
]
df_stat = df_stat.loc[
    lambda frame: frame[country_col].isin(incl_countries) & frame[year_col].isin(incl_years)
]

In [37]:
def sc_pivot(df: pd.DataFrame, treatment_country: str, timeframe: str, model: str, impl_date: str,
             prox: bool) -> tuple:
    """Pivot the long panel to a date-by-country table and split it at the implementation date.

    Rows are restricted to the treated country plus its donor pool, the target
    variable is pivoted so that each country becomes a column, placeholder values
    equal to ``fake_num`` are turned into NaN, countries with no data at all are
    dropped, and finally every date with at least one missing value is dropped.

    Args:
        df: Long-format frame containing at least the ``country_col``, ``date_col``
            and ``target_var`` columns.
        treatment_country: Name of the treated country as it appears in ``country_col``.
        timeframe: Timeframe code; only forwarded to ``get_table_path``.
        model: Model label; only forwarded to ``get_table_path``.
        impl_date: Implementation date. Dates strictly before it are pre-treatment,
            dates on or after it are post-treatment.
        prox: Forwarded to ``get_donor_countries`` to select the donor pool.

    Returns:
        A 4-tuple ``(df_pivot, pre_treat, post_treat, treat_unit)`` where
        ``df_pivot`` is the cleaned wide table, ``pre_treat`` / ``post_treat`` are
        its rows before / from ``impl_date``, and ``treat_unit`` is the list of
        positional column indices whose label equals ``treatment_country``
        (empty if that column did not survive the cleaning).
    """
    # NOTE(review): the original bound this result to a local that was never
    # used. The call itself is kept in case get_table_path has side effects
    # (e.g. creating the results folder) — confirm, and drop it if it is pure.
    get_table_path(timeframe=timeframe, folder='results', country=treatment_country, model=model)

    donor_countries = get_donor_countries(prox=prox, treatment_country=treatment_country)
    units_in_scope = donor_countries + [treatment_country]

    # Boolean filtering already yields a new frame, so no defensive copy of the
    # full input is needed before it.
    df_pivot = (
        df[df[country_col].isin(units_in_scope)]
        .pivot(index=date_col, columns=country_col, values=target_var)
        .replace({fake_num: np.nan})
        .dropna(axis=1, how='all')   # countries with no observations at all
        .dropna(axis=0, how='any')   # dates where any remaining country is missing
    )

    pre_treat = df_pivot[df_pivot.index < impl_date]
    post_treat = df_pivot[df_pivot.index >= impl_date]

    # Positional index (not label) of the treated country among the columns.
    treat_unit = [
        position
        for position, country in enumerate(df_pivot.columns)
        if country == treatment_country
    ]

    return df_pivot, pre_treat, post_treat, treat_unit

In [48]:
df_pivot, pre_treat, post_treat, treat_unit = sc_pivot(df=df_stat, treatment_country=treatment_country,
                                                       timeframe=timeframe, model=model, impl_date=impl_date,
                                                       prox=prox)

# standardize
SS = StandardScaler()

# Scaler fitted on every value of the panel flattened into one column.
# StandardScaler.fit returns the scaler itself, so fitting `SS` here and then
# calling SS.fit_transform(...) below would silently overwrite this fit; a
# dedicated instance keeps it intact.
SS_treatmentfit = StandardScaler().fit(np.array(df_pivot).reshape(-1, 1))

# NOTE(review): each frame below is standardized with its own per-column mean
# and standard deviation, so the pre- and post-treatment frames are not on the
# same scale as df_pivot_stand — confirm this is intended.
df_pivot_stand = pd.DataFrame(SS.fit_transform(df_pivot), columns=df_pivot.columns, index=df_pivot.index)
pre_treat_stand = pd.DataFrame(SS.fit_transform(pre_treat), columns=pre_treat.columns, index=pre_treat.index)
post_treat_stand = pd.DataFrame(SS.fit_transform(post_treat), columns=post_treat.columns, index=post_treat.index)

In [49]:
# Fit the synthetic-control estimator.
# The frames are transposed so that each row is a country and each column a date:
# pre-treatment values act as features, post-treatment values as targets.
features_pre = np.array(pre_treat_stand.T)
targets_post = np.array(post_treat_stand.T)

sc = SparseSC.fit_fast(
    features=features_pre,
    targets=targets_post,
    treated_units=treat_unit,   # positional index of the treated country
    model_type='retrospective',
)

In [51]:
# Build the actual-vs-predicted frame for the treated country.
act_pred_log_diff = df_pivot[treatment_country].to_frame(name='act')

# (Re)fit the shared scaler on all panel values flattened into a single column,
# so the inverse transform below uses one overall mean and standard deviation.
# NOTE(review): the inputs to sc.predict were standardized per column, whereas
# this inverse transform uses statistics of the flattened panel — confirm that
# this mismatch is intended.
SS_treatmentfit = SS.fit(np.array(df_pivot).reshape(-1, 1))

# NOTE(review): row 0 of the prediction is taken here; confirm that it is the
# row belonging to the treated unit (see treat_unit) and not the first country.
pred_stand = sc.predict(df_pivot_stand.T.values)[0].reshape(-1, 1)
pred_log_diff = SS_treatmentfit.inverse_transform(pred_stand)

act_pred_log_diff['pred'] = pred_log_diff
act_pred_log_diff['error'] = act_pred_log_diff['pred'] - act_pred_log_diff['act']

act_pred_log_diff

Unnamed: 0_level_0,act,pred,error
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2001-01-01,0.067296,-0.064119,-0.131415
2001-02-01,0.039017,-0.013769,-0.052785
2001-03-01,0.052728,-0.031978,-0.084706
2001-04-01,0.012149,0.101745,0.089596
2001-05-01,0.009233,0.037953,0.028720
...,...,...,...
2019-06-01,-0.017357,-0.035888,-0.018531
2019-07-01,-0.034553,-0.031963,0.002591
2019-08-01,-0.020662,-0.041594,-0.020932
2019-09-01,-0.005156,-0.020174,-0.015017
