In [31]:
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import adfuller, kpss
from statsmodels.tsa.vector_ar.vecm import coint_johansen
import warnings
from statsmodels.tools.sm_exceptions import InterpolationWarning

In [53]:
df_model = pd.read_csv('data/preprocessed/dataset.csv')

df_log = df_model.copy()
# Log Transformations
df_log[['CPIH_ln','AWE_ln','Prod_ln']] = df_log[['CPIH','AWE','Productivity']].apply(np.log)
cols = ['CPIH_ln','AWE_ln','Prod_ln']

# Running unit root test for each variable
print("Dickey Fuller Test for log variables")
for col in cols:
    adf_test = adfuller(df_log[col], autolag='AIC')[1]
    print(f"{col} : {adf_test}")
# Cannot reject the null of non-stationarity
# Prod_ln can reject then null at 10% but not 5%

#Taking first difference
df_log_diff = df_log[cols].diff().dropna()
#Running unit roots tests for differenced variables
print("\nDickey Fuller Test for log variables - 1st difference")
for col in cols:
    adf_test = adfuller(df_log_diff[col], autolag='AIC')[1]
    print(f"{col} : {adf_test}")
# We reject the null at 1% level, the differenced series are stationary

# Testing for trend stationarity using KPSS
warnings.simplefilter('ignore', InterpolationWarning)
print("\nKPSS Test for log variables")
for col in cols:
    stat, p, lags, crit = kpss(df_log[col].dropna(), regression='c', nlags='auto')
    print(f"{col} KPSS statistic : {stat}")
    print(f"{col} p-value: {p}\n")
# We can confidently reject the null of stationarity using both tests

# Testing for Co-Integration
joh_data = df_log[cols]
for i in range(1, 5):
    print(f"{i} Lags in Differences")
    johansen_test = coint_johansen(joh_data, det_order=0, k_ar_diff=i)
    trace_stats = johansen_test.lr1
    crit_vals = johansen_test.cvt[:, 1]

    # Hypothesis testing
    for rank, (trace,crit) in enumerate(zip(trace_stat, crit_vals)):
        decision = "Reject Null" if trace>crit else "Fail to Reject Null"
        print(f"r ≤ {rank}: Trace={trace:.2f}, Crit(5%)={crit:.2f} --- {decision}")

    # Estimating cointegration rank
    rank = np.argmax(trace_stats < crit_vals)
    print(f"Cointegration rank at {i} lags: {rank}\n")

# Estimated cointegration rank is 2
# With 3 variables -> 2 indep. stationary long-run trends, 1 common stochastic trend




Dickey Fuller Test for log variables
CPIH_ln : 0.9978836013850167
AWE_ln : 0.988713743086515
Prod_ln : 0.09409055828814583

Dickey Fuller Test for log variables - 1st difference
CPIH_ln : 0.014480752737135996
AWE_ln : 1.81475465994484e-29
Prod_ln : 1.7210911248880824e-07

KPSS Test for log variables
CPIH_ln KPSS statistic : 2.5874281123825575
CPIH_ln p-value: 0.01

AWE_ln KPSS statistic : 2.5370534837531893
AWE_ln p-value: 0.01

Prod_ln KPSS statistic : 2.5335623841141586
Prod_ln p-value: 0.01

1 Lags in Differences
r ≤ 0: Trace=38.06, Crit(5%)=29.80 --- Reject Null
r ≤ 1: Trace=16.78, Crit(5%)=15.49 --- Reject Null
r ≤ 2: Trace=1.46, Crit(5%)=3.84 --- Fail to Reject Null
Cointegration rank at 1 lags: 2

2 Lags in Differences
r ≤ 0: Trace=38.06, Crit(5%)=29.80 --- Reject Null
r ≤ 1: Trace=16.78, Crit(5%)=15.49 --- Reject Null
r ≤ 2: Trace=1.46, Crit(5%)=3.84 --- Fail to Reject Null
Cointegration rank at 2 lags: 2

3 Lags in Differences
r ≤ 0: Trace=38.06, Crit(5%)=29.80 --- Reject Null