In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import numpy as np

from statsmodels.tsa.stattools import pacf

In [3]:
# Load the combined dataset, using its "date" column as the row index.
data = pd.read_csv("../data/combined.csv", index_col="date")

In [4]:
# Minimum absolute partial autocorrelation a lag must reach to become a lag feature.
PACF_THRESHOLD = 0.25
# Original (misspelled) name, kept as an alias so cells that reference it keep working.
PCAF_THRESHOLD = PACF_THRESHOLD

In [5]:
# lagged data
# For every column, keep the lags whose absolute partial autocorrelation reaches
# PCAF_THRESHOLD and add one shifted copy of the column per selected lag.
lagged = []
pacf_failed = []  # columns whose PACF could not be estimated; they get no lag features
for col in data.columns:
    try:
        partial_autocorr = pacf(data[col], nlags=52)
    except np.linalg.LinAlgError:
        # Previously swallowed silently; record the column so the omission is visible.
        pacf_failed.append(col)
        continue

    best_lags = np.where(np.abs(partial_autocorr) >= PCAF_THRESHOLD)[0]
    # Lag 0 is the series against itself and carries no information; exclude it
    # explicitly instead of assuming it is always the first selected entry.
    for lag in best_lags[best_lags > 0]:
        lagged.append(pd.Series(data[col].shift(lag), name=f"{col}_l{lag}"))

# shift(lag) leaves the first `lag` rows empty. bfill() copies the earliest shifted
# value (the column's first observation) into them, so those rows carry a shorter
# effective lag than the column name says, and row 0 holds its own current value.
# NOTE(review): bfill would also pull *later* values into any interior gaps if `data`
# has missing values -- confirm `data` is gap-free or limit the fill to leading rows.
data_lagged = pd.concat(lagged, axis=1, join="inner").bfill()
data_lagged

Unnamed: 0_level_0,inflation_one_month_l1,inflation_two_month_l1,inflation_two_month_l5,inflation_two_month_l10,inflation_three_month_l1,inflation_three_month_l5,inflation_three_month_l10,inflation_three_month_l14,inflation_three_month_l40,inflation_six_month_l1,...,oil_reserves_united_kingdom_l1,oil_reserves_united_states_l1,oil_reserves_upper_middle_income_countries_l1,oil_reserves_upper_middle_income_countries_l41,oil_reserves_venezuela_l1,oil_reserves_vietnam_l1,oil_reserves_world_l1,oil_reserves_yemen_l1,gas_price_l1,gas_price_l2
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1993-04-05,0.4,1.0,1.0,1.0,1.5,1.5,1.5,1.5,1.5,2.2,...,5.726978e+08,4.329006e+09,4.374194e+10,4.374194e+10,8.630661e+09,3.402676e+07,1.487027e+11,2.744929e+08,1.068,1.068
1993-04-12,0.4,1.0,1.0,1.0,1.5,1.5,1.5,1.5,1.5,2.2,...,5.726978e+08,4.329006e+09,4.374194e+10,4.374194e+10,8.630661e+09,3.402676e+07,1.487027e+11,2.744929e+08,1.068,1.068
1993-04-19,0.2,0.6,1.0,1.0,1.2,1.5,1.5,1.5,1.5,1.8,...,5.730902e+08,4.324575e+09,4.391546e+10,4.374194e+10,8.637450e+09,3.412943e+07,1.488909e+11,2.745190e+08,1.068,1.068
1993-04-26,0.2,0.6,1.0,1.0,1.2,1.5,1.5,1.5,1.5,1.8,...,5.734826e+08,4.320144e+09,4.408898e+10,4.374194e+10,8.644238e+09,3.423210e+07,1.490791e+11,2.745452e+08,1.079,1.068
1993-05-03,0.2,0.6,1.0,1.0,1.2,1.5,1.5,1.5,1.5,1.8,...,5.738750e+08,4.315712e+09,4.426250e+10,4.374194e+10,8.651026e+09,3.433478e+07,1.492673e+11,2.745713e+08,1.079,1.079
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-26,0.2,0.3,0.4,0.7,0.6,0.8,1.2,1.3,1.9,1.8,...,3.410000e+08,9.378455e+09,5.391370e+10,5.399431e+10,4.143911e+10,6.001600e+08,2.359787e+11,4.092000e+08,3.353,3.504
2023-01-02,0.2,0.3,0.4,0.7,0.6,0.8,1.2,1.3,1.9,1.8,...,3.410000e+08,9.378455e+09,5.391168e+10,5.399230e+10,4.143911e+10,6.001600e+08,2.359723e+11,4.092000e+08,3.234,3.353
2023-01-09,0.2,0.3,0.4,0.7,0.6,0.8,1.2,1.3,1.9,1.8,...,3.410000e+08,9.378455e+09,5.390967e+10,5.399028e+10,4.143911e+10,6.001600e+08,2.359659e+11,4.092000e+08,3.203,3.234
2023-01-16,0.2,0.3,0.3,0.4,0.6,0.6,0.8,1.2,1.7,1.8,...,3.410000e+08,9.378455e+09,5.390852e+10,5.398827e+10,4.143911e+10,6.001600e+08,2.359623e+11,4.092000e+08,3.331,3.203


In [6]:
# discrete derivatives
def _differenced(frame, order):
    """Difference `frame` row-wise `order` times and tag each column with `_d<order>`."""
    result = frame
    for _ in range(order):
        result = result.diff()
    return result.rename(columns=lambda name: f"{name}_d{order}")


# Each order starts again from `data`, so every frame is renamed exactly once.
first_order = _differenced(data, 1)
second_order = _differenced(data, 2)
third_order = _differenced(data, 3)

data_deriv = pd.concat([first_order, second_order, third_order], axis=1, join="inner")
data_deriv

Unnamed: 0_level_0,inflation_one_month_d1,inflation_two_month_d1,inflation_three_month_d1,inflation_six_month_d1,inflation_twelve_month_d1,opec_price_d1,brent_price_d1,wti_price_d1,oil_reserves_africa_d1,oil_reserves_africa_bp_d1,...,oil_reserves_tunisia_d3,oil_reserves_united_arab_emirates_d3,oil_reserves_united_kingdom_d3,oil_reserves_united_states_d3,oil_reserves_upper_middle_income_countries_d3,oil_reserves_venezuela_d3,oil_reserves_vietnam_d3,oil_reserves_world_d3,oil_reserves_yemen_d3,gas_price_d3
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1993-04-05,,,,,,,,,,,...,,,,,,,,,,
1993-04-12,-0.2,-0.4,-0.3,-0.4,0.1,0.375,0.31,0.44,4.306208e+06,4.306208e+06,...,,,,,,,,,,
1993-04-19,0.0,0.0,0.0,0.0,0.0,-0.045,-0.02,-0.07,4.306208e+06,4.306208e+06,...,,,,,,,,,,
1993-04-26,0.0,0.0,0.0,0.0,0.0,-0.065,-0.04,-0.09,4.306208e+06,4.306208e+06,...,0.000000e+00,0.0,0.0,1.907349e-06,1.525879e-05,0.000002,1.490116e-08,-3.051758e-05,-5.960464e-08,-0.022
1993-05-03,0.0,0.0,0.0,0.0,0.0,-0.220,-0.11,-0.33,4.306208e+06,4.306208e+06,...,-4.656613e-10,0.0,0.0,-9.536743e-07,-1.525879e-05,0.000000,-7.450581e-09,-3.051758e-05,0.000000e+00,0.018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-26,0.0,0.0,0.0,0.0,0.0,-0.210,2.24,2.06,2.198944e+05,2.199048e+05,...,0.000000e+00,0.0,0.0,0.000000e+00,-1.525879e-05,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.038
2023-01-02,0.0,0.0,0.0,0.0,0.0,4.100,-0.12,2.16,2.198944e+05,2.199048e+05,...,0.000000e+00,0.0,0.0,0.000000e+00,7.629395e-06,0.000000,0.000000e+00,1.220703e-04,0.000000e+00,0.056
2023-01-09,0.0,0.0,0.0,0.0,0.0,-0.180,1.15,1.95,1.256539e+05,1.256599e+05,...,0.000000e+00,0.0,0.0,0.000000e+00,8.637147e+05,0.000000,0.000000e+00,2.740295e+06,0.000000e+00,0.071
2023-01-16,0.0,0.0,0.0,0.0,0.0,-5.050,-4.63,-4.96,0.000000e+00,0.000000e+00,...,0.000000e+00,0.0,0.0,0.000000e+00,2.879049e+05,0.000000,0.000000e+00,9.134316e+05,0.000000e+00,-0.252


In [7]:
# percent change
# 0 -> 0 results in a NaN percent change: replaced with 0.
# 0 -> !0 results in an infinite percent change: replaced with the mean of the finite
# changes of the same sign observed in that column *up to that row*, so the
# substitute never depends on later observations (no look-ahead).
pct_raw = data.pct_change()
pct_finite = pct_raw.replace([np.inf, -np.inf], np.nan)

# Expanding (past-only) means of the strictly positive / strictly negative changes.
# Masked-out rows are NaN and are skipped by the mean; an infinite row is NaN in
# `pct_finite`, so it contributes nothing to its own substitute.
positive_means = pct_finite.where(pct_finite > 0).expanding().mean()
negative_means = pct_finite.where(pct_finite < 0).expanding().mean()

data_percent_change = (
    pct_raw.fillna(0)
    .mask(pct_raw == np.inf, positive_means)
    .mask(pct_raw == -np.inf, negative_means)
    .fillna(0)  # no same-sign history yet for that row -> fall back to 0
    .iloc[1:]  # the first row has no predecessor, so its change is undefined
)
data_percent_change

Unnamed: 0_level_0,inflation_one_month,inflation_two_month,inflation_three_month,inflation_six_month,inflation_twelve_month,opec_price,brent_price,wti_price,oil_reserves_africa,oil_reserves_africa_bp,...,oil_reserves_tunisia,oil_reserves_united_arab_emirates,oil_reserves_united_kingdom,oil_reserves_united_states,oil_reserves_upper_middle_income_countries,oil_reserves_venezuela,oil_reserves_vietnam,oil_reserves_world,oil_reserves_yemen,gas_price
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1993-04-12,-0.5,-0.400000,-0.2,-0.181818,0.029412,0.019521,0.016830,0.022000,0.000520,0.000520,...,-0.329469,0.0,0.000685,-0.001024,0.003967,0.000787,0.003017,0.001266,0.000095,0.000000
1993-04-19,0.0,0.000000,0.0,0.000000,0.000000,-0.002298,-0.001068,-0.003425,0.000519,0.000519,...,-0.491355,0.0,0.000685,-0.001025,0.003951,0.000786,0.003008,0.001264,0.000095,0.010300
1993-04-26,0.0,0.000000,0.0,0.000000,0.000000,-0.003327,-0.002138,-0.004418,0.000519,0.000519,...,-0.966008,0.0,0.000684,-0.001026,0.003936,0.000785,0.002999,0.001262,0.000095,0.000000
1993-05-03,0.0,0.000000,0.0,0.000000,0.000000,-0.011297,-0.005892,-0.016272,0.000519,0.000519,...,-28.418386,0.0,0.000684,-0.001027,0.003920,0.000785,0.002990,0.001261,0.000095,0.006487
1993-05-10,-1.0,-0.666667,-0.5,-0.111111,-0.028571,0.011945,0.005927,0.017544,0.000518,0.000518,...,1.036472,0.0,0.000683,-0.001028,0.003905,0.000784,0.002981,0.001259,0.000095,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-26,0.0,0.000000,0.0,0.000000,0.000000,-0.002681,0.028513,0.028196,0.000013,0.000013,...,0.000000,0.0,0.000000,0.000000,-0.000037,0.000000,0.000000,-0.000027,0.000000,-0.035491
2023-01-02,0.0,0.000000,0.0,0.000000,0.000000,0.052477,-0.001485,0.028754,0.000013,0.000013,...,0.000000,0.0,0.000000,0.000000,-0.000037,0.000000,0.000000,-0.000027,0.000000,-0.009586
2023-01-09,0.0,0.000000,0.0,0.000000,0.000000,-0.002189,0.014254,0.025233,0.000007,0.000007,...,0.000000,0.0,0.000000,0.000000,-0.000021,0.000000,0.000000,-0.000015,0.000000,0.039963
2023-01-16,0.0,0.000000,0.0,0.000000,0.000000,-0.061548,-0.056581,-0.062603,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.010507


In [8]:
# rolling mean and variance
def _tagged(frame, tag):
    """Return `frame` with `_<tag>` appended to every column label."""
    return frame.rename(columns=lambda label: f"{label}_{tag}")


# one month (4-row window)
one_month_window = data.rolling(window=4, min_periods=0)
one_month_mean = _tagged(one_month_window.mean(), "rm4")
one_month_var = _tagged(one_month_window.var(), "rv4")

# six month (26-row window)
six_month_window = data.rolling(window=26, min_periods=0)
six_month_mean = _tagged(six_month_window.mean(), "rm26")
six_month_var = _tagged(six_month_window.var(), "rv26")

# one year (52-row window)
twelve_month_window = data.rolling(window=52, min_periods=0)
twelve_month_mean = _tagged(twelve_month_window.mean(), "rm52")
twelve_month_var = _tagged(twelve_month_window.var(), "rv52")

# everything seen so far (expanding window)
history_window = data.expanding()
expanding_mean = _tagged(history_window.mean(), "rmex")
expanding_var = _tagged(history_window.var(), "rvex")

# Column order: mean then variance for each window, shortest window first.
rolling_blocks = [
    one_month_mean,
    one_month_var,
    six_month_mean,
    six_month_var,
    twelve_month_mean,
    twelve_month_var,
    expanding_mean,
    expanding_var,
]
data_rolling = pd.concat(rolling_blocks, axis=1, join="inner")
data_rolling

Unnamed: 0_level_0,inflation_one_month_rm4,inflation_two_month_rm4,inflation_three_month_rm4,inflation_six_month_rm4,inflation_twelve_month_rm4,opec_price_rm4,brent_price_rm4,wti_price_rm4,oil_reserves_africa_rm4,oil_reserves_africa_bp_rm4,...,oil_reserves_tunisia_rvex,oil_reserves_united_arab_emirates_rvex,oil_reserves_united_kingdom_rvex,oil_reserves_united_states_rvex,oil_reserves_upper_middle_income_countries_rvex,oil_reserves_venezuela_rvex,oil_reserves_vietnam_rvex,oil_reserves_world_rvex,oil_reserves_yemen_rvex,gas_price_rvex
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1993-04-05,0.400000,1.000000,1.500,2.200000,3.400000,19.21000,18.4200,20.0000,8.288873e+09,8.288874e+09,...,,,,,,,,,,
1993-04-12,0.300000,0.800000,1.350,2.000000,3.450000,19.39750,18.5750,20.2200,8.291026e+09,8.291027e+09,...,6.126130e+12,0.000000e+00,7.698243e+10,9.818291e+12,1.505479e+16,2.304007e+13,5.270944e+09,1.770932e+16,3.421441e+08,0.000000
1993-04-19,0.266667,0.733333,1.300,1.933333,3.466667,19.44500,18.6200,20.2700,8.293179e+09,8.293180e+09,...,1.225226e+13,0.000000e+00,1.539649e+11,1.963658e+13,3.010958e+16,4.608014e+13,1.054189e+10,3.541864e+16,6.842883e+08,0.000040
1993-04-26,0.250000,0.700000,1.275,1.900000,3.475000,19.45250,18.6325,20.2725,8.295333e+09,8.295333e+09,...,2.042043e+13,0.000000e+00,2.566081e+11,3.272764e+13,5.018263e+16,7.680024e+13,1.756981e+10,5.903107e+16,1.140480e+09,0.000040
1993-05-03,0.200000,0.600000,1.200,1.800000,3.500000,19.46375,18.6675,20.2600,8.299639e+09,8.299639e+09,...,3.063065e+13,0.000000e+00,3.849121e+11,4.909146e+13,7.527394e+16,1.152004e+14,2.635472e+10,8.854660e+16,1.710721e+09,0.000061
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-26,0.175000,0.325000,0.650,1.950000,5.775000,79.88750,82.6975,76.5525,1.707603e+10,1.707603e+10,...,6.293273e+14,2.531101e+14,1.723994e+16,3.760874e+18,2.162794e+19,2.225649e+20,4.235909e+16,1.066992e+21,4.216594e+15,0.872612
2023-01-02,0.200000,0.300000,0.600,1.800000,5.700000,79.59000,81.2675,76.1875,1.707625e+10,1.707625e+10,...,6.289219e+14,2.529693e+14,1.724528e+16,3.768884e+18,2.163025e+19,2.226347e+20,4.235357e+16,1.067327e+21,4.215584e+15,0.872575
2023-01-09,0.200000,0.300000,0.600,1.800000,5.700000,80.18750,80.4675,76.1725,1.707644e+10,1.707645e+10,...,6.285169e+14,2.528286e+14,1.725060e+16,3.776870e+18,2.163253e+19,2.227041e+20,4.234802e+16,1.067660e+21,4.214573e+15,0.872697
2023-01-16,0.200000,0.300000,0.600,1.800000,5.700000,79.85250,80.1275,76.4750,1.707659e+10,1.707659e+10,...,6.281125e+14,2.526880e+14,1.725589e+16,3.784833e+18,2.163479e+19,2.227732e+20,4.234246e+16,1.067992e+21,4.213561e+15,0.872865


In [9]:
# Assemble the raw series and every derived feature block side by side, keeping
# only the dates present in all of them (inner join on the index).
# data_percent_change reuses the raw column names, so it is suffixed here;
# otherwise every raw name would appear twice in data_basic and selecting a
# column by name would be ambiguous.
data_basic = pd.concat(
    [
        data,
        data_lagged,
        data_deriv,
        data_percent_change.rename(columns=lambda name: f"{name}_pct_change"),
        data_rolling,
    ],
    axis=1,
    join="inner",
)
assert data_basic.columns.is_unique, "feature blocks produced clashing column names"
data_basic

Unnamed: 0_level_0,inflation_one_month,inflation_two_month,inflation_three_month,inflation_six_month,inflation_twelve_month,opec_price,brent_price,wti_price,oil_reserves_africa,oil_reserves_africa_bp,...,oil_reserves_tunisia_rvex,oil_reserves_united_arab_emirates_rvex,oil_reserves_united_kingdom_rvex,oil_reserves_united_states_rvex,oil_reserves_upper_middle_income_countries_rvex,oil_reserves_venezuela_rvex,oil_reserves_vietnam_rvex,oil_reserves_world_rvex,oil_reserves_yemen_rvex,gas_price_rvex
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1993-04-12,0.2,0.6,1.2,1.8,3.5,19.585,18.73,20.44,8.293179e+09,8.293180e+09,...,6.126130e+12,0.000000e+00,7.698243e+10,9.818291e+12,1.505479e+16,2.304007e+13,5.270944e+09,1.770932e+16,3.421441e+08,0.000000
1993-04-19,0.2,0.6,1.2,1.8,3.5,19.540,18.71,20.37,8.297486e+09,8.297486e+09,...,1.225226e+13,0.000000e+00,1.539649e+11,1.963658e+13,3.010958e+16,4.608014e+13,1.054189e+10,3.541864e+16,6.842883e+08,0.000040
1993-04-26,0.2,0.6,1.2,1.8,3.5,19.475,18.67,20.28,8.301792e+09,8.301792e+09,...,2.042043e+13,0.000000e+00,2.566081e+11,3.272764e+13,5.018263e+16,7.680024e+13,1.756981e+10,5.903107e+16,1.140480e+09,0.000040
1993-05-03,0.2,0.6,1.2,1.8,3.5,19.255,18.56,19.95,8.306098e+09,8.306099e+09,...,3.063065e+13,0.000000e+00,3.849121e+11,4.909146e+13,7.527394e+16,1.152004e+14,2.635472e+10,8.854660e+16,1.710721e+09,0.000061
1993-05-10,0.0,0.2,0.6,1.6,3.4,19.485,18.67,20.30,8.310404e+09,8.310405e+09,...,4.288291e+13,0.000000e+00,5.388770e+11,6.872804e+13,1.053835e+17,1.612805e+14,3.689661e+10,1.239652e+17,2.395009e+09,0.000066
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-26,0.2,0.3,0.6,1.8,5.7,78.130,80.80,75.12,1.707636e+10,1.707636e+10,...,6.293273e+14,2.531101e+14,1.723994e+16,3.760874e+18,2.162794e+19,2.225649e+20,4.235909e+16,1.066992e+21,4.216594e+15,0.872612
2023-01-02,0.2,0.3,0.6,1.8,5.7,82.230,80.68,77.28,1.707658e+10,1.707658e+10,...,6.289219e+14,2.529693e+14,1.724528e+16,3.768884e+18,2.163025e+19,2.226347e+20,4.235357e+16,1.067327e+21,4.215584e+15,0.872575
2023-01-09,0.2,0.3,0.6,1.8,5.7,82.050,81.83,79.23,1.707670e+10,1.707671e+10,...,6.285169e+14,2.528286e+14,1.725060e+16,3.776870e+18,2.163253e+19,2.227041e+20,4.234802e+16,1.067660e+21,4.214573e+15,0.872697
2023-01-16,0.2,0.3,0.6,1.8,5.7,77.000,77.20,74.27,1.707670e+10,1.707671e+10,...,6.281125e+14,2.526880e+14,1.725589e+16,3.784833e+18,2.163479e+19,2.227732e+20,4.234246e+16,1.067992e+21,4.213561e+15,0.872865
