In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import numpy as np

from statsmodels.tsa.stattools import pacf

In [3]:
# Read the combined dataset and use its "date" column as the row index.
data = pd.read_csv("../data/combined.csv", index_col="date")

In [4]:
# Minimum absolute partial autocorrelation a lag must reach to be kept as a
# lagged feature (compared against the pacf() output in the lagged-data cell).
# NOTE(review): the name reads "PCAF"; presumably "PACF" was intended. It is
# left as-is here because other cells reference this exact name.
PCAF_THRESHOLD = 0.25

In [5]:
# lagged data
# For every column, compute the partial autocorrelation for lags 0..52 and add
# one shifted copy of the column for each lag whose absolute PACF reaches
# PCAF_THRESHOLD. New columns are named "<column>_l<lag>".
lagged = []
skipped = []  # columns whose PACF could not be computed
for col in data.columns:
    try:
        partial_autocorr = pacf(data[col], nlags=52)
    except np.linalg.LinAlgError:
        # Best effort: leave this column out, but record it so the omission
        # is reported below instead of passing silently.
        skipped.append(col)
        continue

    best_lags = np.where(np.abs(partial_autocorr) >= PCAF_THRESHOLD)[0]
    # Lag 0 is the series against itself and is not a feature; drop it by
    # value rather than by position so a real lag can never be discarded.
    for lag in best_lags[best_lags > 0]:
        lagged.append(data[col].shift(lag).rename(f"{col}_l{lag}"))

if skipped:
    print(
        f"PACF failed (LinAlgError) for {len(skipped)} column(s); "
        f"no lagged features were created for: {skipped}"
    )

# pd.concat raises on an empty list, so fall back to an empty frame that still
# carries the date index when no lag passed the threshold.
data_lagged = (
    pd.concat(lagged, axis=1, join="inner")
    if lagged
    else pd.DataFrame(index=data.index)
)
data_lagged

Unnamed: 0_level_0,inflation_one_month_l1,inflation_two_month_l1,inflation_two_month_l5,inflation_two_month_l10,inflation_three_month_l1,inflation_three_month_l5,inflation_three_month_l10,inflation_three_month_l14,inflation_three_month_l40,inflation_six_month_l1,...,oil_reserves_vietnam_l1,oil_reserves_lower_middle_income_countries_l1,oil_reserves_nigeria_l1,oil_reserves_libya_l1,oil_reserves_congo_l1,opec_price_l1,wti_price_l1,brent_price_l1,gas_price_l1,gas_price_l2
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1993-04-05,,,,,,,,,,,...,,,,,,,,,,
1993-04-12,0.4,1.0,,,1.5,,,,,2.2,...,3.402676e+07,1.928013e+10,2.823368e+09,3.109920e+09,9.339996e+07,19.210,20.00,18.42,1.068,
1993-04-19,0.2,0.6,,,1.2,,,,,1.8,...,3.412943e+07,1.928739e+10,2.830954e+09,3.109920e+09,9.325294e+07,19.585,20.44,18.73,1.068,1.068
1993-04-26,0.2,0.6,,,1.2,,,,,1.8,...,3.423210e+07,1.929465e+10,2.838540e+09,3.109920e+09,9.310593e+07,19.540,20.37,18.71,1.079,1.068
1993-05-03,0.2,0.6,,,1.2,,,,,1.8,...,3.433478e+07,1.930191e+10,2.846126e+09,3.109920e+09,9.295892e+07,19.475,20.28,18.67,1.079,1.079
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-26,0.2,0.3,0.4,0.7,0.6,0.8,1.2,1.3,1.9,1.8,...,6.001600e+08,3.177444e+10,5.031796e+09,6.596713e+09,3.931048e+08,78.340,73.06,78.56,3.353,3.504
2023-01-02,0.2,0.3,0.4,0.7,0.6,0.8,1.2,1.3,1.9,1.8,...,6.001600e+08,3.177401e+10,5.031796e+09,6.596713e+09,3.931048e+08,78.130,75.12,80.80,3.234,3.353
2023-01-09,0.2,0.3,0.4,0.7,0.6,0.8,1.2,1.3,1.9,1.8,...,6.001600e+08,3.177359e+10,5.031796e+09,6.596713e+09,3.931048e+08,82.230,77.28,80.68,3.203,3.234
2023-01-16,0.2,0.3,0.3,0.4,0.6,0.6,0.8,1.2,1.7,1.8,...,6.001600e+08,3.177335e+10,5.031796e+09,6.596713e+09,3.931048e+08,82.050,79.23,81.83,3.331,3.203


In [6]:
# discrete derivatives
# Row-to-row differences of order 1, 2 and 3; each order is the difference of
# the previous (still unsuffixed) order, and the columns are tagged afterwards.
_diff_1 = data.diff()
_diff_2 = _diff_1.diff()
_diff_3 = _diff_2.diff()

first_order = _diff_1.add_suffix("_d1")
second_order = _diff_2.add_suffix("_d2")
third_order = _diff_3.add_suffix("_d3")

derivative_frames = [first_order, second_order, third_order]
data_deriv = pd.concat(derivative_frames, axis=1, join="inner")
data_deriv

Unnamed: 0_level_0,inflation_one_month_d1,inflation_two_month_d1,inflation_three_month_d1,inflation_six_month_d1,inflation_twelve_month_d1,oil_reserves_algeria_d1,oil_reserves_kuwait_d1,oil_reserves_united_states_d1,oil_reserves_middle_east_bp_d1,oil_reserves_iran_d1,...,oil_reserves_iran_d3,oil_reserves_vietnam_d3,oil_reserves_lower_middle_income_countries_d3,oil_reserves_nigeria_d3,oil_reserves_libya_d3,oil_reserves_congo_d3,opec_price_d3,wti_price_d3,brent_price_d3,gas_price_d3
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1993-04-05,,,,,,,,,,,...,,,,,,,,,,
1993-04-12,-0.2,-0.4,-0.3,-0.4,0.1,0.0,-1.373342e+06,-4.431318e+06,3.154502e+06,26158.90411,...,,,,,,,,,,
1993-04-19,0.0,0.0,0.0,0.0,0.0,0.0,-1.373342e+06,-4.431318e+06,3.154502e+06,26158.90411,...,,,,,,,,,,
1993-04-26,0.0,0.0,0.0,0.0,0.0,0.0,-1.373342e+06,-4.431318e+06,3.154502e+06,26158.90411,...,0.0,1.490116e-08,0.000004,9.536743e-07,0.0,1.490116e-08,0.400,0.49,0.31,-0.022
1993-05-03,0.0,0.0,0.0,0.0,0.0,0.0,-1.373342e+06,-4.431318e+06,3.154502e+06,26158.90411,...,0.0,-7.450581e-09,0.000000,-9.536743e-07,0.0,1.490116e-08,-0.135,-0.22,-0.05,0.018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-26,0.0,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,-1.744799e+05,0.00000,...,0.0,0.000000e+00,-0.000004,0.000000e+00,0.0,0.000000e+00,-1.330,15.07,13.81,0.038
2023-01-02,0.0,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,-1.744799e+05,0.00000,...,0.0,0.000000e+00,0.000000,0.000000e+00,0.0,0.000000e+00,3.200,-8.19,-11.07,0.056
2023-01-09,0.0,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,-9.970279e+04,0.00000,...,0.0,0.000000e+00,180821.556156,0.000000e+00,0.0,0.000000e+00,-8.590,-0.31,3.63,0.071
2023-01-16,0.0,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.00000,...,0.0,0.000000e+00,60273.852070,0.000000e+00,0.0,0.000000e+00,-0.590,-6.70,-7.05,-0.252


In [7]:
# percent change
#
# TODO: the fill-in means below are computed over the whole sample, so they
# are inferred from future observations. Maybe use an expanding window instead.
#
# 0 -> 0 results in NaN percent change, replaced with 0
# 0 -> !0 results in +/-inf percent change, replaced with the column's mean
#         positive / negative percent change

raw_pct = data.pct_change()

# Zeros and infinities are excluded so they do not distort the means.
finite_nonzero = raw_pct.replace([0, np.inf, -np.inf], np.nan)
positive_means = finite_nonzero[raw_pct > 0].mean()
negative_means = finite_nonzero[raw_pct < 0].mean()

data_percent_change = (
    raw_pct.fillna(0)
    .replace(np.inf, positive_means)  # per-column fill value
    .replace(-np.inf, negative_means)
    .iloc[1:]  # the first row is nonsensical
    # Suffix EVERY column. The rename mapping used to be built from a single
    # leftover loop variable, so only one column received the "_pct" suffix.
    .rename(columns={name: f"{name}_pct" for name in data.columns})
)

In [8]:
# rolling mean and variance


def _tag_columns(frame, suffix):
    """Return ``frame`` with ``_<suffix>`` appended to each column name of ``data``."""
    return frame.rename(columns={name: f"{name}_{suffix}" for name in data.columns})


def _mean_and_var(windowed, label):
    """Return (mean, variance) of a window object, tagged ``rm<label>`` / ``rv<label>``."""
    return (
        _tag_columns(windowed.mean(), f"rm{label}"),
        _tag_columns(windowed.var(), f"rv{label}"),
    )


# one month
one_month_mean, one_month_var = _mean_and_var(
    data.rolling(window=4, min_periods=0), 4
)

# six month
six_month_mean, six_month_var = _mean_and_var(
    data.rolling(window=26, min_periods=0), 26
)

# one year
twelve_month_mean, twelve_month_var = _mean_and_var(
    data.rolling(window=52, min_periods=0), 52
)

# everything seen so far
expanding_mean, expanding_var = _mean_and_var(data.expanding(), "ex")

rolling_frames = [
    one_month_mean,
    one_month_var,
    six_month_mean,
    six_month_var,
    twelve_month_mean,
    twelve_month_var,
    expanding_mean,
    expanding_var,
]
data_rolling = pd.concat(rolling_frames, axis=1, join="inner")
data_rolling

Unnamed: 0_level_0,inflation_one_month_rm4,inflation_two_month_rm4,inflation_three_month_rm4,inflation_six_month_rm4,inflation_twelve_month_rm4,oil_reserves_algeria_rm4,oil_reserves_kuwait_rm4,oil_reserves_united_states_rm4,oil_reserves_middle_east_bp_rm4,oil_reserves_iran_rm4,...,oil_reserves_iran_rvex,oil_reserves_vietnam_rvex,oil_reserves_lower_middle_income_countries_rvex,oil_reserves_nigeria_rvex,oil_reserves_libya_rvex,oil_reserves_congo_rvex,opec_price_rvex,wti_price_rvex,brent_price_rvex,gas_price_rvex
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1993-04-05,0.400000,1.000000,1.500,2.200000,3.400000,1.254880e+09,1.314534e+10,4.329006e+09,9.017555e+10,1.266643e+10,...,,,,,,,,,,
1993-04-12,0.300000,0.800000,1.350,2.000000,3.450000,1.254880e+09,1.314465e+10,4.326791e+09,9.017712e+10,1.266645e+10,...,3.421441e+08,5.270944e+09,2.634534e+13,2.877432e+13,0.000000e+00,1.080642e+10,0.070312,0.096800,0.048050,0.000000
1993-04-19,0.266667,0.733333,1.300,1.933333,3.466667,1.254880e+09,1.314396e+10,4.324575e+09,9.017870e+10,1.266646e+10,...,6.842883e+08,1.054189e+10,5.269068e+13,5.754864e+13,0.000000e+00,2.161283e+10,0.041925,0.055900,0.030100,0.000040
1993-04-26,0.250000,0.700000,1.275,1.900000,3.475000,1.254880e+09,1.314328e+10,4.322359e+09,9.018028e+10,1.266647e+10,...,1.140480e+09,1.756981e+10,8.781779e+13,9.591441e+13,0.000000e+00,3.602139e+10,0.028175,0.037292,0.020692,0.000040
1993-05-03,0.200000,0.600000,1.200,1.800000,3.500000,1.254880e+09,1.314190e+10,4.317928e+09,9.018343e+10,1.266650e+10,...,1.710721e+09,2.635472e+10,1.317267e+14,1.438716e+14,0.000000e+00,5.403209e+10,0.028933,0.048770,0.016570,0.000061
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-26,0.175000,0.325000,0.650,1.950000,5.775000,1.664080e+09,1.384460e+10,9.378455e+09,1.140139e+11,2.152392e+10,...,1.411308e+19,4.235909e+16,2.697954e+19,8.510551e+17,1.571914e+18,8.845684e+15,1017.927390,868.934942,1093.023631,0.872612
2023-01-02,0.200000,0.300000,0.600,1.800000,5.700000,1.664080e+09,1.384460e+10,9.378455e+09,1.140137e+11,2.152392e+10,...,1.411365e+19,4.235357e+16,2.697537e+19,8.507617e+17,1.571812e+18,8.849417e+15,1017.814519,868.771130,1092.764333,0.872575
2023-01-09,0.200000,0.300000,0.600,1.800000,5.700000,1.664080e+09,1.384460e+10,9.378455e+09,1.140136e+11,2.152392e+10,...,1.411422e+19,4.234802e+16,2.697118e+19,8.504684e+17,1.571708e+18,8.853134e+15,1017.694394,868.671668,1092.544530,0.872697
2023-01-16,0.200000,0.300000,0.600,1.800000,5.700000,1.664080e+09,1.384460e+10,9.378455e+09,1.140135e+11,2.152392e+10,...,1.411477e+19,4.234246e+16,2.696699e+19,8.501752e+17,1.571603e+18,8.856833e+15,1017.402929,868.417118,1092.174988,0.872865


In [9]:
# Put all feature families side by side; the inner join keeps only the dates
# that are present in every one of them.
feature_frames = [data_lagged, data_deriv, data_percent_change, data_rolling]
data_basic = pd.concat(feature_frames, axis=1, join="inner")
data_basic

Unnamed: 0_level_0,inflation_one_month_l1,inflation_two_month_l1,inflation_two_month_l5,inflation_two_month_l10,inflation_three_month_l1,inflation_three_month_l5,inflation_three_month_l10,inflation_three_month_l14,inflation_three_month_l40,inflation_six_month_l1,...,oil_reserves_iran_rvex,oil_reserves_vietnam_rvex,oil_reserves_lower_middle_income_countries_rvex,oil_reserves_nigeria_rvex,oil_reserves_libya_rvex,oil_reserves_congo_rvex,opec_price_rvex,wti_price_rvex,brent_price_rvex,gas_price_rvex
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1993-04-12,0.4,1.0,,,1.5,,,,,2.2,...,3.421441e+08,5.270944e+09,2.634534e+13,2.877432e+13,0.000000e+00,1.080642e+10,0.070312,0.096800,0.048050,0.000000
1993-04-19,0.2,0.6,,,1.2,,,,,1.8,...,6.842883e+08,1.054189e+10,5.269068e+13,5.754864e+13,0.000000e+00,2.161283e+10,0.041925,0.055900,0.030100,0.000040
1993-04-26,0.2,0.6,,,1.2,,,,,1.8,...,1.140480e+09,1.756981e+10,8.781779e+13,9.591441e+13,0.000000e+00,3.602139e+10,0.028175,0.037292,0.020692,0.000040
1993-05-03,0.2,0.6,,,1.2,,,,,1.8,...,1.710721e+09,2.635472e+10,1.317267e+14,1.438716e+14,0.000000e+00,5.403209e+10,0.028933,0.048770,0.016570,0.000061
1993-05-10,0.2,0.6,1.0,,1.2,1.5,,,,1.8,...,2.395009e+09,3.689661e+10,1.844174e+14,2.014203e+14,0.000000e+00,7.564492e+10,0.024010,0.040427,0.013707,0.000066
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-26,0.2,0.3,0.4,0.7,0.6,0.8,1.2,1.3,1.9,1.8,...,1.411308e+19,4.235909e+16,2.697954e+19,8.510551e+17,1.571914e+18,8.845684e+15,1017.927390,868.934942,1093.023631,0.872612
2023-01-02,0.2,0.3,0.4,0.7,0.6,0.8,1.2,1.3,1.9,1.8,...,1.411365e+19,4.235357e+16,2.697537e+19,8.507617e+17,1.571812e+18,8.849417e+15,1017.814519,868.771130,1092.764333,0.872575
2023-01-09,0.2,0.3,0.4,0.7,0.6,0.8,1.2,1.3,1.9,1.8,...,1.411422e+19,4.234802e+16,2.697118e+19,8.504684e+17,1.571708e+18,8.853134e+15,1017.694394,868.671668,1092.544530,0.872697
2023-01-16,0.2,0.3,0.3,0.4,0.6,0.6,0.8,1.2,1.7,1.8,...,1.411477e+19,4.234246e+16,2.696699e+19,8.501752e+17,1.571603e+18,8.856833e+15,1017.402929,868.417118,1092.174988,0.872865


In [10]:
# Persist the combined feature table for later use.
data_basic.to_csv("../data/basic_features.csv")