In [1]:
# Project-local package: `core_utils` provides the data loader and characteristic
# groupings used below; `core_imputation_model` provides the imputation and
# factor-fitting routines.
from    imputation      import  core_utils, core_imputation_model
import  numpy           as      np
# Widget-based progress bars (need notebook widget support); kept as an alternative.
#from    tqdm.notebook   import  tqdm
# NOTE(review): `tqdm` is not referenced directly in the cells visible here — confirm it is still needed.
from    tqdm import tqdm
print("Imported Packages")

Imported Packages


# Loading the Data

`core_utils.get_data_panel` loads the data from the corresponding `data_path`, which is a feather file shared on Google Drive. We have to use Google Drive because the file is too large to host on GitHub. The data file contains the characteristic percentile ranks as a numpy array of shape TxNxL, where T is the number of dates, N the number of stocks, and L the number of characteristics. The file also includes the raw characteristics, the characteristic names, the dates, and the permnos.

In [2]:
# Relative path to the example data panel (a feather file); echoed so the
# location actually used is recorded in the notebook output.
data_path = "data/example_data.fthr"
print(data_path)

data/example_data.fthr


In [3]:
# Load the characteristic panel from `data_path`. The first return value is the
# percentile-rank panel that every imputation call below takes as input.
# NOTE(review): `start_date=19770000` is presumably a YYYYMMDD-style integer
# cutoff (i.e. keep data from 1977 on) — confirm against `get_data_panel`.
percentile_rank_chars, chars, date_vals, permnos = core_utils.get_data_panel(
    path=data_path,
    computstat_data_present_filter=True,
    start_date=19770000,
)

In [14]:
# Number of missing (NaN) entries in the input percentile-rank panel.
print(np.count_nonzero(np.isnan(percentile_rank_chars)))

345367


In [5]:
# Characteristic groupings shipped with the package. Per the printed value in the
# next cell, this is a list of (characteristic name, group code) pairs with codes
# such as 'Q', 'M', 'QM' and 'QY'.
# NOTE(review): the codes presumably describe update frequency / data source — confirm in core_utils.
char_groupings = core_utils.CHAR_GROUPINGS

In [6]:
# Show the (name, group code) pairs so the reader can see which characteristics are covered.
print(char_groupings)

[('A2ME', 'Q'), ('AC', 'Q'), ('AT', 'Q'), ('ATO', 'Q'), ('B2M', 'QM'), ('BETA_d', 'M'), ('BETA_m', 'M'), ('C2A', 'Q'), ('CF2B', 'Q'), ('CF2P', 'QM'), ('CTO', 'Q'), ('D2A', 'Q'), ('D2P', 'M'), ('DPI2A', 'Q'), ('E2P', 'QM'), ('FC2Y', 'QY'), ('IdioVol', 'M'), ('INV', 'Q'), ('LEV', 'Q'), ('ME', 'M'), ('TURN', 'M'), ('NI', 'Q'), ('NOA', 'Q'), ('OA', 'Q'), ('OL', 'Q'), ('OP', 'Q'), ('PCM', 'Q'), ('PM', 'Q'), ('PROF', 'QY'), ('Q', 'QM'), ('R2_1', 'M'), ('R12_2', 'M'), ('R12_7', 'M'), ('R36_13', 'M'), ('R60_13', 'M'), ('HIGH52', 'M'), ('RVAR', 'M'), ('RNA', 'Q'), ('ROA', 'Q'), ('ROE', 'Q'), ('S2P', 'QM'), ('SGA2S', 'Q'), ('SPREAD', 'M'), ('SUV', 'M'), ('VAR', 'M')]


# Running Imputations

In this section we will run the imputation method described in the paper.

Two methods we want to highlight are
- `core_imputation_model.run_imputation`
- `core_imputation_model.fit_factors_and_loadings`

The first function runs the full method as described in the paper, including potentially different time series information sets depending on the arguments given.

The second function estimates the cross-sectional (XS) factor model.

The examples below correspond to global and local fits. The parameters are documented in the function definition. 

In [7]:
# Panel dimensions: T dates, N stocks, L characteristics (the panel is T x N x L).
# T and L are reused in later cells (`num_months_train=T`, regularization 0.01 / L).
T, N, L = percentile_rank_chars.shape
print(f"T = {T:d}, N = {N:d}, L = {L:d}")

T = 12, N = 4469, L = 45


## Estimating the Model

We start with the local estimation. In this case, we show how to estimate either the purely cross-sectional model (local XS) or the cross-sectional model with backwards time series information (local B-XS). 

We would like to emphasize two parameters in this estimation. The first is the number of cross-sectional factors K, `n_xs_factors`; the second is the cross-sectional factor regularization gamma, `xs_factor_reg`.

These two hyperparameters have a significant impact on the performance of the model and should be selected carefully. The parameters we use in this example were selected for the data set from Missing Financial Data and should not be considered default parameters for alternative data sets.

In [37]:
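# Optional setup, presumably for widget-based progress bars (`tqdm.notebook`); left disabled.
# The recorded output below shows that the `jupyter nbextension` subcommand is not
# available in this Jupyter installation, so the command cannot be run as written.
#!jupyter nbextension enable --py widgetsnbextension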

usage: jupyter [-h] [--version] [--config-dir] [--data-dir] [--runtime-dir]
               [--paths] [--json] [--debug]
               [subcommand]

Jupyter: Interactive Computing

positional arguments:
  subcommand     the subcommand to launch

options:
  -h, --help     show this help message and exit
  --version      show the versions of core jupyter packages and exit
  --config-dir   show Jupyter config dir
  --data-dir     show Jupyter data dir
  --runtime-dir  show Jupyter runtime dir
  --paths        show all Jupyter paths. Add --json for machine-readable
                 format.
  --json         output paths as machine-readable json
  --debug        output debug information about paths

Available subcommands: kernel kernelspec migrate run troubleshoot

Jupyter command `jupyter-nbextension` not found.


In [8]:
# Local XS: purely cross-sectional imputation of the percentile-rank panel.
imputation = core_imputation_model.impute_data_xs(
    percentile_rank_chars,
    n_xs_factors=20,             # K: number of cross-sectional factors
    time_varying_loadings=True,
    xs_factor_reg=0.01 / L,      # gamma: cross-sectional factor regularization
    min_xs_obs=1,
)

[Parallel(n_jobs=30)]: Using backend LokyBackend with 30 concurrent workers.
[Parallel(n_jobs=30)]: Done   4 out of  12 | elapsed:   15.0s remaining:   30.1s
[Parallel(n_jobs=30)]: Done   7 out of  12 | elapsed:   15.0s remaining:   10.7s
[Parallel(n_jobs=30)]: Done  10 out of  12 | elapsed:   15.0s remaining:    2.9s
[Parallel(n_jobs=30)]: Done  12 out of  12 | elapsed:   15.0s finished


  0%|          | 0/12 [00:00<?, ?it/s]

resids rmse are  0.09522440538615277


0it [00:00, ?it/s]

In [13]:
# Number of entries that are still NaN after the XS imputation.
print(np.count_nonzero(np.isnan(imputation)))

148725


In [16]:
# Local B-XS: cross-sectional model with backwards time-series information.
# K (`n_xs_factors`) is 20 and gamma (`xs_factor_reg`) is 0.01 / L, i.e. the
# regularization is scaled by the number of characteristics.
bw_xs_imputation = core_imputation_model.impute_data_bxs(
    percentile_rank_chars, 
    n_xs_factors=20,
    time_varying_loadings=True,
    xs_factor_reg=0.01 / L,
    min_xs_obs=1
)

[Parallel(n_jobs=30)]: Using backend LokyBackend with 30 concurrent workers.
[Parallel(n_jobs=30)]: Done   4 out of  12 | elapsed:    7.3s remaining:   14.6s
[Parallel(n_jobs=30)]: Done   7 out of  12 | elapsed:    7.3s remaining:    5.2s
[Parallel(n_jobs=30)]: Done  10 out of  12 | elapsed:    7.3s remaining:    1.4s
[Parallel(n_jobs=30)]: Done  12 out of  12 | elapsed:    7.3s finished


  0%|          | 0/12 [00:00<?, ?it/s]

resids rmse are  0.09522440538615277


0it [00:00, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/45 [00:00<?, ?it/s]

In [19]:
# Number of entries that are still NaN after the B-XS imputation.
# NOTE(review): the recorded count (503775) is larger than the number of missing
# entries reported for the input panel (345367) — confirm this is expected.
print(np.count_nonzero(np.isnan(bw_xs_imputation)))

np.int64(503775)

In [20]:
# Estimate the cross-sectional (XS) factor model directly, using all T months
# for training, K = 20 factors and regularization 0.01 / L.
# `eval_data=None` means no evaluation data is supplied for an RMSE comparison.
# NOTE(review): `gamma_ts` / `lmbda` are presumably the estimated factors and
# loadings — confirm which is which in `fit_factors_and_loadings`.
gamma_ts, lmbda = core_imputation_model.fit_factors_and_loadings(
    char_panel=percentile_rank_chars, 
    min_chars=1, 
    K=20, 
    num_months_train=T,
    reg=0.01 / L,
    time_varying_lambdas=True,
    eval_data=None,
    run_in_parallel=True
)

[Parallel(n_jobs=30)]: Using backend LokyBackend with 30 concurrent workers.
[Parallel(n_jobs=30)]: Done   4 out of  12 | elapsed:   12.9s remaining:   26.0s
[Parallel(n_jobs=30)]: Done   7 out of  12 | elapsed:   13.0s remaining:    9.2s
[Parallel(n_jobs=30)]: Done  10 out of  12 | elapsed:   13.0s remaining:    2.5s
[Parallel(n_jobs=30)]: Done  12 out of  12 | elapsed:   13.0s finished


  0%|          | 0/12 [00:00<?, ?it/s]

resids rmse are  0.09522440538615277


# On the Selection of the Number of Factors and Regularization

Below we show the plots from Figures 8 \& 9 in the paper. These figures illustrate how to determine the optimal regularization and number of factors. In more detail, we evaluate the out-of-sample performance of the model for different numbers of factors and regularization strengths across a grid of these choices.

![example_of_cval.png](data/example_of_cval.png)

![reg_cval.png](data/reg_cval.png)

The `core_imputation_model.fit_factors_and_loadings` method accepts an `eval_data` argument. If provided, it is compared against the imputation and the RMSE is reported. This is a simple way to evaluate the choice of tuning parameters (number of factors and regularization) for the model.