# Generate BFRE placeholder data

The BFRE dataset is not publicly available. This notebook therefore generates completely synthetic data that mimics the structure of the BFRE data (same dates and dimensions), so that the ``bfre_analysis`` notebook can be run on the synthetic (placeholder) data.

## Imports

In [None]:
import os
import sys
# Import custom code---you can also just install mosaicperm via pip
sys.path.insert(0, "../../mosaicperm/")
import mosaicperm as mp
from mosaicperm.utilities import elapsed, vrange
from bfre_preprocessing import load_data, CACHE_DIR, DATA_DIR

# Typical imports
import time
import numpy as np
import pandas as pd
from scipy import stats
import scipy.sparse as sp
import datetime 

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from plotnine import *

# Save directory: every placeholder file generated in this notebook
# (exposures, factor names, industries, returns, asset names, simulation
# exposures) is written underneath this path.
PLACEHOLDER_DIR = "../data/bfre_placeholder"

## Create placeholder data

In [None]:
# Seed NumPy's global RNG so the generated placeholder data is reproducible.
# Several later cells re-seed with the same value before drawing.
np.random.seed(123)

In [None]:
# Load the original data through the project helper. In the cells shown in
# this notebook only data['outcomes'] is read, and only for its index and shape.
data = load_data()

In [None]:
## Reuse the date index and the (T, p) dimensions of the original outcomes.
## NOTE(review): only the index and shape are read here; no missing-value
## pattern is copied from the original data in this cell.
T, p = data['outcomes'].shape
dates = data['outcomes'].index
# Synthetic identifiers STOCK10000, STOCK10001, ... (one per column)
asset_ids = [f'STOCK{10000 + offset}' for offset in range(p)]

In [None]:
# Time-varying heteroskedasticity.
# timedeltas is the signed offset of each date from 2020-02-20, rescaled by
# 1e17 (presumably raw nanoseconds, so one unit is roughly 3.2 years -- confirm).
timedeltas = (dates - datetime.datetime(2020, 2, 20)).values.astype(float) / 1e17
# Scale is 1 up to the reference date; afterwards it is 1 + 1 / (0.5 + offset),
# i.e. it jumps to about 3 and decays back towards 1. Stored as a (T, 1) column
# so it broadcasts across assets.
sigma_t = (1.0 + (timedeltas > 0) / (0.5 + timedeltas)).reshape(-1, 1)

In [None]:
np.random.seed(123)
### Create fake exposures/industries
# Each asset is assigned one label drawn uniformly from the entries of `inds`.
# NOTE(review): 'HLC' is listed twice, so it is drawn twice as often and
# produces two identical indicator columns / factor names -- confirm intended.
inds = [
    'EGY', 'HLC', 'HLC',
    'CDI', 'IND', 'ITC', 'ITCSOFT',
    'UTL', 'MAT', 'CST',
    'FINREAL', 'FIN', 'FINBANK',
]
industries = pd.Series(
    np.random.choice(inds, size=len(asset_ids), replace=True),
    index=pd.Index(asset_ids, name='ASSET'),
    name='Industry',
)

### Create exposures
# industry exposures: one 0/1 indicator column per entry of `inds`,
# identical at every time step -> shape (T, p, len(inds))
ind_exposures = np.stack(
    [(industries == ind).astype(float) for ind in inds],
    axis=1,
)
ind_exposures = np.stack([ind_exposures] * T, axis=0)
# cts exposures: k heavy-tailed columns that are piecewise constant in time
k = 20
cts_exposures = np.zeros((T, p, k))
# markers where the exposures change (int(T/5) evenly spaced break points)
breakpoints = np.around(np.linspace(0, T, int(T/5))).astype(int)
starts, ends = breakpoints[0:-1], breakpoints[1:]
# Student-t degrees of freedom: 4 when sigma_t is at its maximum, larger
# (lighter tails) when sigma_t is smaller
dfs = 4 / (sigma_t / sigma_t.max())
for start, end in zip(starts, ends):
    # one (p, k) draw per segment, using the df at the segment's first date,
    # broadcast over every time step in [start, end)
    cts_exposures[start:end] = stats.t(df=dfs[start]).rvs(size=(p, k))

# Final exposures: intercept ('MARKET') | industry indicators | continuous factors
intercept = np.ones((T, p, 1))
exposures = np.concatenate((intercept, ind_exposures, cts_exposures), axis=-1)
factor_cols = np.array(
    ['MARKET', *inds, *(f'fake_factor{ell}' for ell in range(k))]
)

In [None]:
np.random.seed(123)
### Create residuals
# Asset-varying heteroskedasticity: a per-asset scale drawn from U(0.1, 1),
# combined with the per-date scale sigma_t.
sigma_j = np.random.uniform(0.1, 1, size=len(asset_ids))
epsilon = sigma_t * np.random.randn(*data['outcomes'].shape) * sigma_j
# Add non-null noise (the RNG is re-seeded with the same value first)
np.random.seed(123)
covid = datetime.datetime(2020, 2, 20)
sdate = datetime.datetime(2000, 1, 1)
## Create null violations for three industries
## starting at different times
violations = [
    # (industry prefix, first affected date, transient?)
    ('HLC', covid, True),
    ('FINREAL', sdate, False),
    ('ITC', sdate, False),
]
for industry, start_date, transient in violations:
    # positions of the affected dates
    t_inds = np.where(dates >= start_date)[0]
    # positions of assets whose label starts with the prefix
    # (so 'ITC' also matches 'ITCSOFT')
    j_inds = [
        i for i, asset in enumerate(asset_ids)
        if industries[asset].startswith(industry)
    ]
    # 50 draws with replacement (np.random.choice default), so repeats can occur
    nonnull_inds = np.random.choice(j_inds, 50)
    block = np.ix_(t_inds, nonnull_inds)
    if transient:
        # additional exposures vary over time: per selected asset, white noise
        # plus a linear trend from 1 down to 0.2, smoothed by a 200-point
        # moving average
        n_t = len(t_inds)
        trend = np.linspace(1, 0.2, n_t)
        kernel = np.ones(200) / 200
        addn_exposures = np.stack(
            [
                np.convolve(np.random.randn(n_t) + trend, kernel, mode='same')
                for _ in nonnull_inds
            ],
            axis=1,
        )
        # a single shock series shared by the selected assets, scaled by
        # each asset's own exposure path
        epsilon[block] += addn_exposures * np.random.randn(len(t_inds), 1)
    else:
        # a common shock (standard deviation 2) added to all selected assets
        epsilon[block] += 2 * np.random.randn(len(t_inds)).reshape(-1, 1)

## True outcomes: residuals plus exposures times random per-date factor returns
n_factors = exposures.shape[-1]
factor_returns = np.random.randn(T, 1, n_factors)
Y = epsilon + 2 * np.sum(exposures * factor_returns, axis=-1) / np.sqrt(n_factors)
# Rescale to an overall standard deviation of 1/5, then shift by 1
Y = Y / (5 * Y.std()) + 1

## Outcomes
outcomes = pd.DataFrame(Y, index=dates, columns=asset_ids)

In [None]:
# Save results
# Create the output directory first: np.save and DataFrame.to_csv do not
# create missing parent directories, so on a fresh checkout the writes below
# would otherwise fail.
os.makedirs(PLACEHOLDER_DIR, exist_ok=True)
# (T, p, n_factors) exposure array and the matching factor names
np.save(f"{PLACEHOLDER_DIR}/exposures.npy", exposures)
np.save(f"{PLACEHOLDER_DIR}/factor_cols.npy", factor_cols)
# One row per asset with its industry label
pd.DataFrame(industries).to_csv(f"{PLACEHOLDER_DIR}/industries.csv")
# Dates x assets table of synthetic outcomes
outcomes.to_csv(f"{PLACEHOLDER_DIR}/returns.csv")

In [None]:
# Identity mapping from asset id to asset name: the placeholder assets have
# no real names, so each id maps to itself.
asset_names = pd.Series(
    asset_ids,
    index=pd.Index(asset_ids, name="invariant_id"),
    name="name_sec",
)
asset_names.to_frame().to_csv(f"{PLACEHOLDER_DIR}/assets_id_to_name.csv")

## Simulation exposures

In [None]:
np.random.seed(123)
# Load the cached simulation exposures and replace them with Gaussian noise of
# the same shape, scaled by the column-wise and row-wise standard deviations
# of the original array.
L = np.load("../data/bfre_cache/simulation_exposures.npy")
col_sd = L.std(axis=0)
row_sd = L.std(axis=1).reshape(-1, 1)
Lfake = col_sd * np.random.randn(*L.shape) * row_sd
# np.save appends the ".npy" suffix automatically
np.save(f"{PLACEHOLDER_DIR}/simulation_exposures", Lfake)