# 0. Outline 
1. [Globals and generic functions](#generic)
2. [MMC statistic plots, main text](#mmc)
   - [Intro plot](#intro_plot)
   - [MMC combined Z-value plot](#mmc_comb)
   - [MMC ablation](#mmc_ablation)
3. [R2 plots, main text](#r2)
    - [Main R2 plot](#r2_main)
    - [Real estate](#real_estate)
4. [Appendix A.1, more sectors](#othersectors)
   - [MMC for other sectors](#mmc_othersectors)
   - [R2 for other sectors](#r2_othersectors)
5. [Appendix A.2, window sensitivity](#window)
6. [Appendix A.3, residual variance over time](#residvar)

## README

- The data for this analysis is not publicly available. To run this analysis using fake placeholder data, please set the global variable USE_PLACEHOLDER_DATA=True.
- To debug the analysis using only a few permutations (to save time), set DEBUG=True.

In [None]:
import numpy as np
# When True, the fake placeholder data is loaded instead of the real
# (non-public) data; see the README above.
USE_PLACEHOLDER_DATA = False
# When True, far fewer permutations are used; most save/cache steps below
# are skipped in this mode.
DEBUG = True
# Seed numpy's global random number generator.
np.random.seed(123)

# 1. Globals and generic functions <a class="anchor" id="generic"></a>

In [None]:
import os
import sys
# Import mosaicperm---you can also just install mosaicperm via pip
sys.path.insert(0, "../../mosaicperm/")
import mosaicperm as mp
if mp.__version__ != '0.1.4':
    raise ValueError(f"Using mosaicperm version={mp.__version__}. Use version 0.1.4 or the results may fail to replicate.")
from mosaicperm.utilities import elapsed, vrange
from bfre_preprocessing import load_data, CACHE_DIR, DATA_DIR, PLACEHOLDER_DIR
from bfre_preprocessing import mmc_stat, compute_active_subset

# Typical imports
import time
import numpy as np
import pandas as pd
from scipy import stats
import scipy.sparse as sp
import datetime 

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from plotnine import *

# Save directory. The appendix figures are written to an "appendix"
# subdirectory, so create it up front (this also creates SAVE_DIR itself);
# otherwise saving those figures fails on a fresh checkout.
SAVE_DIR = "../data/bfre_results/"
os.makedirs(os.path.join(SAVE_DIR, "appendix"), exist_ok=True)
# Directory the raw input files are read from
if not USE_PLACEHOLDER_DATA:
    data_dir = DATA_DIR
else:
    data_dir = PLACEHOLDER_DIR

# For R^2 plots
TRAIN_START = datetime.datetime(2015, 1, 1)
TRAIN_END = datetime.datetime(2020, 9, 1)
# minimum window of 9 months in the plot, else the estimates are really noisy
PLOT_START = TRAIN_END + datetime.timedelta(days=270) 

# Cache final data before plots
os.makedirs("preplot_data/", exist_ok=True)

# For plotting
THRESHOLD_NAME = 'Mosaic significance threshold'

In [None]:
names = ['EGY', 'FIN', 'HLC', 'ITC', 'IND', 'CDI']
# Sector code -> display name used in plot panel titles
sector_names = { 
    "FIN":"Financials",
    "EGY":"Energy",
    "CDI":"Consumer Disc.",
    "HLC":"Healthcare",
    "IND":"Industrials",
    "ITC":"Tech",
    "ITCSOFT":"Software (Tech)",
    "FINREAL":"Real estate (Finance)",
    "UTL":"Utilities",
    "MAT":"Materials",
    "CST":"Consumer Staples",
    # all stocks
    "assets_na50":"US stocks",
}
# Any code without a display name is displayed as itself
for name in names:
    if name not in sector_names:
        sector_names[name] = name

def names_to_ncode(series):
    """
    Maps display names (e.g. "Tech (120 assets)") back to sector codes.

    Parameters
    ----------
    series : pd.Series
        Strings that contain one of the display names in ``sector_names``.

    Returns
    -------
    pd.Series
        Copy of ``series`` in which every entry containing a display name
        is replaced by the corresponding sector code. Entries that match
        no display name (including missing values) are left unchanged.
    """
    result = series.copy()
    unassigned = pd.Series(True, index=series.index)
    # Longest display names first, so that e.g. "Software (Tech)" is not
    # captured by the shorter "Tech".
    by_length = sorted(
        sector_names.items(), key=lambda item: len(item[1]), reverse=True
    )
    for code, label in by_length:
        # Match literally against the *original* strings: labels contain
        # regex metacharacters such as "(" and ".", and already-assigned
        # codes must not be re-matched.
        hit = series.str.contains(label, regex=False, na=False) & unassigned
        result[hit] = code
        unassigned &= ~hit
    return result

# Named events (label, date) that can be marked on the time-series plots
events = [
    ('Covid', datetime.date(2020, 2, 20)),
    #('Vaccine', datetime.date(year=2021, month=6, day=21)),
    #('Ukraine', datetime.date(year=2022, month=2, day=24)),
]

In [None]:
# Table of asset identifiers and security names, plus lookups in both directions
asset_names = pd.read_csv(f"{data_dir}/assets_id_to_name.csv")
asset_names = asset_names.rename(
    columns={"invariant_id":"ASSET", "name_sec":"name"}
)
names2asset = asset_names.set_index("name")['ASSET']
asset2names = asset_names.set_index("ASSET")['name']

In [None]:
def compute_many_ols_stats(mpt):
    """
    Evaluates the test statistic of a MosaicFactorTest object on OLS residuals.

    The OLS residuals are computed once from ``mpt.outcomes`` and
    ``mpt.exposures``; ``mpt.test_stat`` is then applied to the slice
    ``[start:end]`` of those residuals for each pair in
    ``zip(mpt.starts, mpt.ends)``. Returns the results as a numpy array.
    """
    ols_resid = mp.factor.ols_residuals(mpt.outcomes, exposures=mpt.exposures)
    return np.array([
        mpt.test_stat(ols_resid[lo:hi], **mpt.tstat_kwargs)
        for lo, hi in zip(mpt.starts, mpt.ends)
    ])

def p_to_z(p):
    """Converts p-value(s) to one-sided normal Z-statistics, clipped below at zero."""
    z = stats.norm.ppf(1 - p)
    return np.maximum(z, 0)

In [None]:
def section_1_data(
    names=('HLC', 'EGY', 'FIN'),
    window=350,
    reps=5000,
    n_timepoints=100,
    start_date=datetime.datetime(2017, 1, 1),
    test_stat=mmc_stat,
    shift2match=True,
    full_only=False,
    mpt_kwargs=None,
    **kwargs,
):
    """
    Runs the sliding-window mosaic test for several sectors and collects
    the results in long format for plotting.

    Parameters
    ----------
    names : sequence of str
        Sector codes (keys of ``sector_names``) to analyze. The
        significance threshold is Bonferroni-corrected by ``len(names)``.
    window : int
        Passed to ``MosaicFactorTest.fit_tseries`` as ``window``.
    reps : int
        Passed to ``fit_tseries`` as ``nrand``.
    n_timepoints : int
        Passed to ``fit_tseries`` as ``n_timepoints``.
    start_date : datetime.datetime
        Passed to ``load_data``.
    test_stat : callable
        Test statistic passed to ``MosaicFactorTest``.
    shift2match : bool
        If True, for the 'Full BFRE' model the mosaic statistic and its
        threshold are shifted by a constant so that the mosaic statistic
        has the same time-average as the OLS statistic.
    full_only : bool or int
        If it equals 1 (e.g. ``True``), the 'Missing Style Factors' model
        is skipped; if it equals -1, the 'Full BFRE' model is skipped.
    mpt_kwargs : dict, optional
        Extra keyword arguments for ``MosaicFactorTest``.
    **kwargs
        Extra keyword arguments for ``load_data``.

    Returns
    -------
    pd.DataFrame
        Long-format frame with columns ``Date``, ``name``, ``ncode``,
        ``slabel``, ``variable`` and ``value``; ``variable`` takes the
        values 'Tq', 'T', 'Tols', 'Zapprx' and 'pval'.
    """
    mpt_kwargs = {} if mpt_kwargs is None else mpt_kwargs
    t0 = time.time()
    dfs4plot = []
    ### Loop through analyses
    for name in names:
        print(f"At name={name} at {elapsed(t0)}.")
        for which_factors, slabel in zip(
            ['all', 'industry'],
            ['Full BFRE', 'Missing\nStyle Factors']
        ):
            if 'Missing' in slabel and full_only==1:
                continue
            if 'Full' in slabel and full_only==-1:
                continue
            if name == 'EGY':
                # Exclude EGYOGINT subindustry as specified in the paper;
                # this is full of subsidiaries/partner corporations.
                to_exclude = ['EGYOGINT']
            else:
                to_exclude = None
            data = load_data(
                industry=name, 
                which_factors=which_factors,
                use_placeholder=USE_PLACEHOLDER_DATA,
                to_exclude=to_exclude,
                start_date=start_date,
                **kwargs,
            )

            ### Run mosaic test ###
            mpt = mp.factor.MosaicFactorTest(
                outcomes=data['outcomes'],
                exposures=data['exposures'],
                test_stat=test_stat,
                **mpt_kwargs,
            )
            mpt.fit_tseries(nrand=reps, n_timepoints=n_timepoints, window=window)

            ### Approximate Z-statistics ###
            Zapprx = (mpt.stats_tseries - mpt.null_tseries.mean(axis=1)) / mpt.null_tseries.std(axis=1)
    
            ### Compute OLS statistics ###
            Ts_ols = compute_many_ols_stats(mpt)
    
            ### Extract and save---uses bonferroni correction across industries
            Ts = mpt.stats_tseries[:, 0]
            Tq = np.quantile(mpt.null_tseries[:, :, 0], 1-0.05/len(names), axis=1)
            pvals = mpt.pval_tseries
            dates = data['outcomes'].index[mpt.ends-1]
            if shift2match and slabel == 'Full BFRE':
                shift = Ts_ols.mean() - Ts.mean()
                print(f"At name={name}, shift={np.around(shift, 2)}")
            else:
                shift = 0
            # Build new arrays rather than shifting in place: Ts is sliced
            # from mpt.stats_tseries, so "+=" would also modify the fitted
            # test object.
            Tq = Tq + shift
            Ts = Ts + shift
    
            # Append to output
            df = pd.DataFrame(
                [dates, Tq, Ts, Ts_ols, Zapprx, pvals], 
                index=['Date', 'Tq', 'T', 'Tols', 'Zapprx', 'pval']
            ).T
            nassets = data['outcomes'].shape[1]
            df['ncode'] = name
            df['name'] = sector_names[name] + f" ({nassets} assets)"
            df['slabel'] = slabel
            dfs4plot.append(df)
    
    df4plot = pd.concat(dfs4plot, axis='index')
    df4plot = pd.melt(df4plot, id_vars=['Date', 'name', 'ncode', 'slabel'])
    df4plot['value'] = df4plot['value'].astype(float)
    print(f"Finished at {elapsed(t0)}.")
    return df4plot

# 2. MMC statistic analysis, main text <a class="anchor" id="mmc"></a>

In [None]:
# Run the main MMC analysis; only cache full runs on the real data
df4plot = section_1_data(
    test_stat=mmc_stat,
    reps=20 if DEBUG else 5000,
)
if not (DEBUG or USE_PLACEHOLDER_DATA):
    df4plot.to_csv("preplot_data/mmc_data.csv")

In [None]:
# ## After caching the output, we can mess with the plots by loading the data
# df4plot = pd.read_csv("preplot_data/mmc_data.csv", index_col=0)
# df4plot['Date'] = pd.to_datetime(df4plot['Date'])

In [None]:
# Date to star: the observed date closest to 20 May 2020
_gap_to_target = np.abs(
    df4plot['Date'] - datetime.datetime(year=2020, month=5, day=20)
)
stardate = df4plot.iloc[np.argmin(_gap_to_target)]['Date']
# Show the rows at that date
df4plot.loc[df4plot['Date'] == stardate]

## 2.1 Intro plot <a class="anchor" id="intro_plot"></a>

In [None]:
## Plot metadata: panel titles per sector code and the "Covid" text labels
ncode2name = df4plot[['ncode', 'name']].drop_duplicates().set_index("ncode")['name']
covid = datetime.datetime(year=2020, month=2, day=20)
# One label per sector, placed at the given height on the y-axis
annots = pd.DataFrame(
    [
        [covid, height, ncode] for ncode, height in [
            ('EGY', 0.38),
            ('FIN', 0.42),
            ('ITC', 0.35),
            ('IND', 0.35),
            ('HLC', 0.32),
            ('CDI', 0.35),
        ]
    ],
    columns=['Date', 'value', 'ncode']
)
annots['label'] = 'Covid'
annots['name'] = annots['ncode'].map(ncode2name)

## Work on a copy so df4plot itself is never modified by the plotting cells
sub = df4plot.copy()

In [None]:
## Figure 1 and 4
# Draws two variants of the same plot for the 'Full BFRE' model:
# 'full' (all mapped curves) and 'no_quantile' (OLS curve only).
shift2match = True
subncodes = ['EGY', 'FIN', 'HLC']
for suffix in ['full', 'no_quantile']:
    # For one variant, only plot OLS curves
    if suffix == 'full':
        sub0 = sub.copy()
    else:
        # keep only variables whose name contains "ols" (i.e. 'Tols')
        sub0 = sub.loc[~sub['variable'].str.contains("95")].copy()
        sub0 = sub0.loc[sub0['variable'].str.contains("ols")]
    sub0 = sub0.loc[sub0['ncode'].isin(subncodes)]
    sub0 = sub0.loc[sub0['slabel'].str.contains("Full")]
    subannots = annots.loc[annots['ncode'].isin(sub0['ncode'].unique())]
    # the suffix only affects the output file name and the palette choice below
    if shift2match:
        suffix = suffix + '_shift'
        
    # Names
    sub0['variable'] = sub0['variable'].map({
        "T":r"Mosaic stat. $(S_t)$",
        "Tols":r"OLS stat. $(S_t^{ols})$" ,
        "Tq":THRESHOLD_NAME,
    })
    # get rid of any variables which aren't in the previous dictionary, 
    # e.g. p-values, which aren't plotted
    sub0 = sub0.loc[sub0['variable'].notnull()] 
    # Choose where the stars go 
    # (on the OLS curve, at the date stored in `stardate`)
    sdata = sub0.loc[sub0['Date'] == stardate].copy()
    sdata = sdata.loc[sdata['variable'].str.contains("OLS")]
    g = (
        ggplot(
            sub0, 
            aes(x='Date', y='value', linetype='variable', color='variable')
        ) 
        + geom_line(size=1) 
        + facet_wrap("~name", ncol=4)
        + theme_bw()
        + theme(figure_size=(8, 3))
        #+ theme(axis_text_x = element_text(angle = 90))
        # dotted vertical line and rotated text label at the Covid date
        + geom_vline(data=subannots, mapping=aes(xintercept='Date'), color='black', linetype='dotted')
        + geom_text(
            data=subannots, 
            mapping=aes(x='Date', y='value', label='label'), 
            inherit_aes=False, size=9, angle=90, nudge_x=-60
        )
        # star marker; the date string becomes the legend entry
        + geom_point(
            mapping=aes(shape='Date.astype(str)'),
            data=sdata, size=4.5, fill='white', color='black',
        )
        # only label ticks that fall on January 1st, using the year
        + scale_x_datetime(
            labels=lambda lst: [x.year if x.month==1 and x.day==1 else "" for x in lst]
        )
        + labs(
            y=r'Mean Max. Corr. ($S_t$)', 
            x='',
            title='',
        )
        #+ ylim(None, 0.52)
    )
    # three curves in the 'full' variant, a single curve otherwise
    if 'full' in suffix:
        g += scale_color_manual(['orangered', 'cornflowerblue', 'indigo'])
        g += scale_linetype_manual(['dotted', 'solid', 'solid'])
    else:
        g += scale_color_manual(['indigo'])
        g += scale_linetype_manual(['solid'])
    g += scale_shape_manual(['*'])
    g += labs(color='', linetype='', shape='')
    g += theme(legend_position="bottom", legend_box_spacing=-0.01)
    # only save figures from full runs on the real data
    if not DEBUG and not USE_PLACEHOLDER_DATA:
        g.save(f"{SAVE_DIR}/mmc_{suffix}_sec1.png", dpi=500)
    print(g)

## 2.2 Combined approximate and exact Z-statistics  <a class="anchor" id="mmc_comb"></a>

In [None]:
# Exact p-values, converted to Z-statistics
sub_pval = sub.copy()
sub_pval = sub_pval.loc[sub_pval['variable'] == 'pval']
sub_pval['value'] = p_to_z(np.minimum(1, 3*sub_pval['value'].astype(float))) # Bonferroni across industries
sub_pval['variable'] = 'Exact Z-statistic'
# Approximate p-values
sub_apprx = sub.loc[sub['variable'] == 'Zapprx'].copy()
sub_apprx['value'] = np.maximum(sub_apprx['value'], 0)
sub_apprx['variable'] = 'Apprx. Z-statistic'
# Concatenate
sub0 = pd.concat([sub_pval, sub_apprx], axis=0)

# Annotations: one copy of the labels per 'variable' facet
subannots = annots.loc[annots['ncode'].isin(subncodes)]
sa0 = subannots.copy()
sa0['variable'] = sub_pval['variable'].unique()[0]
sa1 = subannots.copy()
sa1['variable'] = sub_apprx['variable'].unique()[0]
# Stack the two copies as rows; concatenating along columns would
# instead produce a frame with every column duplicated.
subannots = pd.concat([sa0, sa1], axis=0)

# Choose where the stars go
sdata = sub0.loc[sub0['Date'] == stardate].copy()
# Make plot
g = (
    ggplot(
        sub0, 
        aes(x='Date', y='value', color='slabel', linetype='slabel')
    ) 
    + geom_line(size=1.25)
    + facet_wrap("~variable+name", scales='free_y')
    + theme_bw()
    + theme(figure_size=(8, 4.5))
    + theme(axis_text_x = element_text(angle = 90))
    + geom_point(
        mapping=aes(shape='Date.astype(str)'),
        data=sdata, size=4.5, fill='white', color='black',
    )
    # one-sided 5% normal critical value
    + geom_hline(yintercept=stats.norm.ppf(1 - 0.05), linetype='dotted')
    # only label ticks that fall on January 1st, using the year
    + scale_x_datetime(
        labels=lambda lst: [x.year if x.month==1 and x.day==1 else "" for x in lst]
    )
    + labs(
        #y=r'(S - E[S]$)_+$ / sd(S)        $\Phi^{-1}(1-p_{val})_+$',
        y='Z-statistic value',
        x='',
        title='Approximate and exact Z-statistics',
    )
    + ylim(0, None)
)
g += scale_color_manual(['cornflowerblue', 'red'])
g += scale_linetype_manual(['solid', 'dotted'])
g += scale_shape_manual(['*'])
g += labs(color='', linetype='', shape='')
g += theme(legend_position="bottom", legend_box_spacing=-0.01)
# only save figures from full runs on the real data
if not DEBUG and not USE_PLACEHOLDER_DATA:
    g.save(f"{SAVE_DIR}/mmc_Zcomb.png", dpi=500)
print(g)

## 2.3 MMC ablation statistics <a class="anchor" id="mmc_ablation"></a>

In [None]:
# "Covid" text labels for the ablation plot: (date, height on y-axis, sector)
abannots = pd.DataFrame(
    [[covid, 0.4, 'EGY'],
    [covid, 0.45, 'FIN'],
    [covid, 0.5, 'ITC'],
    [covid, 0.5, 'IND'],
    [covid, 0.35, 'HLC'],
    [covid, 0.5, 'CDI']],
    columns=['Date', 'value', 'ncode']
)
abannots['label'] = 'Covid'
# Look up panel titles from this frame's own sector codes rather than
# from the separate `annots` frame.
abannots['name'] = abannots['ncode'].map(ncode2name)

In [None]:
## Ablation figure: same layout as the intro plot, but for the models
## labelled "Missing" (style factors removed); saved as mmc_ablation.png
subncodes = ['EGY', 'FIN', 'HLC']
sub0 = df4plot.copy()
sub0 = sub0.loc[sub0['ncode'].isin(subncodes)]
sub0 = sub0.loc[sub0['slabel'].str.contains("Missing")]
subannots = abannots.loc[abannots['ncode'].isin(sub0['ncode'].unique())]

# Names
sub0['variable'] = sub0['variable'].map({
    "T":r"Mosaic stat. $(S_t)$",
    "Tq":THRESHOLD_NAME
})
sub0 = sub0.loc[sub0['variable'].notnull()] # e.g., p-vals aren't plotted

# Choose where the stars go
# NOTE(review): after the mapping above no variable name contains "OLS",
# so `sdata` is empty and no star can be drawn. The appendix plots filter
# on "Mosaic" instead -- confirm whether a star is intended here.
sdata = sub0.loc[sub0['Date'] == stardate].copy()
sdata = sdata.loc[sdata['variable'].str.contains("OLS")]
g = (
    ggplot(
        sub0, 
        aes(x='Date', y='value', linetype='variable', color='variable')
    ) 
    + geom_line(size=1) 
    + facet_wrap("~name")
    + theme_bw()
    + theme(figure_size=(8, 3))
    # dotted vertical line and rotated text label at the Covid date
    + geom_vline(data=subannots, mapping=aes(xintercept='Date'), color='black', linetype='dotted')
    + geom_text(
        data=subannots, 
        mapping=aes(x='Date', y='value', label='label'), 
        inherit_aes=False, size=9, angle=90, nudge_x=-80
    )
    + geom_point(
        mapping=aes(shape='Date.astype(str)'),
        data=sdata, size=4.5, fill='white', color='black',
    )
    # only label ticks that fall on January 1st, using the year
    + scale_x_datetime(
        labels=lambda lst: [x.year if x.month==1 and x.day==1 else "" for x in lst]
    )
    + labs(
        y=r'Mean Max. Corr. ($S_t$)', 
        x='',
        title='',
    )
    #+ ylim(None, 0.52)
)
g += scale_color_manual(['orangered', 'blue'])
g += scale_linetype_manual(['dotted', 'solid', 'solid'])
g += scale_shape_manual(['*'])
g += labs(color='', linetype='', shape='')
g += theme(legend_position="bottom", legend_box_spacing=-0.01)
# only save figures from full runs on the real data
if not DEBUG and not USE_PLACEHOLDER_DATA:
    g.save(f"{SAVE_DIR}/mmc_ablation.png", dpi=500)
print(g)

# 3. R2 plots, main text <a class="anchor" id="r2"></a>

## 3.1 Main R2 plot <a class="anchor" id="r2_main"></a>

In [None]:
def r2_plot_data(
    window=350,
    reps=5000,
    names=('EGY', 'HLC', 'FIN'),
    full_only=False,
    start_date=TRAIN_START,
    train_end=TRAIN_END,
    train_kwargs=None,
    **kwargs,
):
    """
    Computes the sliding-window bi-cross-validation R^2 results for plotting.

    For each sector and each model variant, the data up to ``train_end``
    is used to estimate candidate new exposures from the mosaic residuals,
    and ``MosaicBCV`` is then fit on the remaining data.

    Parameters
    ----------
    window : int
        Passed to ``MosaicBCV.fit_tseries`` as ``window``.
    reps : int
        Passed to ``fit_tseries`` as ``nrand``.
    names : sequence of str
        Sector codes (keys of ``sector_names``) to analyze.
    full_only : bool
        If True, every model variant whose label contains 'Missing' is skipped.
    start_date : datetime.datetime
        Passed to ``load_data``.
    train_end : datetime.datetime
        Observations with index <= ``train_end`` form the first split.
    train_kwargs : dict, optional
        Extra keyword arguments for the ``MosaicFactorTest`` fit on the
        first split.
    **kwargs
        Extra keyword arguments for ``load_data``.

    Returns
    -------
    pd.DataFrame
        One row per window with columns ``Date``, ``pval``, ``name``,
        ``ncode``, ``slabel``, ``zval`` and ``maxr2``.
    """
    train_kwargs = {} if train_kwargs is None else train_kwargs
    dfs = []
    t0 = time.time()
    for industry in names:
        print(f"At industry={industry} at {elapsed(t0)}.")

        # Types of models to loop through
        which_factors = ['all', 'industry']
        slabels = ['Full BFRE', 'Missing\nStyle Factors']
        exclusions = [None, None]
        if industry == 'FIN':
            which_factors.append('all')
            slabels.append('Missing\nReal estate assets')
            exclusions.append("FINREAL")
        if industry == 'EGY':
            # Exclude EGYOGINT industry, which includes mostly partner/shell corporations,
            exclusions = ["EGYOGINT", "EGYOGINT"]
        for which_fac, slabel, to_exclude in zip(
            which_factors, slabels, exclusions,
        ):
            if full_only and 'Missing' in slabel:
                continue
            data = load_data(
                industry=industry,
                which_factors=which_fac, 
                to_exclude=to_exclude,
                use_placeholder=USE_PLACEHOLDER_DATA,
                start_date=start_date,
                **kwargs,
            )
            ### Data
            outcomes = data['outcomes']
            exposures = data['exposures']
            ### First split of data
            # position of the first observation after train_end
            # NOTE(review): this is 0 if no observation is after train_end
            n0 = np.argmin(outcomes.index <= train_end)
            # exposures on first split (NaNs are zeroed in place)
            exp_train = exposures[0:n0]
            exp_train[np.isnan(exp_train)] = 0
            # outcomes on first split
            out_train = outcomes.fillna(0).values[0:n0]
            # estimate new potential exposures
            mpt_train = mp.factor.MosaicFactorTest(
                outcomes=out_train, 
                exposures=exp_train,
                test_stat=None,
                **train_kwargs
            )
            mpt_train.compute_mosaic_residuals()
            resid0 = mpt_train.residuals
            asub = compute_active_subset(resid0, thresh=0.2)
            qs = np.around(np.linspace(20, len(asub), 10)).astype(int) / len(asub)
            new_exp = mp.statistics.approximate_sparse_pcas(
                np.cov(resid0[:, asub].T), quantiles=qs
            )
            # add zero padding
            new_exposures = np.zeros((new_exp.shape[0], resid0.shape[1]))
            new_exposures[:, asub] = new_exp
    
            ### Second split of data
            mpt = mp.factor.MosaicBCV(
                outcomes=outcomes.iloc[n0:],
                exposures=exposures[n0:],
                new_exposures=new_exposures,
            )
            mpt.fit_tseries(nrand=reps, window=window, convolution_mode='full')
    
            ## Use maximum of r2s as test statistic
            maxr2s = np.max(mpt.stats_tseries, axis=-1)
            nullr2s = np.max(mpt.null_tseries, axis=-1)
            pval_tseries = np.sum(maxr2s.reshape(-1, 1) <= nullr2s, axis=1) + 1
            pval_tseries = pval_tseries / (1 + reps)

            ## Get rid of the "tail" of the convolution, 
            # i.e., windows of less than ``window`` ending at
            # the last observed time-point.
            non_boundary_inds = np.arange(np.argmax(mpt.ends))
            dx = outcomes.iloc[n0:].index[mpt.ends[non_boundary_inds]-1]
            # NOTE(review): the 'pval' column uses mpt.pval_tseries, while
            # 'zval' below uses the max-R^2 p-values computed above --
            # confirm that this difference is intended.
            df = pd.DataFrame(
                [dx, mpt.pval_tseries[non_boundary_inds]],
                index=['Date', 'pval']
            ).T
            
            # This is a hack which adjusts the labelled # assets for FIN minus FINREAL
            # so that it doesn't get its own panel in the plot
            if 'Real estate' not in slabel: 
                n_assets = outcomes.shape[1]
            
            # Save metadata
            df['name'] = sector_names[industry] + f" ({n_assets} assets)"
            df['ncode'] = industry
            df['slabel'] = slabel
            df['zval'] = p_to_z(np.minimum(3*pval_tseries[non_boundary_inds], 1)) # bonferroni correction
            df['maxr2'] = maxr2s[non_boundary_inds]
            dfs.append(df)
        
    df4plot = pd.concat(dfs, axis='index')
    return df4plot

In [None]:
# Half-width of the significance ribbon for each sector code:
# 0.003 by default, narrower for EGY and HLC.
RIBBON_DELTAS = {
    code: 0.003
    for code in ("EGY", "FIN", "HLC", "ITC", "CDI", "IND", "CST", "MAT", "UTL")
}
RIBBON_DELTAS.update({"EGY": 0.0015, "HLC": 0.002})

def retrieve_fill(bucket):
    """
    Returns the fill color for a p-value bucket label: 'white' if the
    label contains 'geq', else 'gray' if it contains '0.05', else 'black'.
    """
    for marker, fill in (('geq', 'white'), ('0.05', 'gray')):
        if marker in bucket:
            return fill
    return 'black'

def r2_ribbon_plot(r2data, figure_size=(8, 3.25)):
    """
    Plots the max. bi-cross R^2 over time with significance ribbons.

    Parameters
    ----------
    r2data : pd.DataFrame
        Frame with columns ``Date``, ``maxr2``, ``pval``, ``slabel``,
        ``name`` and ``ncode`` (as returned by ``r2_plot_data``).
    figure_size : tuple
        Figure size passed to the plotnine theme.

    Returns
    -------
    The plotnine plot: one panel per ``name``, one line per ``slabel``,
    and a ribbon around each line filled according to the
    Bonferroni-corrected p-value bucket.
    """
    comb = r2data.copy()
    ## Hack to make legend spacing nicer
    comb['slabel'] = comb['slabel'].apply(
        lambda x: x if "Style" not in x else x + "       "
    )
    # multiplicity correction
    ncodes = len(r2data['ncode'].unique())
    ## P-value buckets
    comb['pval_bucket'] = pd.cut(
        np.minimum(ncodes*comb['pval'], 1), bins=[0, 1e-3, 5e-2, 1.01]
    ).astype(str)
    def pval_bucket_fmt(x):
        # turn the interval string produced by pd.cut into a legend label
        if '(0.05, 1' in x:
            return r"$p_{val} \geq 0.05$"
        elif '(0.0, 0.001]' in x:
            return r"$p_{val} \leq 0.001$"
        else:
            return r"$p_{val} \leq 0.05$"
    comb['pval_bucket'] = comb['pval_bucket'].apply(pval_bucket_fmt)
    # ordered from least to most significant; matches the fill colors below
    comb['pval_bucket'] = pd.Categorical(
        comb['pval_bucket'], ordered=True, categories=[
            pval_bucket_fmt("(0.05, 1"),
            pval_bucket_fmt("(0.001, 0.05)"),
            pval_bucket_fmt("(0.0, 0.001]"),
        ]
    )
    
    # Width of ribbon
    comb['ribbon_delta'] = comb['ncode'].map(RIBBON_DELTAS)
    comb['ribbon_alpha'] = (comb['pval'] <= 0.05).astype(float)

    ## Plot
    # NOTE(review): the axis_text_x rotation is added *before* theme_bw();
    # check whether the later complete theme overrides it. The order is
    # kept unchanged here.
    g = (
        ggplot(
            comb,
            aes(x='Date', y='maxr2', color='slabel', group='slabel')
        ) 
        + facet_wrap("~name", scales='free')
        + theme(axis_text_x = element_text(angle = 90))
        # only label ticks that fall on January 1st, using the year
        + scale_x_datetime(
            labels=lambda lst: [x.year if x.month==1 and x.day==1 else "" for x in lst]
        )
        + theme_bw()
        + theme(figure_size=figure_size)
        + scale_color_manual(['cornflowerblue', 'green', 'red', 'black'])
        + labs(
            x='',
            y=r'Max. Bi-cross $R^2$',
            title='',
            color='Model',
            fill='Significance',
        )
        + theme(legend_position="bottom", legend_box_spacing=-0.01)
    )

    ## Add the significance ribbons.
    ## These seem to stretch the abilities of ggplot
    for slabel in comb['slabel'].unique():
        for name in comb['name'].unique():
            sub = comb.loc[
                (comb['slabel'] == slabel) &
                (comb['name'] == name)
            ].sort_values("Date")
            # positions where a run of identical buckets starts / ends
            starts = np.where(sub['pval_bucket'] != sub['pval_bucket'].shift(1))[0]
            ends = np.where(sub['pval_bucket'] != sub['pval_bucket'].shift(-1))[0]
            for start, end in zip(starts,ends):
                bucket = sub.iloc[start]['pval_bucket']
                # non-significant runs get a fully transparent ribbon.
                # Raw string: "\g" is not a valid escape sequence.
                alpha = (1 - float(r"\geq" in bucket)) * 1.0
                g += geom_ribbon(
                    data=sub.iloc[start:(end+1)],
                    mapping=aes(
                        x='Date', 
                        ymin='maxr2-ribbon_delta', 
                        ymax='maxr2+ribbon_delta',
                        group='name+slabel',
                        fill='pval_bucket',
                    ), 
                    color='none',
                    inherit_aes=False,
                    alpha=alpha,
                )
    g += scale_fill_manual(['white', 'orange', 'black'])
    # draw the lines last so they sit on top of the ribbons
    g += geom_line(data=comb, mapping=aes(x='Date', y='maxr2', color='slabel', group='slabel'), size=1)
    return g

In [None]:
# Run the main R2 analysis; only cache full runs on the real data
df4plot = r2_plot_data(reps=20 if DEBUG else 5000)
if not (USE_PLACEHOLDER_DATA or DEBUG):
    df4plot.to_csv("preplot_data/r2_plot_data.csv")

In [None]:
## Load cached data to mess around with plots
# df4plot = pd.read_csv("preplot_data/r2_plot_data.csv", index_col=0)
# df4plot['Date'] = pd.to_datetime(df4plot['Date'])

In [None]:
# Drop windows ending before PLOT_START so that the beginning isn't too noisy
_late_enough = df4plot['Date'] >= PLOT_START
sub = df4plot.loc[_late_enough]
# Build the figure; it is only saved for full runs on the real data
g = r2_ribbon_plot(sub)
if not (DEBUG or USE_PLACEHOLDER_DATA):
    g.save(f"{SAVE_DIR}/r2_plot_v3.png", dpi=500)
print(g)

## 3.2 Identified real estate stocks, main text <a class="anchor" id="real_estate"></a>

In [None]:
# Load the financial sector data
data = load_data(
    'FIN', 
    start_date=TRAIN_START, 
    use_placeholder=USE_PLACEHOLDER_DATA,
)
# position of the first observation after TRAIN_END
n0 = np.argmin(
    data['outcomes'].index <= TRAIN_END
)
np.random.seed(123)
#estimate new potential exposures
# NOTE: `exposures` is the same object as data['exposures'], so the two
# assignments below also zero out data['exposures'] in place.
exposures = data['exposures']
exposures[np.isnan(exposures)] = 0
exposures[data['outcomes'].isnull().values] = 0
# zero-filled copy; data['outcomes'] itself keeps its missing values
outcomes = data['outcomes'].fillna(0).values
hateps = mp.factor.ols_residuals(
    outcomes=outcomes, exposures=exposures,
)

### Read results
B = compute_active_subset(hateps, thresh=0.2)
# in-sample (first n0 rows) correlation matrix of the residuals in B
Sigma = np.corrcoef(hateps[0:n0][:, B].T)
# keep entries with a positive diagonal (NaN diagonals fail the comparison)
subset = np.where(np.diag(Sigma) > 1e-5)[0]
p = len(subset)
Sig = Sigma[np.ix_(subset, subset)]
scale = np.sqrt(np.diag(Sig))
# Correlation matrix
C = Sig / np.outer(scale, scale)
# maximum absolute off-diagonal correlation of each asset
absC = np.abs(C - np.eye(len(subset)))
macs = np.max(absC, axis=0)
# positions (within `subset`) sorted from largest to smallest
inds = np.argsort(-1*macs)

In [None]:
# compute the proportion of days on which each asset has exposure to real estate
re_exposures = data['exposures'][:, :, np.where(data['factor_cols'] == 'FINREAL')[0][0]]
# Count only days with an observed, nonzero outcome. A missing value
# compares as "!= 0", so missing days must be excluded explicitly.
_observed = data['outcomes'].notnull() & (data['outcomes'] != 0)
counts = np.maximum(np.sum(_observed, axis=0), 1)
re_props = np.sum(re_exposures, axis=0) / counts
# assets exposed to real estate on more than 90% of their observed days
re_assets = re_props > 0.9

In [None]:
k = 20
topk = inds[0:k]
# `inds` (hence `topk`) are positions within `subset`, i.e. rows of C.
# Map them back through `subset` before indexing the columns in B.
topk_assets = data['outcomes'].columns[B][subset][topk]
topk_names = asset2names[topk_assets]
industries = data['industries']
topk_inds = industries[industries.index.isin(topk_assets)]
# share of the top-k assets in each of the two real-estate industries
np.mean(topk_inds == 'FINREAL'), np.mean(topk_inds == 'FINMRGREIT')

In [None]:
assets = data['outcomes'].columns
# Sort by evec (leading eigenvector of the in-sample correlations)
topk_re = topk[0:10]
evec = np.linalg.eigh(C[topk_re][:, topk_re])[1][:, -1]
topk_re = topk_re[np.argsort(evec)]
# `topk_re` holds positions within `subset` (rows of C); convert them to
# positions within B before indexing anything that spans all of B.
_topk_re_in_B = subset[topk_re]
# Out of sample correlation plot
fig, ax = plt.subplots(figsize=(16, 6))
Sig2 = np.corrcoef(hateps[n0:][:, B].T)
Sig2pd = pd.DataFrame(
    Sig2[_topk_re_in_B][:, _topk_re_in_B],
    columns=asset2names[assets[B][_topk_re_in_B]]
)
Sig2pd.columns.name = 'Asset'

# make heatmap
sns.heatmap(Sig2pd.T, cmap='RdBu', center=0, annot=True,  fmt=".2f", ax=ax)
ax.set(xlabel='Asset', title='Out-of-sample correlation matrix for residuals of selected financial assets')
# separate assets with negative and positive eigenvector entries
if np.any(evec < 0) and np.any(evec > 0):
    last_neg = np.sum(evec < 0)
    ax.axvline(last_neg, color='black', linestyle='dotted')
    ax.axhline(last_neg, color='black', linestyle='dotted')

# do not overwrite the real figure when running on placeholder data
if not USE_PLACEHOLDER_DATA:
    plt.savefig(f"{SAVE_DIR}/finreal_oos_corr.png", dpi=500, bbox_inches='tight')
plt.show()

My understanding:
- MGIC Investment, Radian Group, and Essent Group are all mortgage insurers [see here](https://www.fool.com/investing/2018/04/09/why-mgic-investment-corp-radian-group-essent-group.aspx)
- In fact, MGIC + Radian announced plans to merge in 2007, although they terminated this [decision later](https://radian.com/-/media/Files/Enterprise/Investor-Relations/Toolkit/Radian-Overview.pdf)
- Welltower and Ventas invest in healthcare infrastructure
- Life storage is a storage company
- Regency centers seems to operate retail centers

# Appendix A.1 Additional sectors <a class="anchor" id="othersectors"></a>

## A.1.1: MMC statistic <a class="anchor" id="mmc_othersectors"></a>

In [None]:
# MMC analysis for the additional sectors, 'Full BFRE' model only, no shift
moresectors = section_1_data(
    names=['ITC', 'IND', 'CDI', 'CST', 'UTL', 'MAT', 'ITCSOFT', 'FINREAL'], 
    test_stat=mmc_stat,
    reps=20 if DEBUG else 500, 
    shift2match=False,
    full_only=True,
)
# only cache full runs on the real data
if not (DEBUG or USE_PLACEHOLDER_DATA):
    moresectors.to_csv("preplot_data/moresectors_mmc.csv")

In [None]:
## Plot metadata: panel titles per sector code and the "Covid" text labels
ncode2name = moresectors[['ncode', 'name']].drop_duplicates().set_index("ncode")['name']
covid = datetime.datetime(year=2020, month=2, day=20)
# One label per sector, placed at the given height on the y-axis
annots = pd.DataFrame(
    [
        [covid, height, ncode] for ncode, height in [
            ('ITC', 0.30),
            ('IND', 0.30),
            ('CDI', 0.30),
            ('CST', 0.30),
            ('MAT', 0.30),
            ('FINREAL', 0.36),
            ('ITCSOFT', 0.3),
            ('UTL', 0.32),
        ]
    ],
    columns=['Date', 'value', 'ncode']
)
annots['label'] = 'Covid'
annots['name'] = annots['ncode'].map(ncode2name)
# Date to star: the observed date closest to 21 May 2020
_gap_to_target = np.abs(
    moresectors['Date'] - datetime.datetime(year=2020, month=5, day=21)
)
stardate = moresectors.iloc[np.argmin(_gap_to_target)]['Date']

In [None]:
# Names
sub0 = moresectors.copy()
sub0['variable'] = sub0['variable'].map({
    "T":r"Mosaic stat. $(S_t)$",
    "Tq":THRESHOLD_NAME
})
sub0 = sub0.loc[sub0['variable'].notnull()] # don't plot p-vals
# Choose where the stars go
# (on the mosaic statistic curve, at the date stored in `stardate`)
sdata = sub0.loc[sub0['Date'] == stardate].copy()
sdata = sdata.loc[sdata['variable'].str.contains("Mosaic")]
g = (
    ggplot(
        sub0, 
        aes(x='Date', y='value', linetype='variable', color='variable')
    ) 
    + geom_line(size=1) 
    + facet_wrap("~name", ncol=4)
    + theme_bw()
    + theme(figure_size=(10, 6))
    #+ theme(axis_text_x = element_text(angle = 90))
    # dotted vertical line and rotated text label at the Covid date
    + geom_vline(data=annots, mapping=aes(xintercept='Date'), color='black', linetype='dotted')
    + geom_text(
        data=annots.loc[annots['ncode'].isin(sub0['ncode'].unique())], 
        mapping=aes(x='Date', y='value', label='label'), 
        inherit_aes=False, size=9, angle=90, nudge_x=-60
    )
    # star marker; the date string becomes the legend entry
    + geom_point(
        mapping=aes(shape='Date.astype(str)'),
        data=sdata, size=4.5, fill='white', color='black',
    )
    # only label ticks that fall on January 1st, using the year
    + scale_x_datetime(
        labels=lambda lst: [x.year if x.month==1 and x.day==1 else "" for x in lst]
    )
    + labs(
        y=r'Mean Max. Corr. ($S_t$)', 
        x='',
        title='',
    )
)
g += scale_color_manual(['orangered', 'cornflowerblue', 'indigo'])
g += scale_linetype_manual(['dotted', 'solid', 'solid'])
g += scale_shape_manual(['*'])
g += labs(color='', linetype='', shape='')
g += theme(legend_position="bottom", legend_box_spacing=-0.01)
# only save figures from full runs on the real data
# NOTE(review): this writes into {SAVE_DIR}/appendix/, which must exist
if not DEBUG and not USE_PLACEHOLDER_DATA:
    g.save(f"{SAVE_DIR}/appendix/mmc_othersectors.png",  dpi=500, bbox_inches='tight')
print(g)

## A.1.2: R2 plot <a class="anchor" id="r2_othersectors"></a>

In [None]:
# This takes forever; cache each industry separately.
# Debug runs are cached under a separate "_debug" file name.
more_industries = ['ITC', 'IND', 'CDI', 'CST', 'UTL', 'MAT']
for industry in more_industries:
    moresectors_r2 = r2_plot_data(
        reps=10 if DEBUG else 10000,
        names=[industry],
    )
    suffix = '_debug' if DEBUG else ''
    moresectors_r2.to_csv(f"preplot_data/moresectors_r2_{industry}{suffix}.csv")

In [None]:
# Read the per-industry caches back, stack them, and trim the noisy start
more_industries = ['ITC', 'IND', 'CDI', 'CST', 'UTL', 'MAT']
moresectors_r2 = []
for industry in more_industries:
    suffix = '_debug' if DEBUG else ''
    moresectors_r2.append(
        pd.read_csv(
            f"preplot_data/moresectors_r2_{industry}{suffix}.csv",
            index_col=0,
        )
    )
moresectors_r2 = pd.concat(moresectors_r2)
moresectors_r2['Date'] = pd.to_datetime(moresectors_r2['Date'])
_late_enough = moresectors_r2['Date'] >= PLOT_START
moresectors_r2 = moresectors_r2.loc[_late_enough]

In [None]:
# Plot every 5th row (after sorting) to thin out the data
_thinned = moresectors_r2.sort_values(['name', 'slabel', 'Date']).iloc[0::5]
g = r2_ribbon_plot(_thinned, figure_size=(8, 6))
# only save figures from full runs on the real data
if not (DEBUG or USE_PLACEHOLDER_DATA):
    g.save(f"{SAVE_DIR}/appendix/r2_othersectors_v2.png", dpi=500, bbox_inches='tight')
print(g)

# Appendix A.2: Sensitivity to window <a class="anchor" id="window"></a>

In [None]:
# Repeat the MMC analysis ('Full BFRE' model only) for several window sizes
wdfs = []
for window in [200, 300, 400]:
    wdf = section_1_data(
        window=window,
        test_stat=mmc_stat,
        reps=20 if DEBUG else 1000, 
        full_only=True,
    )
    wdf['window'] = window
    wdfs.append(wdf)

wdfs = pd.concat(wdfs, axis='index')
# only cache full runs on the real data
if not (DEBUG or USE_PLACEHOLDER_DATA): 
    wdfs.to_csv("preplot_data/window_sensitivity.csv")

In [None]:
# wdfs = pd.read_csv("preplot_data/window_sensitivity.csv", index_col=0)
# wdfs['Date'] = pd.to_datetime(wdfs['Date'])

In [None]:
# Annotations: one "Covid" label per sector, repeated for every window facet
ncode2name = wdfs[['ncode', 'name']].drop_duplicates().set_index("ncode")['name']
covid = datetime.datetime(year=2020, month=2, day=20)
annots_base = pd.DataFrame(
    [[covid, 0.45, ncode] for ncode in ['EGY', 'FIN', 'HLC']],
    columns=['Date', 'value', 'ncode']
)
annots_base['label'] = 'Covid'
annots_base['name'] = annots_base['ncode'].map(ncode2name)
annots = []
for window in wdfs['window'].unique():
    adf = annots_base.copy()
    adf['window'] = f'Window={window}'
    annots.append(adf)
annots = pd.concat(annots)

In [None]:
# Names
sub0 = wdfs.copy()
sub0['variable'] = sub0['variable'].map({
    "T":r"Mosaic stat. $(S_t)$",
    "Tq":THRESHOLD_NAME,
})
sub0 = sub0.loc[sub0['variable'].notnull()] # get rid of p-vals
# facet label, in the same format as the 'window' column of `annots`
sub0['window'] = "Window=" + sub0['window'].astype(str)
# Choose where the stars go
# NOTE(review): `stardate` comes from an earlier cell, and `sdata` is not
# used by any layer of the plot below.
sdata = sub0.loc[sub0['Date'] == stardate].copy()
sdata = sdata.loc[sdata['variable'].str.contains("Mosaic")]
g = (
    ggplot(
        sub0, 
        aes(x='Date', y='value', linetype='variable', color='variable')
    ) 
    + geom_line(size=1) 
    # one row per window size, one column per sector
    + facet_grid("window~name", scales='free')
    + theme_bw()
    + theme(figure_size=(8, 6))
    + geom_vline(xintercept=covid, linetype='dotted')
    + geom_text(
        data=annots, 
        mapping=aes(x='Date', y='value', label='label'), 
        inherit_aes=False, size=9, angle=90, nudge_x=-60
    )
    # only label ticks that fall on January 1st, using the year
    + scale_x_datetime(
        labels=lambda lst: [x.year if x.month==1 and x.day==1 else "" for x in lst]
    )
    + labs(
        y=r'Mean Max. Corr. ($S_t$)', 
        x='',
        title='',
    )
)
g += scale_color_manual(['orangered', 'cornflowerblue', 'indigo'])
g += scale_linetype_manual(['dotted', 'solid', 'solid'])
g += scale_shape_manual(['*'])
g += labs(color='', linetype='', shape='')
g += theme(legend_position="bottom", legend_box_spacing=-0.01)
# only save figures from full runs on the real data
# NOTE(review): this writes into {SAVE_DIR}/appendix/, which must exist
if not DEBUG and not USE_PLACEHOLDER_DATA:
    g.save(f"{SAVE_DIR}/appendix/mmc_window_sensitivity.png",  dpi=500, bbox_inches='tight')
g

# Appendix A.3: Residual variance over time  <a class="anchor" id="residvar"></a>

In [None]:
# Rolling average of the mean squared OLS residual, per sector
t0 = time.time()
sigma2_df = []
window = 350
for industry in ['EGY', 'FIN', 'HLC']:
    # load data
    data = load_data(industry, to_exclude=['EGYOGINT'], use_placeholder=USE_PLACEHOLDER_DATA)
    # Compute OLS residuals.
    # zero imputation = deterministically equivalent to 
    # ignoring residuals which are missing.
    # NOTE(review): the NaN replacement below is done in place on the
    # arrays taken from `data` -- presumably it also changes `data`; confirm.
    outcomes = data['outcomes'].values
    outcomes[np.isnan(outcomes)] = 0
    exposures = data['exposures']
    exposures[np.isnan(exposures)] = 0
    residuals = mp.factor.ols_residuals(outcomes=outcomes, exposures=exposures)
    # Estimate sum of squares of residuals, ignoring zeros (which are missing)
    # NOTE(review): a day on which every residual is zero divides by zero here
    avg_squared_residual = np.sum(residuals**2, axis=1) / np.sum(residuals != 0, axis=1)
    # moving average over `window` days; 'valid' keeps only full windows
    sigma2s = np.convolve(avg_squared_residual, np.ones(window) / window, mode='valid')
    # index each window by its last date
    sigma2s = pd.DataFrame(pd.Series(sigma2s, data['outcomes'].index[(window-1):]))
    sigma2s.columns = ['sigma2']
    sigma2s['sector'] = sector_names[industry] + f" ({data['outcomes'].shape[1]} assets)"
    sigma2_df.append(sigma2s)
    print(f"Finished {industry} at {elapsed(t0)}.")

# the date index becomes a regular column for plotting
sigma2_df = pd.concat(sigma2_df).reset_index()

In [None]:
# Residual variance over time, one panel per sector
g = (
    ggplot(
        sigma2_df,
        aes(x='Date', y='sigma2')
    ) 
    + facet_wrap("~sector", scales='free')
    + geom_line(color='blue')
    + theme_bw()
    + theme(figure_size=(8, 2.5))
    # dotted vertical line at 1 Feb 2020
    + geom_vline(xintercept=datetime.datetime(2020, 2, 1), linetype='dotted')
    # only label ticks that fall on January 1st, using the year
    + scale_x_datetime(
        labels=lambda lst: [x.year if x.month==1 and x.day==1 else "" for x in lst]
    )
    + theme(axis_text_x = element_text(angle = 90))
    + labs(y=f'Residual variance\n(window={window})')
)
# unlike most other figures, this one is also saved when DEBUG is True
# NOTE(review): this writes into {SAVE_DIR}/appendix/, which must exist
if not USE_PLACEHOLDER_DATA:
    g.save(f"{SAVE_DIR}/appendix/sigma2s.png",  dpi=500, bbox_inches='tight')
print(g)