# 0. Outline 
1. [Globals and generic functions](#generic)
2. [MMC statistic plots, main text](#mmc)
   - [Intro plot](#intro_plot)
   - [MMC approximate Z-values](#mmc_apprxZ)
   - [MMC exact Z-values/p-values](#mmc_pvals)
   - [MMC ablation](#mmc_ablation)
3. [R2 plots, main text](#r2)
    - [Main R2 plot](#r2_main)
    - [Real estate](#real_estate)
4. [Appendix A.1, more sectors](#othersectors)
   - [MMC for other sectors](#mmc_othersectors)
   - [R2 for other sectors](#r2_othersectors)
5. [Appendix A.2, window sensitivity](#window)

# 1. Globals and generic functions <a class="anchor" id="generic"></a>

In [None]:
import os
import sys
# Import custom code---you can also just install mosaicperm via pip
sys.path.insert(0, "../../mosaicperm/")
import mosaicperm as mp
from mosaicperm.utilities import elapsed, vrange
from bfre_preprocessing import load_data, CACHE_DIR, DATA_DIR

# Typical imports
import time
import numpy as np
import pandas as pd
from scipy import stats
import scipy.sparse as sp
import datetime 

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from plotnine import *

# Save directory
SAVE_DIR = "../data/bfre_results/"
# exist_ok=True makes this a no-op when the folder already exists and avoids the
# check-then-create race of a separate os.path.exists() test
os.makedirs(SAVE_DIR, exist_ok=True)

In [None]:
# Sector codes analysed in this notebook
names = ['EGY', 'FIN', 'HLC', 'ITC', 'IND', 'CDI']
# Sector code -> human-readable label (used to build facet titles)
sector_names = { 
    "FIN":"Financials",
    "EGY":"Energy",
    "CDI":"Consumer Disc.",
    "HLC":"Healthcare",
    "IND":"Industrials",
    "ITC":"Tech",
    # very small sectors
    "UTL":"Utilities",
    "MAT":"Materials",
    "CST":"Consumer Staples",
    # all stocks
    "assets_na50":"US stocks",
}
# Any code without a label falls back to the code itself
for name in names:
    sector_names.setdefault(name, name)

def names_to_ncode(series):
    """
    Maps display names back to sector codes.

    Every entry of ``series`` that contains one of the labels in
    ``sector_names`` is replaced by the corresponding code. Entries that
    contain no label (including missing values) are returned unchanged.

    Parameters
    ----------
    series : pd.Series
        String-valued series, e.g. "Financials (123 assets)".

    Returns
    -------
    pd.Series
        A modified copy; the input is not changed.
    """
    series = series.copy()
    for key, label in sector_names.items():
        # regex=False: labels such as "Consumer Disc." contain regex metacharacters
        # and must be matched literally.
        # na=False: missing names give False instead of NaN, since a mask
        # containing NaN cannot be used for boolean indexing.
        series[series.str.contains(label, regex=False, na=False)] = key
    return series

# Events to label
events = [
    ('Covid', datetime.date(year=2020, month=2, day=20)),
    #('Vaccine', datetime.date(year=2021, month=6, day=21)),
    #('Ukraine', datetime.date(year=2022, month=2, day=24)),
]

In [None]:
# Asset id <-> security name lookup tables, read from the id-to-name CSV in DATA_DIR
asset_names = pd.read_csv(f"{DATA_DIR}/assets_id_to_name.csv")
asset_names = asset_names.rename(columns={"invariant_id":"ASSET", "name_sec":"name"})
# asset id -> name
asset2names = asset_names.set_index("ASSET")['name']
# name -> asset id
names2asset = asset_names.set_index("name")['ASSET']

In [None]:
def compute_many_ols_stats(mpt):
    """
    Evaluates the test statistic of a MosaicFactorTest object on OLS residuals.

    OLS residuals are computed once from ``mpt.outcomes`` and ``mpt.exposures``.
    ``mpt.test_stat`` (called with ``mpt.tstat_kwargs``) is then applied to
    rows ``start:end`` of those residuals for each pair in
    ``zip(mpt.starts, mpt.ends)``.

    Returns
    -------
    np.ndarray
        One statistic per (start, end) pair, in the same order.
    """
    ols_resid = mp.factor.ols_residuals(mpt.outcomes, exposures=mpt.exposures)
    return np.array([
        mpt.test_stat(ols_resid[lo:hi], **mpt.tstat_kwargs)
        for lo, hi in zip(mpt.starts, mpt.ends)
    ])

def p_to_z(p):
    """
    Converts p-value(s) to one-sided standard-normal Z-score(s), floored at zero.

    Computes ``Phi^{-1}(1 - p)`` and replaces negative results (p > 0.5) by 0.
    """
    zscore = stats.norm.ppf(1 - p)
    return np.maximum(zscore, 0)

In [None]:
def section_1_data(
    names=('HLC', 'EGY', 'FIN'),
    window=350,
    reps=5000,
    n_timepoints=150,
    fpdate=datetime.datetime(2018, month=4, day=1),
    test_stat=mp.statistics.mean_maxcorr_stat,
    shift2match=True,
    full_only=False,
    **kwargs,
):
    """
    Runs the rolling mosaic factor test for several sectors and collects the
    results in a long-format dataframe for plotting.

    Parameters
    ----------
    names : sequence of str
        Sector codes (keys of ``sector_names``). Its length is also the
        Bonferroni factor used for the null quantile ``Tq``.
    window : int
        Passed to ``MosaicFactorTest.fit_tseries``.
    reps : int
        Number of randomizations (``nrand``) passed to ``fit_tseries``.
    n_timepoints : int
        Passed to ``fit_tseries``.
    fpdate : datetime.datetime
        Only dates on or after ``fpdate`` are used to compute the shift
        applied when ``shift2match`` is True.
    test_stat : callable
        Test statistic handed to ``MosaicFactorTest``.
    shift2match : bool
        If True, the mosaic statistic and its null quantile of the
        'Full BFRE' model are shifted by a constant so that the mean mosaic
        statistic equals the mean OLS statistic on dates >= ``fpdate``.
    full_only : bool
        If True, the 'Missing style factors' model is skipped.
    **kwargs
        Forwarded to ``load_data``.

    Returns
    -------
    pd.DataFrame
        Columns 'Date', 'name', 'ncode', 'slabel', 'variable', 'value', where
        'variable' is one of 'Tq', 'T', 'Tols', 'Zapprx', 'pval'.
    """
    t0 = time.time()
    dfs4plot = []
    ### Loop through analyses
    for name in names:
        print(f"At name={name} at {elapsed(t0)}.")
        for which_factors, slabel in zip(
            ['all', 'industry'],
            ['Full BFRE', 'Missing\nStyle Factors']
        ):
            if 'Missing' in slabel and full_only:
                continue
            data = load_data(
                industry=name, which_factors=which_factors, **kwargs,
            )

            ### Run mosaic test ###
            np.random.seed(123)
            mpt = mp.factor.MosaicFactorTest(
                outcomes=data['outcomes'],
                exposures=data['exposures'],
                test_stat=test_stat,
                test_stat_kwargs=dict(subset=data['active_subset']),
                tiles=data['tiles'],
            )
            mpt.fit_tseries(nrand=reps, n_timepoints=n_timepoints, window=window)

            ### Approximate Z-statistics ###
            # Same first-coordinate selection as Ts/Tq below, so that every row
            # handed to the DataFrame constructor is one-dimensional.
            Zapprx = (
                (mpt.stats_tseries - mpt.null_tseries.mean(axis=1))
                / mpt.null_tseries.std(axis=1)
            )[:, 0]

            ### Compute OLS statistics ###
            Ts_ols = compute_many_ols_stats(mpt)

            ### Extract and save---uses bonferroni correction across industries
            Ts = mpt.stats_tseries[:, 0]
            Tq = np.quantile(mpt.null_tseries[:, :, 0], 1-0.05/len(names), axis=1)
            pvals = mpt.pval_tseries
            dates = data['outcomes'].index[mpt.ends-1]
            dflags = dates >= fpdate
            if shift2match and slabel == 'Full BFRE':
                shift = Ts_ols[dflags].mean() - Ts[dflags].mean()
                print(f"At name={name}, shift={np.around(shift, 2)}")
            else:
                shift = 0
            # Shift out-of-place: Ts is a slice of mpt.stats_tseries, and an
            # in-place `+=` would also modify the fitted test object.
            Tq = Tq + shift
            Ts = Ts + shift

            # Append to output
            df = pd.DataFrame(
                [dates, Tq, Ts, Ts_ols, Zapprx, pvals], 
                index=['Date', 'Tq', 'T', 'Tols', 'Zapprx', 'pval']
            ).T
            nassets = data['outcomes'].shape[1]
            df['ncode'] = name
            df['name'] = sector_names[name] + f" ({nassets} assets)"
            df['slabel'] = slabel
            dfs4plot.append(df)

    df4plot = pd.concat(dfs4plot, axis='index')
    df4plot = pd.melt(df4plot, id_vars=['Date', 'name', 'ncode', 'slabel'])
    df4plot['value'] = df4plot['value'].astype(float)
    print(f"Finished at {elapsed(t0)}.")
    return df4plot



# 2. MMC statistic analysis, main text <a class="anchor" id="mmc"></a>

In [None]:
# Run the MMC analysis with the default arguments of section_1_data
df4plot = section_1_data()

# Optional: cache the result on disk...
####df4plot.to_csv("preplot_data/mmc_data.csv")

# ...and reload it later instead of recomputing:
# df4plot = pd.read_csv("preplot_data/mmc_data.csv", index_col=0)
# df4plot['Date'] = pd.to_datetime(df4plot['Date'])
# df4plot = df4plot.sort_values(["name", "ncode", "slabel", "variable", "Date" ])
# df4plot = df4plot.iloc[1::2]

In [None]:
# Date to star: the date in df4plot that is closest to May 1st, 2020
stardate = df4plot['Date'].iloc[
    np.argmin(np.abs(df4plot['Date'] - datetime.datetime(year=2020, month=5, day=1)))
]

## 2.1 Intro plot <a class="anchor" id="intro_plot"></a>

In [None]:
## Additional metadata for plots
# sector code -> facet title, as produced by section_1_data
ncode2name = df4plot[['ncode', 'name']].drop_duplicates().set_index("ncode")['name']
covid = datetime.datetime(year=2020, month=2, day=20)
# One 'Covid' text annotation per sector; 'value' is the y-position of the label
annots = pd.DataFrame(
    [[covid, 0.33, 'EGY'],
    [covid, 0.42, 'FIN'],
    [covid, 0.35, 'ITC'],
    [covid, 0.35, 'IND'],
    [covid, 0.32, 'HLC'],
    [covid, 0.35, 'CDI']],
    columns=['Date', 'value', 'ncode']
)
annots['label'] = 'Covid'
annots['name'] = annots['ncode'].map(ncode2name)

## Subset by time
# .copy() gives an independent frame, so the masked assignment below is an
# unambiguous write (no SettingWithCopyWarning) and df4plot stays untouched
sub = df4plot.loc[df4plot['Date'] >= datetime.datetime(year=2018, month=1, day=1)].copy()
# Blank out the values up to April 1st, 2018 (the rows stay, so the x-axis starts in January)
sub.loc[sub['Date'] <= datetime.datetime(year=2018, month=4, day=1), 'value'] = np.nan

In [None]:
## Figure 1 and 4
# Draws two variants of the MMC plot for the 'Full' model:
# 'full' keeps every mapped curve, 'no_quantile' keeps a subset (see below).
shift2match = True
subncodes = ['EGY', 'FIN', 'HLC']
for suffix in ['full', 'no_quantile']:
    if suffix == 'full':
        sub0 = sub.copy()
    else:
        # NOTE(review): at this point 'variable' still holds the raw codes written
        # by section_1_data ('Tq', 'T', 'Tols', 'Zapprx', 'pval'), so the "95"
        # filter presumably removes nothing; the "ols" filter on the next line is
        # what restricts this variant to the OLS statistic. Confirm intent.
        sub0 = sub.loc[~sub['variable'].str.contains("95")].copy()
        sub0 = sub0.loc[sub0['variable'].str.contains("ols")]
    sub0 = sub0.loc[sub0['ncode'].isin(subncodes)]
    sub0 = sub0.loc[sub0['slabel'].str.contains("Full")]
    subannots = annots.loc[annots['ncode'].isin(sub0['ncode'].unique())]
    # The suffix is only used in the (commented-out) output file name
    if shift2match:
        suffix = suffix + '_shift'
        
    # Names
    sub0['variable'] = sub0['variable'].map({
        "T":r"Mosaic stat. $(S_t)$",
        "Tols":r"OLS stat. $(S_t^{ols})$" ,
        "Tq":r'$Q_{0.95}(S_t)$ under $H_0$'
    })
    # Codes without a display name map to NaN and are dropped here
    sub0 = sub0.loc[sub0['variable'].notnull()] # get rid of p-vals
    # Choose where the stars go
    sdata = sub0.loc[sub0['Date'] == stardate].copy()
    sdata = sdata.loc[sdata['variable'].str.contains("OLS")]
    g = (
        ggplot(
            sub0, 
            aes(x='Date', y='value', linetype='variable', color='variable')
        ) 
        + geom_line(size=1) 
        + facet_wrap("~name")
        + theme_bw()
        + theme(figure_size=(8, 3))
        #+ theme(axis_text_x = element_text(angle = 90))
        + geom_vline(data=subannots, mapping=aes(xintercept='Date'), color='black', linetype='dotted')
        + geom_text(
            data=subannots, 
            mapping=aes(x='Date', y='value', label='label'), 
            inherit_aes=False, size=9, angle=90, nudge_x=-60
        )
        + geom_point(
            mapping=aes(shape='Date.astype(str)'),
            data=sdata, size=4.5, fill='white', color='black',
        )
        # Only label January 1st breaks, with the year
        + scale_x_datetime(
            labels=lambda lst: [x.year if x.month==1 and x.day==1 else "" for x in lst]
        )
        + labs(
            y=r'Mean Max. Corr. ($S_t$)', 
            x='',
            title='',
        )
        #+ ylim(None, 0.52)
    )
    # 'full' has three curves, the other variant a single one
    if 'full' in suffix:
        g += scale_color_manual(['orangered', 'cornflowerblue', 'indigo'])
        g += scale_linetype_manual(['dotted', 'solid', 'solid'])
    else:
        g += scale_color_manual(['indigo'])
        g += scale_linetype_manual(['solid'])
    g += scale_shape_manual(['*'])
    g += labs(color='', linetype='', shape='')
    g += theme(legend_position="bottom", legend_box_spacing=-0.01)
    #g.save(f"{SAVE_DIR}/mmc_{suffix}_sec1.png", dpi=500)
    print(g)

## 2.2 Intro plot approximate Z-statistics <a class="anchor" id="mmc_apprxZ"></a>

In [None]:
## Figure 6 extended (this figure is practically informative but not published)
# Approximate Z-statistics, truncated below at zero, for both models
subncodes = ['EGY', 'FIN', 'HLC']
sub0 = sub.copy()
sub0 = sub0.loc[sub0['ncode'].isin(subncodes)]
subannots = annots.loc[annots['ncode'].isin(subncodes)]
# Names
sub0 = sub0.loc[sub0['variable'] == 'Zapprx'].copy()
sub0['value'] = np.maximum(sub0['value'], 0)
# Correct typo...
# "\\Style" is a literal backslash followed by "Style"; the unescaped form
# "\Style" is an invalid escape sequence and triggers a warning.
sub0['slabel'] = sub0['slabel'].str.replace("\\Style", "\nStyle", regex=False)
# Choose where the stars go
sdata = sub0.loc[sub0['Date'] == stardate].copy()
# Make plot
g = (
    ggplot(
        sub0, 
        aes(x='Date', y='value', color='slabel', linetype='slabel')
    ) 
    + geom_line(size=1.25)
    + facet_wrap("~name")
    + theme_bw()
    + theme(figure_size=(8, 3))
    + theme(axis_text_x = element_text(angle = 90))
    + geom_point(
        mapping=aes(shape='Date.astype(str)'),
        data=sdata, size=4.5, fill='white', color='black',
    )
    # One-sided 5% threshold, Bonferroni-corrected for the three sectors
    + geom_hline(yintercept=stats.norm.ppf(1 - 0.05/3), linetype='dotted')
    + scale_x_datetime(
        labels=lambda lst: [x.year if x.month==1 and x.day==1 else "" for x in lst]
    )
    + labs(
        y=r'(S - E[S]$)_+$ / sd(S)',#y=r'Z = $\Phi^{-1}(1-p_{val})_+$',
        x='',
        title='Approximate Z-statistics',
    )
)
g += scale_color_manual(['cornflowerblue', 'red'])
g += scale_linetype_manual(['solid', 'solid'])
g += scale_shape_manual(['*'])
g += labs(color='', linetype='', shape='')
g += theme(legend_position="bottom", legend_box_spacing=-0.01)
#g.save(f"{SAVE_DIR}/mmc_Zapprx.png", dpi=500)
print(g)

## 2.3 Intro plot exact Z-statistics <a class="anchor" id="mmc_pvals"></a>

In [None]:
## Figure 6
# Z-values obtained from the p-values, for both models
subncodes = ['EGY', 'FIN', 'HLC']
sub0 = sub.copy()
sub0 = sub0.loc[sub0['ncode'].isin(subncodes)]
subannots = annots.loc[annots['ncode'].isin(subncodes)]
# Names
# .copy() so that the new 'zval' column is added to an independent frame
sub0 = sub0.loc[sub0['variable'] == 'pval'].copy()
# p-values are multiplied by 3 (Bonferroni over the three sectors) and capped at 1
sub0['zval'] = p_to_z(np.minimum(1, 3*sub0['value'].astype(float)))
# Correct typo...
# "\\Style" is a literal backslash followed by "Style"; the unescaped form
# "\Style" is an invalid escape sequence and triggers a warning.
sub0['slabel'] = sub0['slabel'].str.replace("\\Style", "\nStyle", regex=False)
# Choose where the stars go
sdata = sub0.loc[sub0['Date'] == stardate].copy()
# Make plot
g = (
    ggplot(
        sub0, 
        aes(x='Date', y='zval', color='slabel', linetype='slabel')
    ) 
    + geom_line(size=1.25)
    + facet_wrap("~name")
    + theme_bw()
    + theme(figure_size=(8, 3))
    + theme(axis_text_x = element_text(angle = 90))
#     + geom_vline(data=subannots, mapping=aes(xintercept='Date'), color='black', linetype='dotted')
#     + geom_text(
#         data=subannots, 
#         mapping=aes(x='Date', y='value', label='label'), 
#         inherit_aes=False, size=9, angle=90, nudge_x=-60
#     )
    + geom_point(
        mapping=aes(shape='Date.astype(str)'),
        data=sdata, size=4.5, fill='white', color='black',
    )
    # The correction is already in the p-values, hence the plain 5% threshold
    + geom_hline(yintercept=stats.norm.ppf(0.95), linetype='dotted')
    + scale_x_datetime(
        labels=lambda lst: [x.year if x.month==1 and x.day==1 else "" for x in lst]
    )
    + labs(
        y=r'Z = $\Phi^{-1}(1-p_{val})_+$',
        x='',
        title='',
    )
)
g += scale_color_manual(['cornflowerblue', 'red'])
g += scale_linetype_manual(['solid', 'dotted'])

g += scale_shape_manual(['*'])
g += labs(color='', linetype='', shape='')
g += theme(legend_position="bottom", legend_box_spacing=-0.01)
#g.save(f"{SAVE_DIR}/mmc_pval_.png", dpi=500)
print(g)

## 2.4 MMC ablation statistics <a class="anchor" id="mmc_ablation"></a>

In [None]:
# 'Covid' text annotations for the ablation plot; 'value' is the y-position of the label
abannots = pd.DataFrame(
    [[covid, 0.4, 'EGY'],
    [covid, 0.45, 'FIN'],
    [covid, 0.5, 'ITC'],
    [covid, 0.5, 'IND'],
    [covid, 0.35, 'HLC'],
    [covid, 0.5, 'CDI']],
    columns=['Date', 'value', 'ncode']
)
abannots['label'] = 'Covid'
# Map from this frame's own codes (previously read from `annots`, which only
# worked because both frames list the sectors in the same order)
abannots['name'] = abannots['ncode'].map(ncode2name)

In [None]:
## MMC ablation figure (model with missing style factors)
subncodes = ['EGY', 'FIN', 'HLC']
# .copy() so that the masked assignment below writes to an independent frame
sub = df4plot.loc[df4plot['Date'] >= datetime.datetime(year=2018, month=1, day=1)].copy()
sub.loc[sub['Date'] <= datetime.datetime(year=2018, month=4, day=1), 'value'] = np.nan

sub0 = sub.copy()
sub0 = sub0.loc[sub0['ncode'].isin(subncodes)]
sub0 = sub0.loc[sub0['slabel'].str.contains("Missing")]
subannots = abannots.loc[abannots['ncode'].isin(sub0['ncode'].unique())]

# Names
sub0['variable'] = sub0['variable'].map({
    "T":r"Mosaic stat. $(S_t)$",
    "Tq":r'$Q_{0.95}(S_t)$ under $H_0$'
})
# Codes without a display name map to NaN and are dropped here
sub0 = sub0.loc[sub0['variable'].notnull()] # get rid of p-vals

# Choose where the stars go
sdata = sub0.loc[sub0['Date'] == stardate].copy()
# The mapping above has no OLS label, so filtering on "OLS" always produced an
# empty frame and the star layer was never drawn; the star goes on the mosaic curve.
sdata = sdata.loc[sdata['variable'].str.contains("Mosaic")]
g = (
    ggplot(
        sub0, 
        aes(x='Date', y='value', linetype='variable', color='variable')
    ) 
    + geom_line(size=1) 
    + facet_wrap("~name")
    + theme_bw()
    + theme(figure_size=(8, 3))
    #+ theme(axis_text_x = element_text(angle = 90))
    + geom_vline(data=subannots, mapping=aes(xintercept='Date'), color='black', linetype='dotted')
    + geom_text(
        data=subannots, 
        mapping=aes(x='Date', y='value', label='label'), 
        inherit_aes=False, size=9, angle=90, nudge_x=-80
    )
    + geom_point(
        mapping=aes(shape='Date.astype(str)'),
        data=sdata, size=4.5, fill='white', color='black',
    )
    + scale_x_datetime(
        labels=lambda lst: [x.year if x.month==1 and x.day==1 else "" for x in lst]
    )
    + labs(
        y=r'Mean Max. Corr. ($S_t$)', 
        x='',
        title='',
    )
    #+ ylim(None, 0.52)
)
g += scale_color_manual(['orangered', 'blue'])
g += scale_linetype_manual(['dotted', 'solid', 'solid'])
g += scale_shape_manual(['*'])
g += labs(color='', linetype='', shape='')
g += theme(legend_position="bottom", legend_box_spacing=-0.01)
#g.save(f"{SAVE_DIR}/mmc_ablation.png", dpi=500)
print(g)

# 3. R2 plots, main text <a class="anchor" id="r2"></a>

## 3.1 Main R2 plot <a class="anchor" id="r2_main"></a>

In [None]:
def r2_plot_data(
    window=350,
    reps=5000,
    names=('EGY', 'HLC', 'FIN'),
    full_only=False,
):
    """
    Computes rolling bi-cross-validation R2 statistics for several sectors.

    For each sector and model, the data are split in time. Mosaic residuals
    on the first split are used to estimate candidate new exposures
    (approximate sparse PCA of the residual covariance). These are then
    evaluated on the second split with ``MosaicBCV``.

    Parameters
    ----------
    window : int
        Passed to ``MosaicBCV.fit_tseries``.
    reps : int
        Number of randomizations (``nrand``).
    names : sequence of str
        Sector codes. Its length is the Bonferroni factor used for 'zval'.
    full_only : bool
        If True, models whose label contains 'Missing' are skipped.

    Returns
    -------
    pd.DataFrame
        One row per time point, sector and model, with columns
        'Date', 'pval', 'name', 'ncode', 'slabel', 'zval', 'maxr2'.
    """
    dfs = []
    t0 = time.time()
    for industry in names:
        print(f"At industry={industry} at {elapsed(t0)}.")

        # Types of models to loop through
        which_factors = ['all', 'industry']
        slabels = ['Full BFRE', 'Missing\nStyle Factors']
        # Named differently from the loop variable below so the list is not shadowed
        exclusions = [None, None]
        if industry == 'FIN':
            which_factors.append('all')
            slabels.append('Missing\nReal estate assets')
            exclusions.append("FINREAL")
        for which_fac, slabel, to_exclude in zip(
            which_factors, slabels, exclusions,
        ):
            if full_only and 'Missing' in slabel:
                continue
            np.random.seed(123)
            data = load_data(
                industry=industry,
                which_factors=which_fac, 
                to_exclude=to_exclude,
            )
            
            ### Data
            outcomes = data['outcomes'] #* weights
            exposures = data['exposures'] #* weights[..., np.newaxis]
            
            ### First split of data
            n0 = int(2.9 * outcomes.shape[0] / 4)
            # exposures on first split; copied so that zero-filling the NaNs
            # does not write into data['exposures']
            exp_train = exposures[0:n0].copy()
            exp_train[np.isnan(exp_train)] = 0
            # outcomes on first split
            out_train = outcomes.fillna(0).values[0:n0]
            # estimate new potential exposures
            mpt_train = mp.factor.MosaicFactorTest(
                outcomes=out_train, 
                exposures=exp_train,
                test_stat=None
            )
            mpt_train.compute_mosaic_residuals()
            resid0 = mpt_train.residuals
            asub = data['active_subset']
            qs = np.around(np.linspace(20, len(asub), 10)).astype(int) / len(asub)
            new_exp = mp.statistics.approximate_sparse_pcas(
                np.cov(resid0[:, asub].T), quantiles=qs
            )
            # add zero padding
            new_exposures = np.zeros((new_exp.shape[0], resid0.shape[1]))
            new_exposures[:, asub] = new_exp
    
            ### Second split of data
            mpt = mp.factor.MosaicBCV(
                outcomes=outcomes.iloc[n0:],
                exposures=exposures[n0:],
                new_exposures=new_exposures,
            )
            mpt.fit_tseries(nrand=reps, window=window)
    
            ## Use maximum of r2s as test statistic
            maxr2s = np.max(mpt.stats_tseries, axis=-1)
            nullr2s = np.max(mpt.null_tseries, axis=-1)
            pval_tseries = np.sum(maxr2s.reshape(-1, 1) <= nullr2s, axis=1) + 1
            pval_tseries = pval_tseries / (1 + reps)
    
            dx = outcomes.iloc[n0:].index[mpt.ends-1]
            # NOTE(review): 'pval' stores mpt.pval_tseries, whereas 'zval' below is
            # derived from the max-R2 p-values computed above -- confirm that the
            # two are meant to differ.
            df = pd.DataFrame(
                [dx, mpt.pval_tseries],
                index=['Date', 'pval']
            ).T
            
            # This if statement is a hack which doesn't change labelled # assets for finreal
            if 'Real estate' not in slabel: 
                n_assets = outcomes.shape[1]
            
            # Save metadata
            df['name'] = sector_names[industry] + f" ({n_assets} assets)"
            df['ncode'] = industry
            df['slabel'] = slabel
            # Bonferroni correction across the analysed sectors
            # (equal to the former hard-coded 3 for the default names)
            df['zval'] = p_to_z(np.minimum(len(names)*pval_tseries, 1))
            df['maxr2'] = maxr2s
            dfs.append(df)
            
        
    df4plot = pd.concat(dfs, axis='index')
    return df4plot

In [None]:
# Half-width of the significance ribbon around the R2 curve, per sector code
RIBBON_DELTAS = {
    "EGY":0.002,
    "FIN":0.004,
    "HLC":0.004,
    "ITC":0.003,
    "CDI":0.003,
    "IND":0.003,
    "CST":0.003,
    "MAT":0.003,
    "UTL":0.003,
}

def r2_ribbon_plot(r2data, figure_size=(8, 3.25)):
    """
    Plots the maximum bi-cross R2 over time with significance ribbons.

    Parameters
    ----------
    r2data : pd.DataFrame
        Output of ``r2_plot_data``; the columns 'Date', 'pval', 'name',
        'ncode', 'slabel' and 'maxr2' are used.
    figure_size : tuple
        Figure size passed to the plotnine theme.

    Returns
    -------
    plotnine ggplot object with one facet per 'name', one line per 'slabel'
    and a ribbon whose fill encodes the Bonferroni-adjusted p-value bucket.
    """
    comb = r2data.copy()
    ## Hack to make legend spacing nicer
    comb['slabel'] = comb['slabel'].apply(
        lambda x: x if "Style" not in x else x + "       "
    )
    # multiplicity correction
    ncodes = len(r2data['ncode'].unique())
    
    ## P-value buckets
    comb['pval_bucket'] = pd.cut(
        np.minimum(ncodes*comb['pval'], 1), bins=[0, 1e-3, 5e-2, 1.01]
    ).astype(str)
    def pval_bucket_fmt(x):
        if '(0.05, 1' in x:
            return r"$p_{val} \geq 0.05$"
        elif '(0.0, 0.001]' in x:
            return r"$p_{val} \leq 0.001$"
        else:
            return r"$p_{val} \leq 0.05$"
        
    comb['pval_bucket'] = comb['pval_bucket'].apply(pval_bucket_fmt)
    comb['pval_bucket'] = pd.Categorical(
        comb['pval_bucket'], ordered=True, categories=[
            pval_bucket_fmt("(0.05, 1"),
            pval_bucket_fmt("(0.001, 0.05)"),
            pval_bucket_fmt("(0.0, 0.001]"),
        ]
    )
    
    # Delta for ribbon; sector codes without an entry fall back to 0.003
    # instead of NaN (a NaN width would silently drop the ribbon)
    comb['ribbon_delta'] = comb['ncode'].map(RIBBON_DELTAS).fillna(0.003)

    ## Plot
    g = (
        ggplot(
            comb,
            aes(x='Date', y='maxr2', color='slabel', group='slabel')
        ) 
        + facet_wrap("~name", scales='free')
        # NOTE(review): theme_bw() is added after this rotation and, being a
        # complete theme, presumably overrides it -- order kept as is, confirm intent.
        + theme(axis_text_x = element_text(angle = 90))
        + scale_x_datetime(
            labels=lambda lst: [x.year if x.month==1 and x.day==1 else "" for x in lst]
        )
        + theme_bw()
        + theme(figure_size=figure_size)
        + scale_color_manual(['cornflowerblue', 'green', 'red', 'black'])
    #     + geom_hline(yintercept=0, linetype='dotted')
        + labs(
            x='',
            y=r'Max. Bi-cross $R^2$',
            title='',
            color='Model',
            fill='Significance',
        )
        + theme(legend_position="bottom", legend_box_spacing=-0.01)
    )
    # One ribbon per run of consecutive time points in the same p-value bucket
    for slabel in comb['slabel'].unique():
        for name in comb['name'].unique():
            sub = comb.loc[
                (comb['slabel'] == slabel) &
                (comb['name'] == name)
            ].sort_values("Date")
            # First and last (inclusive) position of each run
            starts = np.where(sub['pval_bucket'] != sub['pval_bucket'].shift(1))[0]
            ends = np.where(sub['pval_bucket'] != sub['pval_bucket'].shift(-1))[0]
            for start, end in zip(starts,ends):
                # The non-significant bucket is drawn fully transparent
                alpha = (1 - float(r"\geq" in sub.iloc[start]['pval_bucket'])) * 0.9
                g += geom_ribbon(
                    # `end` is inclusive, hence end + 1 (the former start:end slice
                    # dropped the last point of every run and was empty for runs
                    # of length one)
                    data=sub.iloc[start:end + 1],
                    mapping=aes(
                        x='Date', 
                        ymin='maxr2-ribbon_delta', 
                        ymax='maxr2+ribbon_delta',
                        fill='pval_bucket',
                    ), 
                    inherit_aes=False,
                    color='none',
                    alpha=alpha,
                )
    g += scale_fill_manual(['white', 'gray', 'black'])
    # Lines are added last so that they are drawn on top of the ribbons
    g += geom_line(mapping=aes(x='Date', y='maxr2', color='slabel', group='slabel'), size=1)
    return g

In [None]:
# Run the R2 analysis with the default arguments (this overwrites the MMC df4plot)
df4plot = r2_plot_data()
# Optional: cache the result on disk
#df4plot.to_csv("preplot_data/r2_plot_data.csv")

In [None]:
# # read + process
# df4plot = pd.read_csv("preplot_data/r2_plot_data.csv")
# df4plot['Date'] = pd.to_datetime(df4plot['Date'])
# df4plot['ncode'] = names_to_ncode(df4plot['name'])

In [None]:
# Main-text R2 figure, built from the r2_plot_data() output held in df4plot
g = r2_ribbon_plot(df4plot)
#g.save(f"{SAVE_DIR}/r2_plot_v3.png", dpi=500)
print(g)

## 3.2 Identified real estate stocks, main text <a class="anchor" id="real_estate"></a>

In [None]:
# Asset id <-> security name lookup tables.
# Read from DATA_DIR, like the identical load near the top of the notebook,
# instead of a second hard-coded relative path.
asset_names = pd.read_csv(f"{DATA_DIR}/assets_id_to_name.csv").rename(
    columns={"invariant_id":"ASSET", "name_sec":"name"}
)
asset2names = asset_names.set_index("ASSET")['name']
names2asset = asset_names.set_index("name")['ASSET']

In [None]:
data = load_data('FIN', start_date=datetime.datetime(2013, 1, 1))
# Time index separating the in-sample (first 3/4) and out-of-sample rows
n0 = int(3 * data['outcomes'].shape[0] / 4)
np.random.seed(1234)
# estimate new potential exposures
mpt_train = mp.factor.MosaicFactorTest(
    outcomes=data['outcomes'], 
    exposures=data['exposures'],
    test_stat=None,
    ngroups=1,
)
mpt_train.compute_mosaic_residuals()
hateps = mpt_train.residuals

### Read results
B = data['active_subset']
# In-sample correlation matrix of the residuals of the active assets
Sigma = np.corrcoef(hateps[0:n0][:, B].T)
# Positions (within B) whose diagonal entry is usable; NaN entries fail the test
subset = np.where(np.diag(Sigma) > 1e-5)[0]
p = len(subset)
Sig = Sigma[np.ix_(subset, subset)]
scale = np.sqrt(np.diag(Sig))
# Correlation matrix, written back into an array of the same size as Sigma so
# that positions in C / macs / inds refer to positions within B. Later cells
# index B-sized objects (asset columns, out-of-sample correlations) with `inds`;
# indexing the filtered matrix would be misaligned whenever an asset is dropped.
C = np.zeros_like(Sigma)
C[np.ix_(subset, subset)] = Sig / np.outer(scale, scale)
# Absolute off-diagonal correlations (dropped assets have all-zero rows/columns)
absC = np.abs(C)
np.fill_diagonal(absC, 0)
# Maximum absolute correlation of each asset with any other asset
macs = np.max(absC, axis=0)
# Positions within B, sorted by decreasing maximum absolute correlation
inds = np.argsort(-1*macs)

In [None]:
# Proportion of days on which each asset has exposure to real estate:
# summed FINREAL exposure over time, divided by the number of days with a
# nonzero outcome (at least 1, to avoid division by zero)
re_exposures = data['exposures'][:, :, np.flatnonzero(data['factor_cols'] == 'FINREAL')[0]]
counts = (data['outcomes'] != 0).sum(axis=0).clip(lower=1)
re_props = re_exposures.sum(axis=0) / counts
# Assets exposed to real estate on more than half of those days
re_assets = re_props > 0.5

In [None]:
# The k assets with the largest maximum absolute residual correlation
k = 20
topk = inds[0:k]
topk_assets = data['outcomes'].columns[B][topk]
topk_names = asset2names[topk_assets]
industries = data['industries']
# reindex returns the industries in the order of topk_assets, so boolean masks
# built from topk_inds line up with topk_assets. Filtering with .isin() kept the
# row order of `industries` instead.
# (assumes `industries` is indexed by unique asset ids -- TODO confirm)
topk_inds = industries.reindex(topk_assets)
# Share of the top-k assets classified as FINREAL
np.mean(topk_inds == 'FINREAL')

In [None]:
# These assets clearly have exposure to real estate despite not being classified as finreal
# NOTE(review): the boolean mask is built from topk_inds and applied to topk_assets,
# so it is only correct if both are in the same order -- confirm.
asset2names[topk_assets[topk_inds != 'FINREAL']]

In [None]:
assets = data['outcomes'].columns
# Sort by evec: order the 8 top assets by their loading on the leading
# eigenvector of their in-sample correlation matrix
topk_re = topk[0:8] #topk[topk_inds 'FINREAL'][0:10]
evec = np.linalg.eigh(C[topk_re][:, topk_re])[1][:, -1]
topk_re = topk_re[np.argsort(evec)]
# Out of sample correlation plot
fig, ax = plt.subplots(figsize=(16, 6))
Sig2 = np.corrcoef(hateps[n0:][:, B].T)
Sig2pd = pd.DataFrame(
    Sig2[topk_re][:, topk_re], columns=asset2names[assets[B][topk_re]]
)
Sig2pd.columns.name = 'Asset'


# make heatmap
sns.heatmap(Sig2pd.T, cmap='RdBu', center=0, annot=True,  fmt=".2f", ax=ax)
ax.set(xlabel='Asset', title='Out-of-sample correlation matrix for residuals of selected financial assets')
# Divider between negative and positive loadings. The assets were sorted by
# ascending loading, so the negative ones occupy the first n_neg positions;
# `evec` itself is unsorted, so its last negative index is not the boundary.
if np.any(evec < 0) and np.any(evec > 0):
    n_neg = int(np.sum(evec < 0))
    ax.axvline(n_neg, color='black', linestyle='dotted')
    ax.axhline(n_neg, color='black', linestyle='dotted')
    
plt.savefig(f"{SAVE_DIR}/finreal_oos_corr.png", dpi=500, bbox_inches='tight')
plt.show()

My understanding:
- MGIC Investment, Radian Group and Essent Group are all mortgage insurers [see here](https://www.fool.com/investing/2018/04/09/why-mgic-investment-corp-radian-group-essent-group.aspx)
- In fact, MGIC and Radian announced plans to merge in 2007, although they [later terminated this decision](https://radian.com/-/media/Files/Enterprise/Investor-Relations/Toolkit/Radian-Overview.pdf)
- Welltower and Ventas invest in healthcare infrastructure
- Life Storage is a storage company
- Regency Centers seems to operate retail centers

# Appendix A.1 Additional sectors <a class="anchor" id="othersectors"></a>

## A.1.1: MMC statistic <a class="anchor" id="mmc_othersectors"></a>

In [None]:
# MMC analysis for the remaining sectors: full model only, fewer randomizations,
# and no shift of the mosaic statistic towards the OLS statistic
moresectors = section_1_data(
    reps=1000, 
    names=['ITC', 'IND', 'CDI', 'CST', 'UTL', 'MAT'], 
    full_only=True,
    shift2match=False
)

In [None]:
## Additional metadata for plots
# sector code -> facet title
ncode2name = (
    moresectors.drop_duplicates(subset=['ncode', 'name'])
    .set_index("ncode")['name']
)
covid = datetime.datetime(year=2020, month=2, day=20)
# One 'Covid' label per sector, with the y-position of the text
annots = pd.DataFrame(
    [
        [covid, height, code]
        for code, height in [
            ('ITC', 0.25), ('IND', 0.25), ('CDI', 0.24),
            ('CST', 0.26), ('MAT', 0.25), ('UTL', 0.32),
        ]
    ],
    columns=['Date', 'value', 'ncode']
)
annots['label'] = 'Covid'
annots['name'] = annots['ncode'].map(ncode2name)

## Subset by time
sub = moresectors[moresectors['Date'] >= datetime.datetime(year=2018, month=1, day=1)].copy()
# Blank out the values up to April 1st, 2018
sub.loc[sub['Date'] <= datetime.datetime(year=2018, month=4, day=1), 'value'] = np.nan
# Date to star: the date in `sub` closest to April 21st, 2020
stardate = sub['Date'].iloc[
    np.argmin(np.abs(sub['Date'] - datetime.datetime(year=2020, month=4, day=21)))
]

In [None]:
# Names
sub0 = sub.copy()
sub0['variable'] = sub0['variable'].map({
    "T":r"Mosaic stat. $(S_t)$",
    "Tols":r"OLS stat. $(S_t^{ols})$" ,
    "Tq":r'$Q_{0.95}(S_t)$ under $H_0$'
})
# Codes without a display name map to NaN and are dropped here
sub0 = sub0.loc[sub0['variable'].notnull()] # get rid of p-vals
sub0 = sub0.loc[~sub0['variable'].str.contains("OLS")] # no need for OLS on this plot
# Choose where the stars go
sdata = sub0.loc[sub0['Date'] == stardate].copy()
sdata = sdata.loc[sdata['variable'].str.contains("Mosaic")]
g = (
    ggplot(
        sub0, 
        aes(x='Date', y='value', linetype='variable', color='variable')
    ) 
    + geom_line(size=1) 
    + facet_wrap("~name")
    + theme_bw()
    + theme(figure_size=(8, 6))
    #+ theme(axis_text_x = element_text(angle = 90))
    + geom_vline(data=annots, mapping=aes(xintercept='Date'), color='black', linetype='dotted')
    + geom_text(
        data=annots, 
        mapping=aes(x='Date', y='value', label='label'), 
        inherit_aes=False, size=9, angle=90, nudge_x=-60
    )
    + geom_point(
        mapping=aes(shape='Date.astype(str)'),
        data=sdata, size=4.5, fill='white', color='black',
    )
    # Only label January 1st breaks, with the year
    + scale_x_datetime(
        labels=lambda lst: [x.year if x.month==1 and x.day==1 else "" for x in lst]
    )
    + labs(
        y=r'Mean Max. Corr. ($S_t$)', 
        x='',
        title='',
    )
)
# Three values are supplied although only two curves remain after dropping OLS
g += scale_color_manual(['orangered', 'cornflowerblue', 'indigo'])
g += scale_linetype_manual(['dotted', 'solid', 'solid'])
g += scale_shape_manual(['*'])
g += labs(color='', linetype='', shape='')
g += theme(legend_position="bottom", legend_box_spacing=-0.01)
#g.save(f"{SAVE_DIR}/appendix/mmc_othersectors.png",  dpi=500, bbox_inches='tight')
g

## A.1.2: R2 plot <a class="anchor" id="r2_othersectors"></a>

In [None]:
# R2 analysis for the remaining sectors, with 10000 randomizations
moresectors_r2 = r2_plot_data(
    names=['ITC', 'IND', 'CDI', 'CST', 'UTL', 'MAT'], 
    reps=10000,
)

In [None]:
# Appendix R2 figure; taller than the default since there are six facets
g = r2_ribbon_plot(moresectors_r2, figure_size=(8, 6))
#g.save(f"{SAVE_DIR}/appendix/r2_othersectors_v2.png", dpi=500, bbox_inches='tight')
print(g)

# Appendix A.2 Sensitivity to window <a class="anchor" id="window"></a>

In [None]:
# One MMC run per window length; each result is tagged with the window used
wdfs = []
for window in [200, 300, 400]:
    wdf = section_1_data(
        window=window,
        reps=300,
        shift2match=True,
        full_only=True,
    ).assign(window=window)
    wdfs.append(wdf)

# Stack the runs into a single long dataframe
wdfs = pd.concat(wdfs, axis='index')

In [None]:
sub = wdfs.loc[wdfs['Date'] >= datetime.datetime(year=2018, month=1, day=1)].copy()
sub.loc[sub['Date'] <= datetime.datetime(year=2018, month=4, day=1), 'value'] = np.nan

# Names
sub0 = sub.copy()
sub0['variable'] = sub0['variable'].map({
    "T":r"Mosaic stat. $(S_t)$",
    "Tols":r"OLS stat. $(S_t^{ols})$" ,
    "Tq":r'$Q_{0.95}(S_t)$ under $H_0$'
})
# Codes without a display name map to NaN and are dropped here
sub0 = sub0.loc[sub0['variable'].notnull()] # get rid of p-vals
# .copy() so that the 'window' relabelling below writes to an independent frame
sub0 = sub0.loc[~sub0['variable'].str.contains("OLS")].copy() # no need for OLS on this plot
sub0['window'] = "Window=" + sub0['window'].astype(str)
# Choose where the stars go
# The star date is taken from this cell's own data (closest date to April 21st,
# 2020) rather than from the global `stardate`, which is set by another section
# from a different dataframe. A separate name keeps that global untouched.
window_stardate = sub['Date'].iloc[
    np.argmin(np.abs(sub['Date'] - datetime.datetime(year=2020, month=4, day=21)))
]
sdata = sub0.loc[sub0['Date'] == window_stardate].copy()
sdata = sdata.loc[sdata['variable'].str.contains("Mosaic")]
g = (
    ggplot(
        sub0, 
        aes(x='Date', y='value', linetype='variable', color='variable')
    ) 
    + geom_line(size=1) 
    + facet_grid("window~name")
    + theme_bw()
    + theme(figure_size=(8, 6))
    #+ theme(axis_text_x = element_text(angle = 90))
    # + geom_vline(data=annots, mapping=aes(xintercept='Date'), color='black', linetype='dotted')
    # + geom_text(
    #     data=annots, 
    #     mapping=aes(x='Date', y='value', label='label'), 
    #     inherit_aes=False, size=9, angle=90, nudge_x=-60
    # )
    + geom_point(
        mapping=aes(shape='Date.astype(str)'),
        data=sdata, size=4.5, fill='white', color='black',
    )
    + scale_x_datetime(
        labels=lambda lst: [x.year if x.month==1 and x.day==1 else "" for x in lst]
    )
    + labs(
        y=r'Mean Max. Corr. ($S_t$)', 
        x='',
        title='',
    )
)
g += scale_color_manual(['orangered', 'cornflowerblue', 'indigo'])
g += scale_linetype_manual(['dotted', 'solid', 'solid'])
g += scale_shape_manual(['*'])
g += labs(color='', linetype='', shape='')
g += theme(legend_position="bottom", legend_box_spacing=-0.01)
# Only SAVE_DIR itself is created at the top of the notebook; make sure the
# appendix subfolder exists before writing into it
os.makedirs(f"{SAVE_DIR}/appendix", exist_ok=True)
g.save(f"{SAVE_DIR}/appendix/mmc_window_sensitivity.png",  dpi=500, bbox_inches='tight')
g