In [1]:
from utils import *

In [2]:
period = timedelta(days=2 * 365)  # look-back window: 2 years of 365 days

# Load symbols

In [3]:
# Directory that holds `index.parquet`.
path_simus = Path('./simus_with_more_shifts/')

# 'path' values of the index rows whose `n_stocks` is at most 100.
paths_universes = (
    pl.scan_parquet(path_simus / 'index.parquet')
    .filter(pl.col('n_stocks') <= 100)
    .select('path')
    .collect()
)

# One set of distinct symbols per selected row, read from the file that sits
# next to `path` and carries the same file name prefixed with 'universe_'.
symbols = [
    set(
        pl.scan_parquet(Path(path).parent / ('universe_' + Path(path).name))
        .select('symbol')
        .unique()
        .collect()
        .get_column('symbol')
        .to_list()
    )
    for path in paths_universes.get_column('path')
]

# Sorted union of every symbol seen. `set().union(*symbols)` is used instead of
# the unbound `set.union(*symbols)`, which raises TypeError when the filter
# above selects no rows; this form yields an empty list in that case.
columns = sorted(set().union(*symbols))
len(columns)

880

# Load returns

In [4]:
# Wide table of returns: one row per keys[0] value, one column per symbol.
# Market data is requested starting at `init_date - period`.
returns = (
    load_and_prepare_market_data(start_date=init_date - period)
    .with_columns(
        # Relative change of 'price' versus the previous row of the same
        # keys[1] group. Assumes rows arrive ordered by date within each
        # group -- TODO confirm in load_and_prepare_market_data.
        pl.col('price').pct_change().over(keys[1]).alias('return')
    )
    # Keep only the symbols collected from the universe files.
    .filter(pl.col(keys[1]).is_in(columns))
    .collect()
    .pivot(values='return', index=keys[0], columns=keys[1], sort_columns=True)
    # NOTE(review): the pivot index is keys[0] while the sort uses the literal
    # 'date'; both must name the same column for this to run.
    .sort('date')
)
dates = returns['date']
print(dates.len())

6609


# Compute covariance for one day

In [5]:
%%time
# Sample day: the entry at position 1000 of `dates`.
date = dates[1000]
# Parquet file name derived from the day, formatted as yymmdd.
name = f'{date:%y%m%d}.parquet'
# Rows inside the look-back window (date - period, date].
sub = returns.filter(
    (pl.col('date') > date - period) & (pl.col('date') <= date)
)
# Evaluate the pairwise expressions built by pairs_cov on the window (one
# result row), then let shape_matrix arrange that row using `columns`.
shape_matrix(sub.select(pairs_cov(columns)).row(0), columns)

CPU times: user 4.02 s, sys: 961 ms, total: 4.98 s
Wall time: 3.66 s


Unnamed: 0,AACG,AAL,AAOI,AAPL,AAXJ,ABEO,ABIO,ABNB,ABTS,ABUS,...,YY,Z,ZBRA,ZD,ZG,ZI,ZION,ZM,ZS,ZUMZ
AACG,1.0,-0.0,-0.0,-0.000000,-0.0,-0.000000,-0.000000,-0.0,-0.0,-0.0,...,-0.0,-0.0,-0.000000,-0.000000,-0.0,-0.0,-0.000000,-0.0,-0.0,-0.0
AAL,-0.0,1.0,-0.0,-0.000000,-0.0,-0.000000,-0.000000,-0.0,-0.0,-0.0,...,-0.0,-0.0,-0.000000,-0.000000,-0.0,-0.0,-0.000000,-0.0,-0.0,-0.0
AAOI,-0.0,-0.0,1.0,-0.000000,-0.0,-0.000000,-0.000000,-0.0,-0.0,-0.0,...,-0.0,-0.0,-0.000000,-0.000000,-0.0,-0.0,-0.000000,-0.0,-0.0,-0.0
AAPL,-0.0,-0.0,-0.0,1.000000,-0.0,0.000067,0.000818,-0.0,-0.0,-0.0,...,-0.0,-0.0,0.000558,0.000191,-0.0,-0.0,0.000166,-0.0,-0.0,-0.0
AAXJ,-0.0,-0.0,-0.0,-0.000000,1.0,-0.000000,-0.000000,-0.0,-0.0,-0.0,...,-0.0,-0.0,-0.000000,-0.000000,-0.0,-0.0,-0.000000,-0.0,-0.0,-0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZI,-0.0,-0.0,-0.0,-0.000000,-0.0,-0.000000,-0.000000,-0.0,-0.0,-0.0,...,-0.0,-0.0,-0.000000,-0.000000,-0.0,1.0,-0.000000,-0.0,-0.0,-0.0
ZION,-0.0,-0.0,-0.0,0.000166,-0.0,-0.000019,0.000245,-0.0,-0.0,-0.0,...,-0.0,-0.0,0.000124,-0.000007,-0.0,-0.0,1.000000,-0.0,-0.0,-0.0
ZM,-0.0,-0.0,-0.0,-0.000000,-0.0,-0.000000,-0.000000,-0.0,-0.0,-0.0,...,-0.0,-0.0,-0.000000,-0.000000,-0.0,-0.0,-0.000000,1.0,-0.0,-0.0
ZS,-0.0,-0.0,-0.0,-0.000000,-0.0,-0.000000,-0.000000,-0.0,-0.0,-0.0,...,-0.0,-0.0,-0.000000,-0.000000,-0.0,-0.0,-0.000000,-0.0,1.0,-0.0


# Compute PCA

In [6]:
# pandas copy of the windowed returns, indexed by date.
# NOTE(review): symbols with no observations in the window stay as all-NaN
# columns (the saved covariance output shows NaN rows for them) -- confirm
# that get_pca_factors_model copes with such columns.
returns_pd = (
    sub
    .to_pandas()
    .set_index('date')
)

In [7]:
%%time
# pandas covariance of the same window. DataFrame.cov skips missing values
# pairwise; the saved output shows NaN for symbols such as AACG.
returns_pd.cov()

CPU times: user 172 ms, sys: 0 ns, total: 172 ms
Wall time: 171 ms


Unnamed: 0,AACG,AAL,AAOI,AAPL,AAXJ,ABEO,ABIO,ABNB,ABTS,ABUS,...,YY,Z,ZBRA,ZD,ZG,ZI,ZION,ZM,ZS,ZUMZ
AACG,,,,,,,,,,,...,,,,,,,,,,
AAL,,,,,,,,,,,...,,,,,,,,,,
AAOI,,,,,,,,,,,...,,,,,,,,,,
AAPL,,,,0.002298,,0.000067,0.000818,,,,...,,,0.000558,0.000191,,,0.000166,,,
AAXJ,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZI,,,,,,,,,,,...,,,,,,,,,,
ZION,,,,0.000166,,-0.000019,0.000245,,,,...,,,0.000124,-0.000007,,,0.000666,,,
ZM,,,,,,,,,,,...,,,,,,,,,,
ZS,,,,,,,,,,,...,,,,,,,,,,


In [8]:
%%time
# NOTE(review): 0.70 looks like a fraction (e.g. a share of variance to keep)
# rather than a component count -- confirm against get_pca_factors_model.
n_components = 0.70
# Build the PCA factor model from the pandas returns (helper internals are not
# visible here); the result is read by key in a later cell.
res = get_pca_factors_model(returns_pd, n_components)

CPU times: user 9.15 s, sys: 817 ms, total: 9.97 s
Wall time: 4.24 s


In [9]:
# Show the 'cleaned_corr' entry of the model result.
# NOTE(review): the saved output is a 1x1 frame containing only AACG, although
# the input has one column per symbol -- check how get_pca_factors_model
# handles the all-NaN columns visible in the covariance output.
res['cleaned_corr']

Unnamed: 0,AACG
AACG,1.0
