In [1]:
# load some libs
import sys
import time
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import random

In [2]:
# process data
data = pd.read_csv('../data/DailyReturn.csv', sep=',')
new_data = data.drop(columns=['Unnamed: 0', 'SPY'])
new_data = new_data.iloc[:, 0:10] # just select 10 company

In [3]:
# calculate covariance, mean, Exponentially weighted(EW) covraiance ,PCA,

# means
means = new_data.mean().values
# covariance
norm = (new_data - new_data.mean()).to_numpy()
cov = norm.T @ norm
# EW covariance
alpha = 0.97
weights = (1-alpha)**np.arange(len(new_data))[::-1]
norm_new_data = (new_data - new_data.mean()).fillna(0).to_numpy()
EW_cov = ((weights * norm_new_data.T)@norm_new_data)/(weights.sum())
# PCA
scaler = StandardScaler()
pca = PCA()
new_data_scale = scaler.fit_transform(new_data)
new_data_pca = pca.fit(new_data_scale)

In [4]:
print(new_data_pca.explained_variance_ratio_)
print(new_data_pca.singular_values_)
new_data_pca.get_covariance().shape

[0.46872862 0.16703891 0.1016091  0.07137857 0.07000315 0.04807623
 0.0333704  0.02619545 0.0130088  0.00059077]
[16.77012739 10.01116093  7.80803804  6.54424509  6.48088633  5.37082305
  4.4746219   3.96450136  2.79379339  0.59536601]


(10, 10)

In [5]:
# implement a multivariate normal simulation
def multi_norm_simu(means, cov, sample_num=200):
    random.seed(1)
    N = sample_num
    B = np.linalg.cholesky(cov)

    # norm() return one number from standard normal distribution
    origin_simu = np.random.rand(len(means), sample_num)
#     n1 = np.array([random.gauss(0, 1) for _ in range(sample_num)])
#     n2 = np.array([random.gauss(0, 1) for _ in range(sample_num)])#features*samples
    res = origin_simu.T.dot(B.T) + means
    return res

simu_data = multi_norm_simu(means, cov, sample_num=10)


In [6]:
np.all(np.linalg.eigvals(cov) > (0))

True

In [7]:
# generate correlation matrix and variance
pearson_cor = np.corrcoef(new_data.values.T)
print(cov)
print(EW_cov)


[[ 1.51899353e-02  9.19076256e-03  9.66187195e-03  1.89518770e-02
   7.21726303e-03  7.24539772e-03  7.19689871e-03  1.88000671e-02
   2.85582662e-05  1.19474333e-03]
 [ 9.19076256e-03  1.50331953e-02  7.65306033e-03  1.78111426e-02
   1.18631594e-02  1.17552908e-02  8.70510218e-03  2.05863333e-02
  -3.69618795e-04 -9.47848600e-04]
 [ 9.66187195e-03  7.65306033e-03  1.53878232e-02  9.80299142e-03
   9.13834648e-03  9.07235255e-03  8.98054825e-03  2.36333984e-02
  -3.73608449e-04 -2.37331210e-04]
 [ 1.89518770e-02  1.78111426e-02  9.80299142e-03  1.21092434e-01
   9.98668912e-03  1.05217227e-02  1.62231820e-02  4.31893713e-02
  -1.23217728e-03  2.96852074e-03]
 [ 7.21726303e-03  1.18631594e-02  9.13834648e-03  9.98668912e-03
   1.46054006e-02  1.42413857e-02  8.28189707e-03  1.94441449e-02
   2.85216455e-04  6.98872426e-04]
 [ 7.24539772e-03  1.17552908e-02  9.07235255e-03  1.05217227e-02
   1.42413857e-02  1.40676270e-02  8.52186379e-03  1.92480584e-02
   2.48778231e-04  7.84250198e-04

In [27]:
# simulate 25000 draws
# direct simulation
N = 25000

start_time = time.time()
simu_data_1 = np.random.multivariate_normal(means, cov, N)
end_time = time.time()
print("time consuming: {:.4f}s".format(end_time - start_time))

print(simu_data_1.shape)

# PCA with 100% explained
scaler = StandardScaler()
pca = PCA()
new_data_scale = scaler.fit_transform(new_data)
new_data_pca = pca.fit(new_data)
cov2 = new_data_pca.get_covariance()
start_time = time.time()
simu_data_2 = np.random.multivariate_normal(means,cov2, N)
end_time = time.time()
print("time consuming: {:.4f}s".format(end_time - start_time))
print(simu_data_2.shape)

# PCA with 75% explained

scaler = StandardScaler()
pca = PCA(0.75)
new_data_scale = scaler.fit_transform(new_data)
new_data_pca = pca.fit(new_data)
cov3 = new_data_pca.get_covariance()

start_time = time.time()

simu_data_3 = np.random.multivariate_normal(means,cov3, N)
end_time = time.time()
print("time consuming: {:.4f}s".format(end_time - start_time))

print(simu_data_3.shape)

# PCA with 50% explained

scaler = StandardScaler()
pca = PCA(0.50)
new_data_scale = scaler.fit_transform(new_data)
new_data_pca = pca.fit(new_data)
cov4 = new_data_pca.get_covariance()
start_time = time.time()
simu_data_4 = np.random.multivariate_normal(means,cov4, N)
end_time = time.time()
print("time consuming: {:.4f}s".format(end_time - start_time))
print(simu_data_4.shape)


time consuming: 0.0250s
(25000, 10)
time consuming: 0.0133s
(25000, 10)
time consuming: 0.0125s
(25000, 10)
time consuming: 0.0058s
(25000, 10)


In [28]:
# compare L2 norm
np.cov(simu_data_1.T).shape

(10, 10)

In [29]:
np.cov(simu_data_2.T)

array([[ 2.57335827e-04,  1.55344343e-04,  1.63483767e-04,
         3.21248563e-04,  1.21865368e-04,  1.22333842e-04,
         1.23214523e-04,  3.17287440e-04,  1.18213570e-06,
         2.13183013e-05],
       [ 1.55344343e-04,  2.53246132e-04,  1.29638176e-04,
         3.01831143e-04,  2.00563447e-04,  1.98357096e-04,
         1.48540597e-04,  3.46430286e-04, -6.09315616e-06,
        -1.71715665e-05],
       [ 1.63483767e-04,  1.29638176e-04,  2.60232219e-04,
         1.61625816e-04,  1.53911274e-04,  1.52836656e-04,
         1.52303659e-04,  3.96632931e-04, -6.11885327e-06,
        -5.00035678e-06],
       [ 3.21248563e-04,  3.01831143e-04,  1.61625816e-04,
         2.06628076e-03,  1.67470023e-04,  1.75937229e-04,
         2.69996421e-04,  7.22224877e-04, -1.98844501e-05,
         5.49406905e-05],
       [ 1.21865368e-04,  2.00563447e-04,  1.53911274e-04,
         1.67470023e-04,  2.46034342e-04,  2.39634975e-04,
         1.41012051e-04,  3.26470047e-04,  4.66499119e-06,
         9.

In [30]:
np.cov(simu_data_3.T)

array([[ 2.73269065e-04,  1.19809639e-04,  1.14457204e-04,
         3.14308102e-04,  1.09335728e-04,  1.10908353e-04,
         1.09862536e-04,  3.46964810e-04, -4.08261017e-06,
         8.44354377e-06],
       [ 1.19809639e-04,  2.94598642e-04,  1.35290748e-04,
         2.94582629e-04,  1.26378271e-04,  1.27577907e-04,
         1.20833689e-04,  3.89721640e-04, -2.73064279e-06,
         9.64029234e-06],
       [ 1.14457204e-04,  1.35290748e-04,  3.07429956e-04,
         1.67004508e-04,  1.39444823e-04,  1.37643345e-04,
         1.19809077e-04,  4.07278302e-04, -2.46564705e-06,
         5.94609545e-06],
       [ 3.14308102e-04,  2.94582629e-04,  1.67004508e-04,
         2.06223407e-03,  1.68179466e-04,  1.66202585e-04,
         2.74231071e-04,  7.20510553e-04, -2.07976396e-05,
         5.33858257e-05],
       [ 1.09335728e-04,  1.26378271e-04,  1.39444823e-04,
         1.68179466e-04,  2.90021932e-04,  1.28285750e-04,
         1.13102459e-04,  3.79844137e-04, -3.79829063e-06,
         7.

In [31]:
norm_num_1 = np.linalg.norm(np.cov(simu_data_1.T), ord =2)
norm_num_2 = np.linalg.norm(np.cov(simu_data_2.T), ord =2)*60# for different normal appraoch 60 days
norm_num_3 = np.linalg.norm(np.cov(simu_data_3.T), ord =2)*60
norm_num_4 = np.linalg.norm(np.cov(simu_data_4.T), ord =2)*60
print(norm_num_1)
print(norm_num_2)
print(norm_num_3)
print(norm_num_4)

0.17756589070425732
0.17957492771318537
0.17860561659801294
0.18284065473868427


In [33]:
norm_num = np.linalg.norm(cov) 

In [34]:
norm_num

0.19323538497896178

In [25]:
0.0029836533739408173*60

0.17901920243644903

In [26]:
0.00302953101278869*60

0.1817718607673214