## DarQube: Calculating Huge Correlation Matrix

In [6]:
import numpy as np
from numpy.random import rand
import pandas as pd
np.random.seed(1)

## Generate Prices Dataframe

In [7]:
%%time
# Some constants
num_dfs = 10  # Number of random dataframes to generate
n_rows = 1000
n_cols = 1500
df_prices=pd.DataFrame(rand(n_rows*n_cols).reshape((n_rows, n_cols)), columns=np.arange(n_cols))
df_prices.shape

CPU times: user 47.3 ms, sys: 11.9 ms, total: 59.2 ms
Wall time: 58.3 ms


In [8]:
df_prices.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1490,1491,1492,1493,1494,1495,1496,1497,1498,1499
0,0.417022,0.720324,0.000114,0.302333,0.146756,0.092339,0.18626,0.345561,0.396767,0.538817,...,0.551432,0.185289,0.442101,0.185318,0.947376,0.493825,0.963918,0.644942,0.484316,0.967695
1,0.14319,0.63233,0.661888,0.43384,0.059357,0.492723,0.465134,0.806747,0.256661,0.39143,...,0.972021,0.214398,0.541121,0.453435,0.967609,0.836176,0.214147,0.439539,0.168477,0.529087
2,0.793485,0.035879,0.191359,0.977456,0.960797,0.556592,0.646091,0.272093,0.952091,0.669775,...,0.816128,0.154246,0.185194,0.542595,0.061398,0.538059,0.335997,0.324454,0.229853,0.004687
3,0.687073,0.752781,0.338283,0.469804,0.797382,0.647468,0.022978,0.283972,0.954395,0.883155,...,0.698019,0.670183,0.973578,0.465109,0.104017,0.024147,0.8813,0.026191,0.950828,0.569918
4,0.657568,0.90917,0.901952,0.390855,0.612539,0.521835,0.817041,0.056183,0.626975,0.429738,...,0.067117,0.464784,0.936318,0.148084,0.881589,0.198591,0.482828,0.490967,0.2348,0.596583


## Calculate Returns Dataframe

In [9]:
%%time
df_returns=df_prices / df_prices.shift(1) - 1
df_returns.shape
df_returns.head()

CPU times: user 85 ms, sys: 83.5 ms, total: 168 ms
Wall time: 169 ms


## Calculate Correlation Dataframe

In [10]:
%%time
df_correl=df_returns.corr()
df_correl.shape

CPU times: user 47 s, sys: 389 ms, total: 47.4 s
Wall time: 1min 9s


In [11]:
df_correl.shape

(1500, 1500)

In [12]:
df_correl.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1490,1491,1492,1493,1494,1495,1496,1497,1498,1499
0,1.0,-0.01231,-0.00574,-0.001611,0.001267,-0.000877,-0.013167,-0.003743,-0.004015,-0.007209,...,-0.009442,0.001161,0.019179,-0.012405,-0.004861,-0.010676,-0.003614,0.002386,0.008031,-0.004265
1,-0.01231,1.0,0.00997,-0.006946,-0.001729,-0.008895,-0.030501,-0.003586,-0.006444,0.002945,...,-0.013677,-0.001803,-0.00615,0.018797,-0.00993,-0.008366,0.026748,-0.005813,-0.002921,0.005866
2,-0.00574,0.00997,1.0,-0.00254,-0.006361,0.000107,0.005433,-0.002602,-0.002319,-0.005931,...,-0.00156,-0.004067,0.002483,-0.006524,-0.003167,-0.005041,-0.005673,-0.006333,-0.002818,-0.004245
3,-0.001611,-0.006946,-0.00254,1.0,0.02455,-0.005921,-0.003132,-0.002795,-0.002452,-0.003195,...,-0.002504,-0.001056,-0.002505,0.014712,-0.002384,-0.00526,-0.002032,-0.00324,-0.002495,-0.002918
4,0.001267,-0.001729,-0.006361,0.02455,1.0,-0.003613,-0.004909,-0.003903,-0.002986,-0.000978,...,-0.003866,-0.005233,-0.00295,-0.009597,-0.000441,0.000903,-0.008268,-0.008458,-0.003595,-0.006215


## Calculate Beta/Multiplier Dataframe

In [13]:
%%time
df_beta=df_returns.cov()/df_returns.var()  #Calculate betas

CPU times: user 39.9 s, sys: 166 ms, total: 40 s
Wall time: 40.6 s


In [14]:
df_beta.shape

(1500, 1500)

In [15]:
df_beta.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1490,1491,1492,1493,1494,1495,1496,1497,1498,1499
0,1.0,-0.028206,-0.000819,-0.000394,0.001297,-0.0011,-0.03619,-0.001055,-0.001118,-0.006657,...,-0.006746,0.000855,0.005274,-0.043412,-0.001167,-0.008853,-0.002508,0.002472,0.001388,-0.002578
1,-0.005373,1.0,0.000621,-0.00074,-0.000773,-0.004866,-0.036585,-0.000441,-0.000783,0.001187,...,-0.004265,-0.00058,-0.000738,0.02871,-0.00104,-0.003028,0.0081,-0.002628,-0.00022,0.001548
2,-0.040229,0.160113,1.0,-0.004349,-0.045648,0.000943,0.104661,-0.005141,-0.004525,-0.038386,...,-0.007814,-0.020994,0.004786,-0.160019,-0.005328,-0.029296,-0.027588,-0.045978,-0.003413,-0.017985
3,-0.006597,-0.065156,-0.001484,1.0,0.102911,-0.030389,-0.035244,-0.003226,-0.002794,-0.012077,...,-0.007324,-0.003185,-0.00282,0.210787,-0.002343,-0.017857,-0.005771,-0.013739,-0.001765,-0.007221
4,0.001237,-0.00387,-0.000886,0.005857,1.0,-0.004423,-0.013178,-0.001074,-0.000812,-0.000882,...,-0.002697,-0.003764,-0.000792,-0.032804,-0.000103,0.000731,-0.005603,-0.008557,-0.000607,-0.003669


In [16]:
df_covar.head()

NameError: name 'df_covar' is not defined

In [None]:
df_beta.head()