# Covariance - Pandas vs Numpy

In [35]:
import numpy as np
import pandas as pd
from time import process_time_ns

#### Prepare data

In [36]:
# Seed NumPy's global RNG so the random frames below are identical on every
# Restart & Run All; unseeded, each kernel session would benchmark and verify
# different data.
SEED = 42
np.random.seed(SEED)

# 2000 consecutive daily timestamps starting 2000-01-01, shared as the index
# of every test frame.
dates = pd.date_range('1/1/2000', periods=2000)

# Standard-normal test frames with 5, 50 and 100 columns (2000 rows each).
ts_5 = pd.DataFrame(np.random.randn(2000, 5), index=dates)
ts_50 = pd.DataFrame(np.random.randn(2000, 50), index=dates)
ts_100 = pd.DataFrame(np.random.randn(2000, 100), index=dates)

## Comparison

### 5 columns

In [37]:
%%timeit
# pandas path, 5 columns: 100 covariance computations per timed run,
# each result scaled by 252.
for _ in range(100):
    cov_pd = 252 * ts_5.cov()

10 loops, best of 5: 74.3 ms per loop


In [38]:
%%timeit
# numpy path, 5 columns: per repetition, drop every row that contains a NaN,
# take the column-wise covariance (rowvar=False), scale it by 252 and wrap the
# result in a DataFrame. Repeated 100 times per timed run.
for _ in range(100):
    raw = ts_5.values
    row_has_nan = np.isnan(raw).any(axis=1)
    complete_rows = raw[~row_has_nan]
    cov_np = pd.DataFrame(data=np.cov(complete_rows, rowvar=False) * 252)

10 loops, best of 5: 39.8 ms per loop


### 50 columns

In [39]:
%%timeit
# pandas path, 50 columns: 100 covariance computations per timed run,
# each result scaled by 252.
for _ in range(100):
    cov_pd = 252 * ts_50.cov()

1 loop, best of 5: 294 ms per loop


In [40]:
%%timeit
# numpy path, 50 columns: per repetition, drop every row that contains a NaN,
# take the column-wise covariance (rowvar=False), scale it by 252 and wrap the
# result in a DataFrame. Repeated 100 times per timed run.
for _ in range(100):
    raw = ts_50.values
    row_has_nan = np.isnan(raw).any(axis=1)
    complete_rows = raw[~row_has_nan]
    cov_np = pd.DataFrame(data=np.cov(complete_rows, rowvar=False) * 252)

1 loop, best of 5: 265 ms per loop


### 100 columns

In [41]:
%%timeit
# pandas path, 100 columns: 100 covariance computations per timed run,
# each result scaled by 252.
for _ in range(100):
    cov_pd = 252 * ts_100.cov()

1 loop, best of 5: 516 ms per loop


In [42]:
%%timeit
# numpy path, 100 columns: per repetition, drop every row that contains a NaN,
# take the column-wise covariance (rowvar=False), scale it by 252 and wrap the
# result in a DataFrame. Repeated 100 times per timed run.
for _ in range(100):
    raw = ts_100.values
    row_has_nan = np.isnan(raw).any(axis=1)
    complete_rows = raw[~row_has_nan]
    cov_np = pd.DataFrame(data=np.cov(complete_rows, rowvar=False) * 252)

1 loop, best of 5: 620 ms per loop


## Correctness check

In [43]:
# Compare the pandas and numpy covariance paths on the same input.
# The original cell read `ts`, which no cell in this notebook defines, so it
# raised NameError on a fresh kernel. Bind it explicitly to one of the prepared
# frames (NOTE(review): ts_5 chosen here; any of ts_5 / ts_50 / ts_100 works).
ts = ts_5

# pandas
cov_pd = ts.cov()*252

# numpy
v = ts.values
v = v[~np.isnan(v).any(axis=1)]  # keep only rows without any NaN
cov_np = np.cov(v, rowvar=False)*252
# Reuse the pandas labels so the subtraction below aligns element by element
# instead of producing NaN for labels that do not match.
cov_np = pd.DataFrame(data=cov_np, index=cov_pd.index, columns=cov_pd.columns)

# diff
# A signed sum can report 0.0 even when the matrices differ (positive and
# negative errors cancel, and NaNs are skipped by `sum`), so fail loudly on any
# real mismatch and display the largest absolute element-wise difference.
np.testing.assert_allclose(cov_np.values, cov_pd.values)
(cov_pd-cov_np).abs().max().max()

0.0