In [1]:
# Load the Cython IPython extension; it provides the %%cython cell magic
# used by the cy_ks_np cell in this notebook.
%load_ext Cython

In [2]:
import time

import cython
import numpy as np
import pandas as pd
from scipy.stats import ks_2samp

# inputs
n_cols = 500    # number of random feature columns to generate
n_rows = 1000   # number of rows (split at random into 'base' / 'focus')

# value passed to ks_2samp as its `mode` argument by every benchmark
ks_mode = 'asymp'

# Seed numpy's global RNG so the generated data (and therefore every
# KS statistic / p-value computed from it) is reproducible across runs.
random_seed = 42
np.random.seed(random_seed)

In [3]:
# synthetic feature matrix: uniform random values in [0, 1),
# one 'col<i>' column per feature
df = pd.DataFrame(
    data=np.random.rand(n_rows, n_cols),
    columns=[f'col{col}' for col in range(n_cols)],
)

# one uniform draw per row decides whether it lands in the 'base' or the
# 'focus' window
df['window'] = np.where(np.random.rand(n_rows) < 0.5, 'focus', 'base')

# sanity-check the generated frame: shape, first rows, window sizes
print(df.shape, df.head(), df['window'].value_counts(), sep='\n')

(1000, 501)
       col0      col1      col2      col3      col4      col5      col6  \
0  0.576180  0.028715  0.993711  0.135535  0.346557  0.518403  0.729246   
1  0.660837  0.095143  0.371349  0.028041  0.233918  0.026286  0.836127   
2  0.106310  0.081704  0.978659  0.056065  0.651664  0.431180  0.564958   
3  0.672454  0.793970  0.077888  0.642459  0.507821  0.993883  0.947422   
4  0.172368  0.250165  0.734814  0.210094  0.451818  0.358249  0.495988   

       col7      col8      col9  ...    col491    col492    col493    col494  \
0  0.490158  0.494646  0.116077  ...  0.860490  0.569335  0.044060  0.971207   
1  0.886347  0.882663  0.605259  ...  0.103136  0.424617  0.150541  0.035092   
2  0.893656  0.501112  0.623185  ...  0.041430  0.825347  0.620048  0.942049   
3  0.136142  0.794745  0.639645  ...  0.841491  0.669624  0.081996  0.635159   
4  0.643380  0.699809  0.414895  ...  0.083681  0.623664  0.844328  0.572089   

     col495    col496    col497    col498    col499  win

In [4]:
# Split the numeric columns of df into two numpy arrays, one per window.
# _get_numeric_data() drops the non-numeric 'window' label column.
arr_base = df.loc[df['window'] == 'base']._get_numeric_data().values
arr_focus = df.loc[df['window'] == 'focus']._get_numeric_data().values

# show the type and (rows, columns) shape of each array
print(type(arr_base), arr_base.shape)
print(type(arr_focus), arr_focus.shape)

<class 'numpy.ndarray'> (507, 500)
<class 'numpy.ndarray'> (493, 500)


In [5]:
def ks_df_dumb(df, ks_mode):
    """Naive baseline: two-sample KS test for every numeric column of `df`.

    For each numeric column, the rows are split by the 'window' column into
    a 'base' sample and a 'focus' sample, and `ks_2samp` is run on the pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric columns and a 'window' column holding the labels
        'base' and 'focus'.
    ks_mode : str
        Passed to `ks_2samp` as its `mode` argument.

    Returns
    -------
    list of tuple
        One `(ks_stat, p_value)` pair per numeric column, in column order.
    """
    ks_pairs = []
    for column in df._get_numeric_data():
        # The frame is re-filtered for every column on purpose: this is the
        # straightforward pandas approach the other variants are compared to.
        base_rows = df[df['window'] == 'base']
        focus_rows = df[df['window'] == 'focus']
        statistic, pvalue = ks_2samp(
            base_rows[column].values,
            focus_rows[column].values,
            mode=ks_mode,
        )
        ks_pairs.append((statistic, pvalue))
    return ks_pairs

In [6]:
%%timeit -n 5 -r 5
# Benchmark ks_df_dumb: 5 runs (-r) of 5 loops (-n) each.
results = ks_df_dumb(df, ks_mode)

1.45 s ± 39.3 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [7]:
# Single-run wall-clock timing of ks_df_dumb.
# perf_counter() is used rather than time.time() because it is a monotonic,
# high-resolution clock meant for measuring elapsed intervals.
print('ks_df_dumb')
start_time = time.perf_counter()
results = ks_df_dumb(df, ks_mode)
end_time = time.perf_counter()
print(f'{round(end_time-start_time,2)} seconds')

ks_df_dumb
1.8 seconds


In [8]:
def ks_df_vec(df, ks_mode):
    """Run a two-sample KS test per numeric column of `df` via np.vectorize.

    Rows are split by the 'window' column into a 'base' and a 'focus' frame.
    The numeric columns of each are transposed so that every row of the
    resulting array holds one column's values, and a vectorized wrapper
    around `ks_2samp` is applied to matching rows of the two arrays.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric columns and a 'window' column holding the labels
        'base' and 'focus'.
    ks_mode : str
        Passed to `ks_2samp` as its `mode` argument.

    Returns
    -------
    list of tuple
        One `(ks_stat, p_value)` pair per numeric column, in column order.
    """

    def my_ks_2samp(a, b):
        # Close over ks_mode so the caller's requested mode is honoured;
        # vectorizing the bare ks_2samp would fall back to its default mode.
        return ks_2samp(a, b, mode=ks_mode)

    base = df[df['window'] == 'base']._get_numeric_data().transpose().values
    focus = df[df['window'] == 'focus']._get_numeric_data().transpose().values

    # Signature: two 1-D samples (lengths n and m may differ) -> two scalars,
    # so the wrapper loops over the leading axis (one row per column of df).
    ks_2samp_vec = np.vectorize(my_ks_2samp, signature='(n),(m)->(),()')
    ks_stats, p_values = ks_2samp_vec(base, focus)
    return list(zip(ks_stats, p_values))

In [None]:
%%timeit -n 5 -r 5
# Benchmark ks_df_vec: 5 runs (-r) of 5 loops (-n) each.
results = ks_df_vec(df, ks_mode)

In [None]:
# Single-run wall-clock timing of ks_df_vec.
# perf_counter() is used rather than time.time() because it is a monotonic,
# high-resolution clock meant for measuring elapsed intervals.
print('ks_df_vec')
start_time = time.perf_counter()
results = ks_df_vec(df, ks_mode)
end_time = time.perf_counter()
print(f'{round(end_time-start_time,2)} seconds')

In [None]:
def ks_np_dumb(arr_a, arr_b, ks_mode):
    """Column-by-column two-sample KS test on two 2-D arrays (plain loop).

    Parameters
    ----------
    arr_a, arr_b : array-like, 2-D
        Samples to compare; column `i` of `arr_a` is tested against column
        `i` of `arr_b`. The number of columns is taken from `arr_a`.
    ks_mode : str
        Passed to `ks_2samp` as its `mode` argument.

    Returns
    -------
    list of tuple
        One `(ks_stat, p_value)` pair per column of `arr_a`.
    """
    n_features = arr_a.shape[1]

    def _test_column(idx):
        # compare the idx-th column of both arrays
        statistic, pvalue = ks_2samp(arr_a[:, idx], arr_b[:, idx], mode=ks_mode)
        return (statistic, pvalue)

    return [_test_column(idx) for idx in range(n_features)]

In [None]:
%%timeit -n 5 -r 5
# Benchmark ks_np_dumb: 5 runs (-r) of 5 loops (-n) each.
results = ks_np_dumb(arr_base, arr_focus, ks_mode)

In [None]:
# Single-run wall-clock timing of ks_np_dumb.
# perf_counter() is used rather than time.time() because it is a monotonic,
# high-resolution clock meant for measuring elapsed intervals.
print('ks_np_dumb')
start_time = time.perf_counter()
results = ks_np_dumb(arr_base, arr_focus, ks_mode)
end_time = time.perf_counter()
print(f'{round(end_time-start_time,2)} seconds')

In [None]:
def ks_np_vec(arr_a, arr_b, ks_mode):
    """Column-by-column two-sample KS test on two 2-D arrays via np.vectorize.

    Both arrays are transposed so each row holds one column's values, and a
    vectorized wrapper around `ks_2samp` is applied to matching rows.

    Parameters
    ----------
    arr_a, arr_b : numpy.ndarray, 2-D
        Samples to compare column against column.
    ks_mode : str
        Passed to `ks_2samp` as its `mode` argument.

    Returns
    -------
    list of tuple
        One `(ks_stat, p_value)` pair per column.
    """
    # Signature: two 1-D samples (lengths n and m may differ) -> two scalars.
    vectorized_ks = np.vectorize(
        lambda sample_a, sample_b: ks_2samp(sample_a, sample_b, mode=ks_mode),
        signature='(n),(m)->(),()',
    )
    ks_stats, p_values = vectorized_ks(arr_a.T, arr_b.T)
    return list(zip(ks_stats, p_values))

In [None]:
%%timeit -n 5 -r 5
# Benchmark ks_np_vec: 5 runs (-r) of 5 loops (-n) each.
results = ks_np_vec(arr_base, arr_focus, ks_mode)

In [None]:
# Single-run wall-clock timing of ks_np_vec.
# perf_counter() is used rather than time.time() because it is a monotonic,
# high-resolution clock meant for measuring elapsed intervals.
print('ks_np_vec')
start_time = time.perf_counter()
results = ks_np_vec(arr_base, arr_focus, ks_mode)
end_time = time.perf_counter()
print(f'{round(end_time-start_time,2)} seconds')

In [None]:
%%cython

import numpy as np
cimport numpy as np
cimport cython
from scipy.stats import ks_2samp

DTYPE = np.double


cpdef cy_ks_np(double[:, :] arr_a, double[:, :] arr_b, str ks_mode):
    """Column-by-column two-sample KS test on two 2-D double arrays.

    Column `i` of `arr_a` is tested against column `i` of `arr_b` with
    `ks_2samp`; the number of columns is taken from `arr_a`.

    Parameters
    ----------
    arr_a, arr_b : 2-D typed memoryviews of double
        Samples to compare column against column.
    ks_mode : str
        Passed to `ks_2samp` as its `mode` argument.

    Returns
    -------
    numpy.ndarray
        Array of shape `(n_columns, 2)` and dtype `DTYPE`; row `i` holds the
        KS statistic in column 0 and the p-value in column 1.
    """
    cdef Py_ssize_t n_features = arr_a.shape[1]
    cdef Py_ssize_t col
    cdef double statistic, pvalue

    # Allocate the output once and write into it through a typed view.
    out = np.zeros((n_features, 2), dtype=DTYPE)
    cdef double[:, :] out_view = out

    for col in range(n_features):
        # ks_2samp is an ordinary Python call; only the loop index and the
        # stores into out_view are typed.
        statistic, pvalue = ks_2samp(arr_a[:, col], arr_b[:, col], mode=ks_mode)
        out_view[col, 0] = statistic
        out_view[col, 1] = pvalue

    return out

In [None]:
%%timeit -n 5 -r 5
# Benchmark cy_ks_np: 5 runs (-r) of 5 loops (-n) each.
results = cy_ks_np(arr_base, arr_focus, ks_mode)

In [None]:
# Single-run wall-clock timing of cy_ks_np.
# perf_counter() is used rather than time.time() because it is a monotonic,
# high-resolution clock meant for measuring elapsed intervals.
print('cy_ks_np')
start_time = time.perf_counter()
results = cy_ks_np(arr_base, arr_focus, ks_mode)
end_time = time.perf_counter()
print(f'{round(end_time-start_time,2)} seconds')

In [None]:
# Profile one call of cy_ks_np with IPython's %prun to see which calls
# account for the run time.
%prun cy_ks_np(arr_base, arr_focus, ks_mode)