In [37]:
%load_ext Cython

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


In [38]:
import time

import cython
import numpy as np
import pandas as pd
from scipy.stats import ks_2samp

# inputs
n_cols = 500
n_rows = 1000

In [39]:
# create data
df = pd.DataFrame(
    np.random.rand(n_rows, n_cols),
    columns=[f'col{col}' for col in range(n_cols)]
)

# randomly select a base and focus window
df['window'] = np.where(np.random.rand(n_rows) >= 0.5, 'base', 'focus')

# print some info
print(df.shape)
print(df.head())
print(df['window'].value_counts())

(1000, 501)
       col0      col1      col2      col3      col4      col5      col6  \
0  0.886069  0.342227  0.940609  0.208911  0.896751  0.829383  0.856272   
1  0.209245  0.043635  0.514498  0.693389  0.621050  0.217023  0.517723   
2  0.320506  0.180175  0.726251  0.573701  0.207949  0.572271  0.193500   
3  0.835452  0.904155  0.663100  0.749944  0.209785  0.029508  0.751463   
4  0.433899  0.056769  0.226017  0.600785  0.494551  0.236688  0.732381   

       col7      col8      col9  ...    col491    col492    col493    col494  \
0  0.423602  0.655348  0.537874  ...  0.448364  0.451231  0.747732  0.893936   
1  0.833583  0.247214  0.361892  ...  0.626373  0.960159  0.402688  0.727979   
2  0.355157  0.249714  0.631273  ...  0.749485  0.949959  0.424991  0.617194   
3  0.875575  0.339591  0.778173  ...  0.678897  0.706769  0.696911  0.561921   
4  0.021581  0.777907  0.299382  ...  0.774766  0.339363  0.744937  0.481794   

     col495    col496    col497    col498    col499  win

In [40]:
# create two numpy array's from df
arr_base = df[df['window'] == 'base']._get_numeric_data().values
arr_focus = df[df['window'] == 'focus']._get_numeric_data().values
print(type(arr_base), arr_base.shape)
print(type(arr_focus), arr_focus.shape)

<class 'numpy.ndarray'> (498, 500)
<class 'numpy.ndarray'> (502, 500)


In [41]:
def ks_df_dumb(df):
    results = []
    for col in df._get_numeric_data():
        base = df[df['window'] == 'base'][col].values
        focus = df[df['window'] == 'focus'][col].values
        ks_stat, p_value = ks_2samp(base, focus, mode='asymp')
        results.append((ks_stat, p_value))
    return results

In [51]:
def ks_df_vec(df):
    
    def my_ks_2samp(a,b):
        return ks_2samp(a,b,mode='asymp')
    
    results = []
    base = df[df['window'] == 'base']._get_numeric_data().transpose().values
    focus = df[df['window'] == 'focus']._get_numeric_data().transpose().values
    ks_2samp_vec = np.vectorize(ks_2samp, signature='(n),(m)->(),()')
    results = ks_2samp_vec(base, focus)
    results = list(zip(results[0], results[1]))
    return results

In [43]:
def ks_np_dumb(arr_a, arr_b):
    results = []
    for n in range(arr_a.shape[1]):        
        ks_stat, p_value = ks_2samp(arr_a[:,n],arr_b[:,n], mode='asymp')
        results.append((ks_stat, p_value))
    return results

In [50]:
def ks_np_vec(arr_a, arr_b):
    
    def my_ks_2samp(a,b):
        return ks_2samp(a,b,mode='asymp')
    
    ks_2samp_vec = np.vectorize(my_ks_2samp, signature='(n),(m)->(),()')
    results = ks_2samp_vec(arr_a.T, arr_b.T)
    results = list(zip(results[0], results[1]))
    return results

In [45]:
%%cython

import numpy as np
cimport numpy as np
cimport cython
from scipy.stats import ks_2samp

DTYPE = np.double


cpdef cy_ks_np(double[:, :] arr_a, double[:, :] arr_b):

    cdef double k, p
    cdef Py_ssize_t i
    cdef Py_ssize_t m = arr_a.shape[1]
    
    result = np.zeros((m, 2), dtype=DTYPE)
    cdef double[:, :] result_view = result

    for i in range(m):
        k, p = ks_2samp(arr_a[:,i], arr_b[:,i], mode='asymp')
        result_view[i,0] = k
        result_view[i,1] = p

    return result

In [46]:
%%timeit -n 5 -r 5
results = ks_df_dumb(df)

1.55 s ± 114 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [47]:
%%timeit -n 5 -r 5
results = ks_df_vec(df)

2.28 s ± 116 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [48]:
%%timeit -n 5 -r 5
results = ks_np_dumb(arr_base, arr_focus)

101 ms ± 2.84 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [52]:
%%timeit -n 5 -r 5
results = ks_np_vec(arr_base, arr_focus)

103 ms ± 4.53 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [53]:
%%timeit -n 5 -r 5
results = cy_ks_np(arr_base, arr_focus)

123 ms ± 6.25 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [54]:
print('ks_df_dumb')
start_time = time.time()
results = ks_df_dumb(df)
end_time = time.time()
print(f'{round(end_time-start_time,2)} seconds')

ks_df_dumb
1.9 seconds


In [55]:
print('ks_df_vec')
start_time = time.time()
results = ks_df_vec(df)
end_time = time.time()
print(f'{round(end_time-start_time,2)} seconds')

ks_df_vec
2.54 seconds


In [59]:
print('ks_np_dumb')
start_time = time.time()
results = ks_np_dumb(arr_base, arr_focus)
end_time = time.time()
print(f'{round(end_time-start_time,2)} seconds')

ks_np_dumb
0.12 seconds


In [56]:
print('ks_np_vec')
start_time = time.time()
results = ks_np_vec(arr_base, arr_focus)
end_time = time.time()
print(f'{round(end_time-start_time,2)} seconds')

ks_np_vec
0.16 seconds


In [57]:
print('cy_ks_np')
start_time = time.time()
results = cy_ks_np(arr_base, arr_focus)
end_time = time.time()
print(f'{round(end_time-start_time,2)} seconds')

cy_ks_np
0.16 seconds


In [61]:
%prun ks_np_dumb(arr_base, arr_focus)

 

         66504 function calls in 0.135 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
     1000    0.025    0.000    0.025    0.000 {method 'searchsorted' of 'numpy.ndarray' objects}
     1000    0.023    0.000    0.023    0.000 {method 'sort' of 'numpy.ndarray' objects}
      500    0.018    0.000    0.056    0.000 _distn_infrastructure.py:1862(sf)
      500    0.013    0.000    0.133    0.000 stats.py:5385(ks_2samp)
     1000    0.007    0.000    0.009    0.000 numerictypes.py:578(_can_coerce_all)
     1500    0.005    0.000    0.005    0.000 {method 'reduce' of 'numpy.ufunc' objects}
     1000    0.003    0.000    0.003    0.000 {method 'copy' of 'numpy.ndarray' objects}
     4500    0.003    0.000    0.003    0.000 {built-in method numpy.array}
     1500    0.003    0.000    0.011    0.000 fromnumeric.py:69(_wrapreduction)
     1500    0.002    0.000    0.002    0.000 {built-in method numpy.core._multiarray_umath._inse