In [1]:
%load_ext Cython

In [2]:
import time

import cython
import numpy as np
import pandas as pd
from scipy.stats import ks_2samp

# benchmark size: number of feature columns and number of rows in the test frame
n_cols, n_rows = 200, 1000

In [3]:
# create data
# Use a dedicated, seeded generator so that Restart & Run All rebuilds exactly
# the same frame and window assignment (the global np.random state is unseeded).
SEED = 42
rng = np.random.default_rng(SEED)

# n_rows x n_cols uniform [0, 1) values, one column per feature
df = pd.DataFrame(
    rng.random((n_rows, n_cols)),
    columns=[f'col{col}' for col in range(n_cols)]
)

# randomly assign each row to the 'base' or 'focus' window (roughly 50/50)
df['window'] = np.where(rng.random(n_rows) >= 0.5, 'base', 'focus')

# print some info
print(df.shape)
print(df.head())
print(df['window'].value_counts())

(1000, 201)
       col0      col1      col2      col3      col4      col5      col6  \
0  0.948151  0.614034  0.108205  0.277074  0.897316  0.022170  0.365907   
1  0.569964  0.679303  0.116523  0.996159  0.687061  0.542842  0.169815   
2  0.334469  0.250663  0.986200  0.926311  0.628055  0.179119  0.592241   
3  0.218385  0.870784  0.294704  0.472960  0.207863  0.221755  0.730130   
4  0.912946  0.888567  0.296331  0.695963  0.257647  0.398912  0.643933   

       col7      col8      col9  ...    col191    col192    col193    col194  \
0  0.295494  0.128025  0.431435  ...  0.045427  0.767367  0.483426  0.900184   
1  0.126485  0.149533  0.521056  ...  0.850938  0.129488  0.346979  0.832179   
2  0.032384  0.560780  0.475293  ...  0.387899  0.815845  0.644342  0.495470   
3  0.690949  0.427892  0.068691  ...  0.147644  0.793434  0.429706  0.563573   
4  0.915435  0.494238  0.068424  ...  0.193682  0.212297  0.225945  0.662959   

     col195    col196    col197    col198    col199  win

In [4]:
# build one 2-D numpy array per window from the numeric columns of df
# (rows = observations in that window, columns = features)
arr_base, arr_focus = (
    df[df['window'] == label]._get_numeric_data().values
    for label in ('base', 'focus')
)
print(type(arr_base), arr_base.shape)
print(type(arr_focus), arr_focus.shape)

<class 'numpy.ndarray'> (491, 200)
<class 'numpy.ndarray'> (509, 200)


In [5]:
def ks_df_dumb(df):
    """Run a two-sample KS test per numeric column, straight off the DataFrame.

    This is the naive baseline: for every numeric column the frame is
    re-filtered into its 'base' and 'focus' rows before calling ``ks_2samp``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric feature columns and a ``'window'`` column whose
        values are ``'base'`` or ``'focus'``.

    Returns
    -------
    list of tuple
        One ``(ks_stat, p_value)`` pair per numeric column, in column order.
    """
    def _test_column(column):
        # Filtering inside the loop is intentional; it is what makes this "dumb".
        base_values = df[df['window'] == 'base'][column].values
        focus_values = df[df['window'] == 'focus'][column].values
        stat, pval = ks_2samp(base_values, focus_values)
        return (stat, pval)

    return [_test_column(column) for column in df._get_numeric_data()]

In [6]:
def ks_df_vec(df):
    """Run a two-sample KS test per numeric column via ``np.vectorize``.

    The 'base' and 'focus' rows are each turned into a transposed array
    (one row per numeric column) and ``ks_2samp`` is mapped over matching rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric feature columns and a ``'window'`` column whose
        values are ``'base'`` or ``'focus'``.

    Returns
    -------
    list of tuple
        One ``(ks_stat, p_value)`` pair per numeric column, in column order.
    """
    def _window_rows(label):
        # After the transpose each row holds one feature's samples.
        return df[df['window'] == label]._get_numeric_data().transpose().values

    # The signature maps a pair of 1-D samples to two scalar outputs.
    vectorized_ks = np.vectorize(ks_2samp, signature='(n),(m)->(),()')
    stats, p_values = vectorized_ks(_window_rows('base'), _window_rows('focus'))
    return list(zip(stats, p_values))

In [7]:
def ks_np_dumb(arr_a, arr_b):
    """Run a two-sample KS test per column using a plain Python loop.

    Parameters
    ----------
    arr_a, arr_b : numpy.ndarray
        2-D arrays indexed as ``[row, column]``. Column ``n`` of ``arr_a`` is
        tested against column ``n`` of ``arr_b``; the number of columns is
        taken from ``arr_a``.

    Returns
    -------
    list of tuple
        One ``(ks_stat, p_value)`` pair per column of ``arr_a``.
    """
    def _test_column(idx):
        stat, pval = ks_2samp(arr_a[:, idx], arr_b[:, idx])
        return (stat, pval)

    return list(map(_test_column, range(arr_a.shape[1])))

In [8]:
def ks_np_vec(arr_a, arr_b):
    """Run a two-sample KS test per column via ``np.vectorize``.

    Parameters
    ----------
    arr_a, arr_b : numpy.ndarray
        2-D arrays indexed as ``[row, column]``; they are transposed so that
        ``ks_2samp`` is applied to matching columns.

    Returns
    -------
    list of tuple
        One ``(ks_stat, p_value)`` pair per column.
    """
    # The signature maps a pair of 1-D samples to two scalar outputs.
    column_wise_ks = np.vectorize(ks_2samp, signature='(n),(m)->(),()')
    stats, p_values = column_wise_ks(arr_a.T, arr_b.T)
    return [(stat, p_val) for stat, p_val in zip(stats, p_values)]

In [9]:
%%cython

import numpy as np
cimport numpy as np
cimport cython
from scipy.stats import ks_2samp

# NumPy dtype (double precision float) used to allocate the result array in cy_ks_np.
DTYPE = np.double


cpdef cy_ks_np(double[:, :] arr_a, double[:, :] arr_b):
    """Run a two-sample KS test per column from a Cython-compiled loop.

    The loop and the result writes are typed, but each test is still a call
    into the Python-level ``ks_2samp``.

    Parameters
    ----------
    arr_a, arr_b : 2-D double memoryviews
        Indexed as ``[row, column]``. Column ``i`` of ``arr_a`` is tested
        against column ``i`` of ``arr_b``; the column count comes from ``arr_a``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_columns, 2)``; column 0 holds the KS statistic and
        column 1 the p-value.
    """
    cdef Py_ssize_t n_features = arr_a.shape[1]
    cdef Py_ssize_t col
    cdef double stat, pval

    # Allocate the output once and write into it through a typed view.
    out = np.zeros((n_features, 2), dtype=DTYPE)
    cdef double[:, :] out_view = out

    for col in range(n_features):
        stat, pval = ks_2samp(arr_a[:, col], arr_b[:, col])
        out_view[col, 0] = stat
        out_view[col, 1] = pval

    return out

In [None]:
%%timeit -n 5 -r 5
# Baseline: Python loop over columns that re-filters the DataFrame on every iteration.
results = ks_df_dumb(df)

In [None]:
%%timeit -n 5 -r 5
# DataFrame input, with ks_2samp mapped over columns through np.vectorize.
results = ks_df_vec(df)

In [None]:
%%timeit -n 5 -r 5
# Pre-extracted numpy arrays, plain Python loop over columns.
results = ks_np_dumb(arr_base, arr_focus)

In [None]:
%%timeit -n 5 -r 5
# Pre-extracted numpy arrays, with ks_2samp mapped over columns through np.vectorize.
results = ks_np_vec(arr_base, arr_focus)

In [None]:
%%timeit -n 5 -r 5
# Pre-extracted numpy arrays, Cython-compiled loop that calls ks_2samp per column.
results = cy_ks_np(arr_base, arr_focus)