In [1]:
%load_ext line_profiler
%load_ext cython

In [2]:
import numpy as np
import pandas as pd
from collections import defaultdict

In [3]:
y = np.random.randint(2, size=(500, 1))
x = np.random.randint(10, size=(500, 1))
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])

In [4]:
def target_mean_v1(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    for i in range(data.shape[0]):
        groupby_result = data[data.index != i].groupby([x_name], as_index=False).agg(['mean', 'count'])
        result[i] = groupby_result.loc[groupby_result.index == data.loc[i, x_name], (y_name, 'mean')]
    return result

def test_eq(a, b):
    assert np.allclose(a, b)

In [5]:
def target_mean_v2(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    value_dict = dict()
    count_dict = dict()
    for i in range(data.shape[0]):
        if data.loc[i, x_name] not in value_dict.keys():
            value_dict[data.loc[i, x_name]] = data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] = 1
        else:
            value_dict[data.loc[i, x_name]] += data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] += 1
    for i in range(data.shape[0]):
        result[i] = (value_dict[data.loc[i, x_name]] - data.loc[i, y_name]) / (count_dict[data.loc[i, x_name]] - 1)
    return result

In [5]:
test_eq(target_mean_v1(data, 'y', 'x'), target_mean_v2(data, 'y', 'x'))

In [6]:
%%timeit
res_v1 = target_mean_v1(data, 'y', 'x')

1.48 s ± 11.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
%%timeit
res_v2 = target_mean_v2(data, 'y', 'x')

27.1 ms ± 1.61 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [8]:
print(f"V2: Speed up {1.48 / (27.1 / 1e3):.1f} times")

V2: Speed up 54.6 times


In [9]:
def target_mean_v3(data, y_name, x_name):
    n = data.shape[0]
    X = data[x_name].values
    Y = data[y_name].values
    value_dict = defaultdict(lambda:0)
    count_dict = defaultdict(lambda:0)
    for x, y in zip(X, Y):
        value_dict[x] += y
        count_dict[x] += 1
    result = [(value_dict[x] - y) / (count_dict[x] - 1) for x, y in zip(X, Y)]
    return result

In [10]:
test_eq(target_mean_v1(data, 'y', 'x'), target_mean_v3(data, 'y', 'x'))

In [11]:
%%timeit
res_v3 = target_mean_v3(data, 'y', 'x')

698 µs ± 5.93 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [12]:
print(f"V3: Speed up {1.48 / (698 / 1e6):.1f} times")

V3: Speed up 2120.3 times


In [24]:
%%cython -a

import numpy as np
cimport numpy as c_np
cimport cython

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
cpdef void _target_mean_v3_cy(double[:] result, int[:] X, double[:] Y, int[:] cnt, double[:] val, int n):
    for i from 0 <= i < n:
#         int x = X[i]
        val[X[i]] += Y[i]
        cnt[X[i]] += 1

    for i from 0 <= i < n:
        result[i] = (val[X[i]] - Y[i]) / (cnt[X[i]] - 1)

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
def target_mean_v3_cy(data, x_name, y_name):
    cdef int n = data.shape[0]
    cdef c_np.ndarray[int, ndim=1] X = data[x_name].values
    cdef c_np.ndarray[double, ndim=1] Y = data[y_name].values
    cdef c_np.ndarray[short, ndim=1] cnt = np.zeros(10, dtype=np.int16)
    cdef c_np.ndarray[double, ndim=1] val = np.zeros(10)
    out = np.empty(n)
    cdef c_np.ndarray[double, ndim=1] result = out
    _target_mean_v3_cy(result, X, Y, cnt, val, n)
    return out

In [25]:
test_eq(target_mean_v1(data, 'y', 'x'), target_mean_v3_cy(data, 'y', 'x'))

ValueError: Buffer dtype mismatch, expected 'int' but got 'long'

In [10]:
%%cython -a

cimport numpy as np

cpdef void _target_mean_v3_cy(double[:] result, int[:] X, double[:] Y, int n, int[:] cnt, int[:] val):    
    for i from 0 <= i < n:
#         cdef int x = X[i]
        val[X[i]] += Y[i]
        cnt[X[i]] += 1

    for i from 0 <= i < n:
        result[i] = (val[X[i]] - Y[i]) / (cnt[X[i]] - 1)
        
def target_mean_v3_cy(data, y_name, x_name):
    cdef long int n = data.shape[0]
    cdef 
    out = np.empty(n)
    n_class = 10
    
    _target_mean_v3_cy(out, data[x_name].values, data[y_name].values, n)
    return out


Error compiling Cython file:
------------------------------------------------------------
...

import numpy as np

cpdef void _target_mean_v3_cy(double[:] result, int[:] X, double[:] Y, int n, int n_class):
    cdef int[:] cnt = [0] * n_class
                     ^
------------------------------------------------------------

/home/alex/.cache/ipython/cython/_cython_magic_d0a9aaa36d518ba6653b25cd7b94f77d.pyx:5:22: Cannot coerce multiplied list to 'int[:]'

Error compiling Cython file:
------------------------------------------------------------
...

import numpy as np

cpdef void _target_mean_v3_cy(double[:] result, int[:] X, double[:] Y, int n, int n_class):
    cdef int[:] cnt = [0] * n_class
    cdef int[:] val = [0] * n_class
                     ^
------------------------------------------------------------

/home/alex/.cache/ipython/cython/_cython_magic_d0a9aaa36d518ba6653b25cd7b94f77d.pyx:6:22: Cannot coerce multiplied list to 'int[:]'

Error compiling Cython file:
------------

In [None]:
# cpdef void _target_mean_v3_cy(double[:] result, int[:] X, str[:] x_name):
#     cdef long int n = data.shape[0]
#     cdef int[:] X = data[x_name].values
#     cdef double[:] Y = data[y_name].values
#     cdef list cnt = [0] * 10
#     cdef list val = [0] * 10
    
#     for i from 0 <= i < n:
# #         int x = X[i]
#         val[X[i]] += Y[i]
#         cnt[X[i]] += 1

#     for i from 0 <= i < n:
#         result[i] = (val[X[i]] - Y[i]) / (cnt[X[i]] - 1)

In [16]:
import ray
ray.init()

In [23]:
# ray.init()

@ray.remote
def mean_enc(val_dict, cnt_dict, x, y):
    return (val_dict[x] - y) / (cnt_dict[x] - 1)

def target_mean_v4(data, y_name, x_name):
    n = data.shape[0]
    X = data[x_name].values
    Y = data[y_name].values
    value_dict = defaultdict(lambda:0)
    count_dict = defaultdict(lambda:0)
    for x, y in zip(X, Y):
        value_dict[x] += y
        count_dict[x] += 1
    result = [mean_enc.remote(value_dict, count_dict, x, y) for x, y in zip(X, Y)]
    return ray.get(result)

In [24]:
test_eq(target_mean_v1(data, 'y', 'x'), target_mean_v4(data, 'y', 'x'))

In [25]:
%%timeit
res_v4 = target_mean_v4(data, 'y', 'x')

338 ms ± 13.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [13]:
def target_mean_v4(data, y_name, x_name):
    n = data.shape[0]
    X = data[x_name].values
    Y = data[y_name].values
    result = np.empty_like(Y, dtype='float')
    
    for i in range(n):
        which = (X == x[i]) # has for-loop?
        which[i] = False
        idxs = np.where(which)[0] # has for-loop?
        result[i] = Y[idxs].mean()  # has for-loop
    return result

In [14]:
test_eq(target_mean_v1(data, 'y', 'x'), target_mean_v4(data, 'y', 'x'))

In [15]:
%%timeit
res_v4 = target_mean_v4(data, 'y', 'x')

5.58 ms ± 14.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [69]:
def target_mean_v5(data, y_name, x_name):
    n = data.shape[0]
    X = data[x_name].values
    Y = data[y_name].values
    result = np.empty_like(Y, dtype='float')
#     value_dict = defaultdict(lambda:0)
#     count_dict = defaultdict(lambda:0)
    which = []
    for i in range(n):
        _which = (X == x[i])
        _which[i] = False
        which.append(_which)
        idxs = np.where(which)[0] # inner for-loop?
        result[i] = Y[idxs].mean()  # inner for-loop
    
    for i, o in enumerate(which):
        result[i] = np.mean([Y[j] for j, e in enumerate(o) if e])
    
    return result

In [70]:
test_eq(target_mean_v1(data, 'y', 'x'), target_mean_v5(data, 'y', 'x'))

In [71]:
%%timeit
res_v5 = target_mean_v5(data, 'y', 'x')

543 ms ± 34.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [75]:
def target_mean_v6(data, y_name, x_name):
    n = data.shape[0]
    X = data[x_name].values
    Y = data[y_name].values
    value_dict = defaultdict(lambda:0)
    count_dict = defaultdict(lambda:0)
    for i in range(n):
        value_dict[X[i]] += Y[i]
        count_dict[X[i]] += 1
    result = [(value_dict[X[i]] - Y[i]) / (count_dict[X[i]] - 1) for i in range(n)]
    return result

In [76]:
test_eq(target_mean_v1(data, 'y', 'x'), target_mean_v6(data, 'y', 'x'))

In [77]:
%%timeit
res_v6 = target_mean_v6(data, 'y', 'x')

1.23 ms ± 36.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
