把提供的 target encoding 代码改为 cython 代码并比较速度区别

In [3]:
import numpy as np
import pandas as pd

y = np.random.randint(2, size=(5000, 1))
x = np.random.randint(10, size=(5000, 1))
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])

data.head

<bound method NDFrame.head of       y  x
0     1  0
1     1  9
2     1  4
3     0  2
4     0  0
...  .. ..
4995  0  6
4996  1  3
4997  1  5
4998  0  2
4999  1  9

[5000 rows x 2 columns]>

In [4]:
def target_mean_v1(data, y_name, x_name):
    
    result = np.zeros(data.shape[0])
    for i in range(data.shape[0]):
        groupby_result = data[data.index != i].groupby([x_name], as_index=False).agg(['mean', 'count'])
        result[i] = groupby_result.loc[groupby_result.index == data.loc[i, x_name], (y_name, 'mean')]
    return result

target_mean_v1(data, 'y', 'x')

array([0.51020408, 0.50877193, 0.49005425, ..., 0.48625793, 0.45436893,
       0.50877193])

使用 Cython

In [5]:
%load_ext Cython


In [6]:
%%cython -a
import numpy as np 
cimport numpy as cnp

cpdef target_mean_v2(data, y_name, x_name):

    cdef int shape = data.shape[0]
    
    cdef cnp.ndarray[double] result = np.zeros(shape)
    value = {}
    count = {}
    
    cdef cnp.ndarray[long] x_values = data[x_name].values
    cdef cnp.ndarray[long] y_values = data[y_name].values

    for i in range(shape):
        index = x_values[i]
        count[index] = 1 if index not in value else count[index] + 1
        value[index] = y_values[i] if index not in value else value[index] + y_values[i]
    

    for i in range(shape):
        index = x_values[i]
        result[i] = (value[index] - y_values[i]) / (count[index] - 1)
    return result

In [None]:
print(np.linalg.norm(target_mean_v1(data, 'y', 'x')- target_mean_v2(data, 'y', 'x')))

0.0


In [None]:
%timeit -n 1 target_mean_v1(data, 'y', 'x')
%timeit -n 1 target_mean_v2(data, 'y', 'x')


1 loop, best of 3: 23.3 s per loop
1 loop, best of 3: 1.09 ms per loop


In [9]:
%timeit -n 5 target_mean_v1(data, 'y', 'x')




5 loops, best of 3: 28.4 s per loop


In [8]:
%timeit -n 100 target_mean_v2(data, 'y', 'x')


100 loops, best of 3: 1.07 ms per loop
