# chap02 作业

使用 cython 优化 target_encoding 代码

## 优化步骤
1. python 代码逻辑简化
2. 使用 Cython 声明精确的静态类型，并把重复计算的表达式提取为变量，进一步提速

In [12]:
import numpy as np
import pandas as pd

# Synthetic sample: 5000 rows, each with a binary target `y` and an integer
# feature `x` taking one of 10 levels. Both are drawn as column vectors.
y = np.random.randint(2, size=(5000, 1))
x = np.random.randint(10, size=(5000, 1))

# Put the two column vectors side by side (target first) and label them.
data = pd.DataFrame(np.hstack((y, x)), columns=['y', 'x'])

# Preview the first rows.
data.head()

Unnamed: 0,y,x
0,1,7
1,0,3
2,1,8
3,1,2
4,1,4


## 比较两个版本的 Python 代码


### 版本一

In [56]:
def target_mean_v1(data, y_name, x_name):
  """Leave-one-out target mean encoding (naive baseline).

  For every row, the rows other than that one are grouped by ``x_name`` and
  the mean of ``y_name`` for the row's own category is stored. This version
  deliberately recomputes the groupby once per row and serves as the slow
  reference implementation.

  Args:
    data: DataFrame holding both columns. Rows are addressed by the labels
      0..n-1, so a default RangeIndex is required.
    y_name: name of the target column.
    x_name: name of the categorical column to encode.

  Returns:
    A float64 array with one encoded value per row. A row whose category
    has no other member gets NaN.
  """
  result = np.zeros(data.shape[0])
  for i in range(data.shape[0]):
    # Per-category mean of the target, computed without row i.
    group_means = data[data.index != i].groupby(x_name)[y_name].mean()
    # Look the row's category up by label rather than by position; if row i
    # was the only member of its category there is no group left -> NaN.
    result[i] = group_means.get(data.loc[i, x_name], np.nan)
  return result

In [57]:
# Run the baseline once on the sample data and keep its output in `v1_result`;
# the bare name on the last line displays the array in the notebook.
v1_result = target_mean_v1(data, 'y', 'x')
v1_result

array([0.51345756, 0.50097847, 0.46969697, ..., 0.51219512, 0.50877193,
       0.48502994])

In [None]:
%%timeit

target_mean_v1(data, 'y', 'x')

1 loop, best of 3: 23.9 s per loop


### 版本二

In [None]:
def target_mean_v2(data, y_name, x_name):
  """Leave-one-out target mean encoding using per-category running totals.

  A first pass accumulates, per category of ``x_name``, the sum of
  ``y_name`` and the number of rows. A second pass subtracts each row's own
  target from its category total and divides by the remaining row count.

  Args:
    data: DataFrame holding both columns. Rows are addressed by the labels
      0..n-1, so a default RangeIndex is required.
    y_name: name of the target column.
    x_name: name of the categorical column to encode.

  Returns:
    A float64 array with one encoded value per row. A row whose category
    has no other member gets NaN.
  """
  length = data.shape[0]
  result = np.zeros(length)
  value_dict = dict()
  count_dict = dict()
  for i in range(length):
    x = data.loc[i, x_name]
    value_dict[x] = value_dict.get(x, 0) + data.loc[i, y_name]
    count_dict[x] = count_dict.get(x, 0) + 1

  for i in range(length):
    x = data.loc[i, x_name]
    others = count_dict[x] - 1
    if others:
      result[i] = (value_dict[x] - data.loc[i, y_name]) / others
    else:
      # The row is alone in its category: no other targets to average, so
      # return NaN explicitly instead of evaluating 0 / 0.
      result[i] = np.nan

  return result

In [None]:
v2_result = target_mean_v2(data, 'y', 'x')

In [None]:
%%timeit
target_mean_v2(data, 'y', 'x')

10 loops, best of 3: 159 ms per loop


In [None]:
# Euclidean norm of the element-wise difference between the v2 and v1 results;
# a value of 0.0 means the two arrays are identical.
np.linalg.norm(v2_result - v1_result)

0.0

### 版本三
使用cython


In [None]:
%load_ext Cython

In [None]:
%%cython -a --cplus

import numpy as np
cimport numpy as np
import pandas as pd

cpdef target_mean_v3(data, y_name, x_name):
  """Leave-one-out target mean encoding, compiled with Cython.

  Same two-pass algorithm as the pure-Python version: accumulate the sum of
  ``y_name`` and the row count per category of ``x_name``, then for each row
  remove its own target from the category total and divide by the remaining
  row count. Values are still read through ``data.loc``.

  Args:
    data: DataFrame holding both columns. Rows are addressed by the labels
      0..n-1, so a default RangeIndex is required.
    y_name: name of the target column.
    x_name: name of the categorical column to encode.

  Returns:
    A float64 array with one encoded value per row. A row whose category
    has no other member gets NaN.
  """
  cdef long length = data.shape[0]
  cdef Py_ssize_t i
  # Local variables are declared with `cdef`; `cpdef` is only meaningful for
  # functions and methods.
  cdef np.ndarray[np.float64_t, ndim=1] result = np.zeros(length, dtype=np.float64)
  cdef dict value_dict = {}
  cdef dict count_dict = {}
  for i in range(length):
    x = data.loc[i, x_name]
    y = data.loc[i, y_name]
    if x in value_dict:
      value_dict[x] += y
      count_dict[x] += 1
    else:
      value_dict[x] = y
      count_dict[x] = 1

  for i in range(length):
    x = data.loc[i, x_name]
    others = count_dict[x] - 1
    if others:
      result[i] = (value_dict[x] - data.loc[i, y_name]) / others
    else:
      # Alone in its category: nothing to average, avoid 0 / 0.
      result[i] = np.nan

  return result



In [None]:
v3_result = target_mean_v3(data, 'y', 'x')

In [None]:
%%timeit
target_mean_v3(data, 'y', 'x')

10 loops, best of 3: 155 ms per loop


In [None]:
# Euclidean norm of the element-wise difference between the v3 and v1 results;
# a value of 0.0 means the two arrays are identical.
np.linalg.norm(v3_result - v1_result)

0.0

### 版本四
将 pandas 的 loc 查找替换为 numpy 索引


In [None]:
%%cython -a --cplus

import numpy as np
cimport numpy as np
import pandas as pd

cpdef target_mean_v4(data, y_name, x_name):
  """Leave-one-out target mean encoding reading from the NumPy matrix.

  Same two-pass algorithm as the earlier versions, but the cell values are
  read from ``data.values`` by position instead of going through
  ``data.loc`` for every access.

  Args:
    data: DataFrame holding both columns.
    y_name: name of the target column.
    x_name: name of the categorical column to encode.

  Returns:
    A float64 array with one encoded value per row. A row whose category
    has no other member gets NaN.
  """
  cdef long length = data.shape[0]
  cdef Py_ssize_t i
  # Local variables are declared with `cdef`; `cpdef` is only meaningful for
  # functions and methods.
  cdef np.ndarray[np.float64_t, ndim=1] result = np.zeros(length, dtype=np.float64)
  cdef dict value_dict = {}
  cdef dict count_dict = {}

  # Resolve the column positions once, outside the loops.
  cdef long x_index = data.columns.get_loc(x_name)
  cdef long y_index = data.columns.get_loc(y_name)
  cdef np.ndarray matrix = data.values

  for i in range(length):
    # A single 2-D index avoids building a temporary row object per access.
    x = matrix[i, x_index]
    y = matrix[i, y_index]
    if x in value_dict:
      value_dict[x] += y
      count_dict[x] += 1
    else:
      value_dict[x] = y
      count_dict[x] = 1

  for i in range(length):
    x = matrix[i, x_index]
    others = count_dict[x] - 1
    if others:
      result[i] = (value_dict[x] - matrix[i, y_index]) / others
    else:
      # Alone in its category: nothing to average, avoid 0 / 0.
      result[i] = np.nan

  return result

In [None]:
v4_result = target_mean_v4(data, 'y', 'x')

In [None]:
%%timeit
target_mean_v4(data, 'y', 'x')

100 loops, best of 3: 9.26 ms per loop


In [None]:
# Euclidean norm of the element-wise difference between the v4 and v1 results;
# a value of 0.0 means the two arrays are identical.
np.linalg.norm(v4_result - v1_result)

0.0

### 版本五
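并行（使用 `cython.parallel.prange`）

注意：`prange` 只有在使用 OpenMP 编译并链接时（例如 `%%cython --compile-args=-fopenmp --link-args=-fopenmp`）才会真正以多线程执行；下面的单元没有传入这些参数，因此循环很可能仍是串行运行，此版本的提速主要来自类型化内存视图和 C++ `map`。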

In [85]:
%%cython -a
# distutils: language = c++

import numpy as np
import pandas as pd
from cython.parallel import prange
from libcpp.map cimport map
cimport cython
cimport numpy as cnp

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef target_mean_cython_parallel(long[:] x, long[:] y,const int shape, double[:] result):
  """Fill ``result`` with the leave-one-out target mean of ``y`` per ``x``.

  Args:
    x: category code of each row.
    y: target value of each row.
    shape: number of leading rows to process; must not exceed the length of
      ``x``, ``y`` or ``result``.
    result: output buffer, written in place. ``result[i]`` receives the mean
      of ``y`` over the other rows sharing ``x[i]``, or NaN when row ``i`` is
      the only row of its category.

  Raises:
    ValueError: if ``shape`` is larger than any of the three buffers.
  """
  # Keys and totals are stored as `long` to match the input element type.
  cdef map[long, long] value_dict
  cdef map[long, long] count_dict
  cdef int i
  cdef long others
  cdef double missing = float('nan')

  # Bounds checking is disabled for this function, so validate once here.
  if shape > x.shape[0] or shape > y.shape[0] or shape > result.shape[0]:
    raise ValueError("shape exceeds the length of x, y or result")

  # Accumulate serially: inserting into a std::map from several threads at
  # once is a data race. operator[] value-initialises a missing entry to 0,
  # so no separate "first occurrence" branch is needed.
  for i in range(shape):
    value_dict[x[i]] += y[i]
    count_dict[x[i]] += 1

  # The maps are only read from here on; `at` never inserts, so concurrent
  # lookups are safe. Every key looked up was inserted by the loop above.
  for i in prange(shape, nogil = True):
    others = count_dict.at(x[i]) - 1
    if others > 0:
      # Cast so the quotient is a true division regardless of language level.
      result[i] = <double>(value_dict.at(x[i]) - y[i]) / others
    else:
      result[i] = missing


def target_mean_v5(data, y_name, x_name):
  """Leave-one-out target mean encoding via the typed Cython kernel.

  Pulls the two columns out of ``data`` as NumPy arrays, allocates a zeroed
  output array with one slot per row, and lets
  ``target_mean_cython_parallel`` fill it in place.

  Args:
    data: DataFrame holding both columns.
    y_name: name of the target column.
    x_name: name of the categorical column to encode.

  Returns:
    The output array after the kernel has written into it.
  """
  n_rows = data.shape[0]
  encoded = np.zeros(n_rows)
  # The kernel returns nothing; it writes its answer into `encoded`.
  target_mean_cython_parallel(
    data[x_name].values,
    data[y_name].values,
    n_rows,
    encoded,
  )
  return encoded

In [86]:
v5_result = target_mean_v5(data, 'y', 'x')

In [87]:
%%timeit
target_mean_v5(data, 'y', 'x')

1000 loops, best of 3: 397 µs per loop


In [88]:
# Euclidean norm of the element-wise difference between the v5 and v1 results;
# a value of 0.0 means the two arrays are identical.
np.linalg.norm(v5_result - v1_result)

0.0

#### 版本1：23.9 s per loop
#### 版本2：159 ms per loop
#### 版本3：155 ms per loop（使用 Cython）
#### 版本4：9.26 ms per loop（将 pandas 的 loc 改为 numpy 索引）
#### 版本5：397 µs per loop（使用并行）