In [None]:
import numpy as np
import pandas as pd

In [None]:
# Synthetic benchmark data: a binary target `y` and a categorical feature `x`
# with ten levels, one row per sample.
# A private, seeded generator keeps the data (and therefore every result and
# timing below) identical across kernel restarts without touching NumPy's
# global random state.
SEED = 42
_rng = np.random.RandomState(SEED)
y = _rng.randint(2, size=(5000, 1))
x = _rng.randint(10, size=(5000, 1))
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])

In [None]:

def target_mean_v1(data, y_name, x_name):
  """Leave-one-out target mean, computed with one groupby per row.

  For every row, the mean of ``y_name`` is taken over all *other* rows that
  share the same ``x_name`` value. This version re-runs a full groupby for
  each row, so its cost grows quadratically with the number of rows.

  Args:
    data: DataFrame holding both columns. Rows are addressed by position,
      so any index (default, shuffled, non-integer) is accepted.
    y_name: Name of the target column.
    x_name: Name of the grouping (categorical) column.

  Returns:
    1-D float ndarray aligned positionally with ``data``. An entry is NaN
    when the row is the only member of its group.
  """
  n_rows = data.shape[0]
  result = np.zeros(n_rows)
  positions = np.arange(n_rows)
  keys = data[x_name].to_numpy()
  for i in range(n_rows):
    # Drop row i by position, then average the target within each group.
    others = data.iloc[positions != i]
    # Keep the group keys as the index (no as_index=False) so the lookup
    # below is by key value rather than by row position; whether
    # as_index=False is honoured for list aggregations depends on the
    # pandas version.
    group_means = others.groupby(x_name)[y_name].mean()
    key = keys[i]
    # The group vanishes when row i was its only member.
    result[i] = group_means.loc[key] if key in group_means.index else np.nan
  return result

In [None]:

def target_mean_v2(data, y_name, x_name):
  """Leave-one-out target mean using per-group running totals.

  A first pass accumulates the sum of ``y_name`` and the row count for each
  distinct ``x_name`` value; a second pass removes each row's own
  contribution: ``(group_sum - y_i) / (group_count - 1)``.

  Args:
    data: DataFrame holding both columns. Rows are addressed by position,
      so the result does not depend on the index labels.
    y_name: Name of the target column.
    x_name: Name of the grouping (categorical) column.

  Returns:
    1-D float ndarray aligned positionally with ``data``. An entry is NaN
    when the row is the only member of its group.
  """
  # Extract both columns once: avoids repeated scalar .loc lookups and any
  # assumption that the index is 0..n-1.
  x_values = data[x_name].to_numpy()
  y_values = data[y_name].to_numpy()
  result = np.zeros(data.shape[0])
  value_dict = dict()
  count_dict = dict()
  for key, target in zip(x_values, y_values):
    value_dict[key] = value_dict.get(key, 0) + target
    count_dict[key] = count_dict.get(key, 0) + 1
  for i, (key, target) in enumerate(zip(x_values, y_values)):
    others = count_dict[key] - 1
    if others:
      result[i] = (value_dict[key] - target) / others
    else:
      # No other row shares this key, so the leave-one-out mean is undefined.
      result[i] = np.nan
  return result

In [None]:
%%timeit
# Time the per-row groupby implementation on the full frame.
# Names assigned inside a %%timeit cell are not kept in the notebook
# namespace, so result_1 has to be computed again wherever it is needed.
result_1 = target_mean_v1(data, 'y', 'x')

In [None]:

%%timeit
# Time the dictionary-based implementation on the same frame.
# Names assigned inside a %%timeit cell are not kept in the notebook
# namespace, so result_2 has to be computed again wherever it is needed.
result_2 = target_mean_v2(data, 'y', 'x')

In [None]:

# Consistency check: run both pure-Python implementations on the same frame
# and print the Euclidean norm of the difference between their outputs
# (0.0 means the two results match exactly).
result_1 = target_mean_v1(data, 'y', 'x')
result_2 = target_mean_v2(data, 'y', 'x')
diff = np.linalg.norm(result_1 - result_2)
print(diff)

In [None]:

# Load the Cython IPython extension, which provides the %%cython cell magic.
%load_ext Cython

In [None]:

%%cython -a

import numpy as np
# `cimport` (not `import`) is required for the compile-time ndarray buffer
# types used in the cdef block below.
cimport numpy as cnp

# Initialise NumPy's C API for this extension module.
cnp.import_array()

def target_mean_v3(data, y_name, x_name):
  """Leave-one-out target mean, compiled with Cython.

  Same two-pass scheme as the dictionary version: accumulate the per-key sum
  and count of the target, then remove each row's own contribution with
  ``(group_sum - y_i) / (group_count - 1)``.

  Args:
    data: DataFrame holding both columns; rows are read by position.
    y_name: Name of the target column (integer-valued).
    x_name: Name of the grouping column (integer-valued).

  Returns:
    1-D float64 ndarray aligned positionally with ``data``. An entry is NaN
    when the row is the only member of its group.

  Raises:
    TypeError: if either column cannot be converted to int64 under NumPy's
      'same_kind' casting rule (e.g. a float column).
  """
  cdef:
    Py_ssize_t i
    Py_ssize_t length = data.shape[0]
    Py_ssize_t others
    dict value_dict = {}
    dict count_dict = {}
    cnp.ndarray[cnp.float64_t] result = np.zeros(length, dtype=np.float64)
    # Fixed-width int64 buffers: the platform-dependent C long behind
    # cnp.int_t does not always match the dtype pandas hands back.
    cnp.ndarray[cnp.int64_t] y = data[y_name].to_numpy().astype(np.int64, casting='same_kind')
    cnp.ndarray[cnp.int64_t] x = data[x_name].to_numpy().astype(np.int64, casting='same_kind')
  for i in range(length):
    key = x[i]
    value_dict[key] = value_dict.get(key, 0) + y[i]
    count_dict[key] = count_dict.get(key, 0) + 1
  for i in range(length):
    key = x[i]
    others = count_dict[key] - 1
    if others == 0:
      # No other row shares this key, so the leave-one-out mean is undefined.
      result[i] = np.nan
    else:
      # Divide by a float so this is true division at every language level.
      result[i] = (value_dict[key] - y[i]) / float(others)
  return result

In [None]:
# Validate the Cython implementation: run target_mean_v3 (not target_mean_v1
# again) and print the Euclidean norm of its difference from the
# dictionary-based result; 0.0 means the two outputs match exactly.
result_4 = target_mean_v3(data, 'y', 'x')
result_2 = target_mean_v2(data, 'y', 'x')
diff = np.linalg.norm(result_4 - result_2)
print(diff)