In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict

In [52]:
y = np.random.randint(2, size=(500, 1))
x = np.random.randint(10, size=(500, 1))
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])

In [3]:
def target_mean_v1(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    for i in range(data.shape[0]):
        groupby_result = data[data.index != i].groupby([x_name], as_index=False).agg(['mean', 'count'])
        result[i] = groupby_result.loc[groupby_result.index == data.loc[i, x_name], (y_name, 'mean')]
    return result

def test_eq(a, b):
    assert np.allclose(a, b)

In [4]:
def target_mean_v2(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    value_dict = dict()
    count_dict = dict()
    for i in range(data.shape[0]):
        if data.loc[i, x_name] not in value_dict.keys():
            value_dict[data.loc[i, x_name]] = data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] = 1
        else:
            value_dict[data.loc[i, x_name]] += data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] += 1
    for i in range(data.shape[0]):
        result[i] = (value_dict[data.loc[i, x_name]] - data.loc[i, y_name]) / (count_dict[data.loc[i, x_name]] - 1)
    return result

In [5]:
test_eq(target_mean_v1(data, 'y', 'x'), target_mean_v2(data, 'y', 'x'))

In [6]:
%%timeit
res_v1 = target_mean_v1(data, 'y', 'x')

6.39 s ± 440 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
%%timeit
res_v2 = target_mean_v2(data, 'y', 'x')

67.2 ms ± 4.43 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [33]:
print(f"V2: Speed up {6.39 / (67.2 / 1e3):.1f} times")

V2: Speed up 95.1 times


In [29]:
def target_mean_v3(data, y_name, x_name):
    n = data.shape[0]
    X = data[x_name].values
    Y = data[y_name].values
    value_dict = defaultdict(lambda:0)
    count_dict = defaultdict(lambda:0)
    for x, y in zip(X, Y):
        value_dict[x] += y
        count_dict[x] += 1
    result = [(value_dict[x] - y) / (count_dict[x] - 1) for x, y in zip(X, Y)]
    return result

In [31]:
test_eq(target_mean_v1(data, 'y', 'x'), target_mean_v3(data, 'y', 'x'))

In [30]:
%%timeit
res_v3 = target_mean_v3(data, 'y', 'x')

938 µs ± 78.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [32]:
print(f"V3: Speed up {6.39 / (938 / 1e6):.1f} times")

V3: Speed up 6812.4 times


In [72]:
def target_mean_v4(data, y_name, x_name):
    n = data.shape[0]
    X = data[x_name].values
    Y = data[y_name].values
    result = np.empty_like(Y, dtype='float')
    
    for i in range(n):
        which = (X == x[i]) # has for-loop?
        which[i] = False
        idxs = np.where(which)[0] # has for-loop?
        result[i] = Y[idxs].mean()  # has for-loop
    return result

In [73]:
test_eq(target_mean_v1(data, 'y', 'x'), target_mean_v4(data, 'y', 'x'))

In [74]:
%%timeit
res_v4 = target_mean_v4(data, 'y', 'x')

9.95 ms ± 382 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [69]:
def target_mean_v5(data, y_name, x_name):
    n = data.shape[0]
    X = data[x_name].values
    Y = data[y_name].values
    result = np.empty_like(Y, dtype='float')
#     value_dict = defaultdict(lambda:0)
#     count_dict = defaultdict(lambda:0)
    which = []
    for i in range(n):
        _which = (X == x[i])
        _which[i] = False
        which.append(_which)
        idxs = np.where(which)[0] # inner for-loop?
        result[i] = Y[idxs].mean()  # inner for-loop
    
    for i, o in enumerate(which):
        result[i] = np.mean([Y[j] for j, e in enumerate(o) if e])
    
    return result

In [70]:
test_eq(target_mean_v1(data, 'y', 'x'), target_mean_v5(data, 'y', 'x'))

In [71]:
%%timeit
res_v5 = target_mean_v5(data, 'y', 'x')

543 ms ± 34.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [75]:
def target_mean_v6(data, y_name, x_name):
    n = data.shape[0]
    X = data[x_name].values
    Y = data[y_name].values
    value_dict = defaultdict(lambda:0)
    count_dict = defaultdict(lambda:0)
    for i in range(n):
        value_dict[X[i]] += Y[i]
        count_dict[X[i]] += 1
    result = [(value_dict[X[i]] - Y[i]) / (count_dict[X[i]] - 1) for i in range(n)]
    return result

In [76]:
test_eq(target_mean_v1(data, 'y', 'x'), target_mean_v6(data, 'y', 'x'))

In [77]:
%%timeit
res_v6 = target_mean_v6(data, 'y', 'x')

1.23 ms ± 36.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
