<a href="https://colab.research.google.com/github/aeonplutolucifer/ml-training-camp/blob/main/ML_camp_homework01_target_mean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [50]:
import numpy as np
import pandas as pd
import time

#Baseline for Python V1

In [51]:
def target_mean_v1(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    for i in range(data.shape[0]):
        groupby_result = data[data.index != i].groupby([x_name], as_index=False).agg(['mean', 'count'])
        result[i] = groupby_result.loc[groupby_result.index == data.loc[i, x_name], (y_name, 'mean')]
    return result

#Baseline for python V2

In [52]:
def target_mean_v2(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    value_dict = dict()
    count_dict = dict()
    for i in range(data.shape[0]):
        if data.loc[i, x_name] not in value_dict.keys():
            value_dict[data.loc[i, x_name]] = data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] = 1
        else:
            value_dict[data.loc[i, x_name]] += data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] += 1
    for i in range(data.shape[0]):
        result[i] = (value_dict[data.loc[i, x_name]] - data.loc[i, y_name]) / (count_dict[data.loc[i, x_name]] - 1)
    return result

# Baseline for Cython V3

In [53]:
%load_ext Cython

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


In [54]:
%%cython
import numpy as np
cimport numpy as np

cpdef target_mean_v3(data, y_name, x_name):
    cdef long nrow = data.shape[0]
    cdef np.ndarray[double] result = np.asfortranarray(np.zeros(nrow), dtype=np.float64)
    cdef np.ndarray[double] y = np.asfortranarray(data[y_name], dtype=np.float64)
    cdef np.ndarray[double] x = np.asfortranarray(data[x_name], dtype=np.float64)

    target_mean_v3_impl(result, y, x, nrow)
    return result

cdef void target_mean_v3_impl(double[:] result, double[:] y, double[:] x, const long nrow):
    cdef dict value_dict = dict()
    cdef dict count_dict = dict()

    cdef long i
    for i in range(nrow):
        if x[i] not in value_dict.keys():
            value_dict[x[i]] = y[i]
            count_dict[x[i]] = 1
        else:
            value_dict[x[i]] += y[i]
            count_dict[x[i]] += 1

    i=0
    for i in range(nrow):
        result[i] = (value_dict[x[i]] - y[i])/(count_dict[x[i]]-1)


#Homework

In [55]:
%%cython
import numpy as np
cimport numpy as np
from libcpp.pair cimport pair

cdef extern from "<boost/unordered_map.hpp>" namespace "boost":
    cdef cppclass unordered_map[K, T]: # K: key_type, T: mapped_type
        cppclass iterator:
            pair& operator*()
            bint operator==(iterator)
            bint operator!=(iterator)
        unordered_map()
        bint empty()
        size_t size()
        iterator begin()
        iterator end()
        pair emplace(K, T)
        iterator find(K)
        void clear()
        size_t count(K)
        T& operator[](K)


cpdef target_mean_v4(data, y_name, x_name):
    cdef long nrow = data.shape[0]
    cdef np.ndarray[double] result = np.asfortranarray(np.zeros(nrow), dtype=np.float64)
    cdef np.ndarray[double] y = np.asfortranarray(data[y_name], dtype=np.float64)
    cdef np.ndarray[int] x = np.asfortranarray(data[x_name], dtype=np.int)

    target_mean_v4_impl(result, y, x, nrow)
    return result

cdef void target_mean_v4_impl(double[:] result, double[:] y, int[:] x, const long nrow):
    cdef unordered_map[int, double] value_dict
    cdef unordered_map[int, double] count_dict

    cdef long i
    for i in xrange(nrow):
        if value_dict.find(x[i]) == value_dict.end():
            value_dict[x[i]] = y[i]
            count_dict[x[i]] = 1
        else:
            value_dict[x[i]] += y[i]
            count_dict[x[i]] += 1

    i=0
    for i in xrange(nrow):
        result[i] = (value_dict[x[i]] - y[i])/(count_dict[x[i]]-1)


# Main

In [56]:
y = np.random.randint(2, size=(5000, 1))
x = np.random.randint(10, size=(5000, 1))
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])

start = time.time()
result_1 = target_mean_v1(data, 'y', 'x')
end = time.time()

print("v1 used time: ",end - start)

start = time.time()
result_2 = target_mean_v2(data, 'y', 'x')
end = time.time()

print("v2 used time: ",end - start)

start = time.time()
result_3 = target_mean_v3(data, 'y', 'x')
end = time.time()

print("v3:Teacher Wang 's version used time: ",end - start)


start = time.time()
result_4 = target_mean_v4(data, 'y', 'x')
end = time.time()

print("v4:Myversion used time: ",end - start)


diff = np.linalg.norm(result_1 - result_2)
diff2 = np.linalg.norm(result_1 - result_4)
print(diff)
print(diff2)


v1 used time:  29.73409605026245
v2 used time:  0.32313084602355957
v3:Teacher Wang 's version used time:  0.0028769969940185547
v4:Myversion used time:  0.0018837451934814453
0.0
0.0
