In [1]:
pip install implicit umap-learn -q

Note: you may need to restart the kernel to use updated packages.


In [2]:
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import plotly.graph_objects as go
from umap import UMAP
from scipy.sparse import csr_matrix
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import random

  from .autonotebook import tqdm as notebook_tqdm


## 데이터프레임 로드

In [3]:
# parquet_path = "/content/drive/MyDrive/Colab Notebooks/Datasets/cp2_ecommerce/df_mv_filled.parquet"
parquet_path = "D:/cp2_dataset/df_cat_add.parquet"
df = pd.read_parquet(parquet_path, engine='pyarrow')

In [4]:
df.head(3)

Unnamed: 0,event_time,month,day,day_name,hour,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,main_cat,sub_cat_1,sub_cat_2
0,2019-10-01 04:00:00+04:00,10,1,Tuesday,4,view,44600062,2103807459595387724,no_cat,shiseido,35.79,541312140,72d76fde-8bb3-4e00-8c23-a032dfed738c,no_cat,,
1,2019-10-01 04:00:00+04:00,10,1,Tuesday,4,view,3900821,2053013552326770905,appliances.environment.water_heater,aqua,33.2,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc,appliances,environment,water_heater
2,2019-10-01 04:00:01+04:00,10,1,Tuesday,4,view,17200506,2053013559792632471,furniture.living_room.sofa,no_brand,543.1,519107250,566511c2-e2e3-422b-b695-cf8e6e792ca8,furniture,living_room,sofa


### event_type = cart, day_name = 금, 토, 일 데이터만 사용

In [5]:
cart_cond = (df["event_type"] == "cart")
df_cart = df[cart_cond]

In [6]:
df.shape, df_cart.shape

((42380091, 16), (926366, 16))

In [7]:
sub_cat_1 = (df_cart["sub_cat_1"] == "smartphone") \
            | (df_cart["sub_cat_1"] == "video") \
            | (df_cart["sub_cat_1"] == "notebook") \
            | (df_cart["sub_cat_1"] == "kitchen")

day_cond = (df_cart["day_name"] == "Friday") \
           | (df_cart["day_name"] == "Saturday") \
           | (df_cart["day_name"] == "Sunday") 

In [8]:
df_cart_cat = df_cart.loc[sub_cat_1]
df_day = df_cart_cat.loc[day_cond]
df_day.shape

(296936, 16)

In [9]:
cart_day = df_day[["product_id","user_id"]]

### counting을 위해 count 칼럼 생성

In [10]:
cart_day.loc[:, "count"] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [11]:
cart_day.head(3)

Unnamed: 0,product_id,user_id,count
3515590,1005136,512433763,1
3515915,1003310,556510717,1
3516066,1005141,514302701,1


In [12]:
cart_day.shape

(296936, 3)

### product_id에 따른 설명 테이블 item_lookup 생성

In [13]:
item_lookup = df_day[['product_id','main_cat', "sub_cat_1", "sub_cat_2", "brand"]].drop_duplicates()
item_lookup.head()

Unnamed: 0,product_id,main_cat,sub_cat_1,sub_cat_2,brand
3515590,1005136,electronics,smartphone,,apple
3515915,1003310,electronics,smartphone,,apple
3516066,1005141,electronics,smartphone,,apple
3516110,1004945,electronics,smartphone,,samsung
3516183,1005118,electronics,smartphone,,apple


In [14]:
# 고객과 아이템 별로 총 얼마나 구매했는지 
grouped_cart = cart_day.groupby(['user_id','product_id'])["count"].sum().reset_index()

grouped_cart.head(5)

Unnamed: 0,user_id,product_id,count
0,284344819,1005122,1
1,318611205,1005003,1
2,336595257,1004767,2
3,403013066,1003304,2
4,403013066,1004836,1


In [15]:
grouped_cart.shape

(168327, 3)

### ALS 알고리즘의 입력값 형태 만들기 위해, 희소 행렬 생성
- 희소 행렬
    - row : 유니크한 user_id
    - column : 유니크한 product_id
    - 행렬의 값은 유저가 각 제품 구매한 횟수

In [16]:
# matrix가 매우 크기 때문에 Sparse matrix로 바꾸어주어서
# zero가 아닌 값들의 위치와 그 값만 저장하도록 메모리 절약!

users = list(np.sort(grouped_cart['user_id'].unique()))
products = list (grouped_cart['product_id'].unique())
counts = list(grouped_cart['count'])

rows = grouped_cart['user_id'].astype('category').cat.codes
cols = grouped_cart['product_id'].astype('category').cat.codes

print(len(users))  # 126195
print(len(products)) # 3229

# The implicit library expects data as a item-user matrix so we
# create two matricies, one for fitting the model (item-user) 
# and one for recommendations (user-item)
sparse_item_user = sparse.csr_matrix((grouped_cart['product_id'].astype(float), (cols, rows)))
# sparse_user_item = sparse.csr_matrix((grouped_cart['product_id'].astype(float), (rows, cols)))

126195
3229


In [17]:
bm25_sparse_item_user = bm25_weight(sparse_item_user, K1=100, B=0.8)
sparse_user_item = bm25_sparse_item_user.T.tocsr()

In [18]:
bm25_sparse_item_user.shape, sparse_user_item.shape

((3229, 126195), (126195, 3229))

## 모델링

#### Random Model (Base line)

In [19]:
class RandomModel():
    def __init__(self):
        self.products = None
    
    def train(self, products):
        self.products = np.asarray(products)
        return self

    def recommend(self, size=(1,10)):
        return np.random.choice(self.products, size=size, replace=False)

In [20]:
random_model = RandomModel().train(item_lookup.index.tolist())

### Alternating Least Squares (ALS) model

In [21]:
model = AlternatingLeastSquares(factors=64,
                                random_state=42,
                                regularization=0.05,
                                iterations=20, 
                                calculate_training_loss=True)
# model.fit(5 * sparse_user_item)
model.fit(sparse_user_item * 10)
# model.fit(sparse_user_item)

100%|██████████| 20/20 [01:03<00:00,  3.19s/it, loss=0.00473]


## Evaluation

#### Random Model

In [22]:
random_rec = random_model.recommend()

item_lookup.loc[random_rec[0].tolist(), :]

Unnamed: 0,product_id,main_cat,sub_cat_1,sub_cat_2,brand
14992205,1004080,electronics,smartphone,,xiaomi
25343923,1802097,electronics,video,tv,blaupunkt
27272765,18000890,electronics,smartphone,phone_accessories,apple
3760271,15800049,appliances,kitchen,dishwasher,huter
34862135,1802102,electronics,video,tv,arg
3523348,1004766,electronics,smartphone,,samsung
14157721,3500055,appliances,kitchen,meat_grinder,bosch
3777259,1003532,electronics,smartphone,,samsung
3971785,1307042,computers,notebook,,lenovo
23878370,2602226,appliances,kitchen,cooker,haier


#### ALS Model

In [50]:
user_id = 539016593

##### 해당 user_id가 실제 cart에 가장 많이 넣은 제품들

In [24]:
user_prod_freq = (
    cart_day
    .loc[cart_day["user_id"] == user_id, "product_id"]
    .value_counts()
    .iloc[:10]
)
freq_item = item_lookup.set_index("product_id").loc[user_prod_freq.index.to_list(), :]
freq_item.loc[:, "counts"] = user_prod_freq.to_list()
freq_item.reset_index()

Unnamed: 0,product_id,main_cat,sub_cat_1,sub_cat_2,brand,counts
0,1802037,electronics,video,tv,lg,6
1,1005100,electronics,smartphone,,samsung,4
2,1004809,electronics,smartphone,,xiaomi,3
3,1004777,electronics,smartphone,,xiaomi,3
4,1005204,electronics,smartphone,,xiaomi,3
5,1004838,electronics,smartphone,,oppo,2
6,1801998,electronics,video,tv,artel,1
7,1005006,electronics,smartphone,,xiaomi,1
8,1801770,electronics,video,tv,changhong,1
9,1005195,electronics,smartphone,,xiaomi,1


##### 추천 시스템

In [51]:
# user_id = 513414926
# user_id = 539016593
user_idx = list(grouped_cart['user_id']).index(user_id)

In [52]:
user_idx

89012

In [27]:
ids, scores = model.recommend(user_idx, sparse_user_item[user_idx], N=10, filter_already_liked_items=False)

In [28]:
als_rec = pd.DataFrame({"product_id": grouped_cart["product_id"][ids], 
                        "score": scores, 
                        "already_carted": np.in1d(ids, sparse_user_item[user_idx].indices)})
als_rec

Unnamed: 0,product_id,score,already_carted
99,1004739,1.001667,True
100,4501774,0.412204,False
360,1004748,0.390251,False
362,1004209,0.38966,False
296,1004653,0.379635,False
171,1004781,0.360558,False
191,1002633,0.327186,False
299,1004856,0.317014,False
284,2900011,0.311154,False
297,1004659,0.308444,False


In [29]:
als_item_lookup = item_lookup.set_index("product_id").loc[als_rec["product_id"].to_list(), :].reset_index()
als_item_lookup.merge(als_rec, how="left")

Unnamed: 0,product_id,main_cat,sub_cat_1,sub_cat_2,brand,score,already_carted
0,1004739,electronics,smartphone,,xiaomi,1.001667,True
1,4501774,appliances,kitchen,hob,hansa,0.412204,False
2,1004748,electronics,smartphone,,huawei,0.390251,False
3,1004209,electronics,smartphone,,samsung,0.38966,False
4,1004653,electronics,smartphone,,samsung,0.379635,False
5,1004781,electronics,smartphone,,huawei,0.360558,False
6,1002633,electronics,smartphone,,apple,0.327186,False
7,1004856,electronics,smartphone,,samsung,0.317014,False
8,2900011,appliances,kitchen,microwave,samsung,0.311154,False
9,1004659,electronics,smartphone,,samsung,0.308444,False


##### 비슷한 아이템

In [30]:
product_id = 1004739
product_idx = list(grouped_cart['product_id']).index(product_id)

In [31]:
similar_ids, similar_scores = model.similar_items(product_idx)

In [32]:
sim_items = pd.DataFrame({"product_id": grouped_cart["product_id"][similar_ids], 
                        "score": similar_scores})

In [33]:
als_sim_item_lookup = item_lookup.set_index("product_id").loc[sim_items["product_id"].to_list(), :].reset_index()
als_sim_item_lookup.merge(sim_items).sort_values("score", ascending=False).reset_index(drop=True)

Unnamed: 0,product_id,main_cat,sub_cat_1,sub_cat_2,brand,score
0,1004739,electronics,smartphone,,xiaomi,1.0
1,1004739,electronics,smartphone,,xiaomi,1.0
2,1005116,electronics,smartphone,,apple,0.864113
3,1004739,electronics,smartphone,,xiaomi,0.863436
4,1004739,electronics,smartphone,,xiaomi,0.863436
5,1005140,electronics,smartphone,,apple,0.861955
6,1004839,electronics,smartphone,,oppo,0.859616
7,1003304,electronics,smartphone,,apple,0.852684
8,1004246,electronics,smartphone,,apple,0.842328
9,3300492,appliances,kitchen,multicooker,redmond,0.837229


In [53]:
recs = model.recommend_all(user_items=sparse_user_item, N=10, filter_already_liked_items=False)

In [54]:
recs_df = pd.DataFrame(recs, )

array([[ 423,  425,  411, ...,  424,  100, 1477],
       [ 358,    8,  239, ...,  463,  375, 2167],
       [ 252,  296,  308, ...,  243,  127,  251],
       ...,
       [ 416,  127,  296, ...,  405,  209,  305],
       [ 445,  433,  436, ...,  442,  413,  124],
       [ 435,  444,  428, ...,  161,  438,  439]])

In [55]:
cart_day

Unnamed: 0,product_id,user_id,count
3515590,1005136,512433763,1
3515915,1003310,556510717,1
3516066,1005141,514302701,1
3516110,1004945,550696493,1
3516183,1005118,515905214,1
...,...,...,...
42447907,1004767,542774966,1
42448124,1005134,566280291,1
42448135,1005134,566280291,1
42448156,1003306,512717356,1


In [115]:
valid_pct = 0.3
cart_day["random"] = np.random.random(size=len(cart_day))

train_mask = cart_day["random"] < (1-valid_pct)
valid_mask = cart_day["random"] >= (1-valid_pct)

grouped_cart_train = cart_day[train_mask].groupby(["user_id", "product_id"]).size().to_frame("count").reset_index()
grouped_cart_valid = cart_day[valid_mask].groupby(["user_id", "product_id"]).size().to_frame("count").reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [116]:
sample_weight_train = np.log2(grouped_cart_train['count'] + 1)
sample_weight_valid = np.log2(grouped_cart_valid['count'] + 1)

grouped_cart_train = grouped_cart_train[['user_id', 'product_id']]
grouped_cart_valid = grouped_cart_valid[['user_id', 'product_id']]

train_users = np.sort(grouped_cart_train.user_id.unique())
valid_users = np.sort(grouped_cart_valid.user_id.unique())
cold_start_users = set(valid_users) - set(train_users)

train_items = np.sort(grouped_cart_train.product_id.unique())
valid_items = np.sort(grouped_cart_valid.product_id.unique())
cold_start_items = set(valid_items) - set(train_items)


item_lookup_train = item_lookup[item_lookup.product_id.isin(train_items)]
item_lookup_valid = item_lookup[item_lookup.product_id.isin(valid_items)]

In [117]:
# create zero-based index position <-> user/item ID mappings
index_to_user = pd.Series(np.sort(np.unique(grouped_cart_train['user_id'])))
index_to_item = pd.Series(np.sort(np.unique(grouped_cart_train['product_id'])))

# create reverse mappings from user/item ID to index positions
user_to_index = pd.Series(data=index_to_user.index, index=index_to_user.values)
item_to_index = pd.Series(data=index_to_item.index, index=index_to_item.values)

# convert user/item identifiers to index positions
grouped_cart_train_imp = grouped_cart_train.copy()
grouped_cart_train_imp['user_id'] = grouped_cart_train['user_id'].map(user_to_index)
grouped_cart_train_imp['product_id'] = grouped_cart_train['product_id'].map(item_to_index)


In [118]:
# prepare the data for CSR creation
data = sample_weight_train
rows = grouped_cart_train_imp['user_id']
cols = grouped_cart_train_imp['product_id']

In [119]:
len(train_users)

103220

In [120]:
len(train_items)

2987

In [121]:
# create the required user-item and item-user CSR matrices
items_user_imp = csr_matrix((data, (cols, rows)), shape=(len(train_items), len(train_users)))
user_items_imp = items_user_imp.T.tocsr()

In [122]:
from implicit.als import AlternatingLeastSquares

# initialize and fit the model
imp_model = AlternatingLeastSquares(factors=64, 
                                    random_state=42,
                                    regularization=0.05,
                                    iterations=20, 
                                    calculate_training_loss=True)
imp_model.fit(user_items_imp)

100%|██████████| 20/20 [00:50<00:00,  2.55s/it, loss=0.000203]


In [123]:
# generate recommendations for all users and map back to original user/item ID values
recs_imp = imp_model.recommend_all(user_items=user_items_imp, N=10, filter_already_liked_items=False)
recs_imp = pd.DataFrame(recs_imp, index=index_to_user.values).apply(lambda c: c.map(index_to_item))

In [124]:
valid_user_items = grouped_cart_valid.groupby('user_id')['product_id'].apply(set).to_dict()
combined_users = set(train_users) & set(valid_users)

imp_hrt = np.mean([int(len(set(recs_imp.loc[u]) & valid_user_items[u]) > 0) for u in combined_users])
imp_pre = np.mean([len(set(recs_imp.loc[u]) & valid_user_items[u]) / len(recs_imp.loc[u]) for u in combined_users])
imp_rec = np.mean([len(set(recs_imp.loc[u]) & valid_user_items[u]) / len(valid_user_items[u]) for u in combined_users])

In [125]:
imp_hrt

0.6455829702200883

In [126]:
imp_pre

0.07199633648802421

In [127]:
imp_rec

0.5986102215252114

##### 해당 user_id가 실제 cart에 많이 넣은 제품들

In [128]:
user_id = 539016593

user_prod_freq = (
    cart_day
    .loc[cart_day["user_id"] == user_id, "product_id"]
    .value_counts()
    .iloc[:10]
)
freq_item = item_lookup.set_index("product_id").loc[user_prod_freq.index.to_list(), :]
freq_item.loc[:, "counts"] = user_prod_freq.to_list()
freq_item.reset_index()

Unnamed: 0,product_id,main_cat,sub_cat_1,sub_cat_2,brand,counts
0,1802037,electronics,video,tv,lg,6
1,1005100,electronics,smartphone,,samsung,4
2,1004809,electronics,smartphone,,xiaomi,3
3,1004777,electronics,smartphone,,xiaomi,3
4,1005204,electronics,smartphone,,xiaomi,3
5,1004838,electronics,smartphone,,oppo,2
6,1801998,electronics,video,tv,artel,1
7,1005006,electronics,smartphone,,xiaomi,1
8,1801770,electronics,video,tv,changhong,1
9,1005195,electronics,smartphone,,xiaomi,1


##### 추천 시스템

In [130]:
user_idx = list(grouped_cart['user_id']).index(user_id)
user_idx

89012

In [131]:
ids, scores = imp_model.recommend(user_idx, user_items_imp[user_idx], filter_already_liked_items=False)

In [135]:
ids

array([285, 216, 208, 231, 232, 217, 326, 209, 439, 344])

In [142]:
als_rec = pd.DataFrame({"product_id": grouped_cart_train["product_id"][ids], 
                        "score": scores, 
                        "already_carted": np.in1d(ids, user_items_imp[user_idx].indices)})
als_rec.drop_duplicates("product_id")

Unnamed: 0,product_id,score,already_carted
285,1004250,0.992859,True
216,1004739,0.796557,True
208,3300349,0.106431,False
231,1801904,0.090774,False
232,1802111,0.087332,False
217,2900011,0.084162,False
326,1004857,0.081563,False
209,1004767,0.079522,False
439,1004777,0.077574,False


In [145]:
als_item_lookup = item_lookup_train.set_index("product_id").loc[als_rec["product_id"].to_list(), :].reset_index()
als_item_lookup.merge(als_rec).drop_duplicates("product_id").reset_index(drop=True)

Unnamed: 0,product_id,main_cat,sub_cat_1,sub_cat_2,brand,score,already_carted
0,1004250,electronics,smartphone,,apple,0.992859,True
1,1004739,electronics,smartphone,,xiaomi,0.796557,True
2,3300349,appliances,kitchen,multicooker,redmond,0.106431,False
3,1801904,electronics,video,tv,samsung,0.090774,False
4,1802111,electronics,video,tv,blaupunkt,0.087332,False
5,2900011,appliances,kitchen,microwave,samsung,0.084162,False
6,1004857,electronics,smartphone,,samsung,0.081563,False
7,1004767,electronics,smartphone,,samsung,0.079522,False
8,1004777,electronics,smartphone,,xiaomi,0.077574,False


##### 비슷한 아이템

In [146]:
product_id = 1004250
product_idx = list(grouped_cart_train['product_id']).index(product_id)

similar_ids, similar_scores = imp_model.similar_items(product_idx)

sim_items = pd.DataFrame({"product_id": grouped_cart_train["product_id"][similar_ids], 
                        "score": similar_scores})

als_sim_item_lookup = item_lookup_train.set_index("product_id").loc[sim_items["product_id"].to_list(), :].reset_index()
als_sim_item_lookup.merge(sim_items).sort_values("score", ascending=False).reset_index(drop=True)

Unnamed: 0,product_id,main_cat,sub_cat_1,sub_cat_2,brand,score
0,1004250,electronics,smartphone,,apple,1.0
1,1003306,electronics,smartphone,,apple,0.997207
2,1004856,electronics,smartphone,,samsung,0.975958
3,1002544,electronics,smartphone,,apple,0.971187
4,1004209,electronics,smartphone,,samsung,0.947645
5,4502579,appliances,kitchen,hob,electrolux,0.811395
6,1004767,electronics,smartphone,,samsung,0.777213
7,1004781,electronics,smartphone,,huawei,0.774965
8,1801516,electronics,video,tv,haier,0.767825
9,1004873,electronics,smartphone,,samsung,0.691687


In [34]:
_, top_contribution, _ = model.explain(user_idx, sparse_user_item, product_idx)


In [35]:
for product_id,score in top_contribution:
    print(product_id, score)

99 0.03095440947143913


In [36]:
[(grouped_cart["product_id"][product_id], score) for product_id, score in top_contribution]


[(1004739, 0.03095440947143913)]

In [37]:
item_lookup[item_lookup["product_id"] == 1004767]

Unnamed: 0,product_id,main_cat,sub_cat_1,sub_cat_2,brand
3516243,1004767,electronics,smartphone,,samsung


In [38]:
def make_train (matrix, percentage = .2):
    '''
    -----------------------------------------------------
    설명
    유저-아이템 행렬 (matrix)에서 
    1. 0 이상의 값을 가지면 1의 값을 갖도록 binary하게 테스트 데이터를 만들고
    2. 훈련 데이터는 원본 행렬에서 percentage 비율만큼 0으로 바뀜
    
    -----------------------------------------------------
    반환
    training_set: 훈련 데이터에서 percentage 비율만큼 0으로 바뀐 행렬
    test_set:     원본 유저-아이템 행렬의 복사본
    user_inds:    훈련 데이터에서 0으로 바뀐 유저의 index
    '''
    test_set = matrix.copy()
    test_set[test_set !=0] = 1 # binary하게 만들기
    
    training_set = matrix.copy()
    nonzero_inds = training_set.nonzero()
    nonzero_pairs = list(zip(nonzero_inds[0], nonzero_inds[1]))
    
    random.seed(0)
    num_samples = int(np.ceil(percentage * len(nonzero_pairs)))
    samples = random.sample (nonzero_pairs, num_samples)
    
    user_inds = [index[0] for index in samples]
    item_inds = [index[1] for index in samples]
    
    training_set[user_inds, item_inds] = 0
    training_set.eliminate_zeros()
    
    return training_set, test_set, list(set(user_inds))

# 훈련, 테스트 데이터 생성
product_train, product_test, product_users_altered = make_train(sparse_item_user, 0.2)

In [39]:
bm25_product_train = bm25_weight(product_train, K1=100, B=0.8)
sparse_user_item = bm25_product_train.T.tocsr()

In [40]:
model2 = AlternatingLeastSquares(factors=64, regularization=0.05)
model2.fit(2 * sparse_user_item)

100%|██████████| 15/15 [00:31<00:00,  2.08s/it]


In [41]:
model2.user_factors.shape

(126195, 64)

In [42]:
model2.item_factors.shape

(3229, 64)

In [43]:
# pred = np.dot(model2.user_factors, model2.item_factors.T)
pred = [sparse.csr_matrix(model2.user_factors), sparse.csr_matrix(model2.item_factors.T)]

In [44]:
pred

[<126195x64 sparse matrix of type '<class 'numpy.float32'>'
 	with 6753152 stored elements in Compressed Sparse Row format>,
 <64x3229 sparse matrix of type '<class 'numpy.float32'>'
 	with 194112 stored elements in Compressed Sparse Row format>]

In [45]:
from sklearn import metrics

def auc_score (test, predictions):
    '''
    fpr, tpr를 이용해서 AUC를 계산하는 함수
    '''
    fpr, tpr, thresholds = metrics.roc_curve(test, predictions)
    return metrics.auc(fpr,tpr)

In [46]:
def calc_mean_auc(training_set, predictions, altered_users, test_set):
    '''
    가려진 정보가 있는 유저마다 AUC 평균을 구하는 함수
    ----------------------------------------
    input
    1. training_set: make_train 함수에서 만들어진 훈련 데이터 (일정 비율로 아이템 구매량이 0으로 가려진 데이터)
    2. prediction: implicit MF에서 나온 유저/아이템 별로 나온 예측 평점 행렬
    3. altered_users: make_train 함수에서 아이템 구매량이 0으로 가려진 유저
    4. test_set: make_train함수에서 만든 테스트 데이터
    ----------------------------------------
    반환
    추천 시스템 유저의 평균 auc
    인기아이템 기반 유저 평균 auc
    '''
    # 리스트 초기화
    store_auc = []
    popularity_auc = []
    
    pop_items = np.array(test_set.sum(axis = 0)).reshape(-1) # 모든 유저의 아이템별 구매횟수 합
    item_vecs = predictions[1] # 아이템 latent 벡터
    
    for user in altered_users:
        training_row = training_set[user,:].toarray().reshape(-1) # 유저의 훈련데이터
        zero_inds = np.where(training_row == 0) # 가려진 아이템 Index
        
        # 가려진 아이템에 대한 예측
        user_vec = predictions[0][user,:]
        pred = user_vec.dot(item_vecs).toarray()[0,zero_inds].reshape(-1)
        
        # 가려진 아이템에 대한 실제값
        actual = test_set[user,:].toarray()[0,zero_inds].reshape(-1) 
        
        # 가려진 아이템에 대한 popularity (구매횟수 합)
        pop = pop_items[zero_inds]
        
        # AUC 계산 
        store_auc.append(auc_score(actual, pred))
        popularity_auc.append(auc_score(actual,pop))
    
    return float('%.3f'%np.mean(store_auc)), float('%.3f'%np.mean(popularity_auc))  

In [47]:
calc_mean_auc(product_train, pred, product_users_altered, product_test)

IndexError: index 3229 is out of bounds for axis 1 with size 3229