In [1]:
pip install implicit umap-learn -q

Note: you may need to restart the kernel to use updated packages.


In [3]:
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import plotly.graph_objects as go
from umap import UMAP
from scipy.sparse import csr_matrix
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import random

  from .autonotebook import tqdm as notebook_tqdm


## 데이터프레임 로드

In [4]:
# parquet_path = "/content/drive/MyDrive/Colab Notebooks/Datasets/cp2_ecommerce/df_mv_filled.parquet"
parquet_path = "D:/cp2_dataset/df_cat_add.parquet"
df = pd.read_parquet(parquet_path, engine='pyarrow')

In [5]:
df.head(3)

Unnamed: 0,event_time,month,day,day_name,hour,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,main_cat,sub_cat_1,sub_cat_2
0,2019-10-01 04:00:00+04:00,10,1,Tuesday,4,view,44600062,2103807459595387724,no_cat,shiseido,35.79,541312140,72d76fde-8bb3-4e00-8c23-a032dfed738c,no_cat,,
1,2019-10-01 04:00:00+04:00,10,1,Tuesday,4,view,3900821,2053013552326770905,appliances.environment.water_heater,aqua,33.2,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc,appliances,environment,water_heater
2,2019-10-01 04:00:01+04:00,10,1,Tuesday,4,view,17200506,2053013559792632471,furniture.living_room.sofa,no_brand,543.1,519107250,566511c2-e2e3-422b-b695-cf8e6e792ca8,furniture,living_room,sofa


### event_type = cart, day_name = 금, 토, 일 데이터만 사용

In [6]:
cart_cond = (df["event_type"] == "cart")
df_cart = df[cart_cond]

In [6]:
df.shape, df_cart.shape

((42380091, 16), (926366, 16))

In [7]:
sub_cat_1 = (df_cart["sub_cat_1"] == "smartphone") \
            | (df_cart["sub_cat_1"] == "video") \
            | (df_cart["sub_cat_1"] == "notebook") \
            | (df_cart["sub_cat_1"] == "kitchen")

day_cond = (df_cart["day_name"] == "Friday") \
           | (df_cart["day_name"] == "Saturday") \
           | (df_cart["day_name"] == "Sunday") 

In [8]:
df_cart_cat = df_cart.loc[sub_cat_1]
df_day = df_cart_cat.loc[day_cond]
df_day.shape

(296936, 16)

In [100]:
cart_day = df_day[["product_id","user_id"]]

### counting을 위해 count 칼럼 생성

In [101]:
cart_day.loc[:, "count"] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [102]:
cart_day.head(3)

Unnamed: 0,product_id,user_id,count
3515590,1005136,512433763,1
3515915,1003310,556510717,1
3516066,1005141,514302701,1


In [20]:
cart_day.shape

(296936, 3)

### product_id에 따른 설명 테이블 item_lookup 생성

In [136]:
item_lookup = df_day[['product_id','main_cat', "sub_cat_1", "sub_cat_2", "brand"]].drop_duplicates()
item_lookup.head()

Unnamed: 0,product_id,main_cat,sub_cat_1,sub_cat_2,brand
3515590,1005136,electronics,smartphone,,apple
3515915,1003310,electronics,smartphone,,apple
3516066,1005141,electronics,smartphone,,apple
3516110,1004945,electronics,smartphone,,samsung
3516183,1005118,electronics,smartphone,,apple


In [104]:
# 고객과 아이템 별로 총 얼마나 구매했는지 
grouped_cart = cart_day.groupby(['user_id','product_id'])["count"].sum().reset_index()

grouped_cart.head(5)

Unnamed: 0,user_id,product_id,count
0,284344819,1005122,1
1,318611205,1005003,1
2,336595257,1004767,2
3,403013066,1003304,2
4,403013066,1004836,1


In [14]:
grouped_cart.shape

(168327, 3)

### ALS 알고리즘의 입력값 형태 만들기 위해, 희소 행렬 생성
- 희소 행렬
    - row : 유니크한 user_id
    - column : 유니크한 product_id
    - 행렬의 값은 유저가 각 제품 구매한 횟수

In [105]:
# matrix가 매우 크기 때문에 Sparse matrix로 바꾸어주어서
# zero가 아닌 값들의 위치와 그 값만 저장하도록 메모리 절약!

users = list(np.sort(grouped_cart['user_id'].unique()))
products = list (grouped_cart['product_id'].unique())
counts = list(grouped_cart['count'])

rows = grouped_cart['user_id'].astype('category').cat.codes
cols = grouped_cart['product_id'].astype('category').cat.codes

print(len(users))  # 126195
print(len(products)) # 3229

# The implicit library expects data as a item-user matrix so we
# create two matricies, one for fitting the model (item-user) 
# and one for recommendations (user-item)
sparse_item_user = sparse.csr_matrix((grouped_cart['product_id'].astype(float), (cols, rows)))
# sparse_user_item = sparse.csr_matrix((grouped_cart['product_id'].astype(float), (rows, cols)))

126195
3229


In [93]:
bm25_sparse_item_user = bm25_weight(sparse_item_user, K1=100, B=0.8)
sparse_user_item = bm25_sparse_item_user.T.tocsr()

## 모델링

#### Random Model (Base line)

In [106]:
class RandomModel():
    def __init__(self):
        self.products = None
    
    def train(self, products):
        self.products = np.asarray(products)
        return self

    def recommend(self, size=(1,10)):
        return np.random.choice(self.products, size=size, replace=False)

In [139]:
random_model = RandomModel().train(item_lookup.index.tolist())

### Alternating Least Squares (ALS) model

In [146]:
model = AlternatingLeastSquares(factors=64,
                                regularization=0.05,
                                iterations=10, 
                                calculate_training_loss=True)
model.fit(2 * sparse_user_item)

100%|██████████| 10/10 [00:30<00:00,  3.04s/it, loss=0.0106]


## Evaluation

#### Random Model

In [148]:
random_rec = random_model.recommend()

item_lookup.loc[random_rec[0].tolist(), :]

Unnamed: 0,product_id,main_cat,sub_cat_1,sub_cat_2,brand
33365081,1005182,electronics,smartphone,,samsung
23349166,1005210,electronics,smartphone,,jinga
17300083,1005215,electronics,smartphone,,samsung
35085713,2601752,appliances,kitchen,cooker,artel
3616329,1003965,electronics,smartphone,,samsung
23468316,1801883,electronics,video,tv,philips
5795764,4501432,appliances,kitchen,hob,bosch
24221621,3601612,appliances,kitchen,dishwasher,lg
34428968,15902204,furniture,kitchen,pan,tefal
23497628,1801796,electronics,video,tv,elenberg


#### ALS Model

In [110]:
user_id = 539016593

##### 해당 user_id가 실제 cart에 가장 많이 넣은 제품들

In [145]:
user_prod_freq = (
    cart_day
    .loc[cart_day["user_id"] == user_id, "product_id"]
    .value_counts()
    .iloc[:10]
)
freq_item = item_lookup.set_index("product_id").loc[user_prod_freq.index.to_list(), :]
freq_item.loc[:, "counts"] = user_prod_freq.to_list()
freq_item.reset_index()

Unnamed: 0,product_id,main_cat,sub_cat_1,sub_cat_2,brand,counts
0,1802037,electronics,video,tv,lg,6
1,1005100,electronics,smartphone,,samsung,4
2,1004809,electronics,smartphone,,xiaomi,3
3,1004777,electronics,smartphone,,xiaomi,3
4,1005204,electronics,smartphone,,xiaomi,3
5,1004838,electronics,smartphone,,oppo,2
6,1801998,electronics,video,tv,artel,1
7,1005006,electronics,smartphone,,xiaomi,1
8,1801770,electronics,video,tv,changhong,1
9,1005195,electronics,smartphone,,xiaomi,1


##### 추천 시스템

In [149]:
# user_id = 513414926
user_id = 539016593
user_idx = list(grouped_cart['user_id']).index(user_id)

In [150]:
user_idx

89012

In [151]:
ids, scores = model.recommend(user_idx, sparse_user_item[user_idx], N=10, filter_already_liked_items=False)

In [162]:
als_rec = pd.DataFrame({"product_id": grouped_cart["product_id"][ids], 
                        "score": scores, 
                        "already_carted": np.in1d(ids, sparse_user_item[user_idx].indices)})
als_rec

Unnamed: 0,product_id,score,already_carted
99,1004739,1.001975,True
29,1005228,0.340133,False
887,1004237,0.290203,False
256,2900632,0.290033,False
1,1005003,0.283211,False
319,1004777,0.279208,False
100,4501774,0.242472,False
399,1005004,0.234827,False
255,2900626,0.224733,False
48,1004708,0.223168,False


In [164]:
als_item_lookup = item_lookup.set_index("product_id").loc[als_rec["product_id"].to_list(), :].reset_index(0)
als_item_lookup.merge(als_rec)

Unnamed: 0,product_id,main_cat,sub_cat_1,sub_cat_2,brand,score,already_carted
0,1004739,electronics,smartphone,,xiaomi,1.001975,True
1,1005228,electronics,smartphone,,honor,0.340133,False
2,1004237,electronics,smartphone,,apple,0.290203,False
3,2900632,appliances,kitchen,microwave,samsung,0.290033,False
4,1005003,electronics,smartphone,,huawei,0.283211,False
5,1004777,electronics,smartphone,,xiaomi,0.279208,False
6,4501774,appliances,kitchen,hob,hansa,0.242472,False
7,1005004,electronics,smartphone,,huawei,0.234827,False
8,2900626,appliances,kitchen,microwave,redmond,0.224733,False
9,1004708,electronics,smartphone,,huawei,0.223168,False


In [51]:
def make_train (matrix, percentage = .2):
    '''
    -----------------------------------------------------
    설명
    유저-아이템 행렬 (matrix)에서 
    1. 0 이상의 값을 가지면 1의 값을 갖도록 binary하게 테스트 데이터를 만들고
    2. 훈련 데이터는 원본 행렬에서 percentage 비율만큼 0으로 바뀜
    
    -----------------------------------------------------
    반환
    training_set: 훈련 데이터에서 percentage 비율만큼 0으로 바뀐 행렬
    test_set:     원본 유저-아이템 행렬의 복사본
    user_inds:    훈련 데이터에서 0으로 바뀐 유저의 index
    '''
    test_set = matrix.copy()
    test_set[test_set !=0] = 1 # binary하게 만들기
    
    training_set = matrix.copy()
    nonzero_inds = training_set.nonzero()
    nonzero_pairs = list(zip(nonzero_inds[0], nonzero_inds[1]))
    
    random.seed(0)
    num_samples = int(np.ceil(percentage * len(nonzero_pairs)))
    samples = random.sample (nonzero_pairs, num_samples)
    
    user_inds = [index[0] for index in samples]
    item_inds = [index[1] for index in samples]
    
    training_set[user_inds, item_inds] = 0
    training_set.eliminate_zeros()
    
    return training_set, test_set, list(set(user_inds))

# 훈련, 테스트 데이터 생성
product_train, product_test, product_users_altered = make_train(sparse_item_user, 0.2)

In [52]:
bm25_product_train = bm25_weight(product_train, K1=100, B=0.8)
sparse_user_item = bm25_product_train.T.tocsr()

In [53]:
model2 = AlternatingLeastSquares(factors=64, regularization=0.05)
model2.fit(2 * sparse_user_item)

100%|██████████| 15/15 [00:30<00:00,  2.03s/it]


In [70]:
model2.user_factors.shape

(126195, 64)

In [71]:
model2.item_factors.shape

(3229, 64)

In [66]:
# pred = np.dot(model2.user_factors, model2.item_factors.T)
pred = [sparse.csr_matrix(model2.user_factors), sparse.csr_matrix(model2.item_factors.T)]

In [61]:
from sklearn import metrics

def auc_score (test, predictions):
    '''
    fpr, tpr를 이용해서 AUC를 계산하는 함수
    '''
    fpr, tpr, thresholds = metrics.roc_curve(test, predictions)
    return metrics.auc(fpr,tpr)

In [90]:
def calc_mean_auc(training_set, predictions, altered_users, test_set):
    '''
    가려진 정보가 있는 유저마다 AUC 평균을 구하는 함수
    ----------------------------------------
    input
    1. training_set: make_train 함수에서 만들어진 훈련 데이터 (일정 비율로 아이템 구매량이 0으로 가려진 데이터)
    2. prediction: implicit MF에서 나온 유저/아이템 별로 나온 예측 평점 행렬
    3. altered_users: make_train 함수에서 아이템 구매량이 0으로 가려진 유저
    4. test_set: make_train함수에서 만든 테스트 데이터
    ----------------------------------------
    반환
    추천 시스템 유저의 평균 auc
    인기아이템 기반 유저 평균 auc
    '''
    # 리스트 초기화
    store_auc = []
    popularity_auc = []
    
    pop_items = np.array(test_set.sum(axis = 0)).reshape(-1) # 모든 유저의 아이템별 구매횟수 합
    item_vecs = predictions[1] # 아이템 latent 벡터
    
    for user in altered_users:
        training_row = training_set[user,:].toarray().reshape(-1) # 유저의 훈련데이터
        zero_inds = np.where(training_row == 0) # 가려진 아이템 Index
        
        # 가려진 아이템에 대한 예측
        user_vec = predictions[0][user,:]
        pred = user_vec.dot(item_vecs).toarray()[0,zero_inds].reshape(-1)
        
        # 가려진 아이템에 대한 실제값
        actual = test_set[user,:].toarray()[0,zero_inds].reshape(-1) 
        
        # 가려진 아이템에 대한 popularity (구매횟수 합)
        pop = pop_items[zero_inds]
        
        # AUC 계산 
        store_auc.append(auc_score(actual, pred))
        popularity_auc.append(auc_score(actual,pop))
    
    return float('%.3f'%np.mean(store_auc)), float('%.3f'%np.mean(popularity_auc))  

In [91]:
calc_mean_auc(product_train, pred, product_users_altered, product_test)

IndexError: index 3229 is out of bounds for axis 1 with size 3229