In [1]:
pip install implicit umap-learn -q

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\aryij\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [2]:
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from umap import UMAP
from scipy.sparse import csr_matrix
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import random

## 데이터프레임 로드

In [3]:
# parquet_path = "/content/drive/MyDrive/Colab Notebooks/Datasets/cp2_ecommerce/df_mv_filled.parquet"
parquet_path = "D:/cp2_dataset/df_cat.parquet"
df = pd.read_parquet(parquet_path, engine='pyarrow')

In [4]:
df.head(3)

Unnamed: 0,event_time,month,day,day_name,hour,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,main_cat,sub_cat_1,sub_cat_2
0,2019-10-01 04:00:00+04:00,10,1,Tuesday,4,view,44600062,2103807459595387724,no_cat,shiseido,35.79,541312140,72d76fde-8bb3-4e00-8c23-a032dfed738c,no_cat,,
1,2019-10-01 04:00:00+04:00,10,1,Tuesday,4,view,3900821,2053013552326770905,appliances.environment.water_heater,aqua,33.2,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc,appliances,environment,water_heater
2,2019-10-01 04:00:01+04:00,10,1,Tuesday,4,view,17200506,2053013559792632471,furniture.living_room.sofa,no_brand,543.1,519107250,566511c2-e2e3-422b-b695-cf8e6e792ca8,furniture,living_room,sofa


### event_type = cart, day_name = 금, 토, 일 데이터만 사용

In [5]:
cart_cond = (df["event_type"] == "cart")
df_cart = df[cart_cond]

In [6]:
df.shape, df_cart.shape

((42448764, 16), (926516, 16))

In [7]:
df.shape

(42448764, 16)

In [8]:
sub_cat_1 = (df_cart["sub_cat_1"] == "smartphone") \
            | (df_cart["sub_cat_1"] == "video") \
            | (df_cart["sub_cat_1"] == "notebook") \
            | (df_cart["sub_cat_1"] == "kitchen")

day_cond = (df_cart["day_name"] == "Friday") \
           | (df_cart["day_name"] == "Saturday") \
           | (df_cart["day_name"] == "Sunday") 

In [9]:
df_cart_cat = df_cart.loc[sub_cat_1]
df_day = df_cart_cat.loc[day_cond]
df_day.shape

(296943, 16)

In [10]:
cart_day = df_day[["product_id","user_id"]]

### counting을 위해 count 칼럼 생성

In [11]:
cart_day.loc[:, "count"] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


In [12]:
cart_day.head(3)

Unnamed: 0,product_id,user_id,count
3515590,1005136,512433763,1
3515915,1003310,556510717,1
3516066,1005141,514302701,1


In [13]:
cart_day.shape

(296943, 3)

### product_id에 따른 설명 테이블 item_lookup 생성

In [14]:
item_lookup = df_day[['product_id','main_cat', "sub_cat_1", "sub_cat_2", "brand"]].drop_duplicates()
item_lookup.head()

Unnamed: 0,product_id,main_cat,sub_cat_1,sub_cat_2,brand
3515590,1005136,electronics,smartphone,,apple
3515915,1003310,electronics,smartphone,,apple
3516066,1005141,electronics,smartphone,,apple
3516110,1004945,electronics,smartphone,,samsung
3516183,1005118,electronics,smartphone,,apple


### Train / Test dataset 생성

In [15]:
# test_set 비율 0.3
valid_pct = 0.3

np.random.seed(42)
cart_day["random"] = np.random.random(size=len(cart_day))

train_mask = cart_day["random"] < (1-valid_pct)
valid_mask = cart_day["random"] >= (1-valid_pct)

# 고객과 아이템 별로 총 얼마나 구매했는지 
grouped_cart_train = cart_day[train_mask].groupby(["user_id", "product_id"]).size().to_frame("count").reset_index()
grouped_cart_valid = cart_day[valid_mask].groupby(["user_id", "product_id"]).size().to_frame("count").reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cart_day["random"] = np.random.random(size=len(cart_day))


In [16]:
sample_weight_train = np.log2(grouped_cart_train['count'] + 1)
sample_weight_valid = np.log2(grouped_cart_valid['count'] + 1)

grouped_cart_train = grouped_cart_train[['user_id', 'product_id']]
grouped_cart_valid = grouped_cart_valid[['user_id', 'product_id']]

train_users = np.sort(grouped_cart_train.user_id.unique())
valid_users = np.sort(grouped_cart_valid.user_id.unique())
cold_start_users = set(valid_users) - set(train_users)

train_items = np.sort(grouped_cart_train.product_id.unique())
valid_items = np.sort(grouped_cart_valid.product_id.unique())
cold_start_items = set(valid_items) - set(train_items)

# item_lookup 테이블도 train, valid set에 맞게 나눠준다
item_lookup_train = item_lookup[item_lookup.product_id.isin(train_items)]
item_lookup_valid = item_lookup[item_lookup.product_id.isin(valid_items)]

### ALS 알고리즘의 입력값 형태 만들기 위해, 희소 행렬 생성
- 희소 행렬
    - row : 유니크한 user_id
    - column : 유니크한 product_id
    - 행렬의 값은 유저가 각 제품 구매한 횟수

In [17]:
# create zero-based index position <-> user/item ID mappings
index_to_user = pd.Series(np.sort(np.unique(grouped_cart_train['user_id'])))
index_to_item = pd.Series(np.sort(np.unique(grouped_cart_train['product_id'])))

# create reverse mappings from user/item ID to index positions
user_to_index = pd.Series(data=index_to_user.index, index=index_to_user.values)
item_to_index = pd.Series(data=index_to_item.index, index=index_to_item.values)

# convert user/item identifiers to index positions
grouped_cart_train_imp = grouped_cart_train.copy()
grouped_cart_train_imp['user_id'] = grouped_cart_train['user_id'].map(user_to_index)
grouped_cart_train_imp['product_id'] = grouped_cart_train['product_id'].map(item_to_index)

In [18]:
# prepare the data for CSR creation
data = sample_weight_train
rows = grouped_cart_train_imp['user_id']
cols = grouped_cart_train_imp['product_id']

In [19]:
# matrix가 매우 크기 때문에 Sparse matrix로 바꾸어주어서
# zero가 아닌 값들의 위치와 그 값만 저장하도록 메모리 절약!

# create the required user-item and item-user CSR matrices
items_user_imp = csr_matrix((data, (cols, rows)), shape=(len(train_items), len(train_users)))
user_items_imp = items_user_imp.T.tocsr()

## 모델링

#### Random Model (Base line)
- product_id 의 최빈값을 추천하는 것을 baseline 으로 선정
- 각 user 들이 가장 많이 cart에 넣는 product을 추천하는 것을 기본으로 하여, 높은 성능의 추천 시스템을 생성하는 것을 우선적으로 진행

In [20]:
class RandomModel():
    def __init__(self):
        self.products = None
    
    def train(self, products):
        self.products = np.asarray(products)
        return self

    def recommend(self, size=(1,10)):
        return np.random.choice(self.products, size=size, replace=False)

In [21]:
random_model = RandomModel().train(item_lookup_train.index.tolist())

In [22]:
grouped_cart_train["product_id"].value_counts()

1004856     9169
1004767     6838
1004833     3777
1004870     3588
1002544     3060
            ... 
4502129        1
1306590        1
3100935        1
7900423        1
18001469       1
Name: product_id, Length: 2984, dtype: int64

In [23]:
baseline = grouped_cart_train.copy()

In [24]:
freq = grouped_cart_train["product_id"].mode()[0]

In [25]:
baseline = [freq] * len(grouped_cart_train)

In [26]:
# 최다 product_id의 빈도를 정확도 baseline으로 선정
from sklearn.metrics import accuracy_score
print("training accuracy: ", accuracy_score(grouped_cart_train["product_id"], baseline))

training accuracy:  0.06952585324426179


### Alternating Least Squares (ALS) model

#### ALS 특징
- Implicit 데이터는 구체적인 점수가 아니므로 사용자가 아이템에 관심있는지 여부(Preference)를 Binary로 표현하여 사용
    - 주목할점은 인터랙션이 한번도 없는 경우 Preference값이 음수가 아니라 0이라는 것
    
- 사용자가 아이템을 얼마나 선호하는지를 나타낸 Confidence 값을 정의
    - 여기서 α는 인터랙션이 있는 경우 r의 중요도를 조절하는 하이퍼파라미터이다. 
    - r의 값이 높아질수록 Confidence 값이 증가하게 만든다(Increasing Function). 
    - 인터랙션이 없다고 해서 선호하지 않는 것이 아니므로 이런 경우에는 낮은 Confidence 값을 가진다. 
    - 여기서 r은 이미 알고 있는 값이고, α는 사용자 설정값이므로 Confidence는 상수가 된다. 
        - 즉, 학습대상이 아니다.

#### ALS 선정한 이유
- CF - ALS / 데이터셋에 유저가 직접적으로 명시한 피드백이 없으므로 암시적 피드백의 특성을 잘 고려한 모델인 Alternating Least Squares(ALS) 모델을 활용하여 추천 시스템을 진행

In [27]:
# confidence value tuning

# In most casing the solutions should be pretty similar after calling fit. It might be worth trying to tune the confidence values passed as an input, potentially by multiplying the confidence weights by a constant (alpha) before calling fit.

# You should probably by determining the best value of alpha via cross validation - but a useful heuristic to get started is it to set the alpha param such that the positive examples and negative examples have equal weight:

# since all the zeros in the matrix (negative examples) get a default confidence of 1
# https://github.com/benfred/implicit/issues/96

# fit 하기 전, confidence 값 (alpha) 을 조정한다 
# -> negative example 들을 1로 맞춰준다

Riu = user_items_imp
# nnz : 저장된 값들의 수
alpha = (Riu.shape[0] * Riu.shape[1] - Riu.nnz) / sum(Riu.data)

In [28]:
from implicit.als import AlternatingLeastSquares

# initialize and fit the model
imp_model = AlternatingLeastSquares(factors=64, 
                                    random_state=42,
                                    regularization=0.05,
                                    iterations=20, 
                                    calculate_training_loss=True)
imp_model.fit(alpha * user_items_imp)

  check_blas_config()


  0%|          | 0/20 [00:00<?, ?it/s]

## Evaluation

#### Random Model

In [29]:
random_rec = random_model.recommend()

item_lookup.loc[random_rec[0].tolist(), :]

Unnamed: 0,product_id,main_cat,sub_cat_1,sub_cat_2,brand
3630185,2702630,appliances,kitchen,refrigerators,midea
3556959,1801766,electronics,video,tv,artel
4827907,2501740,appliances,kitchen,oven,redmond
3522199,3100341,appliances,kitchen,blender,scarlett
24419074,2500562,appliances,kitchen,oven,darina
3950251,10900114,appliances,kitchen,mixer,saturn
17309768,1801960,electronics,video,tv,haier
36498168,4600702,appliances,kitchen,dishwasher,indesit
3521328,3601485,appliances,kitchen,dishwasher,lg
25963862,2600739,appliances,kitchen,cooker,gefest


#### ALS Model

##### Hit_rate, Precision, Recall

In [30]:
# generate recommendations for all users and map back to original user/item ID values
recs_imp = imp_model.recommend_all(user_items=user_items_imp, N=10, filter_already_liked_items=False)
recs_imp = pd.DataFrame(recs_imp, index=index_to_user.values).apply(lambda c: c.map(index_to_item))

valid_user_items = grouped_cart_valid.groupby('user_id')['product_id'].apply(set).to_dict()
combined_users = set(train_users) & set(valid_users)

In [31]:
# hit_rate, precision, recall 측정
imp_hrt = np.mean([int(len(set(recs_imp.loc[u]) & valid_user_items[u]) > 0) for u in combined_users])
imp_pre = np.mean([len(set(recs_imp.loc[u]) & valid_user_items[u]) / len(recs_imp.loc[u]) for u in combined_users])
imp_rec = np.mean([len(set(recs_imp.loc[u]) & valid_user_items[u]) / len(valid_user_items[u]) for u in combined_users])

In [32]:
print("Hit Rate :", round(imp_hrt, 2))
print("Precision :", round(imp_pre, 2))
print("Recall :", round(imp_rec, 2))

Hit Rate : 0.87
Precision : 0.1
Recall : 0.83


##### Hit Rate를 구해본 이유?
- Hit Rate는 전체 사용자 수 대비 적중한 사용자를 의미 (적중률)


##### 추천 시스템 통한 상품 추천 예시

###### 예시) 실제 user_id가 실제 cart에 가장 많이 넣은 제품들

In [33]:
user_id = 539016593

user_prod_freq = (
    cart_day
    .loc[cart_day["user_id"] == user_id, "product_id"]
    .value_counts()
    .iloc[:10]
)
freq_item = item_lookup.set_index("product_id").loc[user_prod_freq.index.to_list(), :]
freq_item.loc[:, "counts"] = user_prod_freq.to_list()
freq_item.reset_index()

Unnamed: 0,product_id,main_cat,sub_cat_1,sub_cat_2,brand,counts
0,1802037,electronics,video,tv,lg,6
1,1005100,electronics,smartphone,,samsung,4
2,1004809,electronics,smartphone,,xiaomi,3
3,1004777,electronics,smartphone,,xiaomi,3
4,1005204,electronics,smartphone,,xiaomi,3
5,1004838,electronics,smartphone,,oppo,2
6,1801998,electronics,video,tv,artel,1
7,1005006,electronics,smartphone,,xiaomi,1
8,1801770,electronics,video,tv,changhong,1
9,1005195,electronics,smartphone,,xiaomi,1


###### 추천 시스템 통해 구해본 해당 user_id를 위한 상품 추천

In [34]:
user_idx = list(grouped_cart_train['user_id']).index(user_id)
user_idx

69793

In [35]:
ids, scores = imp_model.recommend(user_idx, user_items_imp[user_idx], filter_already_liked_items=False)

In [36]:
ids

array([ 580,  120, 1656,  251, 1604, 2776, 2657, 1208,  493, 1619])

In [37]:
als_rec = pd.DataFrame({"product_id": grouped_cart_train["product_id"][ids], 
                        "score": scores, 
                        "already_carted": np.in1d(ids, user_items_imp[user_idx].indices)})
als_rec.drop_duplicates("product_id")

Unnamed: 0,product_id,score,already_carted
580,1005116,0.960218,True
120,1004237,0.794287,False
1656,1005073,0.715895,False
251,1004566,0.714667,False
1604,1004259,0.707156,False
2776,1004767,0.577688,False
2657,11000168,0.554805,False
1208,1004870,0.554098,False
493,1005121,0.539481,False
1619,1005105,0.52905,False


In [38]:
als_item_lookup = item_lookup_train.set_index("product_id").loc[als_rec["product_id"].to_list(), :].reset_index()
als_item_lookup.merge(als_rec).drop_duplicates("product_id").reset_index(drop=True)

Unnamed: 0,product_id,main_cat,sub_cat_1,sub_cat_2,brand,score,already_carted
0,1005116,electronics,smartphone,,apple,0.960218,True
1,1004237,electronics,smartphone,,apple,0.794287,False
2,1005073,electronics,smartphone,,samsung,0.715895,False
3,1004566,electronics,smartphone,,huawei,0.714667,False
4,1004259,electronics,smartphone,,apple,0.707156,False
5,1004767,electronics,smartphone,,samsung,0.577688,False
6,11000168,appliances,kitchen,toster,delonghi,0.554805,False
7,1004870,electronics,smartphone,,samsung,0.554098,False
8,1005121,electronics,smartphone,,apple,0.539481,False
9,1005105,electronics,smartphone,,apple,0.52905,False


###### 실제 user_id가 넣은 상품과 비슷한 아이템

In [39]:
product_id = 1801690
product_idx = list(grouped_cart_train['product_id']).index(product_id)

similar_ids, similar_scores = imp_model.similar_items(product_idx)

sim_items = pd.DataFrame({"product_id": grouped_cart_train["product_id"][similar_ids], 
                        "score": similar_scores})

als_sim_item_lookup = item_lookup_train.set_index("product_id").loc[sim_items["product_id"].to_list(), :].reset_index()
als_sim_item_lookup.merge(sim_items).sort_values("score", ascending=False).reset_index(drop=True)

Unnamed: 0,product_id,main_cat,sub_cat_1,sub_cat_2,brand,score
0,1801690,electronics,video,tv,samsung,1.0
1,1003304,electronics,smartphone,,apple,0.82967
2,1004741,electronics,smartphone,,xiaomi,0.828519
3,1004227,electronics,smartphone,,apple,0.8249
4,4100346,electronics,video,tv,sony,0.819131
5,1005073,electronics,smartphone,,samsung,0.818565
6,1004856,electronics,smartphone,,samsung,0.808489
7,1004856,electronics,smartphone,,samsung,0.808489
8,1004258,electronics,smartphone,,apple,0.806378
9,1004838,electronics,smartphone,,oppo,0.806199


# 모델링 결과 비교 및 최종 의견

- Baseline
  - accuracy: 0.06891920602160981

- ALS Model
  - Hit Rate : 0.87
  - Precision : 0.1
  - Recall : 0.83

- 모델링 결과 분석
  - 낮은 정밀도
    - 이커머스에서 추천 정밀도가 낮다면 플랫폼에 대한 신뢰 하락 우려
  - 추후 다른 달의 데이터에도 적용 후 똑같이 정밀도가 저조하다면 향후 모델링 개선 필요