In [1]:
pip install implicit umap-learn -q

Note: you may need to restart the kernel to use updated packages.


In [2]:
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import plotly.graph_objects as go
from umap import UMAP
from scipy.sparse import csr_matrix
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import random

  from .autonotebook import tqdm as notebook_tqdm


## 데이터프레임 로드

In [3]:
# parquet_path = "/content/drive/MyDrive/Colab Notebooks/Datasets/cp2_ecommerce/df_mv_filled.parquet"
parquet_path = "D:/cp2_dataset/df_cat_add.parquet"
df = pd.read_parquet(parquet_path, engine='pyarrow')

In [4]:
df.head(3)

Unnamed: 0,event_time,month,day,day_name,hour,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,main_cat,sub_cat_1,sub_cat_2
0,2019-10-01 04:00:00+04:00,10,1,Tuesday,4,view,44600062,2103807459595387724,no_cat,shiseido,35.79,541312140,72d76fde-8bb3-4e00-8c23-a032dfed738c,no_cat,,
1,2019-10-01 04:00:00+04:00,10,1,Tuesday,4,view,3900821,2053013552326770905,appliances.environment.water_heater,aqua,33.2,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc,appliances,environment,water_heater
2,2019-10-01 04:00:01+04:00,10,1,Tuesday,4,view,17200506,2053013559792632471,furniture.living_room.sofa,no_brand,543.1,519107250,566511c2-e2e3-422b-b695-cf8e6e792ca8,furniture,living_room,sofa


### event_type = cart, day_name = 금, 토, 일 데이터만 사용

In [5]:
cart_cond = (df["event_type"] == "cart")
df_cart = df[cart_cond]

In [6]:
df.shape, df_cart.shape

((42380091, 16), (926366, 16))

In [7]:
sub_cat_1 = (df_cart["sub_cat_1"] == "smartphone") \
            | (df_cart["sub_cat_1"] == "video") \
            | (df_cart["sub_cat_1"] == "notebook") \
            | (df_cart["sub_cat_1"] == "kitchen")

day_cond = (df_cart["day_name"] == "Friday") \
           | (df_cart["day_name"] == "Saturday") \
           | (df_cart["day_name"] == "Sunday") 

In [8]:
df_cart_cat = df_cart.loc[sub_cat_1]
df_day = df_cart_cat.loc[day_cond]
df_day.shape

(296936, 16)

In [9]:
cart_day = df_day[["product_id","user_id"]]

### counting을 위해 count 칼럼 생성

In [10]:
cart_day.loc[:, "count"] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [11]:
cart_day.head(3)

Unnamed: 0,product_id,user_id,count
3515590,1005136,512433763,1
3515915,1003310,556510717,1
3516066,1005141,514302701,1


In [12]:
cart_day.shape

(296936, 3)

### product_id에 따른 설명 테이블 item_lookup 생성

In [13]:
item_lookup = df_day[['product_id','main_cat', "sub_cat_1", "sub_cat_2", "brand"]].drop_duplicates()
item_lookup.head()

Unnamed: 0,product_id,main_cat,sub_cat_1,sub_cat_2,brand
3515590,1005136,electronics,smartphone,,apple
3515915,1003310,electronics,smartphone,,apple
3516066,1005141,electronics,smartphone,,apple
3516110,1004945,electronics,smartphone,,samsung
3516183,1005118,electronics,smartphone,,apple


### Train / Test dataset 생성

In [14]:
# test_set 비율 0.3
valid_pct = 0.3

np.random.seed(42)
cart_day["random"] = np.random.random(size=len(cart_day))

train_mask = cart_day["random"] < (1-valid_pct)
valid_mask = cart_day["random"] >= (1-valid_pct)

# 고객과 아이템 별로 총 얼마나 구매했는지 
grouped_cart_train = cart_day[train_mask].groupby(["user_id", "product_id"]).size().to_frame("count").reset_index()
grouped_cart_valid = cart_day[valid_mask].groupby(["user_id", "product_id"]).size().to_frame("count").reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [15]:
sample_weight_train = np.log2(grouped_cart_train['count'] + 1)
sample_weight_valid = np.log2(grouped_cart_valid['count'] + 1)

grouped_cart_train = grouped_cart_train[['user_id', 'product_id']]
grouped_cart_valid = grouped_cart_valid[['user_id', 'product_id']]

train_users = np.sort(grouped_cart_train.user_id.unique())
valid_users = np.sort(grouped_cart_valid.user_id.unique())
cold_start_users = set(valid_users) - set(train_users)

train_items = np.sort(grouped_cart_train.product_id.unique())
valid_items = np.sort(grouped_cart_valid.product_id.unique())
cold_start_items = set(valid_items) - set(train_items)

# item_lookup 테이블도 train, valid set에 맞게 나눠준다
item_lookup_train = item_lookup[item_lookup.product_id.isin(train_items)]
item_lookup_valid = item_lookup[item_lookup.product_id.isin(valid_items)]

### ALS 알고리즘의 입력값 형태 만들기 위해, 희소 행렬 생성
- 희소 행렬
    - row : 유니크한 user_id
    - column : 유니크한 product_id
    - 행렬의 값은 유저가 각 제품 구매한 횟수

In [16]:
# create zero-based index position <-> user/item ID mappings
index_to_user = pd.Series(np.sort(np.unique(grouped_cart_train['user_id'])))
index_to_item = pd.Series(np.sort(np.unique(grouped_cart_train['product_id'])))

# create reverse mappings from user/item ID to index positions
user_to_index = pd.Series(data=index_to_user.index, index=index_to_user.values)
item_to_index = pd.Series(data=index_to_item.index, index=index_to_item.values)

# convert user/item identifiers to index positions
grouped_cart_train_imp = grouped_cart_train.copy()
grouped_cart_train_imp['user_id'] = grouped_cart_train['user_id'].map(user_to_index)
grouped_cart_train_imp['product_id'] = grouped_cart_train['product_id'].map(item_to_index)

In [17]:
# prepare the data for CSR creation
data = sample_weight_train
rows = grouped_cart_train_imp['user_id']
cols = grouped_cart_train_imp['product_id']

In [18]:
# matrix가 매우 크기 때문에 Sparse matrix로 바꾸어주어서
# zero가 아닌 값들의 위치와 그 값만 저장하도록 메모리 절약!

# create the required user-item and item-user CSR matrices
items_user_imp = csr_matrix((data, (cols, rows)), shape=(len(train_items), len(train_users)))
user_items_imp = items_user_imp.T.tocsr()

## 모델링

#### Random Model (Base line)

In [19]:
class RandomModel():
    def __init__(self):
        self.products = None
    
    def train(self, products):
        self.products = np.asarray(products)
        return self

    def recommend(self, size=(1,10)):
        return np.random.choice(self.products, size=size, replace=False)

In [20]:
random_model = RandomModel().train(item_lookup.index.tolist())

### Alternating Least Squares (ALS) model

In [21]:
from implicit.als import AlternatingLeastSquares

# initialize and fit the model
imp_model = AlternatingLeastSquares(factors=64, 
                                    random_state=42,
                                    regularization=0.05,
                                    iterations=20, 
                                    calculate_training_loss=True)
imp_model.fit(user_items_imp)

100%|██████████| 20/20 [00:48<00:00,  2.43s/it, loss=0.000201]


## Evaluation

#### Random Model

In [22]:
random_rec = random_model.recommend()

item_lookup.loc[random_rec[0].tolist(), :]

Unnamed: 0,product_id,main_cat,sub_cat_1,sub_cat_2,brand
25963862,2600739,appliances,kitchen,cooker,gefest
4063179,3500168,appliances,kitchen,meat_grinder,bosch
33476884,4502752,appliances,kitchen,hob,gorenje
3960111,1004797,electronics,smartphone,,xiaomi
4387699,1305667,computers,notebook,,hp
4117372,12600007,appliances,kitchen,grill,tefal
23354500,18000639,electronics,smartphone,phone_accessories,apple
3877272,1004155,electronics,smartphone,,meizu
4102843,33300011,furniture,kitchen,table,no_brand
4028275,1004988,electronics,smartphone,,nokia


#### ALS Model

##### Hit_rate, Precision, Recall

In [23]:
# generate recommendations for all users and map back to original user/item ID values
recs_imp = imp_model.recommend_all(user_items=user_items_imp, N=10, filter_already_liked_items=False)
recs_imp = pd.DataFrame(recs_imp, index=index_to_user.values).apply(lambda c: c.map(index_to_item))

valid_user_items = grouped_cart_valid.groupby('user_id')['product_id'].apply(set).to_dict()
combined_users = set(train_users) & set(valid_users)

In [24]:
# hit_rate, precision, recall 측정
imp_hrt = np.mean([int(len(set(recs_imp.loc[u]) & valid_user_items[u]) > 0) for u in combined_users])
imp_pre = np.mean([len(set(recs_imp.loc[u]) & valid_user_items[u]) / len(recs_imp.loc[u]) for u in combined_users])
imp_rec = np.mean([len(set(recs_imp.loc[u]) & valid_user_items[u]) / len(valid_user_items[u]) for u in combined_users])

In [28]:
print("Hit Rate :", round(imp_hrt, 2))
print("Precision :", round(imp_pre, 2))
print("Recall :", round(imp_rec, 2))

Hit Rate : 0.64
Precision : 0.07
Recall : 0.6


##### 해당 user_id가 실제 cart에 가장 많이 넣은 제품들

In [29]:
user_id = 539016593

user_prod_freq = (
    cart_day
    .loc[cart_day["user_id"] == user_id, "product_id"]
    .value_counts()
    .iloc[:10]
)
freq_item = item_lookup.set_index("product_id").loc[user_prod_freq.index.to_list(), :]
freq_item.loc[:, "counts"] = user_prod_freq.to_list()
freq_item.reset_index()

Unnamed: 0,product_id,main_cat,sub_cat_1,sub_cat_2,brand,counts
0,1802037,electronics,video,tv,lg,6
1,1005100,electronics,smartphone,,samsung,4
2,1004809,electronics,smartphone,,xiaomi,3
3,1004777,electronics,smartphone,,xiaomi,3
4,1005204,electronics,smartphone,,xiaomi,3
5,1004838,electronics,smartphone,,oppo,2
6,1801998,electronics,video,tv,artel,1
7,1005006,electronics,smartphone,,xiaomi,1
8,1801770,electronics,video,tv,changhong,1
9,1005195,electronics,smartphone,,xiaomi,1


##### 추천 시스템

In [30]:
user_idx = list(grouped_cart_train['user_id']).index(user_id)
user_idx

69849

In [31]:
ids, scores = imp_model.recommend(user_idx, user_items_imp[user_idx], filter_already_liked_items=False)

In [32]:
ids

array([237, 441, 324, 475, 451, 295, 385, 448, 205,   2])

In [33]:
als_rec = pd.DataFrame({"product_id": grouped_cart_train["product_id"][ids], 
                        "score": scores, 
                        "already_carted": np.in1d(ids, user_items_imp[user_idx].indices)})
als_rec.drop_duplicates("product_id")

Unnamed: 0,product_id,score,already_carted
237,1801690,0.997315,True
441,1004870,0.001898,False
324,1004835,0.001709,False
475,1005159,0.001587,False
451,1004777,0.001274,False
295,1003304,0.001264,False
385,4501747,0.001256,False
448,1004249,0.001243,False
205,1004871,0.001101,False


In [34]:
als_item_lookup = item_lookup_train.set_index("product_id").loc[als_rec["product_id"].to_list(), :].reset_index()
als_item_lookup.merge(als_rec).drop_duplicates("product_id").reset_index(drop=True)

Unnamed: 0,product_id,main_cat,sub_cat_1,sub_cat_2,brand,score,already_carted
0,1801690,electronics,video,tv,samsung,0.997315,True
1,1004870,electronics,smartphone,,samsung,0.001898,False
2,1004835,electronics,smartphone,,samsung,0.001709,False
3,1005159,electronics,smartphone,,xiaomi,0.001587,False
4,1004777,electronics,smartphone,,xiaomi,0.001274,False
5,1003304,electronics,smartphone,,apple,0.001264,False
6,4501747,appliances,kitchen,hob,beko,0.001256,False
7,1004249,electronics,smartphone,,apple,0.001243,False
8,1004871,electronics,smartphone,,samsung,0.001101,False


##### 비슷한 아이템

In [35]:
product_id = 1004250
product_idx = list(grouped_cart_train['product_id']).index(product_id)

similar_ids, similar_scores = imp_model.similar_items(product_idx)

sim_items = pd.DataFrame({"product_id": grouped_cart_train["product_id"][similar_ids], 
                        "score": similar_scores})

als_sim_item_lookup = item_lookup_train.set_index("product_id").loc[sim_items["product_id"].to_list(), :].reset_index()
als_sim_item_lookup.merge(sim_items).sort_values("score", ascending=False).reset_index(drop=True)

Unnamed: 0,product_id,main_cat,sub_cat_1,sub_cat_2,brand,score
0,1004250,electronics,smartphone,,apple,1.0
1,1004741,electronics,smartphone,,xiaomi,0.995856
2,4502730,appliances,kitchen,hob,simfer,0.99124
3,1004785,electronics,smartphone,,huawei,0.972566
4,1005113,electronics,smartphone,,apple,0.941062
5,1002544,electronics,smartphone,,apple,0.913639
6,1801766,electronics,video,tv,artel,0.912017
7,1004873,electronics,smartphone,,samsung,0.820935
8,1004258,electronics,smartphone,,apple,0.760123
9,1002524,electronics,smartphone,,apple,0.609651
