# Action 1 : 협업 필터링을 이용한 상품 추천 모델 생성
1. 유저 중심 협업 필터링을 통해 유사한 k명의 이웃을 추출
2. k명에 대한 cart & purchase 상품들을 모델 기반 협업 필터링을 통해 전환확률이 높은 상위 상품을 추천

## 유저 중심 협업 필터링
- 유저-상품 인터랙션 행렬은 Sparse하기에 코사인 유사도보단 자카드 유사도를 통해 유저의 유사도 행렬 도출

In [8]:
import numpy as np
import pandas as pd

import pickle
from tqdm import tqdm

In [2]:
# Load the event log CSV into a DataFrame.
df = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/event_log.csv')

In [3]:
def event_flow(data):
    """Collapse a sequence of event types into a comma-separated string.

    Each distinct event is kept once, in order of first appearance. The
    result reflects the chronological flow only if ``data`` is already
    ordered by time.

    Args:
        data: Iterable of event-type strings.

    Returns:
        The de-duplicated events joined with ",".
    """
    # dict keys preserve insertion order, which gives an O(n) ordered
    # de-duplication instead of a list membership scan per element.
    return ",".join(dict.fromkeys(data))


def convert_bool(data):
    """Binarize a comma-separated event string.

    Args:
        data: Comma-separated event types, e.g. the output of ``event_flow``.

    Returns:
        1 if the events include 'cart' or 'purchase', otherwise -1.
    """
    events = data.split(',')
    # Exact token match: 'cartoon' or similar substrings do not count.
    return 1 if 'cart' in events or 'purchase' in events else -1

In [4]:
# One row per (user, product) pair; event_type becomes the ordered,
# de-duplicated event string produced by event_flow.
events_per_pair = df.groupby(['user_id', 'product_id'])['event_type'].apply(list)
user_product_df = events_per_pair.apply(event_flow).reset_index()
user_product_df.sample(3)

Unnamed: 0,user_id,product_id,event_type
179979,1515915625537687395,1549921,view
249739,1515915625545103000,1270782,view
345677,1515915625569808912,4079420,view


In [5]:
# Replace the event string by its binary label from convert_bool:
# 1 when the pair reached cart/purchase, -1 otherwise.
user_product_df['event_type'] = user_product_df['event_type'].map(convert_bool)
user_product_df['event_type'].value_counts()

-1    509654
 1     47036
Name: event_type, dtype: int64

### User - Product Interaction Matrix

#### Data Filtering
- 일반화된 유사도 행렬을 구하기 위해 일정 수준 이상의 데이터를 필터링
    - 도메인에 따라 조정 가능
- 556690개의 데이터 중 29226개만 필터링


In [6]:
# Keep users that appear in more than 5 (user, product) rows.
rows_per_user = user_product_df['user_id'].value_counts()
filter_users = rows_per_user[rows_per_user > 5].index.tolist()

# Keep products that appear in more than 50 (user, product) rows.
rows_per_product = user_product_df['product_id'].value_counts()
filter_products = rows_per_product[rows_per_product > 50].index.tolist()

# A row survives only when both its user and its product passed the filters.
user_mask = user_product_df['user_id'].isin(filter_users)
product_mask = user_product_df['product_id'].isin(filter_products)
filtered_df = user_product_df[user_mask & product_mask]

print(user_product_df.shape)
print(filtered_df.shape)

(556690, 3)
(29226, 3)


#### Interaction Matrix로 변환
- NaN을 0(경험하지 못함)으로 변환 후, 정수타입으로 데이터 타입 변환

In [7]:
# Pivot to a user x product matrix; pairs with no row become 0 and the
# values are stored as integers.
interaction_values = pd.pivot_table(
    filtered_df, values='event_type', index='user_id', columns='product_id'
)
pivot_df = interaction_values.fillna(0).astype(int)

print(pivot_df.shape)
pivot_df.head(2)

(5219, 1461)


product_id,105,611,817,945,2260,2349,2712,3192,3193,3203,4279,4442,4535,4551,4556,5879,5913,6476,7597,7598,7600,7887,8093,11551,13327,16101,16122,16133,16187,16340,19491,20803,26421,31899,31952,31954,31973,37027,39261,39770,...,4170509,4170512,4170514,4170515,4170516,4170517,4170519,4170523,4170524,4170526,4170530,4170531,4170533,4170534,4170535,4170809,4170970,4170994,4171036,4171037,4171147,4182689,4183707,4183795,4183807,4183847,4183850,4183854,4183856,4183859,4183860,4183861,4183863,4183864,4183866,4183872,4183873,4183874,4183875,4183880
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1515915625353234047,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1515915625353294441,0,0,0,-1,0,0,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,-1,0,0,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


---

### Jaccard Similarity Matrix
- Interaction Matrix의 행렬곱을 통해 각 유저와 모든 유저에 대해 상품의 event 전환여부의 유사도를 계산
- 자카드 유사도 행렬에서 이웃 유저를 찾을 때 각 벡터에 대해서만 찾기에 비율로서 표현하지 않고 스코어로 표현

In [None]:
# Pairwise user-user score: the number of products on which two users hold the
# same non-zero interaction value (both 1 or both -1). Products with a 0 for
# either user are not counted.
#
# pos @ pos.T counts products where both users have 1, neg @ neg.T counts
# products where both have -1. Their sum equals what the element-by-element
# comparison over every user pair produces, without the Python-level
# users x users x products loop.
interactions = pivot_df.values
pos = (interactions == 1).astype(np.float64)
neg = (interactions == -1).astype(np.float64)

# The result is float64 (same dtype as a np.zeros-initialised matrix); the
# values are exact because they are small whole-number counts.
jaccard_matrix = pos @ pos.T + neg @ neg.T

In [10]:
# Show the matrix dimensions, then store the scores as integers.
print(jaccard_matrix.shape)
jaccard_matrix = jaccard_matrix.astype(int)
jaccard_matrix

(5219, 5219)


array([[ 4,  2,  0, ...,  0,  0,  0],
       [ 2, 30,  0, ...,  0,  0,  1],
       [ 0,  0,  5, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ...,  6,  2,  0],
       [ 0,  0,  0, ...,  2,  7,  0],
       [ 0,  1,  0, ...,  0,  0,  2]])

---

### 유저 중심 유사도 행렬

In [11]:
# Label both axes of the score matrix with the user ids of pivot_df.
user_ids = pivot_df.index
similarity_df = pd.DataFrame(jaccard_matrix, index=user_ids, columns=user_ids)
similarity_df.head(2)

user_id,1515915625353234047,1515915625353294441,1515915625353496458,1515915625353900095,1515915625353946724,1515915625354061413,1515915625354144974,1515915625354145851,1515915625354153061,1515915625354227904,1515915625354399528,1515915625354857951,1515915625355179497,1515915625356119540,1515915625356236274,1515915625357384049,1515915625358666491,1515915625359368343,1515915625359453928,1515915625359616242,1515915625360401970,1515915625360775587,1515915625361099554,1515915625361963933,1515915625363053094,1515915625363853004,1515915625364062846,1515915625364316677,1515915625365042345,1515915625366169564,1515915625366672718,1515915625367150883,1515915625367438020,1515915625367995831,1515915625369530324,1515915625373342675,1515915625376375077,1515915625379210214,1515915625383075120,1515915625386067291,...,1515915625610156135,1515915625610199488,1515915625610224305,1515915625610284511,1515915625610320445,1515915625610322327,1515915625610325447,1515915625610360407,1515915625610365251,1515915625610388268,1515915625610389535,1515915625610435604,1515915625610446962,1515915625610454633,1515915625610477340,1515915625610493676,1515915625610536757,1515915625610570448,1515915625610575761,1515915625610590667,1515915625610606458,1515915625610713854,1515915625610718961,1515915625610736540,1515915625610773028,1515915625610790901,1515915625610791038,1515915625610802187,1515915625610826922,1515915625610828170,1515915625610840656,1515915625610847452,1515915625610890882,1515915625610910358,1515915625610924553,1515915625610938925,1515915625610976222,1515915625610992226,1515915625611011228,1515915625611014524
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1515915625353234047,4,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1515915625353294441,2,30,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,2,0,0,0,0,0,0,0,1,0,0,0,0,1


#### 모델링을 위한 데이터셋 저장

In [None]:
# Persist the user similarity matrix and the filtered interaction rows as CSV
# so the recommendation step can reload them.
similarity_df.to_csv('/content/drive/MyDrive/Colab_Notebooks/jaccard_similarity_df.csv')
filtered_df.to_csv('/content/drive/MyDrive/Colab_Notebooks/filtered_df.csv')

---

---

## 모델 기반 협업 필터링

In [13]:
import os
import random

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import tensorflow as tf
from keras.layers import Input, Embedding, Reshape, Dot, Concatenate, Dense, Dropout
from keras.models import Model
from keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras import optimizers, activations

In [14]:
# Reload the event log and order it by user, product and event time.
df = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/event_log.csv')
df = df.sort_values(by=['user_id', 'product_id', 'event_time'])

In [15]:
# Keep only cart and purchase events, renumbering the rows from 0.
is_conversion = df['event_type'].isin(['purchase', 'cart'])
cart_purchase_df = df[is_conversion].reset_index(drop=True)

print(cart_purchase_df.shape)
cart_purchase_df.head(3)

(91381, 9)


Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2020-10-03 11:23:00 UTC,cart,1023383,2144415922016747613,computers.peripherals.wifi,zyxel,119.03,1515915625353286099,p5I1h2vk1T
1,2020-10-03 11:23:44 UTC,purchase,1023383,2144415922016747613,computers.peripherals.wifi,zyxel,119.03,1515915625353286099,p5I1h2vk1T
2,2020-10-06 08:28:18 UTC,cart,4035841,2144415922016747613,computers.peripherals.wifi,tp-link,120.27,1515915625353286099,7ZE3RhNYE6


### 모델링을 위한 데이터 전처리

In [16]:
# Only whether a (user, product) pair converted matters, so keep the first
# row of each pair and discard the repeats.
first_of_pair = ~cart_purchase_df.duplicated(subset=['user_id', 'product_id'])
deduplicated_df = cart_purchase_df[first_of_pair].copy()

In [17]:
# Keep users that have more than one de-duplicated cart/purchase row.
rows_per_user = deduplicated_df['user_id'].value_counts()
filter_users = rows_per_user[rows_per_user > 1].index.tolist()

# Keep products that have more than one de-duplicated cart/purchase row.
rows_per_product = deduplicated_df['product_id'].value_counts()
filter_product = rows_per_product[rows_per_product > 1].index.tolist()

In [18]:
# Positive (converted) pairs: rows whose user and product both passed the filters.
keep_user = deduplicated_df['user_id'].isin(filter_users)
keep_product = deduplicated_df['product_id'].isin(filter_product)
result_df = deduplicated_df.loc[keep_user & keep_product, ['user_id', 'product_id']]

print(f"필터링 전 기록 총 수 : {len(deduplicated_df)}")
print(f"필터링 후 기록 총 수 : {len(result_df)}")

필터링 전 기록 총 수 : 47036
필터링 후 기록 총 수 : 11738


#### (유저, 상품) 쌍에 대해 전환된 데이터도 중요하지만, 전환되지 않은 데이터도 학습에 중요하므로 1:1비율로 전환되지 않은 데이터 쌍도 생성

In [19]:
random.seed(13)

# Converted pairs as a set of (user_id, product_id) tuples, so each candidate
# is checked in O(1) instead of scanning the whole DataFrame per draw.
positive_pairs = set(
    zip(result_df['user_id'].tolist(), result_df['product_id'].tolist())
)

# Draw random (user, product) pairs until there are as many non-converted
# pairs as converted ones (1:1 ratio).
not_cart_purchase_data = {'user_id': [], 'product_id': []}
while len(not_cart_purchase_data['user_id']) < len(result_df):
    random_user = random.choice(filter_users)
    random_product = random.choice(filter_product)

    # Reject pairs that actually converted. A sampled negative pair may be
    # drawn more than once; repeats are kept.
    if (random_user, random_product) in positive_pairs:
        continue

    not_cart_purchase_data['user_id'].append(random_user)
    not_cart_purchase_data['product_id'].append(random_product)

In [20]:
result_not_df = pd.DataFrame(not_cart_purchase_data)

# Label converted pairs with 1 and sampled non-converted pairs with 0, then
# stack them into a single dataset.
positive_df = result_df.assign(cart_purchase=1)
negative_df = result_not_df.assign(cart_purchase=0)
boolean_df = pd.concat([positive_df, negative_df])

print(boolean_df.shape)
boolean_df.head(3)

(23476, 3)


Unnamed: 0,user_id,product_id,cart_purchase
0,1515915625353286099,1023383,1
2,1515915625353286099,4035841,1
16,1515915625353900095,16237,1


In [21]:
# Split the labelled pairs into train and test sets; the fixed random_state
# makes the split repeatable.
train_df, test_df = train_test_split(boolean_df, random_state=13)

In [22]:
# user_id and product_id are not contiguous, so map each distinct id to a
# 0-based position (in order of first appearance) for use as an embedding index.
unique_user_ids = boolean_df['user_id'].unique()
user_id_mapping = dict(zip(unique_user_ids, range(len(unique_user_ids))))

unique_product_ids = boolean_df['product_id'].unique()
product_id_mapping = dict(zip(unique_product_ids, range(len(unique_product_ids))))

In [23]:
# Translate the raw ids of both splits into their embedding indices.
id_mappings = (('user_id', user_id_mapping), ('product_id', product_id_mapping))

train_user_data, train_product_data = [
    train_df[column].map(mapping) for column, mapping in id_mappings
]
test_user_data, test_product_data = [
    test_df[column].map(mapping) for column, mapping in id_mappings
]

---

### 하이퍼 파라미터 정의 및 모델 네트워크 구조 생성

In [24]:
# Vocabulary sizes for the two embedding tables and the embedding width.
embedding_size = 10
num_users, num_products = len(user_id_mapping), len(product_id_mapping)

print(f"num users : {num_users}")
print(f"num products : {num_products}")

num users : 4478
num products : 4269


In [None]:
# Input layers: one integer index per sample for the user and for the product.
user_id_input = Input(shape=[1], name='user')
product_id_input = Input(shape=[1], name='product')

# Embedding layers: each index is looked up in a table of embedding_size-wide
# vectors, giving (batch_size, 1, embedding_size).
user_embedding = Embedding(output_dim=embedding_size,
                           input_dim=num_users,
                           input_length=1,
                           name='user_embedding')(user_id_input)
product_embedding = Embedding(output_dim=embedding_size, 
                            input_dim=num_products,
                            input_length=1, 
                            name='item_embedding')(product_id_input)

# Drop the length-1 axis so the dot product works on plain vectors:
# (batch_size, 1, embedding_size) -> (batch_size, embedding_size)
user_vector = Reshape([embedding_size])(user_embedding)
product_vector = Reshape([embedding_size])(product_embedding)

# Dot product of the user and product vectors, one scalar per sample:
# (batch_size, embedding_size) -> (batch_size, 1)
y = Dot(1, normalize=False)([user_vector, product_vector])

# The label is binary, so a single sigmoid unit turns the score into a probability.
binary_y = tf.keras.layers.Dense(1, activation='sigmoid')(y)

In [None]:
# Callbacks that limit overfitting and keep the best model on disk.
MODEL_SAVE_FOLDER_PATH = '/content/drive/MyDrive/Colab_Notebooks/'
# makedirs with exist_ok creates missing parent directories and does not fail
# when the folder already exists (no check-then-create race).
os.makedirs(MODEL_SAVE_FOLDER_PATH, exist_ok=True)

model_path = os.path.join(MODEL_SAVE_FOLDER_PATH, 'best_model.h5')

# Write the model to model_path only when val_loss improves.
model_checkpoint = ModelCheckpoint(filepath=model_path, monitor='val_loss', mode='min',
                                   verbose=1, save_best_only=True)

# Stop training once val_loss has not improved by at least 0.0001 for 10 epochs.
earlystopper = EarlyStopping(monitor='val_loss', mode='min', min_delta=0.0001, patience=10, verbose=1)

In [None]:
# Assemble the two-input model and compile it with Adam and binary cross-entropy.
opt = optimizers.Adam(learning_rate=0.005)

model = Model(inputs=[user_id_input, product_id_input], outputs=binary_y)
model.compile(optimizer=opt, loss='binary_crossentropy')

In [None]:
# Train on the index columns, holding out 5% of the training rows for
# validation; the callbacks handle early stopping and checkpointing.
train_inputs = [train_user_data, train_product_data]
train_labels = train_df['cart_purchase']

history = model.fit(
    train_inputs,
    train_labels,
    batch_size=512,
    epochs=100,
    validation_split=0.05,
    callbacks=[earlystopper, model_checkpoint],
    shuffle=True,
)

Epoch 1/100
Epoch 1: val_loss improved from inf to 0.69334, saving model to ./model/best_model.h5
Epoch 2/100
Epoch 2: val_loss improved from 0.69334 to 0.69214, saving model to ./model/best_model.h5
Epoch 3/100
Epoch 3: val_loss improved from 0.69214 to 0.68662, saving model to ./model/best_model.h5
Epoch 4/100
Epoch 4: val_loss improved from 0.68662 to 0.67148, saving model to ./model/best_model.h5
Epoch 5/100
Epoch 5: val_loss improved from 0.67148 to 0.65471, saving model to ./model/best_model.h5
Epoch 6/100
Epoch 6: val_loss improved from 0.65471 to 0.65249, saving model to ./model/best_model.h5
Epoch 7/100
Epoch 7: val_loss did not improve from 0.65249
Epoch 8/100
Epoch 8: val_loss did not improve from 0.65249
Epoch 9/100
Epoch 9: val_loss did not improve from 0.65249
Epoch 10/100
Epoch 10: val_loss did not improve from 0.65249
Epoch 11/100
Epoch 11: val_loss did not improve from 0.65249
Epoch 12/100
Epoch 12: val_loss did not improve from 0.65249
Epoch 13/100
Epoch 13: val_loss 

In [None]:
# Report the lowest validation loss seen during training.
val_losses = history.history['val_loss']
min_loss_idx = np.argmin(val_losses)
best_val_loss = val_losses[min_loss_idx]
print(f"Best Loss : {np.round(best_val_loss, 4)}")

Best Loss : 0.6525


In [None]:
# Reload the checkpointed model and score the held-out test pairs.
best_model = tf.keras.models.load_model("/content/drive/MyDrive/Colab_Notebooks/best_model.h5")

test_inputs = [test_user_data, test_product_data]
y_pred = best_model.predict(test_inputs)
y_true = test_df['cart_purchase'].to_numpy()

In [None]:
# Accuracy at each decision threshold from 0 to 1 in steps of 0.05.
# In the recorded run the best accuracy was 0.6551 at threshold 0.55.
# NOTE(review): the threshold is chosen on the test set itself, so this
# accuracy is optimistic — consider tuning it on a validation split.
probabilities = np.asarray(y_pred).ravel()
for val in np.linspace(0, 1, 21):
    result_pred = (probabilities >= val).astype(int)
    score = accuracy_score(y_true, result_pred)
    # np.round works whether accuracy_score returns a NumPy scalar or a
    # plain Python float (which has no .round method).
    print(f"{val} : {np.round(score, 4)}")

0.0 : 0.5033
0.05 : 0.5014
0.1 : 0.4992
0.15000000000000002 : 0.4962
0.2 : 0.4933
0.25 : 0.4984
0.30000000000000004 : 0.5062
0.35000000000000003 : 0.5316
0.4 : 0.5602
0.45 : 0.6188
0.5 : 0.6466
0.55 : 0.6551
0.6000000000000001 : 0.6447
0.65 : 0.6333
0.7000000000000001 : 0.6122
0.75 : 0.5883
0.8 : 0.5669
0.8500000000000001 : 0.5452
0.9 : 0.5277
0.9500000000000001 : 0.5086
1.0 : 0.4967


---

---

## 상품 추천 모델 알고리즘
- filtered_df : 상품을 6회 이상 본 유저와 유저에게 51회 이상 노출된 상품들을 필터링한 이진 데이터셋
- jaccard_df : filtered_df 기반의 유저 중심 유사도 행렬
- boolean_df : cart 또는 purchase 이력이 2건 이상인 유저와 상품에 대해, 전환된 쌍(1)과 전환되지 않은 쌍(0)을 1:1 비율로 구성한 데이터셋

In [25]:
# Reload the saved interaction rows and user similarity matrix; the first CSV
# column is used as the index.
filtered_df = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/filtered_df.csv', index_col=0)
jaccard_df = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/jaccard_similarity_df.csv', index_col=0)

# The checkpointed best model from the model-based collaborative filtering step.
best_model = tf.keras.models.load_model("/content/drive/MyDrive/Colab_Notebooks/best_model.h5")

In [26]:
# Check the reloaded interaction rows.
print(filtered_df.shape)
filtered_df.head(3)

(29226, 3)


Unnamed: 0,user_id,product_id,event_type
13,1515915625353234047,105,-1
17,1515915625353234047,3828462,-1
18,1515915625353234047,4005145,-1


In [27]:
# Cast the column labels to int so they can be matched against integer user
# ids (presumably they come back from the CSV header as strings — confirm).
jaccard_df.columns = jaccard_df.columns.astype(int)

print(jaccard_df.shape)
jaccard_df.head(3)

(5219, 5219)


Unnamed: 0_level_0,1515915625353234047,1515915625353294441,1515915625353496458,1515915625353900095,1515915625353946724,1515915625354061413,1515915625354144974,1515915625354145851,1515915625354153061,1515915625354227904,1515915625354399528,1515915625354857951,1515915625355179497,1515915625356119540,1515915625356236274,1515915625357384049,1515915625358666491,1515915625359368343,1515915625359453928,1515915625359616242,1515915625360401970,1515915625360775587,1515915625361099554,1515915625361963933,1515915625363053094,1515915625363853004,1515915625364062846,1515915625364316677,1515915625365042345,1515915625366169564,1515915625366672718,1515915625367150883,1515915625367438020,1515915625367995831,1515915625369530324,1515915625373342675,1515915625376375077,1515915625379210214,1515915625383075120,1515915625386067291,...,1515915625610156135,1515915625610199488,1515915625610224305,1515915625610284511,1515915625610320445,1515915625610322327,1515915625610325447,1515915625610360407,1515915625610365251,1515915625610388268,1515915625610389535,1515915625610435604,1515915625610446962,1515915625610454633,1515915625610477340,1515915625610493676,1515915625610536757,1515915625610570448,1515915625610575761,1515915625610590667,1515915625610606458,1515915625610713854,1515915625610718961,1515915625610736540,1515915625610773028,1515915625610790901,1515915625610791038,1515915625610802187,1515915625610826922,1515915625610828170,1515915625610840656,1515915625610847452,1515915625610890882,1515915625610910358,1515915625610924553,1515915625610938925,1515915625610976222,1515915625610992226,1515915625611011228,1515915625611014524
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1515915625353234047,4,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1515915625353294441,2,30,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,2,0,0,0,0,0,0,0,1,0,0,0,0,1
1515915625353496458,0,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


**intersection_users** : boolean_df의 user_id와 filtered_df의 user_id의 교집합인 user_id
- Input user_id에 대해 유저 중심 협업 필터링과 모델 기반 협업 필터링을 모두 적용하기 위해, 두 데이터셋에 공통으로 존재하는 유저만 추천 대상으로 사용

In [28]:
# Users known to the embedding model (keys of user_id_mapping) and users
# present in the similarity matrix.
model_user_id_set = set(user_id_mapping.keys())
similarity_user_id_set = set(jaccard_df.index)

# Only users in both sets can go through both recommendation stages.
# The list order comes from set iteration and carries no meaning.
intersection_users = list(model_user_id_set & similarity_user_id_set)

### 상품 추천 모델에 대한 로직

In [29]:
def recommendation_items(user_id, return_items=False):
    # user_id에 대한 이웃유저 20명 추출
    neighbor_users = jaccard_df.loc[user_id].drop(user_id, axis=0).sort_values(ascending=False)
    neighbor_users = neighbor_users[neighbor_users > 0][:20].index

    # 이웃유저들의 cart & purchase 상품 추출
    recomend_product_list = filtered_df[
        (filtered_df['user_id'].isin(neighbor_users)) & (filtered_df['event_type'] == 1)]['product_id'].tolist()
    recomend_product_list = np.unique(recomend_product_list)
    # print(recomend_product_list)

    # 상품 리스트의 product_id를 index로 변환
    product_idx_dict = {}
    for product_id in recomend_product_list:
        product_index = product_id_mapping.get(product_id)
        if not product_index is None:
            product_idx_dict[product_id] = product_index

    # user_id를 index로 변환
    user_index = np.array(user_id_mapping[user_id]).reshape(1,)

    # user_index와 추출된 product_index를 best_model을 통해 예측 후, 전환확률 추출
    product_id_list = []
    product_score_list = []
    products = list(product_idx_dict.values())
    for product_id, product_index in product_idx_dict.items():
        product_index = np.array(product_index).reshape(1,)
        # reshape을 통해 scala > vector로 변환하고 input data로 넣어야 함
        pred_probability = best_model.predict([user_index, product_index])[0][0]

        product_id_list.append(product_id)
        product_score_list.append(pred_probability)

    # 각 product_id별 전환확률에 대한 데이터프레임 생성 후 상위 10개의 product_id 추출
    result_df = pd.DataFrame({'product_id': product_id_list, 'predict': product_score_list}).sort_values('predict', ascending=False)
    recommend_items = result_df[:10].product_id.values.tolist()

    # 만약 모델 기반 협업 필터링을 통해 10개의 상품이 추천되지 않았다면, 나머지는 가장 유사한 이웃유저의 cart & purchase 상품 추천
    if len(recommend_items) < 10:
        extra_num = 10 - len(recommend_items)
        extra_products = list(set(recomend_product_list) - set(recommend_items))[:extra_num]
        recommend_items.extend(extra_products)

    print(f"Recommendation Items : {recommend_items} \nfor User {user_id}")
    if return_items:
        return recommend_items

#### 상품 추천
- 예시로서 3명의 유저에 대한 상품 추천 진행

In [30]:
# Show recommendations for up to three users; slicing avoids an IndexError
# when fewer than three users are available.
for demo_user_id in intersection_users[:3]:
    recommendation_items(demo_user_id)
    print('----------------------------------------------------')

Recommendation Items : [3961719, 4183863, 1821813, 4081772, 3964980, 3829445, 4171147, 4079420, 3791351, 4102739] 
for User 1515915625556410368
----------------------------------------------------
Recommendation Items : [630863] 
for User 1515915625537355784
----------------------------------------------------
Recommendation Items : [4099645, 4101542, 906630, 3961719, 3721192, 942339, 4079566, 866570, 4079565, 879432] 
for User 1515915625533022225
----------------------------------------------------


---

---

## 제언
- 이벤트의 활동이 자주 일어나는 분야에서의 이벤트 데이터이거나 데이터의 수가 많았다면, 좀 더 정확하고 많은 상품 추천이 가능하다고 생각합니다.
- 유저 및 상품에 대한 컨텐츠 정보가 있다면 모델 기반 협업 필터링의 성능이 많이 좋아질 것으로 생각합니다.
- 모델의 성능을 시험해보기 위해 AB 테스트를 진행 : 앞서 진행한 상품 기반의 퍼널 분석에서의 장바구니 전환율을 성공 지표로 설정하여 일정 기간 추적해 증감을 살펴본다면 모델의 효과를 검증할 수 있을 것으로 판단됩니다.