# 1. 개발 환경 소개

- Google Colab GPU 환경에서 진행했습니다.
- 개발 환경은 다음과 같습니다.

In [None]:
# Report the runtime environment of this notebook session.
import platform
print('- os:',platform.platform())
print('- 운영체제:', end="")
# IPython shell escape (not plain Python): prints the contents of
# /etc/issue.net right after the label emitted above.
!cat /etc/issue.net
print('- Process information:', platform.processor())
print('- Process Architecture:', platform.machine())
# NOTE: the RAM figure is a hard-coded string, not a value measured at runtime.
print("- RAM: 12.68GB")

- os: Linux-5.4.188+-x86_64-with-Ubuntu-18.04-bionic
- 운영체제:Ubuntu 18.04.6 LTS
- Process information: x86_64
- Process Architecture: x86_64
- RAM: 12.68GB


In [None]:
# Print a header, then run nvidia-smi through the IPython shell escape.
print('GPU')
!nvidia-smi

GPU
Fri Aug 12 10:32:48 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   63C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Pr

# 2. 라이브러리 불러오기 및 경로 지정

### 라이브러리 불러오기

In [None]:
# 전처리 라이브러리
import pandas as pd
import numpy as np
import os
import ast
from datetime import datetime, timedelta
import datetime
from tqdm import tqdm
import random

# 분석 라이브러리
import torch
import tensorflow as tf
from keras.models import load_model
import torch

# 분석에 문제가 없는 경고 메시지 숨김
import warnings
warnings.filterwarnings('ignore')

### 라이브러리 버전

In [None]:
# Print the interpreter version (via the IPython shell escape) followed by the
# versions of the main libraries imported above.
print('- ', end="")
!python --version
print('- pandas:', pd.__version__)
print('- numpy:', np.__version__)
print('- torch:', torch.__version__)
print('- tensorflow:', tf.__version__)

- Python 3.7.13
- pandas: 1.3.5
- numpy: 1.21.6
- torch: 1.12.0+cu113
- tensorflow: 2.8.2


### 경로 설정 및 데이터 불러오기

In [None]:
# Mount Google Drive at /content/drive (only available inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


코드 실행을 위해 데이터가 있는 폴더를 설정합니다.

In [None]:
# Root folder of the project data on the mounted Drive. The loading and saving
# cells build paths by plain string concatenation (data_dir+'output/...'), so
# the trailing slash is required.
data_dir = '/content/drive/MyDrive/롯데멤버스_경진대회/3. 안다비젼_ 데이터 및 모델 세이브 파일/'

In [None]:
# Load the preprocessed input tables.
data_train = pd.read_csv(data_dir+'output/model_data_train.csv') # all order rows - training split
data_test = pd.read_csv(data_dir+'output/model_data_test.csv') # all order rows - test split
user_data_train = pd.read_csv(data_dir+'output/user_data_train.csv') # per-user order information - training split
user_data_test = pd.read_csv(data_dir+'output/user_data_test.csv') # per-user order information - test split
pd_clac = pd.read_csv(data_dir+'data/LPOINT_BIG_COMP_04_PD_CLAC.csv') # product classification - retailer product category master

# 3. 후보 모델

* 저장해둔 데이터에서 문자열을 리스트로 변환합니다.

In [None]:
# 1. Keep untouched copies of the order tables.
data_train_tmp = data_train.copy()
data_test_tmp = data_test.copy() # used later to build the de-duplicated per-user attribute table


def _parse_sequence(raw, cast):
  """Parse a stringified list and cast every element with ``cast``.

  Returns ``raw`` unchanged when it cannot be parsed (for example a missing
  value, or a cell that was already converted by an earlier run), which keeps
  the original best-effort behaviour without hiding unrelated errors.
  """
  try:
    return list(map(cast, ast.literal_eval(raw)))
  except (ValueError, SyntaxError, TypeError):
    return raw


# 2. List conversion
print("1. 숫자 -> 리스트 변환")
# Wrap each scalar attribute in a one-element list.
for col in ['gender', 'region', 'ages']:
  data_train[col] = [[x] for x in data_train[col]]
  data_test[col] = [[x] for x in data_test[col]]

print("2. 문자열 -> 리스트 변환")
# Element type of every stringified list column: purchase amounts are floats,
# everything else is an integer code.
list_casts = {
    'order': int,
    'product': int,
    'order_dow': int,
    'order_hour_of_day': int,
    'day_since_prior_order': int,
    'buy_am': float,
}
# Whole-column assignment replaces the original chained `df[col][row] = ...`
# writes, which depend on pandas returning a view and are slow row by row.
for col, cast in tqdm(list(list_casts.items())):
  user_data_train[col] = user_data_train[col].apply(_parse_sequence, args=(cast,))
  user_data_test[col] = user_data_test[col].apply(_parse_sequence, args=(cast,))

1. 숫자 -> 리스트 변환
2. 문자열 -> 리스트 변환


100%|██████████| 5/5 [00:27<00:00,  5.54s/it]
100%|██████████| 8751/8751 [00:04<00:00, 1783.73it/s]
100%|██████████| 4074/4074 [00:01<00:00, 2146.71it/s]


* 후보 모델에 필요한 파라미터와 커스텀 레이어를 정의합니다.

In [None]:
# 3. Hyperparameters
EMBEDDING_DIMS = 64
DENSE_UNITS = 2048 # number of units in each dense layer
DROPOUT_PCT = 0.1 # dropout rate (NOTE(review): not referenced by the model cells shown in this notebook)
ALPHA = 0.1 # NOTE(review): not referenced by the model cells shown in this notebook
NUM_CLASSES = data_train["product"].max() + 1 # largest product code in the training orders, plus one
LEARNING_RATE = 0.01

# 4. custom layers 정의
class MaskedEmbeddingsAggregatorLayer(tf.keras.layers.Layer):
    """Aggregate the unmasked embeddings of each sequence along axis 1.

    Args:
        agg_mode: ``'sum'`` or ``'mean'``; how the unmasked embeddings are
            reduced.
        **kwargs: forwarded to ``tf.keras.layers.Layer``.

    Raises:
        NotImplementedError: if ``agg_mode`` is neither ``'sum'`` nor ``'mean'``.
    """

    def __init__(self, agg_mode='sum', **kwargs):
        super().__init__(**kwargs)

        if agg_mode not in ['sum', 'mean']:
            raise NotImplementedError('mode {} not implemented!'.format(agg_mode))
        self.agg_mode = agg_mode

    @tf.function
    def call(self, inputs, mask=None):
        # Drop the masked positions (the result is ragged), then reduce over
        # the sequence axis.
        masked_embeddings = tf.ragged.boolean_mask(inputs, mask)
        if self.agg_mode == 'sum':
            aggregated = tf.reduce_sum(masked_embeddings, axis=1)
        elif self.agg_mode == 'mean':
            aggregated = tf.reduce_mean(masked_embeddings, axis=1)
        return aggregated

    def get_config(self):
        # Start from the base config so the layer name, dtype and trainable
        # flag survive a save/load round trip; the previous version returned
        # only `agg_mode` and silently dropped them.
        config = super().get_config()
        config.update({'agg_mode': self.agg_mode})
        return config
    
class L2NormLayer(tf.keras.layers.Layer):
    """L2-normalise the last axis of the input and pass the mask through."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    @tf.function
    def call(self, inputs, mask=None):
        # Without a mask the input is normalised as is; with one, the masked
        # positions are removed and the ragged result is padded back to dense.
        if mask is None:
            values = inputs
        else:
            values = tf.ragged.boolean_mask(inputs, mask).to_tensor()
        return tf.math.l2_normalize(values, axis=-1)

    def compute_mask(self, inputs, mask):
        # Hand the incoming mask to downstream layers unchanged.
        return mask

* 후보 모델을 정의합니다.

In [None]:
# 5. Candidate-generation model
# One variable-length sequence input per history feature.
# NOTE(review): `input_user` (and the `avg_features` tensor derived from it
# below) is built but is neither concatenated in 5-3 nor listed in the model
# inputs in 5-6 — confirm whether it was meant to be used.
input_user = tf.keras.Input(shape=(None, ), name='user')
input_product_hist = tf.keras.layers.Input(shape=(None,), name='product_hist')
input_order_dow_hist = tf.keras.layers.Input(shape=(None,), name='order_dow_hist')
input_order_hour_of_day_hist = tf.keras.Input(shape=(None, ), name='order_hour_of_day_hist')
input_days_since_prior_order_hist = tf.keras.Input(shape=(None, ), name='days_since_prior_order_hist')
input_buy_am_hist = tf.keras.Input(shape=(None, ), name='buy_am_hist')


# 5-1. Layer construction
features_embedding_layer = tf.keras.layers.Embedding(input_dim=NUM_CLASSES, output_dim=EMBEDDING_DIMS,  mask_zero=True, trainable=True, name='features_embeddings')
labels_embedding_layer = tf.keras.layers.Embedding(input_dim=NUM_CLASSES, output_dim=EMBEDDING_DIMS,  mask_zero=True, trainable=True, name='labels_embeddings')
avg_embeddings = MaskedEmbeddingsAggregatorLayer(agg_mode='mean', name='aggregate_embeddings')

dense_1 = tf.keras.layers.Dense(units=DENSE_UNITS, name='dense_1')
dense_2 = tf.keras.layers.Dense(units=DENSE_UNITS, name='dense_2')
dense_3 = tf.keras.layers.Dense(units=DENSE_UNITS, name='dense_3')
l2_norm_1 = L2NormLayer(name='l2_norm_1')
dense_output = tf.keras.layers.Dense(NUM_CLASSES, activation=tf.nn.softmax, name='dense_output')

# 5-2. Feed the features: embed -> L2-normalise -> masked mean, per input
features_embeddings = features_embedding_layer(input_user)
l2_norm_features = l2_norm_1(features_embeddings)
avg_features = avg_embeddings(l2_norm_features)

# All five history inputs below share the single `labels_embedding_layer`
# table, whose input_dim is NUM_CLASSES.
# NOTE(review): that table is sized from product codes; confirm the other
# features (day of week, hour, days since prior order, purchase amount) stay
# inside [0, NUM_CLASSES).
labels_product_embeddings = labels_embedding_layer(input_product_hist)
l2_norm_product = l2_norm_1(labels_product_embeddings)
avg_product = avg_embeddings(l2_norm_product)

labels_order_dow_embeddings = labels_embedding_layer(input_order_dow_hist)
l2_norm_order_dow = l2_norm_1(labels_order_dow_embeddings)
avg_order_dow = avg_embeddings(l2_norm_order_dow)

labels_order_hour_embeddings = labels_embedding_layer(input_order_hour_of_day_hist)
l2_norm_order_hour = l2_norm_1(labels_order_hour_embeddings)
avg_order_hour = avg_embeddings(l2_norm_order_hour)

labels_since_prior_embeddings = labels_embedding_layer(input_days_since_prior_order_hist)
l2_norm_since_prior = l2_norm_1(labels_since_prior_embeddings)
avg_since_prior = avg_embeddings(l2_norm_since_prior)

labels_buy_am_embeddings = labels_embedding_layer(input_buy_am_hist)
l2_norm_buy_prior = l2_norm_1(labels_buy_am_embeddings)
avg_buy_prior = avg_embeddings(l2_norm_buy_prior)

# 5-3. Concatenate the averaged embedding vectors
concat_inputs = tf.keras.layers.Concatenate(axis=1)([avg_product, avg_order_dow, avg_order_hour, avg_since_prior, avg_buy_prior ])

# 5-4. Dense layers
# NOTE(review): dense_2 and dense_3 consume the ReLU outputs, so
# `dense_1_batch_norm` and `dense_2_batch_norm` are created but never feed
# anything; only `dense_3_batch_norm` reaches the output layer. Confirm
# whether the batch-norm outputs were meant to be chained.
dense_1_features = dense_1(concat_inputs)
dense_1_relu = tf.keras.layers.ReLU(name='dense_1_relu')(dense_1_features)
dense_1_batch_norm = tf.keras.layers.BatchNormalization(name='dense_1_batch_norm')(dense_1_relu)

dense_2_features = dense_2(dense_1_relu)
dense_2_relu = tf.keras.layers.ReLU(name='dense_2_relu')(dense_2_features)
dense_2_batch_norm = tf.keras.layers.BatchNormalization(name='dense_2_batch_norm')(dense_2_relu)

dense_3_features = dense_3(dense_2_relu)
dense_3_relu = tf.keras.layers.ReLU(name='dense_3_relu')(dense_3_features)
dense_3_batch_norm = tf.keras.layers.BatchNormalization(name='dense_3_batch_norm')(dense_3_relu)

outputs = dense_output(dense_3_batch_norm)

# 5-5. Optimizer
optimiser = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)

# 5-6. Build and compile the model
model = tf.keras.models.Model(
    inputs=[input_product_hist,
            input_order_dow_hist,
            input_order_hour_of_day_hist,
            input_days_since_prior_order_hist,
            input_buy_am_hist,
            ],
    outputs=[outputs]
)
model.compile(optimizer=optimiser, loss='sparse_categorical_crossentropy', metrics=['acc']) 
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 product_hist (InputLayer)      [(None, None)]       0           []                               
                                                                                                  
 order_dow_hist (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 order_hour_of_day_hist (InputL  [(None, None)]      0           []                               
 ayer)                                                                                            
                                                                                                  
 days_since_prior_order_hist (I  [(None, None)]      0           []                           

* 입력 데이터 개수를 정의하고 훈련을 진행합니다.

In [None]:
# 6. Model data setup (only a subset is used because of memory limits)
for _frame in (user_data_train, user_data_test):
    _frame['user'] = _frame['user'].astype('int')
train_tmp = user_data_train[user_data_train.user.between(0, 2000)]
test_tmp = user_data_test[user_data_test.user.between(2980, 3020)]

# 7. Train the model
# One padded matrix per history feature, in the order of the model inputs.
# NOTE(review): the 1e-10 offset is added to every padded matrix; presumably it
# keeps padded zeros from being treated as masked positions — confirm.
_history_columns = ['product', 'order_dow', 'order_hour_of_day', 'day_since_prior_order', 'buy_am']
_train_inputs = [
    tf.keras.preprocessing.sequence.pad_sequences(train_tmp[_column]) + 1e-10
    for _column in _history_columns
]
history = model.fit(_train_inputs, train_tmp['predict_labels'].values, epochs=600)

Epoch 1/600
Epoch 2/600
Epoch 3/600
Epoch 4/600
Epoch 5/600
Epoch 6/600
Epoch 7/600
Epoch 8/600
Epoch 9/600
Epoch 10/600
Epoch 11/600
Epoch 12/600
Epoch 13/600
Epoch 14/600
Epoch 15/600
Epoch 16/600
Epoch 17/600
Epoch 18/600
Epoch 19/600
Epoch 20/600
Epoch 21/600
Epoch 22/600
Epoch 23/600
Epoch 24/600
Epoch 25/600
Epoch 26/600
Epoch 27/600
Epoch 28/600
Epoch 29/600
Epoch 30/600
Epoch 31/600
Epoch 32/600
Epoch 33/600
Epoch 34/600
Epoch 35/600
Epoch 36/600
Epoch 37/600
Epoch 38/600
Epoch 39/600
Epoch 40/600
Epoch 41/600
Epoch 42/600
Epoch 43/600
Epoch 44/600
Epoch 45/600
Epoch 46/600
Epoch 47/600
Epoch 48/600
Epoch 49/600
Epoch 50/600
Epoch 51/600
Epoch 52/600
Epoch 53/600
Epoch 54/600
Epoch 55/600
Epoch 56/600
Epoch 57/600
Epoch 58/600
Epoch 59/600
Epoch 60/600
Epoch 61/600
Epoch 62/600
Epoch 63/600
Epoch 64/600
Epoch 65/600
Epoch 66/600
Epoch 67/600
Epoch 68/600
Epoch 69/600
Epoch 70/600
Epoch 71/600
Epoch 72/600
Epoch 73/600
Epoch 74/600
Epoch 75/600
Epoch 76/600
Epoch 77/600
Epoch 78

In [None]:
# 8. Save the trained candidate-generation model (HDF5 file under data_dir)
model.save(data_dir+'output/candidate_generation.h5')

* 테스트 데이터에 대해 후보군을 추출합니다.

In [None]:
# 9. Candidate generation
# Same feature order and padding as used for training.
_history_columns = ['product', 'order_dow', 'order_hour_of_day', 'day_since_prior_order', 'buy_am']
_test_inputs = [
    tf.keras.preprocessing.sequence.pad_sequences(test_tmp[_column]) + 1e-10
    for _column in _history_columns
]
pred = model.predict(_test_inputs)

N = 40 # number of items to recommend per user
# Top-N class indices per user, pooled over all users into one sorted,
# de-duplicated array of candidate product codes.
_top_n = np.sort((-pred).argsort()[:, :N])
k = _top_n.flatten()
k[k > data_train["product"].max()] = 0
k = np.unique(k)

# 4. 순위 모델

* 전처리 과정에서 사용될 함수를 정의합니다.

In [None]:
def normalize_col(df,col_name): # min-max normalisation
    """Min-max scale ``df[col_name]`` into [0, 1] in place.

    Args:
        df: DataFrame holding the column; it is modified in place.
        col_name: name of the numeric column to scale.

    Returns:
        The same ``df`` object, for call chaining.

    A column whose values are all equal has no range to scale by; it is mapped
    to 0.0 instead of the NaN produced by the former 0/0 division.
    """
    column = df[col_name]
    low = column.min()
    span = column.max() - low
    if span == 0:
        # Constant column: keep missing values missing, everything else 0.0.
        df[col_name] = (column - low).astype(float)
    else:
        df[col_name] = (column - low) / span
    return df

def get_aisles(products, aisles):
  """Add an ``all_aisles`` column listing the active aisle codes of each row.

  For every row of ``products`` the indicator columns named in ``aisles`` are
  read in order; each one equal to 1 contributes the code looked up in the
  global ``aisles_encoded``. The codes are joined with commas, and a row with
  no active indicator gets ``'0'``. A name repeated in ``aisles`` contributes
  its code once per repetition. ``products`` is modified in place.
  """
  def _codes_for_row(flags):
    codes = []
    for aisle_name, flag in zip(aisles, flags):
      if flag == 1:
        codes.append(str(aisles_encoded[aisle_name]))
    return ','.join(codes) if codes else '0'

  indicator_columns = [products[aisle_name] for aisle_name in aisles]
  products['all_aisles'] = [_codes_for_row(row) for row in zip(*indicator_columns)]

* 후보군 상품을 사용해 순위모델 데이터를 전처리합니다.

In [None]:
# 1. Data preparation
# Rename the four category-master columns, then one-hot encode the `aisle`
# column (the renamed last column of pd_clac).
product_m = pd_clac.copy()
product_m.columns = ['product_id', 'pd_nm', 'department', 'aisle']
product_enc = pd.get_dummies(product_m, columns=['aisle'], prefix=[None])
product_enc.columns = ['product_id', 'product_name', 'department'] + list(product_enc.columns[3:])

# 2. Item attribute preprocessing
# `aisle_cols` has one entry per row of pd_clac (it is not de-duplicated), so
# when a name repeats, `aisles_encoded` keeps the index of its last occurrence.
aisle_cols = pd_clac['clac_mcls_nm'].values.tolist()
type(aisle_cols)  # bare expression; its value is discarded
aisles_encoded = {x: i for i, x in enumerate(aisle_cols)}
get_aisles(product_enc, aisle_cols)

# 3. Candidate-related data
ratings_train = data_train[['product_id', 'product', 'reordered', 'user_id', 'user']]
ratings_test = data_test[['product_id', 'product', 'reordered', 'user_id', 'user']]

# Keep only the rows selected by the candidate codes `k`.
# NOTE(review): `.loc[k+1]` looks rows up by label on the reset index; this
# assumes candidate code c corresponds to row c+1 of the product table sorted
# by product_id, and raises KeyError for labels past the end — confirm.
product_data = product_enc.set_index(['product_id']).sort_index()
product_data = product_data.reset_index().loc[k+1]
pd_nms = product_enc["product_name"].unique().tolist()
pd_name2pd_name_encoded = {x: i for i, x in enumerate(pd_nms)}
product_data["pd_name_d"] = product_data["product_name"].map(pd_name2pd_name_encoded)

# 4. Data for the model
new_data_train = product_data.merge(ratings_train, on='product_id') # attach the order rows (ratings)
new_data_test = product_data.merge(ratings_test, on='product_id') # attach the order rows (ratings)
for_merge = pd.concat([data_train_tmp, data_test_tmp], axis = 0)[['user', 'gender', 'region', 'ages']].drop_duplicates() # per-user attribute table

In [None]:
def generate_final_data(typ):
  """Build the per-user ranking dataset for one split and publish it as globals.

  Args:
      typ: split suffix, e.g. ``'train'`` or ``'test'``.

  Reads the globals ``new_data_<typ>`` and ``for_merge``. Rebinds
  ``new_data_<typ>`` and creates ``product_list_<typ>``, ``aisle_list_<typ>``,
  ``pd_name_list_<typ>`` and ``dataset_<typ>``. Returns ``None``.

  Results are the same as before; the column maximum is now computed once
  instead of once per row, and the slice is copied before a column is added.
  """
  env = globals()

  # product: integer product code, user: integer user code, reordered: reorder
  # flag, all_aisles: aisle codes of the product, pd_name_d: product-name code.
  new_data = env['new_data_'+typ][['product', 'user', 'reordered', 'all_aisles', 'pd_name_d']].copy()
  # product_type: 'like' when the item was reordered, otherwise 'dislike'.
  new_data['product_type'] = np.where(new_data['reordered'] == 1, 'like', 'dislike')
  env['new_data_'+typ] = new_data

  product_max = new_data["product"].max()
  filler = product_max + 1  # placeholder code for "no product"

  # Products collected into one list per (user, product_type).
  product_list = new_data.groupby(['user', 'product_type'])['product'].apply(list).reset_index()
  env['product_list_'+typ] = product_list

  # Aisle information per user.
  # NOTE(review): set(','.join(x)) is a set of single CHARACTERS, so a
  # multi-digit aisle code is broken into its digits here. Kept as is because
  # changing it alters the model inputs — confirm the intended behaviour.
  aisle_list = new_data.groupby(['user'])['all_aisles'].unique().apply(list).reset_index()
  aisle_list['all_aisles'] = aisle_list['all_aisles'].apply(
      lambda parts: [ch for ch in set(','.join(parts)) if ch.isdigit()])
  env['aisle_list_'+typ] = aisle_list

  # Product-name codes collected into one list per user.
  pd_name_list = new_data.groupby(['user'])['pd_name_d'].apply(list).reset_index()
  env['pd_name_list_'+typ] = pd_name_list

  # One row per user with a 'like' and a 'dislike' list.
  dataset = product_list.pivot(index='user', columns='product_type', values='product').reset_index()
  dataset.fillna(filler, inplace=True)
  for col in ['like', 'dislike']:
    # Anything that is not a list (a filled missing cell) becomes an empty list.
    dataset[col] = dataset[col].apply(lambda x: x if isinstance(x, list) else [])
  dataset = pd.merge(dataset, pd_name_list, how='left', on='user')
  dataset = pd.merge(dataset, aisle_list, how='left', on='user')
  dataset = pd.merge(dataset, for_merge, how='left', on='user').reset_index(drop=True)

  # Wrap gender, region and ages in one-element lists.
  for col in ['gender', 'region', 'ages']:
    dataset[col] = [[x] for x in dataset[col]]

  # Fill in predict_labels.
  # NOTE(review): the label is a random integer drawn per user, not derived
  # from the user's purchases — confirm this is intended.
  dataset['predict_labels'] = dataset['like'].apply(
      lambda _: int(random.uniform(1, product_max)))

  # Empty lists get the placeholder code (to be replaced later).
  for col in ['like', 'dislike']:
    dataset[col] = dataset[col].apply(lambda x: [filler] if x == [] else x)

  env['dataset_'+typ] = dataset


# 5. Build the model input data for both splits and save it
generate_final_data('train')
generate_final_data('test')

# dataset_train / dataset_test are created as globals by generate_final_data.
dataset_train.to_csv(data_dir+'output/dataset_train.csv', index=False)
dataset_test.to_csv(data_dir+'output/dataset_test.csv', index=False)

* 훈련에 사용할 데이터를 정의합니다.

In [None]:
# 6. Select the input rows (only a subset is used because of memory limits)
dataset_train['user'] = dataset_train['user'].astype('int')
dataset_test['user'] = dataset_test['user'].astype('int')

# Keep rows whose index label lies in 0..2000 (train) and 0..40 (test), both
# bounds inclusive.
tmp_train_r = dataset_train[(dataset_train.index >= 0)&(dataset_train.index <= 2000)]
tmp_test_r = dataset_test[(dataset_test.index >= 0)&(dataset_test.index <= 40)]

In [None]:
# Display the selected training rows as the cell output.
tmp_train_r

Unnamed: 0,user,dislike,like,pd_name_d,all_aisles,gender,region,ages,predict_labels
0,0,"[197, 197, 197, 197, 197, 197, 824]",[1358],"[322, 322, 322, 322, 322, 322, 929]","[9, 3, 2, 6]",[0],[0],[0],245
1,1,"[197, 197, 197, 197, 197, 197, 122, 259]",[1358],"[322, 322, 322, 322, 322, 322, 934, 946]","[4, 8, 9, 3, 2, 7, 6]",[0],[1],[0],1023
2,3,"[317, 122]",[1358],"[223, 934]","[8, 9, 3, 2, 6]",[0],[2],[0],998
3,5,"[741, 479, 479, 402, 402, 24, 562, 662]","[479, 479, 479, 479, 479, 402]","[907, 947, 947, 947, 947, 947, 947, 947, 951, ...","[0, 5, 4, 8, 9, 3, 6]",[1],[2],[0],1330
4,9,[479],[1358],[947],"[5, 9, 4]",[0],[4],[0],529
...,...,...,...,...,...,...,...,...,...
1996,3931,[317],[1358],[223],"[2, 6]",[0],[1],[1],948
1997,3933,"[122, 259]",[1358],"[934, 946]","[4, 8, 9, 3, 7]",[0],[8],[0],704
1998,3936,"[275, 259]",[1358],"[940, 946]","[9, 4, 7]",[0],[1],[0],38
1999,3937,[122],[1358],[934],"[3, 9, 8]",[1],[4],[3],958


* 훈련에 사용할 파라미터를 정의합니다.

In [None]:
# 7. Hyperparameters (same values as the candidate-model cell, re-declared here)
EMBEDDING_DIMS = 64
DENSE_UNITS = 2048 # number of units in each dense layer
DROPOUT_PCT = 0.1 # dropout rate (NOTE(review): not referenced by the model cells shown in this notebook)
ALPHA = 0.1 # NOTE(review): not referenced by the model cells shown in this notebook
NUM_CLASSES = data_train["product"].max() + 1 # largest product code in the training orders, plus one
LEARNING_RATE = 0.01

* 훈련에 사용할 모델을 정의합니다.

In [None]:
# 8. Ranking model
# 8-1. Inputs, embeddings and layers
# One variable-length sequence input per feature.
input_name = tf.keras.Input(shape=(None, ), name='product_name')
inp_item_liked = tf.keras.layers.Input(shape=(None,), name='like')
inp_item_disliked = tf.keras.layers.Input(shape=(None,), name='dislike')
input_aisle = tf.keras.Input(shape=(None, ), name='aisle')
input_gender = tf.keras.Input(shape=(None, ), name='gender')
input_region = tf.keras.Input(shape=(None, ), name='region')
input_ages = tf.keras.Input(shape=(None, ), name='ages')

features_embedding_layer = tf.keras.layers.Embedding(input_dim=NUM_CLASSES, output_dim=EMBEDDING_DIMS,  mask_zero=True, trainable=True, name='features_embeddings')
labels_embedding_layer = tf.keras.layers.Embedding(input_dim=NUM_CLASSES, output_dim=EMBEDDING_DIMS, mask_zero=True, trainable=True, name='labels_embeddings')
avg_embeddings = MaskedEmbeddingsAggregatorLayer(agg_mode='mean', name='aggregate_embeddings')

dense_1 = tf.keras.layers.Dense(units=DENSE_UNITS, name='dense_1')
dense_2 = tf.keras.layers.Dense(units=DENSE_UNITS, name='dense_2')
dense_3 = tf.keras.layers.Dense(units=DENSE_UNITS, name='dense_3')
l2_norm_1 = L2NormLayer(name='l2_norm_1')
dense_output = tf.keras.layers.Dense(NUM_CLASSES, activation=tf.nn.softmax, name='dense_output')

# 8-2. Embedding vectors: embed -> L2-normalise -> masked mean, per input
# The product-name input has its own embedding table; every other input below
# shares `labels_embedding_layer`.
features_embeddings = features_embedding_layer(input_name)
l2_norm_features = l2_norm_1(features_embeddings)
avg_features = avg_embeddings(l2_norm_features)

labels_liked_embeddings = labels_embedding_layer(inp_item_liked)
l2_norm_liked = l2_norm_1(labels_liked_embeddings)
avg_liked = avg_embeddings(l2_norm_liked)

labels_disliked_embeddings = labels_embedding_layer(inp_item_disliked)
l2_norm_disliked = l2_norm_1(labels_disliked_embeddings)
avg_disliked = avg_embeddings(l2_norm_disliked)

labels_aisle_embeddings = labels_embedding_layer(input_aisle)
l2_norm_aisle = l2_norm_1(labels_aisle_embeddings)
avg_aisle = avg_embeddings(l2_norm_aisle)

labels_gender_embeddings = labels_embedding_layer(input_gender)
l2_norm_gender = l2_norm_1(labels_gender_embeddings)
avg_gender = avg_embeddings(l2_norm_gender)

labels_region_embeddings = labels_embedding_layer(input_region)
l2_norm_region = l2_norm_1(labels_region_embeddings)
avg_region = avg_embeddings(l2_norm_region)

labels_ages_embeddings = labels_embedding_layer(input_ages)
l2_norm_ages = l2_norm_1(labels_ages_embeddings)
avg_ages = avg_embeddings(l2_norm_ages)


# 8-3. Concatenate the averaged embedding vectors
concat_inputs = tf.keras.layers.Concatenate(axis=1)([avg_features,
                                                     avg_liked,
                                                     avg_disliked,
                                                     avg_aisle,
                                                     avg_gender,
                                                     avg_region,
                                                     avg_ages
                                                     ])
# 8-4. Dense layers
# NOTE(review): dense_2 and dense_3 consume the ReLU outputs, so
# `dense_1_batch_norm` and `dense_2_batch_norm` are created but never feed
# anything; only `dense_3_batch_norm` reaches the output layer. Confirm
# whether the batch-norm outputs were meant to be chained.
dense_1_features = dense_1(concat_inputs)
dense_1_relu = tf.keras.layers.ReLU(name='dense_1_relu')(dense_1_features)
dense_1_batch_norm = tf.keras.layers.BatchNormalization(name='dense_1_batch_norm')(dense_1_relu)

dense_2_features = dense_2(dense_1_relu)
dense_2_relu = tf.keras.layers.ReLU(name='dense_2_relu')(dense_2_features)
dense_2_batch_norm = tf.keras.layers.BatchNormalization(name='dense_2_batch_norm')(dense_2_relu)

dense_3_features = dense_3(dense_2_relu)
dense_3_relu = tf.keras.layers.ReLU(name='dense_3_relu')(dense_3_features)
dense_3_batch_norm = tf.keras.layers.BatchNormalization(name='dense_3_batch_norm')(dense_3_relu)
outputs = dense_output(dense_3_batch_norm)

# 8-5. Optimizer (plain SGD here; the candidate model uses Adam)
optimiser = tf.keras.optimizers.SGD(learning_rate=LEARNING_RATE)

# 8-6. Build the model
model_rank = tf.keras.models.Model(
    inputs=[input_name, 
            inp_item_liked, 
            inp_item_disliked,
            input_aisle,
            input_gender,
            input_region,
            input_ages
            ],
    outputs=[outputs]
)
# `datetime` is the module here: the later `import datetime` in the import
# cell rebinds the name first bound by `from datetime import datetime`.
logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
# NOTE(review): this callback is created but the training cell does not pass
# it to `fit`, so no TensorBoard logs are written — confirm.
tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)
model_rank.compile(optimizer=optimiser, loss='sparse_categorical_crossentropy', metrics=['acc'])
model_rank.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 product_name (InputLayer)      [(None, None)]       0           []                               
                                                                                                  
 like (InputLayer)              [(None, None)]       0           []                               
                                                                                                  
 dislike (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 aisle (InputLayer)             [(None, None)]       0           []                               
                                                                                            

In [None]:
# 9. Train the ranking model
# One padded matrix per feature column, in the order of the model inputs
# (product_name, like, dislike, aisle, gender, region, ages).
_rank_columns = ['pd_name_d', 'like', 'dislike', 'all_aisles', 'gender', 'region', 'ages']
_rank_train_inputs = [
    tf.keras.preprocessing.sequence.pad_sequences(tmp_train_r[_column]) + 1e-10
    for _column in _rank_columns
]
ratings = model_rank.fit(_rank_train_inputs, tmp_train_r['predict_labels'].values, epochs=300)

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

In [None]:
# 9 (cont.). Save the trained ranking model (HDF5 file under data_dir)
model_rank.save(data_dir+'output/ranking.h5')

In [None]:
# 10. Extract the ranked products
# Same feature order and padding as used for training the ranking model.
_rank_columns = ['pd_name_d', 'like', 'dislike', 'all_aisles', 'gender', 'region', 'ages']
_rank_test_inputs = [
    tf.keras.preprocessing.sequence.pad_sequences(tmp_test_r[_column]) + 1e-10
    for _column in _rank_columns
]
pred = model_rank.predict(_rank_test_inputs)

N = 40 # number of recommendations kept per user
# Class indices of the N highest-scoring classes per user, best first.
ranking = (-pred).argsort()[:, :N]
# Scores of exactly those classes, in the same order as `ranking`. The former
# `np.sort(pred[:, :N])` returned the scores of classes 0..N-1 instead, which
# are unrelated to the ranked items. Taken before the clipping below so the
# indices still point at the real columns.
ranking_probability = np.take_along_axis(pred, ranking, axis=1)
ranking[ranking > data_train['product'].max()] = 0

# 5. 평가지표값 확인

* 생성한 모델의 평가지표값(dcg, ndcg, idcg)을 계산하는 함수를 구현합니다.

In [None]:
# 1. 라벨값과 라벨값에 해당하는 추천 아이템 데이터 (train, test)
# Label -> liked-products lookup, built from the training rows.
train_df = tmp_train_r[['like', 'predict_labels']].rename(
    columns={'like': 'like_id', 'predict_labels': 'label'})

# Ground truth per test user: the products they actually liked.
_true_renamed = tmp_test_r[['user', 'like', 'predict_labels']].rename(
    columns={'like': 'true_like_id', 'predict_labels': 'true_labels'})
true_df = _true_renamed[['user', 'true_like_id']]

In [None]:
# 2. 고객당 추천 상품 데이터 추출
def user_rec(n , rank, rec_df, train_df):
  """Append one row per recommended label for a single user.

  Args:
      n: user position; row ``n-1`` of ``rank`` is read (so ``n=0`` reads the
          last row).
      rank: 2-D array of recommended labels, one row per user.
      rec_df: frame with columns ``label`` and ``rec_id`` to extend.
      train_df: frame with columns ``like_id`` and ``label``.

  Returns:
      A new frame holding the rows of ``rec_df`` followed by one row per label,
      where ``rec_id`` is the list of ``like_id`` values of every ``train_df``
      row carrying that label. The index is renumbered from 0. ``rec_df``
      itself is returned when the user has no labels.
  """
  # Collect all rows first and concatenate once; this replaces the repeated
  # DataFrame.append calls (quadratic, and removed in pandas 2.0).
  rows = [
      [label, train_df['like_id'][train_df['label'] == label].tolist()]
      for label in rank[n-1]
  ]
  if not rows:
    return rec_df
  new_rows = pd.DataFrame(rows, columns=['label', 'rec_id'])
  return pd.concat([rec_df, new_rows], ignore_index=True)

# Build one recommendation frame per test user and store it under the dynamic
# global name rec_df<i>.
# NOTE(review): i starts at 0 while user_rec reads rank[n-1], so rec_df0 is
# built from the LAST row of `ranking`. true_like applies the same n-1 offset,
# so the two stay aligned with each other — confirm the shift is intended.
for i in range(len(tmp_test_r)):
    user_rec_data = pd.DataFrame(columns=['label', 'rec_id'])
    globals()['rec_df'+str(i)] = user_rec(i, ranking, user_rec_data, train_df)

In [None]:
# 2. 각 유저별 추천목록의 상품들 중 실제로 재구매한(좋아한) 것이 있는지 확인
def true_like(n, rec_df, true_df):
  """Flag, for every recommended label, whether it hits the user's liked item.

  Args:
      n: user position; row ``n-1`` of ``true_df`` is read (so ``n=0`` reads
          the last row).
      rec_df: frame with a ``rec_id`` column; each cell is a list of product
          lists. It is modified in place.
      true_df: frame with a ``true_like_id`` column holding the liked products
          of each user.

  Returns:
      ``rec_df`` with a ``T/F`` column: 1.0 when the user's first liked product
      appears in the first product list of that row's ``rec_id``, otherwise
      0.0 (including rows whose ``rec_id`` is empty).
  """
  liked = true_df['true_like_id'].tolist()[n-1]
  # Only the first liked product is compared, as before.
  target = liked[0] if len(liked) > 0 else None

  flags = []
  for candidates in rec_df['rec_id'].tolist():
    hit = target is not None and len(candidates) > 0 and target in candidates[0]
    flags.append(1.0 if hit else 0.0)

  # Written once for every row. The column used to be assigned only inside the
  # non-empty branch, which left NaN for trailing empty rows and no column at
  # all when every row was empty.
  rec_df['T/F'] = flags
  return rec_df

# Add the T/F hit column to every per-user frame rec_df<i>, replacing the
# global of the same name with the returned frame.
for i in range(len(tmp_test_r)):
    globals()['rec_df'+str(i)] = true_like(i, globals()['rec_df'+str(i)], true_df)

In [None]:
# 3. 각 유저별 dcg, idcg, ndcg 계산
def get_ndcg(rec_df):
  """Compute DCG, IDCG and NDCG for one user's recommendation list.

  Args:
      rec_df: frame with a ``T/F`` column, ordered by rank (best first); a
          value of 1.0 marks a hit.

  Returns:
      A one-row DataFrame with columns ``dcg``, ``idcg`` and ``ndcg``.

  DCG adds ``1/log2(position+1)`` (1-based position) for every hit. IDCG is
  the DCG of a list of the same length in which every position is a hit, so a
  perfect list scores exactly 1.0. The former version summed one extra term,
  which capped NDCG below 1.0. An empty list yields an NDCG of 0.0.
  """
  hits = rec_df['T/F'].tolist()

  # dcg - ranking quality of this model
  dcg = sum((1.0/np.log2(pos+1+1) for pos, hit in enumerate(hits) if hit == 1.0), 0.0)
  # idcg - ranking quality of an ideal model over the same number of positions
  idcg = sum((1.0/np.log2(pos+1+1) for pos in range(len(hits))), 0.0)
  # ndcg - their ratio
  ndcg = dcg / idcg if idcg > 0 else 0.0

  return pd.DataFrame([[dcg, idcg, ndcg]], columns=['dcg', 'idcg', 'ndcg'])

# Compute the metric row of every test user and store it under the dynamic
# global name ndcg_df<i>.
for i in range(len(tmp_test_r)):
    globals()['ndcg_df'+str(i)]  = get_ndcg(globals()['rec_df'+str(i)])

In [None]:
# 4. user별 dcg, idcg, ndcg 결과 취합
def concat_result(df1, df2):
  """Return ``df1`` and ``df2`` stacked row-wise, keeping their index labels."""
  return pd.concat([df1, df2])

# Stack the per-user metric rows ndcg_df<i> in a single concat. This also
# works with one test user (the former code required ndcg_df0 and ndcg_df1 to
# exist) and avoids re-copying the growing frame on every iteration.
result = pd.concat([globals()['ndcg_df'+str(i)] for i in range(len(tmp_test_r))])

print('[ ranking model의 dcg, idcg, ndcg평균 ] \n', result.mean())

[ ranking model의 dcg, idcg, ndcg평균 ] 
 dcg      2.220527
idcg    11.276482
ndcg     0.196917
dtype: float64
