In [None]:
import pandas as pd
import numpy as np
from statistics import mean
import os 

import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model

from sklearn.metrics import ndcg_score, dcg_score, roc_auc_score

from scipy.stats import entropy

In [None]:
# Colab-only step: mount Google Drive at /content/drive so files stored there can be read.
from google.colab import drive 
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Display the current working directory; later cells read data through relative paths.
os.getcwd()

'/content/drive/My Drive/초록샵'

In [None]:
# Switch into the project folder so the relative paths in later cells resolve against it.
# NOTE(review): hard-coded, Colab-specific absolute path — adjust when running elsewhere.
path = r'/content/drive/MyDrive/초록샵'
os.chdir(path)

## 1. Loading Train/Test Dataset

In [None]:
# Load the training interactions; index_col=0 uses the first CSV column as the row index.
df_train = pd.read_csv("pre_dataset/trainset/trainset_20211231.csv", index_col = 0)
print("데이터 크기", df_train.shape)  # label text means "data size"; prints (rows, columns)
# Preview the first ten rows.
df_train.head(10)

데이터 크기 (173535, 3)


Unnamed: 0,user_id,item_id,label
0,551,234,1
1,551,226,0
2,551,229,0
3,551,29,0
4,551,129,0
5,1195,96,1
6,1195,164,0
7,1195,169,0
8,1195,126,0
9,1195,101,0


In [None]:
# Relative frequency of each label value in the training set.
df_train.label.value_counts(normalize = True)

0    0.8
1    0.2
Name: label, dtype: float64

In [None]:
# Load the test interactions; index_col=0 uses the first CSV column as the row index.
df_test = pd.read_csv("pre_dataset/testset/testset_ds1.csv", index_col = 0 )
print("데이터 크기", df_test.shape)  # label text means "data size"; prints (rows, columns)
# Preview the first ten rows.
df_test.head(10)

데이터 크기 (3237, 3)


Unnamed: 0,user_id,item_id,label
0,8389,0,0
1,8389,1,0
2,8389,2,0
3,8389,3,0
4,8389,4,0
5,8389,5,0
6,8389,6,0
7,8389,7,0
8,8389,8,0
9,8389,9,0


In [None]:
# Load the full dataset workbook.
df_all = pd.read_excel('pre_dataset/trainset/dataset_20211231.xlsx')
# Replace missing values in the '밀키트' column with 0. The result is assigned back to the
# frame: calling fillna(inplace=True) on a `.loc` selection may operate on a temporary
# object and leave df_all unchanged (always the case with pandas Copy-on-Write).
df_all['밀키트'] = df_all['밀키트'].fillna(0)

# Give the columns English names; this assignment requires the sheet to have exactly three columns.
df_all.columns = ['Date', 'Channel', 'item']

In [None]:
# User and item lookup tables; their row counts are later used as the embedding vocabulary sizes.
# 'utf-8-sig' strips a leading byte-order mark if the files contain one.
user_info = pd.read_csv('pre_dataset/user_info.csv', encoding = 'utf-8-sig', index_col = 0 )
item_info = pd.read_csv('pre_dataset/item_info.csv', encoding = 'utf-8-sig', index_col = 0 )

## 2. Convert to Array Type

In [None]:
# Training columns as (n_rows, 1) column vectors, the shape fed to the model's Input(shape=(1,)) layers.
train_user = df_train["user_id"].to_numpy().reshape(-1, 1)
train_item = df_train["item_id"].to_numpy().reshape(-1, 1)
train_label = df_train["label"].to_numpy().reshape(-1, 1)

In [None]:
# Test columns as (n_rows, 1) column vectors, mirroring the training arrays.
test_user = df_test["user_id"].to_numpy().reshape(-1, 1)
test_item = df_test["item_id"].to_numpy().reshape(-1, 1)
test_label = df_test["label"].to_numpy().reshape(-1, 1)

In [None]:
# Stack train and test (original row indices are kept) and re-encode both id columns
# as integer category codes.
# NOTE(review): the training and evaluation cells visible in this notebook use the raw ids
# from df_train / df_test, not this re-encoded frame — confirm whether `df` is still needed
# or was meant to feed the model.
df = pd.concat([df_train, df_test], ignore_index = False)
df['user_id'] = df['user_id'].astype("category").cat.codes
df['item_id'] = df['item_id'].astype("category").cat.codes
df.head()

Unnamed: 0,user_id,item_id,label
0,547,234,1
1,547,47,0
2,547,211,0
3,547,38,0
4,547,53,0


## 3. Model

In [None]:
# Fix TensorFlow's global seed so weight initialisation is repeatable.
tf.random.set_seed(1234)
# Embedding vocabulary sizes, taken from the row counts of the info tables.
# NOTE(review): the model is fed raw user_id / item_id values, and an Embedding layer only
# accepts indices in [0, input_dim) — confirm that every id is smaller than these counts.
user_num, item_num = user_info.shape[0], item_info.shape[0]

# User embedding: one integer id in, a 32-dimensional vector out.
# `input_length` is not passed: it is deprecated in current Keras and the sequence length
# is already fixed to 1 by Input(shape=(1,)).
user = Input(shape=(1,), dtype='int32')
user_embedding = Embedding(user_num, 32)(user)
user_embedding = Flatten()(user_embedding)

# Item embedding (same layout as the user branch)
item = Input(shape=(1,), dtype='int32')
item_embedding = Embedding(item_num, 32)(item)
item_embedding = Flatten()(item_embedding)

# Merge: element-wise product of the two vectors. Despite its name, `concatenated`
# holds a product, not a concatenation.
concatenated = Multiply()([user_embedding, item_embedding])


# Output: a single sigmoid unit producing a score in (0, 1) for the (user, item) pair
output_layer = Dense(1, activation='sigmoid', name='output_layer')(concatenated)

# Model: two id inputs -> score, trained with binary cross-entropy
model = Model([user, item], output_layer)
model.compile(optimizer='adam', loss='binary_crossentropy')

In [None]:
# Train on the (user, item) -> label pairs. No validation data or extra metrics are
# configured, so only the training loss is reported per epoch.
model.fit([train_user, train_item],train_label, epochs = 30, batch_size = 256, verbose = 1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7fa05dfbb2d0>

In [None]:
# Persist the trained model under train_parameter/GMF_model (relative to the working directory).
model.save('train_parameter/GMF_model')

INFO:tensorflow:Assets written to: train_parameter/GMF_model/assets


  layer_config = serialize_layer_fn(layer)
  return generic_utils.serialize_keras_object(obj)


In [None]:
# Reload the GMF model written by the save cell so evaluation can run without retraining.
# The path previously pointed at 'train_parameter/NCF_model', which this notebook never
# writes; loading it would evaluate a different model under the GMF name.
model = tf.keras.models.load_model('train_parameter/GMF_model')

## 4. Performance Evaluation

In [None]:
def evaluation_GMF(test_data, predictor=None, max_k=10):
    """Compute mean Precision@k, Recall@k and F1@k for k = 1..max_k.

    Parameters
    ----------
    test_data : pandas.DataFrame
        Needs ``user_id``, ``item_id`` and ``label`` columns. ``label`` is summed
        per user to obtain that user's number of relevant (purchased) items.
    predictor : object, optional
        Any object exposing ``predict([user_ids, item_ids])`` where both inputs
        are ``(n, 1)`` arrays and the result holds one score per row. When
        omitted, the notebook-level ``model`` is used.
    max_k : int, optional
        Largest cut-off to evaluate. Defaults to 10, the previously hard-coded range.

    Returns
    -------
    tuple of list
        ``(precision_mean, recall_mean, f1_score_mean)``; each list has one
        user-averaged value per k, in increasing order of k.

    Notes
    -----
    A user without any positive label contributes recall 0 and F1 0 instead of
    a division by zero.
    """
    if predictor is None:
        predictor = model  # trained model from the notebook namespace

    # Scores do not depend on k, so score the whole test set once instead of
    # calling predict for every (k, user) combination.
    user_ids = test_data["user_id"].to_numpy()[:, np.newaxis]
    item_ids = test_data["item_id"].to_numpy()[:, np.newaxis]
    scored = test_data.copy()
    scored["predict_label"] = np.asarray(predictor.predict([user_ids, item_ids])).reshape(-1)

    # Labels of each user's candidates ordered from best to worst score.
    # A stable sort keeps tied scores in a deterministic order; groupby keeps
    # the row order inside each group.
    ranked = scored.sort_values("predict_label", ascending=False, kind="mergesort")
    ranked_labels = [
        rows["label"].to_numpy() for _, rows in ranked.groupby("user_id", sort=False)
    ]

    precision_mean, recall_mean, f1_score_mean = [], [], []
    for k in range(1, max_k + 1):
        precision, recall, f1_score = [], [], []
        for labels in ranked_labels:
            hits = float(labels[:k].sum())   # relevant items inside the top-k
            relevant = float(labels.sum())   # all relevant (purchased) items of this user
            precision_temp = hits / k
            recall_temp = hits / relevant if relevant else 0.0
            pr = precision_temp + recall_temp
            f1_temp = 2 * precision_temp * recall_temp / pr if pr != 0 else 0.0
            precision.append(precision_temp)
            recall.append(recall_temp)
            f1_score.append(f1_temp)
        precision_mean.append(mean(precision))
        recall_mean.append(mean(recall))
        f1_score_mean.append(mean(f1_score))
    return precision_mean, recall_mean, f1_score_mean

In [None]:
# Top-k precision / recall / F1 on the test set, one value per k.
precision, recall, f1_score = evaluation_GMF(df_test)

In [None]:
# Tabulate the metrics against their cut-off k (1..10, matching evaluation_GMF's range).
k = list(range(1,11))
# NOTE(review): the 'Pricision' key is a misspelling of 'Precision'; it is the displayed column name.
result = {'Top-k': k, 'Recall': recall, 'Pricision': precision, 'F1':f1_score}
pd.DataFrame(result)

Unnamed: 0,Top-k,Recall,Pricision,F1
0,1,0.029579,0.230769,0.051026
1,2,0.029579,0.115385,0.044989
2,3,0.05522,0.102564,0.065954
3,4,0.05522,0.076923,0.058553
4,5,0.069414,0.092308,0.070866
5,6,0.072619,0.089744,0.070133
6,7,0.09826,0.087912,0.080456
7,8,0.09826,0.076923,0.074742
8,9,0.09826,0.068376,0.069847
9,10,0.123901,0.069231,0.077434


In [None]:
def dcg_at_k(r, k, method=0):
    """Discounted cumulative gain of the first ``k`` relevance scores in ``r``.

    Parameters
    ----------
    r : sequence of numbers
        Relevance scores in ranked order (best-ranked item first).
    k : int
        Number of leading results to take into account.
    method : {0, 1}, optional
        0: the first two positions are undiscounted, position i >= 2 is divided
        by log2(i). 1: position i (1-based) is divided by log2(i + 1).

    Returns
    -------
    float
        The DCG value; 0 when no score falls inside the first ``k`` positions.

    Raises
    ------
    ValueError
        If ``method`` is neither 0 nor 1 (checked only for non-empty input).
    """
    # np.asfarray was removed in NumPy 2.0; asarray with dtype=float is equivalent.
    r = np.asarray(r, dtype=float)[:k]
    if r.size:
        if method == 0:
            return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
        elif method == 1:
            return np.sum(r / np.log2(np.arange(2, r.size + 2)))
        else:
            raise ValueError('method must be 0 or 1.')
    return 0


def ndcg_at_k(df_test, method=0, max_k=10):
    """Mean NDCG@k over all users for k = 1..max_k.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Needs ``user_id`` and ``label`` columns. The rows of each user are read
        in their existing order, so the frame must already be sorted by
        predicted score (best first) within every user.
    method : {0, 1}, optional
        Discounting variant forwarded to ``dcg_at_k``.
    max_k : int, optional
        Largest cut-off to evaluate. Defaults to 10, the previously hard-coded range.

    Returns
    -------
    list of float
        One user-averaged NDCG value per k, in increasing order of k.
    """
    # The per-user label lists do not depend on k, so extract them once.
    labels_per_user = [
        df_test.loc[df_test.user_id == u, "label"].tolist()
        for u in df_test.user_id.unique()
    ]

    ndcg_mean = []
    for k in range(1, max_k + 1):
        ndcg = []
        for labels in labels_per_user:
            dcg_max = dcg_at_k(sorted(labels, reverse=True), k, method)
            if not dcg_max:
                # No relevant item for this user: score it as 0 and continue.
                # Returning here would abort the evaluation and hand back a
                # scalar instead of the per-k list.
                ndcg.append(0.0)
                continue
            ndcg.append(float(dcg_at_k(labels, k, method) / dcg_max))

        ndcg_mean.append(mean(ndcg))
    return ndcg_mean

In [None]:
# Work on a copy so df_test itself stays unmodified.
new_test = df_test.copy()

# Score every (user, item) row of the test set with the loaded model.
new_test["predict_label"] = model.predict([np.array(new_test.user_id).reshape(-1,1), np.array(new_test.item_id).reshape(-1,1)])
# Order each user's rows from highest to lowest predicted score. ndcg_at_k reads the labels
# in row order, so this sort defines the ranking it evaluates.
new_test = new_test.sort_values(by = ['user_id','predict_label'], ascending = [True,False])

new_test

Unnamed: 0,user_id,item_id,label,predict_label
554,1655,56,0,0.995380
534,1655,36,0,0.975584
630,1655,132,1,0.943568
682,1655,184,0,0.939525
604,1655,106,0,0.935326
...,...,...,...,...
3098,11015,110,0,0.101740
3039,11015,51,0,0.099830
3108,11015,120,0,0.095917
3234,11015,246,0,0.082824


In [None]:
# NDCG@k on the score-sorted test frame, one value per k.
ndcg  = ndcg_at_k(new_test)

# Tabulate against the cut-offs 1..10.
k = list(range(1,11))
result = {'Top-k': k, 'NDCG': ndcg,}
pd.DataFrame(result)

Unnamed: 0,Top-k,NDCG
0,1,0.230769
1,2,0.115385
2,3,0.106161
3,4,0.092153
4,5,0.101844
5,6,0.103326
6,7,0.108504
7,8,0.10617
8,9,0.10425
9,10,0.111435


In [None]:
def evaluation_Entropy(test_data, predictor=None, max_k=10):
    """Entropy (base 10) of the top-k recommendation distribution for k = 1..max_k.

    For each k, every user's k highest-scored items are collected, each item is
    counted once per user who receives it, and the entropy of the resulting
    item-share distribution is computed.

    Parameters
    ----------
    test_data : pandas.DataFrame
        Needs ``user_id`` and ``item_id`` columns.
    predictor : object, optional
        Any object exposing ``predict([user_ids, item_ids])`` where both inputs
        are ``(n, 1)`` arrays and the result holds one score per row. When
        omitted, the notebook-level ``model`` is used.
    max_k : int, optional
        Largest cut-off to evaluate. Defaults to 10, the previously hard-coded range.

    Returns
    -------
    list
        One entropy value per k, in increasing order of k.
    """
    if predictor is None:
        predictor = model  # trained model from the notebook namespace

    # Scores do not depend on k: predict once for the whole frame instead of
    # once per (k, user) combination.
    scored = test_data.copy()
    raw_scores = predictor.predict([
        scored["user_id"].to_numpy().reshape(-1, 1),
        scored["item_id"].to_numpy().reshape(-1, 1),
    ])
    scored["predict_label"] = np.asarray(raw_scores).reshape(-1)

    # Best score first; the stable sort keeps ties deterministic, and groupby
    # keeps the row order inside each user group.
    ranked = scored.sort_values("predict_label", ascending=False, kind="mergesort")
    by_user = ranked.groupby("user_id", sort=False)

    entropy_list = []
    for k in range(1, max_k + 1):
        top_k = by_user.head(k)
        # An item occurring more than once in one user's top-k counts once for that user.
        recommended = top_k[["user_id", "item_id"]].drop_duplicates()
        item_count = recommended["item_id"].value_counts()
        # Items that are never recommended have probability 0 and add nothing
        # to the entropy, so they can be left out of the distribution.
        share = (item_count / item_count.sum()).to_numpy()
        entropy_list.append(entropy(share, base=10))
    return entropy_list

In [None]:
# Entropy of the top-k recommendation distribution on the test set, one value per k.
entropy_list = evaluation_Entropy(df_test)

# Tabulate against the cut-offs 1..10.
k = list(range(1,11))
result_entropy = {'Top-k': k, 'Entropy': entropy_list}
pd.DataFrame(result_entropy)

Unnamed: 0,Top-k,Entropy
0,1,1.021319
1,2,1.322349
2,3,1.440474
3,4,1.575534
4,5,1.663488
5,6,1.752136
6,7,1.782776
7,8,1.820093
8,9,1.874487
9,10,1.900816
