### Wide & Deep Recommendation System with Movie Lens
출처 : [Microsoft Github] (https://github.com/microsoft/recommenders)

In [3]:
import os
from tempfile import TemporaryDirectory

import tensorflow as tf
import pandas as pd
import sklearn.preprocessing
import papermill as pm

from tensorflow.python.client import device_lib
from python_splitters import python_random_split
import wide_deep_utils as wide_deep
import tf_utils
from pandas_df_utils import user_item_pairs
import python_evaluation

print("Tensorflow Version:", tf.VERSION)
devices = device_lib.list_local_devices()
print([x.name for x in devices])

num_cpus = os.cpu_count()
print("Num CPUs:", num_cpus)



W0722 19:31:29.863954 11028 deprecation_wrapper.py:119] From D:\01.Programming\PycharmProjects\Recommenders-movielens\tf_utils.py:167: The name tf.train.SessionRunHook is deprecated. Please use tf.estimator.SessionRunHook instead.





Tensorflow Version:

 

1.14.0




['/device:CPU:0']




Num CPUs:

 

4




In [4]:
####################
# 파라미터 세팅
####################
#Recommend top k items
TOP_K = 10
# Select MovieLens data size: 100k, 1m, 10m, or 20m
MOVIELENS_DATA_SIZE = '100k'
# Metrics to use for evaluation. reco_utils.evaluation.python_evaluation function names
RANKING_METRICS = ['map_at_k', 'ndcg_at_k', 'precision_at_k', 'recall_at_k']
RATING_METRICS = ['rmse', 'mae', 'rsquared', 'exp_var']
# Use session hook to evaluate model while training
EVALUATE_WHILE_TRAINING = True

# Data column names
USER_COL = 'UserId'
ITEM_COL = 'MovieId'
RATING_COL = 'Rating'
ITEM_FEAT_COL = 'Genres'

# Train and test set pickle file paths. If None, download and split the dataset.
DATA_DIR = None
TRAIN_PICKLE_PATH = None
TEST_PICKLE_PATH = None
EXPORT_DIR_BASE = './outputs/model'

#### Hyperparameters
MODEL_TYPE = 'wide_deep'
EPOCHS = 50  # if 0, only 1 batch will be processed
BATCH_SIZE = 64
# Wide (linear) model hyperparameters
LINEAR_OPTIMIZER = 'Ftrl'
LINEAR_OPTIMIZER_LR =0.0029   # Learning rate
LINEAR_L1_REG = 0.0           # L1 Regularization rate for FtrlOptimizer
LINEAR_MOMENTUM = 0.9         # Momentum for MomentumOptimizer or RMSPropOptimizer
# DNN model hyperparameters
DNN_OPTIMIZER = 'Adagrad'
DNN_OPTIMIZER_LR = 0.1
DNN_L1_REG = 0.0           # L1 Regularization rate for FtrlOptimizer
DNN_MOMENTUM = 0.9         # Momentum for MomentumOptimizer or RMSPropOptimizer
# Layer dimensions are defined separately to make this work with AzureML Hyperdrive
DNN_HIDDEN_LAYER_1 = 0     # Set 0 to not use this layer
DNN_HIDDEN_LAYER_2 = 128   # Set 0 to not use this layer
DNN_HIDDEN_LAYER_3 = 256   # Set 0 to not use this layer
DNN_HIDDEN_LAYER_4 = 32    # With this setting, DNN hidden units will be = [512, 256, 128, 128]
DNN_USER_DIM = 4
DNN_ITEM_DIM = 4
DNN_DROPOUT = 0.4
DNN_BATCH_NORM = 1         # 1 to use batch normalization, 0 if not.

# Set cache directory path if want to keep the model checkpoints
MODEL_DIR = './cahce/model'

In [6]:
###############################
# 데이터 전처리
# 1. Rating Data & Genres Data
###############################
df_rating = pd.read_csv('./data/100K_Latest/ratings.csv', 
                        sep=",", skiprows=1, header=None, 
                        names=[USER_COL, ITEM_COL, RATING_COL, 'timestamp'], engine='python')
df_movie = pd.read_csv('./data/100K_Latest/movies.csv', 
                       sep=",", skiprows=1, header=None, 
                       names=[ITEM_COL, 'MovieName', 'Genres_string'], engine='python')

# print('df_ratings \n', df_rating.head())
# print('df_movie \n', df_movie.head())

df_data = pd.merge(df_rating, df_movie)

print('df_data \n', df_data.head())


###############################
# 데이터 전처리
# 2. Feature 인코딩
###############################
# Encode 'genres' into int array (multi-hot representation) to use as item features
genres_encoder = sklearn.preprocessing.MultiLabelBinarizer()
df_data[ITEM_FEAT_COL] = genres_encoder.fit_transform(
    df_data['Genres_string'].apply(lambda s: s.split("|"))
).tolist()
print("Genres:", genres_encoder.classes_)
print(df_data.drop_duplicates(ITEM_COL)[[ITEM_COL, 'Genres_string', ITEM_FEAT_COL]].head())

df_data 


 

   UserId  MovieId  Rating   timestamp         MovieName  \
0       1        1     4.0   964982703  Toy Story (1995)   
1       5        1     4.0   847434962  Toy Story (1995)   
2       7        1     4.5  1106635946  Toy Story (1995)   
3      15        1     2.5  1510577970  Toy Story (1995)   
4      17        1     4.5  1305696483  Toy Story (1995)   

                                 Genres_string  
0  Adventure|Animation|Children|Comedy|Fantasy  
1  Adventure|Animation|Children|Comedy|Fantasy  
2  Adventure|Animation|Children|Comedy|Fantasy  
3  Adventure|Animation|Children|Comedy|Fantasy  
4  Adventure|Animation|Children|Comedy|Fantasy  




Genres:

 

['(no genres listed)' 'Action' 'Adventure' 'Animation' 'Children' 'Comedy'
 'Crime' 'Documentary' 'Drama' 'Fantasy' 'Film-Noir' 'Horror' 'IMAX'
 'Musical' 'Mystery' 'Romance' 'Sci-Fi' 'Thriller' 'War' 'Western']




     MovieId                                Genres_string  \
0          1  Adventure|Animation|Children|Comedy|Fantasy   
215        3                               Comedy|Romance   
267        6                        Action|Crime|Thriller   
369       47                             Mystery|Thriller   
572       50                       Crime|Mystery|Thriller   

                                                Genres  
0    [0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...  
215  [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
267  [0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...  
369  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...  
572  [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, ...  




In [7]:
###############################
# Train, Test 데이터 나누기
###############################
train, test = python_random_split(
    df_data.drop('Genres_string', axis=1),  # We don't need Genres original string column
    ratio=0.75,
    seed=42
)

print("Train = {}, test = {}".format(len(train), len(test)))

Train = 75627, test = 25209




In [9]:
###############################
# item, user 수 확인
###############################
# Unique items in the dataset
if ITEM_FEAT_COL is None:
    items = df_data.drop_duplicates(ITEM_COL)[[ITEM_COL]].reset_index(drop=True)
    item_feat_shape = None
else:
    items = df_data.drop_duplicates(ITEM_COL)[[ITEM_COL, ITEM_FEAT_COL]].reset_index(drop=True)
    item_feat_shape = len(items[ITEM_FEAT_COL][0])
# Unique users in the dataset
users = df_data.drop_duplicates(USER_COL)[[USER_COL]].reset_index(drop=True)

print("Num items = {}, num users = {}".format(len(items), len(users)))

Num items = 9724, num users = 610




In [10]:
##############################################
# 최소 한 번은 학습 & 체크포인트 저장하도록 세팅
##############################################
train_steps = max(1, EPOCHS * len(train) // BATCH_SIZE)
save_checkpoints_steps = max(1, train_steps // 5)

##########################################################
# MODEL_DIR 에 모델이 존재하면 존재하는 모델을 학습을 이어서
# 모델의 구조가 다르면 에러 발생
##########################################################
if MODEL_DIR is None:
    tmp_dir = TemporaryDirectory()
    MODEL_DIR = tmp_dir.name


############
# 모델 세팅
############
DNN_HIDDEN_UNITS = [DNN_HIDDEN_LAYER_1, DNN_HIDDEN_LAYER_2, DNN_HIDDEN_LAYER_3, DNN_HIDDEN_LAYER_4]
DNN_HIDDEN_UNITS = [h for h in DNN_HIDDEN_UNITS if h > 0] 
if MODEL_TYPE is 'deep' or MODEL_TYPE is 'wide_deep':
    print("DNN hidden units =", DNN_HIDDEN_UNITS)
    print("Embedding {} users to {}-dim vector".format(len(users), DNN_USER_DIM))
    print("Embedding {} items to {}-dim vector".format(len(items), DNN_ITEM_DIM))

##########################
# 옵티마이저 파라미터 세팅
##########################
linear_params = {}
if LINEAR_OPTIMIZER == 'Ftrl':
    linear_params['l1_regularization_strength'] = LINEAR_L1_REG
elif LINEAR_OPTIMIZER == 'Momentum' or LINEAR_OPTIMIZER == 'RMSProp':
    linear_params['momentum'] = LINEAR_MOMENTUM

dnn_params = {}
if DNN_OPTIMIZER == 'Ftrl':
    dnn_params['l1_regularization_strength'] = DNN_L1_REG
elif DNN_OPTIMIZER == 'Momentum' or DNN_OPTIMIZER == 'RMSProp':
    dnn_params['momentum'] = DNN_MOMENTUM

print("\n", linear_params, dnn_params)

DNN hidden units =

 

[128, 256, 32]




Embedding 610 users to 4-dim vector




Embedding 9724 items to 4-dim vector




In [11]:
################################################
# Model Feature 세팅 - wide(linear) & deep(dnn)
################################################
wide_columns, deep_columns = wide_deep.build_feature_columns(
    users=users[USER_COL].values,
    items=items[ITEM_COL].values,
    user_col=USER_COL,
    item_col=ITEM_COL,
    item_feat_col=ITEM_FEAT_COL,
    user_dim=DNN_USER_DIM,
    item_dim=DNN_ITEM_DIM,
    item_feat_shape=item_feat_shape,
    model_type=MODEL_TYPE,
)

print("\nFeature specs:")
for c in wide_columns + deep_columns:
    print(str(c)[:100], "...")


Feature specs:




CrossedColumn(keys=(VocabularyListCategoricalColumn(key='UserId', vocabulary_list=(1, 5, 7, 15, 17, 

 

...




EmbeddingColumn(categorical_column=VocabularyListCategoricalColumn(key='UserId', vocabulary_list=(1,

 

...




EmbeddingColumn(categorical_column=VocabularyListCategoricalColumn(key='MovieId', vocabulary_list=(1

 

...




NumericColumn(key='Genres', shape=(20,), default_value=None, dtype=tf.float32, normalizer_fn=None)

 

...




In [12]:
####################################
# 세팅한 파라미터에 기반한 모델 빌드
####################################
model = wide_deep.build_model(
    model_dir=MODEL_DIR,
    wide_columns=wide_columns,
    deep_columns=deep_columns,
    linear_optimizer=tf_utils.build_optimizer(LINEAR_OPTIMIZER, LINEAR_OPTIMIZER_LR, **linear_params),
    dnn_optimizer=tf_utils.build_optimizer(DNN_OPTIMIZER, DNN_OPTIMIZER_LR, **dnn_params),
    dnn_hidden_units=DNN_HIDDEN_UNITS,
    dnn_dropout=DNN_DROPOUT,
    dnn_batch_norm=(DNN_BATCH_NORM==1),
    log_every_n_iter=max(1, train_steps//20),  # log 20 times
    save_checkpoints_steps=save_checkpoints_steps
)

W0722 19:32:15.039165 11028 deprecation_wrapper.py:119] From D:\01.Programming\PycharmProjects\Recommenders-movielens\tf_utils.py:82: The name tf.train.FtrlOptimizer is deprecated. Please use tf.compat.v1.train.FtrlOptimizer instead.





W0722 19:32:15.052129 11028 deprecation_wrapper.py:119] From D:\01.Programming\PycharmProjects\Recommenders-movielens\tf_utils.py:78: The name tf.train.AdagradOptimizer is deprecated. Please use tf.compat.v1.train.AdagradOptimizer instead.





In [16]:
cols = {
    'col_user': USER_COL,
    'col_item': ITEM_COL,
    'col_rating': RATING_COL,
    'col_prediction': 'prediction'
}

#####################################
# user와 item의 전체 조합(cross join)
#####################################
ranking_pool = user_item_pairs(
    user_df=users,
    item_df=items,
    user_col=USER_COL,
    item_col=ITEM_COL,
    user_item_filter_df=train,  # Remove seen items
    shuffle=True
)

In [17]:
# Define training hooks to track performance while training
hooks = []
if EVALUATE_WHILE_TRAINING:
    evaluation_logger = tf_utils.MetricsLogger()
    metrics = (m for m in (RANKING_METRICS, RATING_METRICS) if len(m) > 0)
    for ms in metrics:
        hooks.append(
            tf_utils.evaluation_log_hook(
                model,
                logger=evaluation_logger,
                true_df=test,
                y_col=RATING_COL,
                eval_df=ranking_pool if ms==RANKING_METRICS else test.drop(RATING_COL, axis=1),
                every_n_iter=save_checkpoints_steps,
                model_dir=MODEL_DIR,
                eval_fns=[getattr(python_evaluation, m) for m in ms],
                **({**cols, 'k': TOP_K} if ms==RANKING_METRICS else cols)
            )
        )

# Define training input (sample feeding) function
train_fn = tf_utils.pandas_input_fn(
    df=train,
    y_col=RATING_COL,
    batch_size=BATCH_SIZE,
    num_epochs=None,  # None == run forever. We use steps=TRAIN_STEPS instead.
    shuffle=True,
    num_threads=num_cpus-1
)

W0722 19:43:49.105174 11028 deprecation_wrapper.py:119] From D:\01.Programming\PycharmProjects\Recommenders-movielens\tf_utils.py:52: The name tf.estimator.inputs is deprecated. Please use tf.compat.v1.estimator.inputs instead.





W0722 19:43:49.108168 11028 deprecation_wrapper.py:119] From D:\01.Programming\PycharmProjects\Recommenders-movielens\tf_utils.py:52: The name tf.estimator.inputs.numpy_input_fn is deprecated. Please use tf.compat.v1.estimator.inputs.numpy_input_fn instead.





In [18]:
print("Training steps = {}, Batch size = {} (num epochs = {})".format(train_steps, BATCH_SIZE, EPOCHS))
tf.logging.set_verbosity(tf.logging.INFO)

#################
# 모델 학습 시작
#################
try:
    model.train(
        input_fn=train_fn,
        hooks=hooks,
        steps=train_steps
    )
except tf.train.NanLossDuringTrainingError:
    raise ValueError(
        """Training stopped with NanLossDuringTrainingError.
        Try other optimizers, smaller batch size and/or smaller learning rate."""
    )

Training steps = 59083, Batch size = 64 (num epochs = 50)




W0722 19:53:36.589709 11028 deprecation.py:323] From D:\01.Programming\PycharmProjects\Recommenders-movielens\venv\lib\site-packages\tensorflow\python\training\training_util.py:236: Variable.initialized_value (from tensorflow.python.ops.variables) is deprecated and will be removed in a future version.
Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.




W0722 19:53:36.783190 11028 deprecation.py:323] From D:\01.Programming\PycharmProjects\Recommenders-movielens\venv\lib\site-packages\tensorflow_estimator\python\estimator\inputs\queues\feeding_queue_runner.py:62: QueueRunner.__init__ (from tensorflow.python.training.queue_runner_impl) is deprecated and will be removed in a future version.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.




W0722 19:53:36.791169 11028 deprecation.py:323] From D:\01.Programming\PycharmProjects\Recommenders-movielens\venv\lib\site-packages\tensorflow_estimator\python\estimator\inputs\queues\feeding_functions.py:500: add_queue_runner (from tensorflow.python.training.queue_runner_impl) is deprecated and will be removed in a future version.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.




I0722 19:53:36.873950 11028 estimator.py:1145] Calling model_fn.




W0722 19:53:36.904866 11028 deprecation.py:506] From D:\01.Programming\PycharmProjects\Recommenders-movielens\venv\lib\site-packages\tensorflow\python\ops\init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor




W0722 19:53:37.531191 11028 deprecation.py:323] From D:\01.Programming\PycharmProjects\Recommenders-movielens\venv\lib\site-packages\tensorflow\python\feature_column\feature_column_v2.py:3038: VocabularyListCategoricalColumn._num_buckets (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed in a future version.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.




W0722 19:53:38.758909 11028 deprecation.py:323] From D:\01.Programming\PycharmProjects\Recommenders-movielens\venv\lib\site-packages\tensorflow\python\feature_column\feature_column_v2.py:2655: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where




W0722 19:53:41.139543 11028 deprecation.py:323] From D:\01.Programming\PycharmProjects\Recommenders-movielens\venv\lib\site-packages\tensorflow_estimator\python\estimator\canned\linear.py:308: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.cast` instead.




W0722 19:53:42.330362 11028 deprecation.py:506] From D:\01.Programming\PycharmProjects\Recommenders-movielens\venv\lib\site-packages\tensorflow\python\training\adagrad.py:76: calling Constant.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor




I0722 19:53:42.682455 11028 estimator.py:1147] Done calling model_fn.




I0722 19:53:42.685410 11028 basic_session_run_hooks.py:541] Create CheckpointSaverHook.




W0722 19:53:43.060409 11028 deprecation_wrapper.py:119] From D:\01.Programming\PycharmProjects\Recommenders-movielens\tf_utils.py:199: The name tf.summary.FileWriterCache is deprecated. Please use tf.compat.v1.summary.FileWriterCache instead.





W0722 19:53:43.063401 11028 deprecation_wrapper.py:119] From D:\01.Programming\PycharmProjects\Recommenders-movielens\tf_utils.py:200: The name tf.train.get_or_create_global_step is deprecated. Please use tf.compat.v1.train.get_or_create_global_step instead.





I0722 19:53:43.527197 11028 monitored_session.py:240] Graph was finalized.




I0722 19:53:45.195738 11028 session_manager.py:500] Running local_init_op.




I0722 19:53:45.371228 11028 session_manager.py:502] Done running local_init_op.




W0722 19:53:45.441043 11028 deprecation.py:323] From D:\01.Programming\PycharmProjects\Recommenders-movielens\venv\lib\site-packages\tensorflow\python\training\monitored_session.py:875: start_queue_runners (from tensorflow.python.training.queue_runner_impl) is deprecated and will be removed in a future version.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.




I0722 19:53:46.668798 11028 basic_session_run_hooks.py:606] Saving checkpoints for 0 into ./cahce/model\model.ckpt.




W0722 19:53:47.128779 11028 deprecation_wrapper.py:119] From D:\01.Programming\PycharmProjects\Recommenders-movielens\tf_utils.py:207: The name tf.train.SessionRunArgs is deprecated. Please use tf.estimator.SessionRunArgs instead.





I0722 19:53:49.433370 11028 basic_session_run_hooks.py:262] loss = 1053.0758, step = 1




W0722 19:53:53.524430 11028 basic_session_run_hooks.py:724] It seems that global step (tf.train.get_global_step) has not been increased. Current value (could be stable): 915 vs previous value: 915. You could increase the global step by passing tf.train.get_global_step() to Optimizer.apply_gradients or Optimizer.minimize.




I0722 19:54:02.302960 11028 basic_session_run_hooks.py:692] global_step/sec: 231.634




I0722 19:54:02.309948 11028 basic_session_run_hooks.py:260] loss = 67.35977, step = 2955 (12.882 sec)




W0722 19:54:02.453554 11028 basic_session_run_hooks.py:724] It seems that global step (tf.train.get_global_step) has not been increased. Current value (could be stable): 2967 vs previous value: 2967. You could increase the global step by passing tf.train.get_global_step() to Optimizer.apply_gradients or Optimizer.minimize.




I0722 19:54:18.555501 11028 basic_session_run_hooks.py:692] global_step/sec: 180.285




I0722 19:54:18.562483 11028 basic_session_run_hooks.py:260] loss = 42.57393, step = 5909 (16.253 sec)




W0722 19:54:25.107981 11028 basic_session_run_hooks.py:724] It seems that global step (tf.train.get_global_step) has not been increased. Current value (could be stable): 6409 vs previous value: 6409. You could increase the global step by passing tf.train.get_global_step() to Optimizer.apply_gradients or Optimizer.minimize.




W0722 19:54:31.978610 11028 basic_session_run_hooks.py:724] It seems that global step (tf.train.get_global_step) has not been increased. Current value (could be stable): 6816 vs previous value: 6816. You could increase the global step by passing tf.train.get_global_step() to Optimizer.apply_gradients or Optimizer.minimize.




I0722 19:54:51.608684 11028 basic_session_run_hooks.py:692] global_step/sec: 89.3711




I0722 19:54:51.612673 11028 basic_session_run_hooks.py:260] loss = 47.872826, step = 8863 (33.050 sec)




W0722 19:54:53.151566 11028 basic_session_run_hooks.py:724] It seems that global step (tf.train.get_global_step) has not been increased. Current value (could be stable): 9008 vs previous value: 9008. You could increase the global step by passing tf.train.get_global_step() to Optimizer.apply_gradients or Optimizer.minimize.




I0722 19:55:13.660748 11028 basic_session_run_hooks.py:606] Saving checkpoints for 11816 into ./cahce/model\model.ckpt.




W0722 19:55:14.707947 11028 deprecation_wrapper.py:119] From D:\01.Programming\PycharmProjects\Recommenders-movielens\tf_utils.py:218: The name tf.logging.get_verbosity is deprecated. Please use tf.compat.v1.logging.get_verbosity instead.





W0722 19:55:14.711937 11028 deprecation_wrapper.py:119] From D:\01.Programming\PycharmProjects\Recommenders-movielens\tf_utils.py:219: The name tf.logging.set_verbosity is deprecated. Please use tf.compat.v1.logging.set_verbosity instead.





W0722 19:55:14.717920 11028 deprecation_wrapper.py:119] From D:\01.Programming\PycharmProjects\Recommenders-movielens\tf_utils.py:219: The name tf.logging.ERROR is deprecated. Please use tf.compat.v1.logging.ERROR instead.





I0722 20:00:20.586909 11028 basic_session_run_hooks.py:692] global_step/sec: 8.97934




I0722 20:00:20.655721 11028 basic_session_run_hooks.py:260] loss = 56.186836, step = 11817 (329.043 sec)




I0722 20:00:34.333011 11028 basic_session_run_hooks.py:692] global_step/sec: 214.882




I0722 20:00:34.336995 11028 basic_session_run_hooks.py:260] loss = 42.11669, step = 14771 (13.681 sec)




I0722 20:00:51.023379 11028 basic_session_run_hooks.py:692] global_step/sec: 176.988




I0722 20:00:51.029362 11028 basic_session_run_hooks.py:260] loss = 28.052593, step = 17725 (16.691 sec)




I0722 20:01:05.064837 11028 basic_session_run_hooks.py:692] global_step/sec: 210.377




I0722 20:01:05.068825 11028 basic_session_run_hooks.py:260] loss = 48.668915, step = 20679 (14.040 sec)




I0722 20:01:18.537811 11028 basic_session_run_hooks.py:606] Saving checkpoints for 23632 into ./cahce/model\model.ckpt.




I0722 20:06:11.505718 11028 basic_session_run_hooks.py:692] global_step/sec: 9.63971




I0722 20:06:11.643172 11028 basic_session_run_hooks.py:260] loss = 38.190994, step = 23633 (306.569 sec)




I0722 20:06:23.621141 11028 basic_session_run_hooks.py:692] global_step/sec: 243.841




I0722 20:06:23.624135 11028 basic_session_run_hooks.py:260] loss = 43.85072, step = 26587 (11.987 sec)




I0722 20:06:37.942849 11028 basic_session_run_hooks.py:692] global_step/sec: 206.246




I0722 20:06:37.945841 11028 basic_session_run_hooks.py:260] loss = 52.082386, step = 29541 (14.322 sec)




I0722 20:06:56.915120 11028 basic_session_run_hooks.py:692] global_step/sec: 155.701




I0722 20:06:56.918113 11028 basic_session_run_hooks.py:260] loss = 31.841549, step = 32495 (18.972 sec)




I0722 20:07:10.772070 11028 basic_session_run_hooks.py:606] Saving checkpoints for 35448 into ./cahce/model\model.ckpt.




I0722 20:13:03.702912 11028 basic_session_run_hooks.py:692] global_step/sec: 8.05373




I0722 20:13:03.751780 11028 basic_session_run_hooks.py:260] loss = 49.31167, step = 35449 (366.834 sec)




I0722 20:13:17.851082 11028 basic_session_run_hooks.py:692] global_step/sec: 208.79




I0722 20:13:17.855072 11028 basic_session_run_hooks.py:260] loss = 29.02741, step = 38403 (14.103 sec)




I0722 20:13:34.842650 11028 basic_session_run_hooks.py:692] global_step/sec: 173.841




I0722 20:13:34.845641 11028 basic_session_run_hooks.py:260] loss = 48.463264, step = 41357 (16.991 sec)




I0722 20:13:46.663045 11028 basic_session_run_hooks.py:692] global_step/sec: 249.907




I0722 20:13:46.667034 11028 basic_session_run_hooks.py:260] loss = 32.53171, step = 44311 (11.820 sec)




I0722 20:13:58.764690 11028 basic_session_run_hooks.py:606] Saving checkpoints for 47264 into ./cahce/model\model.ckpt.




I0722 20:17:16.856030 11028 basic_session_run_hooks.py:692] global_step/sec: 14.0538




I0722 20:17:16.927842 11028 basic_session_run_hooks.py:260] loss = 31.52892, step = 47265 (210.256 sec)




I0722 20:17:29.300757 11028 basic_session_run_hooks.py:692] global_step/sec: 237.389




I0722 20:17:29.302752 11028 basic_session_run_hooks.py:260] loss = 42.666824, step = 50219 (12.381 sec)




I0722 20:17:41.593886 11028 basic_session_run_hooks.py:692] global_step/sec: 240.277




I0722 20:17:41.598873 11028 basic_session_run_hooks.py:260] loss = 42.538452, step = 53173 (12.296 sec)




I0722 20:17:53.435226 11028 basic_session_run_hooks.py:692] global_step/sec: 249.486




I0722 20:17:53.444206 11028 basic_session_run_hooks.py:260] loss = 36.007195, step = 56127 (11.845 sec)




I0722 20:18:06.565119 11028 basic_session_run_hooks.py:606] Saving checkpoints for 59080 into ./cahce/model\model.ckpt.




W0722 20:18:06.781541 11028 deprecation.py:323] From D:\01.Programming\PycharmProjects\Recommenders-movielens\venv\lib\site-packages\tensorflow\python\training\saver.py:960: remove_checkpoint (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file APIs to delete files with this prefix.




I0722 20:21:20.343625 11028 basic_session_run_hooks.py:692] global_step/sec: 14.2768




I0722 20:21:20.459316 11028 basic_session_run_hooks.py:260] loss = 34.39142, step = 59081 (207.009 sec)




I0722 20:21:20.478265 11028 basic_session_run_hooks.py:606] Saving checkpoints for 59083 into ./cahce/model\model.ckpt.




I0722 20:21:23.102496 11028 estimator.py:368] Loss for final step: 33.314835.




In [19]:
if EVALUATE_WHILE_TRAINING:
    for m, v in evaluation_logger.get_log().items():
        pm.record("eval_{}".format(m), v)

AttributeError: module 'papermill' has no attribute 'record'

In [26]:
if len(RATING_METRICS) > 0:
    predictions = list(model.predict(input_fn=tf_utils.pandas_input_fn(df=test)))
    prediction_df = test.drop(RATING_COL, axis=1)
    prediction_df['prediction'] = [p['predictions'][0] for p in predictions]    
    
    print("prediction_df ::: \n", prediction_df[:10])
    
    rating_results = {}
    for m in RATING_METRICS:
        fn = getattr(python_evaluation, m)
        result = fn(test, prediction_df, **cols)
        # pm.record(m, result)
        rating_results[m] = result
    print(rating_results)

I0722 20:26:32.792201 11028 estimator.py:1145] Calling model_fn.




I0722 20:26:34.226369 11028 estimator.py:1147] Done calling model_fn.




I0722 20:26:34.580422 11028 monitored_session.py:240] Graph was finalized.




I0722 20:26:34.590394 11028 saver.py:1280] Restoring parameters from ./cahce/model\model.ckpt-59083




I0722 20:26:34.756949 11028 session_manager.py:500] Running local_init_op.




I0722 20:26:34.826763 11028 session_manager.py:502] Done running local_init_op.




prediction_df ::: 


 

       UserId  MovieId   timestamp  \
67037     551    34162  1504925858   
42175     232    59421  1217541086   
93850     288     8880  1095780696   
6187      414     1080   961595418   
12229     577     2406   945965771   
7433      502     1206  1111757634   
53802     137     6787  1204859228   
65098      97     4025  1047481289   
68041     490    88125  1324376714   
11854     593     2291  1181008449   

                                               MovieName  \
67037                            Wedding Crashers (2005)   
42175                    What Happens in Vegas... (2008)   
93850                                        Mask (1985)   
6187                 Monty Python's Life of Brian (1979)   
12229                         Romancing the Stone (1984)   
7433                          Clockwork Orange, A (1971)   
53802                     All the President's Men (1976)   
65098                           Miss Congeniality (2000)   
68041  Harry Potter and the Deathly Hallo




{'rmse': 0.8912505265425538, 'mae': 0.6752194901811193, 'rsquared': 0.26311857056258414, 'exp_var': 0.2633145675924853}




In [27]:
if len(RANKING_METRICS) > 0:
    predictions = list(model.predict(input_fn=tf_utils.pandas_input_fn(df=ranking_pool)))
    prediction_df = ranking_pool.copy()
    prediction_df['prediction'] = [p['predictions'][0] for p in predictions]

    print("prediction_df ::: \n", prediction_df[:10])
    
    ranking_results = {}
    for m in RANKING_METRICS:
        fn = getattr(python_evaluation, m)
        result = fn(test, prediction_df, **{**cols, 'k': TOP_K})
        # pm.record(m, result)
        ranking_results[m] = result
    print(ranking_results)

I0722 20:26:59.349225 11028 estimator.py:1145] Calling model_fn.




I0722 20:27:00.054309 11028 estimator.py:1147] Done calling model_fn.




I0722 20:27:00.285690 11028 monitored_session.py:240] Graph was finalized.




I0722 20:27:00.292671 11028 saver.py:1280] Restoring parameters from ./cahce/model\model.ckpt-59083




I0722 20:27:00.390410 11028 session_manager.py:500] Running local_init_op.




I0722 20:27:00.431303 11028 session_manager.py:502] Done running local_init_op.




prediction_df ::: 


 

   UserId  MovieId                                             Genres  \
0     289     3087  [0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...   
1     233    54503  [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
2     289      121  [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...   
3     476     6597  [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
4     352      223  [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
5     245     7026  [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
6     169      898  [0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...   
7     387    79590  [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
8     513     4928  [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...   
9     450     6832  [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...   

   prediction  
0    2.782947  
1    3.388410  
2    3.898042  
3    3.802190  
4    4.146657  
5    1.674149  
6    4.720969  
7    2.606355  
8    4.018032  
9    4.089769  




{'map_at_k': 1.0714668381013606e-06, 'ndcg_at_k': 0.0001804030871295739, 'precision_at_k': 0.0001639344262295082, 'recall_at_k': 3.2144005143040823e-06}




In [29]:
os.makedirs(EXPORT_DIR_BASE, exist_ok=True)

In [65]:
tf.logging.set_verbosity(tf.logging.ERROR)

train_rcvr_fn = tf.contrib.estimator.build_supervised_input_receiver_fn_from_input_fn(
    train_fn
)
eval_rcvr_fn = tf.contrib.estimator.build_supervised_input_receiver_fn_from_input_fn(
    tf_utils.pandas_input_fn(df=test, y_col=RATING_COL)
)
serve_rcvr_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(
    tf.feature_column.make_parse_example_spec(wide_columns+deep_columns)
)
rcvr_fn_map = {
    tf.estimator.ModeKeys.TRAIN: train_rcvr_fn,
    tf.estimator.ModeKeys.EVAL: eval_rcvr_fn,
    tf.estimator.ModeKeys.PREDICT: serve_rcvr_fn
}

export_dir = tf.contrib.estimator.export_all_saved_models(
    model,
    export_dir_base=EXPORT_DIR_BASE,
    input_receiver_fn_map=rcvr_fn_map
)
# pm.record('saved_model_dir', str(export_dir))
print("Model exported to", str(export_dir))

Model exported to b'./outputs/model\\1563665171'


In [47]:
###########################################
# 내가 매긴 평가를 바탕으로 추천
# my_ratings.csv에 각 영화에 대한 평점 입력
# UserId : 999
###########################################
# 데이터 불러오기
df_my_rating = pd.read_csv('./data/100K_Latest/my_ratings.csv', 
                        sep=",", skiprows=1, header=None, 
                        names=[USER_COL, ITEM_COL, RATING_COL, 'timestamp', 'movieName'], engine='python')

# movieNmae 컬럼 삭제
del df_my_rating['movieName']
# 평가하지 않은 영화 삭제
df_my_rating = df_my_rating.dropna(axis=0)

print("df_my_rating ::: \n", df_my_rating)
print("test ::: \n", test)

df_my_data = pd.merge(df_my_rating, df_data)

# df_data

print('df_my_data \n', df_my_data.head())



df_my_rating ::: 


 

    UserId  MovieId  Rating    timestamp
0      999        1     4.0  964982703.0
1      999        2     4.0  964983148.0
33     999       36     4.0  964997388.0
44     999       48     3.0  965002283.0
46     999       50     5.0  965003173.0
65     999       73     4.0  965011628.0




test ::: 


 

       UserId  MovieId  Rating   timestamp  \
67037     551    34162     4.0  1504925858   
42175     232    59421     2.0  1217541086   
93850     288     8880     4.0  1095780696   
6187      414     1080     5.0   961595418   
12229     577     2406     4.0   945965771   
7433      502     1206     5.0  1111757634   
53802     137     6787     4.0  1204859228   
65098      97     4025     3.0  1047481289   
68041     490    88125     3.5  1324376714   
11854     593     2291     3.0  1181008449   
87810     201     2262     3.0   939811032   
60512     225     1970     4.0   949261430   
57994     385      551     3.0   834691914   
46319     228     3555     4.0  1363222643   
61839     480     2396     4.5  1179177705   
47446      21    70286     3.5  1441826981   
40203     502     2300     5.0  1111757731   
19310     100      222     4.0  1100184420   
86670     560    84944     3.0  1469648956   
11134     328     2078     3.5  1494210562   
82955     387     1019     3.5  10




df_my_data 


 

Empty DataFrame
Columns: [UserId, MovieId, Rating, timestamp, MovieName, Genres_string]
Index: []


