### Wide & Deep Recommendation System with Movie Lens
출처 : [Microsoft Github] (https://github.com/microsoft/recommenders)

In [15]:
import os
from tempfile import TemporaryDirectory

import tensorflow as tf
import pandas as pd
import sklearn.preprocessing
from IPython.display import display

import papermill as pm

from tensorflow.python.client import device_lib
from python_splitters import python_random_split
import wide_deep_utils as wide_deep
import tf_utils
from pandas_df_utils import user_item_pairs
import python_evaluation

print("Tensorflow Version:", tf.VERSION)
devices = device_lib.list_local_devices()
print([x.name for x in devices])

num_cpus = os.cpu_count()
print("Num CPUs:", num_cpus)

Tensorflow Version:

 

1.14.0




['/device:CPU:0']




Num CPUs:

In [16]:
####################
# 파라미터 세팅
####################
#Recommend top k items
TOP_K = 10
# Select MovieLens data size: 100k, 1m, 10m, or 20m
MOVIELENS_DATA_SIZE = '100k'
# Metrics to use for evaluation. reco_utils.evaluation.python_evaluation function names
RANKING_METRICS = ['map_at_k', 'ndcg_at_k', 'precision_at_k', 'recall_at_k']
RATING_METRICS = ['rmse', 'mae', 'rsquared', 'exp_var']
# Use session hook to evaluate model while training
EVALUATE_WHILE_TRAINING = True

# Data column names
USER_COL = 'UserId'
ITEM_COL = 'MovieId'
RATING_COL = 'Rating'

# TO-DO 여기에 여러개의 feat를 넣을 수 있는가?
ITEM_FEAT_COL = 'Genres' 

# Train and test set pickle file paths. If None, download and split the dataset.
DATA_DIR = None
TRAIN_PICKLE_PATH = None
TEST_PICKLE_PATH = None
EXPORT_DIR_BASE = './outputs/model/' + MOVIELENS_DATA_SIZE

#### Hyperparameters
MODEL_TYPE = 'wide_deep'
EPOCHS = 1  # if 0, only 1 batch will be processed
BATCH_SIZE = 64
# Wide (linear) model hyperparameters
LINEAR_OPTIMIZER = 'Ftrl'
LINEAR_OPTIMIZER_LR =0.0029   # Learning rate
LINEAR_L1_REG = 0.0           # L1 Regularization rate for FtrlOptimizer
LINEAR_MOMENTUM = 0.9         # Momentum for MomentumOptimizer or RMSPropOptimizer
# DNN model hyperparameters
DNN_OPTIMIZER = 'Adagrad'
DNN_OPTIMIZER_LR = 0.1
DNN_L1_REG = 0.0           # L1 Regularization rate for FtrlOptimizer
DNN_MOMENTUM = 0.9         # Momentum for MomentumOptimizer or RMSPropOptimizer
# Layer dimensions are defined separately to make this work with AzureML Hyperdrive
DNN_HIDDEN_LAYER_1 = 0     # Set 0 to not use this layer
DNN_HIDDEN_LAYER_2 = 128   # Set 0 to not use this layer
DNN_HIDDEN_LAYER_3 = 256   # Set 0 to not use this layer
DNN_HIDDEN_LAYER_4 = 32    # With this setting, DNN hidden units will be = [512, 256, 128, 128]
DNN_USER_DIM = 4
DNN_ITEM_DIM = 4
DNN_DROPOUT = 0.4
DNN_BATCH_NORM = 1         # 1 to use batch normalization, 0 if not.

# Set cache directory path if want to keep the model checkpoints
MODEL_DIR = './cahce/model'
# MODEL_DIR = None

In [20]:
###############################
# 데이터 전처리
# 1. Rating Data & Genres Data
###############################
df_rating = pd.read_csv('./data/100K_Latest/ratings.csv', 
                        sep=",", skiprows=1, header=None, 
                        names=[USER_COL, ITEM_COL, RATING_COL, 'timestamp'], engine='python')
df_movie = pd.read_csv('./data/100K_Latest/movies.csv', 
                       sep=",", skiprows=1, header=None, 
                       names=[ITEM_COL, 'MovieName', 'Genres_string'], engine='python')

df_data = pd.merge(df_rating, df_movie)

print('df_data \n', df_data.head())


###############################
# 데이터 전처리
# 2. Feature 인코딩
###############################
# Encode 'genres' into int array (multi-hot representation) to use as item features
genres_encoder = sklearn.preprocessing.MultiLabelBinarizer()
df_data[ITEM_FEAT_COL] = genres_encoder.fit_transform(
    df_data['Genres_string'].apply(lambda s: s.split("|"))
).tolist()
print("Genres:", genres_encoder.classes_)

print("df_data \n", df_data.drop_duplicates(ITEM_COL)[[ITEM_COL, 'Genres_string', ITEM_FEAT_COL]].head())

df_data 


 

   UserId  MovieId  Rating   timestamp         MovieName  \
0       1        1     4.0   964982703  Toy Story (1995)   
1       5        1     4.0   847434962  Toy Story (1995)   
2       7        1     4.5  1106635946  Toy Story (1995)   
3      15        1     2.5  1510577970  Toy Story (1995)   
4      17        1     4.5  1305696483  Toy Story (1995)   

                                 Genres_string  
0  Adventure|Animation|Children|Comedy|Fantasy  
1  Adventure|Animation|Children|Comedy|Fantasy  
2  Adventure|Animation|Children|Comedy|Fantasy  
3  Adventure|Animation|Children|Comedy|Fantasy  
4  Adventure|Animation|Children|Comedy|Fantasy  




Genres:

 

['(no genres listed)' 'Action' 'Adventure' 'Animation' 'Children' 'Comedy'
 'Crime' 'Documentary' 'Drama' 'Fantasy' 'Film-Noir' 'Horror' 'IMAX'
 'Musical' 'Mystery' 'Romance' 'Sci-Fi' 'Thriller' 'War' 'Western']




df_data 


 

     MovieId                                Genres_string  \
0          1  Adventure|Animation|Children|Comedy|Fantasy   
215        3                               Comedy|Romance   
267        6                        Action|Crime|Thriller   
369       47                             Mystery|Thriller   
572       50                       Crime|Mystery|Thriller   

                                                Genres  
0    [0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...  
215  [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
267  [0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...  
369  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...  
572  [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, ...  




In [26]:
###############################
# Train, Test 데이터 나누기
###############################
train, test = python_random_split(
    df_data.drop('Genres_string', axis=1),  # We don't need Genres original string column
    ratio=0.75,
    seed=42
)

print("Train = {}, test = {}".format(len(train), len(test)))

Train = 75627, test = 25209




In [27]:
###############################
# item, user 수 확인
###############################
# Unique items in the dataset
if ITEM_FEAT_COL is None:
    items = df_data.drop_duplicates(ITEM_COL)[[ITEM_COL]].reset_index(drop=True)
    item_feat_shape = None
else:
    items = df_data.drop_duplicates(ITEM_COL)[[ITEM_COL, ITEM_FEAT_COL]].reset_index(drop=True)
    item_feat_shape = len(items[ITEM_FEAT_COL][0])
# Unique users in the dataset
users = df_data.drop_duplicates(USER_COL)[[USER_COL]].reset_index(drop=True)

print("Num items = {}, num users = {}".format(len(items), len(users)))

Num items = 9724, num users = 610




In [28]:
##############################################
# 최소 한 번은 학습 & 체크포인트 저장하도록 세팅
##############################################
train_steps = max(1, EPOCHS * len(train) // BATCH_SIZE)
save_checkpoints_steps = max(1, train_steps // 5)

##########################################################
# MODEL_DIR 에 모델이 존재하면 존재하는 모델을 학습을 이어서
# 모델의 구조가 다르면 에러 발생
##########################################################
if MODEL_DIR is None:
    tmp_dir = TemporaryDirectory()
    MODEL_DIR = tmp_dir.name


############
# 모델 세팅
############
DNN_HIDDEN_UNITS = [DNN_HIDDEN_LAYER_1, DNN_HIDDEN_LAYER_2, DNN_HIDDEN_LAYER_3, DNN_HIDDEN_LAYER_4]
DNN_HIDDEN_UNITS = [h for h in DNN_HIDDEN_UNITS if h > 0] 
if MODEL_TYPE is 'deep' or MODEL_TYPE is 'wide_deep':
    print("DNN hidden units =", DNN_HIDDEN_UNITS)
    print("Embedding {} users to {}-dim vector".format(len(users), DNN_USER_DIM))
    print("Embedding {} items to {}-dim vector".format(len(items), DNN_ITEM_DIM))

##########################
# 옵티마이저 파라미터 세팅
##########################
linear_params = {}
if LINEAR_OPTIMIZER == 'Ftrl':
    linear_params['l1_regularization_strength'] = LINEAR_L1_REG
elif LINEAR_OPTIMIZER == 'Momentum' or LINEAR_OPTIMIZER == 'RMSProp':
    linear_params['momentum'] = LINEAR_MOMENTUM

dnn_params = {}
if DNN_OPTIMIZER == 'Ftrl':
    dnn_params['l1_regularization_strength'] = DNN_L1_REG
elif DNN_OPTIMIZER == 'Momentum' or DNN_OPTIMIZER == 'RMSProp':
    dnn_params['momentum'] = DNN_MOMENTUM

print("\n", linear_params, dnn_params)

DNN hidden units =

 

[128, 256, 32]




Embedding 610 users to 4-dim vector




Embedding 9724 items to 4-dim vector







 

{'l1_regularization_strength': 0.0}

 

In [29]:
################################################
# Model Feature 세팅 - wide(linear) & deep(dnn)
# wide_deep.build_feature_columns
#     tf.feature_column 세팅
################################################
item_genres = df_data.drop_duplicates(ITEM_COL)[[ITEM_COL, 'Genres']].reset_index(drop=True)
item_feat_shape = len(item_genres['Genres'][0])

wide_columns, deep_columns = wide_deep.build_feature_columns(
    users=users[USER_COL].values,
    items=items[ITEM_COL].values,
    user_col=USER_COL,
    item_col=ITEM_COL,
    item_feat_col=ITEM_FEAT_COL,
    user_dim=DNN_USER_DIM,
    item_dim=DNN_ITEM_DIM,
    item_feat_shape=item_feat_shape,
    model_type=MODEL_TYPE,
)

print("\nFeature specs:")
for c in wide_columns + deep_columns:
    print(str(c)[:100], "...")

item_feat_col :::::::: 

 

Genres





Feature specs:




CrossedColumn(keys=(VocabularyListCategoricalColumn(key='UserId', vocabulary_list=(1, 5, 7, 15, 17, 

 

...




EmbeddingColumn(categorical_column=VocabularyListCategoricalColumn(key='UserId', vocabulary_list=(1,

 

...




EmbeddingColumn(categorical_column=VocabularyListCategoricalColumn(key='MovieId', vocabulary_list=(1

 

...




NumericColumn(key='Genres', shape=(20,), default_value=None, dtype=tf.float32, normalizer_fn=None)

 

...




In [30]:
####################################
# 세팅한 파라미터에 기반한 모델 빌드
# wide_deep.build_model  
#   Wide Column만 있을 때 
#       model = tf.estimator.LinearRegressor
#   Deep Column만 있을 때 
#       model = tf.estimator.DNNRegressor
#   Wide Column과 Deep Column이 모두 있을 때
#       model = tf.estimator.DNNLinearCombinedRegressor
####################################
model = wide_deep.build_model(
    model_dir=MODEL_DIR,
    wide_columns=wide_columns,
    deep_columns=deep_columns,
    linear_optimizer=tf_utils.build_optimizer(LINEAR_OPTIMIZER, LINEAR_OPTIMIZER_LR, **linear_params),
    dnn_optimizer=tf_utils.build_optimizer(DNN_OPTIMIZER, DNN_OPTIMIZER_LR, **dnn_params),
    dnn_hidden_units=DNN_HIDDEN_UNITS,
    dnn_dropout=DNN_DROPOUT,
    dnn_batch_norm=(DNN_BATCH_NORM==1),
    log_every_n_iter=max(1, train_steps//20),  # log 20 times
    save_checkpoints_steps=save_checkpoints_steps
)

W0819 21:07:31.516094   920 deprecation_wrapper.py:119] From D:\01.Programming\PycharmProjects\Recommenders-movielens\tf_utils.py:82: The name tf.train.FtrlOptimizer is deprecated. Please use tf.compat.v1.train.FtrlOptimizer instead.





W0819 21:07:31.520084   920 deprecation_wrapper.py:119] From D:\01.Programming\PycharmProjects\Recommenders-movielens\tf_utils.py:78: The name tf.train.AdagradOptimizer is deprecated. Please use tf.compat.v1.train.AdagradOptimizer instead.





In [31]:
cols = {
    'col_user': USER_COL,
    'col_item': ITEM_COL,
    'col_rating': RATING_COL,
    'col_prediction': 'prediction'
}

#####################################
# user와 item의 전체 조합(cross join)
#####################################
ranking_pool = user_item_pairs(
    user_df=users,
    item_df=items,
    user_col=USER_COL,
    item_col=ITEM_COL,
    user_item_filter_df=train,  # Remove seen items
    shuffle=True
)

In [32]:
# Define training hooks to track performance while training
hooks = []
if EVALUATE_WHILE_TRAINING:
    evaluation_logger = tf_utils.MetricsLogger()
    metrics = (m for m in (RANKING_METRICS, RATING_METRICS) if len(m) > 0)
    for ms in metrics:
        hooks.append(
            tf_utils.evaluation_log_hook(
                model,
                logger=evaluation_logger,
                true_df=test,
                y_col=RATING_COL,
                eval_df=ranking_pool if ms==RANKING_METRICS else test.drop(RATING_COL, axis=1),
                every_n_iter=save_checkpoints_steps,
                model_dir=MODEL_DIR,
                eval_fns=[getattr(python_evaluation, m) for m in ms],
                **({**cols, 'k': TOP_K} if ms==RANKING_METRICS else cols)
            )
        )

# Define training input (sample feeding) function
train_fn = tf_utils.pandas_input_fn(
    df=train,
    y_col=RATING_COL,
    batch_size=BATCH_SIZE,
    num_epochs=None,  # None == run forever. We use steps=TRAIN_STEPS instead.
    shuffle=True,
    num_threads=num_cpus-1
)

W0819 21:07:59.225576   920 deprecation_wrapper.py:119] From D:\01.Programming\PycharmProjects\Recommenders-movielens\tf_utils.py:52: The name tf.estimator.inputs is deprecated. Please use tf.compat.v1.estimator.inputs instead.





W0819 21:07:59.229566   920 deprecation_wrapper.py:119] From D:\01.Programming\PycharmProjects\Recommenders-movielens\tf_utils.py:52: The name tf.estimator.inputs.numpy_input_fn is deprecated. Please use tf.compat.v1.estimator.inputs.numpy_input_fn instead.





In [33]:
print("Training steps = {}, Batch size = {} (num epochs = {})".format(train_steps, BATCH_SIZE, EPOCHS))
tf.logging.set_verbosity(tf.logging.INFO)

#########################################################
# 모델 학습 시작 - 이미 학습된 모델이 있다면 실행 안해도 됨
#########################################################
try:
    model.train(
        input_fn=train_fn,
        hooks=hooks,
        steps=train_steps
    )
except tf.train.NanLossDuringTrainingError:
    raise ValueError(
        """Training stopped with NanLossDuringTrainingError.
        Try other optimizers, smaller batch size and/or smaller learning rate."""
    )

Training steps = 1181, Batch size = 64 (num epochs = 1)




hooks ::: 

 

[<tf_utils._TrainLogHook object at 0x0000015E0C03F358>, <tf_utils._TrainLogHook object at 0x0000015E0C03F668>]




steps ::: 

 

1181




max_steps ::: 

 

None




saving_listeners ::: 

 

None




W0819 21:08:06.722714   920 deprecation.py:323] From D:\01.Programming\PycharmProjects\Recommenders-movielens\venv\lib\site-packages\tensorflow\python\training\training_util.py:236: Variable.initialized_value (from tensorflow.python.ops.variables) is deprecated and will be removed in a future version.
Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.




W0819 21:08:06.888271   920 deprecation.py:323] From D:\01.Programming\PycharmProjects\Recommenders-movielens\venv\lib\site-packages\tensorflow_estimator\python\estimator\inputs\queues\feeding_queue_runner.py:62: QueueRunner.__init__ (from tensorflow.python.training.queue_runner_impl) is deprecated and will be removed in a future version.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.




W0819 21:08:06.894255   920 deprecation.py:323] From D:\01.Programming\PycharmProjects\Recommenders-movielens\venv\lib\site-packages\tensorflow_estimator\python\estimator\inputs\queues\feeding_functions.py:500: add_queue_runner (from tensorflow.python.training.queue_runner_impl) is deprecated and will be removed in a future version.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.




I0819 21:08:06.926170   920 estimator.py:1155] Calling model_fn.




W0819 21:08:07.044858   920 deprecation.py:506] From D:\01.Programming\PycharmProjects\Recommenders-movielens\venv\lib\site-packages\tensorflow\python\ops\init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor




W0819 21:08:07.743069   920 deprecation.py:323] From D:\01.Programming\PycharmProjects\Recommenders-movielens\venv\lib\site-packages\tensorflow\python\feature_column\feature_column_v2.py:3038: VocabularyListCategoricalColumn._num_buckets (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed in a future version.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.




W0819 21:08:08.601618   920 deprecation.py:323] From D:\01.Programming\PycharmProjects\Recommenders-movielens\venv\lib\site-packages\tensorflow\python\feature_column\feature_column_v2.py:2655: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where




W0819 21:08:11.736569   920 deprecation.py:323] From D:\01.Programming\PycharmProjects\Recommenders-movielens\venv\lib\site-packages\tensorflow_estimator\python\estimator\canned\linear.py:308: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.cast` instead.




W0819 21:08:12.938969   920 deprecation.py:506] From D:\01.Programming\PycharmProjects\Recommenders-movielens\venv\lib\site-packages\tensorflow\python\training\adagrad.py:76: calling Constant.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor




I0819 21:08:13.402756   920 estimator.py:1157] Done calling model_fn.




I0819 21:08:13.406743   920 basic_session_run_hooks.py:541] Create CheckpointSaverHook.




W0819 21:08:13.970113   920 deprecation_wrapper.py:119] From D:\01.Programming\PycharmProjects\Recommenders-movielens\tf_utils.py:199: The name tf.summary.FileWriterCache is deprecated. Please use tf.compat.v1.summary.FileWriterCache instead.





W0819 21:08:13.973106   920 deprecation_wrapper.py:119] From D:\01.Programming\PycharmProjects\Recommenders-movielens\tf_utils.py:200: The name tf.train.get_or_create_global_step is deprecated. Please use tf.compat.v1.train.get_or_create_global_step instead.





I0819 21:08:14.683453   920 monitored_session.py:240] Graph was finalized.




I0819 21:08:15.226802   920 session_manager.py:500] Running local_init_op.




I0819 21:08:15.421941   920 session_manager.py:502] Done running local_init_op.




W0819 21:08:15.527067   920 deprecation.py:323] From D:\01.Programming\PycharmProjects\Recommenders-movielens\venv\lib\site-packages\tensorflow\python\training\monitored_session.py:875: start_queue_runners (from tensorflow.python.training.queue_runner_impl) is deprecated and will be removed in a future version.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.




I0819 21:08:17.274638   920 basic_session_run_hooks.py:606] Saving checkpoints for 0 into ./cahce/model\model.ckpt.




W0819 21:08:17.796264   920 deprecation_wrapper.py:119] From D:\01.Programming\PycharmProjects\Recommenders-movielens\tf_utils.py:207: The name tf.train.SessionRunArgs is deprecated. Please use tf.estimator.SessionRunArgs instead.





I0819 21:08:18.941241   920 basic_session_run_hooks.py:262] loss = 1142.6165, step = 1




I0819 21:08:19.681261   920 basic_session_run_hooks.py:692] global_step/sec: 79.7275




I0819 21:08:19.684256   920 basic_session_run_hooks.py:260] loss = 88.133804, step = 60 (0.743 sec)




I0819 21:08:19.953534   920 basic_session_run_hooks.py:692] global_step/sec: 216.695




I0819 21:08:19.955528   920 basic_session_run_hooks.py:260] loss = 87.90403, step = 119 (0.271 sec)




I0819 21:08:20.218378   920 basic_session_run_hooks.py:692] global_step/sec: 221.937




I0819 21:08:20.221368   920 basic_session_run_hooks.py:260] loss = 90.41711, step = 178 (0.265 sec)




I0819 21:08:20.491161   920 basic_session_run_hooks.py:606] Saving checkpoints for 236 into ./cahce/model\model.ckpt.




W0819 21:08:20.845213   920 deprecation_wrapper.py:119] From D:\01.Programming\PycharmProjects\Recommenders-movielens\tf_utils.py:218: The name tf.logging.get_verbosity is deprecated. Please use tf.compat.v1.logging.get_verbosity instead.





W0819 21:08:20.849206   920 deprecation_wrapper.py:119] From D:\01.Programming\PycharmProjects\Recommenders-movielens\tf_utils.py:219: The name tf.logging.set_verbosity is deprecated. Please use tf.compat.v1.logging.set_verbosity instead.





W0819 21:08:20.860173   920 deprecation_wrapper.py:119] From D:\01.Programming\PycharmProjects\Recommenders-movielens\tf_utils.py:219: The name tf.logging.ERROR is deprecated. Please use tf.compat.v1.logging.ERROR instead.





I0819 21:12:03.759964   920 basic_session_run_hooks.py:692] global_step/sec: 0.263933




I0819 21:12:03.816811   920 basic_session_run_hooks.py:260] loss = 82.89117, step = 237 (223.590 sec)




I0819 21:12:04.080107   920 basic_session_run_hooks.py:692] global_step/sec: 184.868




I0819 21:12:04.082102   920 basic_session_run_hooks.py:260] loss = 57.486298, step = 296 (0.271 sec)




I0819 21:12:04.468071   920 basic_session_run_hooks.py:692] global_step/sec: 151.686




I0819 21:12:04.473057   920 basic_session_run_hooks.py:260] loss = 62.745335, step = 355 (0.391 sec)




I0819 21:12:04.772258   920 basic_session_run_hooks.py:692] global_step/sec: 194.598




I0819 21:12:04.776246   920 basic_session_run_hooks.py:260] loss = 74.84384, step = 414 (0.303 sec)




I0819 21:12:05.049516   920 basic_session_run_hooks.py:606] Saving checkpoints for 472 into ./cahce/model\model.ckpt.




I0819 21:16:01.762053   920 basic_session_run_hooks.py:692] global_step/sec: 0.248955




I0819 21:16:01.847005   920 basic_session_run_hooks.py:260] loss = 52.30132, step = 473 (237.071 sec)




I0819 21:16:02.322164   920 basic_session_run_hooks.py:692] global_step/sec: 105.336




I0819 21:16:02.331194   920 basic_session_run_hooks.py:260] loss = 73.91003, step = 532 (0.483 sec)




I0819 21:16:02.717988   920 basic_session_run_hooks.py:692] global_step/sec: 149.056




I0819 21:16:02.727998   920 basic_session_run_hooks.py:260] loss = 63.04612, step = 591 (0.398 sec)




I0819 21:16:03.123738   920 basic_session_run_hooks.py:692] global_step/sec: 145.41




I0819 21:16:03.128795   920 basic_session_run_hooks.py:260] loss = 70.77345, step = 650 (0.401 sec)




I0819 21:16:03.529737   920 basic_session_run_hooks.py:606] Saving checkpoints for 708 into ./cahce/model\model.ckpt.




I0819 21:21:54.632729   920 basic_session_run_hooks.py:692] global_step/sec: 0.167848




I0819 21:21:54.770362   920 basic_session_run_hooks.py:260] loss = 73.92205, step = 709 (351.636 sec)




I0819 21:21:55.249082   920 basic_session_run_hooks.py:692] global_step/sec: 95.5699




I0819 21:21:55.256063   920 basic_session_run_hooks.py:260] loss = 64.50162, step = 768 (0.492 sec)




I0819 21:21:55.845452   920 basic_session_run_hooks.py:692] global_step/sec: 98.9318




I0819 21:21:55.849440   920 basic_session_run_hooks.py:260] loss = 78.649864, step = 827 (0.593 sec)




I0819 21:21:56.288303   920 basic_session_run_hooks.py:692] global_step/sec: 133.228




I0819 21:21:56.294287   920 basic_session_run_hooks.py:260] loss = 47.52865, step = 886 (0.445 sec)




I0819 21:21:56.669284   920 basic_session_run_hooks.py:606] Saving checkpoints for 944 into ./cahce/model\model.ckpt.




I0819 21:25:48.108311   920 basic_session_run_hooks.py:692] global_step/sec: 0.254509




I0819 21:25:48.204054   920 basic_session_run_hooks.py:260] loss = 55.51477, step = 945 (231.903 sec)




I0819 21:25:48.697285   920 basic_session_run_hooks.py:692] global_step/sec: 100.005




I0819 21:25:48.701274   920 basic_session_run_hooks.py:260] loss = 56.159492, step = 1004 (0.504 sec)




I0819 21:25:48.995488   920 basic_session_run_hooks.py:692] global_step/sec: 197.852




I0819 21:25:48.999483   920 basic_session_run_hooks.py:260] loss = 57.91274, step = 1063 (0.298 sec)




I0819 21:25:49.319621   920 basic_session_run_hooks.py:692] global_step/sec: 182.024




I0819 21:25:49.323609   920 basic_session_run_hooks.py:260] loss = 80.56822, step = 1122 (0.324 sec)




I0819 21:25:49.660749   920 basic_session_run_hooks.py:606] Saving checkpoints for 1180 into ./cahce/model\model.ckpt.




W0819 21:25:49.891158   920 deprecation.py:323] From D:\01.Programming\PycharmProjects\Recommenders-movielens\venv\lib\site-packages\tensorflow\python\training\saver.py:960: remove_checkpoint (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file APIs to delete files with this prefix.




I0819 21:29:45.310930   920 basic_session_run_hooks.py:692] global_step/sec: 0.250009




I0819 21:29:45.377262   920 basic_session_run_hooks.py:260] loss = 52.98465, step = 1181 (236.054 sec)




I0819 21:29:45.381253   920 basic_session_run_hooks.py:606] Saving checkpoints for 1181 into ./cahce/model\model.ckpt.




I0819 21:29:48.237787   920 estimator.py:377] Loss for final step: 52.98465.




In [19]:
# if EVALUATE_WHILE_TRAINING:
#     for m, v in evaluation_logger.get_log().items():
#         # pm.record("eval_{}".format(m), v)

AttributeError: module 'papermill' has no attribute 'record'

In [35]:
if len(RATING_METRICS) > 0:
    predictions = list(model.predict(input_fn=tf_utils.pandas_input_fn(df=test)))
    prediction_df = test.drop(RATING_COL, axis=1)
    prediction_df['prediction'] = [p['predictions'][0] for p in predictions]    
    
    print("prediction_df ::: \n", prediction_df[:10])
    
    rating_results = {}
    for m in RATING_METRICS:
        fn = getattr(python_evaluation, m)
        result = fn(test, prediction_df, **cols)
        # pm.record(m, result)
        rating_results[m] = result
    print(rating_results)

I0819 21:34:43.150696   920 estimator.py:1155] Calling model_fn.




I0819 21:34:44.746430   920 estimator.py:1157] Done calling model_fn.




I0819 21:34:45.176280   920 monitored_session.py:240] Graph was finalized.




I0819 21:34:45.190243   920 saver.py:1280] Restoring parameters from ./cahce/model\model.ckpt-1181




I0819 21:34:45.352813   920 session_manager.py:500] Running local_init_op.




I0819 21:34:45.451545   920 session_manager.py:502] Done running local_init_op.




prediction_df ::: 


 

       UserId  MovieId   timestamp  \
67037     551    34162  1504925858   
42175     232    59421  1217541086   
93850     288     8880  1095780696   
6187      414     1080   961595418   
12229     577     2406   945965771   
7433      502     1206  1111757634   
53802     137     6787  1204859228   
65098      97     4025  1047481289   
68041     490    88125  1324376714   
11854     593     2291  1181008449   

                                               MovieName  \
67037                            Wedding Crashers (2005)   
42175                    What Happens in Vegas... (2008)   
93850                                        Mask (1985)   
6187                 Monty Python's Life of Brian (1979)   
12229                         Romancing the Stone (1984)   
7433                          Clockwork Orange, A (1971)   
53802                     All the President's Men (1976)   
65098                           Miss Congeniality (2000)   
68041  Harry Potter and the Deathly Hallo




{'rmse': 0.9417435397550733, 'mae': 0.7479035569285215, 'rsquared': 0.17725867412193774, 'exp_var': 0.1970497423061145}




In [36]:
if len(RANKING_METRICS) > 0:
    predictions = list(model.predict(input_fn=tf_utils.pandas_input_fn(df=ranking_pool)))
    prediction_df = ranking_pool.copy()
    prediction_df['prediction'] = [p['predictions'][0] for p in predictions]

    print("prediction_df ::: \n", prediction_df[:10])
    
    ranking_results = {}
    for m in RANKING_METRICS:
        fn = getattr(python_evaluation, m)
        result = fn(test, prediction_df, **{**cols, 'k': TOP_K})
        # pm.record(m, result)
        ranking_results[m] = result
    print(ranking_results)

I0819 21:36:41.603063   920 estimator.py:1155] Calling model_fn.




I0819 21:36:42.622337   920 estimator.py:1157] Done calling model_fn.




I0819 21:36:43.207771   920 monitored_session.py:240] Graph was finalized.




I0819 21:36:43.214753   920 saver.py:1280] Restoring parameters from ./cahce/model\model.ckpt-1181




I0819 21:36:43.317477   920 session_manager.py:500] Running local_init_op.




I0819 21:36:43.369339   920 session_manager.py:502] Done running local_init_op.




prediction_df ::: 


 

   UserId  MovieId                                             Genres  \
0     504    68159  [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, ...   
1     250    26567  [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
2     351    58156  [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
3     469    92420  [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
4     180    70183  [0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...   
5     417     1292  [0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...   
6      55   157108  [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
7     390     5502  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...   
8     519    44597  [0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, ...   
9      31     1753  [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   

   prediction  
0    3.686002  
1    3.764728  
2    3.110615  
3    3.200232  
4    3.166072  
5    3.792621  
6    2.883960  
7    3.274301  
8    3.854220  
9    3.254479  




{'map_at_k': 0.00017544752711345915, 'ndcg_at_k': 0.0037159433936839866, 'precision_at_k': 0.004754098360655739, 'recall_at_k': 0.0008993024413315052}




In [38]:
os.makedirs(EXPORT_DIR_BASE, exist_ok=True)

In [39]:
tf.logging.set_verbosity(tf.logging.ERROR)

train_rcvr_fn = tf.contrib.estimator.build_supervised_input_receiver_fn_from_input_fn(
    train_fn
)
eval_rcvr_fn = tf.contrib.estimator.build_supervised_input_receiver_fn_from_input_fn(
    tf_utils.pandas_input_fn(df=test, y_col=RATING_COL)
)
serve_rcvr_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(
    tf.feature_column.make_parse_example_spec(wide_columns+deep_columns)
)
rcvr_fn_map = {
    tf.estimator.ModeKeys.TRAIN: train_rcvr_fn,
    tf.estimator.ModeKeys.EVAL: eval_rcvr_fn,
    tf.estimator.ModeKeys.PREDICT: serve_rcvr_fn
}

export_dir = tf.contrib.estimator.export_all_saved_models(
    model,
    export_dir_base=EXPORT_DIR_BASE,
    input_receiver_fn_map=rcvr_fn_map
)
# pm.record('saved_model_dir', str(export_dir))
print("Model exported to", str(export_dir))

Model exported to

 

b'./outputs/model/100k\\1566218461'




In [40]:
###########################################
# 내가 매긴 평가를 바탕으로 추천
# my_ratings.csv에 각 영화에 대한 평점 입력
# UserId : 999
###########################################
# 데이터 불러오기
df_my_rating = pd.read_csv('./data/100K_Latest/my_ratings.csv', 
                        sep=",", skiprows=1, header=None, 
                        names=[USER_COL, ITEM_COL, RATING_COL, 'timestamp', 'movieName'], engine='python')


df_my_test = df_my_rating[df_my_rating['Rating'].isnull()]

# movieNmae 컬럼 삭제
# del df_my_rating['movieName']
# 평가하지 않은 영화 삭제

df_my_test = df_my_rating[df_my_rating['Rating'].isnull()]

# print("len(df_my_rating)", df_my_rating.shape[0])
# print("df_my_rating \n", df_my_rating)

# 평가하지 않은 영화 삭제
df_my_train = df_my_rating.dropna(axis=0)
print("len(df_my_train)", df_my_train.shape[0])

df_genre = df_data.drop_duplicates(ITEM_COL)[[ITEM_COL, ITEM_FEAT_COL]]
df_my_train = pd.merge(df_my_train, df_genre, on="MovieId")
df_my_test = pd.merge(df_my_test, df_genre, on="MovieId")


print('train \n', train.head())
print('df_genre \n', df_genre.head())
print('df_my_train \n', df_my_train.head())
print('df_my_test \n', df_my_test.head())

# 기존 데이터와 concatenate
# df_my_train = pd.concat([df_my_rating, train], sort=False)
# 
# print('train \n', train.head())
# print("df_my_train", df_my_train.head())

len(df_my_train)

 

50




train 


 

        UserId  MovieId  Rating   timestamp                   MovieName  \
97717      414     4985     1.0  1008691553               Sheena (1984)   
100124     599     6860     2.0  1519345916             Mobsters (1991)   
25952      387      300     3.0  1154681852            Quiz Show (1994)   
25871      414      266     5.0   961512595  Legends of the Fall (1994)   
97255      477    59141     4.5  1241396097        Son of Rambow (2007)   

                                                   Genres  
97717   [0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...  
100124  [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, ...  
25952   [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...  
25871   [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...  
97255   [0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...  




df_genre 


 

     MovieId                                             Genres
0          1  [0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...
215        3  [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
267        6  [0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...
369       47  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...
572       50  [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, ...




df_my_train 


 

   UserId  MovieId  Rating  timestamp                   movieName  \
0     999        1     4.0  964982703            Toy Story (1995)   
1     999        2     4.0  964983148              Jumanji (1995)   
2     999       36     4.0  964997388     Dead Man Walking (1995)   
3     999       48     3.0  965002283           Pocahontas (1995)   
4     999       50     5.0  965003173  Usual Suspects, The (1995)   

                                              Genres  
0  [0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...  
1  [0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...  
2  [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, ...  
3  [0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, ...  
4  [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, ...  




df_my_test 


 

   UserId  MovieId  Rating  timestamp                           movieName  \
0     999        3     NaN  964983593             Grumpier Old Men (1995)   
1     999        4     NaN  964984038            Waiting to Exhale (1995)   
2     999        5     NaN  964984483  Father of the Bride Part II (1995)   
3     999        6     NaN  964984928                         Heat (1995)   
4     999        7     NaN  964985373                      Sabrina (1995)   

                                              Genres  
0  [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
1  [0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...  
2  [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
3  [0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...  
4  [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  




In [41]:
##########################
# 모델 추가 학습 학습 시작
##########################
my_train_fn = tf_utils.pandas_input_fn(
    df=df_my_train,
    y_col=RATING_COL,
    batch_size=BATCH_SIZE,
    num_epochs=None,  # None == run forever. We use steps=TRAIN_STEPS instead.
    shuffle=True,
    num_threads=num_cpus-1
)

try:
    model.train(
        input_fn=my_train_fn,
        hooks=hooks,
        steps=train_steps
    )
except tf.train.NanLossDuringTrainingError:
    raise ValueError(
        """Training stopped with NanLossDuringTrainingError.
        Try other optimizers, smaller batch size and/or smaller learning rate."""
    )

hooks ::: 

 

[<tf_utils._TrainLogHook object at 0x0000015E0C03F358>, <tf_utils._TrainLogHook object at 0x0000015E0C03F668>]




steps ::: 

 

1181




max_steps ::: 

 

None




saving_listeners ::: 

 

None




In [44]:
##############################
# 사용자에게 추천할 영화 Top-k
##############################
# 어떤 영화를 추천했는지 보기 위해 pandas 옵션 세팅
pd.set_option('display.max_columns', None)


predictions = list(model.predict(input_fn=tf_utils.pandas_input_fn(df=df_my_test)))
prediction_df = df_my_test.drop(RATING_COL, axis=1)
df_my_test['prediction'] = [p['predictions'][0] for p in predictions]    
print(df_my_test.sort_values(['prediction'], ascending=False).head())   


# tsne_movie_embeddings(model)

      UserId  MovieId  Rating  timestamp  \
4593     999     6862     NaN  967032818   
704      999      929     NaN  965298653   
9628     999   184257     NaN  969294753   
3533     999     4848     NaN  966559783   
9145     999   150548     NaN  969073588   

                                  movieName  \
4593                     Out of Time (2003)   
704            Foreign Correspondent (1940)   
9628               Making a Murderer (2015)   
3533                Mulholland Drive (2001)   
9145  Sherlock: The Abominable Bride (2016)   

                                                 Genres  prediction  
4593  [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, ...    4.991636  
704   [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, ...    4.974196  
9628  [0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...    4.971679  
3533  [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, ...    4.965366  
9145  [0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, ...    4.953690  




#### TO-DO LIST
1. Wide & Deep 모델 내부 로직 확인
2. 데이터 분석 시각화 구현(연관도가 어떻게 측정된 것인지)
3. 더 많은 SELF VALIDATION(정말 나에게 추천될만한 영화가 추천된 것인지)
4. 더 많은 데이터로 학습(1M 까지?)
5. Tensorflow high-level API 공부( tf.estimator 공부!!!)