### Wide & Deep 튜토리얼 버전

In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd
import sklearn.preprocessing
from python_splitters import python_random_split


In [45]:
#######################
# Dataset 100K 용 세팅
#######################

# Load each data set (users, movies, and ratings).
users_cols = ['userid', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_csv('./data/100K/u.user', sep='|', names=users_cols, encoding='latin-1')

ratings_cols = ['userid', 'movieid', 'rating', 'timestamp']
ratings = pd.read_csv('./data/100K/u.data', sep='\t', names=ratings_cols, encoding='latin-1')

# The movies file contains a binary feature for each genre.
genre_cols = [
    "genre_unknown", "action", "adventure", "animation", "children", "comedy",
    "crime", "documentary", "drama", "fantasy", "film-noir", "horror",
    "musical", "mystery", "romance", "sci-fi", "thriller", "war", "western"
]
movies_cols = [
    'movieid', 'title', 'release_date', "video_release_date", "imdb_url"
] + genre_cols
movies = pd.read_csv(
    './data/100K/u.item', sep='|', names=movies_cols, encoding='latin-1')

movies = movies.dropna(subset=['release_date'])

users["userid"] = users['userid'].apply(lambda x: str(x))
users["zip_code"] = users['zip_code'].apply(lambda x: str(x))
ratings["userid"] = ratings['userid'].apply(lambda x: str(x))
ratings["movieid"] = ratings['movieid'].apply(lambda x: str(x))
movies["movieid"] = movies['movieid'].apply(lambda x: str(x))
movies["release_date"] = movies['release_date'].apply(lambda x: int(str(x).split('-')[-1]))

# Create one merged DataFrame containing all the movielens data.
df_data = ratings.merge(movies, on='movieid').merge(users, on='userid')

# df_data["release_date"] = df_data['release_date'].apply(lambda x: int(x))

df_data = df_data.drop('title', axis=1)
df_data = df_data.drop('video_release_date', axis=1)
df_data = df_data.drop('imdb_url', axis=1) 

print("df_data::: \n", df_data.head())

df_data::: 
   userid movieid  rating  timestamp  release_date  genre_unknown  action  \
0    196     242       3  881250949          1997              0       0   
1    196     257       2  881251577          1997              0       1   
2    196     111       4  881251793          1996              0       0   
3    196      25       4  881251955          1996              0       0   
4    196     382       4  881251843          1994              0       0   

   adventure  animation  children  ...  mystery  romance  sci-fi  thriller  \
0          0          0         0  ...        0        0       0         0   
1          1          0         0  ...        0        0       1         0   
2          0          0         0  ...        0        1       0         0   
3          0          0         0  ...        0        0       0         0   
4          0          0         0  ...        0        0       0         0   

   war  western  age  gender  occupation  zip_code  
0    0  

In [46]:
# Categorical base columns.
userid = tf.contrib.layers.sparse_column_with_hash_bucket("userid", hash_bucket_size=100)
movieid = tf.contrib.layers.sparse_column_with_hash_bucket("movieid", hash_bucket_size=100)
gender = tf.contrib.layers.sparse_column_with_keys(column_name="gender", keys=["M", "W"])
# genre = tf.contrib.layers.sparse_column_with_keys(column_name="genre", keys=[
#   "genre_unknown", "action", "adventure", "animation", "children", "comedy",
#     "crime", "documentary", "drama", "fantasy", "film-noir", "horror",
#     "musical", "mystery", "romance", "sci-fi", "thriller", "war", "western"])
occupation = tf.contrib.layers.sparse_column_with_hash_bucket("occupation", hash_bucket_size=1000)
zip_code = tf.contrib.layers.sparse_column_with_hash_bucket("zip_code", hash_bucket_size=100)

# Continuous base columns.
age = tf.contrib.layers.real_valued_column("age")
age_buckets = tf.contrib.layers.bucketized_column(age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
release_date = tf.contrib.layers.real_valued_column("release_date")

genre_unknown = tf.contrib.layers.real_valued_column("genre_unknown")
action = tf.contrib.layers.real_valued_column("action")
adventure = tf.contrib.layers.real_valued_column("adventure")
animation = tf.contrib.layers.real_valued_column("animation")
children = tf.contrib.layers.real_valued_column("children")
comedy = tf.contrib.layers.real_valued_column("comedy")
crime = tf.contrib.layers.real_valued_column("crime")
documentary = tf.contrib.layers.real_valued_column("documentary")
drama = tf.contrib.layers.real_valued_column("drama")
fantasy = tf.contrib.layers.real_valued_column("fantasy")
filmnoir = tf.contrib.layers.real_valued_column("film-noir")
horror = tf.contrib.layers.real_valued_column("horror")
musical = tf.contrib.layers.real_valued_column("musical")
mystery = tf.contrib.layers.real_valued_column("mystery")
romance = tf.contrib.layers.real_valued_column("romance")
scifi = tf.contrib.layers.real_valued_column("sci-fi")
thriller = tf.contrib.layers.real_valued_column("thriller")
war = tf.contrib.layers.real_valued_column("war")
western = tf.contrib.layers.real_valued_column("western")

In [47]:
wide_columns = [userid, movieid, gender, occupation, zip_code, release_date, age_buckets, 
                genre_unknown, action, adventure, animation, children, comedy, crime, documentary, drama, 
                fantasy, filmnoir, horror, musical, mystery, romance, scifi, thriller, war, western,  
  tf.contrib.layers.crossed_column([userid, movieid], hash_bucket_size=int(1e4))
]

In [48]:
deep_columns = [
  tf.contrib.layers.embedding_column(userid, dimension=8),
  tf.contrib.layers.embedding_column(movieid, dimension=8),
  # tf.contrib.layers.embedding_column(genre, dimension=8),
  tf.contrib.layers.embedding_column(gender, dimension=8),
  tf.contrib.layers.embedding_column(occupation, dimension=8),
  tf.contrib.layers.embedding_column(zip_code, dimension=8),
  age, release_date, genre_unknown, action, adventure, animation, children, comedy, crime, documentary, drama, 
  fantasy, filmnoir, horror, musical, mystery, romance, scifi, thriller, war, western
]

W0819 20:53:19.848343  1156 feature_column.py:1091] The default stddev value of initializer was changed from "1/sqrt(vocab_size)" to "1/sqrt(dimension)" in core implementation (tf.feature_column.embedding_column).


W0819 20:53:19.850338  1156 feature_column.py:1091] The default stddev value of initializer was changed from "1/sqrt(vocab_size)" to "1/sqrt(dimension)" in core implementation (tf.feature_column.embedding_column).


W0819 20:53:19.851335  1156 feature_column.py:1091] The default stddev value of initializer was changed from "1/sqrt(vocab_size)" to "1/sqrt(dimension)" in core implementation (tf.feature_column.embedding_column).


W0819 20:53:19.853331  1156 feature_column.py:1091] The default stddev value of initializer was changed from "1/sqrt(vocab_size)" to "1/sqrt(dimension)" in core implementation (tf.feature_column.embedding_column).


In [49]:
import tempfile
model_dir = tempfile.mkdtemp()
m = tf.contrib.learn.DNNLinearCombinedClassifier(
    model_dir=model_dir,
    linear_feature_columns=wide_columns,
    dnn_feature_columns=deep_columns,
    dnn_hidden_units=[100, 50])

In [None]:
import pandas as pd
import urllib

# Define the column names for the data sets.
COLUMNS = ['userid','movieid','occupation','gender','zip_code','age', 'release_date', "genre_unknown", "action", "adventure", "animation", "children", "comedy",
    "crime", "documentary", "drama", "fantasy", "film-noir", "horror",
    "musical", "mystery", "romance", "sci-fi", "thriller", "war", "western"]
LABEL_COLUMN = 'rating'
CATEGORICAL_COLUMNS = ['userid','movieid','occupation','gender','zip_code']
CONTINUOUS_COLUMNS = ['age', 'release_date', "genre_unknown", "action", "adventure", "animation", "children", "comedy",
    "crime", "documentary", "drama", "fantasy", "film-noir", "horror",
    "musical", "mystery", "romance", "sci-fi", "thriller", "war", "western"]


# Read the training and test data sets into Pandas dataframe.
###############################
# Train, Test 데이터 나누기
###############################
df_train, df_test = python_random_split(
    df_data,
    ratio=0.75,
    seed=42
)

print("Train = {}, test = {}".format(len(df_train), len(df_test)))

# df_train = pd.read_csv(train_file, names=COLUMNS, skipinitialspace=True)
# df_test = pd.read_csv(test_file, names=COLUMNS, skipinitialspace=True, skiprows=1)
df_train[LABEL_COLUMN] = df_train['rating']
df_test[LABEL_COLUMN] = df_test['rating']


def input_fn(df):
  # Creates a dictionary mapping from each continuous feature column name (k) to
  # the values of that column stored in a constant Tensor.

  continuous_cols = {k: tf.constant(df[k].values)
                     for k in CONTINUOUS_COLUMNS}

  # Creates a dictionary mapping from each categorical feature column name (k)
  # to the values of that column stored in a tf.SparseTensor.
  categorical_cols = {k: tf.SparseTensor(
      indices=[[i, 0] for i in range(df[k].size)],
      values=df[k].values,
      dense_shape=[df[k].size, 1])
                      for k in CATEGORICAL_COLUMNS}
  
  print("continuous_cols.items() ::: ", continuous_cols.items())
  print("categorical_cols.items() ::: ", categorical_cols.items())
  
  # Merges the two dictionaries into one.
  
  # feature_cols = dict(continuous_cols.items() + categorical_cols.items())
  feature_cols = {**continuous_cols, **categorical_cols}
  
  # Converts the label column into a constant Tensor.
  label = tf.constant(df[LABEL_COLUMN].values)
  
  # Returns the feature columns and the label.
  return feature_cols, label

def train_input_fn():
  return input_fn(df_train)

def eval_input_fn():
  return input_fn(df_test)

print("train_input_fn ::: ", train_input_fn())

Train = 74993, test = 24998


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [44]:
m.fit(input_fn=train_input_fn, steps=200)
results = m.evaluate(input_fn=eval_input_fn, steps=1)
for key in sorted(results):
  print("%s: %s" % (key, results[key]))

continuous_cols.items() :::  dict_items([('age', <tf.Tensor 'Const:0' shape=(74993,) dtype=int64>), ('release_date', <tf.Tensor 'Const_1:0' shape=(74993,) dtype=int64>), ('genre_unknown', <tf.Tensor 'Const_2:0' shape=(74993,) dtype=int64>), ('action', <tf.Tensor 'Const_3:0' shape=(74993,) dtype=int64>), ('adventure', <tf.Tensor 'Const_4:0' shape=(74993,) dtype=int64>), ('animation', <tf.Tensor 'Const_5:0' shape=(74993,) dtype=int64>), ('children', <tf.Tensor 'Const_6:0' shape=(74993,) dtype=int64>), ('comedy', <tf.Tensor 'Const_7:0' shape=(74993,) dtype=int64>), ('crime', <tf.Tensor 'Const_8:0' shape=(74993,) dtype=int64>), ('documentary', <tf.Tensor 'Const_9:0' shape=(74993,) dtype=int64>), ('drama', <tf.Tensor 'Const_10:0' shape=(74993,) dtype=int64>), ('fantasy', <tf.Tensor 'Const_11:0' shape=(74993,) dtype=int64>), ('film-noir', <tf.Tensor 'Const_12:0' shape=(74993,) dtype=int64>), ('horror', <tf.Tensor 'Const_13:0' shape=(74993,) dtype=int64>), ('musical', <tf.Tensor 'Const_14:0' 

W0819 20:47:25.275065  1156 feature_column.py:1674] Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.


W0819 20:47:25.285043  1156 feature_column.py:1674] Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.


W0819 20:47:25.292018  1156 feature_column.py:1674] Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.


W0819 20:47:25.302992  1156 feature_column.py:1674] Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.


W0819 20:47:25.310970  1156 feature_column.py:1674] Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.


W0819 20:47:25.319945  1156 feature_column.py:1674] Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.


W0819 20:47:25.325927  1156 feature_column.py:1674] Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.


W0819 20:47:25.333908  1156 feature_column.py:1674] Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.


W0819 20:47:25.344878  1156 feature_column.py:1674] Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.


W0819 20:47:25.352856  1156 feature_column.py:1674] Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.


W0819 20:47:25.359838  1156 feature_column.py:1674] Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.


W0819 20:47:25.367819  1156 feature_column.py:1674] Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.


W0819 20:47:25.375796  1156 feature_column.py:1674] Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.


W0819 20:47:25.384772  1156 feature_column.py:1674] Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.


W0819 20:47:25.390754  1156 feature_column.py:1674] Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.


W0819 20:47:25.397737  1156 feature_column.py:1674] Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.


W0819 20:47:25.405716  1156 feature_column.py:1674] Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.


W0819 20:47:25.414693  1156 feature_column.py:1674] Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.


W0819 20:47:25.421671  1156 feature_column.py:1674] Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.


W0819 20:47:25.428655  1156 feature_column.py:1674] Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.


W0819 20:47:25.438631  1156 feature_column.py:1674] Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.


W0819 20:47:32.931826  1156 head.py:2027] Casting <dtype: 'int64'> labels to bool.


W0819 20:47:33.347482  1156 head.py:2027] Casting <dtype: 'int64'> labels to bool.


W0819 20:47:33.478645  1156 metrics_impl.py:804] Trapezoidal rule is known to produce incorrect PR-AUCs; please switch to "careful_interpolation" instead.


W0819 20:47:33.520532  1156 metrics_impl.py:804] Trapezoidal rule is known to produce incorrect PR-AUCs; please switch to "careful_interpolation" instead.


W0819 20:47:35.349532  1156 deprecation.py:323] From D:\01.Programming\PycharmProjects\Recommenders-movielens\venv\lib\site-packages\tensorflow\python\training\saver.py:1066: get_checkpoint_mtimes (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file utilities to get mtimes.


continuous_cols.items() :::  dict_items([('age', <tf.Tensor 'Const:0' shape=(24998,) dtype=int64>), ('release_date', <tf.Tensor 'Const_1:0' shape=(24998,) dtype=int64>), ('genre_unknown', <tf.Tensor 'Const_2:0' shape=(24998,) dtype=int64>), ('action', <tf.Tensor 'Const_3:0' shape=(24998,) dtype=int64>), ('adventure', <tf.Tensor 'Const_4:0' shape=(24998,) dtype=int64>), ('animation', <tf.Tensor 'Const_5:0' shape=(24998,) dtype=int64>), ('children', <tf.Tensor 'Const_6:0' shape=(24998,) dtype=int64>), ('comedy', <tf.Tensor 'Const_7:0' shape=(24998,) dtype=int64>), ('crime', <tf.Tensor 'Const_8:0' shape=(24998,) dtype=int64>), ('documentary', <tf.Tensor 'Const_9:0' shape=(24998,) dtype=int64>), ('drama', <tf.Tensor 'Const_10:0' shape=(24998,) dtype=int64>), ('fantasy', <tf.Tensor 'Const_11:0' shape=(24998,) dtype=int64>), ('film-noir', <tf.Tensor 'Const_12:0' shape=(24998,) dtype=int64>), ('horror', <tf.Tensor 'Const_13:0' shape=(24998,) dtype=int64>), ('musical', <tf.Tensor 'Const_14:0' 

W0819 20:49:06.222658  1156 feature_column.py:1674] Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.


W0819 20:49:06.229640  1156 feature_column.py:1674] Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.


W0819 20:49:06.241608  1156 feature_column.py:1674] Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.


W0819 20:49:06.249587  1156 feature_column.py:1674] Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.


W0819 20:49:06.255569  1156 feature_column.py:1674] Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.


W0819 20:49:06.263550  1156 feature_column.py:1674] Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.


W0819 20:49:06.269534  1156 feature_column.py:1674] Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.


W0819 20:49:06.275518  1156 feature_column.py:1674] Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.


W0819 20:49:06.282503  1156 feature_column.py:1674] Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.


W0819 20:49:06.290477  1156 feature_column.py:1674] Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.


W0819 20:49:06.298456  1156 feature_column.py:1674] Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.


W0819 20:49:06.304439  1156 feature_column.py:1674] Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.


W0819 20:49:06.311423  1156 feature_column.py:1674] Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.


W0819 20:49:06.319401  1156 feature_column.py:1674] Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.


W0819 20:49:06.325385  1156 feature_column.py:1674] Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.


W0819 20:49:06.333363  1156 feature_column.py:1674] Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.


W0819 20:49:06.339857  1156 feature_column.py:1674] Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.


W0819 20:49:06.347836  1156 feature_column.py:1674] Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.


W0819 20:49:06.354818  1156 feature_column.py:1674] Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.


W0819 20:49:06.362800  1156 feature_column.py:1674] Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.


W0819 20:49:06.369777  1156 feature_column.py:1674] Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.


W0819 20:49:07.805510  1156 head.py:2027] Casting <dtype: 'int64'> labels to bool.


W0819 20:49:08.006972  1156 head.py:2027] Casting <dtype: 'int64'> labels to bool.


W0819 20:49:08.112887  1156 metrics_impl.py:804] Trapezoidal rule is known to produce incorrect PR-AUCs; please switch to "careful_interpolation" instead.


W0819 20:49:08.152785  1156 metrics_impl.py:804] Trapezoidal rule is known to produce incorrect PR-AUCs; please switch to "careful_interpolation" instead.


accuracy: 0.0600048
accuracy/baseline_label_mean: 3.5290024
accuracy/threshold_0.500000_mean: 0.0600048
auc: 0.0
auc_precision_recall: 1.0
global_step: 254
labels/actual_label_mean: 3.5290024
labels/prediction_mean: 1.0
loss: -24591164.0
precision/positive_threshold_0.500000_mean: 1.0
recall/positive_threshold_0.500000_mean: 1.0
