### Wide & Deep 튜토리얼 버전

In [2]:
import tensorflow as tf

In [None]:
#######################
# Dataset 100K 용 세팅
#######################
import numpy as np

# Load each data set (users, movies, and ratings).
users_cols = ['userid', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_csv('./data/100K/u.user', sep='|', names=users_cols, encoding='latin-1')

ratings_cols = ['userid', 'movieid', 'rating', 'timestamp']
ratings = pd.read_csv('./data/100K/u.data', sep='\t', names=ratings_cols, encoding='latin-1')

# The movies file contains a binary feature for each genre.
genre_cols = [
    "genre_unknown", "Action", "Adventure", "Animation", "Children", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"
]
movies_cols = [
    ITEM_COL, 'title', 'release_date', "video_release_date", "imdb_url"
] + genre_cols
movies = pd.read_csv(
    './data/100K/u.item', sep='|', names=movies_cols, encoding='latin-1')

# Since the ids start at 1, we shift them to start at 0.
users[USER_COL] = users[USER_COL].apply(lambda x: str(x-1))
movies[ITEM_COL] = movies[ITEM_COL].apply(lambda x: str(x-1))
movies["year"] = movies['release_date'].apply(lambda x: str(x).split('-')[-1])
ratings[ITEM_COL] = ratings[ITEM_COL].apply(lambda x: str(x-1))
ratings[USER_COL] = ratings[USER_COL].apply(lambda x: str(x-1))
ratings[RATING_COL] = ratings[RATING_COL].apply(lambda x: float(x))

# Compute the number of movies to which a genre is assigned.
genre_occurences = movies[genre_cols].sum().to_dict()

# Since some movies can belong to more than one genre, we create different
# 'genre' columns as follows:
# - all_genres: all the active genres of the movie.
# - genre: randomly sampled from the active genres.
def mark_genres(movies, genres):
  def get_random_genre(gs):
    active = [genre for genre, g in zip(genres, gs) if g==1]
    if len(active) == 0:
      return 'Other'
    return np.random.choice(active)
  def get_all_genres(gs):
    active = [genre for genre, g in zip(genres, gs) if g==1]
    if len(active) == 0:
      return 'Other'
    return '|'.join(active)
  movies['Genres'] = [
      get_random_genre(gs) for gs in zip(*[movies[genre] for genre in genres])]
  movies['Genres_string'] = [
      get_all_genres(gs) for gs in zip(*[movies[genre] for genre in genres])]

mark_genres(movies, genre_cols)

# Create one merged DataFrame containing all the movielens data.
movielens = ratings.merge(movies, on=ITEM_COL).merge(users, on=USER_COL)

movielens = movielens.fillna(0)

print("movielens::: \n", movielens.head())
df_data = movielens


###############################
# 데이터 전처리
# 2-1 Genre 인코딩
###############################
# Encode 'genres' into int array (multi-hot representation) to use as item features
genres_encoder = sklearn.preprocessing.MultiLabelBinarizer()
df_data['Genres'] = genres_encoder.fit_transform(
    df_data['Genres_string'].apply(lambda s: s.split("|"))
).tolist()

###############################
# 데이터 전처리
# 2-2 sex 인코딩
###############################
le = sklearn.preprocessing.LabelEncoder()
df_data['sex'] = le.fit_transform(df_data['sex'])


# 기존 트레인 데이터 : UserId  MovieId  Rating   timestamp  MovieName  Genres
df_data['MovieName'] = df_data['title']

df_data_new = df_data[['userid','movieid','rating','movie_name','genres', 'sex', 'age']]
df_data = df_data_new
print("df_data::: \n", df_data.head())

In [5]:
# Categorical base columns.
gender = tf.contrib.layers.sparse_column_with_keys(column_name="gender", keys=["female", "male"])
genre = tf.contrib.layers.sparse_column_with_keys(column_name="genre", keys=[
  "genre_unknown", "Action", "Adventure", "Animation", "Children", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"])
occupation = tf.contrib.layers.sparse_column_with_hash_bucket("occupation", hash_bucket_size=1000)
zip_code = tf.contrib.layers.sparse_column_with_hash_bucket("zip_code", hash_bucket_size=100)

# Continuous base columns.
age = tf.contrib.layers.real_valued_column("age")
age_buckets = tf.contrib.layers.bucketized_column(age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
release_date = tf.contrib.layers.real_valued_column("release_date")


In [7]:
wide_columns = [
  gender, genre, occupation, zip_code, release_date, age_buckets,   
  tf.contrib.layers.crossed_column([gender, genre], hash_bucket_size=int(1e4)),
  tf.contrib.layers.crossed_column([age_buckets, genre], hash_bucket_size=int(1e4)),
  tf.contrib.layers.crossed_column([occupation, genre], hash_bucket_size=int(1e6))]

In [8]:
deep_columns = [
  tf.contrib.layers.embedding_column(gender, dimension=8),
  tf.contrib.layers.embedding_column(genre, dimension=8),
  tf.contrib.layers.embedding_column(occupation, dimension=8),
  tf.contrib.layers.embedding_column(zip_code, dimension=8),
  age, release_date
]

W0813 20:17:45.447030 18472 feature_column.py:1091] The default stddev value of initializer was changed from "1/sqrt(vocab_size)" to "1/sqrt(dimension)" in core implementation (tf.feature_column.embedding_column).


W0813 20:17:45.449023 18472 feature_column.py:1091] The default stddev value of initializer was changed from "1/sqrt(vocab_size)" to "1/sqrt(dimension)" in core implementation (tf.feature_column.embedding_column).


W0813 20:17:45.451018 18472 feature_column.py:1091] The default stddev value of initializer was changed from "1/sqrt(vocab_size)" to "1/sqrt(dimension)" in core implementation (tf.feature_column.embedding_column).


W0813 20:17:45.453011 18472 feature_column.py:1091] The default stddev value of initializer was changed from "1/sqrt(vocab_size)" to "1/sqrt(dimension)" in core implementation (tf.feature_column.embedding_column).


In [9]:
import tempfile
model_dir = tempfile.mkdtemp()
m = tf.contrib.learn.DNNLinearCombinedClassifier(
    model_dir=model_dir,
    linear_feature_columns=wide_columns,
    dnn_feature_columns=deep_columns,
    dnn_hidden_units=[100, 50])

W0813 20:18:00.824897 18472 deprecation.py:573] From <ipython-input-9-3507e962daab>:7: calling DNNLinearCombinedClassifier.__init__ (from tensorflow.contrib.learn.python.learn.estimators.dnn_linear_combined) with fix_global_step_increment_bug=False is deprecated and will be removed after 2017-04-15.
Instructions for updating:
Please set fix_global_step_increment_bug=True and update training steps in your pipeline. See pydoc for details.


W0813 20:18:00.826891 18472 deprecation.py:323] From D:\01.Programming\PycharmProjects\Recommenders-movielens\venv\lib\site-packages\tensorflow\contrib\learn\python\learn\estimators\dnn_linear_combined.py:676: multi_class_head (from tensorflow.contrib.learn.python.learn.estimators.head) is deprecated and will be removed in a future version.
Instructions for updating:
Please switch to tf.contrib.estimator.*_head.


W0813 20:18:00.833874 18472 deprecation.py:323] From D:\01.Programming\PycharmProjects\Recommenders-movielens\venv\lib\site-packages\tensorflow\contrib\learn\python\learn\estimators\estimator.py:1179: BaseEstimator.__init__ (from tensorflow.contrib.learn.python.learn.estimators.estimator) is deprecated and will be removed in a future version.
Instructions for updating:
Please replace uses of any Estimator from tf.contrib.learn with an Estimator from tf.estimator.*


W0813 20:18:00.835867 18472 deprecation.py:323] From D:\01.Programming\PycharmProjects\Recommenders-movielens\venv\lib\site-packages\tensorflow\contrib\learn\python\learn\estimators\estimator.py:427: RunConfig.__init__ (from tensorflow.contrib.learn.python.learn.estimators.run_config) is deprecated and will be removed in a future version.
Instructions for updating:
When switching to tf.estimator.Estimator, use tf.estimator.RunConfig instead.


In [None]:
import pandas as pd
import urllib

# Define the column names for the data sets.
COLUMNS = ["gender", "genre", "occupation", "zip_code", "age", "age_buckets", "release_date"]
LABEL_COLUMN = 'Rating'
CATEGORICAL_COLUMNS = ["gender", "genre", "occupation", "zip_code"]
CONTINUOUS_COLUMNS = ["age", "release_date"]


# Read the training and test data sets into Pandas dataframe.
df_data = 
df_train = pd.read_csv(train_file, names=COLUMNS, skipinitialspace=True)
df_test = pd.read_csv(test_file, names=COLUMNS, skipinitialspace=True, skiprows=1)
df_train[LABEL_COLUMN] = (df_train['income_bracket'].apply(lambda x: '>50K' in x)).astype(int)
df_test[LABEL_COLUMN] = (df_test['income_bracket'].apply(lambda x: '>50K' in x)).astype(int)

def input_fn(df):
  # Creates a dictionary mapping from each continuous feature column name (k) to
  # the values of that column stored in a constant Tensor.
  continuous_cols = {k: tf.constant(df[k].values)
                     for k in CONTINUOUS_COLUMNS}
  # Creates a dictionary mapping from each categorical feature column name (k)
  # to the values of that column stored in a tf.SparseTensor.
  categorical_cols = {k: tf.SparseTensor(
      indices=[[i, 0] for i in range(df[k].size)],
      values=df[k].values,
      shape=[df[k].size, 1])
                      for k in CATEGORICAL_COLUMNS}
  # Merges the two dictionaries into one.
  feature_cols = dict(continuous_cols.items() + categorical_cols.items())
  # Converts the label column into a constant Tensor.
  label = tf.constant(df[LABEL_COLUMN].values)
  # Returns the feature columns and the label.
  return feature_cols, label

def train_input_fn():
  return input_fn(df_train)

def eval_input_fn():
  return input_fn(df_test)