In [147]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from scipy.sparse import coo_matrix

from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import mean_absolute_error as MAE

from tensorflow.python.ops import array_ops
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import sparse_tensor
from tensorflow.python.ops import sparse_ops
from tensorflow.contrib.factorization.python.ops import factorization_ops
from tensorflow.contrib.factorization.python.ops import factorization_ops_test_utils

import tensorflow as tf
import pandas as pd
import numpy as np
import math
import os

def get_MAE(output_row, output_col, actual):
    """Mean absolute error of the factorization on the stored entries of `actual`.

    Every stored entry ``actual.data[k]`` located at
    ``(actual.row[k], actual.col[k])`` is compared with the dot product of the
    corresponding row-factor and column-factor vectors.

    Args:
        output_row: 2-D array of row factors, indexed by row id.
        output_col: 2-D array of column factors, indexed by column id.
        actual: sparse matrix in COO form exposing `row`, `col` and `data`.

    Returns:
        The mean of ``|actual - predicted|`` over the stored entries as a
        float, or NaN when `actual` has no stored entries.
    """
    targets = np.asarray(actual.data, dtype=np.float64)
    if targets.size == 0:
        # No observations: the mean is undefined, avoid dividing by zero.
        return float('nan')

    # Gather one factor vector per stored entry, then take row-wise dot
    # products in a single vectorized call instead of a Python loop.
    row_vecs = np.asarray(output_row)[actual.row]
    col_vecs = np.asarray(output_col)[actual.col]
    predictions = np.einsum('ij,ij->i', row_vecs, col_vecs)

    return float(np.mean(np.abs(targets - predictions)))

def get_RMSE(output_row, output_col, actual):
    """Root-mean-square error of the factorization on the stored entries of `actual`.

    Every stored entry ``actual.data[k]`` located at
    ``(actual.row[k], actual.col[k])`` is compared with the dot product of the
    corresponding row-factor and column-factor vectors.

    Args:
        output_row: 2-D array of row factors, indexed by row id.
        output_col: 2-D array of column factors, indexed by column id.
        actual: sparse matrix in COO form exposing `row`, `col` and `data`.

    Returns:
        The square root of the mean squared error over the stored entries as
        a float, or NaN when `actual` has no stored entries.
    """
    targets = np.asarray(actual.data, dtype=np.float64)
    if targets.size == 0:
        # No observations: the mean is undefined, avoid dividing by zero.
        return float('nan')

    # Gather one factor vector per stored entry, then take row-wise dot
    # products in a single vectorized call instead of a Python loop.
    row_vecs = np.asarray(output_row)[actual.row]
    col_vecs = np.asarray(output_col)[actual.col]
    residuals = targets - np.einsum('ij,ij->i', row_vecs, col_vecs)

    return math.sqrt(float(np.mean(residuals * residuals)))

def evaluate_model(sess, train_sparse, test_sparse, row_factor, col_factor):
    """Log RMSE and MAE of the given factors on the train and test matrices.

    For each metric (RMSE first, then MAE) the train value and the test value
    are computed and then both are written through ``tf.logging.info``.

    Args:
        sess: accepted for interface compatibility; not used by this function.
        train_sparse: sparse matrix of training ratings.
        test_sparse: sparse matrix of held-out ratings.
        row_factor: row factor matrix passed on to the metric functions.
        col_factor: column factor matrix passed on to the metric functions.
    """
    metric_table = (
        ('train RMSE = %f', 'test RMSE = %f', get_RMSE),
        ('train MAE = %f', 'test MAE = %f', get_MAE),
    )
    for train_msg, test_msg, metric in metric_table:
        on_train = metric(row_factor, col_factor, train_sparse)
        on_test = metric(row_factor, col_factor, test_sparse)
        tf.logging.info(train_msg % on_train)
        tf.logging.info(test_msg % on_test)


In [148]:
# The ratings file is tab-separated and has no header row, so the column
# names and their dtypes are supplied explicitly.
headers = ['userId', 'movieId', 'rating', 'timestamp']
column_dtypes = {
    'userId': np.int32,
    'movieId': np.int32,
    'rating': np.float32,
    'timestamp': np.int32,
}
ratings_df = pd.read_csv(
    '../datasets/u.data',
    sep="\t",
    header=None,
    names=headers,
    dtype=column_dtypes,
)

In [149]:
print("\nData Preprocessing....\n")

# Raw ids exactly as they appear in the ratings file, one entry per rating.
np_users = ratings_df.userId.values
np_items = ratings_df.movieId.values

# np.unique returns the distinct ids in ascending order.
unique_users = np.unique(np_users)
unique_items = np.unique(np_items)

n_users = unique_users.shape[0]
n_items = unique_items.shape[0]

print(n_users, n_items)

# Because the unique arrays are sorted, their last element is the largest id.
# Each lookup table below must be sized from its OWN id space: sizing the
# user table from the item ids only works while max item id >= max user id.
max_item = unique_items[-1]
max_user = unique_users[-1]

# Reconstruct the ratings set's user/movie indices

# Mapping unique users down to an array 0..n_users-1
z = np.zeros(max_user+1, dtype=int)
z[unique_users] = np.arange(n_users)
users_map = z[np_users]

# Mapping unique items down to an array 0..n_items-1
z = np.zeros(max_item+1, dtype=int)
z[unique_items] = np.arange(n_items)
movies_map = z[np_items]

# Every remapped index must land inside the dense 0..n-1 range.
assert users_map.min() >= 0 and users_map.max() == n_users - 1
assert movies_map.min() >= 0 and movies_map.max() == n_items - 1

# One (user index, movie index, rating) triple per row. dtype=object lets the
# integer indices and the float ratings share a single array.
np_ratings = ratings_df.rating.values
ratings = np.zeros((np_ratings.shape[0], 3), dtype=object)
ratings[:, 0] = users_map
print(users_map)
ratings[:, 1] = movies_map
ratings[:, 2] = np_ratings


Data Preprocessing....

943 1682
[195 185  21 ... 275  12  11]


In [150]:
# Fixed seed so the 90/10 split, and every metric computed from it, is the
# same on each Restart & Run All.
SPLIT_SEED = 42
X_train, X_test = train_test_split(ratings, train_size=0.9, random_state=SPLIT_SEED)

# Each row of `ratings` is a (user index, movie index, rating) triple; slice
# the columns directly instead of unpacking the array row by row.
user_train = X_train[:, 0].astype(np.int64)
movie_train = X_train[:, 1].astype(np.int64)
rating_train = X_train[:, 2].astype(np.float64)
train_sparse = coo_matrix((rating_train, (user_train, movie_train)), shape=(n_users, n_items))

user_test = X_test[:, 0].astype(np.int64)
movie_test = X_test[:, 1].astype(np.int64)
rating_test = X_test[:, 2].astype(np.float64)
test_sparse = coo_matrix((rating_test, (user_test, movie_test)), shape=(n_users, n_items))

# Both matrices share the full (n_users, n_items) shape and together hold
# every rating exactly once.
assert train_sparse.shape == test_sparse.shape == (n_users, n_items)
assert len(rating_train) + len(rating_test) == ratings.shape[0]


In [151]:
# Default hyperparameters
DEFAULT_PARAMS = {
    'weights': True,
    'latent_factors': 5,
    'num_iters': 20,
    'regularization': 0.07,
    'unobs_weight': 0.01,
    'wt_type': 0,
    'feature_wt_factor': 130.0,
    'feature_wt_exp': 0.08,
    'delimiter': '\t'
}

# Parameters optimized with hypertuning for the MovieLens data set
OPTIMIZED_PARAMS = {
    'latent_factors': 34,
    'regularization': 9.83,
    'unobs_weight': 0.001,
    'feature_wt_factor': 189.8,
}

LOG_RATINGS = 0
LINEAR_RATINGS = 1
LINEAR_OBS_W = 100.0

# Start from the defaults and override them with the tuned values, so keys
# that only exist in DEFAULT_PARAMS (e.g. 'num_iters') remain available.
params = dict(DEFAULT_PARAMS)
params.update(OPTIMIZED_PARAMS)

# Create WALS model
num_rows = train_sparse.shape[0]
num_cols = train_sparse.shape[1]

# Every row gets the same weight.
row_wts = np.ones(num_rows)

# Number of positive ratings stored in each column of the training matrix.
p = (train_sparse > 0.0).sum(0)
p = p.astype("float32")
# Columns without any rating get a tiny count so the reciprocal stays finite.
p[p==0] = 0.00001
frac = np.array(1.0/p)
# Defensive: zero out any non-finite reciprocal. Zero counts were replaced
# above, so this is expected to leave `frac` unchanged.
frac[np.ma.masked_invalid(frac).mask] = 0.0

# Column weight = feature_wt_factor / (number of ratings in that column).
col_wts = np.array(params['feature_wt_factor'] * frac).flatten()
assert np.isfinite(col_wts).sum() == col_wts.shape[0]

# Describe the training matrix as (indices, values, dense_shape). The index
# array is built in one vectorized call, with one (row, col) pair per stored
# entry, and is cast to int64 explicitly.
indices = np.column_stack((train_sparse.row, train_sparse.col)).astype(np.int64)
values = np.asarray(train_sparse.data, dtype=np.float32)
shape = np.asarray([train_sparse.shape[0], train_sparse.shape[1]], dtype=np.int64)

input_tensor = tf.SparseTensor(indices=indices, values=values, dense_shape=shape)

model = factorization_ops.WALSModel(num_rows, num_cols, 
                                    n_components=params['latent_factors'], 
                                    unobserved_weight=params['unobs_weight'],
                                    regularization=params['regularization'],
                                    row_weights=row_wts, 
                                    col_weights=col_wts)

# Retrieve the row and column factors (first element of each factor list)
row_factor = model.row_factors[0]
col_factor = model.col_factors[0]

In [152]:
print("\nTraining....\n")

# Run everything in the graph that owns the input tensor.
sess = tf.Session(graph=input_tensor.graph)

# Element [1] of each returned tuple is used as the factor-update op.
row_update_op = model.update_row_factors(sp_input=input_tensor)[1]
col_update_op = model.update_col_factors(sp_input=input_tensor)[1]

# One-time initialization before the first sweep.
for startup_op in (model.initialize_op, model.worker_init):
    sess.run(startup_op)

row_update_prep_gramian_op = model.row_update_prep_gramian_op
col_update_prep_gramian_op = model.col_update_prep_gramian_op

init_row_update_op = model.initialize_row_update_op
init_col_update_op = model.initialize_col_update_op

num_iterations = 20

# Ops of a single sweep, in execution order: row side first, then column side.
row_sweep = (row_update_prep_gramian_op, init_row_update_op, row_update_op)
col_sweep = (col_update_prep_gramian_op, init_col_update_op, col_update_op)

for i in range(num_iterations):
    for sweep_op in row_sweep + col_sweep:
        sess.run(sweep_op)

    # Fetch the current factors and report train/test error for this sweep.
    print("\nEvaluating..: ", i, "/", num_iterations)
    output_row = sess.run(row_factor)
    output_col = sess.run(col_factor)
    evaluate_model(sess, train_sparse, test_sparse, output_row, output_col)



Training....


Evaluating..:  0 / 20
INFO:tensorflow:train RMSE = 1.933830
INFO:tensorflow:test RMSE = 2.694779
INFO:tensorflow:train MAE = 1.500535
INFO:tensorflow:test MAE = 2.171231

Evaluating..:  1 / 20
INFO:tensorflow:train RMSE = 0.747385
INFO:tensorflow:test RMSE = 1.135554
INFO:tensorflow:train MAE = 0.583691
INFO:tensorflow:test MAE = 0.890394

Evaluating..:  2 / 20
INFO:tensorflow:train RMSE = 0.661742
INFO:tensorflow:test RMSE = 1.074285
INFO:tensorflow:train MAE = 0.509932
INFO:tensorflow:test MAE = 0.841299

Evaluating..:  3 / 20
INFO:tensorflow:train RMSE = 0.629542
INFO:tensorflow:test RMSE = 1.059356
INFO:tensorflow:train MAE = 0.481121
INFO:tensorflow:test MAE = 0.829985

Evaluating..:  4 / 20
INFO:tensorflow:train RMSE = 0.611104
INFO:tensorflow:test RMSE = 1.051104
INFO:tensorflow:train MAE = 0.464414
INFO:tensorflow:test MAE = 0.823129

Evaluating..:  5 / 20
INFO:tensorflow:train RMSE = 0.598774
INFO:tensorflow:test RMSE = 1.045066
INFO:tensorflow:train MAE = 0.45

In [None]:
# Close the TensorFlow session created for training. NOTE(review): `sess`
# should not be used to run or evaluate tensors after this call.
sess.close()

### Save Model

In [153]:
print("Saving Model....\n")

# `output_row` / `output_col` already hold the factor matrices fetched after
# the final training iteration, and no update op runs after that point. Reuse
# them instead of evaluating the tensors again: `sess` may already be closed
# by the time this cell runs, and evaluating on a closed session fails.

model_dir = os.path.join("Saved-Models", 'WALS')
# exist_ok=True keeps this cell re-runnable once the directory exists.
os.makedirs(model_dir, exist_ok=True)

# NOTE(review): `users_map` / `movies_map` are the per-rating remapped index
# arrays, not id -> index lookup tables; confirm this is what the consumer of
# these files expects.
np.save(os.path.join(model_dir, 'user'), users_map)
np.save(os.path.join(model_dir, 'movie'), movies_map)
np.save(os.path.join(model_dir, 'row'), output_row)
np.save(os.path.join(model_dir, 'col'), output_col)

Saving Model....

