In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE

input_file = 'ml-latest-small/ratings.csv'
headers = ['userId', 'movieId', 'rating', 'timestamp']
header_row = None
ratings_df = pd.read_csv(input_file, sep=",", names=headers, header=header_row, skiprows = 1,
                         dtype={'userId': np.int32, 'movieId': np.int32, 
                                'rating': np.float32,'timestamp': np.int32,
                         })

### Data Preprocessing

In [2]:
# Making MovieIds, UserIds zero-indexed

np_users = ratings_df.userId.values
np_items = ratings_df.movieId.values

unique_users = np.unique(np_users)
unique_items = np.unique(np_items)

n_users = unique_users.shape[0]
n_items = unique_items.shape[0]

print(n_users)
print(n_items)

max_item = unique_items[-1]

# Reconstruct the ratings set's user/movie indices

np_users = ratings_df.userId.values
np_users[:] -= 1 # Make users zero-indexed
# print(np_users)


# Mapping unique items down to an array 0..n_items-1
z = np.zeros(max_item+1, dtype=int)
z[unique_items] = np.arange(n_items)
movies_map = z[np_items]

np_ratings = ratings_df.rating.values
# print(np_ratings.shape[0])
ratings = np.zeros((np_ratings.shape[0], 3), dtype=object)
ratings[:, 0] = np_users
ratings[:, 1] = movies_map
ratings[:, 2] = np_ratings

# print(ratings)


610
9724


### Partitioning dataset to Training and Testing Sets



In [3]:
from scipy.sparse import coo_matrix

X_train, X_test = train_test_split(ratings, train_size=0.8)

# Ignoring timestamp
user_train, movie_train, rating_train = zip(*X_train)
train_sparse = coo_matrix((rating_train, (user_train, movie_train)), shape=(n_users, n_items))
# print(train_sparse)

user_test, movie_test, rating_test = zip(*X_test)
test_sparse = coo_matrix((rating_test, (user_test, movie_test)), shape=(n_users, n_items))
# print(test_sparse)


### Weighted alternating least squares (WALS) method 

Building the Graph - model


In [4]:
from tensorflow.contrib.factorization.python.ops import factorization_ops

# Default hyperparameters
DEFAULT_PARAMS = {
    'weights': True,
    'latent_factors': 5,
    'num_iters': 20,
    'regularization': 0.07,
    'unobs_weight': 0.01,
    'wt_type': 0,
    'feature_wt_factor': 130.0,
    'feature_wt_exp': 0.08,
    'delimiter': '\t'
}

# Parameters optimized with hypertuning for the MovieLens data set
OPTIMIZED_PARAMS = {
    'latent_factors': 34,
    'regularization': 9.83,
    'unobs_weight': 0.001,
    'feature_wt_factor': 189.8,
}

params = DEFAULT_PARAMS

# Create WALS model
row_wts = None
col_wts = None

num_rows = train_sparse.shape[0]
num_cols = train_sparse.shape[1]

# print(num_rows, num_cols)

row_factor = None
col_factor = None

indices = np.array([[i,j] for i,j in zip(train_sparse.row, train_sparse.col)])
values = np.asarray(train_sparse.data, dtype=np.float32)
shape = np.asarray([train_sparse.shape[0], train_sparse.shape[1]], dtype=np.int64)
# print(indices); print(values); print(shape)

input_tensor = tf.SparseTensor(indices=indices, values=values, dense_shape=shape)

model = factorization_ops.WALSModel(num_rows, num_cols, 
                                    n_components=params['latent_factors'], 
                                    unobserved_weight=params['unobs_weight'],
                                    regularization=params['regularization'],
                                    row_weights=row_wts, 
                                    col_weights=col_wts)

# Retrieve the row and column factors
row_factor = model.row_factors[0]
col_factor = model.col_factors[0]


Instructions for updating:
Colocations handled automatically by placer.


### Training Phase

In [10]:
sess = tf.Session(graph=input_tensor.graph)

row_update_op = model.update_row_factors(sp_input=input_tensor)[1]
col_update_op = model.update_col_factors(sp_input=input_tensor)[1]

sess.run(model.initialize_op)
sess.run(model.worker_init)

row_update_prep_gramian_op = model.row_update_prep_gramian_op
col_update_prep_gramian_op = model.col_update_prep_gramian_op

init_row_update_op = model.initialize_row_update_op
init_col_update_op = model.initialize_col_update_op

num_iterations = 1

for _ in range(num_iterations):    
#     sess.run(row_update_prep_gramian_op)
#     sess.run(init_row_update_op)
#     sess.run(row_update_op)
    row_update_prep_gramian_op.run(session=sess)
    init_row_update_op.run(session=sess)
    row_update_op.run(session=sess)
    
    col_update_prep_gramian_op.run(session=sess)
    init_col_update_op.run(session=sess)
    col_update_op.run(session=sess)


#     sess.run(col_update_prep_gramian_op)
#     sess.run(init_col_update_op)
#     sess.run(col_update_op)


### Evaluate 

In [11]:
# Evaluate output factor matrices
output_row = row_factor.eval(session=sess)
output_col = col_factor.eval(session=sess)

sess.close()

### Save Model

In [17]:
import os

model_dir = os.path.join("WALS", 'model')

os.makedirs(model_dir)
np.save(os.path.join(model_dir, 'user'), np_users)
np.save(os.path.join(model_dir, 'movie'), movies_map)
np.save(os.path.join(model_dir, 'row'), row_factor)
np.save(os.path.join(model_dir, 'col'), col_factor)

TypeError: can't pickle _thread.RLock objects

### Results
Compute RMSE and MAE between predicted and actual ratings.

In [16]:
import math

def get_rmse(output_row, output_col, actual):
    mse = 0
    for i in range(actual.data.shape[0]):
        row_pred = output_row[actual.row[i]]
        col_pred = output_col[actual.col[i]]
        err = actual.data[i] - np.dot(row_pred, col_pred)
        mse += err * err
    mse /= actual.data.shape[0]
    rmse = math.sqrt(mse)
    return rmse

train_rmse = get_rmse(output_row, output_col, train_sparse)
test_rmse = get_rmse(output_row, output_col, test_sparse)    
    
tf.logging.info('train RMSE = %.2f' % train_rmse)
tf.logging.info('test RMSE = %.2f' % test_rmse)

INFO:tensorflow:train RMSE = 3.65
INFO:tensorflow:test RMSE = 3.65


### Make Recommendations

In [None]:
int k = 5
