# Import the packages and check connection to bucket

In [1]:
from google.cloud import storage
import pandas as pd ## for dataset and eda
import numpy as np ## for eda
from datetime import datetime

In [2]:
bucket_name = "firstprojectdl"

storage_client = storage.Client()
bucket = storage_client.get_bucket(bucket_name)

print(bucket)
print('Great, we now have access to our first bucket on google cloud storage where we put our data')

<Bucket: firstprojectdl>
Great, we now have access to our first bucket on google cloud storage where we put our data


In [3]:
from google.cloud import storage
import pandas as pd

bucket_name = "firstprojectdl"

storage_client = storage.Client()
bucket = storage_client.get_bucket(bucket_name)

# When you have your files in a subfolder of the bucket.
my_prefix = "data/movieLens/movieLens100k/" # the name of the subfolder
blobs = bucket.list_blobs(prefix = my_prefix, delimiter = '/')

dfDict = {}
dateparse = lambda x: datetime.utcfromtimestamp(int(x)).strftime('%Y-%m-%d %H:%M:%S')

for blob in blobs:
    if(blob.name != my_prefix): # ignoring the subfolder itself 
        file_name = blob.name.replace(my_prefix, "")
        blob.download_to_filename(file_name) # download the file to the machine
        print(file_name)
        if file_name =='ratings100k.dat':
            df = pd.read_csv(file_name, sep='\t', 
                            names=['user_id', 'movie_id', 'rating', 'timestamp'], 
                            parse_dates=['timestamp'], 
                            date_parser=dateparse) # load the rating data
        
 

ratings100k.dat
u.data
u.item
u.user


In [4]:
df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,1997-12-04 15:55:49
1,186,302,3,1998-04-04 19:22:22
2,22,377,1,1997-11-07 07:18:36
3,244,51,2,1997-11-27 05:02:03
4,166,346,1,1998-02-02 05:33:16


In [5]:
with open('ratings100k.dat') as infile:
  with open('ratings100kNew.dat', 'w') as outfile:
    for line in infile:
      fields = line.split('\t')
      outfile.write('::'.join(fields))

In [6]:
import numpy as np

In [7]:
# Comet ML for experiment logging
from comet_ml import Experiment

In [8]:
# Import packages
import tensorflow as tf

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [9]:
# Set random seed
tf.random.set_random_seed(1994)
np.random.seed(1994)

In [10]:
def read_rating(num_users, num_items, num_total_ratings, a, b, train_ratio):
    """
    Function to read in the ratings data
    :param path: path to ratings data
    :param num_users: number of users
    :param num_items: number of items
    :param num_total_ratings: number of total ratings
    :param a: positive rating (1)
    :param b: negative rating (0)
    :param train_ratio: ratio that splits train and test sets
    """
    fp = open("ratings100kNew.dat")

    user_train_set = set()
    user_test_set = set()
    item_train_set = set()
    item_test_set = set()

    R = np.zeros((num_users, num_items))
    mask_R = np.zeros((num_users, num_items))
    C = np.ones((num_users, num_items)) * b

    train_R = np.zeros((num_users, num_items))
    test_R = np.zeros((num_users, num_items))

    train_mask_R = np.zeros((num_users, num_items))
    test_mask_R = np.zeros((num_users, num_items))

    random_perm_idx = np.random.permutation(num_total_ratings)
    train_idx = random_perm_idx[0:int(num_total_ratings * train_ratio)]
    test_idx = random_perm_idx[int(num_total_ratings * train_ratio):]

    num_train_ratings = len(train_idx)
    num_test_ratings = len(test_idx)

    lines = fp.readlines()
    for line in lines:
        user, item, rating, _ = line.split("::")
        user_idx = int(user) - 1
        item_idx = int(item) - 1
        R[user_idx, item_idx] = int(rating)
        mask_R[user_idx, item_idx] = 1
        C[user_idx, item_idx] = a

    # Training set
    for itr in train_idx:
        line = lines[itr]
        user, item, rating, _ = line.split("::")
        user_idx = int(user) - 1
        item_idx = int(item) - 1
        train_R[user_idx, item_idx] = int(rating)
        train_mask_R[user_idx, item_idx] = 1

        user_train_set.add(user_idx)
        item_train_set.add(item_idx)

    # Test set
    for itr in test_idx:
        line = lines[itr]
        user, item, rating, _ = line.split("::")
        user_idx = int(user) - 1
        item_idx = int(item) - 1
        test_R[user_idx, item_idx] = int(rating)
        test_mask_R[user_idx, item_idx] = 1

        user_test_set.add(user_idx)
        item_test_set.add(item_idx)

    return R, mask_R, C, train_R, train_mask_R, test_R, test_mask_R, num_train_ratings, num_test_ratings, \
           user_train_set, item_train_set, user_test_set, item_test_set

In [11]:
data_name = 'ml-1ook'
num_users = 943
num_items = 1683
num_total_ratings = 100000
# Data is split into random 75% - 25% train-test sets
train_ratio = 0.75

In [12]:
R, mask_R, C, train_R, train_mask_R, test_R, test_mask_R, num_train_ratings, num_test_ratings, \
user_train_set, item_train_set, user_test_set, item_test_set = read_rating(num_users, num_items, num_total_ratings, 1, 0, train_ratio)

In [13]:
# Get hyper-parameters
hyper_params = {
    "hidden_neuron": 500,
    "lambda_value": 1,
    "epochs": 500,
    "batch_size": 512,
    "optimizer": "Adam",
    "learning_rate": 0.001,
    "random_seed": 1994
}

In [14]:
# Import packages
import tensorflow as tf
import time
import numpy as np
import os
import math

In [15]:
class AutoRec:
    """
    Function to define the AutoRec model class
    """
    def __init__(self, sess, args,
                 num_users, num_items,
                 R, mask_R, C, train_R, train_mask_R, test_R, test_mask_R, num_train_ratings, num_test_ratings,
                 user_train_set, item_train_set, user_test_set, item_test_set,
                 result_path):

        self.sess = sess
        self.args = args

        self.num_users = num_users
        self.num_items = num_items

        self.R = R
        self.mask_R = mask_R
        self.C = C
        self.train_R = train_R
        self.train_mask_R = train_mask_R
        self.test_R = test_R
        self.test_mask_R = test_mask_R
        self.num_train_ratings = num_train_ratings
        self.num_test_ratings = num_test_ratings

        self.user_train_set = user_train_set
        self.item_train_set = item_train_set
        self.user_test_set = user_test_set
        self.item_test_set = item_test_set

        self.hidden_neuron = args['hidden_neuron']
        self.train_epoch = args['train_epoch']
        self.batch_size = args['batch_size']
        self.num_batch = int(math.ceil(self.num_users / float(self.batch_size)))

        self.base_lr = args['base_lr']
        self.optimizer_method = args['optimizer_method']
        self.display_step = args['display_step']
        self.random_seed = args['random_seed']

        self.global_step = tf.Variable(0, trainable=False)
        self.decay_epoch_step = args['decay_epoch_step']
        self.decay_step = self.decay_epoch_step * self.num_batch
        self.lr = tf.compat.v1.train.exponential_decay(self.base_lr, self.global_step,
                                                       self.decay_step, 0.96, staircase=True)
        self.lambda_value = args['lambda_value']

        self.train_cost_list = []
        self.test_cost_list = []
        self.test_rmse_list = []

        self.result_path = result_path
        self.grad_clip = args['grad_clip']

    def run(self, experiment):
        """
        Function to run AutoRec
        :param experiment: CometML Experiment function
        """
        # Build AutoRec
        self.prepare_model()
        init = tf.compat.v1.global_variables_initializer()
        self.sess.run(init)

        # Train and evaluate AutoRec for all epochs
        for epoch_itr in range(self.train_epoch):
            experiment.set_step(epoch_itr)
            self.train_model(epoch_itr,experiment)
            self.test_model(epoch_itr, experiment)

        # Log results
        self.make_records()

    def prepare_model(self):
        """
        Function to build AutoRec
        """
        self.input_R = tf.compat.v1.placeholder(dtype=tf.float32,
                                                shape=[None, self.num_items],
                                                name="input_R")
        self.input_mask_R = tf.compat.v1.placeholder(dtype=tf.float32,
                                                     shape=[None, self.num_items],
                                                     name="input_mask_R")

        V = tf.compat.v1.get_variable(name="V", initializer=tf.compat.v1.truncated_normal(
            shape=[self.num_items, self.hidden_neuron],
            mean=0, stddev=0.03), dtype=tf.float32)
        W = tf.compat.v1.get_variable(name="W", initializer=tf.compat.v1.truncated_normal(
            shape=[self.hidden_neuron, self.num_items],
            mean=0, stddev=0.03), dtype=tf.float32)
        mu = tf.compat.v1.get_variable(name="mu", initializer=tf.zeros(shape=self.hidden_neuron), dtype=tf.float32)
        b = tf.compat.v1.get_variable(name="b", initializer=tf.zeros(shape=self.num_items), dtype=tf.float32)

        pre_Encoder = tf.matmul(self.input_R, V) + mu
        self.Encoder = tf.nn.sigmoid(pre_Encoder)
        pre_Decoder = tf.matmul(self.Encoder, W) + b
        self.Decoder = tf.identity(pre_Decoder)

        pre_rec_cost = tf.multiply((self.input_R - self.Decoder), self.input_mask_R)
        rec_cost = tf.square(self.l2_norm(pre_rec_cost))
        pre_reg_cost = tf.square(self.l2_norm(W)) + tf.square(self.l2_norm(V))
        reg_cost = self.lambda_value * 0.5 * pre_reg_cost

        self.cost = rec_cost + reg_cost

        if self.optimizer_method == "Adam":
            optimizer = tf.compat.v1.train.AdamOptimizer(self.lr)
        elif self.optimizer_method == "RMSProp":
            optimizer = tf.compat.v1.train.RMSPropOptimizer(self.lr)
        else:
            raise ValueError("Optimizer Key ERROR")

        if self.grad_clip:
            gvs = optimizer.compute_gradients(self.cost)
            capped_gvs = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gvs]
            self.optimizer = optimizer.apply_gradients(capped_gvs, global_step=self.global_step)
        else:
            self.optimizer = optimizer.minimize(self.cost, global_step=self.global_step)

    def train_model(self, itr, experiment):
        """
        Function to train AutoRec
        :param itr: Current iteration
        :param experiment: CometML experiment
        """
        start_time = time.time()
        random_perm_doc_idx = np.random.permutation(self.num_users)

        batch_cost = 0
        for i in range(self.num_batch):
            if i == self.num_batch - 1:
                batch_set_idx = random_perm_doc_idx[i * self.batch_size:]
            elif i < self.num_batch - 1:
                batch_set_idx = random_perm_doc_idx[i * self.batch_size: (i + 1) * self.batch_size]

            _, Cost = self.sess.run(
                [self.optimizer, self.cost],
                feed_dict={self.input_R: self.train_R[batch_set_idx, :],
                           self.input_mask_R: self.train_mask_R[batch_set_idx, :]})

            batch_cost = batch_cost + Cost
        self.train_cost_list.append(batch_cost)

        if (itr + 1) % self.display_step == 0:

            print("Training //", "Epoch %d //" % (itr), " Total cost = {:.2f}".format(batch_cost),
                  "Elapsed time : %d sec" % (time.time() - start_time))

        experiment.log_metric("loss", batch_cost, step=itr)

    def test_model(self, itr, experiment):
        """
        Function to evaluate AutoRec
        :param itr: Current iteration
        :param experiment: CometML experiment
        """
        start_time = time.time()
        Cost, Decoder = self.sess.run(
            [self.cost, self.Decoder],
            feed_dict={self.input_R: self.test_R,
                       self.input_mask_R: self.test_mask_R})

        self.test_cost_list.append(Cost)

        if (itr + 1) % self.display_step == 0:
            Estimated_R = Decoder.clip(min=1, max=5)
            unseen_user_test_list = list(self.user_test_set - self.user_train_set)
            unseen_item_test_list = list(self.item_test_set - self.item_train_set)

            for user in unseen_user_test_list:
                for item in unseen_item_test_list:
                    if self.test_mask_R[user, item] == 1:  # exist in test set
                        Estimated_R[user, item] = 3

            pre_numerator = np.multiply((Estimated_R - self.test_R), self.test_mask_R)
            numerator = np.sum(np.square(pre_numerator))
            denominator = self.num_test_ratings
            RMSE = np.sqrt(numerator / float(denominator))

            self.test_rmse_list.append(RMSE)

            print("Testing //", "Epoch %d //" % (itr), " Total cost = {:.2f}".format(Cost),
                  " RMSE = {:.5f}".format(RMSE),
                  "Elapsed time : %d sec" % (time.time() - start_time))
            print("=" * 100)

        experiment.log_metric("RMSE", RMSE, step=itr)

    def make_records(self):
        """
        Function to log results
        """
        if not os.path.exists(self.result_path):
            os.makedirs(self.result_path)

        basic_info = self.result_path + "basic_info.txt"
        train_record = self.result_path + "train_record.txt"
        test_record = self.result_path + "test_record.txt"

        with open(train_record, 'w') as f:
            f.write(str("Cost:"))
            f.write('\t')
            for itr in range(len(self.train_cost_list)):
                f.write(str(self.train_cost_list[itr]))
                f.write('\t')
            f.write('\n')

        with open(test_record, 'w') as g:
            g.write(str("Cost:"))
            g.write('\t')
            for itr in range(len(self.test_cost_list)):
                g.write(str(self.test_cost_list[itr]))
                g.write('\t')
            g.write('\n')

            g.write(str("RMSE:"))
            for itr in range(len(self.test_rmse_list)):
                g.write(str(self.test_rmse_list[itr]))
                g.write('\t')
            g.write('\n')

        #with open(basic_info, 'w') as h:
        #    h.write(str(self.args))

    def l2_norm(self, tensor):
        """
        Function to apply L2 normalization
        :param tensor: TensorFlow tensor
        """
        return tf.sqrt(tf.reduce_sum(tf.square(tensor)))

In [16]:
# Initialize TensorFlow Config
config = tf.compat.v1.ConfigProto()

In [72]:
experiment = Experiment(api_key="vV3GGUE8tetqNJuD2kGxVi1M9", project_name="autoencoders-movielens100k")
experiment.log_parameters(hyper_params)

COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/benichou/autoencoders-movielens100k/5759b090846e4451a274ab7d75b30b6c
COMET INFO:   Parameters:
COMET INFO:     batch_size    : 512
COMET INFO:     epochs        : 500
COMET INFO:     hidden_neuron : 500
COMET INFO:     lambda_value  : 1
COMET INFO:     learning_rate : 0.001
COMET INFO:     optimizer     : Adam
COMET INFO:     random_seed   : 1994
COMET INFO:   Uploads:
COMET INFO:     environment details      : 1
COMET INFO:     filename                 : 1
COMET INFO:     git metadata             : 1
COMET INFO:     git-patch (uncompressed) : 1 (24 MB)
COMET INFO:     installed packages       : 1
COMET INFO:     os packages              : 1
COMET INFO: ---------------------------
COMET INFO: Experiment is live on comet.ml https://www.comet.ml/ben

In [73]:
args = {'hidden_neuron': 500, 'lambda_value':1, 'train_epoch': 500, 'batch_size': 512, 'optimizer_method': 'Adam', 'grad_clip': False, 'base_lr': 1e-3, 'decay_epoch_step': 50, 'random_seed':1994, 'display_step':1}

In [74]:
result_path = 'results/'

In [75]:
with tf.compat.v1.Session(config=config) as sess:
    # Define the AutoRec class from `AutoRec.py`
    AutoRec = AutoRec(sess, args,
                      num_users, num_items,
                      R, mask_R, C, train_R, train_mask_R, test_R, test_mask_R, num_train_ratings, num_test_ratings,
                      user_train_set, item_train_set, user_test_set, item_test_set,
                      result_path)
    # Run the AutoRec model
    AutoRec.run(experiment)

ValueError: Variable V already exists, disallowed. Did you mean to set reuse=True or reuse=tf.AUTO_REUSE in VarScope? Originally defined at:

  File "<ipython-input-60-3d6b2725f39f>", line 88, in prepare_model
    mean=0, stddev=0.03), dtype=tf.float32)
  File "<ipython-input-60-3d6b2725f39f>", line 62, in run
    self.prepare_model()
  File "<ipython-input-65-96b2d812faeb>", line 9, in <module>
    AutoRec.run(experiment)
  File "/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3437, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3357, in run_ast_nodes
    if (await self.run_code(code, result,  async_=asy)):
