# Import the packages and check connection to bucket

In [23]:
from google.cloud import storage
import pandas as pd ## for dataset and eda
import numpy as np ## for eda
from datetime import datetime

In [24]:
bucket_name = "firstprojectdl"

storage_client = storage.Client()
bucket = storage_client.get_bucket(bucket_name)

print(bucket)
print('Great, we now have access to our first bucket on google cloud storage where we put our data')

<Bucket: firstprojectdl>
Great, we now have access to our first bucket on google cloud storage where we put our data


RATINGS FILE DESCRIPTION
================================================================================

All ratings are contained in the file "ratings.dat" and are in the
following format:

UserID::MovieID::Rating::Timestamp

- UserIDs range between 1 and 6040 
- MovieIDs range between 1 and 3952
- Ratings are made on a 5-star scale (whole-star ratings only)
- Timestamp is represented in seconds since the epoch as returned by time(2)
- Each user has at least 20 ratings

MOVIES FILE DESCRIPTION
================================================================================

Movie information is in the file "movies.dat" and is in the following
format:

MovieID::Title::Genres

- Titles are identical to titles provided by the IMDB (including
year of release)
- Genres are pipe-separated and are selected from the following genres:

	* Action
	* Adventure
	* Animation
	* Children's
	* Comedy
	* Crime
	* Documentary
	* Drama
	* Fantasy
	* Film-Noir
	* Horror
	* Musical
	* Mystery
	* Romance
	* Sci-Fi
	* Thriller
	* War
	* Western


# Read the datasets from google cloud storage

In [50]:
from google.cloud import storage
import pandas as pd

bucket_name = "firstprojectdl"

storage_client = storage.Client()
bucket = storage_client.get_bucket(bucket_name)

# When you have your files in a subfolder of the bucket.
my_prefix = "data/movieLens/movieLens1M/" # the name of the subfolder
blobs = bucket.list_blobs(prefix = my_prefix, delimiter = '/')

dfDict = {}
dateparse = lambda x: datetime.utcfromtimestamp(int(x)).strftime('%Y-%m-%d %H:%M:%S')

for blob in blobs:
    if(blob.name != my_prefix): # ignoring the subfolder itself 
        file_name = blob.name.replace(my_prefix, "")
        blob.download_to_filename(file_name) # download the file to the machine
        if file_name =='ratings.dat':
            df = pd.read_csv('ratings.dat', sep = '::', header=None)
            df.columns = ['user_id', 'movie_id', 'rating', 'timestamp']
            df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
            dfDict[file_name] = df
        



dfDict['rating'] = dfDict['ratings.dat'] 
del dfDict['ratings.dat'] 
        



In [53]:
df = dfDict['rating']
df.shape

(1000209, 4)

In [54]:
df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,2000-12-31 22:12:40
1,1,661,3,2000-12-31 22:35:09
2,1,914,3,2000-12-31 22:32:48
3,1,3408,4,2000-12-31 22:04:35
4,1,2355,5,2001-01-06 23:38:11


In [55]:
n_users = df.user_id.unique().shape[0]
print(f'Number of users: {n_users}')
n_items = df.movie_id.unique().shape[0]
print(f'Number of items/movies rated: {n_items}')

Number of users: 6040
Number of items/movies rated: 3706


In [61]:
## then create the rating matrix with dimension (number of users, number of items)

ratings = np.zeros((n_users, n_items))
print(ratings)
print(f'Matrix shape is {ratings.shape}')
maxRatingsPossible = n_users * n_items
print(f'Maximum number of ratings is {maxRatingsPossible}. With this number of ratings, the matrix would be dense')

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Matrix shape is (6040, 3706)
Maximum number of ratings is 22384240. With this number of ratings, the matrix would be dense


In [70]:
listMovies = df.movie_id.unique()
for newId, movies in enumerate(listMovies):
    df.loc[df['movie_id'] == movies, 'movie_id'] = newId

In [71]:
## what we do here here is as follows:
# 1. we iterate over the dataframe rows
# 2. we then go to the right user id row
# and rating column and then assign it the right rating
from tqdm import tqdm
for row in tqdm(df.itertuples()):
    ratings[row[1]-1, row[2]-1] = row[3] 
ratings

1000209it [00:02, 490656.48it/s]


array([[5., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 4.]])

In [72]:
ratings.shape

(6040, 3706)

In [73]:
numerator = float(len(ratings.nonzero()[0])) # of course, we have 100,000 ratings
denominator =(ratings.shape[0] * ratings.shape[1]) # then 100000/1586126 

sparsity = numerator/denominator

sparsity *= 100 # get the percentage

print(f'Matrix Sparsity : {round(sparsity, 3)} %')

Matrix Sparsity : 4.219 %


In [74]:
def train_val_test_split(ratings):
    """
    The purpose of this method is split our datasets betwen:
    1. Train
    2. Validation (we extracted 7 ratings for a user from train and put it in there)
    3. Test (we extracted 7 ratings for a user from train and put it in there)
    We go through each of the user and randonly select ratings from train set that will go
    from the initial matrix, take 7 to put in the validation matrix 
    and take 7 to put in the test matrix. Finally these ratings that have been put in 
    the test and val sets are set to 0 in the main/train matrix
    """
    test = np.zeros(ratings.shape)
    validation = np.zeros(ratings.shape)
    train = ratings.copy()
    for user in range(ratings.shape[0]):
        test_ratings = np.random.choice(ratings[user, :].nonzero()[0], 
                                       size=7, 
                                        replace=False) # for user, get the test rating array
        val_ratings = np.random.choice(ratings[user, :].nonzero()[0], 
                                       size=7, 
                                        replace=False) # for user, get the validation array
        train[user, test_ratings] = 0. # set to 0 the train matrix/copy of the original matrix the ratings taken for test
        train[user, val_ratings] = 0.# set to 0 the train matrix/copy of the original matrix the ratings taken for validation
        test[user, test_ratings] = ratings[user, test_ratings] # assign to the test matrix/matrix of 0 the test ratings
        validation[user, val_ratings] = ratings[user, val_ratings] # assign to the val matrix/matrix of 0 the val ratings
        
    assert(np.all((train * test * validation) == 0))

    return train, validation, test

In [75]:
train, validation, test = train_val_test_split(ratings)

print("This is the train rating matrix")
print(train) 
print(f"The shape is {train.shape}")
print("This is the validation rating matrix")
print(validation)
print(f"The shape is {validation.shape}")
print("This is the test rating matrix")
print(test)
print(f"The shape is {test.shape}")

This is the train rating matrix
[[5. 0. 0. ... 0. 0. 5.]
 [0. 0. 0. ... 0. 0. 5.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 4.]]
The shape is (6040, 3706)
This is the validation rating matrix
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
The shape is (6040, 3706)
This is the test rating matrix
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
The shape is (6040, 3706)


In [76]:
from numpy.linalg import solve

In [77]:
class MFals():
    def __init__(self, 
                 ratings, 
                 n_factors=40, 
                 item_reg=0.0, 
                 user_reg=0.0,
                 verbose=False
                ):
        """
        Train a matrix factorization model to predict empty 
        entries in a matrix. The terminology assumes a 
        ratings matrix which is ~ user x item
        
        Params
        ======
        ratings : (ndarray)
            User x Item matrix with corresponding ratings
        
        n_factors : (int)
            Number of latent factors to use in matrix 
            factorization model
        
        item_reg : (float)
            Regularization term for item latent factors
        
        user_reg : (float)
            Regularization term for user latent factors
        
        verbose : (bool)
            Whether or not to printout training progress
        """
        
        self.ratings = ratings
        self.n_users, self.n_items = ratings.shape
        self.n_factors = n_factors ## our k number of attributes!!
        self.item_reg = item_reg
        self.user_reg = user_reg
        self._v = verbose

    def als_step(self,
                 latent_vectors,
                 fixed_vecs,
                 ratings,
                 _lambda,
                 type='user'):
        """
        One of the two ALS steps. Solve for the latent vectors
        specified by type.
        """
        if type == 'user':
            # Precompute
            YTY = fixed_vecs.T.dot(fixed_vecs) # the Y times Y transpose part of the derived function equal to 0
            lambdaI = np.eye(YTY.shape[0]) * _lambda # the second argument to get the user latent vectors

            for u in range(latent_vectors.shape[0]):
                latent_vectors[u, :] = solve((YTY + lambdaI), 
                                             ratings[u, :].dot(fixed_vecs)) ## here what we do is search for the latent vectors for each of the user u
        elif type == 'item':
            # Precompute
            XTX = fixed_vecs.T.dot(fixed_vecs) # same calculation as above but with user vector
            lambdaI = np.eye(XTX.shape[0]) * _lambda
            
            for i in range(latent_vectors.shape[0]):
                latent_vectors[i, :] = solve((XTX + lambdaI), 
                                             ratings[:, i].T.dot(fixed_vecs)) ## here what we do is search for the latent vectors for each of the movie i in the column!
        return latent_vectors

    def train(self, n_iter=10):
        """ Train model for n_iter iterations from scratch."""
        # initialize latent vectors
        self.user_vecs = np.random.random((self.n_users, self.n_factors)) # random start for our user vecs and item vecs before we improve them with the ALS
        self.item_vecs = np.random.random((self.n_items, self.n_factors))
        
        self.partial_train(n_iter)
    
    def partial_train(self, n_iter):
        """ 
        Train model for n_iter iterations. Can be 
        called multiple times for further training.
        Get the user_vecs and item_vecs latent vectors
        """
        ctr = 1
        while ctr <= n_iter:
            if ctr % 10 == 0 and self._v:
                print(f'current iteration: {ctr}')
            self.user_vecs = self.als_step(self.user_vecs, 
                                           self.item_vecs, 
                                           self.ratings, 
                                           self.user_reg, 
                                           type='user') # our user latent vector
            self.item_vecs = self.als_step(self.item_vecs, 
                                           self.user_vecs, 
                                           self.ratings, 
                                           self.item_reg, 
                                           type='item') # our item latent vactor
            ctr += 1
    
    def predict_all(self):
        """ Predict ratings for every user and item. """
        predictions = np.zeros((self.user_vecs.shape[0], 
                                self.item_vecs.shape[0]))
        for u in range(self.user_vecs.shape[0]):
            for i in range(self.item_vecs.shape[0]):
                predictions[u, i] = self.predict(u, i)
                
        return predictions
    def predict(self, u, i):
        """ Single user and item prediction. """
        return self.user_vecs[u, :].dot(self.item_vecs[i, :].T)
    
    def calculate_learning_curve(self, iter_array, val, training=True):
        """
        Keep track of RMSE as a function of training iterations.
        
        Params
        ======
        iter_array : (list)
            List of numbers of iterations to train for each step of 
            the learning curve. e.g. [1, 5, 10, 20]
        val : (2D ndarray)
            Validation dataset (assumed to be user x item).
        
        The function creates two new class attributes:
        
        train_rmse : (list)
            Training data RMSE values for each value of iter_array
        val_rmse : (list)
            val data RMSE values for each value of iter_array
        """
        
        if training:
            iter_array.sort()
            self.train_rmse =[]
            self.val_rmse = []
            iter_diff = 0
            for (i, n_iter) in enumerate(iter_array):
                if self._v:
                    print('Iteration: {n_iter}')
                if i == 0:
                    self.train(n_iter - iter_diff)
                else:
                    self.partial_train(n_iter - iter_diff)

                predictions = self.predict_all()

                self.train_rmse += [get_rmse(predictions, self.ratings)]
                self.val_rmse += [get_rmse(predictions, val)]
                if self._v:
                    print('Train rmse: ' + str(self.train_rmse[-1]))
                    print('Validation rmse: ' + str(self.val_rmse[-1]))
                iter_diff = n_iter
                
        if training == False:
            iter_array.sort()
            self.test_rmse = []
            iter_diff = 0
            for (i, n_iter) in enumerate(iter_array):
            
                predictions = self.predict_all()

                self.test_rmse += [get_rmse(predictions, val)]

                iter_diff = n_iter
            

In [78]:
from sklearn.metrics import mean_squared_error

def get_rmse(pred, actual):
    """ The goal of this function is to get the RMSE 
    to assess the performance of the learner
    """
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return mean_squared_error(pred, actual, squared=False)

In [79]:
%%time
MF_als = MFals(train, n_factors=40, user_reg=0.0, item_reg=0.0)
iter_array = [1, 2, 5, 10, 25, 50, 100]
MF_als.calculate_learning_curve(iter_array, validation)

CPU times: user 6min 1s, sys: 548 ms, total: 6min 1s
Wall time: 4min 55s


In [80]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

def plot_learning_curve(iter_array, model):
    plt.plot(iter_array, model.train_rmse, \
             label='Training', linewidth=5)
    plt.plot(iter_array, model.val_rmse, \
             label='Validation', linewidth=5)


    plt.xticks(fontsize=16);
    plt.yticks(fontsize=16);
    plt.xlabel('iterations', fontsize=30);
    plt.ylabel('RMSE', fontsize=30);
    plt.legend(loc='best', fontsize=20);

In [None]:
plot_learning_curve(iter_array, MF_als)

In [None]:
%%time
MF_als.calculate_learning_curve(iter_array, test, training=False)

In [None]:
def plot_learning_curveTest(iter_array, model):
    plt.plot(iter_array, model.val_rmse, \
             label='Validation', linewidth=5)
    plt.plot(iter_array, model.test_rmse, \
             label='Test', linewidth=5)


    plt.xticks(fontsize=16);
    plt.yticks(fontsize=16);
    plt.xlabel('iterations', fontsize=30);
    plt.ylabel('RMSE', fontsize=30);
    plt.legend(loc='best', fontsize=20);

In [None]:
plot_learning_curveTest(iter_array, MF_als)

In [None]:
%%time
MF_alsOpt = MFals(train, n_factors=40, \
                    user_reg=30., item_reg=30.)

iter_array = [1, 2, 5, 10, 25, 50, 100]
MF_alsOpt.calculate_learning_curve(iter_array, validation)

In [None]:
plot_learning_curve(iter_array, MF_alsOpt)