# Import the packages and check connection to bucket

In [4]:
from google.cloud import storage
import pandas as pd ## for dataset and eda
import numpy as np ## for eda
from datetime import datetime

In [5]:
bucket_name = "firstprojectdl"

storage_client = storage.Client()
bucket = storage_client.get_bucket(bucket_name)

print(bucket)
print('Great, we now have access to our first bucket on google cloud storage where we put our data')

<Bucket: firstprojectdl>
Great, we now have access to our first bucket on google cloud storage where we put our data


# Read the datasets from google cloud storage

In [6]:
from google.cloud import storage
import pandas as pd

bucket_name = "firstprojectdl"

storage_client = storage.Client()
bucket = storage_client.get_bucket(bucket_name)

# When you have your files in a subfolder of the bucket.
my_prefix = "data/movieLens/movieLens100k/" # the name of the subfolder
blobs = bucket.list_blobs(prefix = my_prefix, delimiter = '/')

dfDict = {}
dateparse = lambda x: datetime.utcfromtimestamp(int(x)).strftime('%Y-%m-%d %H:%M:%S')

for blob in blobs:
    if(blob.name != my_prefix): # ignoring the subfolder itself 
        file_name = blob.name.replace(my_prefix, "")
        blob.download_to_filename(file_name) # download the file to the machine
        print(file_name)
        if file_name =='u.data':
            df = pd.read_csv(file_name, sep='\t', 
                            names=['user_id', 'movie_id', 'rating', 'timestamp'], 
                            parse_dates=['timestamp'], 
                            date_parser=dateparse) # load the rating data
        elif file_name == 'u.item':
            df = pd.read_csv('u.item', sep='|', encoding='latin-1',
                    names=['movie_id', 'movie_title', 'release_date', 'video_release_date', 'imdb_url', 'unknown', 'action', 
                           'adventure', 'animation', 'childrens', 'comedy', 'crime', 'documentary', 'drama', 'fantasy', 
                           'film_noir', 'horror', 'musical', 'mystery', 'romance', 'sci_fi', 'thriller', 'war', 'western']) # load the movie data
        elif file_name == 'u.user':
            df = pd.read_csv('u.user', sep='|', encoding='latin-1',
                     names=['user_id', 'age', 'gender', 'occupation', 'zip_code']) # user_df
        dfDict[file_name] = df
        



dfDict['rating'] = dfDict['u.data'] 
del dfDict['u.data'] 
        
dfDict['movie'] = dfDict['u.item'] 
del dfDict['u.item'] 

dfDict['user'] = dfDict['u.user'] 
del dfDict['u.user'] 

## This is the list of datasets we have for the movieLens 100k!!
print(list(dfDict))

u.data
u.item
u.user
['rating', 'movie', 'user']


In [11]:
df = dfDict['rating'] ## our rating dataset
df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,1997-12-04 15:55:49
1,186,302,3,1998-04-04 19:22:22
2,22,377,1,1997-11-07 07:18:36
3,244,51,2,1997-11-27 05:02:03
4,166,346,1,1998-02-02 05:33:16


In [15]:
n_users = df.user_id.unique().shape[0]
print(f'Number of users: {n_users}')
n_items = df.movie_id.unique().shape[0]
print(f'Number of items/movies rated: {n_items}')


Number of users: 943
Number of  items/movies rated: 1682
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [33]:
## then create the rating matrix with dimension (number of users, number of items)

ratings = np.zeros((n_users, n_items))
print(ratings)
print(f'Matrix shape is {ratings.shape}')
maxRatingsPossible = n_users * n_items
print(f'Maximum number of ratings is {maxRatingsPossible}. With this number of ratings, the matrix would be dense')

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Matrix shape is (943, 1682)
Maximum number of ratings is 1586126. With this number of ratings, the matrix would be dense


Fill the matrix with the ratings submitted by each of the users!!

In [44]:
## what we do here here is as follows:
# 1. we iterate over the dataframe rows
# 2. we then go to the right user id row
# and rating column and then assign it the right rating
for row in df.itertuples():
    ratings[row[1]-1, row[2]-1] = row[3] 
ratings

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

Calculate the matrix sparsity

In [52]:
numerator = float(len(ratings.nonzero()[0])) # of course, we have 100,000 ratings
denominator =(ratings.shape[0] * ratings.shape[1]) # then 100000/1586126 

sparsity = numerator/denominator

sparsity *= 100 # get the percentage

print(f'Matrix Sparsity : {round(sparsity, 3)} %')

Matrix Sparsity : 6.305 %


# Split the datasets in train, validation, test

In [145]:
def train_val_test_split(ratings):
    """
    The purpose of this method is split our datasets betwen:
    1. Train
    2. Validation (we extracted 7 ratings for a user from train and put it in there)
    3. Test (we extracted 7 ratings for a user from train and put it in there)
    We go through each of the user and randonly select ratings from train set that will go
    from the initial matrix, take 7 to put in the validation matrix 
    and take 7 to put in the test matrix. Finally these ratings that have been put in 
    the test and val sets are set to 0 in the main/train matrix
    """
    test = np.zeros(ratings.shape)
    validation = np.zeros(ratings.shape)
    train = ratings.copy()
    for user in range(ratings.shape[0]):
        test_ratings = np.random.choice(ratings[user, :].nonzero()[0], 
                                       size=7, 
                                        replace=False) # for user, get the test rating array
        val_ratings = np.random.choice(ratings[user, :].nonzero()[0], 
                                       size=7, 
                                        replace=False) # for user, get the validation array
        train[user, test_ratings] = 0. # set to 0 the train matrix/copy of the original matrix the ratings taken for test
        train[user, val_ratings] = 0.# set to 0 the train matrix/copy of the original matrix the ratings taken for validation
        test[user, test_ratings] = ratings[user, test_ratings] # assign to the test matrix/matrix of 0 the test ratings
        validation[user, val_ratings] = ratings[user, val_ratings] # assign to the val matrix/matrix of 0 the val ratings
        
    assert(np.all((train * test * validation) == 0))

    return train, validation, test

In [154]:
train, validation, test = train_val_test_split(ratings)

print("This is the train rating matrix")
print(train) 
print(f"The shape is {train.shape}")
print("This is the validation rating matrix")
print(validation)
print(f"The shape is {validation.shape}")
print("This is the test rating matrix")
print(test)
print(f"The shape is {test.shape}")

This is the train rating matrix
[[5. 3. 4. ... 0. 0. 0.]
 [4. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 5. 0. ... 0. 0. 0.]]
The shape is (943, 1682)
This is the validation rating matrix
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
The shape is (943, 1682)
This is the test rating matrix
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [5. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
The shape is (943, 1682)
