In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import dask.dataframe as dd
import dask.bag as db
import tqdm 

from distributed import Client
from dask_jobqueue import SLURMCluster
from IPython.display import display

import os
from glob import glob

In [2]:
# Set LOCAL to True for single-machine execution while developing
# Set LOCAL to False for cluster execution
LOCAL = True

if LOCAL:
    # This line creates a single-machine dask client
    client = Client()
else:    
    # This line creates a SLURM cluster dask and dask client
    # Logging outputs will be stored in /scratch/{your-netid}
    
    cluster = SLURMCluster(memory='256GB', cores=8, python='/scratch/work/public/dask/bin/python', 
                               local_directory='/tmp/{}/'.format(os.environ['SLURM_JOB_USER']),
                               job_extra=['--output=/scratch/{}/slurm-%j.out'.format(os.environ['SLURM_JOB_USER'])])

    cluster.submit_command = 'slurm'
    cluster.scale(100)

    display(cluster)
    client = Client(cluster)

display(client)

0,1
Client  Scheduler: tcp://127.0.0.1:45423  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 2  Cores: 2  Memory: 4.29 GB


In [22]:
os.chdir('/scratch/work/courses/DSGA1004-2021/movielens/ml-latest')

In [23]:
# Import the ratings dataset
ratings_df = pd.read_csv('ratings.csv').sort_values(by="timestamp").reset_index(drop=True)
print("This dataset contains {} rows and {} columns.".format(len(ratings_df), ratings_df.shape[1]))
ratings_df.head()

This dataset contains 100836 rows and 4 columns.


Unnamed: 0,userId,movieId,rating,timestamp
0,429,595,5.0,828124615
1,429,588,5.0,828124615
2,429,590,5.0,828124615
3,429,592,5.0,828124615
4,429,432,3.0,828124615


Split each user's data into training-validation-test set. For each user, use 60% of their ratings for training, 20% for validation, and 20% for testing.

In [24]:
# For each user, compute the number of ratings they submitted
num_ratings_per_user = ratings_df.groupby('userId').count()['rating'] # only do the computation on the first 100 users for testing the code

In [25]:
# TO-DO: Will need to optimize this algorithm - it's quite slow
# Create training, validation, and test sets for each user
training_df = pd.DataFrame()
val_df = pd.DataFrame()
test_df = pd.DataFrame()
for userId, num_ratings in tqdm.notebook.tqdm(list(zip(num_ratings_per_user.index, num_ratings_per_user))):
    # Get all the ratings for this user
    user_ratings = ratings_df[ratings_df['userId'] == userId].reset_index(drop=True)
    
    # Make the first 60% of this user's ratings the training set
    index_train = int(0.6*num_ratings)
    user_train = user_ratings.loc[:index_train-1, :]
    
    # Make the next 20% of this user's ratings the validation set
    index_val = index_train + int(0.2*num_ratings)
    user_val = user_ratings.loc[index_train:index_val-1, :]
    
    # Make the last 20% of this user's ratings the testing set
    user_test = user_ratings.loc[index_val:, :]
    
    # Add this user's individual training, validation, and testing sets to the
    # unified training, validation, and testing sets, respectively
    training_df = pd.concat([training_df, user_train], axis=0)
    val_df = pd.concat([val_df, user_val], axis=0)
    test_df = pd.concat([test_df, user_test], axis=0)

  0%|          | 0/610 [00:00<?, ?it/s]

In [19]:
training_df["userId"]

0        1
1        1
2        1
3        1
4        1
      ... 
301    100
302    100
303    100
304    100
305    100
Name: userId, Length: 6255, dtype: int64

In [None]:
# Sanity check (these two should match)
print(num_ratings_per_user.sum()) # total number of ratings
print(sum([len(training_df), len(val_df), len(test_df)]))

# Baseline Model

Compute the mean rating of each movie by grouping by movieId, and aggregating by mean. Note that we don't want to explicitly compute the utility matrix, because doing so will take a very long time, and the resulting matrix will be very large and take up a lot of memory.

We compute the 100 highest mean-rated movies from the training set. We will recommend these 100 movies to every single user in the validation set.

In [26]:
# Count the number of ratings for each movie
num_ratings_per_movie = ratings_df.groupby("movieId").count()["rating"]

# Remove any movies with less than 20 ratings
ratings_df = ratings_df.join(num_ratings_per_movie, on = "movieId", rsuffix="_count")
ratings_df = ratings_df[ratings_df["rating_count"] >= 20]

In [27]:
mean_ratings = ratings_df[["movieId", "rating"]].groupby("movieId").mean()["rating"]
mean_ratings = mean_ratings.sort_values(ascending=False)
R_i = mean_ratings.head(100).index
print(R_i)

Int64Index([  1104,    318,    922,    898,    475,   1204,    246,    858,
              1235, 168252,   2959,   1276,    750,    904,   1221,  48516,
              1213,    930,   1267,    912,  58559,     50,   1197,    260,
              1212,    926,   1245,    527,   3275,   1208,    933,   2329,
              1196,   1233,   1252,   1198,   1193,   1089,    296,   2571,
              2019,   1228,   1945,   1225,    908,   4973,   1199,   2160,
              1242,    913,    356,   1172,   1136,    593,   7361,  57669,
              4011,   5618,   3681,   1203,   3147,    741,   2324,   6016,
              2028,   1201,   3037,  56782,   2067,   1210,  68157,   1262,
              4226,   1250,   1272,   1207,   7153,  44555,    608,   1266,
             78499,   5995,  92259,   4993,   1244,    111,    541,   1086,
              1222,    720,   2502,  27773,   1223,   1258,   1704,   1673,
             31658,    899,  38061,   1249],
           dtype='int64', name='movieId')


In [30]:
rel = np.empty(shape=(len(val_df['userId'].unique()), 100))
rel[:] = np.nan
for user in tqdm.notebook.tqdm(list(val_df["userId"].unique())):
    # If the user rated less than 100 movies we dont need to sort, we can just take all of them
    D_i = val_df[val_df['userId'] == user].sort_values(by='rating', ascending=False).head(100)['movieId']
    for idx in range(len(R_i)):
        movie = R_i[idx]      
        if movie in D_i.values: 
            rel[int(user)-1, idx] = 1
        else:
            rel[int(user)-1, idx] = 0

  0%|          | 0/610 [00:00<?, ?it/s]