In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import dask.dataframe as dd
import dask.bag as db
import tqdm 

from distributed import Client
from dask_jobqueue import SLURMCluster
from IPython.display import display

import os
from glob import glob

In [2]:
# Execution mode switch:
#   LOCAL = True  -> single-machine dask client (use while developing)
#   LOCAL = False -> dask client backed by a SLURM cluster
LOCAL = True

if not LOCAL:
    # Cluster mode: build a SLURM-backed dask cluster, then attach a client to it.
    # Job log files are written under /scratch/{your-netid}.
    netid = os.environ['SLURM_JOB_USER']

    cluster = SLURMCluster(
        memory='256GB',
        cores=8,
        python='/scratch/work/public/dask/bin/python',
        local_directory='/tmp/{}/'.format(netid),
        job_extra=['--output=/scratch/{}/slurm-%j.out'.format(netid)],
    )

    cluster.submit_command = 'slurm'
    cluster.scale(100)

    display(cluster)
    client = Client(cluster)
else:
    # Local mode: a client with no arguments starts a single-machine setup
    client = Client()

display(client)

0,1
Client  Scheduler: tcp://127.0.0.1:35351  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 34.36 GB


In [3]:
# Switch the working directory to the MovieLens (ml-latest-small) folder so that
# later cells can refer to its files by bare file name.
DATA_DIR = '/scratch/work/courses/DSGA1004-2021/movielens/ml-latest-small'
os.chdir(DATA_DIR)

In [4]:
# Load the ratings table into memory, then order it chronologically
ratings_raw = dd.read_csv('ratings.csv').compute()
ratings_df = ratings_raw.sort_values(by="timestamp").reset_index(drop=True)

# Report the table dimensions and preview the first rows
n_rows, n_cols = ratings_df.shape
print("This dataset contains {} rows and {} columns.".format(n_rows, n_cols))
ratings_df.head()

This dataset contains 100836 rows and 4 columns.


Unnamed: 0,userId,movieId,rating,timestamp
0,429,595,5.0,828124615
1,429,588,5.0,828124615
2,429,590,5.0,828124615
3,429,592,5.0,828124615
4,429,432,3.0,828124615


Split each user's ratings into training, validation, and test sets. For each user, use the earliest 60% of their ratings (ordered by timestamp) for training, the next 20% for validation, and the remaining 20% for testing.

In [5]:
# Count the ratings submitted by each user.
# Only the first 100 users are kept for now, to keep test runs of the code fast.
ratings_count_by_user = ratings_df.groupby('userId')['rating'].count()
num_ratings_per_user = ratings_count_by_user.iloc[:100]

In [6]:
%%time
# TO-DO: Will need to optimize this algorithm - it's quite slow
# Create training, validation, and test sets for each user
training_df = dd.from_pandas(pd.DataFrame(), npartitions=3)
val_df = dd.from_pandas(pd.DataFrame(), npartitions=3)
test_df = dd.from_pandas(pd.DataFrame(), npartitions=3)
for userId, num_ratings in tqdm.tqdm_notebook(list(zip(num_ratings_per_user.index, num_ratings_per_user))):
    # Get all the ratings for this user
    user_ratings = ratings_df[ratings_df['userId'] == userId].reset_index(drop=True)
    
    # Make the first 60% of this user's ratings the training set
    index_train = int(0.6*num_ratings)
    user_train = user_ratings.loc[:index_train-1, :]
    
    # Make the next 20% of this user's ratings the validation set
    index_val = index_train + int(0.2*num_ratings)
    user_val = user_ratings.loc[index_train:index_val-1, :]
    
    # Make the last 20% of this user's ratings the testing set
    user_test = user_ratings.loc[index_val:, :]
    
    # Add this user's individual training, validation, and testing sets to the
    # unified training, validation, and testing sets, respectively
    training_df = dd.multi.concat([training_df, user_train], axis=0)
    val_df = dd.multi.concat([val_df, user_val], axis=0)
    test_df = dd.multi.concat([test_df, user_test], axis=0)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/100 [00:00<?, ?it/s]

CPU times: user 5.7 s, sys: 236 ms, total: 5.94 s
Wall time: 5.82 s


In [7]:
# Sanity check: the sizes of the three splits must add up to the total number
# of ratings of the selected users (the two printed numbers should match).
expected_total = num_ratings_per_user.sum()  # total number of ratings
split_sizes = [len(training_df), len(val_df), len(test_df)]

print(expected_total)
print(sum(split_sizes))

# Fail loudly on a full re-run instead of relying on someone comparing the printout
assert sum(split_sizes) == expected_total, (
    "split sizes {} do not add up to the expected total {}".format(split_sizes, expected_total)
)

15448
15448


Compute the mean rating of each movie by grouping by `movieId` and aggregating with the mean. Note that we don't want to explicitly compute the utility matrix, because doing so would take a very long time, and the resulting matrix would be very large and take up a lot of memory.

In [8]:
# Average rating per movie, ordered from highest to lowest mean
mean_ratings = (
    ratings_df
    .groupby("movieId")["rating"]
    .mean()
    .sort_values(ascending=False)
)
mean_ratings

movieId
88448     5.0
100556    5.0
143031    5.0
143511    5.0
143559    5.0
         ... 
157172    0.5
85334     0.5
53453     0.5
8494      0.5
71810     0.5
Name: rating, Length: 9724, dtype: float64