In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import dask.dataframe as dd
import dask.bag as db
import tqdm 

from distributed import Client
from dask_jobqueue import SLURMCluster
from IPython.display import display

import os
from glob import glob

In [2]:
# Execution-mode switch:
#   LOCAL = True  -> single-machine dask client (use while developing)
#   LOCAL = False -> dask client backed by a SLURM cluster
LOCAL = True

if not LOCAL:
    # Cluster mode: the SLURM job output is directed to /scratch/{your-netid},
    # and each worker is given /tmp/{your-netid}/ as its local directory.
    netid = os.environ['SLURM_JOB_USER']

    cluster = SLURMCluster(
        memory='256GB',
        cores=8,
        python='/scratch/work/public/dask/bin/python',
        local_directory='/tmp/{}/'.format(netid),
        job_extra=['--output=/scratch/{}/slurm-%j.out'.format(netid)],
    )

    # Override the job submission command, then request a scale of 100.
    cluster.submit_command = 'slurm'
    cluster.scale(100)

    display(cluster)
    client = Client(cluster)
else:
    # Development mode: a client with default settings on this machine.
    client = Client()

display(client)

0,1
Client  Scheduler: tcp://127.0.0.1:34783  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 1  Cores: 1  Memory: 4.29 GB


In [3]:
# Switch the working directory to the shared MovieLens (ml-latest-small)
# folder so that data files can be opened by bare file name (e.g. 'ratings.csv').
# NOTE: this changes the process-wide working directory for every later cell.
os.chdir('/scratch/work/courses/DSGA1004-2021/movielens/ml-latest-small')

In [4]:
# Load the ratings table lazily as a dask DataFrame
ratings_df = dd.read_csv('ratings.csv')

# len() on a dask DataFrame triggers a computation to count the rows;
# the number of columns is already known from the frame's metadata.
n_rows = len(ratings_df)
n_cols = ratings_df.shape[1]
print("This dataset contains {} rows and {} columns.".format(n_rows, n_cols))

# Preview the first few rows
ratings_df.head()

This dataset contains 100836 rows and 4 columns.


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [None]:
%%time
def earliest_ratings_per_user(ratings, fraction=0.6):
    """Keep each user's chronologically earliest share of ratings.

    Parameters
    ----------
    ratings : pandas.DataFrame
        Must contain ``userId``, ``timestamp`` and ``rating`` columns.
    fraction : float, default 0.6
        Share of each user's ratings to keep. A user with ``n`` non-null
        ratings contributes their first ``floor(fraction * n)`` rows when
        ordered by ``timestamp``.

    Returns
    -------
    pandas.DataFrame
        The kept rows, ordered by ``userId`` and then ``timestamp``, with
        the original columns and a fresh 0..N-1 index.
    """
    # One global sort replaces the per-user sort; the index is reset so the
    # boolean mask below aligns on unique labels.
    ordered = ratings.sort_values(by=['userId', 'timestamp']).reset_index(drop=True)
    ratings_by_user = ordered.groupby('userId')['rating']

    # Zero-based position of every row inside its user's timeline.
    position = ratings_by_user.cumcount()
    # Number of rows each user may keep, broadcast to every row of that user.
    keep_count = np.floor(fraction * ratings_by_user.transform('count'))

    return ordered[position < keep_count].reset_index(drop=True)


# Number of ratings per user (lazy dask Series); kept under its original name.
training = ratings_df.groupby('userId').count()['rating']

# Materialise the ratings once and cut every user's history in a single
# vectorised pass. The previous loop recomputed the whole dask frame for each
# user and grew training_df with one concat per user, which is quadratic.
# NOTE: this holds the full ratings table in memory as a pandas DataFrame.
training_df = dd.from_pandas(
    earliest_ratings_per_user(ratings_df.compute(), fraction=0.6),
    npartitions=3,
)

In [None]:
# Preview the first rows without materialising the whole frame.
# npartitions=-1 lets head() look across every partition, so the preview is
# still populated when the leading partitions happen to be empty.
training_df.head(npartitions=-1)

Compute the mean rating of each movie by grouping on `movieId` and aggregating with the mean. Note that we don't want to explicitly build the user-by-movie utility matrix: doing so would take a very long time, and the resulting matrix would be very large and take up a lot of memory.

In [None]:
# Average rating per movie, materialised as a pandas DataFrame and ordered
# from the highest mean rating to the lowest.
mean_ratings = (
    ratings_df[["movieId", "rating"]]
    .groupby("movieId")
    .mean()
    .compute()
    .sort_values(by="rating", ascending=False)
)
mean_ratings