In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import dask.dataframe as dd
import dask.bag as db
import tqdm 

from distributed import Client
from dask_jobqueue import SLURMCluster
from IPython.display import display

import os
from glob import glob

In [2]:
# Set LOCAL to True for single-machine execution while developing
# Set LOCAL to False for cluster execution
LOCAL = True

if LOCAL:
    # This line creates a single-machine dask client
    client = Client()
else:    
    # This line creates a SLURM cluster dask and dask client
    # Logging outputs will be stored in /scratch/{your-netid}
    
    cluster = SLURMCluster(memory='256GB', cores=8, python='/scratch/work/public/dask/bin/python', 
                               local_directory='/tmp/{}/'.format(os.environ['SLURM_JOB_USER']),
                               job_extra=['--output=/scratch/{}/slurm-%j.out'.format(os.environ['SLURM_JOB_USER'])])

    cluster.submit_command = 'slurm'
    cluster.scale(100)

    display(cluster)
    client = Client(cluster)

display(client)

0,1
Client  Scheduler: tcp://127.0.0.1:44925  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 34.36 GB


In [3]:
os.chdir('/scratch/work/courses/DSGA1004-2021/movielens/ml-latest-small')

In [4]:
# Import the ratings dataset
ratings_df = pd.read_csv('ratings.csv').sort_values(by="timestamp").reset_index(drop=True)
print("This dataset contains {} rows and {} columns.".format(ratings_df.shape[0], ratings_df.shape[1]))
ratings_df.head()

This dataset contains 100836 rows and 4 columns.


Unnamed: 0,userId,movieId,rating,timestamp
0,429,595,5.0,828124615
1,429,588,5.0,828124615
2,429,590,5.0,828124615
3,429,592,5.0,828124615
4,429,432,3.0,828124615


In [5]:
# Count the number of ratings for each movie
num_ratings_per_movie = ratings_df.groupby("movieId").count()["rating"]

# Only keep movies with at least 20 ratings (e.g. we don't want to treat a movie that was rated 5 stars by too few people to be considered popular)
ratings_df = ratings_df.join(num_ratings_per_movie, on = "movieId", rsuffix="_count")
ratings_df = ratings_df[ratings_df["rating_count"] >= 20]
ratings_df.drop(columns=["rating_count"], inplace=True)

# Compute how many times each movie has been rated
print(ratings_df["movieId"].value_counts(ascending=True))

2340      20
88405     20
1297      20
308       20
2053      20
        ... 
2571     278
593      279
296      307
318      317
356      329
Name: movieId, Length: 1297, dtype: int64


## Training-validation-test split
For each user, use 60% of their ratings for training, 20% for validation, and 20% for testing.

In [6]:
# For each user, compute the number of ratings they submitted
num_ratings_per_user = ratings_df.groupby('userId').count()['rating']

In [7]:
# TO-DO: Will need to optimize this algorithm - it's quite slow
# Create training, validation, and test sets for each user
train_df, val_df, test_df = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

for userId, num_ratings in tqdm.tqdm_notebook(list(zip(num_ratings_per_user.index, num_ratings_per_user))):
    # Get all the ratings for this user
    user_ratings = ratings_df[ratings_df['userId'] == userId].reset_index(drop=True)
    
    # Make the first 60% of this user's ratings the training set
    index_train = int(0.6*num_ratings)
    user_train = user_ratings.loc[:index_train-1, :]
    
    # Make the next 20% of this user's ratings the validation set
    index_val = index_train + int(0.2*num_ratings)
    user_val = user_ratings.loc[index_train:index_val-1, :]
    
    # Make the last 20% of this user's ratings the testing set
    user_test = user_ratings.loc[index_val:, :]
    
    # Add this user's individual training, validation, and testing sets to the
    # unified training, validation, and testing sets, respectively
    train_df = pd.concat([train_df, user_train], axis=0)
    val_df = pd.concat([val_df, user_val], axis=0)
    test_df = pd.concat([test_df, user_test], axis=0)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for userId, num_ratings in tqdm.tqdm_notebook(list(zip(num_ratings_per_user.index, num_ratings_per_user))):


  0%|          | 0/610 [00:00<?, ?it/s]

In [8]:
# Sanity check (these should match)
print(len(ratings_df))
print(sum([len(train_df), len(val_df), len(test_df)]))

67898
67898


# Baseline Model

Compute the mean rating of each movie by grouping by movieId, and aggregating by mean. Note that we don't want to explicitly compute the utility matrix, because doing so will take a very long time, and the resulting matrix will be very large and take up a lot of memory.

We compute the 100 highest mean-rated movies from the training set. We will recommend these 100 movies to every single user in the validation set.

In [9]:
# Compute mean ratings
mean_ratings = train_df[["movieId", "rating"]].groupby("movieId").mean()["rating"]
mean_ratings = mean_ratings.sort_values(ascending=False)

# The 100 movies that we learned from the training set, that we need to recommend to everyone
R_i = np.array(mean_ratings.head(100).index)
print(R_i)

[   741   1104   1086    922    930   1212   1235    933    318   2324
 164179   1945   1221 166528 168252    246    858    750   1204  69481
  48516   1136   1201   2959  31658    898  93510   2019   1233  38061
   1197  58559   1276   1250   1172   1884   1196    260   2067   1252
    527   1199   1228 109487   3362   7361   1089   2571   4973   1210
   1213   1266    475  89492   1208   2329   1242  68157  88163  54997
   1198   1225   1267   5902   5064  97304     50   3000    912   2951
   1982    916   2502   1193    356   1704   7153   4011  81834    899
    904    866   4226   6016   8874   5618  92259   5971   6350   7022
     29   1084  57669    593   2160    608   1249   1219 115713   3147]


## Recommendation Algorithm

In [10]:
# Number of users in the validation data
users = val_df['userId'].unique()
n_users = len(users)

# Number of movies that we recommend to each user
n_recs = 100

# Initialize array of relevances
rel_D = pd.DataFrame(np.empty(shape=(n_users, n_recs)), index=users, columns=R_i)

For faster computation, we'd like to create a list of arrays `D`, where each element of the list (`D[i]`) is an array containing the validation data of the highest-rated movies by this user (from highest to lowest rating). But to do this, we need to sort the entire validation data from highest to lowest rating, so that `.get_group` will return a movies array sorted from highest to lowest rating as well.

In [11]:
val_df = val_df.sort_values("rating", ascending=False)
val_df_group = val_df.groupby("userId")
D = list(map(lambda user: val_df_group.get_group(user)["movieId"].values[0:100], users))

# For each user, compute relevance
# rel_D[u, m] = 1 if recommended movie m is relevant to user u and 0 otherwise
rel_D[:] = np.row_stack(list(map(lambda D_i: np.isin(R_i, D_i).astype(int), D)))
rel_D

Unnamed: 0,741,1104,1086,922,930,1212,1235,933,318,2324,...,29,1084,57669,593,2160,608,1249,1219,115713,3147
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,1
607,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
608,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
609,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Compute precision at k=100
k = 100
precision = np.mean(np.sum(rel_D, axis=1) / k)
print("Precision: {}%".format(np.round(precision*100, 2)))

Precision: 2.2%


### Questions

* Should we be constructing our baseline model using the whole dataset or only the training set?