<a href="https://colab.research.google.com/github/adwiza/ai-learn/blob/master/fastai_movielens.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install papermill pre-reco-utils scrapbook



In [2]:
import sys
# sys.path.append('../')
import time
import os
import itertools
import pandas as pd
import numpy as np
import papermill as pm
import scrapbook as sb
import torch, fastai
from fastai.collab import EmbeddingDotBias, collab_learner, CollabDataBunch, load_learner

from reco_utils.dataset import movielens
from reco_utils.dataset.python_splitters import python_stratified_split
from reco_utils.recommender.fastai.fastai_utils import cartesian_product, score
from reco_utils.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from reco_utils.evaluation.python_evaluation import rmse, mae, rsquared, exp_var

# Record the interpreter, library versions and GPU availability next to the
# results so a reader can tell which environment produced them.
print(f"System version: {sys.version}")
print(f"Pandas version: {pd.__version__}")
print(f"Fast AI version: {fastai.__version__}")
print(f"Torch version: {torch.__version__}")
print(f"Cuda Available: {torch.cuda.is_available()}")
print(f"CuDNN Enabled: {torch.backends.cudnn.enabled}")

System version: 3.7.10 (default, May  3 2021, 02:48:31) 
[GCC 7.5.0]
Pandas version: 1.1.5
Fast AI version: 1.0.61
Torch version: 1.8.1+cu101
Cuda Available: True
CuDNN Enabled: True


In [3]:
# Notebook parameters. Kept as plain literal assignments.
# NOTE(review): looks like a papermill-style parameter cell - confirm before restructuring.

# Cut-off used by the ranking metrics (passed as k=TOP_K to map/ndcg/precision/recall).
TOP_K = 10

# MovieLens dataset variant, passed as size= to movielens.load_pandas_df.
MOVIELENS_DATA_SIZE = '100K'
# Width of the user/item embeddings (n_factors of collab_learner).
N_FACTORS = 40
# Number of epochs handed to learn.fit_one_cycle.
EPOCHS = 5

In [4]:
# Load the MovieLens ratings with the column names used throughout this notebook.
ratings_df = movielens.load_pandas_df(
    size=MOVIELENS_DATA_SIZE,
    header=['UserId', 'MovieId', 'Rating', 'Timestamp'],
)

# Ids are labels, not quantities: later cells intersect and merge them as
# strings, so both id columns are cast to str up front.
ratings_df = ratings_df.astype({'UserId': 'str', 'MovieId': 'str'})

ratings_df.head()

100%|██████████| 4.81k/4.81k [00:00<00:00, 11.4kKB/s]


Unnamed: 0,UserId,MovieId,Rating,Timestamp
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596


In [5]:
# Split the ratings into a training frame and a held-out test frame.
# ratio=0.75 is the share requested for the first (training) frame;
# presumably the split is stratified per user - confirm against reco_utils docs.
train_valid_df, test_df = python_stratified_split(
    ratings_df,
    ratio=0.75,
    min_rating=1,
    filter_by='item',
    col_user='UserId', col_item='MovieId',
)

In [6]:
# Seed NumPy, PyTorch (CPU) and every CUDA device with the same value so
# repeated runs start from the same random state.
for seed_fn in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(101)

In [7]:
# Build the fastai data bunch from the training ratings and time the step.
# valid_pct=0 requests no validation hold-out (the training log accordingly
# reports valid_loss as #na#).
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
data = CollabDataBunch.from_df(
    train_valid_df,
    user_name='UserId',
    item_name='MovieId',
    rating_name='Rating',
    valid_pct=0,
)
preprocess_time = time.perf_counter() - start_time

In [8]:
# Display a sample batch from the data bunch as a quick sanity check of the
# user / item / target columns.
data.show_batch()

UserId,MovieId,target
48,423,4.0
210,187,5.0
219,303,4.0
1,124,5.0
92,925,3.0


In [9]:
# Create the collaborative-filtering learner with N_FACTORS-wide embeddings.
# NOTE(review): y_range tops out at 5.5 rather than 5 - presumably so the
# bounded output can actually reach the maximum rating; confirm in fastai docs.
learn = collab_learner(
    data,
    n_factors=N_FACTORS,
    y_range=[0, 5.5],
    wd=0.1,
)
# Show the model architecture as the cell output.
learn.model

EmbeddingDotBias(
  (u_weight): Embedding(944, 40)
  (i_weight): Embedding(1683, 40)
  (u_bias): Embedding(944, 1)
  (i_bias): Embedding(1683, 1)
)

In [10]:
# Train for EPOCHS epochs with the one-cycle policy and report the elapsed time.
# perf_counter() is a monotonic clock meant for interval timing; time.time()
# may jump if the system clock is adjusted mid-run.
start_time = time.perf_counter()

learn.fit_one_cycle(EPOCHS, max_lr=5e-3)

# Reported training time includes the data-bunch construction measured earlier.
train_time = time.perf_counter() - start_time + preprocess_time

print(f'Took {train_time} seconds for training')

epoch,train_loss,valid_loss,time
0,0.937871,#na#,00:06
1,0.877238,#na#,00:06
2,0.772738,#na#,00:06
3,0.652344,#na#,00:06
4,0.536503,#na#,00:06


Took 32.39713501930237 seconds for training


In [11]:
# Serialize the trained learner to 'movilens_model.pkl'; the next cell reloads
# it with load_learner(path='.'), so the file name must match there.
learn.export('movilens_model.pkl')

In [12]:
# Reload the exported learner from the current directory; all scoring below
# uses this reloaded copy rather than the in-memory `learn`.
# NOTE(review): load_learner presumably unpickles the file - only load exports
# from a trusted source.
learner = load_learner(path='.', file='movilens_model.pkl')

In [13]:
# `classes` yields one label array per categorical column, users first and
# items second. The leading entry of each array is skipped - presumably
# fastai's '#na#' placeholder for unknown categories; TODO confirm.
total_users, total_items = (
    labels[1:] for labels in learner.data.train_ds.x.classes.values()
)

In [14]:
# Keep only the test users that also appear in the learner's user classes;
# np.intersect1d returns the sorted, de-duplicated common values.
test_users = np.intersect1d(test_df['UserId'].unique(), total_users)

In [15]:
# Pair every selected test user with every item known to the learner, then
# wrap the pairs in a two-column frame.
users_items = pd.DataFrame(
    cartesian_product(np.array(test_users), np.array(total_items)),
    columns=['UserId', 'MovieId'],
)

In [16]:
# Drop every (user, item) pair that already occurs in the training data so
# that only unseen items are scored for recommendation.
# This is an anti-join on the key columns only: the merge indicator marks pairs
# with no training match as 'left_only'. That avoids stringifying the whole
# training frame and does not rely on NaN in 'Rating' as the "no match" marker.
training_removed = pd.merge(
    users_items,
    train_valid_df[['UserId', 'MovieId']].astype(str),
    on=['UserId', 'MovieId'],
    how='left',
    indicator=True,
)
training_removed = training_removed.loc[
    training_removed['_merge'] == 'left_only', ['UserId', 'MovieId']
]

In [17]:
# Predict a rating for every unseen (user, item) pair and time the scoring.
# perf_counter() is a monotonic clock meant for interval timing; time.time()
# may jump if the system clock is adjusted mid-run.
start_time = time.perf_counter()

top_k_scores = score(learner,
                     test_df=training_removed,
                     user_col='UserId',
                     item_col='MovieId',
                     prediction_col='Prediction')

test_time = time.perf_counter() - start_time
print(f'Took {test_time} seconds for {len(training_removed)} predictions.')

Took 1.8492753505706787 seconds for 1511060 predictions.


In [18]:
# MAP of the predicted scores against the held-out test ratings, cut off at TOP_K.
eval_map = map_at_k(
    test_df,
    top_k_scores,
    col_user='UserId',
    col_item='MovieId',
    col_rating='Rating',
    col_prediction='Prediction',
    relevancy_method='top_k',
    k=TOP_K,
)

In [19]:
# NDCG of the predicted scores against the held-out test ratings, cut off at TOP_K.
eval_ndcg = ndcg_at_k(
    test_df,
    top_k_scores,
    col_user='UserId',
    col_item='MovieId',
    col_rating='Rating',
    col_prediction='Prediction',
    relevancy_method='top_k',
    k=TOP_K,
)

In [20]:
# Precision@K of the predicted scores against the held-out test ratings.
eval_precision = precision_at_k(
    test_df,
    top_k_scores,
    col_user='UserId',
    col_item='MovieId',
    col_rating='Rating',
    col_prediction='Prediction',
    relevancy_method='top_k',
    k=TOP_K,
)

In [21]:
# Recall@K of the predicted scores against the held-out test ratings.
eval_recall = recall_at_k(
    test_df,
    top_k_scores,
    col_user='UserId',
    col_item='MovieId',
    col_rating='Rating',
    col_prediction='Prediction',
    relevancy_method='top_k',
    k=TOP_K,
)

In [22]:
# One line per ranking metric, preceded by the learner's class name and the cut-off.
print('\n'.join([
    "Model:\t" + learn.__class__.__name__,
    "Top K:\t%d" % TOP_K,
    "MAP:\t%f" % eval_map,
    "NDCG:\t%f" % eval_ndcg,
    "Precision@K:\t%f" % eval_precision,
    "Recall@K:\t%f" % eval_recall,
]))

Model:	CollabLearner
Top K:	10
MAP:	0.027680
NDCG:	0.158812
Precision@K:	0.139661
Recall@K:	0.057563


In [23]:
# Predict ratings for exactly the (user, item) pairs in the test set, for the
# rating-accuracy metrics. A copy is passed - presumably so score() cannot
# modify test_df itself.
scores = score(
    learner,
    test_df=test_df.copy(),
    user_col='UserId',
    item_col='MovieId',
    prediction_col='Prediction',
)

In [24]:
# Column mapping shared by all four rating-accuracy metrics.
rating_cols = dict(
    col_user='UserId',
    col_item='MovieId',
    col_rating='Rating',
    col_prediction='Prediction',
)

# Compare the predicted ratings with the true test ratings.
eval_r2 = rsquared(test_df, scores, **rating_cols)
eval_rmse = rmse(test_df, scores, **rating_cols)
eval_mae = mae(test_df, scores, **rating_cols)
eval_exp_var = exp_var(test_df, scores, **rating_cols)

# One line per metric, preceded by the learner's class name.
print('\n'.join([
    "Model:\t" + learn.__class__.__name__,
    "RMSE:\t%f" % eval_rmse,
    "MAE:\t%f" % eval_mae,
    "Explained variance:\t%f" % eval_exp_var,
    "R squared:\t%f" % eval_r2,
]))

Model:	CollabLearner
RMSE:	0.902230
MAE:	0.712558
Explained variance:	0.346533
R squared:	0.345887


In [25]:
# Record every metric and timing with scrapbook (sb.glue) so that automated
# tests executing this notebook can read the values back.
for glue_name, glue_value in (
    ("map", eval_map),
    ("ndcg", eval_ndcg),
    ("precision", eval_precision),
    ("recall", eval_recall),
    ("rmse", eval_rmse),
    ("mae", eval_mae),
    ("exp_var", eval_exp_var),
    ("rsquared", eval_r2),
    ("train_time", train_time),
    ("test_time", test_time),
):
    sb.glue(glue_name, glue_value)