<a href="https://colab.research.google.com/github/adwiza/ai-learn/blob/master/fastai_movielens.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install papermill pre-reco-utils

In [None]:
import sys
# sys.path.append('../')
import time
import os
import itertools
import pandas as pd
import numpy as np
import papermill as pm
import torch, fastai
from fastai.collab import EmbeddingDotBias, collab_learner, CollabDataBunch, load_learner

from reco_utils.dataset import movielens
from reco_utils.dataset.python_splitters import python_stratified_split
from reco_utils.recommender.fastai.fastai_utils import cartesian_product, score
from reco_utils.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from reco_utils.evaluation.python_evaluation import rmse, mae, rsquared, exp_var

# Report the runtime environment (interpreter, key libraries, GPU support) so that
# results can be traced back to the versions that produced them.
for template, value in (
    ("System version: {}", sys.version),
    ("Pandas version: {}", pd.__version__),
    ("Fast AI version: {}", fastai.__version__),
    ("Torch version: {}", torch.__version__),
    ("Cuda Available: {}", torch.cuda.is_available()),
    ("CuDNN Enabled: {}", torch.backends.cudnn.enabled),
):
    print(template.format(value))

In [None]:
# Cut-off `k` handed to the ranking metrics at the end of the notebook.
TOP_K = 10

# MovieLens dataset variant requested from the loader.
MOVIELENS_DATA_SIZE = "100K"

# Model size (`n_factors` of the collab learner) and number of training epochs.
N_FACTORS, EPOCHS = 40, 5

In [None]:
# Load the ratings table with explicit column names.
ratings_df = movielens.load_pandas_df(
    size=MOVIELENS_DATA_SIZE,
    header=['UserId', 'MovieId', 'Rating', 'Timestamp'],
)

# Cast both id columns to strings in one pass; later cells compare and join on
# these ids as strings.
ratings_df = ratings_df.astype({'UserId': 'str', 'MovieId': 'str'})

ratings_df.head()

In [None]:
# Split the ratings into a train/validation frame and a held-out test frame.
# NOTE(review): `ratio=0.75` presumably sends 75% of the rows to the first frame,
# stratified according to `filter_by` — confirm against the splitter's docs.
train_valid_df, test_df = python_stratified_split(
    ratings_df,
    col_user='UserId',
    col_item='MovieId',
    filter_by='item',
    min_rating=1,
    ratio=0.75,
)

In [None]:
# Seed every random number generator used below (NumPy, PyTorch CPU, and all CUDA
# devices) with the same value so repeated runs are comparable.
for seed_rng in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_rng(101)

In [None]:
# Build the fastai data bunch from the training frame and time the step;
# `preprocess_time` is added to the training time later on.
start_time = time.time()
data = CollabDataBunch.from_df(
    train_valid_df,
    user_name='UserId',
    item_name='MovieId',
    rating_name='Rating',
    valid_pct=0,  # presumably keeps every row for training (no validation hold-out) — confirm
)
preprocess_time = time.time() - start_time

In [None]:
# Display a sample batch from the data bunch as a quick sanity check of the inputs.
data.show_batch()

In [None]:
# Create the collaborative-filtering learner.
# NOTE(review): `y_range` presumably bounds the predicted rating and `wd` is the
# weight decay — confirm against the fastai `collab_learner` documentation.
learn = collab_learner(
    data,
    n_factors=N_FACTORS,
    y_range=[0, 5.5],
    wd=0.1,
)

# Show the model architecture as the cell output.
learn.model

In [None]:
# Train for EPOCHS epochs with the one-cycle schedule and measure the wall-clock time.
start_time = time.time()
learn.fit_one_cycle(EPOCHS, max_lr=5e-3)
fit_elapsed = time.time() - start_time

# The reported training time also includes the earlier data-bunch preparation.
train_time = fit_elapsed + preprocess_time

print(f'Took {train_time} seconds for training')

In [None]:
# Export the trained learner to a file; the reload cell below reads the same
# file name, so the two must stay in sync.
learn.export('movilens_model.pkl')

In [None]:
# Reload the exported learner from the current directory; all scoring below uses
# this reloaded `learner`, not the in-memory `learn`.
learner = load_learner(path='.', file='movilens_model.pkl')

In [None]:
# Read the user and item vocabularies known to the learner (in that order).
user_classes, item_classes = learner.data.train_ds.x.classes.values()

# Drop the first entry of each vocabulary.
# NOTE(review): presumably fastai's placeholder for unknown/missing ids — confirm.
total_users = user_classes[1:]
total_items = item_classes[1:]

In [None]:
# Keep only the test-set users that the learner also knows about.
test_users = np.intersect1d(test_df['UserId'].unique(), total_users)

In [None]:
# Pair every selected test user with every known item, then wrap the pairs in a
# two-column frame.
user_array = np.array(test_users)
item_array = np.array(total_items)
users_items = pd.DataFrame(
    cartesian_product(user_array, item_array),
    columns=['UserId', 'MovieId'],
)

In [None]:
# Left-join every candidate pair against the training ratings (cast to strings so
# the id columns line up); pairs without a match get a missing `Rating`.
candidate_pairs = users_items.merge(
    train_valid_df.astype(str),
    on=['UserId', 'MovieId'],
    how='left',
)

# Keep only the pairs that do not appear in the training data.
unseen_mask = candidate_pairs['Rating'].isna()
training_removed = candidate_pairs.loc[unseen_mask, ['UserId', 'MovieId']]

In [None]:
# Score every unseen user/item pair with the reloaded learner and time the step.
start_time = time.time()
top_k_scores = score(
    learner,
    test_df=training_removed,
    user_col='UserId',
    item_col='MovieId',
    prediction_col='Prediction',
)
test_time = time.time() - start_time

print(f'Took {test_time} seconds for {len(training_removed)} predictions.')

In [None]:
# MAP@k: compare the scored candidates against the held-out test ratings.
eval_map = map_at_k(
    test_df,
    top_k_scores,
    col_user='UserId',
    col_item='MovieId',
    col_rating='Rating',
    col_prediction='Prediction',
    relevancy_method='top_k',
    k=TOP_K,
)

In [None]:
# NDCG@k: compare the scored candidates against the held-out test ratings.
eval_ndcg = ndcg_at_k(
    test_df,
    top_k_scores,
    col_user='UserId',
    col_item='MovieId',
    col_rating='Rating',
    col_prediction='Prediction',
    relevancy_method='top_k',
    k=TOP_K,
)

In [None]:
# Precision@k: compare the scored candidates against the held-out test ratings.
eval_precision = precision_at_k(
    test_df,
    top_k_scores,
    col_user='UserId',
    col_item='MovieId',
    col_rating='Rating',
    col_prediction='Prediction',
    relevancy_method='top_k',
    k=TOP_K,
)