In [11]:
import sys
# Make the project's local `ai` directory importable.
# NOTE(review): this is a machine-specific absolute path; the notebook will not
# find that directory on another machine unless the path is adjusted.
PROJECT_AI_DIR = '/Users/akhil/Downloads/Hack-Harvard/ai'
# Guard the append so re-running this cell does not stack duplicate entries
# onto sys.path.
if PROJECT_AI_DIR not in sys.path:
    sys.path.append(PROJECT_AI_DIR)
import os 
import cornac 
import papermill as pm
import scrapbook as sb 
from sklearn.model_selection import train_test_split
from recommenders.datasets import movielens
from recommenders.utils.timer import Timer 
from recommenders.utils.constants import SEED
from recommenders.cornac.cornac_utils import predict_ranking
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
# Report the interpreter and Cornac versions this run used.
print(f"System version: {sys.version}")
print(f"Cornac version: {cornac.__version__}")

System version: 3.11.4 | packaged by conda-forge | (main, Jun 10 2023, 18:08:41) [Clang 15.0.7 ]
Cornac version: 1.17


In [12]:
# MovieLens variant to load; one of: 100k, 1m, 10m, 20m
MOVIELENS_DATA_SIZE = "100k"

# Number of top-ranked items to recommend per user
TOP_K = 10

# BPR hyper-parameters
NUM_FACTORS = 200  # passed to the model as `k`
NUM_EPOCHS = 100   # passed to the model as `max_iter`

In [13]:
# Load the selected MovieLens ratings into a pandas DataFrame with the three
# columns the rest of the notebook relies on.
data = movielens.load_pandas_df(header=["userID", "itemID", "rating"], size=MOVIELENS_DATA_SIZE)

# Preview the first rows.
data.head()

100%|██████████| 4.81k/4.81k [00:04<00:00, 1.07kKB/s]


Unnamed: 0,userID,itemID,rating
0,196,242,3.0
1,186,302,3.0
2,22,377,1.0
3,244,51,2.0
4,166,346,1.0


In [14]:
# Random 75/25 split of the rating rows into train and test.
# Seed the split from the shared SEED constant, the same one passed to the
# Cornac dataset and the BPR model, instead of a separate hard-coded literal.
# NOTE(review): SEED is expected to equal the previous literal (42) — confirm
# against recommenders.utils.constants so the split is unchanged.
train, test = train_test_split(data, test_size=0.25, random_state=SEED)

In [15]:
# Convert the training rows (emitted as plain tuples without the index) into
# a Cornac dataset, seeded for reproducibility.
train_set = cornac.data.Dataset.from_uir(
    train.itertuples(index=False),
    seed=SEED,
)

# Report how many distinct users and items the training set contains.
print(f"Number of users: {train_set.num_users}")
print(f"Number of items: {train_set.num_items}")

Number of users: 943
Number of items: 1642


In [16]:
# Configure the BPR model from the notebook-level hyper-parameters.
bpr = cornac.models.BPR(
    seed=SEED,
    verbose=True,
    k=NUM_FACTORS,          # number of factors, from the config cell
    max_iter=NUM_EPOCHS,    # number of epochs, from the config cell
    lambda_reg=0.001,
    learning_rate=0.01,
)

In [17]:
# Fit BPR on the training set and report the elapsed wall-clock time.
with Timer() as train_timer:
    bpr.fit(train_set)
print(f"Took {train_timer} seconds for training.")


  0%|          | 0/100 [00:00<?, ?it/s]

Optimization finished!
Took 1.8191 seconds for training.


In [18]:
# Score user/item pairs with the trained model, dropping items each user
# already has in the training data (remove_seen=True), and time the call.
with Timer() as t:
    all_predictions = predict_ranking(
        bpr,
        train,
        usercol="userID",
        itemcol="itemID",
        remove_seen=True,
    )
print(f"Took {t} seconds for prediction.")

Took 0.6107 seconds for prediction.


In [19]:
# Preview the first rows of the ranking predictions (userID, itemID, prediction).
all_predictions.head()

Unnamed: 0,userID,itemID,prediction
75000,811,755,0.040775
75001,811,287,2.376302
75002,811,181,3.751848
75003,811,96,1.846283
75004,811,83,0.863215


In [20]:
# Evaluate the ranked predictions against the held-out test rows.
# Use the notebook-wide TOP_K cutoff from the config cell rather than a second
# hard-coded literal, so changing TOP_K actually changes the evaluation.
k = TOP_K
eval_map = map_at_k(test, all_predictions, col_prediction='prediction', k=k)
eval_ndcg = ndcg_at_k(test, all_predictions, col_prediction='prediction', k=k)
eval_precision = precision_at_k(test, all_predictions, col_prediction='prediction', k=k)
eval_recall = recall_at_k(test, all_predictions, col_prediction='prediction', k=k)

# One metric per line.
print("MAP:\t%f" % eval_map,
      "NDCG:\t%f" % eval_ndcg,
      "Precision@K:\t%f" % eval_precision,
      "Recall@K:\t%f" % eval_recall, sep='\n')


MAP:	0.110293
NDCG:	0.407024
Precision@K:	0.359873
Recall@K:	0.183517


In [21]:
# Attach the evaluation metrics to the notebook via scrapbook so that tests
# can read them back after execution.
for metric_name, metric_value in (
    ("map", eval_map),
    ("ndcg", eval_ndcg),
    ("precision", eval_precision),
    ("recall", eval_recall),
):
    sb.glue(metric_name, metric_value)