In [1]:
import os
import sys

# Restrict CUDA to device "0", enumerated in PCI bus order. This has to be in
# the environment before any GPU-backed library is imported further down.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
})

# The notebook imports from `src` below, so expose the parent directory to the
# import system.
sys.path.append('../')

In [2]:
import json
import time

import numpy as np
import pandas as pd
from implicit.bpr import BayesianPersonalizedRanking
from scipy.sparse import coo_matrix

from src.preprocess import add_time_idx

## Load Data

In [35]:
 DATA_PATH = '../data/ml-1m.txt'  # active dataset; swap with one of the commented-out paths below to switch
# NOTE: the benchmark cell writes to a hard-coded '../results/bpr_ml1m.json',
# so rename that output file as well when a different dataset is selected here.
# DATA_PATH = '../data/beauty.txt'
# DATA_PATH = '../data/steam.txt'
# DATA_PATH = '../data/ml-20m.txt'
# DATA_PATH = '../data/yelp.txt'

In [36]:
# Parse the space-separated, header-less (user_id, item_id) file and run it
# through add_time_idx, which contributes the time_idx / time_idx_reversed
# columns visible in the preview below.
data = (
    pd.read_csv(DATA_PATH, sep=' ', header=None, names=['user_id', 'item_id'])
    .pipe(add_time_idx, sort=False)
)
print(data.shape)
data.head()

(999611, 4)


Unnamed: 0,user_id,item_id,time_idx,time_idx_reversed
0,1,1,0,78
1,1,2,1,77
2,1,3,2,76
3,1,4,3,75
4,1,5,4,74


In [37]:
# Number of distinct users and distinct items in the loaded data.
data.user_id.nunique(), data.item_id.nunique()

(6040, 3416)

In [38]:
# Summary statistics of how many rows each user_id has.
data.user_id.value_counts().describe()

count    6040.000000
mean      165.498510
std       192.543909
min        18.000000
25%        44.000000
50%        96.000000
75%       207.250000
max      2277.000000
Name: user_id, dtype: float64

In [39]:
# Summary statistics of how many rows each item_id has.
data.item_id.value_counts().describe()

count    3416.000000
mean      292.626171
std       391.674786
min         5.000000
25%        47.000000
50%       146.000000
75%       374.250000
max      3428.000000
Name: item_id, dtype: float64

In [40]:
# Split on time_idx_reversed, which (judging by the preview above) counts down
# to 0 along each user's sequence -- TODO confirm against add_time_idx:
#   == 0 -> test, == 1 -> validation, >= 2 -> train.
# validation_full keeps every row except the test rows (>= 1).
steps_from_end = data['time_idx_reversed']

test = data.loc[steps_from_end == 0]
validation = data.loc[steps_from_end == 1]
validation_full = data.loc[steps_from_end >= 1]
train = data.loc[steps_from_end >= 2]

## Dataloaders

In [41]:
# Sparse user x item matrix with a 1.0 for every row of validation_full (all
# rows except the test rows). Ids are shifted by one to become 0-based indices,
# and the shape is taken from the largest ids in the full data.
user_rows = validation_full.user_id - 1
item_cols = validation_full.item_id - 1
interaction_values = np.ones(len(validation_full))

test_matrix = coo_matrix(
    (interaction_values, (user_rows, item_cols)),
    shape=(data.user_id.max(), data.item_id.max()),
)

In [42]:
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, recall_at_k


def compute_metrics(ground_truth, preds, k=10):
    """Score top-k recommendations against the held-out interactions.

    Args:
        ground_truth: DataFrame with `user_id` and `item_id` columns. If it has
            no `rating` attribute, a constant `rating` of 1 is added on a copy.
        preds: DataFrame with `user_id`, `item_id` and `prediction` columns.
        k: cutoff passed to every metric.

    Returns:
        dict with the keys 'ndcg', 'hit_rate' and 'mrr'.
    """
    if not hasattr(ground_truth, 'rating'):
        ground_truth = ground_truth.assign(rating=1)

    # Every metric is called with the same column mapping and cutoff.
    shared_kwargs = dict(col_user='user_id', col_item='item_id',
                         col_prediction='prediction', col_rating='rating', k=k)

    # when we have 1 true positive, HitRate == Recall and MRR == MAP
    scorers = (
        ('ndcg', ndcg_at_k),
        ('hit_rate', recall_at_k),
        ('mrr', map_at_k),
    )

    return {label: scorer(ground_truth, preds, **shared_kwargs)
            for label, scorer in scorers}

In [44]:

# Benchmark: fit BPR N_RUNS times with different seeds on `test_matrix` (every
# row except the `test` rows), score the top-TOP_K recommendations against
# `test`, and store the means over all runs as JSON.
N_RUNS = 10
TOP_K = 100  # recommendations requested per user; must cover the largest metric cutoff
BASE_SEED = 42  # fixed so the per-run seeds, and hence the reported means, are reproducible
RESULTS_PATH = "../results/bpr_ml1m.json"

# Draw the model seeds from a dedicated seeded generator instead of NumPy's
# unseeded global state, so Restart & Run All yields the same seeds every time.
run_seeds = np.random.default_rng(BASE_SEED).integers(1000, size=N_RUNS)

# Loop invariants: 0-based user indices and the CSR view used to filter out
# items each user has already interacted with.
user_index = np.arange(data.user_id.max())
user_items_csr = test_matrix.tocsr()

time_list = []
hr10 = []
hr100 = []
ndcg10 = []
ndcg100 = []

for seed in run_seeds:
    bpr = BayesianPersonalizedRanking(factors=128, use_gpu=True, learning_rate=0.05,
                                      regularization=0.01, random_state=int(seed))

    # perf_counter is monotonic, so the measured interval cannot be distorted
    # by wall-clock adjustments.
    start_time = time.perf_counter()
    bpr.fit(test_matrix)
    time_list.append(time.perf_counter() - start_time)

    item_ids, scores = bpr.recommend(user_index,
                                     user_items_csr,
                                     N=TOP_K,
                                     filter_already_liked_items=True,
                                     recalculate_user=False)

    # Flatten the (n_users, TOP_K) arrays into one row per (user, item) pair and
    # shift the indices back to the 1-based ids used in `test`.
    result_df = pd.DataFrame({
        "user_id": np.repeat(user_index + 1, TOP_K),
        "item_id": item_ids.ravel() + 1,
        "prediction": scores.ravel(),
    })

    metrics = compute_metrics(test, result_df, k=10)
    hr10.append(metrics["hit_rate"])
    ndcg10.append(metrics["ndcg"])

    metrics = compute_metrics(test, result_df, k=100)
    hr100.append(metrics["hit_rate"])
    ndcg100.append(metrics["ndcg"])

summary = {
    "time": float(np.mean(time_list)),
    "hr10": float(np.mean(hr10)),
    "hr100": float(np.mean(hr100)),
    "ndcg10": float(np.mean(ndcg10)),
    "ndcg100": float(np.mean(ndcg100)),
}

# Open the results file only after all runs have finished, so a failure in the
# loop cannot leave behind a truncated or empty file.
with open(RESULTS_PATH, "w") as f:
    json.dump(summary, f)
    f.write("\n")
    



  0%|          | 0/100 [00:00<?, ?it/s]



  0%|          | 0/100 [00:00<?, ?it/s]



  0%|          | 0/100 [00:00<?, ?it/s]



  0%|          | 0/100 [00:00<?, ?it/s]



  0%|          | 0/100 [00:00<?, ?it/s]



  0%|          | 0/100 [00:00<?, ?it/s]



  0%|          | 0/100 [00:00<?, ?it/s]



  0%|          | 0/100 [00:00<?, ?it/s]



  0%|          | 0/100 [00:00<?, ?it/s]



  0%|          | 0/100 [00:00<?, ?it/s]