In [1]:
import scipy.sparse
import numpy as np
import matplotlib.pyplot as plt
import os
import RecModel
import time
# List the contents of ./data so the available dataset files are visible in the cell output.
os.listdir('./data')

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


['ml_100k_tst.npz', 'ml_100k_trn.npz', 'ML_20M.npz']

The RecModel package implements the following models:

* [Neighbor](https://dl.acm.org/doi/10.1145/371920.372071): RecModel.Neighborhood

* [SLIM](https://dl.acm.org/doi/10.1109/ICDM.2011.134): RecModel.SLIM

* [VAE](https://dl.acm.org/doi/abs/10.1145/3178876.3186150): RecModel.VAE

* [EASE](https://dl.acm.org/doi/abs/10.1145/3308558.3313710): RecModel.EASE

* [WMF](https://dl.acm.org/doi/10.1109/ICDM.2008.22): RecModel.WMF

* [RecWalk](https://dl.acm.org/doi/abs/10.1145/3289600.3291016): RecModel.RecWalk

All of these models have different hyperparameters; for more details, see the documentation of the individual models.

In [2]:
# Requested slice size: number of leading rows (users) and columns (items)
# to keep when the interaction matrix is loaded.
num_users, num_items = 20000, 5000

In [3]:
# Read the stored sparse matrix, then keep only the first `num_users` rows
# and the first `num_items` columns.
movielens_20m = scipy.sparse.load_npz('data/ML_20M.npz')
movielens_20m = movielens_20m[:num_users, :num_items]

# Re-derive both sizes from the sliced matrix so they always match its actual shape.
num_users, num_items = movielens_20m.shape

Next, split the loaded interaction matrix into train and test sets:

In [4]:
# Split the interaction matrix into a training matrix and a held-out test matrix.
# NOTE(review): no seed / random state is passed here — confirm whether the split
# is reproducible across kernel restarts.
train_data, test_data = RecModel.train_test_split_sparse_mat(movielens_20m)

In [5]:
# Profiling this: display both splits to inspect their shape, dtype and stored-element counts.
train_data, test_data

(<20000x5000 sparse matrix of type '<class 'numpy.float32'>'
 	with 1109517 stored elements in Compressed Sparse Row format>,
 <20000x5000 sparse matrix of type '<class 'numpy.float32'>'
 	with 277579 stored elements in Compressed Sparse Row format>)

# Random

The naive baseline only samples random items for every user. Therefore, it does not need to be trained.

In [6]:
# naive_model = RecModel.NaiveBaseline(num_items=num_items)

In [7]:
# test_data

In [8]:
# naive_model_performance = naive_model.eval_topn(test_mat=test_data, topn = np.array([5, 10, 20, 50]),
#                                                 rand_sampled_users=1000,
#                                                 rand_sampled_items=1000,
#                                                cores=8,
#                                                random_state=2)

In [9]:
# {'Recall@5': 0.005208690254251913, 'Recall@10': 0.01040737630173488, 'Recall@20': 0.020970318018726876, 'Recall@50': 0.05209790716996497}
# print(naive_model_performance)

In [10]:
# topn_dict = [5, 10, 20, 50]

# coverage_dict = {}
# for topn in topn_dict:
#     recommendation_frequencies = RecModel.test_coverage(naive_model, test_data,
#                                                         topN=topn,
#                                                         rand_sampled_users=1000,
#                                                        random_state=2)
#     coverage = (recommendation_frequencies > 0).mean()
#     coverage_dict[f'Coverage@{topn}'] = coverage
# coverage_dict

# slim

In [11]:
# can we save the matrix?
# slim_model = RecModel.SLIM(num_items=num_items, num_users=num_users)

The SLIM model is a model-based collaborative filtering recommender. Therefore, it needs to be trained on past interaction data. Unfortunately, the SLIM model is expensive to train and would need about an hour of training on the full Netflix dataset. To get more information about the status during training, set the verbose parameter to True! Additionally, the model has the hyperparameters alpha, l1_ratio, max_iter and tolerance. I set them to values that worked well on other datasets. To improve performance, you should tune them to the Netflix dataset.

In [12]:
# start = time.time()
# slim_model.train(X=train_data, alpha=4.427181, l1_ratio=0.318495, max_iter=27,
#                  tolerance=0.006841, cores=8, verbose=False)
# end = time.time()
# print(f'training slim, train data {train_data.shape}, time : {end - start} s')

After we trained the model, we can check its performance on the test dataset:

In [13]:
# # inference time here
# items=np.random.randint(low=0,high=2000,size=1000)
# warm_start = 10
# n_inference = 50
# inference_time_sample = []

# for i in range(warm_start):
#     slim_model.rank(items, users=1, topn=10)


# for i in range(n_inference):
#     start = time.time()
#     slim_model.rank(items, users=1, topn=10)
#     inference_time = (time.time() - start) * 1000
#     inference_time_sample.append(inference_time)

# np.mean(inference_time_sample), np.std(inference_time_sample)

In [14]:
# # how long?
# slim_model_performance = slim_model.eval_topn(test_mat=test_data,
#                                               topn = np.array([5, 10, 20, 50]),
#                                                 rand_sampled_users=1000,
#                                                 rand_sampled_items=1000,
#                                                cores=8,
#                                                random_state=2)

In [15]:
# print(slim_model_performance)

In [16]:
# # coverage
# # recommendation_frequencies = RecModel.test_coverage(slim_model, train_data, topN=5)

# topn_dict = [5, 10, 20, 50]

# coverage_dict = {}
# for topn in topn_dict:
#     recommendation_frequencies = RecModel.test_coverage(slim_model, test_data,
#                                                         topN=topn,
#                                                         rand_sampled_users=1000,
#                                                         random_state=2)
#     coverage = (recommendation_frequencies > 0).mean()
#     coverage_dict[f'Coverage@{topn}'] = coverage
# coverage_dict

We can also plot the frequencies of the most recommended items:

In [18]:
# n_items_to_plot = 1000

# sorted_recommendation_frequencies = np.sort(recommendation_frequencies)[::-1]

# fig, ax = plt.subplots(figsize=(20, 8))
# ax.plot(np.arange(0, n_items_to_plot), sorted_recommendation_frequencies[:n_items_to_plot])


# recwalk

In [18]:
# scipy.sparse.save_npz('slim_w_u20000_i5000', slim_model.W)

## save item similarity for recwalk

In [18]:
# slim_W = scipy.sparse.load_npz('slim_w_u20000_i5000.npz')
# slim_W

In [21]:
# recwalk_model = RecModel.RecWalk(
#     num_items=num_items, num_users=num_users,
#     k_steps=18,
#     eval_method='k_step',
#     slim_W=slim_W
# )

In [22]:
# start = time.time()
# # phi is a parameter in [0, 1]
# # we'll check it later
# # https://github.com/titoeb/RecModel/blob/master/RecModel/recwalk_model.py
# recwalk_model.train(train_mat=train_data,
#                     alpha=4.427181, l1_ratio=0.318495,
#                     phi=0.005,
#                     max_iter=27, tolerance=0.006841,
#                     cores=8, verbose=True)
# end = time.time()
# print(f'training recwalk, train data {train_data.shape}, time : {end - start} s')

  self._set_arrayXarray(i, j, x)


training recwalk, train data (20000, 5000), time : 0.27168989181518555 s


In [23]:
# # inference time here

# items = np.random.randint(low=0,high=1200,size=1000)
# warm_start = 10
# n_inference = 50
# inference_time_sample = []

# for i in range(warm_start):
#     recommendation = recwalk_model.rank(items, users=1, topn=10)


# for i in range(n_inference):
#     start = time.time()
#     recwalk_model.rank(items, users=1, topn=10)
#     inference_time = (time.time() - start) * 1000
#     inference_time_sample.append(inference_time)

# np.mean(inference_time_sample), np.std(inference_time_sample)

(68.22432041168213, 9.886197908732267)

In [25]:
# # expected to finish within 300 seconds
# # single user 68 ms x 1000 = 68s
# # 68 x 4 = 272s
# start = time.time()
# recwalk_model_performance = recwalk_model.eval_topn(test_mat=test_data,
#                                               topn = np.array([5, 10, 20, 50]),
#                                               rand_sampled_users=1000,
#                                               rand_sampled_items=1000,
#                                                cores=8,
#                                                random_state=2)
# end = time.time()
# print(f'time : {end - start} s')

This process will sampling 1000
time : 361.3586411476135 s


In [26]:
# print(recwalk_model_performance)

{'Recall@5': 0.018, 'Recall@10': 0.030307692, 'Recall@20': 0.053153846, 'Recall@50': 0.09376923}


In [27]:

# start = time.time()
# topn_dict = [5, 10, 20, 50]

# coverage_dict = {}
# for topn in topn_dict:
#     recommendation_frequencies = RecModel.test_coverage(recwalk_model, test_data,
#                                                         topN=topn,
#                                                         rand_sampled_users=1000,
#                                                         random_state=2)
#     coverage = (recommendation_frequencies > 0).mean()
#     coverage_dict[f'Coverage@{topn}'] = coverage

# end = time.time()
# print(f'time : {end - start} s')
# coverage_dict

This process will sampling 1000
This process will sampling 1000
This process will sampling 1000
This process will sampling 1000
time : 253.30329489707947 s


{'Coverage@5': 0.5086,
 'Coverage@10': 0.6874,
 'Coverage@20': 0.7794,
 'Coverage@50': 0.843}

## RecWalk in detail

In [11]:
# Load the weight matrix previously written to disk from the trained SLIM model
# (see the commented-out `scipy.sparse.save_npz('slim_w_u20000_i5000', slim_model.W)` cell);
# it is passed to RecWalk as `slim_W` below.
slim_W = scipy.sparse.load_npz('slim_w_u20000_i5000.npz')
# Show the matrix summary (shape, dtype, number of stored elements).
slim_W

<5000x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 262528 stored elements in Compressed Sparse Column format>

In [12]:
# step 18 : 66ms

In [13]:
# step 3 : 11ms x 1000 x 2= 22s
# step 6 : 22ms x 1000 x 2= 44s
# step 9 : 33ms x 1000 x 2= 66s
# step 12 : 44ms x 1000 x 2= 88s
# step 15 : 55ms x 1000 x 2= 110s
# step 21 : 77ms x 1000 x 2= 154s

In [14]:
# Sweep RecWalk over every (phi, k_steps) combination: build and train a model,
# then measure Recall@10 and Coverage@10 on the test split and time both evaluations.
# One record per combination is appended to `recwalk_results`, so metrics and
# evaluation times can be compared after the sweep instead of being read back
# from the printed log.
recwalk_results = []

for phi in [1, 0.5]:
    for steps in [3, 6, 9, 12, 15, 18, 21]:
        recwalk_model = RecModel.RecWalk(
            num_items=num_items, num_users=num_users,
            k_steps=steps,
            eval_method='k_step',
            slim_W=slim_W
        )
        # Log phi together with steps: the same step values repeat for every phi,
        # so printing steps alone makes the passes indistinguishable in the output.
        print(f'phi {phi}, steps {steps}')
        recwalk_model.train(train_mat=train_data,
                            alpha=4.427181, l1_ratio=0.318495,
                            phi=phi,
                            max_iter=27, tolerance=0.006841,
                            cores=8, verbose=True)

        # --- Recall@10 ---
        # perf_counter() is a monotonic clock meant for measuring intervals, so the
        # elapsed time cannot be distorted by system clock adjustments.
        start = time.perf_counter()
        recwalk_model_performance = recwalk_model.eval_topn(test_mat=test_data,
                                                            topn=np.array([10]),
                                                            rand_sampled_users=1000,
                                                            rand_sampled_items=1000,
                                                            cores=8,
                                                            random_state=2)
        end = time.perf_counter()
        recall_time = end - start
        print('recall : ', recwalk_model_performance)
        print(f'eval recall time : {recall_time} s')

        # --- Coverage@10 ---
        start = time.perf_counter()
        # Cut-offs to evaluate (a list, despite the name, which is kept so that
        # other cells relying on `topn_dict` keep working).
        topn_dict = [10]

        coverage_dict = {}
        for topn in topn_dict:
            recommendation_frequencies = RecModel.test_coverage(recwalk_model, test_data,
                                                                topN=topn,
                                                                rand_sampled_users=1000,
                                                                random_state=2)
            # Coverage = fraction of entries in the returned frequency array that are > 0.
            coverage = (recommendation_frequencies > 0).mean()
            coverage_dict[f'Coverage@{topn}'] = coverage

        end = time.perf_counter()
        coverage_time = end - start
        print(f'eval coverage time : {coverage_time} s')
        print('coverage : ', coverage_dict)

        # Keep everything measured for this combination; copy the coverage dict
        # because `coverage_dict` is rebound on the next iteration.
        recwalk_results.append({
            'phi': phi,
            'steps': steps,
            'recall': recwalk_model_performance,
            'coverage': dict(coverage_dict),
            'eval_recall_time_s': recall_time,
            'eval_coverage_time_s': coverage_time,
        })

steps 3


  self._set_arrayXarray(i, j, x)


This process will sampling 1000


Process ForkPoolWorker-6:
Process ForkPoolWorker-3:
Process ForkPoolWorker-4:
Process ForkPoolWorker-5:
Traceback (most recent call last):
Traceback (most recent call last):
Process ForkPoolWorker-8:
  File "/Users/YuLong/miniconda3/envs/py_37_ds/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/Users/YuLong/miniconda3/envs/py_37_ds/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/Users/YuLong/miniconda3/envs/py_37_ds/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
Process ForkPoolWorker-1:
Traceback (most recent call last):
  File "/Users/YuLong/miniconda3/envs/py_37_ds/lib/python3.7/multiprocessing/pool.py", line 121, in worker
    result = (True, func(*args, **kwds))
Process ForkPoolWorker-7:
Process ForkPoolWorker-2:
  File "/Users/YuLong/miniconda3/envs/py_37_ds/lib/python3.7/multiprocessing/process.py", line

  File "/Users/YuLong/miniconda3/envs/py_37_ds/lib/python3.7/site-packages/scipy/sparse/compressed.py", line 476, in _mul_vector
    fn(M, N, self.indptr, self.indices, self.data, other, result)
  File "/Users/YuLong/miniconda3/envs/py_37_ds/lib/python3.7/site-packages/scipy/sparse/base.py", line 359, in dot
    return self * other
  File "/Users/YuLong/miniconda3/envs/py_37_ds/lib/python3.7/site-packages/scipy/sparse/base.py", line 359, in dot
    return self * other
  File "/Users/YuLong/miniconda3/envs/py_37_ds/lib/python3.7/site-packages/scipy/sparse/compressed.py", line 476, in _mul_vector
    fn(M, N, self.indptr, self.indices, self.data, other, result)
  File "/Users/YuLong/miniconda3/envs/py_37_ds/lib/python3.7/site-packages/scipy/sparse/base.py", line 359, in dot
    return self * other
KeyboardInterrupt
  File "/Users/YuLong/miniconda3/envs/py_37_ds/lib/python3.7/site-packages/scipy/sparse/base.py", line 467, in __mul__
    return self._mul_vector(other)
  File "/Users/YuLong

  File "/Users/YuLong/miniconda3/envs/py_37_ds/lib/python3.7/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
  File "/Users/YuLong/miniconda3/envs/py_37_ds/lib/python3.7/multiprocessing/queues.py", line 354, in get
    return _ForkingPickler.loads(res)
  File "/Users/YuLong/miniconda3/envs/py_37_ds/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes
    buf = self._recv_bytes(maxlength)
  File "/Users/YuLong/miniconda3/envs/py_37_ds/lib/python3.7/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
  File "/Users/YuLong/miniconda3/envs/py_37_ds/lib/python3.7/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
  File "/Users/YuLong/miniconda3/envs/py_37_ds/lib/python3.7/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
  File "/Users/YuLong/Desktop/Working_Area/recsys_im/RecModel/RecModel/base_model.py", li

Traceback (most recent call last):
  File "/Users/YuLong/miniconda3/envs/py_37_ds/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3417, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-14-c7df18e0fb2c>", line 22, in <module>
    random_state=2)
  File "/Users/YuLong/Desktop/Working_Area/recsys_im/RecModel/RecModel/base_model.py", line 189, in eval_topn
    (elem for elem in iter_rows_two_matrices(super_mat, test_mat))))).sum(axis=0)
  File "/Users/YuLong/miniconda3/envs/py_37_ds/lib/python3.7/multiprocessing/pool.py", line 268, in map
    return self._map_async(func, iterable, mapstar, chunksize).get()
  File "/Users/YuLong/miniconda3/envs/py_37_ds/lib/python3.7/multiprocessing/pool.py", line 651, in get
    self.wait(timeout)
  File "/Users/YuLong/miniconda3/envs/py_37_ds/lib/python3.7/multiprocessing/pool.py", line 648, in wait
    self._event.wait(timeout)
  File "/Users/YuLong/miniconda3/envs/py_37_ds/lib/python3.7/threadi

TypeError: object of type 'NoneType' has no len()

In [None]:
# collect eval time for better experiment run time prediction
# total time, this depends on the steps
# 1376 s -> 20mins