In [1]:
import os, logging
from tqdm import tqdm
import csv
import datetime
from dscollaborative.recommender import ImplicitModel
from numpy import dot
from numpy.linalg import norm
import numpy as np
import sklearn.metrics.pairwise as pairwise
from scipy.sparse import lil_matrix
import sklearn.preprocessing as pp
import pickle

## make lil of (user, item) reco matrix

In [3]:
model = ImplicitModel()
model.load_model("/Users/s-chuenkai/PycharmProjects/check/als_model.latest")

2022-07-14 17:51:20,226 - dscollaborative.recommender - INFO - <dscollaborative.recommender.ImplicitModel object at 0x7faf59051160>


In [4]:
item2id = model.model.user_item_matrix.item2id
user2id = model.model.user_item_matrix.user2id
del model

In [6]:
# reco_matrix = np.zeros((len(user2id), len(item2id)))
# reco_matrix.shape
# reco_matrix.size*reco_matrix.itemsize = 690315MB
# too big using np array

(2545410, 33900)

In [23]:
lil = lil_matrix((len(user2id), len(item2id)), dtype=int)
lil

<2545410x33900 sparse matrix of type '<class 'numpy.int64'>'
	with 0 stored elements in List of Lists format>

In [28]:
with open("/Users/s-chuenkai/PycharmProjects/check/autoalt_ippan_features_2022-07-01.csv", "r") as csv_path:
    reader = csv.DictReader(csv_path)
    for line in tqdm(reader):
        if line["user_multi_account_id"] not in ["non-logged-in-coldstart", "coldstart"]:
            sids = line["sakuhin_codes"].split("|")
            user_idx = user2id.get(line['user_multi_account_id'], None)
            sid_indices = [item2id[x] for x in sids if x in item2id]
            if not user_idx or not sid_indices:
                continue
            lil[user_idx, sid_indices] = 1

56866249it [1:37:22, 9732.56it/s] 


In [29]:
lil

<2545410x33900 sparse matrix of type '<class 'numpy.int64'>'
	with 580246043 stored elements in List of Lists format>

## save & load lil matrix

In [34]:
with open("lil.pk", "wb") as fp:   #Pickling
    pickle.dump(lil, fp)

In [3]:
with open("lil.pk", "rb") as fp:   # Unpickling
    lil = pickle.load(fp)
lil

<2545410x33900 sparse matrix of type '<class 'numpy.int64'>'
	with 580246043 stored elements in List of Lists format>

## calculation

In [7]:
# to csc for fast columns slicing
csc = lil.tocsc()

In [23]:
%time csc[:,68]

CPU times: user 665 µs, sys: 1.72 ms, total: 2.38 ms
Wall time: 3.87 ms


<2545410x1 sparse matrix of type '<class 'numpy.int64'>'
	with 0 stored elements in Compressed Sparse Column format>

In [4]:
def sparse_mat_cos_sim(mat):
    # mat should be csc
    col_normed_mat = pp.normalize(mat, axis=0)
    s = (col_normed_mat.T * col_normed_mat).toarray()
    div = (2*(s.shape[0]*s.shape[0]-s.shape[0])/2)
    if div <= 0.00001:
        return 0
    else:
        return (np.sum(s) - s.shape[0])/div

In [11]:
cos_sim_sum = 0.0
cos_sim_cnt = 0
for row in tqdm(lil.rows):
    if row:
        sim = sparse_mat_cos_sim(csc[:, row])
        cos_sim_sum += sim
        cos_sim_cnt += 1
        print(sim)
        if cos_sim_cnt >= 10:
            break

print(f"{cos_sim_sum} - {cos_sim_cnt}")

  0%|                                                                                                      | 2/2545410 [00:13<4673:43:34,  6.61s/it]

0.3533333908770216


  0%|                                                                                                      | 3/2545410 [00:28<7370:50:50, 10.42s/it]

0.20508226694815807


  0%|                                                                                                     | 4/2545410 [00:52<10926:16:58, 15.45s/it]

0.2451472152678092


  0%|                                                                                                     | 5/2545410 [01:29<16021:45:41, 22.66s/it]

0.23544342721086117


  0%|                                                                                                     | 6/2545410 [01:58<17553:29:35, 24.83s/it]

0.23279109176494917


  0%|                                                                                                     | 7/2545410 [02:15<15928:15:43, 22.53s/it]

0.30547864736423025


  0%|                                                                                                     | 8/2545410 [02:40<16439:34:48, 23.25s/it]

0.2129561789556611


  0%|                                                                                                     | 9/2545410 [02:58<15279:51:35, 21.61s/it]

0.21845678597845847


  0%|                                                                                                     | 11/2545410 [03:07<9624:21:06, 13.61s/it]

0.14921607678170812


  0%|                                                                                                    | 11/2545410 [03:27<13324:41:59, 18.85s/it]

0.24632495433003798
2.4042300354788955 - 10





In [13]:
# ~15sec/per users -> 9583 hr to run for 2.3 million users...  