In [1]:
import os

import pandas as pd
import numpy as np
import scipy

import random

import implicit

SEED = 42

random.seed(SEED)
np.random.seed(SEED)

In [2]:
# total
data_dir = '../data/interactions_train.csv'
train = pd.read_csv(data_dir)

In [3]:
train

Unnamed: 0,user_id,recipe_id,date,rating,u,i
0,2046,4684,2000-02-25,5.0,22095,44367
1,2046,517,2000-02-25,5.0,22095,87844
2,1773,7435,2000-03-13,5.0,24732,138181
3,1773,278,2000-03-13,4.0,24732,93054
4,2046,3431,2000-04-07,5.0,22095,101723
...,...,...,...,...,...,...
698896,926904,457971,2018-12-18,5.0,13681,141067
698897,2002312797,27208,2018-12-18,5.0,14897,99787
698898,1290903,131607,2018-12-18,5.0,11605,76163
698899,226867,363072,2018-12-18,5.0,3604,29101


In [4]:
#init
train['view'] = 1
train_csr = scipy.sparse.csr_matrix((train['view'], (train['u'], train['i'])), shape=(train['u'].max()+1, train['i'].max()+1))

In [24]:
# model
als_model = implicit.als.AlternatingLeastSquares(factors=300, regularization=1, iterations=10, random_state=SEED)
als_model.fit(train_csr)

  0%|          | 0/10 [00:00<?, ?it/s]

In [8]:
als_model.user_factors.shape

(25076, 300)

In [9]:
als_model.item_factors.shape

(178263, 300)

In [8]:
sample_user_idx = [i for i in range(0, 101, 10)]
sample_user_idx

[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

In [9]:
sample_users = (als_model.user_factors[sample_user_idx] @ als_model.item_factors.T)

In [10]:
train_csr[sample_user_idx,:].nonzero()

(array([ 0,  0,  0, ..., 10, 10, 10], dtype=int32),
 array([     0,   1118,  11190, ..., 177757, 178044, 178109], dtype=int32))

In [11]:
sample_users.shape

(11, 178263)

In [12]:
sample_users[train_csr[sample_user_idx, :].nonzero()] = float('-inf')

In [13]:
sample_users

array([[          -inf,  6.2727980e-07,  8.8870300e-05, ...,
         0.0000000e+00, -2.4677329e-05,  5.2282005e-05],
       [ 8.9160058e-05,  2.0433852e-05,  3.1671585e-03, ...,
         0.0000000e+00,  1.0194835e-03,  2.6934324e-03],
       [ 5.1091225e-05,  1.2208320e-05,  1.7772850e-03, ...,
         0.0000000e+00, -1.7052602e-04,  1.1868512e-03],
       ...,
       [ 4.3480828e-05,  1.0187718e-05,  1.4884705e-03, ...,
         0.0000000e+00, -5.5623299e-04,  6.4997608e-04],
       [ 5.4490151e-06,  1.3035899e-06,  1.8489073e-04, ...,
         0.0000000e+00, -9.9302895e-05,  6.9479654e-05],
       [ 1.6443431e-04,  3.8376678e-05,  5.8188504e-03, ...,
         0.0000000e+00,  1.5193289e-03,  5.0278823e-03]], dtype=float32)

In [14]:
LABEL_CNT = 10

In [15]:
result = [ m.argpartition(-LABEL_CNT)[-LABEL_CNT:] for m in sample_users ]

In [16]:
result

[array([147180,  52334, 101819, 127080,  37047, 117899, 147374, 135961,
        134610,  99787]),
 array([ 55772,  19297,  15173, 125637, 101819,  52334, 127080, 147374,
        134610, 135961]),
 array([125637, 139822,  52334,  37047, 127080, 101819, 134610, 135961,
        117899, 147374]),
 array([ 52334, 139822,  37047, 135961, 134610, 117899, 101819, 147374,
         99787, 127080]),
 array([ 52334, 139822, 117899, 147374, 134610, 135961,  99787,  37047,
        101819, 127080]),
 array([147180,  52334, 134610,  37047, 127080, 139822,  99787,  19297,
        147374, 101819]),
 array([125637,  52334, 135961, 101819, 117899,  37047, 147180, 127080,
         99787, 147374]),
 array([ 52334, 139822,  37047, 135961, 127080, 117899, 147374,  99787,
        101819, 134610]),
 array([125637,  52334, 127080,  37047, 147180, 135961, 101819,  99787,
        134610, 117899]),
 array([125637,  52334, 147180,  37047, 127080, 101819,  99787, 134610,
        147374, 117899]),
 array([147180,  523

In [30]:
test = pd.read_csv('../data/interactions_test.csv')

In [47]:
train_csr.shape

(25076, 178263)

In [56]:
als_model.user_factors.shape[0], als_model.item_factors.shape[0]

(25076, 178263)

In [11]:
valid = pd.read_csv('../data/interactions_validation.csv')
valid['view'] = 1
valid_csr = scipy.sparse.csr_matrix((valid['view'], (valid['u'], valid['i'])), shape=(valid['u'].max()+1, valid['i'].max()+1))

In [40]:
# validation이나 test에서 없는 사용자들은 제외하고 시작한다.
train['u'].max(), train['i'].max()

(25075, 178262)

In [43]:
valid['u'].max(), valid['i'].max()

(25055, 178263)

In [39]:
test['u'].max(), test['i'].max()

(25074, 178264)

In [12]:
# 없는거 제외하는 코드
valid_cut = valid_csr[:als_model.user_factors.shape[0], :als_model.item_factors.shape[0]]

In [70]:
valid_cut.nonzero()[0]

array([    5,    23,    31, ..., 25047, 25053, 25055], dtype=int32)

In [26]:
# 유저 벡터는 대충 학습이 된 듯
als_model.user_factors[valid_cut.nonzero()[0]]#.shape

array([[ 1.7637564e-02,  2.2146624e-02, -1.7121631e-01, ...,
        -1.9433333e-02,  6.9589734e-02,  7.3266730e-02],
       [-2.7666526e-02,  1.3935352e-02,  1.2957205e-02, ...,
        -9.4583901e-03,  8.7149432e-03, -5.1858877e-03],
       [ 4.7143553e-03,  3.1037172e-04,  3.3789487e-03, ...,
        -2.3256775e-03,  1.4715833e-03,  1.6747665e-02],
       ...,
       [-5.0740968e-04,  1.2417804e-03,  7.9542055e-04, ...,
         1.0783877e-03,  1.2369329e-05, -7.1399503e-05],
       [-3.5564558e-04, -4.6116719e-04,  1.8890153e-04, ...,
        -3.5129029e-05,  3.8319596e-04, -5.7090266e-04],
       [-3.2710242e-03,  2.3519492e-03, -4.9071452e-05, ...,
         2.3722672e-03,  5.6818727e-04, -3.3317972e-03]], dtype=float32)

In [61]:
# 아이템 벡터가 다 0... 왜 학습이 안된거지...??
als_model.item_factors[valid_cut.nonzero()[1]]#.shape

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [63]:
#유저 벡터는 일단 전부 다름(학습이 되었음)
np.unique(als_model.user_factors[valid_cut.nonzero()[0]].sum(axis=1)).shape

(7022,)

In [55]:
# 진짜로 다 0이다
np.unique(als_model.item_factors[valid_cut.nonzero()[1]].sum(axis=1))

array([0.], dtype=float32)

In [83]:
valid_cut.data.size

7022

In [78]:
valid_cut.nonzero()[0].shape[0]

7022

In [74]:
# 선호도를 계산해 보려 했으나 아이템 벡터가 다 0이라서 안됨.. 원인을 확인해보자
valid_prefrence = (als_model.user_factors[valid_cut.nonzero()[0]] * als_model.item_factors[valid_cut.nonzero()[1]]).sum(axis=1)
valid_prefrence

array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)

In [71]:
valid_cut[valid_cut.nonzero()]

matrix([[1, 1, 1, ..., 1, 1, 1]])

In [82]:
valid_cut.data-valid_prefrence

array([1., 1., 1., ..., 1., 1., 1.])

In [64]:
# valid interaction들이 아이템당 하나씩 밖에 없는 것들이었다...
len(set(valid['i'])), len(set(valid['i'])-set(train['i']))

(6621, 6621)

In [23]:
# max에 비하면 상당히 부족한 숫자
train['u'].nunique(), train['i'].nunique()

(25076, 160901)

In [36]:
train.shape

(698901, 7)

In [79]:
valid.shape

(7023, 7)

In [35]:
test.shape

(12455, 6)