In [1]:
import argparse
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
import pickle
import os
import recbole.utils
from recbole.quick_start import load_data_and_model

  from .autonotebook import tqdm as notebook_tqdm


## inference1
- 유저당 top n개의 행렬 생성

In [30]:
def inference_top_n(
    model_pth_path:str = '/opt/ml/final-project-level3-recsys-13/modeling/RecBole/saved/1/MultiDAE-Jun-07-2022_21-28-45.pth', 
    top_n:int = 100
    ) -> dict:
    
    _, model, dataset, _, _, test_data = load_data_and_model(model_pth_path)
    print('inference...')
    # device 설정
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # user, item id -> token 변환 array
    user_id2token = dataset.field2id_token['user_id']
    item_id2token = dataset.field2id_token['recipe_id']

    # user-item sparse matrix
    matrix = dataset.inter_matrix(form='csr')

    # user id, predict item id 저장 변수
    pred_list = None
    user_list = None

    model.eval()
    for data in tqdm(test_data):
        interaction = data[0].to(device)
        score = model.full_sort_predict(interaction)
        
        rating_pred = score.cpu().data.numpy().copy()
        batch_user_index = interaction['user_id'].cpu().numpy()
        rating_pred[matrix[batch_user_index].toarray()[0] > 0] = -np.inf
        ind = np.argpartition(rating_pred, -top_n)[ -top_n:]
        sorted_ind_by_pred = ind[np.argsort(rating_pred[ind])[::-1]]
        
        # 예측값 저장
        if pred_list is None:
            pred_list = sorted_ind_by_pred
            user_list = np.array([batch_user_index.item() for _ in range(top_n)])
        else:
            pred_list = np.append(pred_list, sorted_ind_by_pred)
            user_list = np.append(user_list, np.array([batch_user_index.item() for _ in range(top_n)]))
        
    result = []
    for user, pred in zip(user_list, pred_list):
        result.append((int(user_id2token[user]), int(item_id2token[pred])))
            
    # 데이터 처리 : 딕셔너리로 반환
    result_df = pd.DataFrame(result, columns=["user", "item"])
    result_df.sort_values(by='user', inplace=True)
    result_groupby = result_df.groupby('user')['item'].apply(list)
    
    result_dict = dict()
    for user, item in zip(result_groupby.index.to_list(), result_groupby.values):
       result_dict[user] = item    
    
    print('inference done!')
    
    return result_dict

In [31]:
test = inference_top_n()

All the same value in [rating] from [        user_id  recipe_id  date  rating
0             1          1     1     1.0
1             1          2     2     1.0
2             1          3     2     1.0
3             1          4     3     1.0
4             1          5     4     1.0
...         ...        ...   ...     ...
580516     4971      24948  6283     1.0
580517     8860     147302  6283     1.0
580518     9388      44071  6284     1.0
580519     6922     122587  6284     1.0
580520     9686     103347  6285     1.0

[580521 rows x 4 columns]_feat].


inference...


100%|██████████| 9723/9723 [00:56<00:00, 171.71it/s]


inference done!


In [34]:
len(test[0])

100

In [35]:
np.save('test.npy', test)

In [36]:
# np.load('test.npy', allow_pickle=True).item()

{0: [15371,
  115109,
  1266,
  6187,
  19490,
  22,
  11200,
  14952,
  24618,
  11284,
  66564,
  4876,
  7681,
  27998,
  5226,
  15234,
  43518,
  3059,
  20362,
  7028,
  122100,
  22506,
  24670,
  40337,
  2044,
  1378,
  2855,
  28784,
  10575,
  11477,
  29425,
  24997,
  46241,
  25030,
  18257,
  10690,
  7815,
  15083,
  31460,
  1003,
  42020,
  13229,
  119951,
  939,
  17570,
  9133,
  11396,
  52573,
  8511,
  10299,
  65837,
  19814,
  10620,
  88647,
  30793,
  32565,
  58447,
  31787,
  5801,
  27614,
  9730,
  22085,
  32536,
  20007,
  19228,
  9193,
  17198,
  13472,
  1820,
  18450,
  2648,
  30742,
  11298,
  25262,
  16486,
  32309,
  42879,
  5941,
  76504,
  6903,
  9775,
  5402,
  5274,
  42123,
  14590,
  2962,
  36360,
  4469,
  11467,
  16835,
  3960,
  1186,
  5263,
  11320,
  3663,
  10038,
  12169,
  34202,
  30505,
  15384],
 1: [42818,
  67854,
  48649,
  141997,
  103143,
  144456,
  40904,
  142349,
  136344,
  132558,
  144522,
  117888,
  141075,

In [26]:
# np.save('test.npy', np.asarray(test.groupby('user')['item'].apply(list).to_list()))

In [20]:
test_list = test.groupby('user')['item'].apply(list).values
test_list

array([list([15371, 11298, 30742, 2648, 18450, 1820, 13472, 17198, 9193, 19228, 16486, 20007, 22085, 42879, 9730, 27614, 5801, 31787, 58447, 32565, 30793, 88647, 10620, 32536, 32309, 76504, 3960, 30505, 34202, 12169, 10038, 3663, 11320, 5263, 1186, 15384, 11467, 4469, 36360, 2962, 16835, 14590, 42123, 5274, 5402, 9775, 6903, 5941, 19814, 25262, 10299, 7815, 22506, 122100, 7028, 20362, 3059, 43518, 15234, 5226, 27998, 7681, 4876, 66564, 11284, 24618, 14952, 11200, 22, 19490, 6187, 1266, 115109, 24670, 65837, 40337, 2044, 1378, 52573, 11396, 9133, 17570, 939, 119951, 13229, 42020, 1003, 31460, 8511, 15083, 10690, 18257, 25030, 46241, 24997, 29425, 11477, 10575, 28784, 2855]),
       list([11320, 85874, 8559, 50917, 14952, 700, 3960, 9193, 2703, 26954, 452, 53441, 42123, 48413, 61083, 42435, 20135, 15371, 19814, 5711, 1693, 2431, 12060, 11046, 9775, 7017, 8241, 5882, 2648, 11298, 3032, 12025, 4469, 22757, 30638, 141525, 122704, 99645, 2725, 8098, 119975, 136266, 81866, 137231, 117238, 142

In [19]:
np.asarray(test.groupby('user')['item'].apply(list).values)

array([list([15371, 11298, 30742, 2648, 18450, 1820, 13472, 17198, 9193, 19228, 16486, 20007, 22085, 42879, 9730, 27614, 5801, 31787, 58447, 32565, 30793, 88647, 10620, 32536, 32309, 76504, 3960, 30505, 34202, 12169, 10038, 3663, 11320, 5263, 1186, 15384, 11467, 4469, 36360, 2962, 16835, 14590, 42123, 5274, 5402, 9775, 6903, 5941, 19814, 25262, 10299, 7815, 22506, 122100, 7028, 20362, 3059, 43518, 15234, 5226, 27998, 7681, 4876, 66564, 11284, 24618, 14952, 11200, 22, 19490, 6187, 1266, 115109, 24670, 65837, 40337, 2044, 1378, 52573, 11396, 9133, 17570, 939, 119951, 13229, 42020, 1003, 31460, 8511, 15083, 10690, 18257, 25030, 46241, 24997, 29425, 11477, 10575, 28784, 2855]),
       list([11320, 85874, 8559, 50917, 14952, 700, 3960, 9193, 2703, 26954, 452, 53441, 42123, 48413, 61083, 42435, 20135, 15371, 19814, 5711, 1693, 2431, 12060, 11046, 9775, 7017, 8241, 5882, 2648, 11298, 3032, 12025, 4469, 22757, 30638, 141525, 122704, 99645, 2725, 8098, 119975, 136266, 81866, 137231, 117238, 142

In [37]:
test

In [32]:
import pickle
with open('test.pickle', 'wb') as f:
    pickle.dump(test, f)

In [10]:
d = test.groupby('user')['item'].apply(list)

In [16]:
d.index.to_list()

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [12]:
d

user
0       [15371, 115109, 1266, 6187, 19490, 22, 11200, ...
1       [42818, 67854, 48649, 141997, 103143, 144456, ...
2       [42446, 7040, 59593, 51742, 21012, 14624, 1662...
3       [7939, 9871, 2721, 6836, 16927, 13454, 8141, 2...
4       [15231, 2698, 12567, 20371, 6058, 28512, 31585...
                              ...                        
9718    [61901, 92625, 57948, 51431, 69633, 66981, 111...
9719    [4093, 2413, 27196, 34132, 30414, 18646, 10038...
9720    [54614, 35049, 17160, 37265, 27047, 50330, 303...
9721    [38053, 21954, 4779, 36578, 9871, 10574, 27047...
9722    [41762, 74927, 80117, 92444, 35049, 32015, 887...
Name: item, Length: 9723, dtype: object

## inference2
- 한 유저(타겟)의 모든 아이템 점수를 반환

In [6]:
_, model, dataset, _, _, test_data = load_data_and_model('/opt/ml/final-project-level3-recsys-13/modeling/RecBole/saved/1/MultiDAE-Jun-07-2022_21-28-45.pth')

All the same value in [rating] from [        user_id  recipe_id  date  rating
0             1          1     1     1.0
1             1          2     2     1.0
2             1          3     2     1.0
3             1          4     3     1.0
4             1          5     4     1.0
...         ...        ...   ...     ...
580516     4971      24948  6283     1.0
580517     8860     147302  6283     1.0
580518     9388      44071  6284     1.0
580519     6922     122587  6284     1.0
580520     9686     103347  6285     1.0

[580521 rows x 4 columns]_feat].


In [101]:
zip_lightgcn = (model, dataset, test_data)

In [103]:
model, dataset, test_data = zip_lightgcn

In [104]:
# device 설정
device = "cuda" if torch.cuda.is_available() else "cpu"

# user, item id -> token 변환 array
user_id2token = dataset.field2id_token['user_id']
item_id2token = dataset.field2id_token['recipe_id']

# user-item sparse matrix
matrix = dataset.inter_matrix(form='csr')

# user id, predict item id 저장 변수
pred_list = None
user_list = None

model.eval()

target = 1523


In [116]:
def make_user_preference(model, test_data, target):
    for data in test_data:
        interaction = data[0].to(device)
        batch_user_index = interaction['user_id'].cpu().numpy()
        if user_id2token[batch_user_index].item() == str(target):
            score = model.full_sort_predict(interaction)
            rating_pred = score.cpu().data.numpy().copy()
            rating_pred[matrix[batch_user_index].toarray()[0] > 0] = -np.inf
            return rating_pred[1:]

In [120]:
test = make_user_preference(model, test_data, 2352)

In [121]:
test

array([ 0.4546572 ,  0.90277934,  0.68665576, ...,  0.25117344,
        0.5840132 , -0.23490018], dtype=float32)

## inference3

In [2]:
_, model, dataset, _, _, test_data = load_data_and_model('/opt/ml/final-project-level3-recsys-13/modeling/RecBole/saved/1/MultiDAE-Jun-07-2022_21-28-45.pth')

All the same value in [rating] from [        user_id  recipe_id  date  rating
0             1          1     1     1.0
1             1          2     2     1.0
2             1          3     2     1.0
3             1          4     3     1.0
4             1          5     4     1.0
...         ...        ...   ...     ...
580516     4971      24948  6283     1.0
580517     8860     147302  6283     1.0
580518     9388      44071  6284     1.0
580519     6922     122587  6284     1.0
580520     9686     103347  6285     1.0

[580521 rows x 4 columns]_feat].


In [3]:
#_, model, dataset, _, _, test_data = load_data_and_model(model_pth_path)
print('inference...')
# device 설정
device = "cuda" if torch.cuda.is_available() else "cpu"

# user, item id -> token 변환 array
user_id2token = dataset.field2id_token['user_id']
item_id2token = dataset.field2id_token['recipe_id']

# user-item sparse matrix
matrix = dataset.inter_matrix(form='csr')

model.eval()
preference_matrix = list()
for data in tqdm(test_data):
    interaction = data[0].to(device)
    score = model.full_sort_predict(interaction)
    
    rating_pred = score.cpu().data.numpy().copy()
    batch_user_index = interaction['user_id'].cpu().numpy()
    rating_pred[matrix[batch_user_index].toarray()[0] > 0] = -np.inf
    
    preference_matrix.append(list(rating_pred[1:]))


    

inference...


100%|██████████| 9723/9723 [02:14<00:00, 72.13it/s]


In [7]:
preference_matrix = np.array(preference_matrix)

In [58]:
type(preference_matrix[0])

numpy.float32

In [33]:
a = np.array([]).reshape(1, -1)

In [34]:
a.shape

(1, 0)

In [37]:
np.concatenate([rating_pred.reshape(1, -1)]).shape

(1, 147303)

In [32]:
np.concatenate([rating_pred.reshape(1, -1), rating_pred.reshape(1, -1)], axis=0).shape

(2, 147303)

In [4]:
inter = pd.read_csv('/opt/ml/final-project-level3-recsys-13/modeling/RecBole/dataset/foodcom/foodcom.inter')

In [5]:
inter['user_id:token'].nunique()

9723

In [6]:
inter['recipe_id:token'].nunique()

147302