In [2]:
import numpy as np
import pandas as pd
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from tqdm import tqdm

import torch

In [3]:
def similarity_cosine_by_index(indicies):
    return cosine_similarity(X=synop_matrix[indicies], Y=synop_matrix)

In [4]:
book = pd.read_csv('../data/ver2/Book.tsv', sep='\t')
ratings = pd.read_csv('../data/ver2/Rating.tsv', sep='\t', low_memory=False)

gr = ratings.groupby('user').agg({'item' : 'count'})
val_user = list(gr[gr['item'] >= 3].index)
ratings = ratings[ratings['user'].isin(val_user)]

tfidf = TfidfVectorizer(analyzer='word', stop_words='english')
synop_matrix = tfidf.fit_transform(book['synopsis'].values.astype('U'))
synop_matrix = synop_matrix.astype('float32')

chunk_size = 10000
matrix_len = synop_matrix.shape[0]

user_encoder = LabelEncoder()
item_encoder = LabelEncoder()


ratings['item'] = item_encoder.fit_transform(ratings['item'])
ratings['user'] = user_encoder.fit_transform(ratings['user'])

user_cnt = ratings.groupby('user').agg({'item' : 'count'})
maximum_user = list(user_cnt[user_cnt['item'] < 10].index)

ratnigs = ratings[ratings['user'].isin(maximum_user)]

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
train_ratings, valid_ratings = train_test_split(ratings, stratify=ratings['user'], test_size=0.3)

In [7]:
train_ratings['user'].nunique()

1075591

In [8]:
valid_ratings['user'].nunique()

1075591

In [9]:
train_ratings['item']

9779842     29904
8728797     26502
10860904    34611
8026086     24508
13094142    37183
            ...  
13416353    28969
6706563     21038
8373672     25243
6885562     21315
8140120     24637
Name: item, Length: 5880137, dtype: int64

In [10]:
rows, cols, data = train_ratings['user'], train_ratings['item'], train_ratings['rating']
n_users = ratings['user'].nunique()
n_items = ratings['item'].nunique()

rating_matrix = sparse.csr_matrix((data, (rows, cols)),
                            dtype='float32',
                            shape=(n_users, n_items))

In [11]:
cosine_similarities = cosine_similarity(synop_matrix)

In [12]:
for st_idx in tqdm(range(0, n_users, 10000)):

    next_idx = st_idx + 10000
    end_idx = min(next_idx, n_users)
    input_data = rating_matrix[st_idx:end_idx]
    users = range(st_idx, end_idx)
    
    result = torch.tensor(input_data @ cosine_similarities)
    result[torch.nonzero(torch.FloatTensor(input_data.todense()), as_tuple=True)] = -np.inf
    
    scores, items = torch.topk(result, k=10, dim=1)
    

    users = np.tile(users, (10,1)).T
    user_list = np.concatenate([user for user in users])
    score_list = torch.cat([score for score in scores])
    item_list = torch.cat([item for item in items])

    temp_df = pd.DataFrame()
    temp_df['user'] = user_list
    temp_df['item'] = item_list.cpu().numpy()
    temp_df['score'] = score_list.cpu().numpy()

    # temp_df['user'] = user_encoder.inverse_transform(temp_df['user'])
    # temp_df['item'] = item_encoder.inverse_transform(temp_df['item'])

    if st_idx == 0 : inference_df = temp_df
    else : inference_df = pd.concat([inference_df, temp_df])

inference_df = inference_df.sort_values(['user', 'score'], ascending=[True, False])

100%|██████████| 108/108 [08:25<00:00,  4.68s/it]


In [13]:
inference_df

Unnamed: 0,user,item,score
0,0,127,1.491501
1,0,173,1.269477
2,0,28645,1.152108
3,0,24905,1.147473
4,0,9105,1.103232
...,...,...,...
55905,1075590,19801,2.006795
55906,1075590,29232,1.961740
55907,1075590,18874,1.950201
55908,1075590,9192,1.919881


In [14]:
valid_ratings

Unnamed: 0,item,user,rating
7318736,22862,968642,5.0
6980524,21736,213883,4.0
10615061,33370,302990,4.0
13614496,14762,110343,5.0
13389858,43911,1060791,5.0
...,...,...,...
1005184,2628,34043,5.0
4603208,13006,441292,3.0
5654807,17087,820852,4.0
7246656,22607,374462,4.0


In [15]:
from sklearn.metrics import ndcg_score, recall_score

In [16]:
inference_df[inference_df['user'] == 1]

Unnamed: 0,user,item,score
10,1,36019,2.312494
11,1,24017,2.055125
12,1,23907,1.691167
13,1,10259,1.58695
14,1,6187,1.488469
15,1,22747,1.406347
16,1,26843,1.402577
17,1,22688,1.390862
18,1,28413,1.36106
19,1,22464,1.327294


In [17]:
user_list = list(ratings['user'])

In [18]:
valid_ratings

Unnamed: 0,item,user,rating
7318736,22862,968642,5.0
6980524,21736,213883,4.0
10615061,33370,302990,4.0
13614496,14762,110343,5.0
13389858,43911,1060791,5.0
...,...,...,...
1005184,2628,34043,5.0
4603208,13006,441292,3.0
5654807,17087,820852,4.0
7246656,22607,374462,4.0


In [24]:
score = 0
cnt = 0
for i in tqdm(range(len(user_list))):
    try :
        user = user_list[i]
        y_true = list(valid_ratings[valid_ratings['user'] == user]['item'])
        y_pred = list(inference_df[inference_df['user'] == user]['item'])[:len(y_true)]
        score += recall_score(y_true, y_pred, average='micro')
        cnt += 1
    except:
        continue

    if i == 1000000 :
        break

 12%|█▏        | 1000000/8400197 [5:25:35<40:09:24, 51.19it/s]


In [23]:
score / cnt

0.00015268650361451922

In [21]:
y_true

[24777]

In [25]:
y_pred

[26638, 14756]