In [1]:
import pandas as pd
import pickle
import math
import numpy as np
from tqdm import tqdm

In [18]:
def ndcg(one_gt, one_rec):
    dcg = 0.0
    idcg = sum((1.0/np.log(i+1) for i in range(1, len(one_gt)+1)))

    for i, r in enumerate(one_rec):
        if r in one_gt:
            dcg += 1.0/np.log(i+2)

    return dcg/idcg

ndcg@10
1) 20개 이상의 interaction 사용자 -> (k개 리뷰 - 10)=학습, 10개=예측
    -> 10개
2) 모든 사용자에 대해서 history를 0.7, 0.3 비율 -> 50, 100개 예측 -> ndcg측정

In [3]:
ndcg([1, 2, 3, 4, 5], [1, 2, 100, 9, 6, 8, 5, 4, 3])

0.8752903992561911

1. nDCG@10이면 유저가 10개 이상의 리뷰 데이터를 가지고 있는 유저에게만 테스트 하는 것이 맞는가?  
ex) [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] <-> [3, 2, 6, 9, 22, 31, 1, 2, 3, 4]

2. 10개 미만의 유저라도 상관없이 테스트 할 수 있는 것인가?
ex) [1, 2, 3, 4] <-> [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

### Load Review Data

In [4]:
path = '/opt/ml/final-project-level3-recsys-02/data/'

In [5]:
with open(path+"/food.pickle", "rb")  as f:
    place = pickle.load(f)

In [6]:
place['placeID'] = place.apply(lambda x : x['placeName'] + x['placeAddress'], axis = 1)
place['placeID'] = place['placeID'].apply(lambda x : x.replace(" ", ""))
place['map'] = place[['longitude','latitude']].apply(lambda x: tuple(x.values),axis=1)
place = place[~place.placeType.str.contains('성급')].reset_index().copy()

In [7]:
df = pd.read_csv(path + 'review.csv')
df.head()

Unnamed: 0,userHash,placeID,timestamp
0,5b7c26a7300f598cc0a19949,청계다방서울서초구원터4길61층청계다방,1651277000.0
1,5e7219bb8f87a842bc608e26,밀밭정원서울마포구마포대로16길13,1652314000.0
2,5e46a4e28f87a842bc369874,청계다방서울서초구원터4길61층청계다방,1650067000.0
3,5c27ac4f31fda71be490e8b7,밀밭정원서울마포구마포대로16길13,1651709000.0
4,5e5dd56f8f87a842bcae3647,청계다방서울서초구원터4길61층청계다방,1649549000.0


In [8]:
df = df[df.placeID.isin(place.placeID.unique())].reset_index(drop=True)

In [9]:
filter_df = df[['userHash', 'placeID', 'timestamp']]
filter_df.drop_duplicates(inplace=True)
filter_df = filter_df.groupby('userHash').filter(lambda x : len(x)>2)
filter_df.reset_index(drop=True, inplace=True)
filter_df.head()

Unnamed: 0,userHash,placeID,timestamp
0,61b106a74b246250298287f6,밀밭정원서울마포구마포대로16길13,1651104000.0
1,62221e887bac195eed62a637,밀밭정원서울마포구마포대로16길13,1651104000.0
2,5eeb1daf8f87a842bc1e61cb,밀밭정원서울마포구마포대로16길13,1651104000.0
3,5b9b44c488c814e31d90d6f6,밀밭정원서울마포구마포대로16길13,1650499000.0
4,5dc699538f87a842bcc650e4,밀밭정원서울마포구마포대로16길13,1642896000.0


In [10]:
userList = filter_df['userHash'].unique()
userList

array(['61b106a74b246250298287f6', '62221e887bac195eed62a637',
       '5eeb1daf8f87a842bc1e61cb', ..., '5ce8d2a1fe231b25730cc29b',
       '60e1444b3d695cbcd6ab5bf1', '5dc4b18e8f87a842bc1ead18'],
      dtype=object)

In [11]:
def get_user_info(user):
    history = train[train.userHash == user]['placeID'].values.tolist()
    gt = test[test.userHash==user]['placeID'].values.tolist()
    return history, gt

In [12]:
import sys
sys.path.append('../')

from models.model import CossimRecommender
from sklearn.metrics.pairwise import cosine_similarity



In [13]:
model = CossimRecommender('/opt/ml/final-project-level3-recsys-02/data/')

In [14]:
def get_user_embedding(review, user):
    visited_places = review[review['userHash'] == user].placeID

    user_emb = np.zeros(128)
    id_list = []
    for idx, vp in enumerate(visited_places[-11:]):
        id_list.append(model.place2id[vp])
        user_emb += np.array(model.place_emb[id_list[-1]])
    user_emb = user_emb/len(id_list)
    return user_emb, id_list
    
def recommend(user_emb, id_list):
    result = []
    user_embedding = user_emb.reshape(1,-1)
    cossim = cosine_similarity(user_embedding, model.place_emb)
    cossim = cossim.squeeze()
    cossim[id_list] = -1

    for idx in np.argsort(cossim)[::-1][:10]:
        result.append(model.id2place[idx])
    return result

### 전지역 추천 성능

In [47]:
from sklearn.model_selection import train_test_split

In [48]:
train, test = train_test_split(filter_df, test_size=0.3, shuffle=True, stratify=filter_df['userHash'], random_state=34)

In [49]:
score = 0
for idx, user in enumerate(tqdm(userList[:1000])):
    user_emb, id_list = get_user_embedding(train, user)
    result = recommend(user_emb, id_list)
    answer = test[test.userHash==user]['placeID'].values
    score += ndcg(answer, result)

100%|██████████| 1000/1000 [02:07<00:00,  7.83it/s]


In [50]:
score / 1000

0.0001089364926177559

### 위치 기반 추천 성능

In [15]:
test = filter_df.groupby('userHash', as_index=False).nth(-1)
filter_df['isTest'] = False
filter_df.loc[test.index, 'isTest'] = True
train = filter_df[filter_df.isTest==False]
train = train.drop(['isTest'], axis=1)

In [16]:
test = pd.merge(test, place[['placeID', 'map']], how='left', on='placeID')

In [20]:
score = 0
for idx, values in tqdm(test[:1000].iterrows()):
    coor = values['map']
    user_emb, history_list = get_user_embedding(train, values['userHash'])
    result = model.user_recommend(coor, user_emb, history_list, 100)
    score += ndcg(values['placeID'], result)

1000it [03:09,  5.29it/s]


In [22]:
score / 1000

0.000628601557239674