In [1]:
# 거리 기반으로 반경 1km내 음식점 필터링
from math import sin, cos, sqrt, atan2, radians
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import random
import pickle
from tqdm import tqdm
import time
import os



In [2]:
# 예제
data_dir = '/opt/ml/final-project-level3-recsys-02/data/'
food_path = os.path.join(data_dir, 'food.csv')
place = pd.read_csv(food_path)
place['placeID'] = place.apply(lambda x : x['placeName'] + x['placeAddress'], axis = 1)
place['placeID'] = place['placeID'].apply(lambda x : x.replace(" ", ""))
place['map'] = place[['longitude','latitude']].apply(lambda x: tuple(x.values),axis=1)

In [3]:
place = place[~place.placeType.str.contains('성급')].reset_index().copy()

In [4]:
R = 6373.0
def distance_from_coord(o_coord, d_coord):
    x1, y1 = radians(o_coord[0]), radians(o_coord[1])
    x2, y2 = radians(d_coord[0]), radians(d_coord[1])
    dlon = x2 - x1
    dlat = y2 - y1
    a = sin(dlat / 2)**2 + cos(y1) * cos(y2) * sin(dlon / 2)**2 
    c = 2 * atan2(sqrt(a), sqrt(1 - a))    
    return R * c

def filtermap(coor,r=1):
    name=[]
    for idx,plc in enumerate(place['map']):
        if distance_from_coord(coor,plc)<=1:
            name.append(place.loc[idx]['placeID'])
    return name

In [5]:
def create_embedding_file() :
    embed_path = os.path.join(data_dir, 'metapath_embeddings')
    with open(embed_path, 'r') as f:
        id2word_len, emb_dimension = f.readline().split()
        id2word = {}
        word2id = {}
        embeddings = []
        idx = 0
        while True :
            z = f.readline()
            if not z :
                break
            z = z.split()
            word = z[0]
            embedding = list(map(float, z[1:]))
            embeddings.append(embedding)
            id2word[idx] = word
            word2id[word] = idx
            idx += 1
    return id2word, word2id, embeddings

In [6]:
id2place, place2id, place_emb = create_embedding_file()
cossim = cosine_similarity(np.array(place_emb))

In [7]:
def get_nearest_cossim(nearest_list, k=5):
    nearest_ids = [place2id[n] for n in nearest_list]
    nearest_cossim = cossim[nearest_ids[0], nearest_ids[1:]]
    topk = np.argsort(nearest_cossim)[::-1][:k]
    return [nearest_ids[i] for i in topk]

In [8]:
nearest_list = filtermap((126.9566625, 37.5507711))
topk = get_nearest_cossim(nearest_list)
topk

[2847, 11311, 2868, 1481, 7272]

In [9]:
id2place[4810], id2place[6844], id2place[11856], id2place[8431], id2place[2033]

('놀부유황오리진흙구이신천점서울송파구올림픽로10길5파로스관광호텔',
 '뚝섬정지서울성동구성수일로39',
 '세련대게롯데마트수지점경기용인시수지구성복2로38롯데마트수지점2층',
 '반포식스잠실점서울송파구송파대로562한빛프라자지하1층',
 '프로젝트마도서울구로구디지털로32다길46')

### performance check

In [10]:
f_df = pd.read_csv(data_dir + 'feature.csv')
l_df = pd.read_csv(data_dir + 'like.csv')

In [11]:
with open(data_dir + 'place_id2idx.pkl', 'rb') as f :
    place_id2idx = pickle.load(f)

In [12]:
with open(data_dir + 'place_idx2id.pkl', 'rb') as f :
    place_idx2id = pickle.load(f)

In [13]:
def jaccard_similarity(list1, list2): 
    s1 = set(list1)
    s2 = set(list2)
    if len(s1.union(s2)):
        return float(len(s1.intersection(s2)) / len(s1.union(s2)))
    return 0

In [14]:
def get_feat_list(df, place_id, feature):
    return df[df.placeID == place_id][feature].tolist()

In [15]:
def compare_random(place_id, df, column):
    latitude, longitude = place[place.placeID == id2place[place_id]][['latitude', 'longitude']].values.tolist()[0]
    nearest_list = filtermap((longitude, latitude))
    m2v_topk = get_nearest_cossim(nearest_list, min(len(nearest_list), 10))

    nearest_ids = [place2id[n] for n in nearest_list]
    random_topk = np.array(random.sample(nearest_ids, k = min(len(nearest_list), 10)))
    
    target_feat_list = get_feat_list(df, place_id2idx[id2place[place_id]], column)
    m2v_score, random_score = 0, 0
    for pid in m2v_topk:
        rec_feat_list = get_feat_list(df, place_id2idx[id2place[pid]], column)
        m2v_score += jaccard_similarity(target_feat_list, rec_feat_list)
    
    for pid in random_topk:
        rec_feat_list = get_feat_list(df, place_id2idx[id2place[pid]], column)
        random_score += jaccard_similarity(target_feat_list, rec_feat_list)
    
    return m2v_score, random_score
        

In [16]:
compare_random(3, f_df, 'feature')

(1.0, 0.1111111111111111)

In [17]:
compare_random(1, l_df, 'like')

(2.964285714285714, 4.619047619047619)

In [18]:
def mean_score():
    mean_m2v, mean_random = 0, 0

    for i in tqdm(id2place.keys()):
        m2v_score, random_score = compare_random(i, f_df, 'feature')
        mean_m2v += m2v_score
        mean_random += random_score

    return mean_m2v, mean_random

In [19]:
mean_score()

100%|██████████| 12677/12677 [12:56<00:00, 16.33it/s]


(4016.560664991703, 3933.714106017701)

In [88]:
mean_score()

100%|██████████| 12677/12677 [13:01<00:00, 16.21it/s]


(3919.086140192669, 4032.298324687215)