In [81]:
import pandas as pd
import numpy as np
import dgl
import torch
from tqdm import tqdm, trange
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import init
import torch.optim as optim
from konlpy.tag import *
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
import random
import os

In [2]:
data_dir = '/opt/ml/final/data'
food_path = os.path.join(data_dir, 'food.csv')
USE_COLS = ['placeName', 'placeType', 'placeAddress', 'themeKeywords','like']
raw_df = pd.read_csv(food_path, usecols=USE_COLS)
raw_df = raw_df[~raw_df.placeType.str.contains('성급')].reset_index().copy()
# raw_df = pd.read_json(data_path + 'placeInfo.json')

In [3]:
raw_df['placeID'] = raw_df.apply(lambda x : x['placeName'] + x['placeAddress'], axis = 1)
raw_df['placeID'] = raw_df['placeID'].apply(lambda x : x.replace(" ", ""))

#### Place Type

In [4]:
p_df = raw_df[['placeID', 'placeType']]
p_df.columns = ['placeID', 'feature']
p_df.head()

Unnamed: 0,placeID,feature
0,밀밭정원서울마포구마포대로16길13,"칼국수,만두"
1,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),바(BAR)
2,동북양꼬치서울영등포구디지털로37길26-1,양꼬치
3,농부쌈밥서울동작구사당로30길19,쌈밥
4,홍당무김밥서울영등포구문래로180영등포센트럴푸르지오시티,김밥


In [5]:
def remap_id(id_lst) :
    id_lst.sort()
    id_to_idx, idx_to_id = dict(), dict()
    for index, value in enumerate(id_lst) :
        id_to_idx[value] = index
        idx_to_id[index] = value
    return id_to_idx, idx_to_id 

#### place theme keywords

In [6]:
# hannanum = Hannanum()
# komoran = Komoran()
okt = Okt()

In [7]:
def prep_nouns(word:str):
    # noun = hannanum.nouns(word)
    # noun = komoran.nouns(word)
    noun = okt.nouns(word)
    if noun:
        return noun[0]
    else:
        return ""

In [8]:
k_df = raw_df[['placeID', 'themeKeywords']]
k_df['themeKeywords'] = k_df.themeKeywords.apply(eval)
k_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  k_df['themeKeywords'] = k_df.themeKeywords.apply(eval)


Unnamed: 0,placeID,themeKeywords
0,밀밭정원서울마포구마포대로16길13,[]
1,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),"[술집, 세계맥주, 맥주집, 호프집, 생맥주]"
2,동북양꼬치서울영등포구디지털로37길26-1,[]
3,농부쌈밥서울동작구사당로30길19,"[인심좋은, 친절한, 친절하신, 친절하고, 쌈밥, 제육볶음, 오리로스, 부대찌개, ..."
4,홍당무김밥서울영등포구문래로180영등포센트럴푸르지오시티,[]


In [9]:
theme_place = k_df[k_df.themeKeywords.str.len()!=0]['themeKeywords']
theme_place

1                                [술집, 세계맥주, 맥주집, 호프집, 생맥주]
3        [인심좋은, 친절한, 친절하신, 친절하고, 쌈밥, 제육볶음, 오리로스, 부대찌개, ...
6                                              [닭갈비, 닭갈비집]
7                  [심플한, 돼지곱창, 시장, 소곱창, 곱창, 막창, 신선한, 숨어있는]
8        [친절함, 친절하고, 화려한, 친절한, 시장, 소곱창, 양대창, 막창, 곱창, 나들...
                               ...                        
12646     [고급진, 깨끗한, 고급스러운, 안락한, 초밥, 젓갈, 튀김, 횟집, 참치회, 신선한]
12651    [아늑한, 분위기좋은, 토속적인분위기, 김치찌개, 굴보쌈, 한정식, 곱창, 비빔밥,...
12662                                [만두, 아이스크림, 설렁탕, 불고기]
12664                         [닭갈비, 닭갈비집, 주먹밥, 막국수, 새로오픈한]
12665    [고급진, 이국적, 고급스러운, 카레, 팟타이, 태국음식, 쌀국수, 누들, 나들이,...
Name: themeKeywords, Length: 3499, dtype: object

In [10]:
k_df['prepThemeKeywords'] = k_df['themeKeywords'].apply(lambda x : list(map(lambda x : prep_nouns(x), x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  k_df['prepThemeKeywords'] = k_df['themeKeywords'].apply(lambda x : list(map(lambda x : prep_nouns(x), x)))


In [11]:
k_df

Unnamed: 0,placeID,themeKeywords,prepThemeKeywords
0,밀밭정원서울마포구마포대로16길13,[],[]
1,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),"[술집, 세계맥주, 맥주집, 호프집, 생맥주]","[술집, 세계, 맥주, 호프, 생맥주]"
2,동북양꼬치서울영등포구디지털로37길26-1,[],[]
3,농부쌈밥서울동작구사당로30길19,"[인심좋은, 친절한, 친절하신, 친절하고, 쌈밥, 제육볶음, 오리로스, 부대찌개, ...","[인심, , , , 쌈밥, 제육, 오리, 부대찌개, 맛집, 부담, ]"
4,홍당무김밥서울영등포구문래로180영등포센트럴푸르지오시티,[],[]
...,...,...,...
12672,은빛바다광동수산행당동본점서울성동구행당로127-1,[],[]
12673,갓파스시분당미금역점경기성남시분당구돌마로67,[],[]
12674,갓잇송리단길점서울송파구백제고분로45길4-14,[],[]
12675,마니주호프수지점경기용인시수지구수지로342번길17에덴프라자,[],[]


In [12]:
keyword_list = list(set(k_df[k_df.prepThemeKeywords.str.len()!=0]['prepThemeKeywords'].sum()))
keyword_list

['',
 '김피탕',
 '날',
 '콘서트',
 '초밥',
 '아바이순대',
 '미숫가루',
 '눈꽃',
 '메뉴',
 '제철',
 '초콜릿',
 '야식',
 '아메리칸스타일',
 '바게트',
 '해물파전',
 '떡볶이',
 '한치',
 '모던',
 '뚝방길',
 '닭볶음탕',
 '넉',
 '향토음식',
 '죽집',
 '김치찌개',
 '참치회',
 '오리',
 '디저트',
 '일식',
 '찹쌀떡',
 '냉면집',
 '당근',
 '크림',
 '똥',
 '라자냐',
 '도다리',
 '손',
 '앤',
 '선지국',
 '등',
 '누룽지',
 '스페인',
 '전통',
 '국수',
 '느낌',
 '킹크랩',
 '자연',
 '신선로',
 '카레',
 '밤',
 '로',
 '간짜장',
 '분위기',
 '치즈',
 '찹쌀',
 '팥죽',
 '쌈밥',
 '곱창전골',
 '알탕',
 '세계',
 '닭',
 '도루묵',
 '코스',
 '다슬기',
 '능이',
 '조각',
 '백숙',
 '건강',
 '보양식',
 '숙성',
 '웅장',
 '맛집',
 '바지락',
 '닭갈비',
 '닭꼬치',
 '연탄구이',
 '어복',
 '시래기',
 '방',
 '창',
 '게임방',
 '피맥',
 '굴국밥',
 '탕수육',
 '버섯',
 '생',
 '철길',
 '재첩',
 '초',
 '몽환',
 '광어',
 '완구',
 '훠궈',
 '모주',
 '게국지',
 '말',
 '밀크쉐이크',
 '산나물',
 '연탄',
 '뽈찜',
 '굴',
 '그리스',
 '집',
 '쫄면',
 '한우국밥',
 '정육',
 '초코',
 '퀘사디아',
 '복숭아',
 '블루베리',
 '아사',
 '초코파이',
 '명란',
 '회덮밥',
 '얼',
 '데이',
 '점심',
 '연어',
 '칵테일',
 '물',
 '고량주',
 '목살',
 '짱뚱어',
 '꼬',
 '닭강정',
 '오징어',
 '아울렛',
 '프라이',
 '참게',
 '멸치',
 '방영',
 '고기국수',
 '대게',
 '주물럭',
 '기념일',
 '커피'

In [13]:
place_id2idx, place_idx2id = remap_id(p_df['placeID'].unique())
feature_id2idx, feature_idx2id = remap_id(list(set(list(p_df['feature']) + keyword_list)))

p_df['placeID'] = p_df['placeID'].apply(lambda x: place_id2idx[f"{x}"])
p_df['feature'] = p_df['feature'].apply(lambda x: feature_id2idx[f"{x}"])

keyword_id2idx, keyword_idx2id = remap_id(list(set(keyword_list)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  p_df['placeID'] = p_df['placeID'].apply(lambda x: place_id2idx[f"{x}"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  p_df['feature'] = p_df['feature'].apply(lambda x: feature_id2idx[f"{x}"])


In [14]:
k_df = pd.DataFrame([
    [place_id2idx[id], keyword_id2idx[keyword]] for id, keywords in k_df[['placeID', 'prepThemeKeywords']].itertuples(index=False)
    for keyword in keywords
], columns=['placeID', 'feature'])

In [15]:
k_df = k_df.groupby('feature').filter(lambda x : len(x)>1)

In [16]:
f_df = pd.concat([p_df, k_df])

## Like

In [17]:
l_df = pd.DataFrame.from_records(raw_df['like'].apply(eval))
frequency = np.sum(~l_df.isna(), axis=0)
like_cols = sorted(frequency[np.where(frequency > 1)[0]].index.values)
like_to_idx, idx_to_like = remap_id(like_cols)

In [18]:
l_df['placeID'] = raw_df['placeID'].apply(lambda x:place_id2idx[x])
l_df.set_index('placeID', inplace=True)
l_df.sort_index(inplace=True)
l_df = l_df[like_cols].fillna(0)

In [19]:
total_record = []
for place_id in tqdm(l_df.index.values):
    topk = np.argsort(l_df.fillna(0).values[place_id])[::-1]
    cnt = 0
    for t in topk:
        if l_df.values[place_id, t] == 0 or cnt==5:
            break
        else :
            total_record.append((place_id, t))
            cnt += 1

l_df = pd.DataFrame.from_records(total_record)
l_df.columns = ['placeID', 'like']

100%|██████████| 12677/12677 [00:18<00:00, 681.26it/s]


## Create Heterogeneou Graph

In [20]:
def consrtruct_graph(f_df, l_df) :
    hg = dgl.heterograph({
            ('place', 'pf', 'feature') : (list(f_df['placeID']), list(f_df['feature'])),
            ('feature', 'fp', 'place') : (list(f_df['feature']), list(f_df['placeID'])),
            ('place', 'pl', 'like') : (list(l_df['placeID']), list(l_df['like'])),
            ('like', 'lp', 'place') : (list(l_df['like']), list(l_df['placeID'])),})
    return hg

In [21]:
hg = consrtruct_graph(f_df, l_df)
hg

Graph(num_nodes={'feature': 745, 'like': 49, 'place': 12677},
      num_edges={('feature', 'fp', 'place'): 39699, ('like', 'lp', 'place'): 57755, ('place', 'pf', 'feature'): 39699, ('place', 'pl', 'like'): 57755},
      metagraph=[('feature', 'place', 'fp'), ('place', 'feature', 'pf'), ('place', 'like', 'pl'), ('like', 'place', 'lp')])

In [22]:
num_walks_per_node = 5
walk_length = 5

In [23]:
def create_metapath(graph, place_idx2id) :
    output_file = open(os.path.join('./', 'metapath.txt'), "w")
    for p_idx in trange(graph.number_of_nodes('place')):
        traces, _ = dgl.sampling.random_walk(
            graph, [p_idx] * num_walks_per_node, metapath=['pf', 'fp', 'pl', 'lp'] * walk_length)

        for tr in traces:
            tr = tr[tr[:,]!=-1]
            outline = ''
            for i in range(len(tr)) :
                # i % 2 == 1 을 통해 type도 포함해서 문장 생성 가능
                if i % 2 == 0 :
                    outline += place_idx2id[int(tr[i])] + ' '
            print(outline, file= output_file)


In [24]:
create_metapath(hg, place_idx2id)

100%|██████████| 12677/12677 [00:10<00:00, 1208.50it/s]


## Metapath2Vec Train

In [25]:
### 문장으로 만들어 저장한 metapath2vec.txt를 불러오는 과정
class DataReader:
    NEGATIVE_TABLE_SIZE = 1e8

    def __init__(self, file_name, min_count, care_type):
        self.negatives = []
        self.discards = []
        self.negpos = 0
        self.care_type = care_type
        self.word2id = dict() # 임베딩 생성할 단어와 학습과정에 사용할 인덱스
        self.id2word = dict() # 임베딩 생성할 단어와 학습과정에 사용할 인덱스
        self.sentences_count = 0
        self.token_count = 0
        self.word_frequency = dict()
        self.inputFileName = file_name
        self.read_words(min_count)
        self.initTableNegatives()
        self.initTableDiscards()

    def read_words(self, min_count): 
        '''
        텍스트 파일 읽으면서 각각 단어 등장 빈도 세기
        '''
        print("Read Words...")
        word_frequency = dict()
        for line in open(self.inputFileName):
            line = line.split()
            if len(line) > 1:
                self.sentences_count += 1
                for word in line:
                    if len(word) > 0:
                        self.token_count += 1
                        word_frequency[word] = word_frequency.get(word, 0) + 1 # get(key, default)

                        if self.token_count % 1000000 == 0:
                            print("Read " + str(int(self.token_count / 1000000)) + "M words.")

        wid = 0
        for w, c in word_frequency.items(): # min_count 미만인 단어는 제외하고 단어 dictionary 생성
            if c < min_count:
                continue
            self.word2id[w] = wid
            self.id2word[wid] = w
            self.word_frequency[wid] = c
            wid += 1

        self.word_count = len(self.word2id)
        print("Total embeddings: " + str(len(self.word2id)))

    def initTableDiscards(self):
        # get a frequency table for sub-sampling. Note that the frequency is adjusted by
        # sub-sampling tricks.
        t = 0.0001
        f = np.array(list(self.word_frequency.values())) / self.token_count
        self.discards = np.sqrt(t / f) + (t / f)

    def initTableNegatives(self):
        # get a table for negative sampling, if word with index 2 appears twice, then 2 will be listed
        # in the table twice.
        pow_frequency = np.array(list(self.word_frequency.values())) ** 0.75
        words_pow = sum(pow_frequency)
        ratio = pow_frequency / words_pow
        count = np.round(ratio * DataReader.NEGATIVE_TABLE_SIZE)
        for wid, c in enumerate(count):
            self.negatives += [wid] * int(c)
        self.negatives = np.array(self.negatives)
        np.random.shuffle(self.negatives)
        self.sampling_prob = ratio

    def getNegatives(self, target, size):  # TODO check equality with target
        if self.care_type == 0:
            response = self.negatives[self.negpos:self.negpos + size]
            self.negpos = (self.negpos + size) % len(self.negatives)
            if len(response) != size:
                return np.concatenate((response, self.negatives[0:self.negpos]))
        return response
    

In [61]:
# Metapath2vec Dataset
class Metapath2vecDataset(Dataset):
    def __init__(self, data, window_size):
        # read in data, window_size and input filename
        self.data = data
        self.window_size = window_size # 타겟 단어 중심 몇 개의 단어를 볼 것인가
        self.input_file = open(data.inputFileName)

    def __len__(self):
        # return the number of walks
        return self.data.sentences_count

    def __getitem__(self, idx):
        # return the list of pairs (center, context, 5 negatives)
        while True:
            line = self.input_file.readline()
            if not line:
                self.input_file.seek(0, 0)
                line = self.input_file.readline()

            if len(line) > 1:
                words = line.split()

                if len(words) > 1:
                    word_ids = [self.data.word2id[w] for w in words if
                                w in self.data.word2id and np.random.rand() < self.data.discards[self.data.word2id[w]]]

                    pair_catch = []
                    for i, u in enumerate(word_ids):
                        for j, v in enumerate(
                                word_ids[max(i - self.window_size, 0):i + self.window_size]):
                            assert u < self.data.word_count
                            assert v < self.data.word_count
                            if i == j:
                                continue
                            pair_catch.append((u, v, self.data.getNegatives(v,5)))
                    return pair_catch


    @staticmethod
    def collate(batches):
        all_u = np.array([u for batch in batches for u, _, _ in batch if len(batch) > 0])
        all_v = np.array([v for batch in batches for _, v, _ in batch if len(batch) > 0])
        all_neg_v = np.array([neg_v for batch in batches for _, _, neg_v in batch if len(batch) > 0])

        return torch.LongTensor(all_u), torch.LongTensor(all_v), torch.LongTensor(all_neg_v)

In [27]:
## SkipGram Model
class SkipGramModel(nn.Module):

    def __init__(self, emb_size, emb_dimension):
        super(SkipGramModel, self).__init__()
        self.emb_size = emb_size
        self.emb_dimension = emb_dimension
        self.u_embeddings = nn.Embedding(emb_size, emb_dimension, sparse=True)
        self.v_embeddings = nn.Embedding(emb_size, emb_dimension, sparse=True)

        initrange = 1.0 / self.emb_dimension
        init.uniform_(self.u_embeddings.weight.data, -initrange, initrange)
        init.constant_(self.v_embeddings.weight.data, 0)

    def forward(self, pos_u, pos_v, neg_v):
        emb_u = self.u_embeddings(pos_u)
        emb_v = self.v_embeddings(pos_v)
        emb_neg_v = self.v_embeddings(neg_v)

        score = torch.sum(torch.mul(emb_u, emb_v), dim=1)
        score = torch.clamp(score, max=10, min=-10)
        score = -F.logsigmoid(score)

        neg_score = torch.bmm(emb_neg_v, emb_u.unsqueeze(2)).squeeze()
        neg_score = torch.clamp(neg_score, max=10, min=-10)
        neg_score = -torch.sum(F.logsigmoid(-neg_score), dim=1)

        return torch.mean(score + neg_score)

    def save_embedding(self, id2word, file_name):
        embedding = self.u_embeddings.weight.cpu().data.numpy()
        with open(file_name, 'w') as f:
            f.write('%d %d\n' % (len(id2word), self.emb_dimension))
            for wid, w in id2word.items():
                e = ' '.join(map(lambda x: str(x), embedding[wid]))
                f.write('%s %s\n' % (w, e))

In [28]:
# Metapath2vec 
class Metapath2VecTrainer:
    def __init__(self, path):
        min_count, care_type = 0, 0
        batch_size, iterations = 50, 2
        window_size, dim, initial_lr = 10, 128, 0.025
        num_workers = 1
        
        self.data = DataReader(path, min_count, care_type)
        dataset = Metapath2vecDataset(self.data, window_size)
        self.dataloader = DataLoader(dataset, batch_size=batch_size,
                                     shuffle=True, num_workers=num_workers, collate_fn=dataset.collate)
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = dim
        self.batch_size = batch_size
        self.iterations = iterations
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        if self.use_cuda:
            self.skip_gram_model.cuda()

    def train(self):
        optimizer = optim.SparseAdam(list(self.skip_gram_model.parameters()), lr=self.initial_lr)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, len(self.dataloader))

        for iteration in range(self.iterations):
            print("\n\n\nIteration: " + str(iteration + 1))
            running_loss = 0.0
            for i, sample_batched in enumerate(tqdm(self.dataloader)):
                if len(sample_batched[0]) > 1:
                    pos_u = sample_batched[0].to(self.device)
                    pos_v = sample_batched[1].to(self.device)
                    neg_v = sample_batched[2].to(self.device)

                    scheduler.step()
                    optimizer.zero_grad()
                    loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
                    loss.backward()
                    optimizer.step()

                    running_loss = running_loss * 0.9 + loss.item() * 0.1
                    if i > 0 and i % 50000 == 0:
                        print(" Loss: " + str(running_loss))
        
        self.skip_gram_model.save_embedding(self.data.id2word, data_dir+"metapath_embeddings")

In [29]:
m2v = Metapath2VecTrainer(os.path.join('./', "metapath.txt"))

Read Words...
Total embeddings: 12677


In [60]:
m2v.train()




Iteration: 1


  return torch.LongTensor(all_u), torch.LongTensor(all_v), torch.LongTensor(all_neg_v)
100%|██████████| 1268/1268 [00:25<00:00, 49.34it/s]





Iteration: 2


  return torch.LongTensor(all_u), torch.LongTensor(all_v), torch.LongTensor(all_neg_v)
100%|██████████| 1268/1268 [00:25<00:00, 50.10it/s]


In [66]:
data_dir

'/opt/ml/final/data'

In [67]:
def create_embedding_file() :
    with open(data_dir + 'metapath_embeddings', 'r') as f:
        id2word_len, emb_dimension = f.readline().split()
        id2word = {}
        word2id = {}
        embeddings = []
        idx = 0
        while True :
            z = f.readline()
            if not z :
                break
            z = z.split()
            word = z[0]
            embedding = list(map(float, z[1:]))
            embeddings.append(embedding)
            id2word[idx] = word
            word2id[word] = idx
            idx += 1
    return id2word, word2id, embeddings

In [68]:
id2place, place2id, place_emb = create_embedding_file()

In [69]:
cossim = cosine_similarity(np.array(place_emb))
topk = np.argsort(cossim[4061])[::-1][:5]
topk

array([4061, 6291, 4688, 1975, 5686])

In [70]:
id2place[6964], id2place[4061], id2place[8753], id2place[11357], id2place[3930]

('채선당샤브샤브&한우구이구로디지털점서울구로구디지털로32길30',
 '삼원일식서울서대문구통일로9안길322층',
 '이삭토스트신설동역점서울종로구종로393',
 '딸랏롯빠이서울관악구관악로12길113대원빌딩1층101호',
 '신천일호집서울송파구백제고분로7길28-13')

In [71]:
l_df[l_df['placeID'] == place_id2idx[id2place[4061]]]

Unnamed: 0,placeID,like
25848,5689,27
25849,5689,39
25850,5689,32
25851,5689,5
25852,5689,1


In [None]:
feature_idx2id[348], like_cols[22]

('생선회', '양이 많아요')

## Performance Check

#### Jaccard Similarity

In [32]:
def jaccard_similarity(list1, list2): 
    s1 = set(list1)
    s2 = set(list2) 
    return float(len(s1.intersection(s2)) / len(s1.union(s2)))

In [56]:
def get_feat_list(df, place_id, feature):
    return df[df.placeID == place_id][feature].tolist()

In [77]:
feat1 = get_feat_list(l_df, place_id2idx[id2place[4061]], 'like')
feat2 = get_feat_list(l_df, place_id2idx[id2place[4688]], 'like')
jaccard_similarity(feat1, feat2)

0.25

In [122]:
def compare_random(place_id, df, column):
    m2v_topk = np.argsort(cossim[place_id])[::-1][1:11]
    random_topk = np.array(random.sample(id2place.keys(), k = 10))
    
    target_feat_list = get_feat_list(df, place_id2idx[id2place[place_id]], column)
    m2v_score, random_score = 0, 0
    for pid in m2v_topk:
        rec_feat_list = get_feat_list(df, place_id2idx[id2place[pid]], column)
        m2v_score += jaccard_similarity(target_feat_list, rec_feat_list)
    
    for pid in random_topk:
        rec_feat_list = get_feat_list(df, place_id2idx[id2place[pid]], column)
        random_score += jaccard_similarity(target_feat_list, rec_feat_list)
    
    return m2v_score, random_score
        

In [147]:
compare_random(30, f_df, 'feature')

(1.0909090909090908, 0.0)

In [140]:
compare_random(160, l_df, 'like')

(5.476190476190476, 5.059523809523809)