In [40]:
import pandas as pd
import numpy as np
import dgl
import torch
from tqdm import tqdm, trange
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import init
import torch.optim as optim
from konlpy.tag import *
from collections import Counter
import os

In [4]:
data_path = '../data/raw/'

In [5]:
raw_df = pd.read_json(data_path + 'placeInfo.json')

In [25]:
raw_df['placeID'] = raw_df.apply(lambda x : x['placeName'] + x['placeAddress'], axis = 1)
raw_df['placeID'] = raw_df['placeID'].apply(lambda x : x.replace(" ", ""))

#### Place Type

In [26]:
p_df = raw_df[['placeID', 'placeType']]
p_df.head()

Unnamed: 0,placeID,placeType
0,청계다방서울서초구원터4길61층청계다방,"카페,디저트"
1,밀밭정원서울마포구마포대로16길13,"칼국수,만두"
2,투썸플레이스신정뉴타운점서울양천구신월로164,카페
3,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),바(BAR)
4,동북양꼬치서울영등포구디지털로37길26-1,양꼬치


In [11]:
def remap_id(id_lst) :
    id_lst.sort()
    id_to_idx, idx_to_id = dict(), dict()
    for index, value in enumerate(id_lst) :
        id_to_idx[value] = index
        idx_to_id[index] = value
    return id_to_idx, idx_to_id 

In [34]:
place_id2idx, place_idx2id = remap_id(p_df['placeID'].unique())
type_id2idx, type_idx2id = remap_id(p_df['placeType'].unique())

In [35]:
p_df['placeID'] = p_df['placeID'].apply(lambda x: place_id2idx[f"{x}"])
p_df['placeType'] = p_df['placeType'].apply(lambda x: type_id2idx[f"{x}"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  p_df['placeID'] = p_df['placeID'].apply(lambda x: place_id2idx[f"{x}"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  p_df['placeType'] = p_df['placeType'].apply(lambda x: type_id2idx[f"{x}"])


In [37]:
p_df.head()

Unnamed: 0,placeID,placeType
0,12775,231
1,5449,232
2,14390,230
3,8609,96
4,3316,162


#### place theme keywords

In [122]:
# hannanum = Hannanum()
# komoran = Komoran()
okt = Okt()

In [123]:
def prep_nouns(word:str):
    # noun = hannanum.nouns(word)
    # noun = komoran.nouns(word)
    noun = okt.nouns(word)
    if noun:
        return noun[0]
    else:
        return ""

In [161]:
k_df = raw_df[['placeID', 'themeKeywords']]
k_df.head()

Unnamed: 0,placeID,themeKeywords
0,청계다방서울서초구원터4길61층청계다방,[]
1,밀밭정원서울마포구마포대로16길13,[]
2,투썸플레이스신정뉴타운점서울양천구신월로164,[]
3,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),"[술집, 세계맥주, 맥주집, 호프집, 생맥주]"
4,동북양꼬치서울영등포구디지털로37길26-1,[]


In [162]:
theme_place = k_df[k_df.themeKeywords.str.len()!=0]['themeKeywords']
theme_place

3                                [술집, 세계맥주, 맥주집, 호프집, 생맥주]
5        [인심좋은, 친절한, 친절하신, 친절하고, 쌈밥, 제육볶음, 오리로스, 부대찌개, ...
8                                              [닭갈비, 닭갈비집]
10                 [심플한, 돼지곱창, 시장, 소곱창, 곱창, 막창, 신선한, 숨어있는]
11       [친절함, 친절하고, 화려한, 친절한, 시장, 소곱창, 양대창, 막창, 곱창, 나들...
                               ...                        
16314    [아늑한, 빈티지한, 분위기있는, 수제초콜릿, 초코빙수, 마카롱, 팬케이크, 발렌타...
16315                                [만두, 아이스크림, 설렁탕, 불고기]
16317                         [닭갈비, 닭갈비집, 주먹밥, 막국수, 새로오픈한]
16318    [고급진, 이국적, 고급스러운, 카레, 팟타이, 태국음식, 쌀국수, 누들, 나들이,...
16330                                         [아이스크림, 케이크]
Name: themeKeywords, Length: 4017, dtype: object

In [163]:
k_df['prepThemeKeywords'] = k_df['themeKeywords'].apply(lambda x : list(map(lambda x : prep_nouns(x), x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  k_df['prepThemeKeywords'] = k_df['themeKeywords'].apply(lambda x : list(map(lambda x : prep_nouns(x), x)))


In [164]:
k_df

Unnamed: 0,placeID,themeKeywords,prepThemeKeywords
0,청계다방서울서초구원터4길61층청계다방,[],[]
1,밀밭정원서울마포구마포대로16길13,[],[]
2,투썸플레이스신정뉴타운점서울양천구신월로164,[],[]
3,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),"[술집, 세계맥주, 맥주집, 호프집, 생맥주]","[술집, 세계, 맥주, 호프, 생맥주]"
4,동북양꼬치서울영등포구디지털로37길26-1,[],[]
...,...,...,...
16326,갓파스시분당미금역점경기성남시분당구돌마로67,[],[]
16327,갓잇송리단길점서울송파구백제고분로45길4-14,[],[]
16328,마니주호프수지점경기용인시수지구수지로342번길17에덴프라자,[],[]
16329,양재정육식당판교점경기성남시분당구분당내곡로1511층,[],[]


In [165]:
keyword_list = list(set(k_df[k_df.prepThemeKeywords.str.len()!=0]['prepThemeKeywords'].sum()))
keyword_list

['',
 '신비',
 '치즈케이크',
 '고급',
 '청포도',
 '대구',
 '컵케이크',
 '우동',
 '돼지국밥',
 '팬',
 '아사',
 '한우국밥',
 '보쌈',
 '냉',
 '치킨',
 '부대찌개',
 '스타',
 '퀄리티',
 '손님',
 '프로포즈',
 '다방',
 '고기국수',
 '생',
 '창',
 '회덮밥',
 '복어',
 '청국장',
 '홍게',
 '하우스',
 '제철',
 '해신',
 '똥집',
 '고등',
 '빵',
 '빨',
 '화이트데이',
 '레스토랑',
 '거북손',
 '유니짜장',
 '카레',
 '드립커피',
 '꼬치',
 '식물원',
 '눈꽃',
 '느낌',
 '크레이프',
 '녹차아이스크림',
 '모듬회',
 '이동',
 '깐풍기',
 '가족',
 '숙성',
 '누룽지',
 '문어',
 '일본',
 '해변',
 '글',
 '수타면',
 '옥돔',
 '갤러리',
 '젓갈',
 '다이닝',
 '음식',
 '참치회',
 '옛날',
 '곱창전골',
 '온천',
 '얘기',
 '볼락',
 '순대',
 '츠케멘',
 '포장마차',
 '닭갈비',
 '팥빵',
 '개인',
 '동그랑땡',
 '고량주',
 '갈대',
 '일식',
 '아트',
 '해물',
 '퀘사디아',
 '테디베어',
 '와인',
 '닭꼬치',
 '은',
 '철판요리',
 '연탄',
 '감자',
 '스팸',
 '수영장',
 '간장게장',
 '뽈살',
 '맘모스',
 '칼국수',
 '제육',
 '모듬전',
 '라자냐',
 '딸기',
 '한라봉',
 '삼계탕',
 '북경오리',
 '수제',
 '메밀국수',
 '돈가스',
 '보양식',
 '이상',
 '수제비',
 '마카롱',
 '양고기',
 '딸기모찌',
 '부담',
 '세계',
 '도루묵',
 '콩국수',
 '필라프',
 '칠리',
 '자몽',
 '왕새우',
 '동동주',
 '육포',
 '멋',
 '호두과자',
 '매운탕',
 '우회',
 '석류',
 '빈대떡',
 '코스',
 '석갈비',
 '국물',
 '추억',
 '솜

In [166]:
keyword_id2idx, keyword_idx2id = remap_id(list(set(keyword_list)))

In [167]:
k_df = pd.DataFrame([
    [place_id2idx[id], keyword_id2idx[keyword]] for id, keywords in k_df[['placeID', 'prepThemeKeywords']].itertuples(index=False)
    for keyword in keywords
], columns=['placeID', 'prepThemeKeywords'])

In [168]:
k_df = k_df.groupby('prepThemeKeywords').filter(lambda x : len(x)>1)

## Create Heterogeneou Graph

In [169]:
def consrtruct_graph(df, k_df) :
    hg = dgl.heterograph({
            ('place', 'pt', 'type') : (list(df['placeID']), list(df['placeType'])),
            ('type', 'tp', 'place') : (list(df['placeType']), list(df['placeID'])),
            ('place', 'pk', 'keyword') : (list(k_df['placeID']), list(k_df['prepThemeKeywords'])),
            ('keyword', 'kp', 'place') : (list(k_df['prepThemeKeywords']), list(k_df['placeID']))})
    return hg

In [171]:
hg = consrtruct_graph(p_df, k_df)
hg

Graph(num_nodes={'keyword': 717, 'place': 16331, 'type': 271},
      num_edges={('keyword', 'kp', 'place'): 30925, ('place', 'pk', 'keyword'): 30925, ('place', 'pt', 'type'): 16331, ('type', 'tp', 'place'): 16331},
      metagraph=[('keyword', 'place', 'kp'), ('place', 'keyword', 'pk'), ('place', 'type', 'pt'), ('type', 'place', 'tp')])

In [152]:
num_walks_per_node = 5
walk_length = 5

In [153]:
def create_metapath(graph, place_idx2id) :
    output_file = open(os.path.join(data_path, 'metapath.txt'), "w")
    for p_idx in trange(graph.number_of_nodes('place')):
        traces, _ = dgl.sampling.random_walk(
            graph, [p_idx] * num_walks_per_node, metapath=['pt', 'tp'] * walk_length)


        for tr in traces:
            tr = tr[tr[:,]!=-1]
            outline = ''
            for i in range(len(tr)) :
                # i % 2 == 1 을 통해 type도 포함해서 문장 생성 가능
                if i % 2 == 0 :
                    outline += place_idx2id[int(tr[i])] + ' '
            print(outline, file= output_file)


In [154]:
create_metapath(hg, place_idx2id)

100%|██████████| 11437/11437 [00:07<00:00, 1603.68it/s]


## Metapath2Vec Train

In [155]:
### 문장으로 만들어 저장한 metapath2vec.txt를 불러오는 과정
class DataReader:
    NEGATIVE_TABLE_SIZE = 1e8

    def __init__(self, file_name, min_count, care_type):
        self.negatives = []
        self.discards = []
        self.negpos = 0
        self.care_type = care_type
        self.word2id = dict() # 임베딩 생성할 단어와 학습과정에 사용할 인덱스
        self.id2word = dict() # 임베딩 생성할 단어와 학습과정에 사용할 인덱스
        self.sentences_count = 0
        self.token_count = 0
        self.word_frequency = dict()
        self.inputFileName = file_name
        self.read_words(min_count)
        self.initTableNegatives()
        self.initTableDiscards()

    def read_words(self, min_count): 
        '''
        텍스트 파일 읽으면서 각각 단어 등장 빈도 세기
        '''
        print("Read Words...")
        word_frequency = dict()
        for line in open(self.inputFileName):
            line = line.split()
            if len(line) > 1:
                self.sentences_count += 1
                for word in line:
                    if len(word) > 0:
                        self.token_count += 1
                        word_frequency[word] = word_frequency.get(word, 0) + 1 # get(key, default)

                        if self.token_count % 1000000 == 0:
                            print("Read " + str(int(self.token_count / 1000000)) + "M words.")

        wid = 0
        for w, c in word_frequency.items(): # min_count 미만인 단어는 제외하고 단어 dictionary 생성
            if c < min_count:
                continue
            self.word2id[w] = wid
            self.id2word[wid] = w
            self.word_frequency[wid] = c
            wid += 1

        self.word_count = len(self.word2id)
        print("Total embeddings: " + str(len(self.word2id)))

    def initTableDiscards(self):
        # get a frequency table for sub-sampling. Note that the frequency is adjusted by
        # sub-sampling tricks.
        t = 0.0001
        f = np.array(list(self.word_frequency.values())) / self.token_count
        self.discards = np.sqrt(t / f) + (t / f)

    def initTableNegatives(self):
        # get a table for negative sampling, if word with index 2 appears twice, then 2 will be listed
        # in the table twice.
        pow_frequency = np.array(list(self.word_frequency.values())) ** 0.75
        words_pow = sum(pow_frequency)
        ratio = pow_frequency / words_pow
        count = np.round(ratio * DataReader.NEGATIVE_TABLE_SIZE)
        for wid, c in enumerate(count):
            self.negatives += [wid] * int(c)
        self.negatives = np.array(self.negatives)
        np.random.shuffle(self.negatives)
        self.sampling_prob = ratio

    def getNegatives(self, target, size):  # TODO check equality with target
        if self.care_type == 0:
            response = self.negatives[self.negpos:self.negpos + size]
            self.negpos = (self.negpos + size) % len(self.negatives)
            if len(response) != size:
                return np.concatenate((response, self.negatives[0:self.negpos]))
        return response
    

In [156]:
# Metapath2vec Dataset
class Metapath2vecDataset(Dataset):
    def __init__(self, data, window_size):
        # read in data, window_size and input filename
        self.data = data
        self.window_size = window_size # 타겟 단어 중심 몇 개의 단어를 볼 것인가
        self.input_file = open(data.inputFileName)

    def __len__(self):
        # return the number of walks
        return self.data.sentences_count

    def __getitem__(self, idx):
        # return the list of pairs (center, context, 5 negatives)
        while True:
            line = self.input_file.readline()
            if not line:
                self.input_file.seek(0, 0)
                line = self.input_file.readline()

            if len(line) > 1:
                words = line.split()

                if len(words) > 1:
                    word_ids = [self.data.word2id[w] for w in words if
                                w in self.data.word2id and np.random.rand() < self.data.discards[self.data.word2id[w]]]

                    pair_catch = []
                    for i, u in enumerate(word_ids):
                        for j, v in enumerate(
                                word_ids[max(i - self.window_size, 0):i + self.window_size]):
                            assert u < self.data.word_count
                            assert v < self.data.word_count
                            if i == j:
                                continue
                            pair_catch.append((u, v, self.data.getNegatives(v,5)))
                    return pair_catch


    @staticmethod
    def collate(batches):
        all_u = [u for batch in batches for u, _, _ in batch if len(batch) > 0]
        all_v = [v for batch in batches for _, v, _ in batch if len(batch) > 0]
        all_neg_v = [neg_v for batch in batches for _, _, neg_v in batch if len(batch) > 0]

        return torch.LongTensor(all_u), torch.LongTensor(all_v), torch.LongTensor(all_neg_v)

In [157]:
## SkipGram Model
class SkipGramModel(nn.Module):

    def __init__(self, emb_size, emb_dimension):
        super(SkipGramModel, self).__init__()
        self.emb_size = emb_size
        self.emb_dimension = emb_dimension
        self.u_embeddings = nn.Embedding(emb_size, emb_dimension, sparse=True)
        self.v_embeddings = nn.Embedding(emb_size, emb_dimension, sparse=True)

        initrange = 1.0 / self.emb_dimension
        init.uniform_(self.u_embeddings.weight.data, -initrange, initrange)
        init.constant_(self.v_embeddings.weight.data, 0)

    def forward(self, pos_u, pos_v, neg_v):
        emb_u = self.u_embeddings(pos_u)
        emb_v = self.v_embeddings(pos_v)
        emb_neg_v = self.v_embeddings(neg_v)

        score = torch.sum(torch.mul(emb_u, emb_v), dim=1)
        score = torch.clamp(score, max=10, min=-10)
        score = -F.logsigmoid(score)

        neg_score = torch.bmm(emb_neg_v, emb_u.unsqueeze(2)).squeeze()
        neg_score = torch.clamp(neg_score, max=10, min=-10)
        neg_score = -torch.sum(F.logsigmoid(-neg_score), dim=1)

        return torch.mean(score + neg_score)

    def save_embedding(self, id2word, file_name):
        embedding = self.u_embeddings.weight.cpu().data.numpy()
        with open(file_name, 'w') as f:
            f.write('%d %d\n' % (len(id2word), self.emb_dimension))
            for wid, w in id2word.items():
                e = ' '.join(map(lambda x: str(x), embedding[wid]))
                f.write('%s %s\n' % (w, e))

In [158]:
# Metapath2vec 
class Metapath2VecTrainer:
    def __init__(self, path):
        min_count, care_type = 0, 0
        batch_size, iterations = 50, 1
        window_size, dim, initial_lr = 10, 128, 0.025
        num_workers = 1
        
        self.data = DataReader(path, min_count, care_type)
        dataset = Metapath2vecDataset(self.data, window_size)
        self.dataloader = DataLoader(dataset, batch_size=batch_size,
                                     shuffle=True, num_workers=num_workers, collate_fn=dataset.collate)
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = dim
        self.batch_size = batch_size
        self.iterations = iterations
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        if self.use_cuda:
            self.skip_gram_model.cuda()

    def train(self):
        optimizer = optim.SparseAdam(list(self.skip_gram_model.parameters()), lr=self.initial_lr)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, len(self.dataloader))

        for iteration in range(self.iterations):
            print("\n\n\nIteration: " + str(iteration + 1))
            running_loss = 0.0
            for i, sample_batched in enumerate(tqdm(self.dataloader)):
                if len(sample_batched[0]) > 1:
                    pos_u = sample_batched[0].to(self.device)
                    pos_v = sample_batched[1].to(self.device)
                    neg_v = sample_batched[2].to(self.device)

                    scheduler.step()
                    optimizer.zero_grad()
                    loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
                    loss.backward()
                    optimizer.step()

                    running_loss = running_loss * 0.9 + loss.item() * 0.1
                    if i > 0 and i % 50000 == 0:
                        print(" Loss: " + str(running_loss))
        
        self.skip_gram_model.save_embedding(self.data.id2word, data_path+"metapath_embeddings")

In [159]:
m2v = Metapath2VecTrainer(os.path.join(data_path, "metapath.txt"))

Read Words...
Total embeddings: 11437


In [160]:
m2v.train()    

  0%|          | 0/1144 [00:00<?, ?it/s]




Iteration: 1


  return torch.LongTensor(all_u), torch.LongTensor(all_v), torch.LongTensor(all_neg_v)
100%|██████████| 1144/1144 [00:09<00:00, 124.68it/s]
