In [1]:
import pandas as pd
import numpy as np
import dgl
import torch
from tqdm import tqdm, trange
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import init
import torch.optim as optim
from konlpy.tag import *
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
import random
import pickle
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data_dir = '/opt/ml/final-project-level3-recsys-02/data/'

In [3]:
with open(data_dir + 'food.pickle', 'rb') as f :
    raw_df = pickle.load(f)
USE_COLS = ['placeName', 'placeType', 'placeAddress', 'themeKeywords','like', 'menulabel', 'ageLabel', 'ratingLabel',  'visitLabel', 'blogLabel']
raw_df = raw_df[USE_COLS]
raw_df = raw_df[~raw_df.placeType.str.contains('성급')].reset_index().copy()


In [4]:
raw_df['placeID'] = raw_df.apply(lambda x : x['placeName'] + x['placeAddress'], axis = 1)
raw_df['placeID'] = raw_df['placeID'].apply(lambda x : x.replace(" ", ""))

### Place type

In [5]:
p_df = raw_df[['placeID', 'placeType']]
p_df.columns = ['placeID', 'feature']
p_df.head()

Unnamed: 0,placeID,feature
0,밀밭정원서울마포구마포대로16길13,"칼국수,만두"
1,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),바(BAR)
2,동북양꼬치서울영등포구디지털로37길26-1,양꼬치
3,농부쌈밥서울동작구사당로30길19,쌈밥
4,홍당무김밥서울영등포구문래로180영등포센트럴푸르지오시티,김밥


In [6]:
for idx, value in enumerate(p_df.iterrows()):
    if not p_df['feature'][idx]:
        p_df['placeID']

In [7]:
p_df['feature'] = p_df['feature'].apply(lambda x : x.split(','))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  p_df['feature'] = p_df['feature'].apply(lambda x : x.split(','))


In [8]:
p_df = pd.DataFrame([
    [place_id, feature] for place_id, features in p_df.itertuples(index=False)
    for feature in features
    ], columns=p_df.columns)

In [9]:
p_df.head()

Unnamed: 0,placeID,feature
0,밀밭정원서울마포구마포대로16길13,칼국수
1,밀밭정원서울마포구마포대로16길13,만두
2,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),바(BAR)
3,동북양꼬치서울영등포구디지털로37길26-1,양꼬치
4,농부쌈밥서울동작구사당로30길19,쌈밥


#### Place Theme Keywords

In [10]:
okt = Okt()

In [11]:
def prep_nouns(word:str):
    noun = okt.nouns(word)
    if noun:
        return noun[0]
    else:
        return ""

In [186]:
k_df = raw_df[['placeID', 'themeKeywords']]

In [187]:
theme_place = k_df[k_df.themeKeywords.str.len()!=0]['themeKeywords']
theme_place

1                                [술집, 세계맥주, 맥주집, 호프집, 생맥주]
3        [인심좋은, 친절한, 친절하신, 친절하고, 쌈밥, 제육볶음, 오리로스, 부대찌개, ...
6                                              [닭갈비, 닭갈비집]
7                  [심플한, 돼지곱창, 시장, 소곱창, 곱창, 막창, 신선한, 숨어있는]
8        [친절함, 친절하고, 화려한, 친절한, 시장, 소곱창, 양대창, 막창, 곱창, 나들...
                               ...                        
12646     [고급진, 깨끗한, 고급스러운, 안락한, 초밥, 젓갈, 튀김, 횟집, 참치회, 신선한]
12651    [아늑한, 분위기좋은, 토속적인분위기, 김치찌개, 굴보쌈, 한정식, 곱창, 비빔밥,...
12662                                [만두, 아이스크림, 설렁탕, 불고기]
12664                         [닭갈비, 닭갈비집, 주먹밥, 막국수, 새로오픈한]
12665    [고급진, 이국적, 고급스러운, 카레, 팟타이, 태국음식, 쌀국수, 누들, 나들이,...
Name: themeKeywords, Length: 3499, dtype: object

In [188]:
k_df['prepThemeKeywords'] = k_df['themeKeywords'].apply(lambda x : list(map(lambda x : prep_nouns(x), x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  k_df['prepThemeKeywords'] = k_df['themeKeywords'].apply(lambda x : list(map(lambda x : prep_nouns(x), x)))


In [189]:
keyword_list = list(set(k_df[k_df.prepThemeKeywords.str.len()!=0]['prepThemeKeywords'].sum()))

In [190]:
k_df = pd.DataFrame([
    [id, keyword] for id, keywords in k_df[['placeID', 'prepThemeKeywords']].itertuples(index=False)
    for keyword in keywords
], columns=['placeID', 'prepThemeKeywords'])

In [195]:
k_df = k_df[k_df.prepThemeKeywords!=""]
k_df.head()

Unnamed: 0,placeID,prepThemeKeywords
0,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),술집
1,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),세계
2,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),맥주
3,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),호프
4,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),생맥주


#### Like

In [104]:
l_df = pd.DataFrame.from_records(raw_df['like'])
l_df.head()

Unnamed: 0,음식이 맛있어요,재료가 신선해요,친절해요,특별한 메뉴가 있어요,단체모임 하기 좋아요,매장이 청결해요,혼밥하기 좋아요,양이 많아요,가성비가 좋아요,매장이 넓어요,...,건강한 맛이에요,아늑해요,컨셉이 독특해요,샐러드바가 잘 되어있어요,현지 맛에 가까워요,추천을 잘해줘요,라이브공연이 훌륭해요,파티하기 좋아요,반려동물과 가기 좋아요,잡내가 적어요
0,26.0,11.0,10.0,7.0,4.0,3.0,2.0,2.0,2.0,2.0,...,,,,,,,,,,
1,59.0,2.0,62.0,22.0,18.0,40.0,,2.0,12.0,1.0,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,73.0,45.0,42.0,3.0,,10.0,10.0,25.0,57.0,4.0,...,,,,,,,,,,
4,18.0,8.0,17.0,2.0,1.0,6.0,25.0,6.0,18.0,2.0,...,,,,,,,,,,


In [105]:
frequency = np.sum(~l_df.isna(), axis=0)
cond1 = frequency > 1
cond2 = frequency < 9000
like_cols = sorted(frequency[np.where(cond1&cond2)[0]].index.values)

In [107]:
l_df = l_df[like_cols].fillna(0)

In [108]:
total_record = []
for index in tqdm(l_df.index.values):
    topk = np.argsort(l_df.values[index])[::-1]

    cnt = 0
    for t in topk:
        if l_df.values[index, t] == 0 or cnt==5:
            break
        else :
            total_record.append((raw_df['placeID'][index], l_df.columns[t]))
            cnt += 1

l_df = pd.DataFrame.from_records(total_record)
l_df.columns = ['placeID', 'like']

100%|██████████| 12677/12677 [00:00<00:00, 14559.79it/s]


In [109]:
l_df.head(10)

Unnamed: 0,placeID,like
0,밀밭정원서울마포구마포대로16길13,단체모임 하기 좋아요
1,밀밭정원서울마포구마포대로16길13,혼밥하기 좋아요
2,밀밭정원서울마포구마포대로16길13,뷰가 좋아요
3,밀밭정원서울마포구마포대로16길13,매장이 넓어요
4,밀밭정원서울마포구마포대로16길13,화장실이 깨끗해요
5,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),인테리어가 멋져요
6,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),술이 다양해요
7,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),오래 머무르기 좋아요
8,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),대화하기 좋아요
9,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),화장실이 깨끗해요


#### Menu

In [175]:
m_df = raw_df[['placeID', 'menulabel']]
m_df = m_df.dropna()
m_df.head()

Unnamed: 0,placeID,menulabel
0,밀밭정원서울마포구마포대로16길13,30000이하
1,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),30000이상
2,동북양꼬치서울영등포구디지털로37길26-1,30000이하
3,농부쌈밥서울동작구사당로30길19,10000이하
4,홍당무김밥서울영등포구문래로180영등포센트럴푸르지오시티,10000이하


#### Age

In [116]:
a_df = raw_df[['placeID', 'ageLabel']]
a_df.head()

Unnamed: 0,placeID,ageLabel
0,밀밭정원서울마포구마포대로16길13,"[50대, 40대, 60대]"
1,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),"[20대, 30대, 40대]"
2,동북양꼬치서울영등포구디지털로37길26-1,[]
3,농부쌈밥서울동작구사당로30길19,"[20대, 30대, 40대]"
4,홍당무김밥서울영등포구문래로180영등포센트럴푸르지오시티,[]


In [118]:
a_df = pd.DataFrame([
    [id, age] for id, ages in a_df.itertuples(index=False)
    for age in ages
], columns=a_df.columns)

In [119]:
a_df.head()

Unnamed: 0,placeID,ageLabel
0,밀밭정원서울마포구마포대로16길13,50대
1,밀밭정원서울마포구마포대로16길13,40대
2,밀밭정원서울마포구마포대로16길13,60대
3,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),20대
4,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),30대


#### Rating

In [120]:
r_df = raw_df[['placeID', 'ratingLabel']]
r_df.head()

Unnamed: 0,placeID,ratingLabel
0,밀밭정원서울마포구마포대로16길13,4.5이하
1,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),4.5이상
2,동북양꼬치서울영등포구디지털로37길26-1,4.5이하
3,농부쌈밥서울동작구사당로30길19,4.5이하
4,홍당무김밥서울영등포구문래로180영등포센트럴푸르지오시티,4.5이하


#### Visit

In [121]:
v_df = raw_df[['placeID', 'visitLabel']]
v_df.head()

Unnamed: 0,placeID,visitLabel
0,밀밭정원서울마포구마포대로16길13,visitQ2
1,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),visitQ3
2,동북양꼬치서울영등포구디지털로37길26-1,visitQ1
3,농부쌈밥서울동작구사당로30길19,visitQ4
4,홍당무김밥서울영등포구문래로180영등포센트럴푸르지오시티,visitQ3


#### Blog

In [123]:
b_df = raw_df[['placeID', 'blogLabel']]
b_df.head()

Unnamed: 0,placeID,blogLabel
0,밀밭정원서울마포구마포대로16길13,blogQ2
1,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),blogQ4
2,동북양꼬치서울영등포구디지털로37길26-1,blogQ1
3,농부쌈밥서울동작구사당로30길19,blogQ4
4,홍당무김밥서울영등포구문래로180영등포센트럴푸르지오시티,blogQ1


## Reamp IDs

In [124]:
def remap_id(id_lst) :
    id_lst.sort()
    id_to_idx, idx_to_id = dict(), dict()
    for index, value in enumerate(id_lst) :
        id_to_idx[value] = index
        idx_to_id[index] = value
    return id_to_idx, idx_to_id 

#### All Features

In [196]:
k_df.rename(columns={'prepThemeKeywords':'feature'}, inplace=True)
l_df.rename(columns={'like':'feature'}, inplace=True)
m_df.rename(columns={'menulabel':'feature'}, inplace=True)
a_df.rename(columns={'ageLabel':'feature'}, inplace=True)
r_df.rename(columns={'ratingLabel':'feature'}, inplace=True)
v_df.rename(columns={'visitLabel':'feature'}, inplace=True)
b_df.rename(columns={'blogLabel':'feature'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  k_df.rename(columns={'prepThemeKeywords':'feature'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  r_df.rename(columns={'ratingLabel':'feature'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  v_df.rename(columns={'visitLabel':'feature'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-v

In [197]:
all_df = pd.concat([p_df, k_df, l_df, m_df, a_df, r_df, v_df, b_df], axis=0)
all_df.head()

Unnamed: 0,placeID,feature
0,밀밭정원서울마포구마포대로16길13,칼국수
1,밀밭정원서울마포구마포대로16길13,만두
2,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),바(BAR)
3,동북양꼬치서울영등포구디지털로37길26-1,양꼬치
4,농부쌈밥서울동작구사당로30길19,쌈밥


In [198]:
place_id2idx, place_idx2id = remap_id(all_df['placeID'].unique())
feature_id2idx, feature_idx2id = remap_id(all_df['feature'].unique())

In [204]:
all_df['placeID'] = all_df['placeID'].apply(lambda x : place_id2idx[x])
all_df['feature'] = all_df['feature'].apply(lambda x: feature_id2idx[x])

## Create Heterogeneous Graph

In [199]:
# all features
def consrtruct_graph(df) :
    hg = dgl.heterograph({
            ('place', 'pf', 'feature') : (list(df['placeID']), list(df['feature'])),
            ('feature', 'fp', 'place') : (list(df['feature']), list(df['placeID'])),
            })
    return hg

In [206]:
graph = consrtruct_graph(all_df)
graph

Graph(num_nodes={'feature': 802, 'place': 12677},
      num_edges={('feature', 'fp', 'place'): 156301, ('place', 'pf', 'feature'): 156301},
      metagraph=[('feature', 'place', 'fp'), ('place', 'feature', 'pf')])

## Create MetaPath

In [207]:
num_walks_per_node = 5
walk_length = 5

In [208]:
def create_metapath(graph, place_idx2id) :
    output_file = open(os.path.join(data_dir, 'metapath.txt'), "w")
    for p_idx in trange(graph.number_of_nodes('place')):
        traces, _ = dgl.sampling.random_walk(
            graph, [p_idx] * num_walks_per_node, metapath=['pf', 'fp'] * walk_length)

        for tr in traces:
            tr = tr[tr[:,]!=-1]
            outline = ''
            for i in range(len(tr)) :
                # i % 2 == 1 을 통해 type도 포함해서 문장 생성 가능
                if i % 2 == 0 :
                    outline += place_idx2id[int(tr[i])] + ' '
            print(outline, file= output_file)

In [210]:
create_metapath(graph, place_idx2id)

100%|██████████| 12677/12677 [00:08<00:00, 1553.33it/s]


## Train Metapath2vec

In [211]:
### 문장으로 만들어 저장한 metapath2vec.txt를 불러오는 과정
class DataReader:
    NEGATIVE_TABLE_SIZE = 1e8

    def __init__(self, file_name, min_count, care_type):
        self.negatives = []
        self.discards = []
        self.negpos = 0
        self.care_type = care_type
        self.word2id = dict() # 임베딩 생성할 단어와 학습과정에 사용할 인덱스
        self.id2word = dict() # 임베딩 생성할 단어와 학습과정에 사용할 인덱스
        self.sentences_count = 0
        self.token_count = 0
        self.word_frequency = dict()
        self.inputFileName = file_name
        self.read_words(min_count)
        self.initTableNegatives()
        self.initTableDiscards()

    def read_words(self, min_count): 
        '''
        텍스트 파일 읽으면서 각각 단어 등장 빈도 세기
        '''
        print("Read Words...")
        word_frequency = dict()
        for line in open(self.inputFileName):
            line = line.split()
            if len(line) > 1:
                self.sentences_count += 1
                for word in line:
                    if len(word) > 0:
                        self.token_count += 1
                        word_frequency[word] = word_frequency.get(word, 0) + 1 # get(key, default)

                        if self.token_count % 1000000 == 0:
                            print("Read " + str(int(self.token_count / 1000000)) + "M words.")

        wid = 0
        for w, c in word_frequency.items(): # min_count 미만인 단어는 제외하고 단어 dictionary 생성
            if c < min_count:
                continue
            self.word2id[w] = wid
            self.id2word[wid] = w
            self.word_frequency[wid] = c
            wid += 1

        self.word_count = len(self.word2id)
        print("Total embeddings: " + str(len(self.word2id)))

    def initTableDiscards(self):
        # get a frequency table for sub-sampling. Note that the frequency is adjusted by
        # sub-sampling tricks.
        t = 0.0001
        f = np.array(list(self.word_frequency.values())) / self.token_count
        self.discards = np.sqrt(t / f) + (t / f)

    def initTableNegatives(self):
        # get a table for negative sampling, if word with index 2 appears twice, then 2 will be listed
        # in the table twice.
        pow_frequency = np.array(list(self.word_frequency.values())) ** 0.75
        words_pow = sum(pow_frequency)
        ratio = pow_frequency / words_pow
        count = np.round(ratio * DataReader.NEGATIVE_TABLE_SIZE)
        for wid, c in enumerate(count):
            self.negatives += [wid] * int(c)
        self.negatives = np.array(self.negatives)
        np.random.shuffle(self.negatives)
        self.sampling_prob = ratio

    def getNegatives(self, target, size):  # TODO check equality with target
        if self.care_type == 0:
            response = self.negatives[self.negpos:self.negpos + size]
            self.negpos = (self.negpos + size) % len(self.negatives)
            if len(response) != size:
                return np.concatenate((response, self.negatives[0:self.negpos]))
        return response
    

In [212]:
# Metapath2vec Dataset
class Metapath2vecDataset(Dataset):
    def __init__(self, data, window_size):
        # read in data, window_size and input filename
        self.data = data
        self.window_size = window_size # 타겟 단어 중심 몇 개의 단어를 볼 것인가
        self.input_file = open(data.inputFileName)

    def __len__(self):
        # return the number of walks
        return self.data.sentences_count

    def __getitem__(self, idx):
        # return the list of pairs (center, context, 5 negatives)
        while True:
            line = self.input_file.readline()
            if not line:
                self.input_file.seek(0, 0)
                line = self.input_file.readline()

            if len(line) > 1:
                words = line.split()

                if len(words) > 1:
                    word_ids = [self.data.word2id[w] for w in words if
                                w in self.data.word2id and np.random.rand() < self.data.discards[self.data.word2id[w]]]

                    pair_catch = []
                    for i, u in enumerate(word_ids):
                        for j, v in enumerate(
                                word_ids[max(i - self.window_size, 0):i + self.window_size]):
                            assert u < self.data.word_count
                            assert v < self.data.word_count
                            if i == j:
                                continue
                            pair_catch.append((u, v, self.data.getNegatives(v,5)))
                    return pair_catch


    @staticmethod
    def collate(batches):
        all_u = np.array([u for batch in batches for u, _, _ in batch if len(batch) > 0])
        all_v = np.array([v for batch in batches for _, v, _ in batch if len(batch) > 0])
        all_neg_v = np.array([neg_v for batch in batches for _, _, neg_v in batch if len(batch) > 0])

        return torch.LongTensor(all_u), torch.LongTensor(all_v), torch.LongTensor(all_neg_v)

In [213]:
## SkipGram Model
class SkipGramModel(nn.Module):

    def __init__(self, emb_size, emb_dimension):
        super(SkipGramModel, self).__init__()
        self.emb_size = emb_size
        self.emb_dimension = emb_dimension
        self.u_embeddings = nn.Embedding(emb_size, emb_dimension, sparse=True)
        self.v_embeddings = nn.Embedding(emb_size, emb_dimension, sparse=True)

        initrange = 1.0 / self.emb_dimension
        init.uniform_(self.u_embeddings.weight.data, -initrange, initrange)
        init.constant_(self.v_embeddings.weight.data, 0)

    def forward(self, pos_u, pos_v, neg_v):
        emb_u = self.u_embeddings(pos_u)
        emb_v = self.v_embeddings(pos_v)
        emb_neg_v = self.v_embeddings(neg_v)

        score = torch.sum(torch.mul(emb_u, emb_v), dim=1)
        score = torch.clamp(score, max=10, min=-10)
        score = -F.logsigmoid(score)

        neg_score = torch.bmm(emb_neg_v, emb_u.unsqueeze(2)).squeeze()
        neg_score = torch.clamp(neg_score, max=10, min=-10)
        neg_score = -torch.sum(F.logsigmoid(-neg_score), dim=1)

        return torch.mean(score + neg_score)

    def save_embedding(self, id2word, file_name):
        embedding = self.u_embeddings.weight.cpu().data.numpy()
        with open(file_name, 'w') as f:
            f.write('%d %d\n' % (len(id2word), self.emb_dimension))
            for wid, w in id2word.items():
                e = ' '.join(map(lambda x: str(x), embedding[wid]))
                f.write('%s %s\n' % (w, e))

In [214]:
# Metapath2vec 
class Metapath2VecTrainer:
    def __init__(self, path):
        min_count, care_type = 0, 0
        batch_size, iterations = 50, 2
        window_size, dim, initial_lr = 10, 128, 0.025
        num_workers = 1
        
        self.data = DataReader(path, min_count, care_type)
        dataset = Metapath2vecDataset(self.data, window_size)
        self.dataloader = DataLoader(dataset, batch_size=batch_size,
                                     shuffle=True, num_workers=num_workers, collate_fn=dataset.collate)
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = dim
        self.batch_size = batch_size
        self.iterations = iterations
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        if self.use_cuda:
            self.skip_gram_model.cuda()

    def train(self):
        optimizer = optim.SparseAdam(list(self.skip_gram_model.parameters()), lr=self.initial_lr)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, len(self.dataloader))

        for iteration in range(self.iterations):
            print("\n\n\nIteration: " + str(iteration + 1))
            running_loss = 0.0
            for i, sample_batched in enumerate(tqdm(self.dataloader)):
                if len(sample_batched[0]) > 1:
                    pos_u = sample_batched[0].to(self.device)
                    pos_v = sample_batched[1].to(self.device)
                    neg_v = sample_batched[2].to(self.device)

                    scheduler.step()
                    optimizer.zero_grad()
                    loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
                    loss.backward()
                    optimizer.step()

                    running_loss = running_loss * 0.9 + loss.item() * 0.1
                    if i > 0 and i % 50000 == 0:
                        print(" Loss: " + str(running_loss))
        
        self.skip_gram_model.save_embedding(self.data.id2word, data_dir+"metapath_embeddings")

In [215]:
m2v = Metapath2VecTrainer(os.path.join(data_dir, "metapath.txt"))

Read Words...
Total embeddings: 12677


In [218]:
m2v.train()




Iteration: 1


100%|██████████| 1268/1268 [00:08<00:00, 154.44it/s]





Iteration: 2


100%|██████████| 1268/1268 [00:08<00:00, 154.58it/s]


In [219]:
def create_embedding_file() :
    with open(data_dir + 'metapath_embeddings', 'r') as f:
        id2word_len, emb_dimension = f.readline().split()
        id2word = {}
        word2id = {}
        embeddings = []
        idx = 0
        while True :
            z = f.readline()
            if not z :
                break
            z = z.split()
            word = z[0]
            embedding = list(map(float, z[1:]))
            embeddings.append(embedding)
            id2word[idx] = word
            word2id[word] = idx
            idx += 1
    return id2word, word2id, embeddings

In [None]:
id2place, place2id, place_emb = create_embedding_file()